2021
Juez-Gil, Mario; Arnaiz-González, Álvar; Rodríguez, Juan José; López-Nozal, Carlos; García-Osorio, César
Approx-SMOTE: Fast SMOTE for Big Data on Apache Spark Journal Article
In: Neurocomputing, vol. 464, pp. 432-437, 2021, ISSN: 0925-2312.
Abstract | Links | BibTeX | Tags: Big data, Data Mining, imbalance, SMOTE, Spark
@article{Juez-Gil2021bb,
  title     = {Approx-{SMOTE}: Fast {SMOTE} for Big Data on {Apache Spark}},
  author    = {Mario Juez-Gil and Álvar Arnaiz-González and Juan José Rodríguez and Carlos López-Nozal and César García-Osorio},
  url       = {https://www.sciencedirect.com/science/article/pii/S0925231221012832},
  doi       = {10.1016/j.neucom.2021.08.086},
  issn      = {0925-2312},
  year      = {2021},
  date      = {2021-11-13},
  journal   = {Neurocomputing},
  volume    = {464},
  pages     = {432--437},
  abstract  = {One of the main goals of Big Data research, is to find new data mining methods that are able to process large amounts of data in acceptable times. In Big Data classification, as in traditional classification, class imbalance is a common problem that must be addressed, in the case of Big Data also looking for a solution that can be applied in an acceptable execution time. In this paper we present Approx-SMOTE, a parallel implementation of the SMOTE algorithm for the Apache Spark framework. The key difference with the original SMOTE, besides parallelism, is that it uses an approximated version of k-Nearest Neighbor which makes it highly scalable. Although an implementation of SMOTE for Big Data already exists (SMOTE-BD), it uses an exact Nearest Neighbor search, which does not make it entirely scalable. Approx-SMOTE on the other hand is able to achieve up to 30 times faster run times without sacrificing the improved classification performance offered by the original SMOTE.},
  keywords  = {Big data, Data Mining, imbalance, SMOTE, Spark},
  pubstate  = {published},
  tppubtype = {article}
}
2020
Bustillo, Andrés; Pimenov, Danil Yurievich; Mia, Mozammel; Kapłonek, Wojciech
Machine-learning for automatic prediction of flatness deviation considering the wear of the face mill teeth Journal Article
In: Journal of Intelligent Manufacturing, 2020, ISSN: 0956-5515.
Abstract | Links | BibTeX | Tags: Cutting power, Face milling, Flatness deviation, Random forest, SMOTE, tool condition monitoring, Tool life, Wear
@article{Bustillo2020b,
  title     = {Machine-learning for automatic prediction of flatness deviation considering the wear of the face mill teeth},
  author    = {Andrés Bustillo and Danil Yurievich Pimenov and Mozammel Mia and Wojciech Kapłonek},
  url       = {https://link.springer.com/article/10.1007/s10845-020-01645-3},
  doi       = {10.1007/s10845-020-01645-3},
  issn      = {0956-5515},
  year      = {2020},
  date      = {2020-09-03},
  journal   = {Journal of Intelligent Manufacturing},
  abstract  = {The acceptance of the machined surfaces not only depends on roughness parameters but also in the flatness deviation (Δfl). Hence, before reaching the threshold of flatness deviation caused by the wear of the face mill, the tool inserts need to be changed to avoid the expected product rejection. As current CNC machines have the facility to track, in real-time, the main drive power, the present study utilizes this facility to predict the flatness deviation—with proper consideration to the amount of wear of cutting tool insert’s edge. The prediction of deviation from flatness is evaluated as a regression and a classification problem, while different machine-learning techniques like Multilayer Perceptrons, Radial Basis Functions Networks, Decision Trees and Random Forest ensembles have been examined. Finally, Random Forest ensembles combined with Synthetic Minority Over-sampling Technique (SMOTE) balancing technique showed the highest performance when the flatness levels are discretized taking into account industrial requirements. The SMOTE balancing technique resulted in a very useful strategy to avoid the strong limitations that small experiment datasets produce in the accuracy of machine-learning models.},
  keywords  = {Cutting power, Face milling, Flatness deviation, Random forest, SMOTE, tool condition monitoring, Tool life, Wear},
  pubstate  = {published},
  tppubtype = {article}
}
2015
Díez-Pastor, José Francisco; Rodríguez, Juan José; García-Osorio, César; Kuncheva, Ludmila I
Random Balance: Ensembles of variable priors classifiers for imbalanced data Journal Article
In: Knowledge-Based Systems, vol. 85, pp. 96-111, 2015, ISSN: 0950-7051.
Abstract | Links | BibTeX | Tags: AdaBoost, Bagging, Class-imbalanced problems, Classifier ensembles, Data Mining, Ensemble methods, SELECTED, SMOTE, Undersampling
@article{RandomBalance,
  title     = {{Random Balance}: Ensembles of variable priors classifiers for imbalanced data},
  author    = {José Francisco Díez-Pastor and Juan José Rodríguez and César García-Osorio and Ludmila I Kuncheva},
  url       = {http://www.sciencedirect.com/science/article/pii/S0950705115001720},
  doi       = {10.1016/j.knosys.2015.04.022},
  issn      = {0950-7051},
  year      = {2015},
  date      = {2015-01-01},
  journal   = {Knowledge-Based Systems},
  volume    = {85},
  pages     = {96--111},
  abstract  = {In Machine Learning, a data set is imbalanced when the class proportions are highly skewed. Class-imbalanced problems sets arise routinely in many application domains and pose a challenge to traditional classifiers. We propose a new approach to building ensembles of classifiers for two-class imbalanced data sets, called Random Balance. Each member of the Random Balance ensemble is trained with data sampled from the training set and augmented by artificial instances obtained using SMOTE. The novelty in the approach is that the proportions of the classes for each ensemble member are chosen randomly. The intuition behind the method is that the proposed diversity heuristic will ensure that the ensemble contains classifiers that are specialized for different operating points on the ROC space, thereby leading to larger AUC compared to other ensembles of classifiers. Experiments have been carried out to test the Random Balance approach by itself, and also in combination with standard ensemble methods. As a result, we propose a new ensemble creation method called RB-Boost which combines Random Balance with AdaBoost.M2. This combination involves enforcing random class proportions in addition to instance re-weighting. Experiments with 86 imbalanced data sets from two well known repositories demonstrate the advantage of the Random Balance approach.},
  keywords  = {AdaBoost, Bagging, Class-imbalanced problems, Classifier ensembles, Data Mining, Ensemble methods, SELECTED, SMOTE, Undersampling},
  pubstate  = {published},
  tppubtype = {article}
}
Díez-Pastor, José Francisco; Rodríguez, Juan José; García-Osorio, César; Kuncheva, Ludmila I
Diversity techniques improve the performance of the best imbalance learning ensembles Journal Article
In: Information Sciences, vol. 325, pp. 98 - 117, 2015, ISSN: 0020-0255.
Abstract | Links | BibTeX | Tags: Class-imbalanced problems, Classifier ensembles, Data Mining, Diversity, Ensemble methods, Rotation forest, SELECTED, SMOTE, Undersampling
@article{DiezPastor201598,
  title     = {Diversity techniques improve the performance of the best imbalance learning ensembles},
  author    = {José Francisco Díez-Pastor and Juan José Rodríguez and César García-Osorio and Ludmila I Kuncheva},
  url       = {http://www.sciencedirect.com/science/article/pii/S0020025515005186},
  doi       = {10.1016/j.ins.2015.07.025},
  issn      = {0020-0255},
  year      = {2015},
  date      = {2015-01-01},
  journal   = {Information Sciences},
  volume    = {325},
  pages     = {98--117},
  abstract  = {Many real-life problems can be described as unbalanced, where the number of instances belonging to one of the classes is much larger than the numbers in other classes. Examples are spam detection, credit card fraud detection or medical diagnosis. Ensembles of classifiers have acquired popularity in this kind of problems for their ability to obtain better results than individual classifiers. The most commonly used techniques by those ensembles especially designed to deal with imbalanced problems are for example Re-weighting, Oversampling and Undersampling. Other techniques, originally intended to increase the ensemble diversity, have not been systematically studied for their effect on imbalanced problems. Among these are Random Oracles, Disturbing Neighbors, Random Feature Weights or Rotation Forest. This paper presents an overview and an experimental study of various ensemble-based methods for imbalanced problems, the methods have been tested in its original form and in conjunction with several diversity-increasing techniques, using 84 imbalanced data sets from two well known repositories. This paper shows that these diversity-increasing techniques significantly improve the performance of ensemble methods for imbalanced problems and provides some ideas about when it is more convenient to use these diversifying techniques.},
  keywords  = {Class-imbalanced problems, Classifier ensembles, Data Mining, Diversity, Ensemble methods, Rotation forest, SELECTED, SMOTE, Undersampling},
  pubstate  = {published},
  tppubtype = {article}
}