2018
Arnaiz-González, Álvar; Díez-Pastor, José Francisco; Rodríguez, Juan José; García-Osorio, César
Local sets for multi-label instance selection Journal Article
In: Applied Soft Computing, vol. 68, pp. 651-666, 2018, ISSN: 1568-4946.
Abstract | Links | BibTeX | Tags: Data reduction, Instance selection, Local set, Multi-label classification, Nearest neighbor, SELECTED
@article{Arnaiz-González2018b,
title = {Local sets for multi-label instance selection},
author = {Álvar Arnaiz-González and José Francisco Díez-Pastor and Juan José Rodríguez and César García-Osorio},
url = {https://www.sciencedirect.com/science/article/pii/S1568494618302072},
doi = {10.1016/j.asoc.2018.04.016},
issn = {1568-4946},
year = {2018},
date = {2018-07-01},
journal = {Applied Soft Computing},
volume = {68},
pages = {651-666},
abstract = {The multi-label classification problem is an extension of traditional (single-label) classification, in which the output is a vector of values rather than a single categorical value. The multi-label problem is therefore a very different and much more challenging one than the single-label problem. Recently, multi-label classification has attracted interest because of its real-life applications, such as image recognition, bio-informatics, and text categorization, among others. Unfortunately, there are few instance selection techniques capable of processing the data used for these applications. These techniques are also very useful for cleaning and reducing the size of data sets.
In single-label problems, the local set of an instance x comprises all instances in the largest hypersphere centered on x, so that they are all of the same class. This concept has been successfully integrated in the design of Iterative Case Filtering, one of the most influential instance selection methods in single-label learning. Unfortunately, the concept that was originally defined for single-label learning cannot be directly applied to multi-label data, as each instance has more than one label.
An adaptation of the local set concept to multi-label data is proposed in this paper and its effectiveness is verified in the design of two new algorithms that yielded competitive results. One of the adaptations cleans the data sets, to improve their predictive capabilities, while the other aims to reduce data set sizes. Both are tested and compared against the state-of-the-art instance selection methods available for multi-label learning.},
keywords = {Data reduction, Instance selection, Local set, Multi-label classification, Nearest neighbor, SELECTED},
pubstate = {published},
tppubtype = {article}
}
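The local-set definition in the abstract above is concrete enough to sketch in code: the radius of the largest same-class hypersphere around x is the distance from x to its nearest enemy (the closest instance with a different label), so the local set is every instance strictly closer to x than that enemy. Below is a minimal NumPy sketch of the single-label case; the function name and toy data are illustrative, not taken from the paper.

import numpy as np

def local_set(X, y, i):
    # Distances from instance i to every instance in the data set.
    dists = np.linalg.norm(X - X[i], axis=1)
    # Radius of the hypersphere: distance to the nearest enemy,
    # i.e. the closest instance whose class differs from y[i].
    nearest_enemy_dist = dists[y != y[i]].min()
    # Local set: instances strictly inside that hypersphere, excluding i.
    inside = (dists < nearest_enemy_dist) & (np.arange(len(X)) != i)
    return np.where(inside)[0]

# Toy example: three class-0 points and two class-1 points in 1-D.
X = np.array([[0.0], [0.2], [0.4], [1.0], [1.2]])
y = np.array([0, 0, 0, 1, 1])
print(local_set(X, y, 0))  # [1 2]: same-class points closer than the nearest enemy

This test breaks down for multi-label data precisely because "same class" is no longer well defined when each instance carries a vector of labels, which is the gap the paper's adaptation addresses.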
Gunn, Iain A D; Arnaiz-González, Álvar; Kuncheva, Ludmila I
A taxonomic look at instance-based stream classifiers Journal Article
In: Neurocomputing, vol. 286, pp. 167-178, 2018, ISSN: 0925-2312.
Abstract | Links | BibTeX | Tags: Concept drift, Instance selection, Machine learning, Prototype generation, Stream classification
@article{Gunn2018,
title = {A taxonomic look at instance-based stream classifiers},
author = {Iain A D Gunn and Álvar Arnaiz-González and Ludmila I Kuncheva},
url = {https://www.sciencedirect.com/science/article/pii/S092523121830095X},
doi = {10.1016/j.neucom.2018.01.062},
issn = {0925-2312},
year = {2018},
date = {2018-04-19},
journal = {Neurocomputing},
volume = {286},
pages = {167-178},
abstract = {Large numbers of data streams are today generated in many fields. A key challenge when learning from such streams is the problem of concept drift. Many methods, including many prototype methods, have been proposed in recent years to address this problem. This paper presents a refined taxonomy of instance selection and generation methods for the classification of data streams subject to concept drift. The taxonomy allows discrimination among a large number of methods which pre-existing taxonomies for offline instance selection methods did not distinguish. This makes possible a valuable new perspective on experimental results, and provides a framework for discussion of the concepts behind different algorithm-design approaches. We review a selection of modern algorithms for the purpose of illustrating the distinctions made by the taxonomy. We present the results of a numerical experiment which examined the performance of a number of representative methods on both synthetic and real-world data sets with and without concept drift, and discuss the implications for the directions of future research in light of the taxonomy. On the basis of the experimental results, we are able to give recommendations for the experimental evaluation of algorithms which may be proposed in the future.},
keywords = {Concept drift, Instance selection, Machine learning, Prototype generation, Stream classification},
pubstate = {published},
tppubtype = {article}
}
2016
Arnaiz-González, Álvar; Blachnik, Marcin; Kordos, Mirosław; García-Osorio, César
Fusion of instance selection methods in regression tasks Journal Article
In: Information Fusion, vol. 30, pp. 69-79, 2016, ISSN: 1566-2535.
Abstract | Links | BibTeX | Tags: Data Mining, Ensemble methods, Instance selection, Regression, SELECTED
@article{ArnaizGonzalez201669,
title = {Fusion of instance selection methods in regression tasks},
author = {Álvar Arnaiz-González and Marcin Blachnik and Mirosław Kordos and César García-Osorio},
url = {http://www.sciencedirect.com/science/article/pii/S1566253515001141},
doi = {10.1016/j.inffus.2015.12.002},
issn = {1566-2535},
year = {2016},
date = {2016-01-01},
journal = {Information Fusion},
volume = {30},
pages = {69-79},
abstract = {Data pre-processing is a very important aspect of data mining. In this paper we discuss instance selection for prediction algorithms, one of the pre-processing approaches. The purpose of instance selection is to improve data quality through data size reduction and noise elimination. Until recently, instance selection has been applied mainly to classification problems, and very few recent papers address instance selection for regression tasks. This paper proposes the fusion of instance selection algorithms for regression tasks to improve selection performance. Two different families of instance selection methods are evaluated as ensemble members: one based on a distance threshold and the other on converting the regression task into a multi-class classification task. Extensive experimental evaluation performed on the regression versions of the Edited Nearest Neighbor (ENN) and Condensed Nearest Neighbor (CNN) methods showed that the best performance, measured by error value and data size reduction, is in most cases obtained by the ensemble methods.},
keywords = {Data Mining, Ensemble methods, Instance selection, Regression, SELECTED},
pubstate = {published},
tppubtype = {article}
}
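The second of the two ensemble-member families named in the abstract, converting the regression task into a multi-class classification task, lends itself to a short sketch: discretize the continuous target into bins, run a classification-style Edited Nearest Neighbor (ENN) filter on the discretized labels, and fuse several members by majority voting on which instances to keep. This is a hedged reconstruction of the general idea, not the authors' exact procedure; the bin counts, equal-frequency discretization, and voting rule are illustrative assumptions.

import numpy as np

def enn_keep_mask(X, y_class, k=3):
    # Edited Nearest Neighbor: keep an instance only if the plurality
    # of its k nearest neighbors shares its (discretized) class.
    n = len(X)
    keep = np.zeros(n, dtype=bool)
    for i in range(n):
        d = np.linalg.norm(X - X[i], axis=1)
        d[i] = np.inf                                # exclude the instance itself
        nn = np.argsort(d)[:k]
        votes = np.bincount(y_class[nn], minlength=y_class.max() + 1)
        keep[i] = votes.argmax() == y_class[i]
    return keep

def fused_enn_regression(X, y, bin_counts=(3, 5, 7), k=3):
    # Each ensemble member discretizes the target with a different bin
    # count; instances kept by a majority of members are retained.
    votes = np.zeros(len(X), dtype=int)
    for bins in bin_counts:
        edges = np.quantile(y, np.linspace(0, 1, bins + 1)[1:-1])
        y_class = np.digitize(y, edges)              # equal-frequency classes
        votes += enn_keep_mask(X, y_class, k=k)
    return votes > len(bin_counts) / 2

rng = np.random.default_rng(0)
X = rng.uniform(size=(200, 2))
y = X[:, 0] + 0.1 * rng.normal(size=200)             # noisy linear target
mask = fused_enn_regression(X, y)
print(f"kept {mask.sum()} of {len(X)} instances")

Voting over members built from different discretizations is one natural fusion scheme; the paper also evaluates members based on a distance threshold, and covers a Condensed Nearest Neighbor (CNN) variant alongside ENN.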
Arnaiz-González, Álvar; Díez-Pastor, José Francisco; Rodríguez, Juan José; García-Osorio, César
Instance selection for regression by discretization Journal Article
In: Expert Systems with Applications, 2016, ISSN: 0957-4174.
Links | BibTeX | Tags: Data Mining, Instance selection, Regression
@article{ArnaizGonzalez201669b,
title = {Instance selection for regression by discretization},
author = {Álvar Arnaiz-González and José Francisco Díez-Pastor and Juan José Rodríguez and César García-Osorio},
doi = {10.1016/j.eswa.2015.12.046},
issn = {0957-4174},
year = {2016},
date = {2016-01-01},
journal = {Expert Systems with Applications},
keywords = {Data Mining, Instance selection, Regression},
pubstate = {published},
tppubtype = {article}
}
Arnaiz-González, Álvar; Díez-Pastor, José Francisco; Rodríguez, Juan José; García-Osorio, César
Instance selection for regression: Adapting DROP Journal Article
In: Neurocomputing, vol. 201, pp. 66–81, 2016, ISSN: 0925-2312.
Abstract | Links | BibTeX | Tags: Data Mining, DROP, Instance selection, Noise filtering, Regression
@article{ArnaizGonzález2016,
title = {Instance selection for regression: Adapting DROP},
author = {Álvar Arnaiz-González and José Francisco Díez-Pastor and Juan José Rodríguez and César García-Osorio},
url = {http://www.sciencedirect.com/science/article/pii/S0925231216301953},
doi = {10.1016/j.neucom.2016.04.003},
issn = {0925-2312},
year = {2016},
date = {2016-01-01},
journal = {Neurocomputing},
volume = {201},
pages = {66–81},
abstract = {Machine Learning has two central processes of interest that captivate the scientific community: classification and regression. Although instance selection for classification has shown its usefulness and has been researched in depth, instance selection for regression has not followed the same path, and there are few published algorithms on the subject. In this paper, we propose that various adaptations of DROP, a well-known family of instance selection methods for classification, be applied to regression. Their behaviour is analysed using a broad range of datasets. Results are presented for four new proposals: the reduction they achieve in dataset size, the effect on error when several classifiers are trained with the reduced dataset, and their robustness against noise. This last aspect is especially important because, in real life, recorded data are often inexact and distorted by different causes: errors in the measurement tools, typos when writing results, outliers and spurious readings, corruption in files, etc. When datasets are small these problems can be corrected manually, but for very large datasets it is better to have automatic methods to deal with them. In the experimental part, the proposed methods are found to be quite robust to noise.},
keywords = {Data Mining, DROP, Instance selection, Noise filtering, Regression},
pubstate = {published},
tppubtype = {article}
}
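DROP (Decremental Reduction Optimization Procedure) removes an instance when its removal does not hurt the instances that count it among their nearest neighbors (its associates). The sketch below shows one way such a rule might carry over to regression, in the spirit of the adaptation the title describes; the absolute-error acceptance test is an illustrative assumption, not the exact criterion of any of the paper's four proposals.

import numpy as np

def knn_mean(X_pool, y_pool, x, k=3):
    # Plain k-NN regression: mean target of the k nearest pool instances.
    d = np.linalg.norm(X_pool - x, axis=1)
    return y_pool[np.argsort(d)[:k]].mean()

def drop1_regression(X, y, k=3):
    # DROP1-style pass: remove instance i if its associates (kept
    # instances that have i among their k nearest neighbors) are
    # predicted at least as well without it.
    kept = np.ones(len(X), dtype=bool)
    for i in range(len(X)):
        pool = np.where(kept)[0]
        others = pool[pool != i]
        associates = []
        for j in others:
            cand = pool[pool != j]                   # j's potential neighbors
            d = np.linalg.norm(X[cand] - X[j], axis=1)
            if i in cand[np.argsort(d)[:k]]:
                associates.append(j)
        if not associates:
            continue
        def total_error(idx_pool):
            return sum(abs(knn_mean(X[idx_pool[idx_pool != j]],
                                    y[idx_pool[idx_pool != j]],
                                    X[j], k) - y[j]) for j in associates)
        if total_error(others) <= total_error(pool): # no worse without i
            kept[i] = False
    return kept

rng = np.random.default_rng(1)
X = rng.uniform(size=(60, 1))
y = np.sin(3 * X[:, 0]) + 0.05 * rng.normal(size=60)
print(drop1_regression(X, y).sum(), "of", len(X), "instances kept")

In classification the acceptance test simply counts correctly classified associates; in regression it must compare prediction errors instead, and choosing that comparison is the crux of adapting DROP.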
Arnaiz-González, Álvar; Díez-Pastor, José Francisco; Rodríguez, Juan José; García-Osorio, César
Instance selection of linear complexity for big data Journal Article
In: Knowledge-Based Systems, vol. 107, pp. 83–95, 2016, ISSN: 0950-7051.
Abstract | Links | BibTeX | Tags: Big data, Data Mining, Data reduction, Hashing, Instance selection, Nearest neighbors, SELECTED
@article{ArnaizGonzálezLSHIS2016,
title = {Instance selection of linear complexity for big data},
author = {Álvar Arnaiz-González and José Francisco Díez-Pastor and Juan José Rodríguez and César García-Osorio},
url = {http://www.sciencedirect.com/science/article/pii/S0950705116301617},
doi = {10.1016/j.knosys.2016.05.056},
issn = {0950-7051},
year = {2016},
date = {2016-01-01},
journal = {Knowledge-Based Systems},
volume = {107},
pages = {83–95},
abstract = {Over recent decades, database sizes have grown considerably. Larger sizes present new challenges, because machine learning algorithms are not prepared to process such large volumes of information. Instance selection methods can alleviate this problem when the size of the data set is medium to large. However, even these methods face similar problems with very large-to-massive data sets. In this paper, two new algorithms with linear complexity for instance selection purposes are presented. Both algorithms use locality-sensitive hashing to find similarities between instances. While the complexity of conventional methods (usually quadratic, $O(n^2)$, or log-linear, $O(n \log n)$) means that they are unable to process large-sized data sets, the new proposal shows competitive results in terms of accuracy. Even more remarkably, it shortens execution time, as the proposal manages to reduce complexity and make it linear with respect to the data set size. The new proposal has been compared with some of the best known instance selection methods for testing and has also been evaluated on large data sets (up to a million instances).},
keywords = {Big data, Data Mining, Data reduction, Hashing, Instance selection, Nearest neighbors, SELECTED},
pubstate = {published},
tppubtype = {article}
}
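The linear-complexity claim in the abstract rests on locality-sensitive hashing: each instance is hashed once, so similar instances land in the same bucket without any pairwise distance computations. Below is a minimal random-hyperplane sketch of that idea that keeps one representative per (bucket, class) pair; the hashing scheme and the one-per-bucket retention policy are illustrative assumptions, not the two algorithms actually proposed in the paper.

import numpy as np

def lsh_selection(X, y, n_planes=8, seed=0):
    # Signature: sign pattern of projections onto random hyperplanes.
    # Nearby instances are likely to share a signature (a bucket).
    rng = np.random.default_rng(seed)
    planes = rng.normal(size=(n_planes, X.shape[1]))
    bits = (X @ planes.T) > 0                        # shape (n, n_planes)
    buckets = bits.astype(np.int64) @ (1 << np.arange(n_planes))
    # Single O(n) pass: keep the first instance seen per (bucket, class).
    keep = np.zeros(len(X), dtype=bool)
    seen = set()
    for i, key in enumerate(zip(buckets, y)):
        if key not in seen:
            seen.add(key)
            keep[i] = True
    return keep

rng = np.random.default_rng(2)
X = rng.normal(size=(100000, 5))
y = (X[:, 0] > 0).astype(int)
mask = lsh_selection(X, y)
print(f"kept {mask.sum()} of {len(X)} instances")

With eight hyperplanes there are at most 2^8 = 256 buckets per class, so this pass keeps at most 512 instances however large n grows; a practical method would tune the number of planes, or keep several representatives per bucket, to control the reduction rate.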
2014
Arnaiz-González, Álvar; Díez-Pastor, José Francisco; García-Osorio, César; Rodríguez, Juan José
Selección de instancias en regresión mediante discretización Proceedings Article
In: XVII Congreso Español sobre Tecnologías y Lógica Fuzzy, ESTYLF 2014, pp. 351-356, Zaragoza, Spain, 2014, ISBN: 978-84-15688-76-1.
BibTeX | Tags: Data Mining, Instance selection
@inproceedings{ESTYLF2014b,
title = {Selección de instancias en regresión mediante discretización},
author = {Álvar Arnaiz-González and José Francisco Díez-Pastor and César García-Osorio and Juan José Rodríguez},
isbn = {978-84-15688-76-1},
year = {2014},
date = {2014-01-01},
booktitle = {XVII Congreso Español sobre Tecnologías y Lógica Fuzzy, ESTYLF 2014},
pages = {351-356},
address = {Zaragoza, Spain},
keywords = {Data Mining, Instance selection},
pubstate = {published},
tppubtype = {inproceedings}
}
2012
Arnaiz-González, Álvar; Díez-Pastor, José Francisco; García-Osorio, César; Rodríguez, Juan José
Tool for supporting the teaching of instance selection algorithms Proceedings Article
In: Proceedings of the 4th international conference on education and new learning technologies (EDULEARN12), pp. 6088–6096, IATED, Barcelona, Spain, 2012, ISSN: 2340-1117, (2nd-4th July 2012).
Links | BibTeX | Tags: Computer Science teaching, Instance selection
@inproceedings{ISBur:EDULEARN2012,
title = {Tool for supporting the teaching of instance selection algorithms},
author = {Álvar Arnaiz-González and José Francisco Díez-Pastor and César García-Osorio and Juan José Rodríguez},
url = {http://library.iated.org/view/ARNAIZGONZLEZ2012TOO},
issn = {2340-1117},
year = {2012},
date = {2012-07-02},
booktitle = {Proceedings of the 4th international conference on education and new learning technologies (EDULEARN12)},
pages = {6088–6096},
publisher = {IATED},
address = {Barcelona, Spain},
note = {2nd-4th July 2012},
keywords = {Computer Science teaching, Instance selection},
pubstate = {published},
tppubtype = {inproceedings}
}
Arnaiz-González, Álvar; Díez-Pastor, José Francisco; García-Osorio, César; Rodríguez, Juan José
Herramienta de apoyo a la docencia de algoritmos de selección de instancias Proceedings Article
In: Actas de las XVIII Jornadas de Enseñanza Universitaria de Informática (JENUI 2012), pp. 33–40, Ciudad Real, Spain, 2012, ISBN: 978-84-615-7157-4, (10-13 July 2012).
BibTeX | Tags: Computer Science teaching, Instance selection
@inproceedings{JENUI2012,
title = {Herramienta de apoyo a la docencia de algoritmos de selección de instancias},
author = {Álvar Arnaiz-González and José Francisco Díez-Pastor and César García-Osorio and Juan José Rodríguez},
isbn = {978-84-615-7157-4},
year = {2012},
date = {2012-07-10},
booktitle = {Actas de las XVIII Jornadas de Enseñanza Universitaria de Informática (JENUI 2012)},
pages = {33–40},
address = {Ciudad Real, Spain},
note = {10-13 July 2012},
keywords = {Computer Science teaching, Instance selection},
pubstate = {published},
tppubtype = {inproceedings}
}
2010
García-Osorio, César; Haro-García, Aida; García-Pedrajas, Nicolás
Democratic instance selection: A linear complexity instance selection algorithm based on classifier ensemble concepts Journal Article
In: Artificial Intelligence, vol. 174, no. 5-6, pp. 410–441, 2010, ISSN: 0004-3702.
Links | BibTeX | Tags: Big data, Data Mining, Instance selection
@article{1746771,
title = {Democratic instance selection: A linear complexity instance selection algorithm based on classifier ensemble concepts},
author = {César García-Osorio and Aida Haro-García and Nicolás García-Pedrajas},
doi = {10.1016/j.artint.2010.01.001},
issn = {0004-3702},
year = {2010},
date = {2010-01-01},
journal = {Artificial Intelligence},
volume = {174},
number = {5-6},
pages = {410–441},
publisher = {Elsevier Science Publishers Ltd.},
address = {Essex, UK},
keywords = {Big data, Data Mining, Instance selection},
pubstate = {published},
tppubtype = {article}
}
2007
García-Pedrajas, Nicolás; Romero-del-Castillo, Juan; García-Osorio, César
Boosting $k$-nearest neighbors classifiers by weighted evolutionary instance selection Proceedings Article
In: Actas de las I Jornadas sobre Algoritmos Evolutivos y Metaheurísticas (JAEM 2007), pp. 301–308, Zaragoza, Spain, 2007.
BibTeX | Tags: Classifier ensembles, Instance selection
@inproceedings{JAEM2007,
title = {Boosting $k$-nearest neighbors classifiers by weighted evolutionary instance selection},
author = {Nicolás García-Pedrajas and Juan Romero-del-Castillo and César García-Osorio},
year = {2007},
date = {2007-00-01},
booktitle = {Actas de las I Jornadas sobre Algoritmos Evolutivos y Metaheurísticas (JAEM 2007)},
pages = {301–308},
address = {Zaragoza, Spain},
keywords = {Classifier ensembles, Instance selection},
pubstate = {published},
tppubtype = {inproceedings}
}