2023
Barbero-Aparicio, José A.; Olivares-Gil, Alicia; Díez-Pastor, José F.; García-Osorio, César
Deep learning and support vector machines for transcription start site identification Journal Article
In: PeerJ Computer Science, vol. 9, art. e1340, 2023, ISSN: 2376-5992.
Abstract | Links | BibTeX | Tags: bioinformatics, Convolutional neural network, Deep learning, Long short-term memory, Machine learning, Support vector machines, transcription start site
@article{barbero-aparicio2023,
  title     = {Deep learning and support vector machines for transcription start site identification},
  author    = {José A. Barbero-Aparicio and Alicia Olivares-Gil and José F. Díez-Pastor and César García-Osorio},
  editor    = {Carlos Fernandez-Lozano},
  url       = {https://doi.org/10.7717/peerj-cs.1340},
  doi       = {10.7717/peerj-cs.1340},
  issn      = {2376-5992},
  year      = {2023},
  date      = {2023-04-17},
  urldate   = {2023-04-17},
  journal   = {PeerJ Computer Science},
  volume    = {9},
  pages     = {e1340},
  abstract  = {Recognizing transcription start sites is key to gene identification. Several approaches have been employed in related problems such as detecting translation initiation sites or promoters, many of the most recent ones based on machine learning. Deep learning methods have been proven to be exceptionally effective for this task, but their use in transcription start site identification has not yet been explored in depth. Also, the very few existing works do not compare their methods to support vector machines (SVMs), the most established technique in this area of study, nor provide the curated dataset used in the study. The reduced amount of published papers in this specific problem could be explained by this lack of datasets. Given that both support vector machines and deep neural networks have been applied in related problems with remarkable results, we compared their performance in transcription start site predictions, concluding that SVMs are computationally much slower, and deep learning methods, specially long short-term memory neural networks (LSTMs), are best suited to work with sequences than SVMs. For such a purpose, we used the reference human genome GRCh38. Additionally, we studied two different aspects related to data processing: the proper way to generate training examples and the imbalanced nature of the data. Furthermore, the generalization performance of the models studied was also tested using the mouse genome, where the LSTM neural network stood out from the rest of the algorithms. To sum up, this article provides an analysis of the best architecture choices in transcription start site identification, as well as a method to generate transcription start site datasets including negative instances on any species available in Ensembl. We found that deep learning methods are better suited than SVMs to solve this problem, being more efficient and better adapted to long sequences and large amounts of data.
We also create a transcription start site (TSS) dataset large enough to be used in deep learning experiments},
  keywords  = {bioinformatics, Convolutional neural network, Deep learning, Long short-term memory, Machine learning, Support vector machines, transcription start site},
  pubstate  = {published},
  tppubtype = {article}
}
2022
Barbero-Aparicio, José Antonio; Cuesta-Lopez, Santiago; García-Osorio, César Ignacio; Pérez-Rodríguez, Javier; García-Pedrajas, Nicolás
Nonlinear physics opens a new paradigm for accurate transcription start site prediction Journal Article
In: BMC Bioinformatics, vol. 23, art. 565, 2022, ISSN: 1471-2105.
Abstract | Links | BibTeX | Tags: DNA breathing, DNA modelling, Machine learning, String kernels, SVM, TSS prediction
@article{Barbero-Aparicio2022,
  title     = {Nonlinear physics opens a new paradigm for accurate transcription start site prediction},
  author    = {José Antonio Barbero-Aparicio and Santiago Cuesta-Lopez and César Ignacio García-Osorio and Javier Pérez-Rodríguez and Nicolás García-Pedrajas},
  editor    = {José Manuel Benítez},
  url       = {https://doi.org/10.1186/s12859-022-05129-4},
  doi       = {10.1186/s12859-022-05129-4},
  issn      = {1471-2105},
  year      = {2022},
  date      = {2022-12-30},
  urldate   = {2022-12-30},
  journal   = {BMC Bioinformatics},
  volume    = {23},
  pages     = {565},
  abstract  = {There is evidence that DNA breathing (spontaneous opening of the DNA strands) plays a relevant role in the interactions of DNA with other molecules, and in particular in the transcription process. Therefore, having physical models that can predict these openings is of interest. However, this source of information has not been used before either in transcription start sites (TSSs) or promoter prediction. In this article, one such model is used as an additional information source that, when used by a machine learning (ML) model, improves the results of current methods for the prediction of TSSs. In addition, we provide evidence on the validity of the physical model, as it is able by itself to predict TSSs with high accuracy. This opens an exciting avenue of research at the intersection of statistical mechanics and ML, where ML models in bioinformatics can be improved using physical models of DNA as feature extractors.},
  keywords  = {DNA breathing, DNA modelling, Machine learning, String kernels, SVM, TSS prediction},
  pubstate  = {published},
  tppubtype = {article}
}
Ramos-Pérez, Ismael; Arnaiz-González, Álvar; Rodríguez, Juan José; García-Osorio, César
When is resampling beneficial for feature selection with imbalanced wide data? Journal Article
In: Expert Systems with Applications, vol. 188, pp. 116015, 2022, ISSN: 0957-4174.
Abstract | Links | BibTeX | Tags: Feature selection, High dimensional data, Machine learning, Unbalanced, Very low sample size, Wide data
@article{Ramos-Pérez2022,
  title     = {When is resampling beneficial for feature selection with imbalanced wide data?},
  author    = {Ismael Ramos-Pérez and Álvar Arnaiz-González and Juan José Rodríguez and César García-Osorio},
  url       = {https://www.sciencedirect.com/science/article/pii/S0957417421013622},
  doi       = {10.1016/j.eswa.2021.116015},
  issn      = {0957-4174},
  year      = {2022},
  date      = {2022-02-01},
  journal   = {Expert Systems with Applications},
  volume    = {188},
  pages     = {116015},
  abstract  = {This paper studies the effects that combinations of balancing and feature selection techniques have on wide data (many more attributes than instances) when different classifiers are used. For this, an extensive study is done using 14 datasets, 3 balancing strategies, and 7 feature selection algorithms. The evaluation is carried out using 5 classification algorithms, analyzing the results for different percentages of selected features, and establishing the statistical significance using Bayesian tests.
Some general conclusions of the study are that it is better to use RUS before the feature selection, while ROS and SMOTE offer better results when applied afterwards. Additionally, specific results are also obtained depending on the classifier used, for example, for Gaussian SVM the best performance is obtained when the feature selection is done with SVM-RFE before balancing the data with RUS.},
  keywords  = {Feature selection, High dimensional data, Machine learning, Unbalanced, Very low sample size, Wide data},
  pubstate  = {published},
  tppubtype = {article}
}
Some general conclusions of the study are that it is better to use RUS before the feature selection, while ROS and SMOTE offer better results when applied afterwards. Additionally, specific results are also obtained depending on the classifier used, for example, for Gaussian SVM the best performance is obtained when the feature selection is done with SVM-RFE before balancing the data with RUS.
2021
Juez-Gil, Mario; Arnaiz-González, Álvar; Rodríguez, Juan José; López-Nozal, Carlos; García-Osorio, César
Rotation Forest for Big Data Journal Article
In: Information Fusion, vol. 74, pp. 39-49, 2021, ISSN: 1566-2535.
Abstract | Links | BibTeX | Tags: Big data, Ensemble learning, Machine learning, Random forest, Rotation forest, Spark
@article{Juez-Gil2021,
  title     = {Rotation Forest for Big Data},
  author    = {Mario Juez-Gil and Álvar Arnaiz-González and Juan José Rodríguez and Carlos López-Nozal and César García-Osorio},
  url       = {https://www.sciencedirect.com/science/article/pii/S1566253521000634},
  doi       = {10.1016/j.inffus.2021.03.007},
  issn      = {1566-2535},
  year      = {2021},
  date      = {2021-10-01},
  journal   = {Information Fusion},
  volume    = {74},
  pages     = {39--49},
  abstract  = {The Rotation Forest classifier is a successful ensemble method for a wide variety of data mining applications. However, the way in which Rotation Forest transforms the feature space through PCA, although powerful, penalizes training and prediction times, making it unfeasible for Big Data. In this paper, a MapReduce Rotation Forest and its implementation under the Spark framework are presented. The proposed MapReduce Rotation Forest behaves in the same way as the standard Rotation Forest, training the base classifiers on a rotated space, but using a functional implementation of the rotation that enables its execution in Big Data frameworks. Experimental results are obtained using different cloud-based cluster configurations. Bayesian tests are used to validate the method against two ensembles for Big Data: Random Forest and PCARDE classifiers. Our proposal incorporates the parallelization of both the PCA calculation and the tree training, providing a scalable solution that retains the performance of the original Rotation Forest and achieves a competitive execution time (in average, at training, more than 3 times faster than other PCA-based alternatives). In addition, extensive experimentation shows that by setting some parameters of the classifier (i.e., bootstrap sample size, number of trees, and number of rotations), the execution time is reduced with no significant loss of performance using a small ensemble.},
  keywords  = {Big data, Ensemble learning, Machine learning, Random forest, Rotation forest, Spark},
  pubstate  = {published},
  tppubtype = {article}
}
Cerro, Azahara; Romero, Pablo E.; Yiğit, Okan; Bustillo, Andrés
Use of machine learning algorithms for surface roughness prediction of printed parts in polyvinyl butyral via fused deposition modeling Journal Article
In: The International Journal of Advanced Manufacturing Technology, 2021, ISSN: 0268-3768.
Abstract | Links | BibTeX | Tags: 3d printing, Fused deposition modeling, Fused filament fabrication, Machine learning, surface roughness, WEKA
@article{Cerro2021,
  title     = {Use of machine learning algorithms for surface roughness prediction of printed parts in polyvinyl butyral via fused deposition modeling},
  author    = {Azahara Cerro and Pablo E. Romero and Okan Yiğit and Andrés Bustillo},
  url       = {https://link.springer.com/article/10.1007/s00170-021-07300-2},
  doi       = {10.1007/s00170-021-07300-2},
  issn      = {0268-3768},
  year      = {2021},
  date      = {2021-05-25},
  journal   = {The International Journal of Advanced Manufacturing Technology},
  abstract  = {Machine learning algorithms for classification are employed in this study to generate different models that can predict the surface roughness of parts manufactured from polyvinyl butyral by means of Fused Deposition Modeling (FDM). Five input variables are defined (layer height, print speed, number of perimeters, wall angle, and extruder temperature), and 16 parts are 3D printed, each with three different surfaces (48 surfaces in total). The print values used to print each part were defined by a fractionated orthogonal experimental design. Using a perthometer, the average value of surface roughness, Ra, on each surface was obtained. From these experimental values, 40 models were trained and validated. The model with the best prediction results was the one generated by bagging and Multilayer Perceptron (BMLP), with a Kappa statistic of 0.9143. The input variables with the highest influence on the surface finish are the wall angle and the layer height.},
  keywords  = {3d printing, Fused deposition modeling, Fused filament fabrication, Machine learning, surface roughness, WEKA},
  pubstate  = {published},
  tppubtype = {article}
}
2020
Díez-Pastor, José Francisco; Latorre-Carmona, Pedro; Arnaiz-González, Álvar; Ruiz-Pérez, Javier; Zurro, Débora
“You Are Not My Type”: An Evaluation of Classification Methods for Automatic Phytolith Identification Journal Article
In: Microscopy and Microanalysis, vol. 26, pp. 1158-1167, 2020, ISSN: 1431-9276.
Abstract | Links | BibTeX | Tags: Feature extraction, Machine learning, Microfossils, Morphometry, Proxy
@article{Díez-Pastor2020,
  title     = {“You Are Not My Type”: An Evaluation of Classification Methods for Automatic Phytolith Identification},
  author    = {José Francisco Díez-Pastor and Pedro Latorre-Carmona and Álvar Arnaiz-González and Javier Ruiz-Pérez and Débora Zurro},
  url       = {https://www.cambridge.org/core/journals/microscopy-and-microanalysis/article/you-are-not-my-type-an-evaluation-of-classification-methods-for-automatic-phytolith-identification/48F88E9407086B797BBE383B8BC15904},
  doi       = {10.1017/S1431927620024629},
  issn      = {1431-9276},
  year      = {2020},
  date      = {2020-11-10},
  journal   = {Microscopy and Microanalysis},
  volume    = {26},
  pages     = {1158--1167},
  abstract  = {Phytoliths can be an important source of information related to environmental and climatic change, as well as to ancient plant use by humans, particularly within the disciplines of paleoecology and archaeology. Currently, phytolith identification and categorization is performed manually by researchers, a time-consuming task liable to misclassifications. The automated classification of phytoliths would allow the standardization of identification processes, avoiding possible biases related to the classification capability of researchers. This paper presents a comparative analysis of six classification methods, using digitized microscopic images to examine the efficacy of different quantitative approaches for characterizing phytoliths. A comprehensive experiment performed on images of 429 phytoliths demonstrated that the automatic phytolith classification is a promising area of research that will help researchers to invest time more efficiently and improve their recognition accuracy rate.},
  keywords  = {Feature extraction, Machine learning, Microfossils, Morphometry, Proxy},
  pubstate  = {published},
  tppubtype = {article}
}
Bustillo, Andrés; Reis, Roberto; Machado, Alisson R.; Pimenov, Danil Yurievich
Improving the accuracy of machine-learning models with data from machine test repetitions Journal Article
In: Journal of Intelligent Manufacturing, 2020, ISSN: 0956-5515.
Abstract | Links | BibTeX | Tags: Artificial intelligence, Brandsma facing tests, Ensembles, Machine learning, Tool geometry, Turning
@article{Bustillo2020,
  title     = {Improving the accuracy of machine-learning models with data from machine test repetitions},
  author    = {Andrés Bustillo and Roberto Reis and Alisson R. Machado and Danil Yurievich Pimenov},
  url       = {https://link.springer.com/article/10.1007/s10845-020-01661-3},
  doi       = {10.1007/s10845-020-01661-3},
  issn      = {0956-5515},
  year      = {2020},
  date      = {2020-09-17},
  journal   = {Journal of Intelligent Manufacturing},
  abstract  = {The modelling of machining processes by means of machine-learning algorithms is still based on principles that are especially adapted to mechanical approaches, in which very few inputs are varied with little repetition of experimental conditions. These principles might not be ideal to achieve accurate machine-learning models and they are certainly not aligned with the practicalities of industrial machining in factories. In this research the effect of a new strategy to improve machine-learning model accuracy is studied: experimental repetition. Tool-life prediction in the face-turning operations of AISI 1045 steel discs, depending on different cooling systems and tool geometries, is selected as a case study. Both the side rake and the relief angles of HSS tools are optimized using the Brandsma facing test under dry, MQL, and flooding conditions. Different machine-learning algorithms, such as regression trees, kNNs, artificial neural networks, and ensembles (bagging and Random Forest) are tested. On the one hand, the results of the study showed that artificial neural networks of Radial Basis Functions presented the highest model accuracy (11.4 mm RMSE), but required a very sensitive and complex tuning process. On the other hand, they demonstrated that ensembles, especially Random Forest, provided models with accuracy in the same range, but with no tuning procedure (12.8 mm RMSE). Secondly, the effect of an increased dataset size, by means of experimental repetition, is evaluated and compared with traditional experimental modelling that used average values. The results showed that some machine-learning techniques, including both ensemble types, significantly improved their accuracy with this strategy, by up to 23%. The results therefore suggested that the use of raw experimental data, rather than their averaged values, can achieve machine-learning models of higher accuracy for tool-wear processes.},
  keywords  = {Artificial intelligence, Brandsma facing tests, Ensembles, Machine learning, Tool geometry, Turning},
  pubstate  = {published},
  tppubtype = {article}
}
2018
Gunn, Iain A D; Arnaiz-González, Álvar; Kuncheva, Ludmila I
A taxonomic look at instance-based stream classifiers Journal Article
In: Neurocomputing, vol. 286, pp. 167-178, 2018, ISSN: 0925-2312.
Abstract | Links | BibTeX | Tags: Concept drift, Instance selection, Machine learning, Prototype generation, Stream classification
@article{Gunn2018,
  title     = {A taxonomic look at instance-based stream classifiers},
  author    = {Iain A D Gunn and Álvar Arnaiz-González and Ludmila I Kuncheva},
  url       = {https://www.sciencedirect.com/science/article/pii/S092523121830095X},
  doi       = {10.1016/j.neucom.2018.01.062},
  issn      = {0925-2312},
  year      = {2018},
  date      = {2018-04-19},
  journal   = {Neurocomputing},
  volume    = {286},
  pages     = {167--178},
  abstract  = {Large numbers of data streams are today generated in many fields. A key challenge when learning from such streams is the problem of concept drift. Many methods, including many prototype methods, have been proposed in recent years to address this problem. This paper presents a refined taxonomy of instance selection and generation methods for the classification of data streams subject to concept drift. The taxonomy allows discrimination among a large number of methods which pre-existing taxonomies for offline instance selection methods did not distinguish. This makes possible a valuable new perspective on experimental results, and provides a framework for discussion of the concepts behind different algorithm-design approaches. We review a selection of modern algorithms for the purpose of illustrating the distinctions made by the taxonomy. We present the results of a numerical experiment which examined the performance of a number of representative methods on both synthetic and real-world data sets with and without concept drift, and discuss the implications for the directions of future research in light of the taxonomy. On the basis of the experimental results, we are able to give recommendations for the experimental evaluation of algorithms which may be proposed in the future.},
  keywords  = {Concept drift, Instance selection, Machine learning, Prototype generation, Stream classification},
  pubstate  = {published},
  tppubtype = {article}
}
Güemes-Peña, Diego; López-Nozal, Carlos; Marticorena-Sánchez, Raúl; Maudes-Raedo, Jesús
Emerging topics in mining software repositories Journal Article
In: Progress in Artificial Intelligence, pp. 1-11, 2018, ISSN: 2192-6360.
Abstract | Links | BibTeX | Tags: Data Mining, Machine learning, Software engineering, Software process, Software repository
@article{Güemes-Peña2018,
  title     = {Emerging topics in mining software repositories},
  author    = {Diego Güemes-Peña and Carlos López-Nozal and Raúl Marticorena-Sánchez and Jesús Maudes-Raedo},
  url       = {https://link.springer.com/content/pdf/10.1007/s13748-018-0147-7.pdf},
  doi       = {10.1007/s13748-018-0147-7},
  issn      = {2192-6360},
  year      = {2018},
  date      = {2018-01-01},
  journal   = {Progress in Artificial Intelligence},
  pages     = {1--11},
  abstract  = {A software process is a set of related activities that culminates in the production of a software package: specification, design, implementation, testing, evolution into new versions, and maintenance. There are also other supporting activities such as configuration and change management, quality assurance, project management, evaluation of user experience, etc. Software repositories are infrastructures to support all these activities. They can be composed with several systems that include code change management, bug tracking, code review, build system, release binaries, wikis, forums, etc. This position paper on mining software repositories presents a review and a discussion of research in this field over the past decade. We also identify applied machine learning strategies, current working topics, and future challenges for the improvement of company decision-making systems. Machine learning is defined as the process of discovering patterns in data. It can be applied to software repositories, since every change is recorded as data. Companies can then use these patterns as the basis for their decision-making systems and for knowledge discovery.},
  keywords  = {Data Mining, Machine learning, Software engineering, Software process, Software repository},
  pubstate  = {published},
  tppubtype = {article}
}