@article {792, title = {Decomposition-Fusion for Label Distribution Learning}, journal = {Information Fusion}, volume = {66}, year = {2021}, note = {TIN2017-89517-P}, month = {02/2021}, pages = {64-75}, abstract = {Label Distribution Learning (LDL) is a general learning framework that assigns an instance to a distribution over a set of labels rather than to a single label or multiple labels. Current LDL methods have proven their effectiveness in many real-life machine learning applications. However, LDL is a generalization of the classification task and, as such, it is exposed to the same problems as standard classification algorithms, including class imbalance, noise, overlapping classes and irregularities. The purpose of this paper is to mitigate these effects by using decomposition strategies. The technique devised, called Decomposition-Fusion for LDL (DF-LDL), is based on one of the most renowned decomposition strategies: the One-vs-One scheme, which we adapt to deal with LDL datasets. In addition, we propose a competent fusion method that allows us to discard non-competent classifiers when their output is probably not of interest. The effectiveness of the proposed DF-LDL method is verified on several real-world LDL datasets, on which we have carried out two types of experiments: first, comparing our proposal with the base learners and, second, comparing it with state-of-the-art LDL algorithms. DF-LDL shows significant improvements in both experiments.}, keywords = {Decomposition strategies, Label Distribution Learning, machine learning, One vs. One}, doi = {https://doi.org/10.1016/j.inffus.2020.08.024}, author = {Gonzalez, M and Germ{\'a}n Gonz{\'a}lez-Almagro and Triguero, Isaac and J. R. Cano and Garc{\'\i}a, Salvador} } @article {793, title = {Enhancing instance-level constrained clustering through differential evolution}, journal = {Applied Soft Computing}, volume = {108}, number = {107435}, year = {2021}, note = {TIN2017-89517-P; PP2019.PRI.I.06.}, pages = {1-19}, abstract = {Clustering has always been a powerful tool in knowledge discovery. Traditionally unsupervised, it received renewed attention when it was shown to produce better results when provided with new types of information, thus leading to a new kind of semi-supervised learning: constrained clustering. This technique is a generalization of traditional clustering that considers additional information encoded by constraints. Constraints can be given in the form of instance-level must-link and cannot-link constraints, which this paper focuses on. We propose the first application of Differential Evolution to the constrained clustering problem, which has proven to produce a better exploration{\textendash}exploitation trade-off when compared with previous approaches. We compare the results obtained by this proposal to those obtained by previous nature-inspired techniques and by some of the state-of-the-art algorithms on 25 datasets with incremental levels of constraint-based information, supporting our conclusions with the aid of Bayesian statistical tests.}, keywords = {Cannot-link, constrained clustering, Differential evolution, Instance-level, Must-link}, doi = {https://doi.org/10.1016/j.asoc.2021.107435}, author = {Germ{\'a}n Gonz{\'a}lez-Almagro and Luengo, Juli{\'a}n and J. R. Cano and Garc{\'\i}a, Salvador} }
@article {791, title = {Synthetic Sample Generation for Label Distribution Learning}, journal = {Information Sciences}, volume = {544}, year = {2021}, note = {TIN2017-89517-P}, month = {01/2021}, pages = {197-213}, abstract = {Label Distribution Learning (LDL) is a general learning framework that assigns an instance to a distribution over a set of labels rather than to a single label or multiple labels. Current LDL methods have proven their effectiveness in many machine learning applications. Since the first formulation of the LDL problem, numerous studies have been carried out that apply the LDL methodology to various real-life problems. Others have focused more specifically on the proposal of new algorithms. The purpose of this article is to start addressing the LDL problem from the data pre-processing stage. The baseline hypothesis is that, due to the high dimensionality of existing LDL data sets, it is very likely that these data will be incomplete and/or that poor data quality will lead to poor performance once the learning algorithms are applied. In this paper, we propose an oversampling method, which builds a superset of the original dataset by creating new instances from existing ones. Then, we apply already existing algorithms to the pre-processed training set in order to validate the efficacy of our method. The effectiveness of the proposed SSG-LDL is verified on several LDL datasets, showing significant performance improvements for state-of-the-art LDL methods.}, keywords = {Data pre-processing, Label Distribution Learning, machine learning, Oversampling}, doi = {https://doi.org/10.1016/j.ins.2020.07.071}, author = {Gonzalez, M and Luengo, Juli{\'a}n and J. R. Cano and Garc{\'\i}a, Salvador} } @conference {790, title = {Agglomerative Constrained Clustering Through Similarity and Distance Recalculation}, booktitle = {International Conference on Hybrid Artificial Intelligence Systems}, year = {2020}, pages = {424-436}, abstract = {Constrained clustering has become a topic of considerable interest in machine learning, as it has been shown to produce promising results in domains where only partial information about how to solve the problem is available. Constrained clustering can be viewed as a semi-supervised generalization of clustering, which is traditionally unsupervised. It is able to leverage a new type of information encoded by constraints that guide the clustering process. In particular, this study focuses on instance-level must-link and cannot-link constraints. We propose an agglomerative constrained clustering algorithm, which combines distance-based and clustering-engine adapting methods to incorporate constraints into the partitioning process. It computes a similarity measure on the basis of distances (in the dataset) and constraints (in the constraint set) and later applies an agglomerative clustering method whose clustering engine has been adapted to consider constraints and raw distances. We demonstrate its capability to produce quality results for the constrained clustering problem by comparing its performance to previous proposals on several datasets with incremental levels of constraint-based information.}, keywords = {Agglomerative clustering, constrained clustering, Semi-supervised learning, Similarity recalculation}, doi = {https://doi.org/10.1007/978-3-030-61705-9_35}, author = {Germ{\'a}n Gonz{\'a}lez-Almagro and Juan Luis Suarez and Luengo, Juli{\'a}n and J. R. Cano and Garc{\'\i}a, Salvador} }
@article {789, title = {DILS: Constrained clustering through dual iterative local search}, journal = {Computers \& Operations Research}, volume = {121}, year = {2020}, note = {TIN2017-89517-P; PP2016.PRI.I.02.}, pages = {104979}, abstract = {Clustering has always been a powerful tool in knowledge discovery. Traditionally unsupervised, it has received renewed attention recently, as it has been shown to produce better results when provided with new types of information, thus leading to a new kind of semi-supervised learning: constrained clustering. This technique is a generalization of traditional clustering that considers additional information encoded by constraints. Constraints can be given in the form of instance-level must-link and cannot-link constraints, which is the focus of this paper. We propose a new metaheuristic algorithm, the Dual Iterative Local Search, and demonstrate its ability to produce quality results for the constrained clustering problem. We compare the results obtained by this proposal to those obtained by the state-of-the-art algorithms on 25 datasets with incremental levels of constraint-based information, supporting our conclusions with the aid of Bayesian statistical tests.}, keywords = {Cannot-link, constrained clustering, Dual iterative local search, Instance-level, Must-link}, doi = {https://doi.org/10.1016/j.cor.2020.104979}, author = {Germ{\'a}n Gonz{\'a}lez-Almagro and Luengo, Juli{\'a}n and J. R. Cano and Garc{\'\i}a, Salvador} } @conference {788, title = {Improving constrained clustering via decomposition-based multiobjective optimization with memetic elitism}, booktitle = {GECCO {\textquoteright}20: Proceedings of the 2020 Genetic and Evolutionary Computation Conference}, year = {2020}, note = {TIN2017-89517-P; PP2016.PRI.I.02.}, month = {06/2020}, pages = {333{\textendash}341}, abstract = {Clustering has always been a topic of interest in knowledge discovery, as it is able to provide valuable information within the unsupervised machine learning framework. It received renewed attention when it was shown to produce better results in environments where partial information about how to solve the problem is available, thus leading to a new machine learning paradigm: semi-supervised machine learning. This new type of information can be given in the form of constraints, which guide the clustering process towards quality solutions. In particular, this study considers pairwise instance-level must-link and cannot-link constraints. Given the ill-posed nature of the constrained clustering problem, we approach it from the multiobjective optimization point of view. Our proposal consists of a memetic elitist evolutionary strategy that favors exploitation by applying a local search procedure to the elite of the population and transferring its results only to the external population, which is also used to generate new individuals. We show the capability of this method to produce quality results for the constrained clustering problem when considering incremental levels of constraint-based information.
For the comparison with state-of-the-art methods, we include previous multi-objective approaches, single-objective genetic algorithms and classic constrained clustering methods.}, keywords = {constrained clustering, memetic elitist MOEA, multiobjective optimization, pairwise instance-level constraints, Semi-supervised learning}, doi = {https://doi.org/10.1145/3377930.3390187}, author = {Germ{\'a}n Gonz{\'a}lez-Almagro and Rosales-P{\'e}rez, Alejandro and Luengo, Juli{\'a}n and J. R. Cano and Garc{\'\i}a, Salvador} } @article {786, title = {ProLSFEO-LDL: Prototype Selection and Label-Specific Feature Evolutionary Optimization for Label Distribution Learning}, journal = {Applied Sciences}, volume = {10}, year = {2020}, note = {TIN2017-89517-P}, pages = {3089}, abstract = {Label Distribution Learning (LDL) is a general learning framework that assigns an instance to a distribution over a set of labels rather than to a single label or multiple labels. Current LDL methods have proven their effectiveness in many real-life machine learning applications. In LDL problems, instance-based algorithms, particularly the adapted version of the k-nearest neighbors method for LDL (AA-kNN), have proven to be very competitive, achieving acceptable results and allowing an explainable model. However, AA-kNN suffers from several handicaps: it has large storage requirements, it is inefficient at prediction time and it presents a low tolerance to noise. The purpose of this paper is to mitigate these effects by adding a data reduction stage. The technique devised, called Prototype Selection and Label-Specific Feature Evolutionary Optimization for LDL (ProLSFEO-LDL), is a novel method to simultaneously address the prototype selection and the label-specific feature selection pre-processing techniques. Both techniques pose a complex optimization problem with a huge search space. Therefore, we have proposed a search method based on evolutionary algorithms that allows us to obtain a solution to both problems in a reasonable time. The effectiveness of the proposed ProLSFEO-LDL method is verified on several real-world LDL datasets, showing significant improvements compared with using the raw datasets.}, doi = {https://doi.org/10.3390/app10093089}, author = {Gonzalez, M and J. R. Cano and Garc{\'\i}a, Salvador} } @article {787, title = {Similarity-based and Iterative Label Noise Filters for Monotonic Classification}, journal = {Proceedings of the 53rd Hawaii International Conference on System Sciences}, year = {2020}, note = {TIN2017-89517-P; TEC2015-69496-R; BigDaP-TOOLS - Ayudas Fundaci{\'o}n BBVA a Equipos de Investigaci{\'o}n Cient{\'\i}fica 2016}, pages = {1698-1706}, abstract = {Monotonic ordinal classification has received increasing interest in recent years. Building monotone models for these problems usually requires datasets that satisfy monotonic relationships among the samples. When the monotonic relationships are not met, changing the labels may be a viable option, but the risk is high: wrong label changes would completely change the information contained in the data. In this work, we tackle the construction of monotone datasets by removing the wrong or noisy examples that violate monotonicity restrictions. We propose two monotonic noise filtering algorithms to preprocess the ordinal datasets and improve the monotonic relations between instances.
The experiments are carried out over eleven ordinal datasets, showing that the application of the proposed filters improves the prediction capabilities over different levels of noise.}, keywords = {Monotonic classification, noise, noise filter, Ordinal classification, Soft Computing: Theory Innovations and Problem Solving Benefits}, doi = {https://doi.org/10.24251/HICSS.2020.210}, author = {J. R. Cano and Luengo, Juli{\'a}n and Garc{\'\i}a, Salvador} } @article {785, title = {Monotonic classification: An overview on algorithms, performance measures and data sets}, journal = {Neurocomputing}, volume = {341}, year = {2019}, note = {TIN2017-89517-P; TIN2015-70308-REDT; TIN2014-54583-C2-1-R; TEC2015-69496-R}, month = {05/2019}, pages = {168-182}, abstract = {Currently, knowledge discovery in databases is an essential first step when identifying valid, novel and useful patterns for decision making. There are many real-world scenarios, such as bankruptcy prediction, option pricing or medical diagnosis, where the classification models to be learned need to fulfill restrictions of monotonicity (i.e. the target class label should not decrease when input attribute values increase). For instance, it is rational to assume that a higher debt ratio of a company should never result in a lower level of bankruptcy risk. Consequently, there is a growing interest from the data mining research community concerning monotonic predictive models. This paper aims to present an overview of the literature in the field, analyzing existing techniques and proposing a taxonomy of the algorithms based on the type of model generated. For each method, we review the quality metrics considered in the evaluation and the different data sets and monotonic problems used in the analysis. In this way, this paper serves as an overview of monotonic classification research in the specialized literature and can be used as a functional guide for the field.}, keywords = {Monotonic classification, Monotonic data sets, Ordinal classification, Performance metrics, Software, Taxonomy}, doi = {https://doi.org/10.1016/j.neucom.2019.02.024}, author = {J. R. Cano and Pedro Antonio Guti{\'e}rrez and Bartosz Krawczyk and Micha{\l} Wo{\'z}niak and Garc{\'\i}a, Salvador} } @article {796, title = {Smartdata: Data preprocessing to achieve smart data in R}, journal = {Neurocomputing}, volume = {360}, year = {2019}, note = {BigDaP-TOOLS - Ayudas Fundaci{\'o}n BBVA a Equipos de Investigaci{\'o}n Cient{\'\i}fica 2016}, month = {09/2019}, pages = {1-13}, abstract = {As the amount of available data grows exponentially, data scientists are aware that finding the value in the data is key to successfully exploiting it. However, the data rarely presents itself in an ordered, clean way. In opposition to raw data, the term smart data is becoming more and more visible, both in the specialized literature and in companies. While publicly available software packages exist to deal with raw data, there is no unified framework that encompasses all the fields required to transform such raw data into smart data. In this paper, the novel smartdata package is introduced. Written in R and available in the CRAN repository, it includes the most recent and relevant algorithms to treat raw data from multiple perspectives, now unified under a simple yet powerful API, which enables data scientists to easily pipeline their applications.
The main features of the package, as well as some illustrative examples of its use, are detailed throughout this manuscript.}, keywords = {Data preprocessing, machine learning, Preprocessing, Smart data}, doi = {https://doi.org/10.1016/j.neucom.2019.06.006}, author = {I. Cordon and Luengo, Juli{\'a}n and Garc{\'\i}a, Salvador and F. Herrera and Francisco Charte} } @article {552, title = {CommuniMents: A Framework for Detecting Community Based Sentiments for Events}, journal = {International Journal on Semantic Web and Information Systems}, volume = {13}, year = {2017}, pages = {87-108}, author = {Jarwar, Muhammad Aslam and Abbasi, Rabeeh Ayaz and Mushtaq, Mubashar and Maqbool, Onaiza and Aljohani, Naif R and Daud, Ali and Alowibdi, Jalal S and J. R. Cano and Garc{\'\i}a, Salvador and Chong, Ilyoung} } @article {610, title = {Making CN2-SD Subgroup Discovery Algorithm Scalable to Large Size Data Sets Using Instance Selection}, journal = {Expert Systems with Applications}, volume = {35}, number = {4}, year = {2008}, pages = {1949-1965}, author = {J. R. Cano and F. Herrera and Lozano, Manuel and Garc{\'\i}a, Salvador} } @conference {article, title = {Un primer estudio sobre el uso de los sistemas de clasificaci{\'o}n basados en reglas difusas en problemas de clasificaci{\'o}n con clases no balanceadas}, booktitle = {XIV Congreso Espa{\~n}ol sobre tecnolog{\'\i}as y l{\'o}gica fuzzy}, year = {2006}, month = {01}, address = {Ciudad Real (Spain)}, author = {Fern{\'a}ndez, Alberto and Garc{\'\i}a, Salvador and F. Herrera and M. J. del Jesus} }