@article{OTERO201642,
  title    = {Finding informative code metrics under uncertainty for predicting the pass rate of online courses},
  journal  = {Information Sciences},
  volume   = {373},
  year     = {2016},
  pages    = {42--56},
  abstract = {A method is proposed for predicting the pass rate of a Computer Science course. Input data comprises different software metrics that are evaluated on a set of programs, comprising students{\textquoteright} answers to a list of computing challenges proposed by the course instructor. Different kinds of uncertainty are accepted, including missing answers and multiple responses to the same challenge. The most informative metrics are selected according to an extension to vague data of the observed Fisher information. The proposed method was tested on experimental data collected during two years at Oviedo University. Yearly changes in the pass rate of two groups were accurately predicted on the basis of 7 software metrics. 73 volunteer students and 1500 source files were involved in the experimentation.},
  keywords = {Automatic grading, feature selection, Genetic Fuzzy Systems, Low Quality Data, vague data},
  issn     = {0020-0255},
  doi      = {10.1016/j.ins.2016.08.090},
  url      = {http://www.sciencedirect.com/science/article/pii/S0020025516306715},
  author   = {Jos{\'e} Otero and Luis Junco and Rosario Su{\'a}rez and Ana Palacios and In{\'e}s Couso and Luciano S{\'a}nchez}
}

@article{SANCHEZ2008607,
  title    = {Mutual information-based feature selection and partition design in fuzzy rule-based classifiers from vague data},
  journal  = {International Journal of Approximate Reasoning},
  volume   = {49},
  number   = {3},
  year     = {2008},
  pages    = {607--622},
  abstract = {Algorithms for preprocessing databases with incomplete and imprecise data are seldom studied. For the most part, we lack numerical tools to quantify the mutual information between fuzzy random variables. Therefore, these algorithms (discretization, instance selection, feature selection, etc.) have to use crisp estimations of the interdependency between continuous variables, whose application to vague datasets is arguable. In particular, when we select features for being used in fuzzy rule-based classifiers, we often use a mutual information-based ranking of the relevance of inputs. But, either with crisp or fuzzy data, fuzzy rule-based systems route the input through a fuzzification interface. The fuzzification process may alter this ranking, as the partition of the input data does not need to be optimal. In our opinion, to discover the most important variables for a fuzzy rule-based system, we want to compute the mutual information between the fuzzified variables, and we should not assume that the ranking between the crisp variables is the best one. In this paper we address these problems, and propose an extended definition of the mutual information between two fuzzified continuous variables. We also introduce a numerical algorithm for estimating the mutual information from a sample of vague data. We will show that this estimation can be included in a feature selection algorithm, and also that, in combination with a genetic optimization, the same definition can be used to obtain the most informative fuzzy partition for the data. Both applications will be exemplified with the help of some benchmark problems.},
  keywords = {feature selection, Fuzzy fitness, Genetic Fuzzy Systems, vague data},
  issn     = {0888-613X},
  doi      = {10.1016/j.ijar.2008.06.005},
  url      = {http://www.sciencedirect.com/science/article/pii/S0888613X08001102},
  author   = {Luciano S{\'a}nchez and M. Rosario Su{\'a}rez and J.R. Villar and In{\'e}s Couso}
}
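The S{\'a}nchez et al. (2008) entry above ranks input variables by the mutual information computed after the variables pass through the fuzzification interface of the classifier, rather than by a crisp estimate. The sketch below only illustrates that general idea for crisp samples and a discrete class label; it is not the vague-data algorithm of the paper. Each feature is projected onto a uniform triangular fuzzy partition, a soft contingency table is accumulated from the membership degrees, and features are ranked by the mutual information of that table. The function names and the choice of partition are assumptions made for the example.

# Illustrative sketch only: mutual information between a fuzzified feature and a
# class label, estimated from a soft contingency table. Names and the triangular
# partition are assumptions, not the published vague-data algorithm.
import numpy as np

def triangular_partition(x, n_terms=5):
    """Membership degrees of each sample in a uniform triangular fuzzy partition."""
    lo, hi = float(np.min(x)), float(np.max(x))
    centers = np.linspace(lo, hi, n_terms)
    width = (hi - lo) / (n_terms - 1) or 1.0          # guard against constant features
    memb = np.maximum(0.0, 1.0 - np.abs(x[:, None] - centers[None, :]) / width)
    return memb / memb.sum(axis=1, keepdims=True)     # rows sum to one (Ruspini partition)

def fuzzified_mutual_information(x, y, n_terms=5):
    """MI between a continuous feature (after fuzzification) and a discrete label y."""
    memb = triangular_partition(np.asarray(x, dtype=float), n_terms)
    labels = np.unique(y)
    # Soft contingency table: each sample spreads its unit mass over the linguistic terms.
    table = np.vstack([memb[y == c].sum(axis=0) for c in labels])
    p_xy = table / table.sum()
    p_x, p_y = p_xy.sum(axis=0), p_xy.sum(axis=1)
    nz = p_xy > 0
    return float(np.sum(p_xy[nz] * np.log(p_xy[nz] / (p_y[:, None] * p_x[None, :])[nz])))

def rank_features(X, y, n_terms=5):
    """Rank features by the mutual information of their fuzzified versions."""
    scores = [fuzzified_mutual_information(X[:, j], y, n_terms) for j in range(X.shape[1])]
    return np.argsort(scores)[::-1], scores

A genetic search over the partition parameters, as described in the paper, could simply wrap a score such as fuzzified_mutual_information as the fitness of candidate partitions; that wrapper is not shown here.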
@conference{4295665,
  title     = {Some Results about Mutual Information-based Feature Selection and Fuzzy Discretization of Vague Data},
  booktitle = {2007 IEEE International Fuzzy Systems Conference},
  year      = {2007},
  month     = {July},
  pages     = {1--6},
  abstract  = {Algorithms for preprocessing databases with incomplete and imprecise data are seldom studied, partly because we lack numerical tools to quantify the interdependency between fuzzy random variables. In particular, many filter-type feature selection algorithms rely on crisp discretizations for estimating the mutual information between continuous variables, effectively preventing the use of vague data. Fuzzy rule based systems pass continuous input variables, in turn, through their own fuzzification interface. In the context of feature selection, should we rank the relevance of the inputs by means of their mutual information, it might happen that an apparently informative variable is useless after having been codified as a fuzzy subset of our catalog of linguistic terms. In this paper we propose to address both problems by estimating the mutual information with the same set of fuzzy partitions that will be used to codify the antecedents of the fuzzy rules. That is to say, we introduce a numerical algorithm for estimating the mutual information between two fuzzified continuous variables. This algorithm can be included in certain feature selection algorithms, and can also be used to obtain the most informative fuzzy partition for the data. The use of our definition will be exemplified with the help of some benchmark problems.},
  keywords  = {codification, computational linguistics, Data preprocessing, Feature extraction, feature selection, fuzzification interface, fuzzy discretization, fuzzy random variables, fuzzy rule based systems, fuzzy set theory, Fuzzy sets, Fuzzy systems, Information filtering, Information filters, knowledge based systems, linguistic terms, mutual information, Partitioning algorithms, Random variables, Spatial databases, vague data},
  issn      = {1098-7584},
  doi       = {10.1109/FUZZY.2007.4295665},
  author    = {L. S{\'a}nchez and M. R. Suarez and J. R. Villar and I. Couso}
}

@article{CASILLAS2001135,
  title    = {Genetic feature selection in a fuzzy rule-based classification system learning process for high-dimensional problems},
  journal  = {Information Sciences},
  volume   = {136},
  number   = {1},
  year     = {2001},
  note     = {Recent Advances in Genetic Fuzzy Systems},
  pages    = {135--157},
  abstract = {The inductive learning of a fuzzy rule-based classification system (FRBCS) is made difficult by the presence of a large number of features that increases the dimensionality of the problem being solved. The difficulty comes from the exponential growth of the fuzzy rule search space with the increase in the number of features considered in the learning process. In this work, we present a genetic feature selection process that can be integrated in a multistage genetic learning method to obtain, in a more efficient way, FRBCSs composed of a set of comprehensible fuzzy rules with high-classification ability.
The proposed process fixes, a priori, the number of selected features, and therefore, the size of the search space of candidate fuzzy rules. The experimentation carried out, using Sonar example base, shows a significant improvement on simplicity, precision and efficiency achieved by adding the proposed feature selection processes to the multistage genetic learning method or to other learning methods.},
  keywords = {feature selection, Fuzzy reasoning methods, fuzzy rule-based classification systems, Inductive learning},
  issn     = {0020-0255},
  doi      = {10.1016/S0020-0255(01)00147-5},
  url      = {http://www.sciencedirect.com/science/article/pii/S0020025501001475},
  author   = {J Casillas and O. Cord{\'o}n and M. J. del Jesus and F. Herrera}
}
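The Casillas et al. (2001) entry fixes the number of selected features a priori and explores subsets of that size with a genetic algorithm before rule learning. The sketch below is a minimal, self-contained illustration of a fixed-size genetic subset search with a placeholder fitness; it is not the multistage method of the paper, which scores each candidate subset by the fuzzy rule base learned from it, and all names and parameters here are assumptions.

# Illustrative sketch only: genetic search over feature subsets of fixed size k.
# The fitness argument is a placeholder assumption; the cited paper evaluates each
# subset by training a fuzzy rule-based classifier on it.
import random

def genetic_feature_selection(n_features, k, fitness, generations=50, pop_size=30, seed=0):
    """Return the best subset of exactly k feature indices found by a simple GA."""
    rng = random.Random(seed)
    population = [tuple(sorted(rng.sample(range(n_features), k))) for _ in range(pop_size)]
    best = max(population, key=fitness)
    for _ in range(generations):
        scored = sorted(population, key=fitness, reverse=True)
        parents = scored[: pop_size // 2]                      # truncation selection
        children = []
        while len(children) < pop_size:
            a, b = rng.sample(parents, 2)
            pool = list(set(a) | set(b))                       # crossover: merge parent genes
            child = rng.sample(pool, k)                        # keep the subset size fixed at k
            if rng.random() < 0.2:                             # mutation: swap one feature
                out = rng.randrange(k)
                child[out] = rng.choice([f for f in range(n_features) if f not in child])
            children.append(tuple(sorted(child)))
        population = children
        best = max([best] + population, key=fitness)
    return best

# Toy usage with an assumed fitness that merely prefers low feature indices:
# best = genetic_feature_selection(60, 8, fitness=lambda subset: -sum(subset))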