/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    CHAIDModelSelection.java
 *    Copyright (C) 2021 ALDAPA Team (http://www.aldapa.eus)
 *    Faculty of Informatics, Donostia, 20018
 *    University of the Basque Country (UPV/EHU), Basque Country
 *
 */

package weka.classifiers.trees.jchaid;

import java.util.Enumeration;

import weka.classifiers.trees.j48.C45ModelSelection;
import weka.classifiers.trees.j48.ClassifierSplitModel;
import weka.classifiers.trees.j48.Distribution;
import weka.classifiers.trees.j48.NoSplit;
import weka.core.Attribute;
import weka.core.Instances;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Utils;

/**
 * Class for selecting a CHAID-type split for a given dataset.
 * <p>
 * One {@link CHAIDSplit} candidate is built for every non-class attribute. Among
 * the candidates that report a usable model, the one with the smallest
 * {@code chiSquaredProb()} value is selected, provided that value does not
 * exceed the attribute significance level; otherwise a {@link NoSplit} model is
 * returned.
 *
 * @author Jes&uacute;s M. P&eacute;rez (txus.perez@ehu.eus)
 * @author Oscar Teixeira (oteixeira001@ikasle.ehu.es)
 * @version $Revision: 1.2 $
 */
public class CHAIDModelSelection extends C45ModelSelection {

	/** for serialization */
	private static final long serialVersionUID = -4401438489738631349L;

	/** Significance level for the selection of the attribute to split a node. */
	protected double m_sigLevelAtt = 0.05;

	/** Significance level for the search of the best combination of the categories of an attribute. */
	protected double m_sigLevelMergeSplit = 0.05;

	/**
	 * Indicates whether the search for the best binary split will be done after
	 * merging 3 or more categories. This process can add considerable latency,
	 * which is why it is optional.
	 */
	protected boolean m_searchBestSplit = true;

	/** Minimum number of instances required to split a node. */
	protected int m_minNumObjSplit = 3;

	/**
	 * Stores which attributes are ordinal (monotonic predictors).
	 * Based on the member m_DiscretizeCols of the
	 * weka.filters.supervised.attribute.Discretize class. May be {@code null},
	 * in which case no attribute is treated as ordinal.
	 */
	protected Range m_ordinalAtts;

	/**
	 * Initializes the split selection method with the given parameters.
	 * <p>
	 * The third argument of the superclass constructor is always passed as
	 * {@code true} (presumably the MDL-correction flag of
	 * {@code C45ModelSelection}; confirm against the Weka version in use).
	 *
	 * @param minNoObj minimum number of instances that have to occur in at least
	 *          two subsets induced by split
	 * @param allData FULL training dataset (necessary for selection of split
	 *          points).
	 * @param doNotMakeSplitPointActualValue if true, split point is not relocated
	 *          by scanning the entire dataset for the closest data value
	 * @param sigLevelAtt significance level for the selection of attributes
	 * @param sigLevelMergeSplit significance level for the best combination of categories
	 * @param searchBestSplit true if the search for the best binary split will be done
	 * @param minNumObjSplit minimum number of instances to split a node
	 * @param ordinalAtts range of ordinal attributes; {@code null} means none
	 */
	public CHAIDModelSelection(int minNoObj, Instances allData,
			boolean doNotMakeSplitPointActualValue,
			double sigLevelAtt, double sigLevelMergeSplit, boolean searchBestSplit, int minNumObjSplit, Range ordinalAtts) {
		super(minNoObj, allData, true, doNotMakeSplitPointActualValue);
		m_sigLevelAtt = sigLevelAtt;
		m_sigLevelMergeSplit = sigLevelMergeSplit;
		m_searchBestSplit = searchBestSplit;
		m_minNumObjSplit = minNumObjSplit;
		m_ordinalAtts = ordinalAtts;
	}

	/**
	 * Selects CHAID-type split for the given dataset.
	 *
	 * @param data the instances reaching the node for which a split is sought
	 * @return a {@link NoSplit} model when the node has fewer than
	 *         {@code m_minNumObjSplit} instances, when all instances belong to one
	 *         class, when no candidate is valid, or when the best candidate's
	 *         {@code chiSquaredProb()} exceeds {@code m_sigLevelAtt}; otherwise the
	 *         best {@link CHAIDSplit}. Returns {@code null} if an exception is
	 *         raised while building the models (the stack trace is printed).
	 */
	@Override
	public ClassifierSplitModel selectModel(Instances data) {

		try {

			// Check if all Instances belong to one class or if not
			// enough Instances to split.
			Distribution classDistribution = new Distribution(data);
			NoSplit noSplitModel = new NoSplit(classDistribution);
			if (Utils.sm(classDistribution.total(), m_minNumObjSplit)
					|| Utils.eq(classDistribution.total(),
							classDistribution.perClass(classDistribution.maxClass()))) {
				return noSplitModel;
			}

			// An attribute has "a lot of values" when its number of values is not
			// smaller than 30% of the full training set. Only meaningful when the
			// full training set is available.
			boolean haveAllData = (m_allData != null);
			double manyValuesThreshold = haveAllData ? 0.3 * m_allData.numInstances() : 0.0;

			// multiVal stays true only if every (non-class) attribute has a lot of values.
			boolean multiVal = true;
			if (haveAllData) {
				Enumeration<Attribute> attributes = data.enumerateAttributes();
				while (attributes.hasMoreElements()) {
					if (Utils.sm(attributes.nextElement().numValues(), manyValuesThreshold)) {
						multiVal = false;
						break;
					}
				}
			}

			int numAttributes = data.numAttributes();
			int classIndex = data.classIndex();

			// Slot of the class attribute is left null.
			CHAIDSplit[] candidates = new CHAIDSplit[numAttributes];
			int validModels = 0;

			// Build one candidate split per attribute, apart from the class attribute.
			for (int i = 0; i < numAttributes; i++) {
				if (i == classIndex) {
					continue;
				}

				candidates[i] = new CHAIDSplit(i, m_minNoObj,
						m_sigLevelAtt, m_sigLevelMergeSplit, m_searchBestSplit, isOrdinalAtt(i));
				candidates[i].buildClassifier(data);

				// Check if useful split for current attribute exists and
				// check for enumerated attributes with a lot of values.
				if (!candidates[i].checkModel()) {
					continue;
				}
				if (!haveAllData
						|| data.attribute(i).isNumeric()
						|| multiVal
						|| Utils.sm(data.attribute(i).numValues(), manyValuesThreshold)) {
					validModels++;
				}
			}

			// Check if any useful split was found.
			if (validModels == 0) {
				return noSplitModel;
			}

			// Find "best" attribute to split on: the smallest chi-squared
			// probability wins; on ties the attribute with the higher index is kept.
			CHAIDSplit bestModel = null;
			double minResult = Double.MAX_VALUE;
			for (int i = 0; i < numAttributes; i++) {
				if (i != classIndex && candidates[i].checkModel()) {
					double probability = candidates[i].chiSquaredProb();
					if (Utils.smOrEq(probability, minResult)) {
						bestModel = candidates[i];
						minResult = probability;
					}
				}
			}

			// Check if the best split is significant enough.
			if (Utils.gr(minResult, m_sigLevelAtt)) {
				return noSplitModel;
			}

			return bestModel;
		} catch (Exception e) {
			// Failure is reported on stderr and signalled to the caller with null.
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Indicates if i_att attribute is marked as ordinal.
	 * <p>
	 * When no range of ordinal attributes was supplied ({@code null}), no
	 * attribute is considered ordinal.
	 *
	 * @param i_att Index of attribute
	 * @return true if ordinal, false otherwise
	 */
	public boolean isOrdinalAtt(int i_att) {
		return m_ordinalAtts != null && m_ordinalAtts.isInRange(i_att);
	}

	/**
	 * Returns the revision string.
	 *
	 * @return the revision
	 */
	@Override
	public String getRevision() {
		return RevisionUtils.extract("$Revision: 1.1 $");
	}
}
