/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    JCHAIDStar.java
 *    Copyright (C) 2021 ALDAPA Team (http://www.aldapa.eus)
 *    Faculty of Informatics, Donostia, 20018
 *    University of the Basque Country (UPV/EHU), Basque Country
 *
 */

package weka.classifiers.trees;

import weka.classifiers.Sourcable;
import weka.classifiers.trees.jchaidstar.CHAIDStarClassifierTree;
import weka.classifiers.trees.jchaidstar.CHAIDStarModelSelection;
import weka.core.AdditionalMeasureProducer;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.Drawable;
import weka.core.Instances;
import weka.core.Matchable;
import weka.core.OptionHandler;
import weka.core.PartitionGenerator;
import weka.core.Summarizable;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;

/**
 * <!-- globalinfo-start --> Class for generating a decision tree based on the CHAID* algorithm,
 * a modified version of the CHAID decision tree induction algorithm that also handles continuous features
 * and includes the same post-pruning mechanism used by C4.5.
 * For more information, see<br/>
 * <br/>
 * Igor Ibarguren and Aritz Lasarguren and Jes&uacute;s M. P&eacute;rez and Javier Muguerza and Ibai Gurrutxaga and Olatz Arbelaitz.
 * "BFPART: Best-First PART". Information Sciences (2016), Vol. 367-368, pp 927-952.
 * <a href="http://dx.doi.org/10.1016/j.ins.2016.07.023" target="_blank">http://dx.doi.org/10.1016/j.ins.2016.07.023</a>
 * <p/>
 * <br/>
 * G. V. Kass. "An Exploratory Technique for Investigating Large Quantities of 
 * Categorical Data". Journal of the Royal Statistical Society - Series C (Applied Statistics) (1980), 29(2), pp 119-127.
 * <a href="http://www.jstor.org/stable/2986296" target="_blank">http://www.jstor.org/stable/2986296</a>
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * <!-- technical-bibtex-start --> BibTeX:
 * <pre>
 * &#64;article{Ibarguren2016,
 *    author = "Igor Ibarguren and Aritz Lasarguren and Jes\'us M. P\'erez and Javier Muguerza and Ibai Gurrutxaga and Olatz Arbelaitz",
 *    title = "BFPART: Best-First PART",
 *    journal = "Information Sciences",
 *    volume = "367-368",
 *    pages = "927-952",
 *    year = "2016",
 *    doi = "10.1016/j.ins.2016.07.023",
 *    abstract = "In supervised classification, decision tree and rule induction algorithms possess
 *    the desired ability to build understandable models. The PART algorithm creates partially developed
 *    C4.5 decision trees and extracts a rule from each tree. Some of the criteria used by this algorithm
 *    can be modified to yield better results. In this work, we propose and compare 16 variants of the PART
 *    algorithm from the perspectives of discriminating capacity, complexity of the models, and the
 *    computational cost, for 36 real-world problems obtained from the UCI repository. The use of the
 *    Best-First optimization algorithm to find the next node to develop in a partial tree improves the
 *    results of the PART algorithm. The best-performing variant also ranks first when compared to the
 *    well-established C4.5 algorithm and a modified version of the CHAID decision tree induction algorithm
 *    that handles continuous features, which is also proposed in this paper. In order to study its
 *    performance in comparison to other rivals, this comparison of algorithms also includes the original
 *    PART algorithm. For all performance measures, we test the results for statistical significance using
 *    state-of-the-art methods."
 * }
 * </pre>
 * <p/>
 * <pre>
 * &#64;article{Kass1980,
 *    author = "G. V. Kass",
 *    title = "An Exploratory Technique for Investigating Large Quantities of Categorical Data",
 *    journal = "Journal of the Royal Statistical Society. Series C (Applied Statistics)",
 *    year = "1980",
 *    volume = "29 (2)",
 *    pages = "119-127",
 *    abstract = "The technique set out in the paper, CHAID, is an offshoot of AID
 *        (Automatic Interaction Detection) designed for a categorized dependent
 *        variable. Some important modifications which are relevant to standard
 *        AID include: built-in significance testing with the consequence of
 *        using the most significant predictor (rather than the most explanatory),
 *        multi-way splits (in contrast to binary) and a new type of predictor
 *        which is especially useful in handling missing information.",
 *    url = "http://www.jstor.org/stable/2986296"
 * }
 * </pre>
 * <p/>
 * <!-- technical-bibtex-end -->
 * 
 * <!-- options-start -->
 * Valid options are: <p/>
 * 
 * J48 options <br/>
 * =========== <br/>
 *
 * <pre>
 * -U
 *  Use unpruned tree.
 * </pre>
 * 
 * <pre>
 * -O
 *  Do not collapse tree.
 * </pre>
 * 
 * <pre>
 * -C &lt;pruning confidence&gt;
 *  Set confidence threshold for pruning.
 *  (default 0.25)
 * </pre>
 * 
 * <pre>
 * -M &lt;minimum number of instances&gt;
 *  Set minimum number of instances per leaf.
 *  (default 2)
 *  </pre>
 *  
 * <pre>
 * -S
 *  Don't perform subtree raising.
 * </pre>
 * 
 * <pre>
 * -L
 *  Do not clean up after the tree has been built.
 * </pre>
 * 
 * <pre>
 * -A
 *  Laplace smoothing for predicted probabilities.
 * </pre>
 * 
 * <pre>
 * -doNotMakeSplitPointActualValue
 *  Do not make split point actual value.
 * </pre>
 *  
 * CHAID options <br/>
 * ============= <br/>
 *
 * <pre> -CH-A &lt;attribute significance level&gt;
 *  Set the significance level for the selection of the attribute to split a node.
 *  (default 0.05)</pre>
 *
 * <pre> -CH-M &lt;merge-split significance level&gt;
 *  Set the significance level for the quest of the best combination of values of attributes.
 *  (default 0.05)</pre>
 *  
 * <pre> -CH-S &lt;minimum number of instances to split a node&gt;
 *  Set minimum number of instances to consider a node to be split.
 *  (default 3)</pre>
 *  
 * <pre> -CH-O &lt;att1,att2-att4,...&gt;
 *  Specifies list of attribute indexes to set as ordinal. First and last are valid indexes.
 *  Warning: The list of attributes includes the class!
 *  (default none)</pre>
 * 
 * <!-- options-end -->
 * 
 * @author Jes&uacute;s M. P&eacute;rez (txus.perez@ehu.eus)
 * @author Oscar Teixeira (oteixeira001@ikasle.ehu.es)
 * @version $Revision: 1.2 $
 */
public class JCHAIDStar extends JCHAID implements OptionHandler, Drawable,
Matchable, Sourcable, Summarizable,
AdditionalMeasureProducer, TechnicalInformationHandler, PartitionGenerator {

	/** for serialization */
	private static final long serialVersionUID = 5844369889575666221L;

	/**
	 * Constructor for JCHAIDStar in order to change the default value of the
	 *  Collapse Tree option of the J48 class.
	 */
	public JCHAIDStar(){
		m_unpruned = false;
		m_collapseTree = false;
		m_doNotMakeSplitPointActualValue = false;
	}

	/**
	 * Returns default capabilities of the classifier.
	 * 
	 * @return the capabilities of this classifier
	 */
	@Override
	public Capabilities getCapabilities() {
		Capabilities result = super.getCapabilities();

		result.enable(Capability.NUMERIC_ATTRIBUTES);

		return result;
	}

	/**
	 * Returns a string describing the classifier
	 * @return a description suitable for
	 * displaying in the explorer/experimenter gui
	 */
	public String globalInfo() {
		return "Class for generating a decision tree based on the CHAID* algorithm, "
				+ "a modified version of the CHAID decision tree induction algorithm that also handles continuous features "
				+ "and includes the same post-pruning mechanism used by C45. "
				+ "For more information, see:\n\n"
				+ getTechnicalInformation().toString();
	}

	/**
	 * Returns an instance of a TechnicalInformation object, containing 
	 * detailed information about the technical background of this class,
	 * e.g., paper reference or book this class is based on.
	 * 
	 * @return the technical information about this class
	 */
	public TechnicalInformation getTechnicalInformation() {
		TechnicalInformation  result;

		result = new TechnicalInformation(Type.ARTICLE);
		result.setValue(Field.AUTHOR, "Igor Ibarguren and Aritz Lasarguren and Jesús M. Pérez and Javier Muguerza and Ibai Gurrutxaga and Olatz Arbelaitz");
		result.setValue(Field.YEAR, "2016");
		result.setValue(Field.TITLE, "BFPART: Best-First PART");
		result.setValue(Field.JOURNAL, "Information Sciences");
		result.setValue(Field.VOLUME, "367-368");
		result.setValue(Field.PAGES, "927-952");
		result.setValue(Field.URL, "10.1016/j.ins.2016.07.023");

		TechnicalInformation additional = new TechnicalInformation(Type.ARTICLE);
		additional.setValue(Field.AUTHOR, "G. V. Kass");
		additional.setValue(Field.YEAR, "1980");
		additional.setValue(Field.TITLE, "An Exploratory Technique for Investigating Large Quantities of Categorical Data");
		additional.setValue(Field.JOURNAL, "Journal of the Royal Statistical Society. Series C (Applied Statistics)");
		additional.setValue(Field.VOLUME, "29");
		additional.setValue(Field.NUMBER, "2");
		additional.setValue(Field.PAGES, "119-127");
		additional.setValue(Field.URL, "http://www.jstor.org/stable/2986296");
		result.add(additional);

		return result;
	}

	/**
	 * Generates the classifier.
	 * 
	 * @param instances the data to train the classifier with
	 * @throws Exception if classifier can't be built successfully
	 */
	@Override
	public void buildClassifier(Instances instances) throws Exception {

		// Some checks based on weka.classifiers.trees.J48.buildClassifier(Instances)
		if ((m_unpruned) && (!m_subtreeRaising)) {
			throw new Exception("Subtree raising does not need to be unset for unpruned trees!");
		}
		if ((m_unpruned) && (m_CF != 0.25f)) {
			throw new Exception("It does not make sense to change the confidence for an unpruned tree!");
		}
		if ((m_CF <= 0) || (m_CF >= 1)) {
			throw new Exception("Confidence has to be greater than zero and smaller than one!");
		}
		getCapabilities().testWithFail(instances);

		double sumOfWeights = instances.sumOfWeights();
		if (Utils.gr(sumOfWeights, 0) && !Utils.eq(sumOfWeights, instances.numInstances())) {
			System.err.println("CHAID* cannot use instance weights (sumOfWeights: " + sumOfWeights + ", numInstances: " + instances.numInstances() + ")!");
			throw new Exception("CHAID* cannot use instance weights (sumOfWeights: " + sumOfWeights + ", numInstances: " + instances.numInstances() + ")!");
		}

		CHAIDStarModelSelection modSelection;

		// Prepare the list of ordinal attributes, if exist
		prepareOrdinalAtts(instances);

		modSelection = new CHAIDStarModelSelection(m_minNumObj, instances,
				m_doNotMakeSplitPointActualValue, 
				m_CHsigLevelAtt, m_CHsigLevelMergeSplit, m_CHsearchBestSplit, m_CHminNumObjSplit, m_CHordinalAtts);

		m_root = new CHAIDStarClassifierTree(modSelection, !m_unpruned, m_CF,
				m_subtreeRaising, !m_noCleanup, m_collapseTree);

		m_root.buildClassifier(instances);

		modSelection.cleanup();
	}

	/**
	 * Returns a description of the classifier.
	 * 
	 * @return a description of the classifier
	 */
	@Override
	public String toString() {

		if (m_root == null) {
			return "No classifier built";
		} else {
			if (m_unpruned) {
				return "JCHAIDStar unpruned tree\n" +
						"------------------------\n" +
						toStringOrdinalAttributesList() +
						m_root.toString();
			} else {
				return "JCHAIDStar pruned tree\n" +
						"----------------------\n" +
						toStringOrdinalAttributesList() +
						m_root.toString();
			}
		}
	}

	/**
	 * Set the value of unpruned. Turns reduced-error pruning off if set.
	 * 
	 * @param v Value to assign to unpruned.
	 */
	@Override
	public void setUnpruned(boolean v) {
		m_unpruned = v;
	}

	/**
	 * Set the value of collapseTree.
	 * 
	 * @param v Value to assign to collapseTree.
	 */
	@Override
	public void setCollapseTree(boolean v) {
		m_collapseTree = v;
	}

	/**
	 * Set the value of CF.
	 * 
	 * @param v Value to assign to CF.
	 */
	@Override
	public void setConfidenceFactor(float v) {
		m_CF = v;
	}

	/**
	 * Set the value of subtreeRaising.
	 * 
	 * @param v Value to assign to subtreeRaising.
	 */
	@Override
	public void setSubtreeRaising(boolean v) {
		m_subtreeRaising = v;
	}

	/**
	 * Set the value of useLaplace.
	 * 
	 * @param newuseLaplace Value to assign to useLaplace.
	 */
	@Override
	public void setUseLaplace(boolean newuseLaplace) {

		m_useLaplace = newuseLaplace;
	}

	/**
	 * Sets the value of doNotMakeSplitPointActualValue.
	 * 
	 * @param doNotMakeSplitPointActualValue the value to set
	 */
	@Override
	public void setDoNotMakeSplitPointActualValue(
			boolean doNotMakeSplitPointActualValue) {
		m_doNotMakeSplitPointActualValue = doNotMakeSplitPointActualValue;
	}

	/**
	 * Main method for testing this class
	 * 
	 * @param argv the command line options
	 */
	public static void main(String[] argv) {
		runClassifier(new JCHAIDStar(), argv);
	}
}
