/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    CHAIDSplit.java
 *    Copyright (C) 2021 ALDAPA Team (http://www.aldapa.eus)
 *    Faculty of Informatics, Donostia, 20018
 *    University of the Basque Country (UPV/EHU), Basque Country
 *
 */

package weka.classifiers.trees.jchaid;

import java.util.Enumeration;

import weka.classifiers.trees.j48.ClassifierSplitModel;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionUtils;

/**
 * Class implementing a CHAID-type split on an attribute.
 * *************************************************************************************<br/>
 * 
 * @author Jes&uacute;s M. P&eacute;rez (txus.perez@ehu.eus)
 * @author Oscar Teixeira (oteixeira001@ikasle.ehu.es)
 * @version $Revision: 1.3 $
 */
public class CHAIDSplit extends ClassifierSplitModel {

  /** for serialization */
  private static final long serialVersionUID = -9205288918018922878L;

  /** Desired number of branches. */
  protected int m_complexityIndex;

  /** Attribute to split on. */
  protected final int m_attIndex;

  /** Minimum number of objects in a split. */
  protected final int m_minNoObj;

  /** Number of split points. */
  protected int m_index;

  /**
   * Original position for missing values on the attribute to split on.
   * It is set in {@link #buildClassifier(Instances)} to the number of values
   * of the attribute, i.e. the position right after the last category.
   */
  protected int m_missingIdx;

  /** Current position for missing values on the attribute to split on. */
  protected int m_missingCurrentIndex;

  /**
   * The indicators used to map the old values.
   * Indicates where is merged each category: the entry at an original
   * category position holds the index of the subset it belongs to.
   * (Based on the filter MergeNominalValues (package weka.filters.supervised.attribute))
   */
  protected int[] m_indicators;

  /** Indicates if there are missing values or not */
  protected boolean m_hasMissingValues = false;

  /** ChiSquared probability of split. */
  protected double m_chiSquaredProb;

  /** Set the significance level for the selection of the attribute to split a node. */
  protected double m_sigLevelAtt = 0.05;

  /** Set the significance level for the quest of the best combination of the categories of an attribute */
  protected double m_sigLevelMergeSplit = 0.05;

  /**
   * Indicates if the quest of the best binary split will be done, after merging 3 or more categories
   * This process could add a considerable latency and that is why it is optional.
   */
  protected boolean m_searchBestSplit = true;

  /**
   * Indicate if the nature of the categories is ordered
   * Default: false, that is, Attribute.ORDERING_SYMBOLIC
   */
  protected boolean m_ordered = false;

  /**
   * Initializes the split model.
   *
   * @param attIndex Attribute to split on
   * @param minNoObj minimum number of instances that have to occur in at least
   *          two subsets induced by split
   * @param sigLevelAtt Significance level for the selection of attributes
   * @param sigLevelMergeSplit Significance level for the best combination of categories
   * @param searchBestSplit true if the quest of the best binary split will be done
   * @param ordered true if the nature of the categories is ordered
   */
  public CHAIDSplit(int attIndex, int minNoObj,
    double sigLevelAtt, double sigLevelMergeSplit,
    boolean searchBestSplit, boolean ordered) {

    // Attribute to split on and minimum number of objects per subset.
    m_attIndex = attIndex;
    m_minNoObj = minNoObj;

    // Significance levels (attribute selection and merge/split of categories).
    m_sigLevelAtt = sigLevelAtt;
    m_sigLevelMergeSplit = sigLevelMergeSplit;

    // Whether to look for the best binary split, and the attribute's ordering.
    m_searchBestSplit = searchBestSplit;
    m_ordered = ordered;
  }

  /**
   * Creates a CHAID-type split on the given data. Assumes that none of the class
   * values is missing.
   *
   * @param trainInstances the training data the split is built from
   * @exception Exception if something goes wrong
   */
  @Override
  public void buildClassifier(Instances trainInstances) throws Exception {

    // Initialize the remaining instance variables.
    m_numSubsets = 0;
    m_chiSquaredProb = 1.0;

    // Only enumerated attributes: one position per value, and missing values
    // are placed right after the last one.
    m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
    m_index = m_complexityIndex;
    m_missingIdx = m_complexityIndex;
    handleEnumeratedAttribute(trainInstances);
  }

  /**
   * Returns Chi Squared Probability for the generated split.
   *
   * @return the probability stored by the last call to
   *         {@link #buildClassifier(Instances)} (1.0 is the initial value)
   */
  public final double chiSquaredProb() {
    return m_chiSquaredProb;
  }

  /**
   * Creates split on enumerated attribute.
   * (based on the same function of the class C45Split (package weka.classifiers.trees.j48)
   *
   * @param trainInstances the training data the split is built from
   * @exception Exception if something goes wrong
   */
  protected void handleEnumeratedAttribute(Instances trainInstances)
    throws Exception {

    // Initializes the contingency table (in CHAID terminology)
    m_distribution = new CHAIDDistribution(m_complexityIndex,
      trainInstances.numClasses(), m_minNoObj, m_sigLevelAtt, m_sigLevelMergeSplit, m_searchBestSplit, m_ordered);

    // Instances with both known and unknown values are relevant:
    // there is a row in the contingency table for each value, and
    // missing values are initially added to the last position.
    for (Instance instance : trainInstances) {
      int row = instance.isMissing(m_attIndex)
        ? m_missingIdx
        : (int) instance.value(m_attIndex);
      m_distribution.add(row, instance);
    }

    // The segmentation algorithm CHAID; afterwards the state of this split
    // is copied from the merged distribution.
    CHAIDDistribution merged = getCHAIDDistribution();
    merged.mergeValues();
    m_indicators = merged.getIndicators();
    m_missingCurrentIndex = merged.getMissingCurrentIndex();
    m_hasMissingValues = merged.getHasMissingValues();
    m_complexityIndex = merged.numBags();
    m_index = m_complexityIndex;
    m_numSubsets = m_complexityIndex;
    m_chiSquaredProb = merged.getChiSquaredProb();
  }

  /**
   * Returns the distribution of class values.
   *
   * @return the distribution of this model, cast to {@link CHAIDDistribution}
   */
  protected CHAIDDistribution getCHAIDDistribution() {
    return (CHAIDDistribution) m_distribution;
  }

  /**
   * Returns whether the split has missing values or not
   *
   * @return true if has missing values
   */
  public boolean hasMissingValues() {
    return m_hasMissingValues;
  }

  /**
   * Get the current index of the missing values
   *
   * @return the current position for missing values
   */
  public int getMissingCurrentIndex() {
    return m_missingCurrentIndex;
  }

  /**
   * Returns index of subset instance is assigned to.
   *
   * @param instance the instance to assign
   * @return the subset index stored in the indicators for the instance's
   *         value (or for the missing-value position)
   * @exception Exception if something goes wrong
   */
  @Override
  public int whichSubset(Instance instance) throws Exception {
    int originalCategoryIndex = instance.isMissing(m_attIndex)
      ? m_missingIdx
      : (int) instance.value(m_attIndex);
    return m_indicators[originalCategoryIndex];
  }

  /**
   * Prints left side of condition.
   *
   * @param data training set.
   * @return the name of the attribute to split on
   */
  @Override
  public final String leftSide(Instances data) {

    return data.attribute(m_attIndex).name();
  }

  /**
   * Prints the condition satisfied by instances in a subset.
   *
   * @param index of subset
   * @param data training set.
   * @return " = " followed by the categories merged into the subset,
   *         separated by " | " ("?" stands for missing values); an empty
   *         string if no category maps to the subset
   */
  @Override
  public String rightSide(int index, Instances data) {
    StringBuilder text = new StringBuilder();

    for (int i = 0; i < m_indicators.length; i++) {
      if (m_indicators[i] != index) {
        continue;
      }
      // The first category is introduced by " = ", the rest by " | ".
      text.append(text.length() == 0 ? " = " : " | ");
      if (i == m_missingIdx) {
        text.append("?");
      } else {
        text.append(data.attribute(m_attIndex).value(i));
      }
    }

    return text.toString();
  }

  /**
   * Returns a string containing java source code equivalent to the test made at
   * this node. The instance being tested is called "i".
   * Category names are escaped so that the generated string literals stay
   * valid when a name contains quotes, backslashes or line breaks.
   *
   * @param index index of the nominal value tested
   * @param data the data containing instance structure info
   * @return a value of type 'String'
   */
  @Override
  public String sourceExpression(int index, Instances data) {

    if (index < 0) {
      return "i[" + m_attIndex + "] == null";
    }

    String nullTest = null;
    StringBuilder valueTests = new StringBuilder();
    for (int i = 0; i < m_indicators.length; i++) {
      if (m_indicators[i] != index) {
        continue;
      }
      if (i == m_missingIdx) {
        nullTest = "( i[" + m_attIndex + "] == null )";
      } else {
        if (valueTests.length() > 0) {
          valueTests.append(" || ");
        }
        valueTests.append("( i[").append(m_attIndex).append("].equals(\"")
          .append(escapeJavaString(data.attribute(m_attIndex).value(i)))
          .append("\") )");
      }
    }

    if (nullTest == null) {
      return valueTests.toString();
    }
    if (valueTests.length() == 0) {
      return nullTest;
    }
    // Comparison with null always has to be the first one!
    return nullTest + " || " + valueTests;
  }

  /**
   * Escapes a text so that it can be placed between double quotes in
   * generated java source code.
   *
   * @param raw the text to escape
   * @return the text with backslashes, double quotes, line feeds, carriage
   *         returns and tabs replaced by their escape sequences
   */
  private static String escapeJavaString(String raw) {
    StringBuilder escaped = new StringBuilder(raw.length() + 2);
    for (int i = 0; i < raw.length(); i++) {
      char c = raw.charAt(i);
      switch (c) {
        case '\\':
          escaped.append("\\\\");
          break;
        case '"':
          escaped.append("\\\"");
          break;
        case '\n':
          escaped.append("\\n");
          break;
        case '\r':
          escaped.append("\\r");
          break;
        case '\t':
          escaped.append("\\t");
          break;
        default:
          escaped.append(c);
          break;
      }
    }
    return escaped.toString();
  }

  /**
   * Returns index of attribute for which split was generated.
   * (Copied from C45Split class)
   *
   * @return the index of the attribute to split on
   */
  public final int attIndex() {

    return m_attIndex;
  }

  /**
   * Splits the given set of instances into subsets.
   * Instances for which {@link #whichSubset(Instance)} returns a negative
   * index are reported on standard output and left out of every subset.
   *
   * @param data the instances to split
   * @return one set of instances per subset of this split
   * @exception Exception if something goes wrong
   */
  @Override
  public final Instances [] split(Instances data)
       throws Exception {
    // Find size and constitution of subsets. The subset of each instance is
    // resolved only once and remembered for the second pass.
    int[] assignment = new int[data.numInstances()];
    int[] subsetSize = new int[m_numSubsets];
    int position = 0;
    for (Instance instance : data) {
      int subset = whichSubset(instance);
      assignment[position++] = subset;
      if (subset > -1) {
        subsetSize[subset]++;
      } else {
        System.out.println("CHAIDSplit.split(): Unable to manage this instance!");
      }
    }

    // Create subsets
    Instances [] instances = new Instances [m_numSubsets];
    for (int j = 0; j < m_numSubsets; j++) {
      instances[j] = new Instances(data, subsetSize[j]);
    }
    position = 0;
    for (Instance instance : data) {
      int subset = assignment[position++];
      if (subset > -1) {
        instances[subset].add(instance);
      }
    }

    return instances;
  }

  /**
   * Returns null, indicating that instance is only assigned to one subset.
   *
   * @param instance the instance (not used)
   * @return always null
   */
  @Override
  public final double[] weights(Instance instance) {
    // CHAID always assign each value to only one subset,
    // even the 'missing' value.
    return null;
  }

  /**
   * Sets distribution associated with model.
   *
   * @param data the instances the new distribution is created from
   * @exception Exception if something goes wrong
   */
  @Override
  public void resetDistribution(Instances data) throws Exception {

    m_distribution = new CHAIDDistribution(data, this);
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.1 $");
  }
}
