/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    CHAIDStarSplit.java
 *    Copyright (C) 2021 ALDAPA Team (http://www.aldapa.eus)
 *    Faculty of Informatics, Donostia, 20018
 *    University of the Basque Country (UPV/EHU), Basque Country
 *
 */

package weka.classifiers.trees.jchaidstar;

import java.util.Enumeration;

import weka.classifiers.trees.jchaid.CHAIDDistribution;
import weka.classifiers.trees.jchaid.CHAIDSplit;
import weka.classifiers.trees.jchaid.ChiSquareSplitCrit;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionUtils;
import weka.core.Utils;

/**
 * Class implementing a CHAID*-type split on an attribute.
 * <p>
 * Nominal attributes are delegated to the {@link CHAIDSplit} superclass.
 * Numeric attributes are handled here with a binary split
 * ({@code <= splitPoint} / {@code > splitPoint}) plus, when the training data
 * contains missing values for the attribute, a bag for the missing values.
 * 
 * @author Jes&uacute;s M. P&eacute;rez (txus.perez@ehu.eus)
 * @author Oscar Teixeira (oteixeira001@ikasle.ehu.es)
 * @version $Revision: 1.3 $
 */
public class CHAIDStarSplit extends CHAIDSplit {

  /** for serialization */
  private static final long serialVersionUID = -3294982505825172506L;

  /**
   * Value of split point (numeric attributes only). Reset to
   * {@code Double.MAX_VALUE} at the start of {@link #buildClassifier}.
   */
  protected double m_splitPoint;

  /** Static reference to splitting criterion. */
  protected static ChiSquareSplitCrit chiSquareCrit = new ChiSquareSplitCrit();

  /**
   * Initializes the split model. All arguments are forwarded unchanged to the
   * {@link CHAIDSplit} constructor.
   * 
   * @param attIndex Attribute to split on
   * @param minNoObj minimum number of instances that have to occur in at least
   *          two subsets induced by split
   * @param sigLevelAtt Significance level for the selection of attributes
   * @param sigLevelMergeSplit Significance level for the best combination of categories
   * @param searchBestSplit true if the quest of the best binary split will be done
   * @param ordered true if the nature of the categories is ordered
   */
  public CHAIDStarSplit(int attIndex, int minNoObj,
    double sigLevelAtt, double sigLevelMergeSplit, boolean searchBestSplit, boolean ordered) {
    super(attIndex, minNoObj, sigLevelAtt, sigLevelMergeSplit, searchBestSplit, ordered);
  }

  /**
   * Creates a CHAID*-type split on the given data. Assumes that none of the class
   * values is missing.
   * <p>
   * Note that for a numeric attribute the given data set is sorted in place
   * by that attribute before the split search starts.
   * 
   * @param trainInstances the training data to build the split from
   * @exception Exception if something goes wrong
   */
  @Override
  public void buildClassifier(Instances trainInstances) throws Exception {

    // Initialize the remaining instance variables.
    m_numSubsets = 0;
    m_splitPoint = Double.MAX_VALUE;
    m_chiSquaredProb = 1.0;

    // Different treatment for enumerated and numeric attributes.
    if (trainInstances.attribute(m_attIndex).isNominal()) {
      m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
      m_index = m_complexityIndex;
      m_missingIdx = m_complexityIndex;
      handleEnumeratedAttribute(trainInstances);
    } else {
      m_complexityIndex = 2;
      // For numeric attributes m_index counts the admissible split candidates.
      m_index = 0;
      m_missingIdx = m_complexityIndex;
      trainInstances.sort(trainInstances.attribute(m_attIndex));
      handleNumericAttribute(trainInstances);
    }
  }

  /**
   * Creates split on numeric attribute.
   * (based on the same function of the class C45Split (package weka.classifiers.trees.j48)
   * <p>
   * The candidate scan walks the instances by index and relies on them being
   * ordered by the split attribute; {@link #buildClassifier} sorts the data
   * before calling this method. The scan also treats every instance from the
   * first missing value onwards as missing (presumably the sort places missing
   * values last; confirm against {@code Instances.sort}).
   * <p>
   * If no admissible split is found the method returns early, leaving
   * {@code m_numSubsets} and {@code m_splitPoint} as they were on entry.
   * 
   * @param trainInstances the training data, ordered by the split attribute
   * @exception Exception if something goes wrong
   */
  public void handleNumericAttribute(Instances trainInstances)
      throws Exception {
    int firstMiss;
    int next = 1;
    int last = 0;
    int splitIndex = -1;
    double val;
    // NOTE(review): Double.MIN_VALUE is the smallest *positive* double, not the
    // most negative one, so this acts as a floor of (almost) zero: a candidate
    // is only chosen when its statistic is strictly greater than that. Confirm
    // this is the intended behaviour before changing it.
    double maxVal = Double.MIN_VALUE;
    boolean hasMissings = false;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    // Working distribution for the scan: bag 0 (left side, initially empty),
    // bag 1 (all known values) and bag 2 (missing values).
    m_distribution = new CHAIDDistribution(2, trainInstances.numClasses(), m_minNoObj,
        m_sigLevelAtt, m_sigLevelMergeSplit, m_searchBestSplit, m_ordered);

    // Only Instances with known values are relevant.
    // firstMiss is the index of the first instance whose value is missing,
    // i.e. the exclusive upper bound of the split candidates. It stays at the
    // number of instances when there are no missing values.
    firstMiss = trainInstances.numInstances();
    Enumeration<Instance> enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
      instance = enu.nextElement();
      if (instance.isMissing(m_attIndex)) {
        if (!hasMissings) { // Is it the first one?
          hasMissings = true;
          firstMiss = i;
        }
        m_distribution.add(2, instance);
      } else {
        m_distribution.add(1, instance);
      }
      i++;
    }
    m_hasMissingValues = hasMissings;

    // Compute values of criteria for all possible split
    // indices.
    while (next < firstMiss) {

      // A split is only placed between two consecutive values that differ by
      // more than 1e-5; closer values are treated as equal.
      if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5
          < trainInstances.instance(next).value(m_attIndex)) {

        // Move class values for all Instances up to next
        // possible split point.
        m_distribution.shiftRange(1, 0, trainInstances, last, next);

        // Check if enough Instances in each subset and compute
        // values for criteria.
        if (Utils.grOrEq(m_distribution.perBag(0), m_minNoObj)
          && Utils.grOrEq(m_distribution.perBag(1), m_minNoObj)) {
          val = chiSquareCrit.chiVal(m_distribution);
          if (Utils.gr(val, maxVal)) {
            maxVal = val;
            splitIndex = next - 1;
          }
          // Count every admissible candidate (used for the Bonferroni factor).
          m_index++;
        }
        last = next;
      }
      next++;
    }

    // Was there any useful split?
    if ((m_index == 0) || (splitIndex < 0)) {
      return;
    }

    // Set instance variables' values to values for best split.
    // (m_numSubsets is set at the end, from the final number of bags.)
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex) + trainInstances
      .instance(splitIndex).value(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(
      m_attIndex)) {
      m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distribution for best split.
    if (!m_hasMissingValues) {
      // It will be created two bags
      m_distribution = new CHAIDDistribution(1, trainInstances.numClasses(), m_minNoObj,
          m_sigLevelAtt, m_sigLevelMergeSplit, m_searchBestSplit, m_ordered);
      m_missingCurrentIndex = -1;
    } else {
      // It will be created three bags
      m_distribution = new CHAIDDistribution(2, trainInstances.numClasses(), m_minNoObj,
          m_sigLevelAtt, m_sigLevelMergeSplit, m_searchBestSplit, m_ordered);
      // Bag 2 receives every instance from the first missing value onwards.
      m_distribution.addRange(2, trainInstances, firstMiss, trainInstances.numInstances());
      getCHAIDDistribution().setHasMissingValues();
      m_missingCurrentIndex = 2;
    }
    // Bag 0: instances up to and including splitIndex; bag 1: the remaining
    // instances with a known value.
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    if (m_hasMissingValues) {
      m_index++;
      getCHAIDDistribution().initializeIndicators();
      getCHAIDDistribution().handleMissingValues();
      // The distribution reports which bag the missing values ended up in.
      m_missingCurrentIndex = getCHAIDDistribution().getMissingCurrentIndex();
    }

    double originalSig = chiSquareCrit.splitCritValue(m_distribution);
    // Compute Bonferroni correction for best split.
    m_chiSquaredProb = originalSig * chiSquareCrit.bonferroniFactor(m_index, m_distribution.numBags(), m_hasMissingValues, true);
    if (Utils.gr(m_chiSquaredProb, m_sigLevelAtt)) {
      // Not significant: merge all values
      getCHAIDDistribution().mergeAll();
    }
    m_indicators = getCHAIDDistribution().getIndicators();
    m_complexityIndex = m_distribution.numBags();
    m_numSubsets = m_distribution.numBags();
  }

  /**
   * Returns index of subset instance is assigned to. Returns -1 if instance is
   * assigned to more than one subset.
   * <p>
   * For a numeric attribute: a missing value yields the current index of the
   * missing-value bag, a value smaller than or equal to the split point
   * yields 0, and any other value yields 1. Nominal attributes are handled by
   * the superclass.
   * 
   * @param instance the instance to assign
   * @return the index of the subset
   * @exception Exception if something goes wrong
   */
  @Override
  public final int whichSubset(Instance instance) throws Exception {
    if (instance.attribute(m_attIndex).isNominal()) {
      return super.whichSubset(instance);
    }
    if (instance.isMissing(m_attIndex)) {
      return m_missingCurrentIndex;
    }
    return Utils.smOrEq(instance.value(m_attIndex), m_splitPoint) ? 0 : 1;
  }

  /**
   * Prints the condition satisfied by instances in a subset.
   * <p>
   * For a numeric attribute, subset 0 is {@code " <= splitPoint"}, subset 1 is
   * {@code " > splitPoint"} and subset 2 is {@code " = ?"} (missing). When the
   * missing values share subset 0 or 1, {@code " | = ?"} is appended to that
   * subset's condition. Any other index prints a diagnostic to standard
   * output and yields an empty string.
   * 
   * @param index of subset
   * @param data training set.
   * @return the textual condition for the subset
   */
  @Override
  public final String rightSide(int index, Instances data) {

    if (data.attribute(m_attIndex).isNominal()) {
      return super.rightSide(index, data);
    }

    StringBuilder text = new StringBuilder();
    switch (index) {
    case 0:
      text.append(" <= ").append(Utils.doubleToString(m_splitPoint, 6));
      if (m_missingCurrentIndex == 0) {
        text.append(" | = ?");
      }
      break;
    case 1:
      text.append(" > ").append(Utils.doubleToString(m_splitPoint, 6));
      if (m_missingCurrentIndex == 1) {
        text.append(" | = ?");
      }
      break;
    case 2:
      text.append(" = ?");
      break;
    default:
      System.out.println("CHAIDStarSplit.rightSide(): index unexpected!!!");
    }
    return text.toString();
  }

  /**
   * Returns a string containing java source code equivalent to the test made at
   * this node. The instance being tested is called "i".
   * <p>
   * Negative indices and nominal attributes are handled by the superclass.
   * For a numeric attribute, index 0 tests {@code <= splitPoint}, index 1
   * tests {@code > splitPoint} and index 2 tests for a missing value
   * ({@code null}). When the missing values share subset 0 or 1, that
   * subset's test is prefixed with the {@code null} check.
   * 
   * @param index index of the nominal value tested
   * @param data the data containing instance structure info
   * @return a value of type 'String'
   * @throws IllegalArgumentException if the attribute is numeric and
   *           {@code index} is greater than 2
   */
  @Override
  public final String sourceExpression(int index, Instances data) {

    if ((index < 0) ||
        (data.attribute(m_attIndex).isNominal())) {
      return super.sourceExpression(index, data);
    }

    final String nullTest = "i[" + m_attIndex + "] == null";
    final String value = "((Double) i[" + m_attIndex + "]).doubleValue()";
    final String comparison;
    switch (index) {
    case 0:
      comparison = value + " <= " + m_splitPoint;
      break;
    case 1:
      comparison = value + " > " + m_splitPoint;
      break;
    case 2:
      return nullTest;
    default:
      // Previously this case only printed a message and then dereferenced a
      // null buffer; fail with an explicit, descriptive exception instead.
      throw new IllegalArgumentException(
        "CHAIDStarSplit.sourceExpression(): index unexpected: " + index);
    }

    if (m_missingCurrentIndex == index) {
      // Comparison with null always has to be the first one!
      return "(" + nullTest + ") || (" + comparison + ")";
    }
    return comparison;
  }

  /**
   * Returns the split point (numeric attribute only).
   * (Copied from C45Split class)
   * @return the split point used for a test on a numeric attribute
   */
  public double splitPoint() {
    return m_splitPoint;
  }


  /**
   * Sets split point to greatest value in given data smaller or equal to old
   * split point. (C4.5 does this for some strange reason).
   * (Copied from C45Split class)
   * <p>
   * Does nothing unless the attribute is numeric and the split has more than
   * one subset. If no non-missing value in the data is smaller than or equal
   * to the old split point, the split point becomes
   * {@code -Double.MAX_VALUE}.
   * 
   * @param allInstances the data whose attribute values are candidates for
   *          the new split point
   */
  public final void setSplitPoint(Instances allInstances) {

    if (!allInstances.attribute(m_attIndex).isNumeric() || (m_numSubsets <= 1)) {
      return;
    }

    double newSplitPoint = -Double.MAX_VALUE;
    Enumeration<Instance> enu = allInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
      Instance instance = enu.nextElement();
      if (instance.isMissing(m_attIndex)) {
        continue;
      }
      double tempValue = instance.value(m_attIndex);
      if (Utils.gr(tempValue, newSplitPoint)
        && Utils.smOrEq(tempValue, m_splitPoint)) {
        newSplitPoint = tempValue;
      }
    }
    m_splitPoint = newSplitPoint;
  }

  /**
   * Returns the revision string.
   * 
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.1 $");
  }
}
