/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ChiSquareSplitCrit.java
 *    Copyright (C) 2021 ALDAPA Team (http://www.aldapa.eus)
 *    Faculty of Informatics, Donostia, 20018
 *    University of the Basque Country (UPV/EHU), Basque Country
 *
 */

package weka.classifiers.trees.jchaid;

import weka.classifiers.trees.j48.Distribution;
import weka.classifiers.trees.j48.SplitCriterion;
import weka.core.ContingencyTables;
import weka.core.RevisionUtils;
import weka.core.SpecialFunctions;
import weka.core.Statistics;

/**
 * Split criterion based on the chi-squared test, as used by CHAID-style trees.
 * Besides the chi-squared statistic and its probability, it provides the
 * Bonferroni multipliers described by Kass (1980) for the different kinds of
 * predictors (free, monotonic and floating).
 *
 * @author Jes&uacute;s M. P&eacute;rez (txus.perez@ehu.eus)
 * @author Oscar Teixeira (oteixeira001@ikasle.ehu.es)
 * @version $Revision: 1.1 $
 */
public class ChiSquareSplitCrit extends SplitCriterion {

  /** for serialization */
  private static final long serialVersionUID = -6231235331711552781L;

  /**
   * Returns the revision string.
   *
   * NOTE(review): the class Javadoc advertises revision 1.1 while this method
   * reports 1.0 -- confirm which one is current.
   *
   * @return the revision extracted by {@link RevisionUtils#extract(String)}
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.0 $");
  }

  /**
   * Computes chi-squared statistic for given distribution.
   * Delegates to {@code ContingencyTables.chiVal} on the distribution's matrix;
   * the {@code false} flag presumably disables Yates' correction (confirm
   * against ContingencyTables).
   *
   * @param bags the distribution whose contingency matrix is evaluated
   * @return the value returned by {@code ContingencyTables.chiVal}
   */
  public double chiVal(Distribution bags) {
    return ContingencyTables.chiVal(bags.matrix(), false);
  }

  /**
   * Computes chi-squared statistic for given matrix.
   * Same as {@link #chiVal(Distribution)} but takes the contingency matrix
   * directly.
   *
   * @param matrix the contingency matrix
   * @return the value returned by {@code ContingencyTables.chiVal}
   */
  public double chiVal(double[][] matrix) {
    return ContingencyTables.chiVal(matrix, false);
  }

  /**
   * Computes chi-squared probability for given distribution.
   * Delegates to {@code ContingencyTables.chiSquared} on the distribution's
   * matrix, passing {@code false} as its second argument.
   *
   * @param bags the distribution whose contingency matrix is evaluated
   * @return the value returned by {@code ContingencyTables.chiSquared}
   */
  public double splitCritValue(Distribution bags) {
    return ContingencyTables.chiSquared(bags.matrix(), false);
  }

  /**
   * Computes chi-squared probability for given ch statistic and dg degrees of
   * freedom.
   *
   * @param ch the chi-squared statistic
   * @param dg the degrees of freedom
   * @return the value returned by {@code Statistics.chiSquaredProbability}
   */
  public double splitCritValue(double ch, double dg) {
    return Statistics.chiSquaredProbability(ch, dg);
  }

  /**
   * Compute factor for Bonferroni correction. This is based on Equation 3.2 in
   * Kass (1980).
   * Copied from the filter weka.filters.supervised.attribute.MergeNominalValues
   *
   * The value is the alternating sum, for i = 0 .. r-1, of
   * (r - i)^c / (i! * (r - i)!), evaluated in log space through
   * {@code SpecialFunctions.lnFactorial}.
   *
   * @param c number of original categories
   * @param r number of groups after merging
   * @return the alternating sum described above (0 when {@code r <= 0},
   *         because the loop body never runs)
   */
  public double BFfactor(int c, int r) {
    double sum = 0;
    double sign = 1.0; // alternates +1, -1, +1, ... with i
    for (int i = 0; i < r; i++) {
      // ln of (r - i)^c / (i! * (r - i)!)
      double logTerm = c * Math.log(r - i)
          - (SpecialFunctions.lnFactorial(i) + SpecialFunctions.lnFactorial(r - i));
      sum += sign * Math.exp(logTerm);
      sign *= -1.0;
    }
    return sum;
  }

  /**
   * Compute the binomial coefficient or the number of ways/combinations to
   * choose k elements, disregarding their order, from a set of n elements.
   * Based on the book "Numerical Recipes in C". Cambridge University Press.
   * W.H. Press, S.A. Teukolsky, W.T. Vetterling and B.P. Flannery
   *
   * The coefficient is evaluated as exp(ln n! - ln k! - ln (n-k)!) and then
   * rounded to the nearest integer value.
   *
   * @param n number of elements of the set
   * @param k elements to choose (assumed non-negative; negative values are
   *          passed on to {@code SpecialFunctions.lnFactorial} unchecked)
   * @return number of combinations to choose k elements from a set of n
   *         elements; 0 when {@code n < k} and 1 when {@code n == k}
   */
  public double combinations(int n, int k) {
    if (n < k) {
      // The binomial coefficient is not defined for n < k; report no combinations.
      return 0;
    }
    if (n == k) {
      return 1;
    }
    // Adding 0.5 before floor() rounds to nearest, which cleans up the
    // roundoff error of the exp/log evaluation for smaller values of n and k.
    return Math.floor(0.5
        + Math.exp(SpecialFunctions.lnFactorial(n)
            - SpecialFunctions.lnFactorial(k)
            - SpecialFunctions.lnFactorial(n - k)));
  }

  /**
   * Compute factor for Bonferroni correction for Nominal (Free) and Ordinal
   * (Monotonic) attributes taking into account whether there is missing values
   * or not (Floating).
   * This is based on 3.1, 3.2 and 3.3 Equations in Kass (1980).
   *
   * <ul>
   * <li>not ordered: {@link #BFfactor(int, int)} (the {@code hasMissing} flag
   * is not consulted in this case)</li>
   * <li>ordered, no missing values: {@code combinations(c-1, r-1)}</li>
   * <li>ordered with missing values: the monotonic factor over the categories
   * left after discounting the missing one, scaled by
   * {@code ((r-1) + r*(c-r)) / (c-1)}</li>
   * </ul>
   *
   * @param c Number of known categories or values that the attribute can take
   *  (included missing value, if exists)
   * @param r Number of groups of values in which the original values have been
   *  combined
   * @param hasMissing Indicates if missing values are present
   * @param ordered Indicates if the attribute's categories lie on an ordinal
   *  scale
   * @return factor for Bonferroni correction
   */
  public double bonferroniFactor(int c, int r, boolean hasMissing, boolean ordered) {
    if (!ordered) {
      // Discrete or Enumerated but Nominal attributes (Free in CHAID terminology)
      return BFfactor(c, r);
    }
    if (!hasMissing) {
      // Ordinal attribute without missing values (Monotonic in CHAID terminology)
      return combinations(c - 1, r - 1);
    }

    // Ordinal attribute with missing values (Floating in CHAID terminology).
    // The missing value is included in c, so it is discounted here.
    // NOTE(review): in Kass (1980) Eq. 3.3 the symbol c appears to count the
    // floating category as well -- confirm with the callers that this
    // decrement is intended.
    int known = c - 1;
    if (known == 1) {
      // The scaling below divides by (known - 1); with a single non-missing
      // category that is a division by zero and used to yield NaN. Return the
      // neutral multiplier instead, so no correction is applied.
      return 1.0;
    }
    return combinations(known - 1, r - 1)
        * ((r - 1) + (double) r * (known - r)) / (known - 1);
  }
}
