/**
 * Recursively computes the cost of the subtree rooted at this node and
 * prunes children whose split does not pay for itself.
 *
 * The cost of treating this node as a leaf is {@code getMDL() + 1}. For an
 * internal node that is compared with the cost of keeping the split: the
 * log of the number of candidate split points (divided by
 * {@code GainRatio.log2}), plus one, plus the costs returned by both
 * children. When the cheaper option is (almost) equal to the leaf cost,
 * both child references are cleared.
 *
 * @return the smaller of the leaf cost and the split cost for this node
 */
public double computeCostAndPrune() {
    final double leafCost = getMDL() + 1;
    if (isLeaf()) {
        return leafCost;
    }

    // Children are evaluated (and possibly pruned) first: left, then right.
    final double leftCost = getLeftChild().computeCostAndPrune();
    final double rightCost = getRightChild().computeCostAndPrune();

    final double splitEncodingCost =
        Math.log(m_gainRatio.getNumSplitPointsForBestFeature()) / GainRatio.log2;
    final double keepSplitCost = splitEncodingCost + 1 + leftCost + rightCost;

    final double cheapest = Math.min(leafCost, keepSplitCost);
    if (Maths.almostEquals(cheapest, leafCost)) {
        // Collapsing to a leaf is at least as cheap: drop both children.
        m_leftChild = null;
        m_rightChild = null;
    }
    return cheapest;
}
/**
 * Builds a name for this node from the split conditions on the path that
 * leads to it.
 *
 * A node without a parent is named {@code root}. Any other node is named
 * by its parent's split, written as {@code ("feature" <= threshold)} when
 * this node is the parent's left child and {@code ("feature" > threshold)}
 * otherwise. When the parent itself has a parent, the parent's name and
 * {@code " && "} are placed in front.
 *
 * @return a new buffer holding the name of this node
 */
public StringBuffer getStringBufferName() {
    StringBuffer name = new StringBuffer();
    if (m_parent == null) {
        return name.append("root");
    }

    // Only nodes below the first level carry the parent's name as a prefix.
    if (m_parent.getParent() != null) {
        name.append(m_parent.getStringBufferName()).append(" && ");
    }

    name.append("(\"")
        .append(m_dataDict.lookupObject(m_parent.getGainRatio().getMaxValuedIndex()).toString())
        .append("\"")
        .append(m_parent.getLeftChild() == this ? " <= " : " > ")
        .append(m_parent.getGainRatio().getMaxValuedThreshold());
    return name.append(")");
}
/**
 * Writes a textual rendering of this node and its children to
 * {@code System.out}.
 *
 * The visible statements print the majority label and a
 * {@code majority/size} count for a leaf, print the split feature and
 * threshold as {@code "feature" <= threshold:}, and recurse into each
 * child with {@code prefix + "| "}.
 *
 * NOTE(review): this text is structurally incomplete as it stands. There is
 * no opening brace after the signature, the braces opened by the three
 * {@code if} statements are never closed, and no {@code else} branches are
 * visible, so the intended nesting cannot be determined from here. Restore
 * the method from the original source before relying on it.
 *
 * @param prefix text printed in front of each split line
 */
public void print(String prefix) if (isLeaf()) { int bestLabelIndex = getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * getSize()); System.out.println("root:" + getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + getSize()); String featName = m_dataDict.lookupObject(getGainRatio().getMaxValuedIndex()).toString(); double threshold = getGainRatio().getMaxValuedThreshold(); System.out.print(prefix + "\"" + featName + "\" <= " + threshold + ":"); if (m_leftChild.isLeaf()) { int bestLabelIndex = m_leftChild.getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (m_leftChild.getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * m_leftChild.getSize()); System.out.println(m_leftChild.getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + m_leftChild.getSize()); m_leftChild.print(prefix + "| "); if (m_rightChild.isLeaf()) { int bestLabelIndex = m_rightChild.getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (m_rightChild.getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * m_rightChild.getSize()); System.out.println(m_rightChild.getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + m_rightChild.getSize()); m_rightChild.print(prefix + "| ");
/**
 * Writes a textual rendering of this node and its children to
 * {@code System.out}.
 *
 * The visible statements print the majority label and a
 * {@code majority/size} count for a leaf, print the split feature and
 * threshold as {@code "feature" <= threshold:}, and recurse into each
 * child with {@code prefix + "| "}.
 *
 * NOTE(review): this text is structurally incomplete as it stands. There is
 * no opening brace after the signature, the braces opened by the three
 * {@code if} statements are never closed, and no {@code else} branches are
 * visible, so the intended nesting cannot be determined from here. Restore
 * the method from the original source before relying on it.
 *
 * @param prefix text printed in front of each split line
 */
public void print(String prefix) if (isLeaf()) { int bestLabelIndex = getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * getSize()); System.out.println("root:" + getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + getSize()); String featName = m_dataDict.lookupObject(getGainRatio().getMaxValuedIndex()).toString(); double threshold = getGainRatio().getMaxValuedThreshold(); System.out.print(prefix + "\"" + featName + "\" <= " + threshold + ":"); if (m_leftChild.isLeaf()) { int bestLabelIndex = m_leftChild.getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (m_leftChild.getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * m_leftChild.getSize()); System.out.println(m_leftChild.getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + m_leftChild.getSize()); m_leftChild.print(prefix + "| "); if (m_rightChild.isLeaf()) { int bestLabelIndex = m_rightChild.getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (m_rightChild.getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * m_rightChild.getSize()); System.out.println(m_rightChild.getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + m_rightChild.getSize()); m_rightChild.print(prefix + "| ");
/**
 * Writes a textual rendering of this node and its children to
 * {@code System.out}.
 *
 * The visible statements print the majority label and a
 * {@code majority/size} count for a leaf, print the split feature and
 * threshold as {@code "feature" <= threshold:}, and recurse into each
 * child with {@code prefix + "| "}.
 *
 * NOTE(review): this text is structurally incomplete as it stands. There is
 * no opening brace after the signature, the braces opened by the three
 * {@code if} statements are never closed, and no {@code else} branches are
 * visible, so the intended nesting cannot be determined from here. Restore
 * the method from the original source before relying on it.
 *
 * @param prefix text printed in front of each split line
 */
public void print(String prefix) if (isLeaf()) { int bestLabelIndex = getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * getSize()); System.out.println("root:" + getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + getSize()); String featName = m_dataDict.lookupObject(getGainRatio().getMaxValuedIndex()).toString(); double threshold = getGainRatio().getMaxValuedThreshold(); System.out.print(prefix + "\"" + featName + "\" <= " + threshold + ":"); if (m_leftChild.isLeaf()) { int bestLabelIndex = m_leftChild.getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (m_leftChild.getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * m_leftChild.getSize()); System.out.println(m_leftChild.getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + m_leftChild.getSize()); m_leftChild.print(prefix + "| "); if (m_rightChild.isLeaf()) { int bestLabelIndex = m_rightChild.getGainRatio().getBaseLabelDistribution().getBestIndex(); int numMajorityLabel = (int) (m_rightChild.getGainRatio().getBaseLabelDistribution().value(bestLabelIndex) * m_rightChild.getSize()); System.out.println(m_rightChild.getGainRatio().getBaseLabelDistribution().getBestLabel() + " " + numMajorityLabel + "/" + m_rightChild.getSize()); m_rightChild.print(prefix + "| ");
/**
 * Builds a name for this node from the split conditions on the path that
 * leads to it.
 *
 * A node without a parent is named {@code root}. Any other node is named
 * by its parent's split, written as {@code ("feature" <= threshold)} when
 * this node is the parent's left child and {@code ("feature" > threshold)}
 * otherwise. When the parent itself has a parent, the parent's name and
 * {@code " && "} are placed in front.
 *
 * @return a new buffer holding the name of this node
 */
public StringBuffer getStringBufferName() {
    StringBuffer name = new StringBuffer();
    if (m_parent == null) {
        return name.append("root");
    }

    // Only nodes below the first level carry the parent's name as a prefix.
    if (m_parent.getParent() != null) {
        name.append(m_parent.getStringBufferName()).append(" && ");
    }

    name.append("(\"")
        .append(m_dataDict.lookupObject(m_parent.getGainRatio().getMaxValuedIndex()).toString())
        .append("\"")
        .append(m_parent.getLeftChild() == this ? " <= " : " > ")
        .append(m_parent.getGainRatio().getMaxValuedThreshold());
    return name.append(")");
}
/**
 * Builds a name for this node from the split conditions on the path that
 * leads to it.
 *
 * A node without a parent is named {@code root}. Any other node is named
 * by its parent's split, written as {@code ("feature" <= threshold)} when
 * this node is the parent's left child and {@code ("feature" > threshold)}
 * otherwise. When the parent itself has a parent, the parent's name and
 * {@code " && "} are placed in front.
 *
 * @return a new buffer holding the name of this node
 */
public StringBuffer getStringBufferName() {
    StringBuffer name = new StringBuffer();
    if (m_parent == null) {
        return name.append("root");
    }

    // Only nodes below the first level carry the parent's name as a prefix.
    if (m_parent.getParent() != null) {
        name.append(m_parent.getStringBufferName()).append(" && ");
    }

    name.append("(\"")
        .append(m_dataDict.lookupObject(m_parent.getGainRatio().getMaxValuedIndex()).toString())
        .append("\"")
        .append(m_parent.getLeftChild() == this ? " <= " : " > ")
        .append(m_parent.getGainRatio().getMaxValuedThreshold());
    return name.append(")");
}
protected void splitTree(C45.Node node, int depth) { // Stop growing the tree when any of the following is true: // 1. We care about tree depth and maximum depth is reached // 2. The entropy of the node is too small (i.e., all // instances belong to the same class) // 3. The gain ratio of the best split available is too small if (m_depthLimited && depth == m_maxDepth) { logger.info("Splitting stopped: maximum depth reached (" + m_maxDepth + ")"); return; } else if (Maths.almostEquals(node.getGainRatio().getBaseEntropy(), 0)) { logger.info("Splitting stopped: entropy of node too small (" + node.getGainRatio().getBaseEntropy() + ")"); return; } else if (Maths.almostEquals(node.getGainRatio().getMaxValue(), 0)) { logger.info("Splitting stopped: node has insignificant gain ratio (" + node.getGainRatio().getMaxValue() + ")"); return; } logger.info("Splitting feature \""+node.getSplitFeature() +"\" at threshold=" + node.getGainRatio().getMaxValuedThreshold() + " gain ratio="+node.getGainRatio().getMaxValue()); node.split(); splitTree(node.getLeftChild(), depth+1); splitTree(node.getRightChild(), depth+1); }
protected void splitTree(C45.Node node, int depth) { // Stop growing the tree when any of the following is true: // 1. We care about tree depth and maximum depth is reached // 2. The entropy of the node is too small (i.e., all // instances belong to the same class) // 3. The gain ratio of the best split available is too small if (m_depthLimited && depth == m_maxDepth) { logger.info("Splitting stopped: maximum depth reached (" + m_maxDepth + ")"); return; } else if (Maths.almostEquals(node.getGainRatio().getBaseEntropy(), 0)) { logger.info("Splitting stopped: entropy of node too small (" + node.getGainRatio().getBaseEntropy() + ")"); return; } else if (Maths.almostEquals(node.getGainRatio().getMaxValue(), 0)) { logger.info("Splitting stopped: node has insignificant gain ratio (" + node.getGainRatio().getMaxValue() + ")"); return; } logger.info("Splitting feature \""+node.getSplitFeature() +"\" at threshold=" + node.getGainRatio().getMaxValuedThreshold() + " gain ratio="+node.getGainRatio().getMaxValue()); node.split(); splitTree(node.getLeftChild(), depth+1); splitTree(node.getRightChild(), depth+1); }
protected void splitTree(C45.Node node, int depth) { // Stop growing the tree when any of the following is true: // 1. We care about tree depth and maximum depth is reached // 2. The entropy of the node is too small (i.e., all // instances belong to the same class) // 3. The gain ratio of the best split available is too small if (m_depthLimited && depth == m_maxDepth) { logger.info("Splitting stopped: maximum depth reached (" + m_maxDepth + ")"); return; } else if (Maths.almostEquals(node.getGainRatio().getBaseEntropy(), 0)) { logger.info("Splitting stopped: entropy of node too small (" + node.getGainRatio().getBaseEntropy() + ")"); return; } else if (Maths.almostEquals(node.getGainRatio().getMaxValue(), 0)) { logger.info("Splitting stopped: node has insignificant gain ratio (" + node.getGainRatio().getMaxValue() + ")"); return; } logger.info("Splitting feature \""+node.getSplitFeature() +"\" at threshold=" + node.getGainRatio().getMaxValuedThreshold() + " gain ratio="+node.getGainRatio().getMaxValue()); node.split(); splitTree(node.getLeftChild(), depth+1); splitTree(node.getRightChild(), depth+1); }
public C45 train (InstanceList trainingList) { FeatureSelection selectedFeatures = trainingList.getFeatureSelection(); if (selectedFeatures != null) // xxx Attend to FeatureSelection!!! throw new UnsupportedOperationException ("FeatureSelection not yet implemented."); C45.Node root = new C45.Node(trainingList, null, m_minNumInsts); splitTree(root, 0); C45 tree = new C45 (trainingList.getPipe(), root); logger.info("C45 learned: (size=" + tree.getSize() + ")\n"); tree.print(); if (m_doPruning) { tree.prune(); logger.info("\nPruned C45: (size=" + tree.getSize() + ")\n"); root.print(); } root.stopGrowth(); this.classifier = tree; return classifier; }
public C45 train (InstanceList trainingList) { FeatureSelection selectedFeatures = trainingList.getFeatureSelection(); if (selectedFeatures != null) // xxx Attend to FeatureSelection!!! throw new UnsupportedOperationException ("FeatureSelection not yet implemented."); C45.Node root = new C45.Node(trainingList, null, m_minNumInsts); splitTree(root, 0); C45 tree = new C45 (trainingList.getPipe(), root); logger.info("C45 learned: (size=" + tree.getSize() + ")\n"); tree.print(); if (m_doPruning) { tree.prune(); logger.info("\nPruned C45: (size=" + tree.getSize() + ")\n"); root.print(); } root.stopGrowth(); this.classifier = tree; return classifier; }
public C45 train (InstanceList trainingList) { FeatureSelection selectedFeatures = trainingList.getFeatureSelection(); if (selectedFeatures != null) // xxx Attend to FeatureSelection!!! throw new UnsupportedOperationException ("FeatureSelection not yet implemented."); C45.Node root = new C45.Node(trainingList, null, m_minNumInsts); splitTree(root, 0); C45 tree = new C45 (trainingList.getPipe(), root); logger.info("C45 learned: (size=" + tree.getSize() + ")\n"); tree.print(); if (m_doPruning) { tree.prune(); logger.info("\nPruned C45: (size=" + tree.getSize() + ")\n"); root.print(); } root.stopGrowth(); this.classifier = tree; return classifier; }
/**
 * Calculates the minimum description length of this node, i.e., the
 * length of the binary encoding that describes the feature and the split
 * value used at this node.
 *
 * The result is the sum of three terms, with {@code n = getSize()} and
 * {@code k} the size of the target alphabet:
 * <ul>
 *   <li>{@code n * baseEntropy};</li>
 *   <li>{@code (k - 1) * ln(n / 2) / (2 * GainRatio.log2)};</li>
 *   <li>{@code ln(PI^(k/2) / gamma(k/2)) / GainRatio.log2}.</li>
 * </ul>
 * Dividing by {@code GainRatio.log2} presumably converts natural
 * logarithms to base 2 (confirm against {@code GainRatio}).
 *
 * @return the description length of this node
 */
public double getMDL() {
    final int classCount = m_ilist.getTargetAlphabet().size();
    final double halfClassCount = classCount / 2.0;

    // Term 1: cost of encoding the class labels of the instances.
    double bits = getSize() * getGainRatio().getBaseEntropy();

    // Term 2: grows with the number of classes and the node size.
    bits += ((classCount - 1) * Math.log(getSize() / 2.0)) / (2 * GainRatio.log2);

    // Term 3: PI^(k/2) / gamma(k/2), on the same log scale.
    bits += Math.log(Math.pow(Math.PI, halfClassCount) / Maths.gamma(halfClassCount))
        / GainRatio.log2;

    return bits;
}
/**
 * Calculates the minimum description length of this node, i.e., the
 * length of the binary encoding that describes the feature and the split
 * value used at this node.
 *
 * The result is the sum of three terms, with {@code n = getSize()} and
 * {@code k} the size of the target alphabet:
 * <ul>
 *   <li>{@code n * baseEntropy};</li>
 *   <li>{@code (k - 1) * ln(n / 2) / (2 * GainRatio.log2)};</li>
 *   <li>{@code ln(PI^(k/2) / gamma(k/2)) / GainRatio.log2}.</li>
 * </ul>
 * Dividing by {@code GainRatio.log2} presumably converts natural
 * logarithms to base 2 (confirm against {@code GainRatio}).
 *
 * @return the description length of this node
 */
public double getMDL() {
    final int classCount = m_ilist.getTargetAlphabet().size();
    final double halfClassCount = classCount / 2.0;

    // Term 1: cost of encoding the class labels of the instances.
    double bits = getSize() * getGainRatio().getBaseEntropy();

    // Term 2: grows with the number of classes and the node size.
    bits += ((classCount - 1) * Math.log(getSize() / 2.0)) / (2 * GainRatio.log2);

    // Term 3: PI^(k/2) / gamma(k/2), on the same log scale.
    bits += Math.log(Math.pow(Math.PI, halfClassCount) / Maths.gamma(halfClassCount))
        / GainRatio.log2;

    return bits;
}
/**
 * Calculates the minimum description length of this node, i.e., the
 * length of the binary encoding that describes the feature and the split
 * value used at this node.
 *
 * The result is the sum of three terms, with {@code n = getSize()} and
 * {@code k} the size of the target alphabet:
 * <ul>
 *   <li>{@code n * baseEntropy};</li>
 *   <li>{@code (k - 1) * ln(n / 2) / (2 * GainRatio.log2)};</li>
 *   <li>{@code ln(PI^(k/2) / gamma(k/2)) / GainRatio.log2}.</li>
 * </ul>
 * Dividing by {@code GainRatio.log2} presumably converts natural
 * logarithms to base 2 (confirm against {@code GainRatio}).
 *
 * @return the description length of this node
 */
public double getMDL() {
    final int classCount = m_ilist.getTargetAlphabet().size();
    final double halfClassCount = classCount / 2.0;

    // Term 1: cost of encoding the class labels of the instances.
    double bits = getSize() * getGainRatio().getBaseEntropy();

    // Term 2: grows with the number of classes and the node size.
    bits += ((classCount - 1) * Math.log(getSize() / 2.0)) / (2 * GainRatio.log2);

    // Term 3: PI^(k/2) / gamma(k/2), on the same log scale.
    bits += Math.log(Math.pow(Math.PI, halfClassCount) / Maths.gamma(halfClassCount))
        / GainRatio.log2;

    return bits;
}
/**
 * Counts the non-leaf nodes strictly below this node.
 *
 * A leaf has none. Otherwise each child that is not itself a leaf
 * contributes one for itself plus its own non-leaf descendants.
 *
 * @return the number of non-leaf descendant nodes
 */
public int getNumDescendants() {
    if (isLeaf()) {
        return 0;
    }
    final int fromLeft = getLeftChild().isLeaf()
        ? 0
        : 1 + getLeftChild().getNumDescendants();
    final int fromRight = getRightChild().isLeaf()
        ? 0
        : 1 + getRightChild().getNumDescendants();
    return fromLeft + fromRight;
}