/**
 * Renders this RVFDatum as text: its id, its feature counter and its label,
 * in that order, wrapped as {@code RVFDatum[...]}.
 *
 * @return the textual form of this datum
 */
@Override
public String toString() {
  final String idPart = "RVFDatum[id=" + id;
  final String featuresPart = ", features=" + asFeaturesCounter();
  final String labelPart = ",label=" + label();
  return idPart + featuresPart + labelPart + "]";
}
/**
 * Adds every datum of the given collection to this object's stored instances.
 * For each datum, its feature counter is added to {@code instances} under the
 * datum's label, and {@code classLookup} records the counter-to-label mapping.
 *
 * @param datums the datums to add
 */
protected void addInstances(Collection<RVFDatum<K, V>> datums) {
  for (RVFDatum<K, V> example : datums) {
    K goldLabel = example.label();
    Counter<V> featureVector = example.asFeaturesCounter();

    instances.add(goldLabel, featureVector);
    classLookup.put(featureVector, goldLabel);
  }
}
/**
 * Classifies every example produced by the iterator and reports the fraction
 * whose predicted class equals the example's own label. The counts are also
 * written to the logger.
 *
 * @param exampleIterator source of labelled examples; it is consumed fully
 * @return correct / total as a float (NaN if the iterator yields no examples,
 *         since that is a 0 / 0f division)
 */
public float accuracy(Iterator<RVFDatum<L, F>> exampleIterator) {
  int numCorrect = 0;
  int numSeen = 0;
  while (exampleIterator.hasNext()) {
    RVFDatum<L, F> example = exampleIterator.next();
    L predicted = classOf(example);
    if (predicted.equals(example.label())) {
      numCorrect++;
    }
    numSeen++;
  }
  logger.info("correct " + numCorrect + " out of " + numSeen);
  return numCorrect / (float) numSeen;
}
/** * Evaluate the accuracy of this classifier on the given dataset. * * @param testData The dataset to evaluate the classifier on. * @return The accuracy of the classifier on the given dataset. */ public default double evaluateAccuracy(GeneralDataset<L, F> testData) { int numCorrect = 0; for (RVFDatum<L, F> datum : testData) { // Get the gold label L label = datum.label(); if (label == null) { throw new IllegalArgumentException("Cannot compute precision and recall on unlabelled dataset. Offending datum: " + datum); } // Get the guess L guess = classOf(datum); // Compute statistics if (label.equals(guess)) { numCorrect += 1; } } return ((double) numCorrect) / ((double) testData.size); }
/**
 * Convenience wrapper: unpacks the datum's feature counter and label and
 * delegates to {@code probabilityOf}.
 *
 * @param example the datum to score
 * @return whatever {@code probabilityOf} returns for the datum's features and label
 */
private double probabilityOfRVFDatum(RVFDatum<L, F> example) {
  Counter<F> featureCounts = example.asFeaturesCounter();
  L goldLabel = example.label();
  return probabilityOf(featureCounts, goldLabel);
}
/**
 * Writes this dataset to the given writer, one datum per line. Each line
 * starts with the index of the datum's label, followed by one
 * {@code " featureIndex:value"} pair per feature in the datum's counter.
 * Values are formatted with {@code Locale.ENGLISH}.
 *
 * @param writer destination for the output; it is not flushed or closed here
 */
public void writeSVMLightFormat(PrintWriter writer) {
  for (RVFDatum<L, F> example : this) {
    int labelId = this.labelIndex.indexOf(example.label());
    writer.print(labelId);

    Counter<F> featureCounts = example.asFeaturesCounter();
    for (F feat : featureCounts.keySet()) {
      double value = featureCounts.getCount(feat);
      int featId = this.featureIndex.indexOf(feat);
      writer.format(Locale.ENGLISH, " %s:%f", featId, value);
    }
    writer.println();
  }
}
L label = datum.label(); if (label == null) { throw new IllegalArgumentException("Cannot compute precision and recall on unlabelled dataset. Offending datum: " + datum);
/**
 * A helper function for dumping the accuracy of the trained classifier.
 * Logs the dataset size, the number of CLAUSE_SPLIT and CLAUSE_INTERM examples,
 * and the precision, recall and F1 of the classifier for each of those two labels.
 *
 * @param classifier The classifier to evaluate.
 * @param dataset The dataset to evaluate the classifier on.
 */
public static void dumpAccuracy(Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier, GeneralDataset<ClauseSplitter.ClauseClassifierLabel, String> dataset) {
  DecimalFormat df = new DecimalFormat("0.00%");
  log("size: " + dataset.size());
  // Count matching examples directly instead of materializing a throwaway list.
  log("split count: " + StreamSupport.stream(dataset.spliterator(), false)
      .filter(x -> x.label() == ClauseSplitter.ClauseClassifierLabel.CLAUSE_SPLIT)
      .count());
  log("interm count: " + StreamSupport.stream(dataset.spliterator(), false)
      .filter(x -> x.label() == ClauseSplitter.ClauseClassifierLabel.CLAUSE_INTERM)
      .count());

  Pair<Double, Double> pr = classifier.evaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.CLAUSE_SPLIT);
  log("p (split): " + df.format(pr.first));
  log("r (split): " + df.format(pr.second));
  // When precision + recall is 0 the harmonic mean is 0/0; report 0 rather than NaN.
  double splitSum = pr.first + pr.second;
  log("f1 (split): " + df.format(splitSum == 0.0 ? 0.0 : 2 * pr.first * pr.second / splitSum));

  pr = classifier.evaluatePrecisionAndRecall(dataset, ClauseSplitter.ClauseClassifierLabel.CLAUSE_INTERM);
  log("p (interm): " + df.format(pr.first));
  log("r (interm): " + df.format(pr.second));
  double intermSum = pr.first + pr.second;
  log("f1 (interm): " + df.format(intermSum == 0.0 ? 0.0 : 2 * pr.first * pr.second / intermSum));
}
/**
 * Trains a classifier from a dataset of RVFDatums, restricted to the given feature set.
 * Each datum is turned into a dense row of integer feature values: the datum only
 * needs to list its non-zero features, every other feature of the set stays at 0.
 * Feature counts are truncated to {@code int}. Features of a datum that are not
 * part of {@code featureSet} are ignored.
 *
 * As a side effect, the {@code labelIndex} and {@code featureIndex} fields are replaced by
 * fresh indices built from the examples' labels and from {@code featureSet}.
 *
 * @param examples the training data
 * @param featureSet the features to train on
 * @return the classifier produced by the array-based {@code trainClassifier} overload
 */
public NaiveBayesClassifier<L, F> trainClassifier(GeneralDataset<L, F> examples, Set<F> featureSet) {
  int numFeatures = featureSet.size();
  int[][] data = new int[examples.size()][numFeatures];
  int[] labels = new int[examples.size()];
  labelIndex = new HashIndex<>();
  featureIndex = new HashIndex<>();
  for (F feat : featureSet) {
    featureIndex.add(feat);
  }
  for (int d = 0; d < examples.size(); d++) {
    RVFDatum<L, F> datum = examples.getRVFDatum(d);
    Counter<F> c = datum.asFeaturesCounter();
    for (F feature : c.keySet()) {
      int fNo = featureIndex.indexOf(feature);
      // A negative index means the feature is not in featureSet; skipping it
      // avoids writing to data[d][-1].
      if (fNo >= 0) {
        data[d][fNo] = (int) c.getCount(feature);
      }
    }
    labelIndex.add(datum.label());
    labels[d] = labelIndex.indexOf(datum.label());
  }
  int numClasses = labelIndex.size();
  return trainClassifier(data, labels, numFeatures, numClasses, labelIndex, featureIndex);
}
public RVFDatum<L, F> scaleDatumGaussian(RVFDatum<L, F> datum) { // scale this dataset before scaling the datum if (means == null || stdevs == null) scaleFeaturesGaussian(); Counter<F> scaledFeatures = new ClassicCounter<>(); for (F feature : datum.asFeatures()) { int fID = this.featureIndex.indexOf(feature); if (fID >= 0) { double oldVal = datum.asFeaturesCounter().getCount(feature); double newVal; if (stdevs[fID] != 0) newVal = (oldVal - means[fID]) / stdevs[fID]; else newVal = oldVal; scaledFeatures.incrementCount(feature, newVal); } } return new RVFDatum<>(scaledFeatures, datum.label()); }
while(iter.hasNext()){ RVFDatum<String, ScorePhraseMeasures> inst = iter.next(); newdataset.add(new BasicDatum<>(inst.asFeatures(), inst.label()));
/** * Scales the values of each feature linearly using the min and max values * found in the training set. NOTE1: Not guaranteed to be between 0 and 1 for * a test datum. NOTE2: Also filters out features from the datum that are not * seen at training time. * * @param datum * @return a new datum */ public RVFDatum<L, F> scaleDatum(RVFDatum<L, F> datum) { // scale this dataset before scaling the datum if (minValues == null || maxValues == null) scaleFeatures(); Counter<F> scaledFeatures = new ClassicCounter<>(); for (F feature : datum.asFeatures()) { int fID = this.featureIndex.indexOf(feature); if (fID >= 0) { double oldVal = datum.asFeaturesCounter().getCount(feature); double newVal; if (minValues[fID] != maxValues[fID]) newVal = (oldVal - minValues[fID]) / (maxValues[fID] - minValues[fID]); else newVal = oldVal; scaledFeatures.incrementCount(feature, newVal); } } return new RVFDatum<>(scaledFeatures, datum.label()); }
/**
 * Fits a linear model on top of the given classifier's scores, to be used for
 * turning classifier outputs into probabilities.
 *
 * For every datum of the dataset, the classifier's score counter (with the
 * {@code null} key incremented as an extra feature) becomes the feature vector
 * of a new datum carrying the original label. A linear classifier is then
 * trained on these datums with a {@code LogPriorType.NULL} prior.
 *
 * @param classifier the classifier whose scores are used as features
 * @param dataset the data to score
 * @return the linear classifier trained on the score datums
 */
private LinearClassifier<L, L> fitSigmoid(SVMLightClassifier<L, F> classifier, GeneralDataset<L, F> dataset) {
  RVFDataset<L, L> scoreDataset = new RVFDataset<>();
  for (int idx = 0; idx < dataset.size(); idx++) {
    RVFDatum<L, F> example = dataset.getRVFDatum(idx);
    Counter<L> classScores = classifier.scoresOf((Datum<L,F>)example);
    // Extra feature stored under the null key (looks like a bias term; confirm).
    classScores.incrementCount(null);
    RVFDatum<L, L> scoreDatum = new RVFDatum<>(classScores, example.label());
    scoreDataset.add(scoreDatum);
  }

  LinearClassifierFactory<L, L> sigmoidFactory = new LinearClassifierFactory<>();
  LogPrior noPrior = new LogPrior(LogPrior.LogPriorType.NULL);
  sigmoidFactory.setPrior(noPrior);
  return sigmoidFactory.trainClassifier(scoreDataset);
}
.forEach(x -> { synchronized (dataset) { distribution.incrementCount(x.label()); dataset.add(x);
/**
 * Renders this RVFDatum as text: its id, its feature counter and its label,
 * in that order, wrapped as {@code RVFDatum[...]}.
 *
 * @return the textual form of this datum
 */
@Override
public String toString() {
  final String idPart = "RVFDatum[id=" + id;
  final String featuresPart = ", features=" + asFeaturesCounter();
  final String labelPart = ",label=" + label();
  return idPart + featuresPart + labelPart + "]";
}
/**
 * Renders this RVFDatum as text: its feature counter followed by its label,
 * wrapped as {@code RVFDatum[...]}.
 *
 * @return the textual form of this datum
 */
@Override
public String toString() {
  final String featuresPart = "RVFDatum[features=" + asFeaturesCounter();
  final String labelPart = ",label=" + label();
  return featuresPart + labelPart + "]";
}
/**
 * Renders this RVFDatum as text: the result of {@code asFeatures()} followed
 * by its label, wrapped as {@code RVFDatum[...]}.
 *
 * @return the textual form of this datum
 */
@Override
public String toString() {
  final String featuresPart = "RVFDatum[features=" + asFeatures();
  final String labelPart = ",label=" + label();
  return featuresPart + labelPart + "]";
}
/**
 * Convenience wrapper: unpacks the datum's feature counter and label and
 * delegates to {@code probabilityOf}.
 *
 * @param example the datum to score
 * @return whatever {@code probabilityOf} returns for the datum's features and label
 */
private double probabilityOfRVFDatum(RVFDatum<L, F> example) {
  Counter<F> featureCounts = example.asFeaturesCounter();
  L goldLabel = example.label();
  return probabilityOf(featureCounts, goldLabel);
}
/**
 * Writes this dataset to the given writer, one datum per line. Each line
 * starts with the index of the datum's label, followed by one
 * {@code " featureIndex:value"} pair per feature in the datum's counter.
 * Values are formatted with {@code Locale.ENGLISH}.
 *
 * @param writer destination for the output; it is not flushed or closed here
 */
public void writeSVMLightFormat(PrintWriter writer) {
  for (RVFDatum<L, F> example : this) {
    int labelId = this.labelIndex.indexOf(example.label());
    writer.print(labelId);

    Counter<F> featureCounts = example.asFeaturesCounter();
    for (F feat : featureCounts.keySet()) {
      double value = featureCounts.getCount(feat);
      int featId = this.featureIndex.indexOf(feat);
      writer.format(Locale.ENGLISH, " %s:%f", featId, value);
    }
    writer.println();
  }
}
/**
 * Writes this dataset to the given writer, one datum per line. Each line
 * starts with the index of the datum's label, followed by one
 * {@code " featureIndex:value"} pair per feature in the datum's counter.
 *
 * @param writer destination for the output; it is not flushed or closed here
 */
public void writeSVMLightFormat(PrintWriter writer) {
  for (RVFDatum<L, F> datum : this) {
    writer.print(this.labelIndex.indexOf(datum.label()));
    Counter<F> features = datum.asFeaturesCounter();
    for (F feature : features.keySet()) {
      double count = features.getCount(feature);
      // Pin the locale: with the platform default, %f may print a decimal comma
      // (e.g. "1,500000"), which is not a valid number in this file format.
      writer.format(java.util.Locale.ENGLISH, " %s:%f", this.featureIndex.indexOf(feature), count);
    }
    writer.println();
  }
}