/** Scores a plain Datum by first lifting it into real-valued (RVF) form. */
public ClassicCounter<L> scoresOf(Datum<L, F> example) {
  return scoresOf(new RVFDatum<>(example));
}
/** Classifies a plain Datum by first lifting it into real-valued (RVF) form. */
public L classOf(Datum<L, F> example) {
  return classOf(new RVFDatum<>(example));
}
public static RVFDatum<String, String> svmLightLineToRVFDatum(String l) { l = l.replaceFirst("#.*$", ""); // remove any trailing comments String[] line = l.split("\\s+"); ClassicCounter<String> features = new ClassicCounter<>(); for (int i = 1; i < line.length; i++) { String[] f = line[i].split(":"); if (f.length != 2) { throw new IllegalArgumentException("Bad data format: " + l); } double val = Double.parseDouble(f[1]); features.incrementCount(f[0], val); } return new RVFDatum<>(features, line[0]); }
/**
 * Scores the probability that the given mention is a "true" instance,
 * as judged by the underlying model {@code rf} over the extracted features.
 * Any exception raised during feature extraction or scoring is rethrown
 * as a RuntimeException.
 */
public double probabilityOf(Mention p, Set<Mention> shares, Set<String> neStrings, Dictionaries dict, Properties props) {
  try {
    // The datum's label is never consulted when scoring; false is a placeholder.
    RVFDatum<Boolean, String> datum =
        new RVFDatum<>(extractFeatures(p, shares, neStrings, dict, props), Boolean.FALSE);
    return rf.probabilityOfTrue(datum);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
/**
 * Get the sentiment of a sentence.
 *
 * @param sentence The sentence as a core map.
 *                 POS tags and Lemmas are a prerequisite.
 *                 See {@link edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation} and
 *                 {@link edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation}.
 *
 * @return The sentiment class of this sentence.
 */
public SentimentClass classify(CoreMap sentence) {
  // Featurize the sentence and hand the (unlabeled) datum to the classifier.
  RVFDatum<SentimentClass, String> datum = new RVFDatum<>(featurize(sentence));
  return impl.classOf(datum);
}
/**
 * Builds a training datum for a binary relation mention, collapsing any
 * label other than {@code positiveLabel} to {@link RelationMention#UNRELATED}.
 *
 * @return the datum, or null when the relation does not have exactly two arguments
 */
public Datum<String,String> createDatum(RelationMention rel, String positiveLabel) {
  if (rel.getArgs().size() != 2) {
    return null;
  }
  Counter<String> features = new ClassicCounter<>();
  addFeatures(features, rel, featureList);
  String type = rel.getType();
  String label = type.equals(positiveLabel) ? type : RelationMention.UNRELATED;
  return new RVFDatum<>(features, label);
}
/**
 * Given a set of vectors, and a mapping from each vector to its class label,
 * generates the sets of instances used to perform classifications and returns
 * the corresponding K-NN classifier.
 *
 * NOTE: if l2NormalizeVectors is T, creates a copy and applies L2Normalize to it;
 * the caller's vectors are never mutated.
 *
 * NOTE(review): labelMap is declared {@code Map<V, K>} but is queried with a
 * {@code Counter<V>} key below; this compiles only because Map.get takes Object.
 * Confirm with callers whether the intended key type is {@code Counter<V>}.
 */
public KNNClassifier<K,V> train(Collection<Counter<V>> vectors, Map<V, K> labelMap) {
  KNNClassifier<K, V> classifier = new KNNClassifier<>(k, weightedVotes, l2NormalizeVectors);
  Collection<RVFDatum<K, V>> instances = new ArrayList<>();
  for (Counter<V> vector : vectors) {
    K label = labelMap.get(vector);
    Counter<V> feats = l2NormalizeVectors
        ? Counters.L2Normalize(new ClassicCounter<>(vector))
        : vector;
    instances.add(new RVFDatum<>(feats, label));
  }
  classifier.addInstances(instances);
  return classifier;
}
/** * Returns a counter for the log probability of each of the classes * looking at the the sum of e^v for each count v, should be 1 * Note: Uses SloppyMath.logSum which isn't exact but isn't as * offensively slow as doing a series of exponentials */ @Override public Counter<L> logProbabilityOf(RVFDatum<L, F> example) { if (platt == null) { throw new UnsupportedOperationException("If you want to ask for the probability, you must train a Platt model!"); } Counter<L> scores = scoresOf(example); scores.incrementCount(null); Counter<L> probs = platt.logProbabilityOf(new RVFDatum<>(scores)); //System.out.println(scores+" "+probs); return probs; } }
/** * Returns a counter for the log probability of each of the classes * looking at the the sum of e^v for each count v, should be 1 * Note: Uses SloppyMath.logSum which isn't exact but isn't as * offensively slow as doing a series of exponentials */ @Override public Counter<L> logProbabilityOf(Datum<L, F> example) { if (platt == null) { throw new UnsupportedOperationException("If you want to ask for the probability, you must train a Platt model!"); } Counter<L> scores = scoresOf(example); scores.incrementCount(null); Counter<L> probs = platt.logProbabilityOf(new RVFDatum<>(scores)); //System.out.println(scores+" "+probs); return probs; }
public static <L,L2,F> Datum<L2,F> mapDatum(Datum<L,F> d, Map<L,L2> labelMapping, L2 defaultLabel) { // TODO: How to copy datum? L2 newLabel = labelMapping.get(d.label()); if (newLabel == null) { newLabel = defaultLabel; } if (d instanceof RVFDatum) { return new RVFDatum<>(((RVFDatum<L, F>) d).asFeaturesCounter(), newLabel); } else { return new BasicDatum<>(d.asFeatures(), newLabel); } }
/**
 * Builds a datum for a binary relation mention, labeled with the relation's
 * own type (no label collapsing).
 *
 * @return the datum, or null when the relation does not have exactly two arguments
 */
public Datum<String,String> createDatum(RelationMention rel, Logger logger) {
  if (rel.getArgs().size() != 2) {
    return null;
  }
  Counter<String> features = new ClassicCounter<>();
  addFeatures(features, rel, featureList, logger);
  return new RVFDatum<>(features, rel.getType());
}
/** * Read SVM-light formatted data into this dataset. * * A strict SVM-light format is expected, where labels and features are both * encoded as integers. These integers are converted into the dataset label * and feature types using the indexes stored in this dataset. * * @param file The file from which the data should be read. */ public void readSVMLightFormat(File file) { for (String line : IOUtils.readLines(file)) { line = line.replaceAll("#.*", ""); // remove any trailing comments String[] items = line.split("\\s+"); Integer label = Integer.parseInt(items[0]); Counter<F> features = new ClassicCounter<>(); for (int i = 1; i < items.length; i++) { String[] featureItems = items[i].split(":"); int feature = Integer.parseInt(featureItems[0]); double value = Double.parseDouble(featureItems[1]); features.incrementCount(this.featureIndex.get(feature), value); } this.add(new RVFDatum<>(features, this.labelIndex.get(label))); } }
/**
 * Given a CollectionValued Map of vectors, treats outer key as label for each
 * set of inner vectors.
 * NOTE: if l2NormalizeVectors is T, creates a copy of each vector and applies
 * l2Normalize to it; the caller's vectors are never mutated.
 */
public KNNClassifier<K,V> train(CollectionValuedMap<K, Counter<V>> vecBag) {
  KNNClassifier<K, V> classifier = new KNNClassifier<>(k, weightedVotes, l2NormalizeVectors);
  Collection<RVFDatum<K, V>> instances = new ArrayList<>();
  for (K label : vecBag.keySet()) {
    for (Counter<V> vector : vecBag.get(label)) {
      Counter<V> feats = l2NormalizeVectors
          ? Counters.L2Normalize(new ClassicCounter<>(vector))
          : vector;
      instances.add(new RVFDatum<>(feats, label));
    }
  }
  classifier.addInstances(instances);
  return classifier;
}
/**
 * @return the index-ed datum, rebuilt as an RVF datum where each of the
 *         example's (binary) features carries a count of 1.0
 */
@Override
public RVFDatum<L, F> getRVFDatum(int index) {
  ClassicCounter<F> counts = new ClassicCounter<>();
  for (F feature : featureIndex.objects(data[index])) {
    counts.incrementCount(feature);
  }
  return new RVFDatum<>(counts, labelIndex.get(labels[index]));
}
/**
 * @return the index-ed datum
 *
 * Note, this returns a new RVFDatum object, not the original RVFDatum
 * that was added to the dataset.
 */
@Override
public RVFDatum<L, F> getRVFDatum(int index) {
  ClassicCounter<F> counts = new ClassicCounter<>();
  int[] feats = data[index];
  double[] vals = values[index];
  for (int j = 0; j < feats.length; j++) {
    counts.incrementCount(featureIndex.get(feats[j]), vals[j]);
  }
  return new RVFDatum<>(counts, labelIndex.get(labels[index]));
}
/**
 * Classifies the sentiment of raw text by annotating it and scoring the
 * first sentence.
 *
 * @see SimpleSentiment#classify(CoreMap)
 */
public SentimentClass classify(String text) {
  Annotation ann = new Annotation(text);
  pipeline.get().annotate(ann);
  // Only the first sentence of the annotation is classified.
  CoreMap sentence = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0);
  RVFDatum<SentimentClass, String> datum = new RVFDatum<>(featurize(sentence));
  return impl.classOf(datum);
}
/**
 * Mostly just an alias, but make sure our featurizer is serializable!
 *
 * Maps a (fromState, action, toState) clause-splitting transition to a
 * counter of string features.
 */
public interface Featurizer extends Function<Triple<ClauseSplitterSearchProblem.State, ClauseSplitterSearchProblem.Action, ClauseSplitterSearchProblem.State>, Counter<String>>, Serializable {
  /** @return whether the given featurized transition represents a simple split. */
  boolean isSimpleSplit(Counter<String> feats);
}
private RVFDatum<String,String> makeRVFDatumFromStrings(String[] strings) { if (globalFlags.featureFormat) { ClassicCounter<String> theFeatures = new ClassicCounter<>(); for (int i = 0; i < strings.length; i++) { if (i != globalFlags.goldAnswerColumn) { if (isRealValued(flags[i])) { addFeatureValue(strings[i], flags[i], theFeatures); } else { theFeatures.setCount(strings[i], 1.0); } } } return new RVFDatum<>(theFeatures, strings[globalFlags.goldAnswerColumn]); } else { //logger.info("Read in " + strings); return makeRVFDatum(strings); } }
public RVFDatum<L, F> scaleDatumGaussian(RVFDatum<L, F> datum) { // scale this dataset before scaling the datum if (means == null || stdevs == null) scaleFeaturesGaussian(); Counter<F> scaledFeatures = new ClassicCounter<>(); for (F feature : datum.asFeatures()) { int fID = this.featureIndex.indexOf(feature); if (fID >= 0) { double oldVal = datum.asFeaturesCounter().getCount(feature); double newVal; if (stdevs[fID] != 0) newVal = (oldVal - means[fID]) / stdevs[fID]; else newVal = oldVal; scaledFeatures.incrementCount(feature, newVal); } } return new RVFDatum<>(scaledFeatures, datum.label()); }
/**
 * Builds a sigmoid (Platt) model to turn the classifier outputs into
 * probabilities: each training example's raw scores (plus a bias feature
 * keyed by null) become one datum in a secondary dataset, over which a
 * linear classifier with a null prior is trained.
 */
private LinearClassifier<L, L> fitSigmoid(SVMLightClassifier<L, F> classifier, GeneralDataset<L, F> dataset) {
  RVFDataset<L, L> plattDataset = new RVFDataset<>();
  int size = dataset.size();
  for (int i = 0; i < size; i++) {
    RVFDatum<L, F> d = dataset.getRVFDatum(i);
    Counter<L> scores = classifier.scoresOf((Datum<L,F>) d);
    scores.incrementCount(null); // bias feature keyed by null
    plattDataset.add(new RVFDatum<>(scores, d.label()));
  }
  LinearClassifierFactory<L, L> factory = new LinearClassifierFactory<>();
  factory.setPrior(new LogPrior(LogPrior.LogPriorType.NULL));
  return factory.trainClassifier(plattDataset);
}