/**
 * Returns the number of tokens known to this provider.
 *
 * <p>The value is read from {@code cfd}: it is the {@code getN()} result of
 * the frequency distribution registered under condition {@code 1}.
 * NOTE(review): if no distribution exists for condition 1 this lookup may
 * fail -- confirm how {@code getFrequencyDistribution} treats a missing
 * condition.
 *
 * @return {@code getN()} of the distribution stored under condition 1
 * @throws Exception as declared by this method's signature
 */
@Override
public long getNrOfTokens() throws Exception {
    final long unigramTotal = cfd.getFrequencyDistribution(1).getN();
    return unigramTotal;
}
}
/**
 * Registers every element of the given iterable once.
 *
 * <p>The iterable is traversed a single time and each element is handed to
 * {@code addSample} together with the increment {@code 1}.
 *
 * @param iterable the samples to register, processed in iteration order
 */
public void incAll(Iterable<T> iterable) {
    final java.util.Iterator<T> cursor = iterable.iterator();
    while (cursor.hasNext()) {
        addSample(cursor.next(), 1);
    }
}
/**
 * Renders this object as {@code token (pos)}.
 *
 * <p>Both fields are inserted through string concatenation, so a
 * {@code null} field is printed as {@code "null"} instead of causing a
 * {@link NullPointerException}.
 *
 * @return the token followed by the pos value in parentheses
 */
@Override
public String toString() {
    // Concatenation converts null to "null"; an explicit pos.toString() would throw.
    return token + " (" + pos + ")";
}
}
/**
 * Feeds a whitespace-split example sentence into a
 * {@code FrequencyDistribution} and checks the values reported by
 * {@code getN()}, {@code getB()} and {@code getCount(...)}.
 */
@Test
public void cfdTest() {
    final String text = "This is a first test that contains a first test example";
    final List<String> words = Arrays.asList(text.split(" "));

    final FrequencyDistribution<String> dist = new FrequencyDistribution<String>();
    dist.incAll(words);
    System.out.println(dist);

    // The sentence has 11 tokens, 8 of which are distinct.
    assertEquals(11, dist.getN());
    assertEquals(8, dist.getB());

    // A word that was never added, a word seen once, and a word seen twice.
    assertEquals(0, dist.getCount("humpelgrumpf"));
    assertEquals(1, dist.getCount("This"));
    assertEquals(2, dist.getCount("test"));
}
}
/**
 * Fills {@code cfd} with samples taken from the Brown corpus.
 *
 * <p>For every sentence returned by {@code BrownCorpus.getSentences()} and
 * every {@code n} from {@code minN} to {@code maxN} (inclusive), the samples
 * produced by {@code new NGramIterable(tokens, n, n)} are added to
 * {@code cfd} under the condition {@code n}.
 *
 * @param minN smallest condition value to fill
 * @param maxN largest condition value to fill (inclusive)
 * @throws IllegalArgumentException if {@code minN > maxN}
 * @throws Exception as declared by this constructor's signature
 */
public BrownProvider(int minN, int maxN) throws Exception {
    // Fail fast: reject bad arguments before any corpus object is created.
    if (minN > maxN) {
        throw new IllegalArgumentException("minN > maxN");
    }

    BrownCorpus brownCorpus = new BrownCorpus();
    cfd = new ConditionalFrequencyDistribution<Integer,String>();

    // Walk the corpus once and feed every requested n per sentence, rather
    // than re-reading all sentences for each n. Each condition still
    // receives its samples in sentence order, so the counts are the same.
    for (Sentence s : brownCorpus.getSentences()) {
        for (int n = minN; n <= maxN; n++) {
            cfd.addSamples(n, new NGramIterable(s.getTokens(), n, n));
        }
    }
}
/**
 * Adds all given samples to the frequency distribution stored for
 * condition {@code t}, creating that distribution first if none exists.
 *
 * <p>The field {@code n} is increased by the amount by which the
 * distribution's {@code getN()} value grew during this call.
 *
 * @param t       the condition whose distribution receives the samples
 * @param samples the samples to add; passed on to {@code incAll}
 */
public void addSamples(C t, Iterable<V> samples) {
    // Make sure a distribution is registered for this condition.
    if (!cfd.containsKey(t)) {
        cfd.put(t, new FrequencyDistribution<V>());
    }
    final FrequencyDistribution<V> dist = cfd.get(t);

    // Measure the distribution before and after to learn how much was added.
    final long before = dist.getN();
    dist.incAll(samples);
    final long growth = dist.getN() - before;

    this.n = this.n + growth;
}
/**
 * Timing experiment: calls {@code getFrequency} on a {@code Web1TProvider}
 * for every token of every sentence of every Brown corpus text and prints
 * the accumulated stopwatch time divided by 1000, followed by {@code "s"}.
 *
 * @param args unused
 * @throws Exception as declared by this method's signature
 */
public static void main(String[] args) throws Exception {
    final Web1TProvider provider = new Web1TProvider(new Locale("de"), 1);
    final BrownCorpus corpus = new BrownCorpus();

    // The stopwatch is started and immediately paused; it only runs while
    // a lookup is in progress, so corpus iteration is not measured.
    final StopWatch timer = new StopWatch();
    timer.start();
    timer.suspend();

    for (Text text : corpus.getTexts()) {
        for (Sentence sentence : text.getSentences()) {
            for (String token : sentence.getTokens()) {
                timer.resume();
                provider.getFrequency(token);
                timer.suspend();
            }
        }
    }

    final double elapsed = timer.getTime() / 1000.0;
    System.out.println(elapsed + "s");
}
}
/**
 * Returns the count stored for the given phrase.
 *
 * <p>The phrase length reported by {@code CoreUtils.getPhraseLength} is
 * used as the condition; if {@code cfd} has no such condition the result
 * is {@code 0}.
 *
 * @param phrase the phrase to look up
 * @return the count of {@code phrase} under its length condition, or
 *         {@code 0} when that condition is absent
 * @throws Exception as declared by this method's signature
 */
@Override
public long getFrequency(String phrase) throws Exception {
    final int length = CoreUtils.getPhraseLength(phrase);
    return cfd.hasCondition(length) ? cfd.getCount(length, phrase) : 0;
}
/**
 * Demo: builds a {@code FrequencyDistribution} from an example sentence
 * split with {@code StringUtils.split} and prints the counts reported for
 * {@code "example"} and {@code "is"}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String sentence = "This is a simple example sentence containing an example .";
    final List<String> words = Arrays.asList(StringUtils.split(sentence));
    final FrequencyDistribution<String> dist = new FrequencyDistribution<String>(words);

    System.out.println(dist.getCount("example"));
    System.out.println(dist.getCount("is"));
}
}
/**
 * Creates a distribution pre-filled from a map of conditions to samples.
 *
 * <p>After delegating to the no-argument constructor, every entry of the
 * map is passed to {@code addSamples(condition, samples)}.
 *
 * @param samples maps each condition to the samples to add for it
 */
public ConditionalFrequencyDistribution(Map<C,Iterable<V>> samples) {
    this();
    // Iterate over entries so that each key does not need a second lookup.
    for (Map.Entry<C,Iterable<V>> entry : samples.entrySet()) {
        addSamples(entry.getKey(), entry.getValue());
    }
}
/**
 * Returns the result of {@code getTokens()} joined with a single space as
 * separator.
 *
 * @return the space-separated tokens
 */
@Override
public String toString() {
    final String joinedTokens = StringUtils.join(getTokens(), ' ');
    return joinedTokens;
}
/**
 * Returns the result of {@code getSentences()} joined with a single space
 * as separator.
 *
 * @return the space-separated sentences
 */
@Override
public String toString() {
    final String joinedSentences = StringUtils.join(getSentences(), ' ');
    return joinedSentences;
}
/**
 * Returns the count of sample {@code u} under condition {@code t}.
 *
 * @param t the condition to look in
 * @param u the sample whose count is requested
 * @return the count reported by the distribution stored for {@code t}, or
 *         {@code 0} if no distribution is stored for that condition
 */
public long getCount(C t, V u) {
    // Unknown condition: nothing has been counted for it.
    if (!cfd.containsKey(t)) {
        return 0;
    }
    return cfd.get(t).getCount(u);
}
/**
 * Returns the frequency of the phrase divided by {@code cfd.getN()}.
 *
 * <p>NOTE(review): the denominator is the overall {@code getN()} of
 * {@code cfd}; if that total spans several conditions it is larger than the
 * number of samples of the phrase's own length -- confirm this is the
 * intended normalisation.
 *
 * @param phrase the phrase to look up
 * @return {@code getFrequency(phrase) / cfd.getN()}, or {@code 0} when
 *         {@code cfd.getN()} is zero
 * @throws Exception as declared by this method's signature
 */
@Override
public double getProbability(String phrase) throws Exception {
    final long frequency = getFrequency(phrase);
    final long total = cfd.getN();
    // Avoid dividing by zero when nothing has been counted.
    return (total == 0) ? 0 : (double) frequency / total;
}
/**
 * Lists every condition of {@code cfd} followed by its distribution.
 *
 * <p>For each key the output holds the key's string form, a line
 * separator, the string form of the mapped distribution and another line
 * separator.
 *
 * @return the concatenated listing; empty when there are no conditions
 */
@Override
public String toString() {
    final StringBuilder out = new StringBuilder();
    for (C condition : cfd.keySet()) {
        final String eol = System.getProperty("line.separator");
        out.append(condition.toString())
           .append(eol)
           .append(cfd.get(condition).toString())
           .append(eol);
    }
    return out.toString();
}
}
/**
 * Adds a single sample under condition {@code t}.
 *
 * <p>The sample is wrapped in a one-element list and handed to
 * {@code addSamples}.
 *
 * @param t      the condition the sample belongs to
 * @param sample the sample to add
 */
public void addSample(C t, V sample) {
    addSamples(t, java.util.Collections.singletonList(sample));
}
/**
 * Returns the result of {@code getTokens()} joined with single spaces and
 * enclosed in square brackets.
 *
 * @return {@code "["}, the space-separated tokens, then {@code "]"}
 */
public String getFormattedString() {
    final StringBuilder formatted = new StringBuilder("[");
    formatted.append(StringUtils.join(getTokens(), ' '));
    formatted.append("]");
    return formatted.toString();
}
/**
 * Returns the result of {@code getSentences()} joined with single spaces
 * and enclosed in square brackets.
 *
 * @return {@code "["}, the space-separated sentences, then {@code "]"}
 */
public String getFormattedString() {
    return new StringBuilder("[")
            .append(StringUtils.join(getSentences(), ' '))
            .append("]")
            .toString();
}
/**
 * Creates a distribution and registers every element of the given iterable.
 *
 * <p>Delegates to the no-argument constructor and then calls
 * {@code addSample(element, 1)} for each element in iteration order.
 *
 * @param iterable the initial samples
 */
public FrequencyDistribution(Iterable<T> iterable) {
    this();
    for (T element : iterable) {
        addSample(element, 1);
    }
}
/**
 * Registers one occurrence of the given sample by delegating to
 * {@code addSample(o, 1)}.
 *
 * @param o the sample to register
 */
public void inc(T o) {
    addSample(o, 1);
}