/**
 * Builds a bag of words from a record's fields.
 * The lower-cased text at index 2 is always tokenized and added; the
 * lower-cased text at index 4 is added as well, but only when the record
 * has exactly five fields.
 *
 * @param s the record fields; index 2 must exist
 * @return a new BOW containing the tokens of the selected fields
 */
private BOW createBow(String[] s) {
    Tokenizer tokenizer = HardTokenizer.getInstance();
    BOW bow = new BOW();

    bow.addAll(tokenizer.stringArray(s[2].toLowerCase()));

    // Records without exactly five fields contribute only the field at index 2.
    if (s.length != 5) {
        return bow;
    }

    bow.addAll(tokenizer.stringArray(s[4].toLowerCase()));
    return bow;
}
public String toString() { StringBuilder sb = new StringBuilder(); Iterator<String> it = iterator(); sb.append("bow(\n"); while (it.hasNext()) { sb.append(w); sb.append("\t"); sb.append(getFrequency(w)); sb.append("\t"); sb.append(rawFrequency(w)); sb.append("\t"); sb.append(logarithmicFrequency(w)); sb.append("\t"); sb.append(booleanFrequency(w)); sb.append("\t"); sb.append(augmentedFrequency(w)); sb.append("\n");
/**
 * Returns the frequency of a term, computed by the scheme selected in
 * {@code termFrequencyType}. Any value other than the raw, boolean or
 * augmented selectors yields the logarithmic frequency.
 *
 * @param term the term to weigh
 * @return the frequency of {@code term} under the selected scheme
 */
public double tf(String term) {
    switch (termFrequencyType) {
        case RAW_TERM_FREQUENCY:
            return rawFrequency(term);
        case BOOLEAN_TERM_FREQUENCY:
            return booleanFrequency(term);
        case AUGMENTED_TERM_FREQUENCY:
            return augmentedFrequency(term);
        case LOGARITHMIC_TERM_FREQUENCY:
        default:
            // Logarithmic weighting doubles as the fallback for unknown selectors.
            return logarithmicFrequency(term);
    }
}
/**
 * Creates an example with a freshly constructed BOW and a frequency of zero.
 */
Example() {
    freq = 0;
    bow = new BOW();
}
/**
 * Parses a page and a set of concept pages into bags of words, logs the
 * similarity between the page and each concept, then calls
 * {@code interactive} with the concept URLs, their bags of words and the
 * similarity measure.
 *
 * @param page    the URL of the page to compare
 * @param concept the URLs of the concept pages
 * @param lss     the similarity used to compare two bags of words
 * @throws IOException declared by this constructor; presumably raised while reading a URL
 */
public WebPageComparator(URL page, URL[] concept, LSSimilarity lss) throws IOException {
    logger.info("parsing " + page + "...");
    BOW pageBow = new BOW(toText(page));
    logger.info("size bow " + pageBow.size());

    BOW[] conceptBows = new BOW[concept.length];
    int i = 0;
    while (i < concept.length) {
        logger.info("parsing concept " + concept[i]);
        conceptBows[i] = new BOW(toText(concept[i]));
        logger.info("size concept " + i + " " + conceptBows[i].size());

        float f = lss.compare(pageBow, conceptBows[i]);
        logger.info(i + " = " + f);
        i++;
    }

    interactive(concept, conceptBows, lss);
} // end constructor
public void addAll(String[] words) { //logger.debug(Arrays.toString(words)); for (int i = 0; i < words.length; i++) { add(words[i]); } }
int[] indexes = new int[bow.size()]; double[] values = new double[bow.size()]; String term; int index; double tfIdf; int current = 0; Iterator<String> it = bow.termSet().iterator(); for (int i = 0; it.hasNext(); i++) { term = it.next(); tf = bow.getFrequency(term); tfIdf = 1.0 + Math.log10(tf); if (b) {
public String toSingleLine() { StringBuilder sb = new StringBuilder(); Iterator<String> it = iterator(); String w; int f; if (it.hasNext()) { w = it.next(); f = getFrequency(w); sb.append(w); sb.append(":"); sb.append(f); } while (it.hasNext()) { w = it.next(); f = getFrequency(w); sb.append(" "); sb.append(w); sb.append(":"); sb.append(f); } // end while return sb.toString(); }
/** * Returns a document in the VSM. */ public Vector mapDocument(BOW bow, boolean b) { //logger.info("lsm.mapDocument " + b); SparseVector vector = new SparseVector(); Iterator<String> it = bow.termSet().iterator(); for (int i = 0; it.hasNext(); i++) { //logger.info(i + " " + t[i]); String term = it.next(); int index = termIndex.get(term); if (index != -1) { int tf = bow.getFrequency(term); float tfIdf = (float) (log2(tf)); if (b) { tfIdf *= Iidf[index]; } //logger.info(term + " ==> " + index + ", tf.idf = " + tf + "(" + (log2(tf)) + ") * " + Iidf[index] + " = " + tfIdf); vector.add(index, tfIdf); } } // end for return vector; } // end map
/**
 * Adds the given tokens to this object's bag of words by delegating to
 * {@code bow.addAll}.
 *
 * @param tokens the tokens to add
 */
void add(String[] tokens) { bow.addAll(tokens); }
/**
 * Renders the content as one line of {@code word:frequency} pairs separated
 * by single spaces, visiting frequencies in the key order of the map
 * returned by {@code getSortedMap()} and, within a frequency, words in list order.
 *
 * @return the sorted single-line representation; empty when the map is empty
 */
public String toSortedLine() {
    SortedMap<Integer, List<String>> byFrequency = getSortedMap();
    StringBuilder line = new StringBuilder();
    for (int frequency : byFrequency.keySet()) {
        for (String word : byFrequency.get(frequency)) {
            // Every pair contains at least ":" and a number, so a non-empty
            // builder means at least one pair has already been written.
            if (line.length() > 0) {
                line.append(" ");
            }
            line.append(word).append(":").append(frequency);
        }
    }
    return line.toString();
}
/**
 * Returns the logarithmic frequency of a term: 0 when the term has no
 * counter, 1 when its count is exactly 1, and {@code log2(count + 1)} otherwise.
 *
 * @param term the term to look up
 * @return the logarithmic frequency of {@code term}
 */
public double logarithmicFrequency(String term) {
    Counter counter = map.get(term);
    if (counter == null) {
        return 0;
    }
    if (counter.get() == 1) {
        return 1;
    }
    return log2(counter.get() + 1);
}
// Build a BOW from s, map it to a Vector with lsm.mapDocument, then pass that
// vector to lsm.mapPseudoDocument to obtain pd.
BOW bow = new BOW(s); Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d);
public void addAll(String[] words, int from, int to) { //logger.debug(Arrays.toString(words)); for (int i = from; i < to; i++) { add(words[i]); } }
int[] indexes = new int[bow.size()]; float[] values = new float[bow.size()]; String term; int index; float tfIdf; int current = 0; Iterator<String> it = bow.termSet().iterator(); for (int i = 0; it.hasNext(); i++) { term = it.next(); tf = bow.getFrequency(term); tfIdf = (float) (1.0 + Math.log10(tf)); if (b) {
/**
 * Groups the iterated words by their frequency.
 * The result maps each frequency to the list of words having it, with keys
 * ordered by {@code IntegerComparator}; within a list, words keep iteration order.
 *
 * @return a new sorted map from frequency to the words with that frequency
 */
SortedMap<Integer, List<String>> getSortedMap() {
    SortedMap<Integer, List<String>> grouped =
            new TreeMap<Integer, List<String>>(new IntegerComparator());
    Iterator<String> words = iterator();
    while (words.hasNext()) {
        String word = words.next();
        int frequency = getFrequency(word);

        List<String> bucket = grouped.get(frequency);
        if (bucket == null) {
            // First word seen with this frequency: start its bucket.
            bucket = new ArrayList<String>();
            grouped.put(frequency, bucket);
        }
        bucket.add(word);
    }
    return grouped;
} // end sort
int tf; double tfIdf; Iterator<String> it = bow.termSet().iterator(); for (int i = 0; it.hasNext(); i++) { term = it.next(); tf = bow.getFrequency(term); tfIdf = log2(tf); if (b) {
/**
 * Builds one bag of words from a list of records.
 * For every record, the left-context, form and right-context fields are
 * lower-cased, split with {@code spacePattern} and added in that order.
 * The resulting bag is logged at debug level before being returned.
 *
 * @param list the records, indexed by the {@code OneExamplePerSenseExtractor} field constants
 * @return a new BOW holding the tokens of all records
 */
BOW createBow(List<String[]> list) {
    BOW bow = new BOW();
    for (String[] fields : list) {
        // Split all three fields before adding any of them.
        String[] left = spacePattern.split(
                fields[OneExamplePerSenseExtractor.LEFT_CONTEXT_INDEX].toLowerCase());
        String[] target = spacePattern.split(
                fields[OneExamplePerSenseExtractor.FORM_INDEX].toLowerCase());
        String[] right = spacePattern.split(
                fields[OneExamplePerSenseExtractor.RIGHT_CONTEXT_INDEX].toLowerCase());

        bow.addAll(left);
        bow.addAll(target);
        bow.addAll(right);
    }
    logger.debug(bow);
    return bow;
}