/**
 * Creates an example with a zero frequency and a freshly constructed, empty-argument {@code BOW}.
 */
Example() {
    freq = 0;
    bow = new BOW();
}
/**
 * Builds a {@code BOW} from the lower-cased form of every token.
 *
 * @param tokenArray tokens whose forms are added, in array order
 * @return a new {@code BOW} to which each lower-cased token form has been added
 */
private BOW createBow(Token[] tokenArray) {
    BOW result = new BOW();
    for (Token token : tokenArray) {
        String form = token.getForm();
        result.add(form.toLowerCase());
    }
    return result;
}
/**
 * Builds a single {@code BOW} from the left context, form and right context of every row.
 *
 * <p>Each of the three fields is lower-cased and split with {@code spacePattern}; the resulting
 * pieces are added to the same {@code BOW}, which is logged at debug level before being returned.
 *
 * @param list rows indexed by the {@code OneExamplePerSenseExtractor} column constants
 * @return the {@code BOW} accumulated over all rows
 */
BOW createBow(List<String[]> list) {
    BOW result = new BOW();
    for (String[] row : list) {
        String[] leftTokens =
                spacePattern.split(row[OneExamplePerSenseExtractor.LEFT_CONTEXT_INDEX].toLowerCase());
        String[] formTokens =
                spacePattern.split(row[OneExamplePerSenseExtractor.FORM_INDEX].toLowerCase());
        String[] rightTokens =
                spacePattern.split(row[OneExamplePerSenseExtractor.RIGHT_CONTEXT_INDEX].toLowerCase());
        result.addAll(leftTokens);
        result.addAll(formTokens);
        result.addAll(rightTokens);
    }
    logger.debug(result);
    return result;
}
// Builds a BOW from s, maps it with lsm.mapDocument, then maps that result with lsm.mapPseudoDocument.
BOW bow = new BOW(s); Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d);
public NgramComparator(String page, LSSimilarity lss) throws IOException, MalformedURLException { logger.info("parsing " + page + "..."); BOW bow = new BOW(getText(new File(page)));
/**
 * Compares a page against a set of concept pages and then starts the interactive session.
 *
 * <p>The page and every concept URL are converted with {@code toText} and wrapped in a
 * {@code BOW}; the size of each {@code BOW} and the result of {@code lss.compare} for each
 * concept are logged at info level. Finally {@code interactive} is called with the concept
 * URLs and their {@code BOW}s.
 *
 * @param page    URL of the page to compare
 * @param concept URLs of the concept pages
 * @param lss     similarity used for the comparison
 * @throws IOException declared for the text extraction and the interactive step
 */
public WebPageComparator(URL page, URL[] concept, LSSimilarity lss) throws IOException {
    logger.info("parsing " + page + "...");
    BOW pageBow = new BOW(toText(page));
    logger.info("size bow " + pageBow.size());

    BOW[] conceptBows = new BOW[concept.length];
    for (int k = 0; k < concept.length; k++) {
        logger.info("parsing concept " + concept[k]);
        BOW conceptBow = new BOW(toText(concept[k]));
        conceptBows[k] = conceptBow;
        logger.info("size concept " + k + " " + conceptBow.size());

        float similarity = lss.compare(pageBow, conceptBow);
        logger.info(k + " = " + similarity);
    }

    interactive(concept, conceptBows, lss);
} // end constructor
private float[] compareAll(BOW bow, String[] s, LSSimilarity lss) throws IOException, MalformedURLException { URL[] concept = getConceptURL(s); BOW[] bows = new BOW[concept.length]; float[] f = new float[concept.length]; for (int i = 0; i < concept.length; i++) { //logger.info("parsing concept " + concept[i]); bows[i] = new BOW(toText(concept[i])); //logger.info("concept " + bows[i]); //logger.info("size concept " + i + " " + bows[i].size()); f[i] = lss.compare(bow, bows[i]); logger.info(i + ", " + concept[i] + ", " + f[i]); //logger.info(i + " = " + f[i]); } return f; } // end compareAll
// Reads one line from myInput, lower-cases it, interprets it as a URL and builds a BOW from
// toText of that URL; the BOW's size and the BOW itself are then logged at info level.
// NOTE(review): if myInput.readLine() can return null (as BufferedReader.readLine does at end
// of stream) the chained call below throws NullPointerException — confirm the type of myInput.
// NOTE(review): lower-casing the whole query also lower-cases the URL path — confirm intended.
String query = myInput.readLine().toString().toLowerCase(); BOW bow = new BOW(toText(new URL(query))); logger.info("size bow " + bow.size()); logger.info("bow " + bow);
/**
 * Maps the left and right contexts of all rows to a normalized pseudo-document vector.
 *
 * <p>Both context fields of every row are lower-cased, split with {@code spacePattern} and
 * added to one {@code BOW}. The {@code BOW} is mapped with {@code lsm.mapDocument}, the result
 * is mapped with {@code lsm.mapPseudoDocument}, and both vectors are normalized afterwards.
 *
 * @param list rows indexed by the {@code OneExamplePerSenseExtractor} column constants
 * @return the normalized result of {@code lsm.mapPseudoDocument}
 */
Vector createVector(List<String[]> list) {
    BOW contextBow = new BOW();
    for (String[] row : list) {
        String[] leftTokens =
                spacePattern.split(row[OneExamplePerSenseExtractor.LEFT_CONTEXT_INDEX].toLowerCase());
        String[] rightTokens =
                spacePattern.split(row[OneExamplePerSenseExtractor.RIGHT_CONTEXT_INDEX].toLowerCase());
        contextBow.addAll(leftTokens);
        contextBow.addAll(rightTokens);
    }

    Vector documentVector = lsm.mapDocument(contextBow);
    Vector pseudoDocumentVector = lsm.mapPseudoDocument(documentVector);
    documentVector.normalize();
    pseudoDocumentVector.normalize();
    return pseudoDocumentVector;
}
/**
 * Builds a {@code BOW} from the tokenized, lower-cased context fields of one instance.
 *
 * <p>Field {@code s[2]} is always used; field {@code s[4]} is added only when the array has
 * exactly five elements.
 *
 * @param s instance fields; must contain at least three elements
 * @return a new {@code BOW} holding the tokens of the used fields
 */
private BOW createBow(String[] s) {
    Tokenizer tokenizer = HardTokenizer.getInstance();
    BOW contextBow = new BOW();

    contextBow.addAll(tokenizer.stringArray(s[2].toLowerCase()));

    boolean hasRightContext = s.length == 5;
    if (hasRightContext) {
        contextBow.addAll(tokenizer.stringArray(s[4].toLowerCase()));
    }
    return contextBow;
}
@Override public void processLine(String line) { //To change body of implemented methods use File | Settings | File Templates. String[] tokens = spacePattern.split(line); if (tokens.length < 2) { return; } try { BOW bow = new BOW(); for (int i = 1; i < tokens.length; i++) { //logger.debug(i + "\t'" + tokens[i] + "'\t" + tokens[0]); bow.add(tokens[i].toLowerCase()); } Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d); d.normalize(); pd.normalize(); synchronized (this) { vectorWriter.print(tokens[0]); //vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); //vectorWriter.print(bow.toSingleLine()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.print(pd.toString()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.println(d.toString()); } } catch (Exception e) { logger.error("Error processing page " + tokens[0]); } }
/**
 * Creates an example for {@code page} from its context rows.
 *
 * <p>{@code freq} is set to the number of rows and added to {@code totalFreq}. The left and
 * right context of every row are lower-cased, split with {@code spacePattern} and added to
 * {@code bow}; a row that fails is logged and skipped. {@code bowVector} is obtained from
 * {@code lsm.mapDocument(bow)} and {@code lsVector} from
 * {@code lsm.mapPseudoDocument(bowVector)}. When {@code normalized} is set, {@code bowVector}
 * is normalized before it is mapped to {@code lsVector}, and {@code lsVector} is normalized
 * afterwards.
 *
 * @param page the page this example belongs to
 * @param list context rows indexed by {@code LEFT_CONTEXT_INDEX} and {@code RIGHT_CONTEXT_INDEX}
 */
Example(String page, List<String[]> list) {
    this.page = page;
    freq = list.size();
    totalFreq += freq;
    bow = new BOW();
    for (String[] s : list) {
        try {
            String[] leftContext = spacePattern.split(s[LEFT_CONTEXT_INDEX].toLowerCase());
            String[] rightContext = spacePattern.split(s[RIGHT_CONTEXT_INDEX].toLowerCase());
            bow.addAll(leftContext);
            bow.addAll(rightContext);
        } catch (Exception e) {
            // Best-effort: skip the offending row, but record which page it belonged to and
            // keep the stack trace.
            logger.error("Error processing context of page " + page, e);
        }
    }
    bowVector = lsm.mapDocument(bow);
    if (normalized) {
        bowVector.normalize();
    }
    // The pseudo-document is mapped from the (possibly already normalized) bowVector.
    lsVector = lsm.mapPseudoDocument(bowVector);
    if (normalized) {
        lsVector.normalize();
    }
}
private float compare(BOW bow, String[] s, LSSimilarity lss) throws IOException, MalformedURLException { URL[] concept = getConceptURL(s); BOW[] bows = new BOW[concept.length]; float[] f = new float[concept.length]; for (int i = 0; i < concept.length; i++) { //logger.info("parsing concept " + concept[i]); bows[i] = new BOW(toText(concept[i])); //logger.info("concept " + bows[i]); //logger.info("size concept " + i + " " + bows[i].size()); f[i] = lss.compare(bow, bows[i]); logger.info(i + ", " + concept[i] + ", " + f[i]); //logger.info(i + " = " + f[i]); } int i = maxIndex(f); if (i != -1) { logger.info("max = " + concept[i] + ", " + f[i]); return f[i]; } return 0; } // end compare
@Override public void contentPage(String text, String title, int wikiID) { try { WikiMarkupParser wikiMarkupParser = WikiMarkupParser.getInstance(); //logger.debug(title + "\t" + wikiID); String[] prefixes = {filePrefix, imagePrefix}; ParsedPage parsedPage = wikiMarkupParser.parsePage(text, prefixes); String page = tokenizedText(parsedPage, title); BOW bow = new BOW(page.toLowerCase()); Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d); d.normalize(); pd.normalize(); synchronized (this) { vectorWriter.print(title); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.print(pd.toString()); vectorWriter.print(CharacterTable.HORIZONTAL_TABULATION); vectorWriter.println(d.toString()); } } catch (Exception e) { logger.error("Error processing page " + title + " (" + wikiID + ")"); } }
public Category[] classify(String text) { String tokenizedText = tokenizer.tokenizedString(text); BOW bow = new BOW(tokenizedText.toLowerCase()); Vector d = lsm.mapDocument(bow); Vector pd = lsm.mapPseudoDocument(d); d.normalize(); pd.normalize(); Node[] nd = d.toNodeArray(); Node[] npd = pd.toNodeArray(); //logger.debug(Node.toString(nd)); //logger.debug(Node.toString(npd)); Category[] categoryArray = new Category[pageCategoryMap.size()]; Iterator<String> it = pageCategoryMap.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { String page = it.next(); Entry entry = pageCategoryMap.get(page); Node[][] nodes = entry.getNodes(); Node[] cd = nodes[1]; Node[] cpd = nodes[0]; double cbow = Node.dot(nd, cd); double clsa = Node.dot(npd, cpd); double combo = (cbow + clsa) / 2; categoryArray[i] = new Category(entry.getCategory(), cbow, clsa); //logger.debug(i + "\t" + categories[i] + "\t" + cbow + "\t" + clsa + "\t" + combo); } Arrays.sort(categoryArray); return categoryArray; }
private Node[][] mapInstance(String[] s) { Tokenizer tokenizer = HardTokenizer.getInstance(); BOW bow = new BOW(); String[] left = tokenizer.stringArray(s[2].toLowerCase()); bow.addAll(left); if (s.length == 5) { String[] right = tokenizer.stringArray(s[4].toLowerCase()); bow.addAll(right); } logger.debug(bow); Vector bowVector = lsm.mapDocument(bow); Vector lsVector = lsm.mapPseudoDocument(bowVector); if (normalized) { bowVector.normalize(); lsVector.normalize(); } logger.debug("bow\t" + bowVector); //logger.debug("lsi\t" + lsVector); Node[][] nodes = new Node[2][]; nodes[0] = bowVector.toNodeArray(); nodes[1] = lsVector.toNodeArray(); return nodes; }