private static ArrayList<Word> postProcessSentence(ArrayList<Word> sent) { ArrayList<Word> newSent = new ArrayList<>(); for(Word word : sent) { if(newSent.size() > 0) { String prevWord = newSent.get(newSent.size()-1).toString(); String curWord = word.toString(); String prevChar = prevWord.substring(prevWord.length()-1); String curChar = curWord.substring(0,1); if(!isChinese(prevChar) && !isChinese(curChar)) { Word mergedWord = new Word(prevWord+curWord); newSent.set(newSent.size()-1, mergedWord); //printlnErr("merged: "+mergedWord); //printlnErr("merged: "+mergedWord+" from: "+prevWord+" and: "+curWord); continue; } } newSent.add(word); } return new ArrayList<>(newSent); }
/**
 * Tokenizes a date string into the instance's {@code tokens} list.
 * Hyphens are padded with spaces and commas dropped so that strings like
 * "3-4,2001" split cleanly, then PTBTokenizer performs the word split.
 *
 * @param inputDate the date string to tokenize; if {@code null}, a message is
 *                  printed and {@code tokens} is left empty
 */
private void tokenizeDate(String inputDate) {
    tokens = new ArrayList<>();
    if (inputDate == null) {
        System.out.println("Null input date");
        // BUG FIX: the original fell through and threw NullPointerException
        // in Pattern.matcher(inputDate); bail out with an empty token list.
        return;
    }
    // "[-]" is a single-character class matching '-'; surround each hyphen
    // with spaces. (Equivalent to the original one-shot Pattern/Matcher.)
    String str = inputDate.replaceAll("[-]", " - ");
    str = str.replaceAll(",", " ");
    PTBTokenizer<Word> tokenizer =
        PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
    while (tokenizer.hasNext()) {
        Word nextToken = tokenizer.next();
        tokens.add(nextToken.toString());
    }
    if (DEBUG) {
        System.out.println("tokens:" + tokens);
    }
}
// Append a single-space separator, then this document's text.
// NOTE(review): assumes docStr is a StringBuilder/StringBuffer and aDoc is
// non-null (aDoc.toString() would NPE otherwise) — confirm at the caller.
docStr.append(' '); docStr.append(aDoc.toString());
/**
 * Tokenizes a date string into the instance's {@code tokens} list.
 * Hyphens are padded with spaces and commas dropped so that strings like
 * "3-4,2001" split cleanly, then PTBTokenizer performs the word split.
 *
 * @param inputDate the date string to tokenize; if {@code null}, a message is
 *                  printed and {@code tokens} is left empty
 */
private void tokenizeDate(String inputDate) {
    // Diamond operator for consistency with the other copies of this method.
    tokens = new ArrayList<>();
    if (inputDate == null) {
        System.out.println("Null input date");
        // BUG FIX: the original fell through and threw NullPointerException
        // in Pattern.matcher(inputDate); bail out with an empty token list.
        return;
    }
    // "[-]" is a single-character class matching '-'; surround each hyphen
    // with spaces. (Equivalent to the original one-shot Pattern/Matcher.)
    String str = inputDate.replaceAll("[-]", " - ");
    str = str.replaceAll(",", " ");
    PTBTokenizer<Word> tokenizer =
        PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
    while (tokenizer.hasNext()) {
        Word nextToken = tokenizer.next();
        tokens.add(nextToken.toString());
    }
    if (DEBUG) {
        System.out.println("tokens:" + tokens);
    }
}
/**
 * Tokenizes a date string into the instance's {@code tokens} list.
 * Hyphens are padded with spaces and commas dropped so that strings like
 * "3-4,2001" split cleanly, then PTBTokenizer performs the word split.
 *
 * @param inputDate the date string to tokenize; if {@code null}, a message is
 *                  printed and {@code tokens} is left empty
 */
private void tokenizeDate(String inputDate) {
    // Diamond operator for consistency with the other copies of this method.
    tokens = new ArrayList<>();
    if (inputDate == null) {
        System.out.println("Null input date");
        // BUG FIX: the original fell through and threw NullPointerException
        // in Pattern.matcher(inputDate); bail out with an empty token list.
        return;
    }
    // "[-]" is a single-character class matching '-'; surround each hyphen
    // with spaces. (Equivalent to the original one-shot Pattern/Matcher.)
    String str = inputDate.replaceAll("[-]", " - ");
    str = str.replaceAll(",", " ");
    PTBTokenizer<Word> tokenizer =
        PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
    while (tokenizer.hasNext()) {
        Word nextToken = tokenizer.next();
        tokens.add(nextToken.toString());
    }
    if (DEBUG) {
        System.out.println("tokens:" + tokens);
    }
}
/**
 * Tokenizes a date string into the instance's {@code tokens} list.
 * Hyphens are padded with spaces and commas dropped so that strings like
 * "3-4,2001" split cleanly, then PTBTokenizer performs the word split.
 *
 * @param inputDate the date string to tokenize; if {@code null}, a message is
 *                  printed and {@code tokens} is left empty
 */
private void tokenizeDate(String inputDate) {
    tokens = new ArrayList<>();
    if (inputDate == null) {
        System.out.println("Null input date");
        // BUG FIX: the original fell through and threw NullPointerException
        // in Pattern.matcher(inputDate); bail out with an empty token list.
        return;
    }
    // "[-]" is a single-character class matching '-'; surround each hyphen
    // with spaces. (Equivalent to the original one-shot Pattern/Matcher.)
    String str = inputDate.replaceAll("[-]", " - ");
    str = str.replaceAll(",", " ");
    PTBTokenizer<Word> tokenizer =
        PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
    while (tokenizer.hasNext()) {
        Word nextToken = tokenizer.next();
        tokens.add(nextToken.toString());
    }
    if (DEBUG) {
        System.out.println("tokens:" + tokens);
    }
}
// Append a single-space separator, then this document's text.
// NOTE(review): assumes docStr is a StringBuilder/StringBuffer and aDoc is
// non-null (aDoc.toString() would NPE otherwise) — confirm at the caller.
docStr.append(' '); docStr.append(aDoc.toString());