// NOTE(review): incomplete fragment — statements from the middle of a larger
// method (likely a command-line driver); the enclosing braces and surrounding
// statements are outside this view, so the code is left byte-identical.
return; cp = new ChineseDocumentToSentenceProcessor(); if (props.containsKey("encoding")) {
  // warns because an "encoding" property is accepted but the processor's
  // encoding is fixed — presumably intentional; confirm with full method
  log.info("WARNING: for now the default encoding is "+cp.encoding+". It's not changeable for now");
  List<String> sents = ChineseDocumentToSentenceProcessor.fromPlainText(buff.toString(), true);
  // NOTE(review): the next run of tokens is a truncated statement — its
  // beginning (probably a log/print call) lies outside this view.
  numAdded + " sentences."); } else { List<String> sent = cp.fromHTML(input);
  // writer targets stderr using the processor's fixed encoding, autoflush on
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, cp.encoding), true);
private void addOneDict(String item) { int length = item.length(); if (length == 0) { // Do nothing for empty items } else if (length <= MAX_LEXICON_LENGTH-1) { if (cdtos_ != null) { item = cdtos_.normalization(item); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+item, "UTF-8"); words_[length].add(item); } else { // insist on new String as it may save memory String subItem = new String(item.substring(0,MAX_LEXICON_LENGTH)); if (cdtos_ != null) { subItem = cdtos_.normalization(subItem); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+subItem, "UTF-8"); // length=MAX_LEXICON_LENGTH and MAX_LEXICON_LENGTH+ words_[MAX_LEXICON_LENGTH].add(subItem); } }
/** This should now become disused, and other people should call * ChineseUtils directly! CDM June 2006. */ public String normalization(String in) { //log.info("BEFOR NORM: "+in); String norm = ChineseUtils.normalize(in); String out = normalize(norm); //log.info("AFTER NORM: "+out); return out; }
/**
 * Splits a plain-text Chinese document into sentences.
 *
 * @param contentString Chinese document text
 * @return a List of sentence strings
 * @throws IOException if sentence processing fails
 */
public static List<String> fromPlainText(String contentString) throws IOException {
  // presumably the flag marks pre-segmented input — confirm against the
  // two-argument overload; this convenience form always passes false
  final boolean segmented = false;
  return fromPlainText(contentString, segmented);
}
// NOTE(review): incomplete fragment — the left-hand side of this assignment
// (the variable's declaration) lies outside this view; code left byte-identical.
= new ChineseDocumentToSentenceProcessor(null);
// presumably controls middle-dot expansion in dictionary handling — confirm
boolean expandMidDot = true;
// NOTE(review): incomplete fragment — the enclosing loop/method and the
// closing braces of the two nested ifs are outside this view.
sentenceEnd = false; sentenceString = removeWhitespace(sentenceString, segmented);
if (sentenceString.length() > 0) {
  // NOTE(review): the whitespace-removal and non-empty check are repeated
  // verbatim below — the second pass looks redundant; confirm intent before
  // removing it.
  sentenceString = removeWhitespace(sentenceString, segmented);
  if (sentenceString.length() > 0) {
/** * Strip off HTML tags before processing. * Only the simplest tag stripping is implemented. * * @param inputString Chinese document text which contains HTML tags * @return a List of sentence strings */ public static List<String> fromHTML(String inputString) throws IOException { //HTMLParser parser = new HTMLParser(); //return fromPlainText(parser.parse(inputString)); List<String> ans = new ArrayList<>(); MyHTMLParser parser = new MyHTMLParser(); List<String> sents = parser.parse(inputString); for (String s : sents) { ans.addAll(fromPlainText(s)); } return ans; }
@Override public void init(SeqClassifierFlags flags) { this.flags = flags; factory = LineIterator.getFactory(new CTBDocumentParser()); if (DEBUG) EncodingPrintWriter.err.println("Sighan2005DocRandW: using normalization file " + flags.normalizationTable, "UTF-8"); // pichuan : flags.normalizationTable is null --> i believe this is replaced by some java class?? // (Thu Apr 24 11:10:42 2008) cdtos = new ChineseDocumentToSentenceProcessor(flags.normalizationTable); if (flags.dictionary != null) { String[] dicts = flags.dictionary.split(","); cdict = new ChineseDictionary(dicts, cdtos, flags.expandMidDot); } if (flags.serializedDictionary != null) { String dict = flags.serializedDictionary; cdict = new ChineseDictionary(dict, cdtos, flags.expandMidDot); } if (flags.dictionary2 != null) { String[] dicts2 = flags.dictionary2.split(","); cdict2 = new ChineseDictionary(dicts2, cdtos, flags.expandMidDot); } }
// NOTE(review): incomplete fragment — the enclosing loop/method and the
// closing braces of the two nested ifs are outside this view.
sentenceEnd = false; sentenceString = removeWhitespace(sentenceString, segmented);
if (sentenceString.length() > 0) {
  // NOTE(review): the whitespace-removal and non-empty check are repeated
  // verbatim below — looks redundant; confirm intent before removing.
  sentenceString = removeWhitespace(sentenceString, segmented);
  if (sentenceString.length() > 0) {
// NOTE(review): incomplete fragment — statements from the middle of a larger
// method; enclosing braces are outside this view, so code is left byte-identical.
// NOTE(review): this chunk logs via System.err.println while a parallel chunk
// uses log.info for the identical message — inconsistent logging; confirm
// which convention this file should follow.
return; cp = new ChineseDocumentToSentenceProcessor(); if (props.containsKey("encoding")) {
  System.err.println("WARNING: for now the default encoding is "+cp.encoding+". It's not changeable for now");
  List<String> sents = ChineseDocumentToSentenceProcessor.fromPlainText(buff.toString(), true);
  // NOTE(review): truncated statement — its beginning lies outside this view
  numAdded + " sentences."); } else { List<String> sent = cp.fromHTML(input);
  // writer targets stderr with the processor's fixed encoding, autoflush on
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, cp.encoding), true);
/**
 * Splits a plain-text Chinese document into sentences.
 *
 * @param contentString Chinese document text
 * @return a List of sentence strings
 * @throws IOException if sentence processing fails
 */
public static List<String> fromPlainText(String contentString) throws IOException {
  // delegates to the two-argument overload with the flag off; presumably the
  // flag marks pre-segmented input — confirm against that overload
  return fromPlainText(contentString, /* segmented */ false);
}
// NOTE(review): incomplete fragment — part of a larger per-line processing
// loop; the enclosing method is outside this view.
String origLine = line;
if (DEBUG) EncodingPrintWriter.err.println("ORIG: " + line, "UTF-8");
// normalize the line, keeping the un-normalized original in origLine
line = cdtos.normalization(origLine);
if (DEBUG) EncodingPrintWriter.err.println("NORM: " + line, "UTF-8");
// presumably origIndex tracks a cursor into origLine for realignment with the
// normalized text — confirm from the code that follows this fragment
int origIndex = 0;
// NOTE(review): incomplete fragment — the left-hand side of this assignment
// (the variable's declaration) lies outside this view; code left byte-identical.
= new ChineseDocumentToSentenceProcessor(null);
// presumably controls middle-dot expansion in dictionary handling — confirm
boolean expandMidDot = true;
/** This should now become disused, and other people should call * ChineseUtils directly! CDM June 2006. */ public String normalization(String in) { //log.info("BEFOR NORM: "+in); String norm = ChineseUtils.normalize(in); String out = normalize(norm); //log.info("AFTER NORM: "+out); return out; }
// NOTE(review): incomplete fragment — the enclosing loop/method and the
// closing braces of the two nested ifs are outside this view.
sentenceEnd = false; sentenceString = removeWhitespace(sentenceString, segmented);
if (sentenceString.length() > 0) {
  // NOTE(review): the whitespace-removal and non-empty check are repeated
  // verbatim below — looks redundant; confirm intent before removing.
  sentenceString = removeWhitespace(sentenceString, segmented);
  if (sentenceString.length() > 0) {
// NOTE(review): incomplete fragment — statements from the middle of a larger
// method (likely a command-line driver); enclosing braces and surrounding
// statements are outside this view, so the code is left byte-identical.
return; cp = new ChineseDocumentToSentenceProcessor(); if (props.containsKey("encoding")) {
  // warns because an "encoding" property is accepted but the processor's
  // encoding is fixed — presumably intentional; confirm with full method
  log.info("WARNING: for now the default encoding is "+cp.encoding+". It's not changeable for now");
  List<String> sents = ChineseDocumentToSentenceProcessor.fromPlainText(buff.toString(), true);
  // NOTE(review): truncated statement — its beginning lies outside this view
  numAdded + " sentences."); } else { List<String> sent = cp.fromHTML(input);
  // writer targets stderr with the processor's fixed encoding, autoflush on
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, cp.encoding), true);
/**
 * Splits a plain-text Chinese document into sentences.
 *
 * @param contentString Chinese document text
 * @return a List of sentence strings
 * @throws IOException if sentence processing fails
 */
public static List<String> fromPlainText(String contentString) throws IOException {
  // presumably the flag marks pre-segmented input — confirm against the
  // two-argument overload; this convenience form always passes false
  final boolean segmented = false;
  return fromPlainText(contentString, segmented);
}
private void addOneDict(String item) { int length = item.length(); if (length == 0) { // Do nothing for empty items } else if (length <= MAX_LEXICON_LENGTH-1) { if (cdtos_ != null) { item = cdtos_.normalization(item); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+item, "UTF-8"); words_[length].add(item); } else { // insist on new String as it may save memory String subItem = new String(item.substring(0,MAX_LEXICON_LENGTH)); if (cdtos_ != null) { subItem = cdtos_.normalization(subItem); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+subItem, "UTF-8"); // length=MAX_LEXICON_LENGTH and MAX_LEXICON_LENGTH+ words_[MAX_LEXICON_LENGTH].add(subItem); } }
// NOTE(review): incomplete fragment — the left-hand side of this assignment
// (the variable's declaration) lies outside this view; code left byte-identical.
= new ChineseDocumentToSentenceProcessor(null);
// presumably controls middle-dot expansion in dictionary handling — confirm
boolean expandMidDot = true;
/** This should now become disused, and other people should call * ChineseUtils directly! CDM June 2006. */ public String normalization(String in) { //System.err.println("BEFOR NORM: "+in); String norm = ChineseUtils.normalize(in); String out = normalize(norm); //System.err.println("AFTER NORM: "+out); return out; }