// NOTE(review): incomplete fragment — statements from the middle of a larger
// method (likely a command-line driver); the enclosing braces and surrounding
// statements are outside this view, so the code is left byte-identical.
return; cp = new ChineseDocumentToSentenceProcessor(); if (props.containsKey("encoding")) {
  // warns because an "encoding" property is accepted but the processor's
  // encoding is fixed — presumably intentional; confirm with full method
  log.info("WARNING: for now the default encoding is "+cp.encoding+". It's not changeable for now");
  List<String> sents = ChineseDocumentToSentenceProcessor.fromPlainText(buff.toString(), true);
  // NOTE(review): the next run of tokens is a truncated statement — its
  // beginning (probably a log/print call) lies outside this view.
  numAdded + " sentences."); } else { List<String> sent = cp.fromHTML(input);
  // writer targets stderr using the processor's fixed encoding, autoflush on
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, cp.encoding), true);
private void addOneDict(String item) { int length = item.length(); if (length == 0) { // Do nothing for empty items } else if (length <= MAX_LEXICON_LENGTH-1) { if (cdtos_ != null) { item = cdtos_.normalization(item); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+item, "UTF-8"); words_[length].add(item); } else { // insist on new String as it may save memory String subItem = new String(item.substring(0,MAX_LEXICON_LENGTH)); if (cdtos_ != null) { subItem = cdtos_.normalization(subItem); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+subItem, "UTF-8"); // length=MAX_LEXICON_LENGTH and MAX_LEXICON_LENGTH+ words_[MAX_LEXICON_LENGTH].add(subItem); } }
/** This should now become disused, and other people should call * ChineseUtils directly! CDM June 2006. */ public String normalization(String in) { //log.info("BEFOR NORM: "+in); String norm = ChineseUtils.normalize(in); String out = normalize(norm); //log.info("AFTER NORM: "+out); return out; }
/**
 * Splits a plain-text Chinese document into sentences.
 *
 * @param contentString Chinese document text
 * @return a List of sentence strings
 * @throws IOException if sentence processing fails
 */
public static List<String> fromPlainText(String contentString) throws IOException {
  // presumably the flag marks pre-segmented input — confirm against the
  // two-argument overload; this convenience form always passes false
  final boolean segmented = false;
  return fromPlainText(contentString, segmented);
}
// NOTE(review): incomplete fragment — the left-hand side of this assignment
// (the variable's declaration) lies outside this view; code left byte-identical.
= new ChineseDocumentToSentenceProcessor(null);
// presumably controls middle-dot expansion in dictionary handling — confirm
boolean expandMidDot = true;
// NOTE(review): incomplete fragment — the enclosing loop/method and the
// closing braces of the two nested ifs are outside this view.
sentenceEnd = false; sentenceString = removeWhitespace(sentenceString, segmented);
if (sentenceString.length() > 0) {
  // NOTE(review): the whitespace-removal and non-empty check are repeated
  // verbatim below — the second pass looks redundant; confirm intent before
  // removing it.
  sentenceString = removeWhitespace(sentenceString, segmented);
  if (sentenceString.length() > 0) {
/** * Strip off HTML tags before processing. * Only the simplest tag stripping is implemented. * * @param inputString Chinese document text which contains HTML tags * @return a List of sentence strings */ public static List<String> fromHTML(String inputString) throws IOException { //HTMLParser parser = new HTMLParser(); //return fromPlainText(parser.parse(inputString)); List<String> ans = new ArrayList<>(); MyHTMLParser parser = new MyHTMLParser(); List<String> sents = parser.parse(inputString); for (String s : sents) { ans.addAll(fromPlainText(s)); } return ans; }
@Override public void init(SeqClassifierFlags flags) { this.flags = flags; factory = LineIterator.getFactory(new CTBDocumentParser()); if (DEBUG) EncodingPrintWriter.err.println("Sighan2005DocRandW: using normalization file " + flags.normalizationTable, "UTF-8"); // pichuan : flags.normalizationTable is null --> i believe this is replaced by some java class?? // (Thu Apr 24 11:10:42 2008) cdtos = new ChineseDocumentToSentenceProcessor(flags.normalizationTable); if (flags.dictionary != null) { String[] dicts = flags.dictionary.split(","); cdict = new ChineseDictionary(dicts, cdtos, flags.expandMidDot); } if (flags.serializedDictionary != null) { String dict = flags.serializedDictionary; cdict = new ChineseDictionary(dict, cdtos, flags.expandMidDot); } if (flags.dictionary2 != null) { String[] dicts2 = flags.dictionary2.split(","); cdict2 = new ChineseDictionary(dicts2, cdtos, flags.expandMidDot); } }
// NOTE(review): incomplete fragment — the enclosing loop/method and the
// closing braces of the two nested ifs are outside this view.
sentenceEnd = false; sentenceString = removeWhitespace(sentenceString, segmented);
if (sentenceString.length() > 0) {
  // NOTE(review): the whitespace-removal and non-empty check are repeated
  // verbatim below — looks redundant; confirm intent before removing.
  sentenceString = removeWhitespace(sentenceString, segmented);
  if (sentenceString.length() > 0) {
// NOTE(review): incomplete fragment — statements from the middle of a larger
// method; enclosing braces are outside this view, so code is left byte-identical.
// NOTE(review): this chunk logs via System.err.println while a parallel chunk
// uses log.info for the identical message — inconsistent logging; confirm
// which convention this file should follow.
return; cp = new ChineseDocumentToSentenceProcessor(); if (props.containsKey("encoding")) {
  System.err.println("WARNING: for now the default encoding is "+cp.encoding+". It's not changeable for now");
  List<String> sents = ChineseDocumentToSentenceProcessor.fromPlainText(buff.toString(), true);
  // NOTE(review): truncated statement — its beginning lies outside this view
  numAdded + " sentences."); } else { List<String> sent = cp.fromHTML(input);
  // writer targets stderr with the processor's fixed encoding, autoflush on
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, cp.encoding), true);
/**
 * Splits a plain-text Chinese document into sentences.
 *
 * @param contentString Chinese document text
 * @return a List of sentence strings
 * @throws IOException if sentence processing fails
 */
public static List<String> fromPlainText(String contentString) throws IOException {
  // delegates to the two-argument overload with the flag off; presumably the
  // flag marks pre-segmented input — confirm against that overload
  return fromPlainText(contentString, /* segmented */ false);
}
// NOTE(review): incomplete fragment — part of a larger per-line processing
// loop; the enclosing method is outside this view.
String origLine = line;
if (DEBUG) EncodingPrintWriter.err.println("ORIG: " + line, "UTF-8");
// normalize the line, keeping the un-normalized original in origLine
line = cdtos.normalization(origLine);
if (DEBUG) EncodingPrintWriter.err.println("NORM: " + line, "UTF-8");
// presumably origIndex tracks a cursor into origLine for realignment with the
// normalized text — confirm from the code that follows this fragment
int origIndex = 0;
// NOTE(review): incomplete fragment — the left-hand side of this assignment
// (the variable's declaration) lies outside this view; code left byte-identical.
= new ChineseDocumentToSentenceProcessor(null);
// presumably controls middle-dot expansion in dictionary handling — confirm
boolean expandMidDot = true;
/** This should now become disused, and other people should call * ChineseUtils directly! CDM June 2006. */ public String normalization(String in) { //log.info("BEFOR NORM: "+in); String norm = ChineseUtils.normalize(in); String out = normalize(norm); //log.info("AFTER NORM: "+out); return out; }
// NOTE(review): incomplete fragment — the enclosing loop/method and the
// closing braces of the two nested ifs are outside this view.
sentenceEnd = false; sentenceString = removeWhitespace(sentenceString, segmented);
if (sentenceString.length() > 0) {
  // NOTE(review): the whitespace-removal and non-empty check are repeated
  // verbatim below — looks redundant; confirm intent before removing.
  sentenceString = removeWhitespace(sentenceString, segmented);
  if (sentenceString.length() > 0) {
// NOTE(review): incomplete fragment — statements from the middle of a larger
// method (likely a command-line driver); enclosing braces and surrounding
// statements are outside this view, so the code is left byte-identical.
return; cp = new ChineseDocumentToSentenceProcessor(); if (props.containsKey("encoding")) {
  // warns because an "encoding" property is accepted but the processor's
  // encoding is fixed — presumably intentional; confirm with full method
  log.info("WARNING: for now the default encoding is "+cp.encoding+". It's not changeable for now");
  List<String> sents = ChineseDocumentToSentenceProcessor.fromPlainText(buff.toString(), true);
  // NOTE(review): truncated statement — its beginning lies outside this view
  numAdded + " sentences."); } else { List<String> sent = cp.fromHTML(input);
  // writer targets stderr with the processor's fixed encoding, autoflush on
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.err, cp.encoding), true);
/**
 * Splits a plain-text Chinese document into sentences.
 *
 * @param contentString Chinese document text
 * @return a List of sentence strings
 * @throws IOException if sentence processing fails
 */
public static List<String> fromPlainText(String contentString) throws IOException {
  // presumably the flag marks pre-segmented input — confirm against the
  // two-argument overload; this convenience form always passes false
  final boolean segmented = false;
  return fromPlainText(contentString, segmented);
}
private void addOneDict(String item) { int length = item.length(); if (length == 0) { // Do nothing for empty items } else if (length <= MAX_LEXICON_LENGTH-1) { if (cdtos_ != null) { item = cdtos_.normalization(item); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+item, "UTF-8"); words_[length].add(item); } else { // insist on new String as it may save memory String subItem = new String(item.substring(0,MAX_LEXICON_LENGTH)); if (cdtos_ != null) { subItem = cdtos_.normalization(subItem); } if (DEBUG) EncodingPrintWriter.err.println("DICT: "+subItem, "UTF-8"); // length=MAX_LEXICON_LENGTH and MAX_LEXICON_LENGTH+ words_[MAX_LEXICON_LENGTH].add(subItem); } }
// NOTE(review): incomplete fragment — the left-hand side of this assignment
// (the variable's declaration) lies outside this view; code left byte-identical.
= new ChineseDocumentToSentenceProcessor(null);
// presumably controls middle-dot expansion in dictionary handling — confirm
boolean expandMidDot = true;
/** This should now become disused, and other people should call * ChineseUtils directly! CDM June 2006. */ public String normalization(String in) { //System.err.println("BEFOR NORM: "+in); String norm = ChineseUtils.normalize(in); String out = normalize(norm); //System.err.println("AFTER NORM: "+out); return out; }