private DFSA<String, Integer> getViterbiSearchGraph(List<IN> doc, Class<? extends CoreAnnotation<String>> answerField) { if (doc.isEmpty()) { return new DFSA<>(null); } // TODO get rid of ObjectBankWrapper ObjectBankWrapper<IN> obw = new ObjectBankWrapper<>(flags, null, knownLCWords); doc = obw.processDocument(doc); SequenceModel model = getSequenceModel(doc); return ViterbiSearchGraphBuilder.getGraph(model, classIndex); }
/**
 * Runs the configured preprocessing passes over a document, in place.
 *
 * @param doc The document to preprocess (modified in place)
 * @return The same document, after preprocessing
 */
public List<IN> processDocument(List<IN> doc) {
  // Optional tag-normalization passes, each gated by a flag.
  if (flags.mergeTags) {
    mergeTags(doc);
  }
  if (flags.iobTags) {
    iobTags(doc);
  }
  // Position numbering, word shapes, and string interning for every token.
  doBasicStuff(doc);
  return doc;
}
// NOTE(review): fragment of a larger method (enclosing if/else is cut off at
// both edges here).  Computes an interned word-shape string, then normalizes
// ("fix") and interns the CharAnnotation -- presumably the Chinese-segmentation
// branch -- while the visible else-branch does the same for TextAnnotation.
String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords)); fl.set(CoreAnnotations.ShapeAnnotation.class, s); fl.set(CoreAnnotations.CharAnnotation.class,intern(fix(fl.get(CoreAnnotations.CharAnnotation.class)))); } else { fl.set(CoreAnnotations.TextAnnotation.class, intern(fix(fl.get(CoreAnnotations.TextAnnotation.class))));
public ObjectBank<List<IN>> makeObjectBankFromFiles(Collection<File> files, DocumentReaderAndWriter<IN> readerAndWriter) { if (files.isEmpty()) { throw new RuntimeException("Attempt to make ObjectBank with empty file list"); } // return new ObjectBank<List<IN>>(new // ResettableReaderIteratorFactory(files, flags.inputEncoding), // readerAndWriter); // TODO get rid of ObjectBankWrapper return new ObjectBankWrapper<>(flags, new ObjectBank<>(new ResettableReaderIteratorFactory(files, flags.inputEncoding), readerAndWriter), knownLCWords); }
public ObjectBank<List<IN>> makeObjectBankFromFiles(String baseDir, String filePattern, DocumentReaderAndWriter<IN> readerAndWriter) { File path = new File(baseDir); FileFilter filter = new RegExFileFilter(Pattern.compile(filePattern)); File[] origFiles = path.listFiles(filter); Collection<File> files = new ArrayList<>(); for (File file : origFiles) { if (file.isFile()) { if (flags.announceObjectBankEntries) { log.info("Getting data from " + file + " (" + flags.inputEncoding + " encoding)"); } files.add(file); } } if (files.isEmpty()) { throw new RuntimeException("No matching files: " + baseDir + '\t' + filePattern); } // return new ObjectBank<List<IN>>(new // ResettableReaderIteratorFactory(files, flags.inputEncoding), // readerAndWriter); // TODO get rid of ObjectBankWrapper return new ObjectBankWrapper<>(flags, new ObjectBank<>(new ResettableReaderIteratorFactory(files, flags.inputEncoding), readerAndWriter), knownLCWords); }
private List<IN> preprocessTokens(List<? extends HasWord> tokenSequence) { // log.info("knownLCWords.size is " + knownLCWords.size() + "; knownLCWords.maxSize is " + knownLCWords.getMaxSize() + // ", prior to NER for " + getClass().toString()); List<IN> document = new ArrayList<>(); int i = 0; for (HasWord word : tokenSequence) { IN wi; // initialized below if (word instanceof CoreMap) { // copy all annotations! some are required later in // AbstractSequenceClassifier.classifyWithInlineXML // wi = (IN) new ArrayCoreMap((ArrayCoreMap) word); wi = tokenFactory.makeToken((IN) word); } else { wi = tokenFactory.makeToken(); wi.set(CoreAnnotations.TextAnnotation.class, word.word()); // wi.setWord(word.word()); } wi.set(CoreAnnotations.PositionAnnotation.class, Integer.toString(i)); wi.set(CoreAnnotations.AnswerAnnotation.class, backgroundSymbol()); document.add(wi); i++; } // TODO get rid of ObjectBankWrapper ObjectBankWrapper<IN> wrapper = new ObjectBankWrapper<>(flags, null, knownLCWords); wrapper.processDocument(document); // log.info("Size of knownLCWords is " + knownLCWords.size() + ", after NER for " + getClass().toString()); return document; }
/**
 * Applies the flag-selected normalization passes followed by the basic
 * per-token preprocessing.  The document is modified in place.
 *
 * @param doc The document to preprocess
 * @return The preprocessed document (same object as the argument)
 */
public List<IN> processDocument(List<IN> doc) {
  if (flags.mergeTags) {
    mergeTags(doc);
  }
  if (flags.iobTags) {
    iobTags(doc);
  }
  doBasicStuff(doc);
  return doc;
}
public ObjectBank<List<IN>> makeObjectBankFromFiles(String[] trainFileList, DocumentReaderAndWriter<IN> readerAndWriter) { // try{ Collection<File> files = new ArrayList<>(); for (String trainFile : trainFileList) { File f = new File(trainFile); files.add(f); } // System.err.printf("trainFileList contains %d file%s in encoding %s.%n", files.size(), files.size() == 1 ? "": "s", flags.inputEncoding); // TODO get rid of ObjectBankWrapper // return new ObjectBank<List<IN>>(new // ResettableReaderIteratorFactory(files), readerAndWriter); return new ObjectBankWrapper<>(flags, new ObjectBank<>(new ResettableReaderIteratorFactory(files, flags.inputEncoding), readerAndWriter), knownLCWords); // } catch (IOException e) { // throw new RuntimeException(e); // } }
private void doBasicStuff(List<IN> doc) { int position = 0; for (IN fl : doc) { // position in document fl.set(PositionAnnotation.class, Integer.toString((position++))); // word shape if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) && (!flags.useShapeStrings)) { String s = intern(WordShapeClassifier.wordShape(fl.get(TextAnnotation.class), flags.wordShape, knownLCWords)); fl.set(ShapeAnnotation.class, s); } // normalizing and interning // was the following; should presumably now be // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) { if ("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter".equalsIgnoreCase(flags.readerAndWriter)) { // for Chinese segmentation, "word" is no use and ignore goldAnswer for memory efficiency. fl.set(CharAnnotation.class,intern(fix(fl.get(CharAnnotation.class)))); } else { fl.set(TextAnnotation.class, intern(fix(fl.get(TextAnnotation.class)))); fl.set(GoldAnswerAnnotation.class, fl.get(AnswerAnnotation.class)); } } }
// NOTE(review): fragment of a larger method.  Wraps the document through an
// ObjectBankWrapper (with no underlying ObjectBank) purely for its
// processDocument() preprocessing side effects.  TODO get rid of ObjectBankWrapper.
ObjectBankWrapper<IN> obw = new ObjectBankWrapper<>(flags, null, knownLCWords); doc = obw.processDocument(doc);
/**
 * Preprocesses a document in place: optional tag merging and IOB
 * normalization, then the basic per-token pass.
 *
 * @param doc The document to process
 * @return doc, after in-place preprocessing
 */
public List<IN> processDocument(List<IN> doc) {
  if (flags.mergeTags) {
    mergeTags(doc);
  }
  if (flags.iobTags) {
    iobTags(doc);
  }
  doBasicStuff(doc);
  return doc;
}
/** * Set up an ObjectBank that will allow one to iterate over a collection of * documents obtained from the passed in Reader. Each document will be * represented as a list of IN. If the ObjectBank iterator() is called until * hasNext() returns false, then the Reader will be read till end of file, but * no reading is done at the time of this call. Reading is done using the * reading method specified in {@code flags.documentReader}, and for some * reader choices, the column mapping given in {@code flags.map}. * * @param in * Input data addNEWLCWords do we add new lowercase words from this * data to the word shape classifier * @return The list of documents */ public ObjectBank<List<IN>> makeObjectBankFromReader(BufferedReader in, DocumentReaderAndWriter<IN> readerAndWriter) { if (flags.announceObjectBankEntries) { log.info("Reading data using " + readerAndWriter.getClass()); } // TODO get rid of ObjectBankWrapper // return new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(in), // readerAndWriter); return new ObjectBankWrapper<>(flags, new ObjectBank<>(new ResettableReaderIteratorFactory(in), readerAndWriter), knownLCWords); }
// NOTE(review): fragment of a larger method (enclosing if/else is cut off at
// both edges here).  Computes an interned word-shape string, then normalizes
// ("fix") and interns the CharAnnotation -- presumably the Chinese-segmentation
// branch -- while the visible else-branch does the same for TextAnnotation.
String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords)); fl.set(CoreAnnotations.ShapeAnnotation.class, s); fl.set(CoreAnnotations.CharAnnotation.class,intern(fix(fl.get(CoreAnnotations.CharAnnotation.class)))); } else { fl.set(CoreAnnotations.TextAnnotation.class, intern(fix(fl.get(CoreAnnotations.TextAnnotation.class))));
private DFSA<String, Integer> getViterbiSearchGraph(List<IN> doc, Class<? extends CoreAnnotation<String>> answerField) { if (doc.isEmpty()) { return new DFSA<>(null); } // TODO get rid of ObjectBankWrapper ObjectBankWrapper<IN> obw = new ObjectBankWrapper<>(flags, null, knownLCWords); doc = obw.processDocument(doc); SequenceModel model = getSequenceModel(doc); return ViterbiSearchGraphBuilder.getGraph(model, classIndex); }
/**
 * Runs document preprocessing: tag merging and IOB conversion when the
 * corresponding flags are set, followed by the basic per-token pass.
 *
 * @param doc The document, modified in place
 * @return The same document
 */
public List<IN> processDocument(List<IN> doc) {
  if (flags.mergeTags) {
    mergeTags(doc);
  }
  if (flags.iobTags) {
    iobTags(doc);
  }
  doBasicStuff(doc);
  return doc;
}
/** * Reads a String into an ObjectBank object. NOTE: that the current * implementation of ReaderIteratorFactory will first try to interpret each * string as a filename, so this method will yield unwanted results if it * applies to a string that is at the same time a filename. It prints out a * warning, at least. * * @param string The String which will be the content of the ObjectBank * @return The ObjectBank */ public ObjectBank<List<IN>> makeObjectBankFromString(String string, DocumentReaderAndWriter<IN> readerAndWriter) { if (flags.announceObjectBankEntries) { log.info("Reading data using " + readerAndWriter.getClass()); if (flags.inputEncoding == null) { log.info("Getting data from " + string + " (default encoding)"); } else { log.info("Getting data from " + string + " (" + flags.inputEncoding + " encoding)"); } } // return new ObjectBank<List<IN>>(new // ResettableReaderIteratorFactory(string), readerAndWriter); // TODO return new ObjectBankWrapper<>(flags, new ObjectBank<>(new ResettableReaderIteratorFactory(string), readerAndWriter), knownLCWords); }
// NOTE(review): fragment of a larger method (enclosing if/else is cut off at
// both edges).  Computes an interned word-shape string, then normalizes and
// interns the CharAnnotation -- presumably the Chinese-segmentation branch --
// while the else-branch normalizes TextAnnotation and copies the current
// AnswerAnnotation into GoldAnswerAnnotation.
String s = intern(WordShapeClassifier.wordShape(word, flags.wordShape, knownLCWords)); fl.set(CoreAnnotations.ShapeAnnotation.class, s); fl.set(CoreAnnotations.CharAnnotation.class,intern(fix(fl.get(CoreAnnotations.CharAnnotation.class)))); } else { fl.set(CoreAnnotations.TextAnnotation.class, intern(fix(fl.get(CoreAnnotations.TextAnnotation.class)))); fl.set(CoreAnnotations.GoldAnswerAnnotation.class, fl.get(CoreAnnotations.AnswerAnnotation.class));
public DFSA<String, Integer> getViterbiSearchGraph(List<IN> doc, Class<? extends CoreAnnotation<String>> answerField) { if (doc.isEmpty()) { return new DFSA<String, Integer>(null); } // TODO get rid of objectbankwrapper ObjectBankWrapper<IN> obw = new ObjectBankWrapper<IN>(flags, null, knownLCWords); doc = obw.processDocument(doc); SequenceModel model = getSequenceModel(doc); return ViterbiSearchGraphBuilder.getGraph(model, classIndex); }
public ObjectBank<List<IN>> makeObjectBankFromFiles(Collection<File> files, DocumentReaderAndWriter readerAndWriter) { if (files.isEmpty()) { throw new RuntimeException("Attempt to make ObjectBank with empty file list"); } // return new ObjectBank<List<IN>>(new // ResettableReaderIteratorFactory(files, flags.inputEncoding), // readerAndWriter); // TODO get rid of objectbankwrapper return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(files, flags.inputEncoding), readerAndWriter), knownLCWords); }
public DFSA<String, Integer> getViterbiSearchGraph(List<IN> doc, Class<? extends CoreAnnotation<String>> answerField) { if (doc.isEmpty()) { return new DFSA<String, Integer>(null); } // TODO get rid of objectbankwrapper ObjectBankWrapper<IN> obw = new ObjectBankWrapper<IN>(flags, null, knownLCWords); doc = obw.processDocument(doc); SequenceModel model = getSequenceModel(doc); return ViterbiSearchGraphBuilder.getGraph(model, classIndex); }