/**
 * Initializes a name stream that opens a {@link PlainTextByLineStream} over
 * the input supplied by an {@link InputStreamFactory}, decoded with the given
 * {@link Charset}.
 * <p>
 * The locale of this stream is always set to English.
 *
 * @param in an {@link InputStreamFactory} for the input file.
 * @param encoding the {@link Charset} to apply to the input stream.
 * @throws IOException if creating the underlying {@link PlainTextByLineStream} fails.
 */
public NameFinderCensus90NameStream(InputStreamFactory in, Charset encoding)
    throws IOException {
  // Locale.ENGLISH equals new Locale("en") but avoids the deprecated constructor.
  this.locale = Locale.ENGLISH;
  this.encoding = encoding;
  this.lineStream = new PlainTextByLineStream(in, this.encoding);
}
/**
 * Creates a line stream backed by the given factory and character set and
 * immediately calls {@code reset()}.
 *
 * @param inputStreamFactory the factory to read from; must not be {@code null}.
 * @param charset the {@link Charset} stored as the encoding of this stream.
 * @throws NullPointerException if {@code inputStreamFactory} is {@code null}.
 * @throws IOException if the initial {@code reset()} fails.
 */
public PlainTextByLineStream(InputStreamFactory inputStreamFactory, Charset charset)
    throws IOException {
  // Validate first, then store both collaborators before the stream is opened.
  Objects.requireNonNull(inputStreamFactory, "inputStreamFactory must not be null!");
  this.inputStreamFactory = inputStreamFactory;
  this.encoding = charset;
  reset();
}
public EvalitaNameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException { this.lang = lang; try { this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8); System.setOut(new PrintStream(System.out, true, "UTF-8")); } catch (UnsupportedEncodingException e) { // UTF-8 is available on all JVMs, will never happen throw new IllegalStateException(e); } this.types = types; }
public BioNLP2004NameSampleStream(InputStreamFactory in, int types) throws IOException { try { this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8); System.setOut(new PrintStream(System.out, true, "UTF-8")); } catch (UnsupportedEncodingException e) { // UTF-8 is available on all JVMs, will never happen throw new IllegalStateException(e); } this.types = types; }
/**
 * Creates a stream whose sentence source is a {@link ParagraphStream} wrapped
 * around the UTF-8 decoded lines of the given input.
 *
 * @param in the {@link InputStreamFactory} supplying the input.
 * @throws IOException if the line stream cannot be created.
 */
public ConlluStream(InputStreamFactory in) throws IOException {
  PlainTextByLineStream lines = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
  this.sentenceStream = new ParagraphStream(lines);
}
/**
 * Creates a chunk sample stream that reads the corpus through an
 * {@link ADSentenceStream}.
 *
 * @param in the {@link InputStreamFactory} supplying the corpus.
 * @param charsetName the name of the charset used to decode the corpus.
 * @throws IOException if the underlying streams cannot be created.
 * @throws IllegalStateException if an {@link UnsupportedEncodingException}
 *     is raised while the corpus is opened.
 */
public ADChunkSampleStream(InputStreamFactory in, String charsetName) throws IOException {
  try {
    this.adSentenceStream =
        new ADSentenceStream(new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // The charset name is chosen by the caller (it is not hard-wired to UTF-8),
    // so keep the exception type but report which name was in use.
    throw new IllegalStateException("Unsupported encoding: " + charsetName, e);
  }
}
public Conll02NameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException { this.lang = lang; try { this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8); System.setOut(new PrintStream(System.out, true, "UTF-8")); } catch (UnsupportedEncodingException e) { // UTF-8 is available on all JVMs, will never happen throw new IllegalStateException(e); } this.types = types; }
public Conll03NameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException { this.lang = lang; try { this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8); System.setOut(new PrintStream(System.out, true, "UTF-8")); } catch (UnsupportedEncodingException e) { // UTF-8 is available on all JVMs, will never happen throw new IllegalStateException(e); } this.types = types; }
/**
 * Creates a sample stream whose parent is initialized with a
 * {@link ParagraphStream} wrapped around the lines of the given input,
 * decoded with the given charset.
 *
 * @param in the {@link InputStreamFactory} supplying the input.
 * @param charset the {@link Charset} used to decode the input.
 * @throws IOException if the line stream cannot be created.
 */
public ConllXPOSSampleStream(InputStreamFactory in, Charset charset) throws IOException { super(new ParagraphStream(new PlainTextByLineStream(in, charset))); }
/**
 * Creates a sample stream whose parent reads the given input as UTF-8 lines.
 *
 * @param sentencePerDocument stored as-is; presumably the number of sentences
 *     combined into one document - TODO confirm against the reading logic.
 * @param tokenizer the {@link Tokenizer} stored in this stream.
 * @param in the {@link InputStreamFactory} supplying the input.
 * @throws IOException if the line stream cannot be created.
 */
private LeipzigTestSampleStream(int sentencePerDocument, Tokenizer tokenizer,
    InputStreamFactory in) throws IOException {
  super(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
  this.tokenizer = tokenizer;
  this.sentencePerDocument = sentencePerDocument;
}
/**
 * Creates a new {@link NameSample} stream that reads the corpus supplied by
 * an {@link InputStreamFactory}.
 *
 * @param in
 *          the factory supplying the corpus input stream
 * @param charsetName
 *          the charset of the Arvores Deitadas Corpus
 * @param splitHyphenatedTokens
 *          if true hyphenated tokens will be separated: "carros-monstro" &gt;
 *          "carros" "-" "monstro"
 * @throws IOException
 *           if the underlying streams cannot be created
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised while the
 *           corpus is opened
 */
@Deprecated
public ADNameSampleStream(InputStreamFactory in, String charsetName,
    boolean splitHyphenatedTokens) throws IOException {
  this.splitHyphenatedTokens = splitHyphenatedTokens;
  try {
    this.adSentenceStream =
        new ADSentenceStream(new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // The charset name is chosen by the caller (it is not hard-wired to UTF-8),
    // so keep the exception type but report which name was in use.
    throw new IllegalStateException("Unsupported encoding: " + charsetName, e);
  }
}
/**
 * Creates a new {@link POSSample} stream that reads the corpus supplied by
 * an {@link InputStreamFactory}.
 *
 * @param in
 *          the factory supplying the corpus input stream
 * @param charsetName
 *          the charset of the Arvores Deitadas Corpus
 * @param expandME
 *          if true will expand the multiword expressions, each word of the
 *          expression will have the POS Tag that was attributed to the
 *          expression plus the prefix B- or I- (CONLL convention)
 * @param includeFeatures
 *          if true will combine the POS Tag with the feature tags
 * @throws IOException
 *           if the underlying streams cannot be created
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised while the
 *           corpus is opened
 */
public ADPOSSampleStream(InputStreamFactory in, String charsetName,
    boolean expandME, boolean includeFeatures) throws IOException {
  this.expandME = expandME;
  this.isIncludeFeatures = includeFeatures;
  try {
    this.adSentenceStream =
        new ADSentenceStream(new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // The charset name is chosen by the caller (it is not hard-wired to UTF-8),
    // so keep the exception type but report which name was in use.
    throw new IllegalStateException("Unsupported encoding: " + charsetName, e);
  }
}
/**
 * Creates a new {@link SentenceSample} stream that reads the corpus supplied
 * by an {@link InputStreamFactory}.
 *
 * @param in
 *          the factory supplying the corpus input stream
 * @param charsetName
 *          the charset to use while reading the corpus
 * @param includeHeadlines
 *          if true will output the sentences marked as news headlines
 * @throws IOException
 *           if the underlying streams cannot be created
 * @throws IllegalStateException
 *           if an {@link UnsupportedEncodingException} is raised while the
 *           corpus is opened
 */
public ADSentenceSampleStream(InputStreamFactory in, String charsetName,
    boolean includeHeadlines) throws IOException {
  try {
    this.adSentenceStream =
        new ADSentenceStream(new PlainTextByLineStream(in, charsetName));
  } catch (UnsupportedEncodingException e) {
    // The charset name is chosen by the caller (it is not hard-wired to UTF-8),
    // so keep the exception type but report which name was in use.
    throw new IllegalStateException("Unsupported encoding: " + charsetName, e);
  }
  // Sort a private copy: sorting Factory.ptEosCharacters itself would reorder
  // the shared array for every other user of Factory.
  ptEosCharacters = Factory.ptEosCharacters.clone();
  Arrays.sort(ptEosCharacters);
  this.isIncludeTitles = includeHeadlines;
}
/**
 * Builds a {@link NameSampleDataStream} over the given text, which is encoded
 * to UTF-8 bytes and read back as UTF-8 lines.
 *
 * @param sampleText the text to expose as a sample stream.
 * @return a stream of {@link NameSample} objects backed by {@code sampleText}.
 * @throws IOException if the line stream cannot be created.
 */
private ObjectStream<NameSample> sampleStream(String sampleText) throws IOException {
  // Every call to the factory re-encodes the text and hands out a fresh stream.
  InputStreamFactory factory = () -> {
    byte[] utf8 = sampleText.getBytes(StandardCharsets.UTF_8);
    return new ByteArrayInputStream(utf8);
  };
  PlainTextByLineStream lines = new PlainTextByLineStream(factory, StandardCharsets.UTF_8);
  return new NameSampleDataStream(lines);
}
/**
 * Opens the {@code /opennlp/tools/formats/ad.sample} resource as an
 * {@link ADSentenceStream}, reading it with the "UTF-8" charset name.
 *
 * @return the sentence stream over the sample resource.
 * @throws IOException if the line stream cannot be created.
 */
private static ADSentenceStream openData() throws IOException {
  PlainTextByLineStream lines = new PlainTextByLineStream(
      new ResourceAsStreamFactory(ADParagraphStreamTest.class,
          "/opennlp/tools/formats/ad.sample"),
      "UTF-8");
  return new ADSentenceStream(lines);
}
}
/**
 * Builds a {@link WordTagSampleStream} over the
 * {@code /opennlp/tools/postag/AnnotatedSentences.txt} resource, read as
 * UTF-8 lines.
 *
 * @return the POS sample stream over the resource.
 * @throws IOException if the line stream cannot be created.
 */
private static ObjectStream<POSSample> createSampleStream() throws IOException {
  InputStreamFactory factory = new ResourceAsStreamFactory(
      POSTaggerMETest.class, "/opennlp/tools/postag/AnnotatedSentences.txt");
  PlainTextByLineStream lines = new PlainTextByLineStream(factory, StandardCharsets.UTF_8);
  return new WordTagSampleStream(lines);
}
/**
 * Builds a {@link SentenceSampleStream} over the
 * {@code /opennlp/tools/sentdetect/Sentences.txt} resource, read as UTF-8
 * lines.
 *
 * @return the sentence sample stream over the resource.
 * @throws IOException if the line stream cannot be created.
 */
private static ObjectStream<SentenceSample> createSampleStream() throws IOException {
  return new SentenceSampleStream(
      new PlainTextByLineStream(
          new ResourceAsStreamFactory(
              SentenceDetectorFactoryTest.class, "/opennlp/tools/sentdetect/Sentences.txt"),
          StandardCharsets.UTF_8));
}
/**
 * Builds a {@link WordTagSampleStream} over the
 * {@code /opennlp/tools/postag/AnnotatedSentences.txt} resource, read as
 * UTF-8 lines.
 *
 * @return the POS sample stream over the resource.
 * @throws IOException if the line stream cannot be created.
 */
private static ObjectStream<POSSample> createSampleStream() throws IOException {
  InputStreamFactory resource = new ResourceAsStreamFactory(
      POSTaggerFactoryTest.class, "/opennlp/tools/postag/AnnotatedSentences.txt");
  PlainTextByLineStream sentences =
      new PlainTextByLineStream(resource, StandardCharsets.UTF_8);
  return new WordTagSampleStream(sentences);
}
/**
 * Builds a {@link ParseSampleStream} over the
 * {@code /opennlp/tools/parser/test.parse} resource, read as UTF-8 lines.
 *
 * @return the parse sample stream over the resource.
 * @throws IOException if the line stream cannot be created.
 */
private static ObjectStream<Parse> createParseSampleStream() throws IOException {
  return new ParseSampleStream(
      new PlainTextByLineStream(
          new ResourceAsStreamFactory(
              ParseSampleStreamTest.class, "/opennlp/tools/parser/test.parse"),
          StandardCharsets.UTF_8));
}
/**
 * Builds a {@link ChunkSampleStream} over the
 * {@code /opennlp/tools/chunker/test.txt} resource, read as UTF-8 lines.
 *
 * @return the chunk sample stream over the resource.
 * @throws IOException if the line stream cannot be created.
 */
private static ObjectStream<ChunkSample> createSampleStream() throws IOException {
  ResourceAsStreamFactory resource = new ResourceAsStreamFactory(
      ChunkerFactoryTest.class, "/opennlp/tools/chunker/test.txt");
  PlainTextByLineStream lines = new PlainTextByLineStream(resource, StandardCharsets.UTF_8);
  return new ChunkSampleStream(lines);
}