/**
 * Initializes the sample stream for the EVALITA corpus format.
 *
 * @param lang the corpus language variant
 * @param in factory supplying the raw corpus data
 * @param types bit mask selecting which entity types to generate
 * @throws IOException if the underlying line stream cannot be created
 */
public EvalitaNameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException {
  this.lang = lang;
  try {
    this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
    // NOTE(review): replacing System.out is a global side effect; kept for
    // backward compatibility, but consider moving it out of the constructor.
    // Use the charset constant's name instead of the raw "UTF-8" literal for
    // consistency with the line-stream construction above.
    System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8.name()));
  } catch (UnsupportedEncodingException e) {
    // UTF-8 is guaranteed on every JVM, so this cannot happen.
    throw new IllegalStateException(e);
  }
  this.types = types;
}
/**
 * Initializes the sample stream for the BioNLP-2004 corpus format.
 *
 * @param in factory supplying the raw corpus data
 * @param types bit mask selecting which entity types to generate
 * @throws IOException if the underlying line stream cannot be created
 */
public BioNLP2004NameSampleStream(InputStreamFactory in, int types) throws IOException {
  try {
    this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
    // NOTE(review): replacing System.out is a global side effect; kept for
    // backward compatibility, but consider moving it out of the constructor.
    // Use the charset constant's name instead of the raw "UTF-8" literal for
    // consistency with the line-stream construction above.
    System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8.name()));
  } catch (UnsupportedEncodingException e) {
    // UTF-8 is guaranteed on every JVM, so this cannot happen.
    throw new IllegalStateException(e);
  }
  this.types = types;
}
/**
 * Opens the bundled AD sample corpus as a sentence stream.
 *
 * @return a sentence stream over {@code ad.sample}
 * @throws IOException if the resource cannot be opened
 */
private static ADSentenceStream openData() throws IOException {
  InputStreamFactory in = new ResourceAsStreamFactory(ADParagraphStreamTest.class,
      "/opennlp/tools/formats/ad.sample");
  // Use the Charset overload for consistency with the rest of the code base
  // instead of the raw "UTF-8" encoding-name string.
  return new ADSentenceStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
}
}
/** Builds a POS sample stream over the bundled annotated-sentences test resource. */
private static ObjectStream<POSSample> createSampleStream() throws IOException {
  InputStreamFactory factory = new ResourceAsStreamFactory(POSTaggerMETest.class,
      "/opennlp/tools/postag/AnnotatedSentences.txt");
  PlainTextByLineStream lines = new PlainTextByLineStream(factory, StandardCharsets.UTF_8);
  return new WordTagSampleStream(lines);
}
/** Builds a sentence-sample stream over the bundled Sentences.txt test resource. */
private static ObjectStream<SentenceSample> createSampleStream() throws IOException {
  InputStreamFactory factory = new ResourceAsStreamFactory(SentenceDetectorFactoryTest.class,
      "/opennlp/tools/sentdetect/Sentences.txt");
  PlainTextByLineStream lines = new PlainTextByLineStream(factory, StandardCharsets.UTF_8);
  return new SentenceSampleStream(lines);
}
/** Builds a POS sample stream over the bundled annotated-sentences test resource. */
private static ObjectStream<POSSample> createSampleStream() throws IOException {
  InputStreamFactory factory = new ResourceAsStreamFactory(POSTaggerFactoryTest.class,
      "/opennlp/tools/postag/AnnotatedSentences.txt");
  PlainTextByLineStream lines = new PlainTextByLineStream(factory, StandardCharsets.UTF_8);
  return new WordTagSampleStream(lines);
}
/** Builds a parse-sample stream over the bundled test.parse resource. */
private static ObjectStream<Parse> createParseSampleStream() throws IOException {
  InputStreamFactory factory = new ResourceAsStreamFactory(ParseSampleStreamTest.class,
      "/opennlp/tools/parser/test.parse");
  PlainTextByLineStream lines = new PlainTextByLineStream(factory, StandardCharsets.UTF_8);
  return new ParseSampleStream(lines);
}
/** Builds a chunk-sample stream over the bundled chunker test resource. */
private static ObjectStream<ChunkSample> createSampleStream() throws IOException {
  ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(ChunkerFactoryTest.class,
      "/opennlp/tools/chunker/test.txt");
  PlainTextByLineStream lines = new PlainTextByLineStream(streamFactory, StandardCharsets.UTF_8);
  return new ChunkSampleStream(lines);
}
/**
 * Builds a document-sample stream over the bundled doccat sample resource.
 *
 * @return a sample stream reading {@code DoccatSample.txt}
 * @throws IOException if the resource cannot be opened
 */
private static ObjectStream<DocumentSample> createSampleStream() throws IOException {
  InputStreamFactory isf = new ResourceAsStreamFactory(DoccatFactoryTest.class,
      "/opennlp/tools/doccat/DoccatSample.txt");
  // Use the Charset constant instead of the raw "UTF-8" string for consistency
  // with the rest of the code base.
  return new DocumentSampleStream(new PlainTextByLineStream(isf, StandardCharsets.UTF_8));
}
/** Creates an event stream backed by the bundled football data set (ASCII encoded). */
public FootballEventStream() throws IOException {
  URLInputStreamFactory factory = new URLInputStreamFactory(
      this.getClass().getResource("/opennlp/tools/ml/maxent/football.dat"));
  textStream = new PlainTextByLineStream(factory, StandardCharsets.US_ASCII);
}
/**
 * Loads all chunk samples from the bundled AD sample corpus before each test.
 *
 * @throws IOException if the resource cannot be read
 */
@Before
public void setup() throws IOException {
  InputStreamFactory in = new ResourceAsStreamFactory(ADParagraphStreamTest.class,
      "/opennlp/tools/formats/ad.sample");
  // Use the Charset constant instead of the raw "UTF-8" string for consistency
  // with the rest of the code base.
  try (ADChunkSampleStream stream =
      new ADChunkSampleStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8))) {
    ChunkSample sample;
    while ((sample = stream.read()) != null) {
      samples.add(sample);
    }
  }
}
/** Opens the named corpus file from the OpenNLP data directory as a Latin-1 line stream. */
private static ObjectStream<String> getLineSample(String corpus) throws IOException {
  File corpusFile = new File(getOpennlpDataDir(), corpus);
  return new PlainTextByLineStream(
      new MarkableFileInputStreamFactory(corpusFile), StandardCharsets.ISO_8859_1);
}
/** Loads all name samples from the bundled AD sample corpus before each test. */
@Before
public void setup() throws IOException {
  InputStreamFactory factory = new ResourceAsStreamFactory(ADParagraphStreamTest.class,
      "/opennlp/tools/formats/ad.sample");
  try (ADNameSampleStream stream =
      new ADNameSampleStream(new PlainTextByLineStream(factory, StandardCharsets.UTF_8), true)) {
    // Drain the stream, collecting every sample it produces.
    for (NameSample s = stream.read(); s != null; s = stream.read()) {
      samples.add(s);
    }
  }
}
/**
 * Creates a language-detector sample stream over the bundled doccat sample resource.
 *
 * @return a sample stream reading {@code DoccatSample.txt}
 * @throws IOException if the resource cannot be opened
 */
public static LanguageDetectorSampleStream createSampleStream() throws IOException {
  ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
      LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
  // Use the Charset constant instead of the raw "UTF-8" string for consistency
  // with the rest of the code base.
  PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, StandardCharsets.UTF_8);
  return new LanguageDetectorSampleStream(lineStream);
}
}
/** Trains an English chunker model from the given training file and parameters. */
private static ChunkerModel train(File trainFile, TrainingParameters params) throws IOException {
  MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(trainFile);
  ObjectStream<ChunkSample> sampleStream =
      new ChunkSampleStream(new PlainTextByLineStream(factory, StandardCharsets.UTF_8));
  return ChunkerME.train("eng", sampleStream, params, new ChunkerFactory());
}
/**
 * Verifies that lemmatizer training with too little data fails with
 * {@link InsufficientTrainingDataException}.
 *
 * @throws IOException if the training file cannot be read
 */
@Test(expected = InsufficientTrainingDataException.class)
public void testInsufficientData() throws IOException {
  // Use the Charset constant instead of the raw "UTF-8" string for consistency
  // with the rest of the code base.
  ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
      new PlainTextByLineStream(new MockInputStreamFactory(
          new File("opennlp/tools/lemmatizer/trial.old-insufficient.tsv")),
          StandardCharsets.UTF_8));
  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ITERATIONS_PARAM, 100);
  params.put(TrainingParameters.CUTOFF_PARAM, 5);
  LemmatizerME.train("eng", sampleStream, params, new LemmatizerFactory());
}
/**
 * Verifies that chunker training with too little data fails with
 * {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void testInsufficientData() throws IOException {
  ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(getClass(),
      "/opennlp/tools/chunker/test-insufficient.txt");
  ObjectStream<ChunkSample> samples =
      new ChunkSampleStream(new PlainTextByLineStream(streamFactory, StandardCharsets.UTF_8));
  TrainingParameters trainingParams = new TrainingParameters();
  trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
  trainingParams.put(TrainingParameters.CUTOFF_PARAM, 1);
  ChunkerME.train("eng", samples, trainingParams, new ChunkerFactory());
}
/**
 * Verifies that tokenizer training with too little data fails with
 * {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void testInsufficientData() throws IOException {
  InputStreamFactory trainingData = new ResourceAsStreamFactory(TokenizerModel.class,
      "/opennlp/tools/tokenize/token-insufficient.train");
  ObjectStream<TokenSample> sampleStream =
      new TokenSampleStream(new PlainTextByLineStream(trainingData, StandardCharsets.UTF_8));
  TrainingParameters trainingParams = new TrainingParameters();
  trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
  trainingParams.put(TrainingParameters.CUTOFF_PARAM, 5);
  TokenizerME.train(sampleStream,
      TokenizerFactory.create(null, "eng", null, true, null), trainingParams);
}
/**
 * Verifies that POS-tagger training with too little data fails with
 * {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
  InputStreamFactory streamFactory = new ResourceAsStreamFactory(POSTaggerMETest.class,
      "/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt");
  ObjectStream<POSSample> samples =
      new WordTagSampleStream(new PlainTextByLineStream(streamFactory, StandardCharsets.UTF_8));
  TrainingParameters trainingParams = new TrainingParameters();
  trainingParams.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
  trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
  trainingParams.put(TrainingParameters.CUTOFF_PARAM, 5);
  POSTaggerME.train("eng", samples, trainingParams, new POSTaggerFactory());
}
/**
 * Checks that the line stream splits the test string into individual lines
 * and signals end of data with {@code null}.
 *
 * @throws IOException if reading fails
 */
@Test
public void testLineSegmentation() throws IOException {
  // try-with-resources guarantees the stream is closed even when an assertion
  // fails; the original manual close() leaked the stream on test failure.
  try (ObjectStream<String> stream =
      new PlainTextByLineStream(new MockInputStreamFactory(testString), StandardCharsets.UTF_8)) {
    Assert.assertEquals("line1", stream.read());
    Assert.assertEquals("line2", stream.read());
    Assert.assertEquals("line3", stream.read());
    Assert.assertEquals("line4", stream.read());
    Assert.assertNull(stream.read());
  }
}