/**
 * Wraps an in-memory annotated text in a {@link NameSampleDataStream}.
 *
 * <p>The text is encoded as UTF-8 bytes and decoded again as UTF-8 by the
 * line stream, so the encode/decode charsets always agree.
 *
 * @param sampleText the annotated sample text to read line by line
 * @return a stream of {@link NameSample} objects backed by {@code sampleText}
 * @throws IOException if the underlying line stream cannot be created
 */
private ObjectStream<NameSample> sampleStream(String sampleText) throws IOException {
  // A fresh ByteArrayInputStream is produced on every factory call, so each
  // opened stream starts reading at the beginning of the text.
  InputStreamFactory textFactory =
      () -> new ByteArrayInputStream(sampleText.getBytes(StandardCharsets.UTF_8));

  ObjectStream<String> textLines =
      new PlainTextByLineStream(textFactory, StandardCharsets.UTF_8);

  return new NameSampleDataStream(textLines);
}
"/opennlp/tools/namefind/voa1.train"); NameSampleDataStream ds = new NameSampleDataStream( new PlainTextByLineStream(in, StandardCharsets.UTF_8)); while ((ns = ds.read()) != null) { Span[] nameSpans = ns.getNames(); ds.close();
@Test public void testWithNameTypeAndInvalidData() { try (NameSampleDataStream sampleStream = new NameSampleDataStream( ObjectStreamUtils.createObjectStream("<START:> Name <END>"))) { sampleStream.read(); fail(); } catch (IOException expected) { // the read above is expected to throw an exception } try (NameSampleDataStream sampleStream = new NameSampleDataStream( ObjectStreamUtils.createObjectStream( "<START:street> <START:person> Name <END> <END>"))) { sampleStream.read(); fail(); } catch (IOException expected) { // the read above is expected to throw an exception } }
/** * Checks that invalid spans cause an {@link ObjectStreamException} to be thrown. */ @Test public void testWithoutNameTypeAndInvalidData() { try (NameSampleDataStream sampleStream = new NameSampleDataStream( ObjectStreamUtils.createObjectStream("<START> <START> Name <END>"))) { sampleStream.read(); fail(); } catch (IOException expected) { // the read above is expected to throw an exception } try (NameSampleDataStream sampleStream = new NameSampleDataStream( ObjectStreamUtils.createObjectStream("<START> Name <END> <END>"))) { sampleStream.read(); fail(); } catch (IOException expected) { // the read above is expected to throw an exception } try (NameSampleDataStream sampleStream = new NameSampleDataStream( ObjectStreamUtils.createObjectStream( "<START> <START> Person <END> Street <END>"))) { sampleStream.read(); fail(); } catch (IOException expected) { // the read above is expected to throw an exception } }
/**
 * Creates a {@link NameSample} stream from the annotated corpus resource
 * {@code /opennlp/tools/namefind/AnnotatedSentences.txt}.
 *
 * <p>The resource is resolved relative to
 * {@link DictionaryNameFinderEvaluatorTest} and decoded as ISO-8859-1.
 *
 * @return a stream of name samples read line by line from the corpus
 * @throws IOException if the line stream over the resource cannot be created
 * @throws URISyntaxException declared by the signature; no statement in this
 *         method visibly raises it (NOTE(review): confirm whether it is still needed)
 */
private static ObjectStream<NameSample> createSample() throws IOException, URISyntaxException {
  InputStreamFactory corpusFactory = new ResourceAsStreamFactory(
      DictionaryNameFinderEvaluatorTest.class,
      "/opennlp/tools/namefind/AnnotatedSentences.txt");

  ObjectStream<String> corpusLines =
      new PlainTextByLineStream(corpusFactory, StandardCharsets.ISO_8859_1);

  return new NameSampleDataStream(corpusLines);
}
"/opennlp/tools/namefind/AnnotatedSentences.txt"); NameSampleDataStream ds = new NameSampleDataStream( new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1)); NameSample ns = ds.read(); spans.add(nameSpan); ns = ds.read(); ds.close();
/**
 * Trains a name finder model on {@code AnnotatedSentencesWithTypes.txt} and
 * serializes it into a newly created temporary file.
 *
 * <p>Training uses 70 iterations and a cutoff of 1, language code
 * {@code "eng"}, and no type override ({@code null}).
 *
 * @return the temporary {@code model*.bin} file holding the serialized model;
 *         this method does not delete it
 * @throws IOException if reading the corpus, training, or writing the model fails
 */
private File trainModel() throws IOException {
  MockInputStreamFactory corpusFactory = new MockInputStreamFactory(
      new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt"));
  ObjectStream<String> corpusLines =
      new PlainTextByLineStream(corpusFactory, StandardCharsets.ISO_8859_1);

  TrainingParameters trainingParams = new TrainingParameters();
  trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
  trainingParams.put(TrainingParameters.CUTOFF_PARAM, 1);

  TokenNameFinderModel trainedModel;
  TokenNameFinderFactory factory = new TokenNameFinderFactory();

  // Closing the sample stream when training finishes releases the corpus input.
  try (ObjectStream<NameSample> samples = new NameSampleDataStream(corpusLines)) {
    trainedModel = NameFinderME.train("eng", null, samples, trainingParams, factory);
  }

  File target = File.createTempFile("model", ".bin");
  try (BufferedOutputStream sink =
           new BufferedOutputStream(new FileOutputStream(target))) {
    trainedModel.serialize(sink);
  }

  return target;
}
NameSampleDataStream ds = new NameSampleDataStream( new PlainTextByLineStream(in, StandardCharsets.UTF_8)); NameSample ns = ds.read(); ns = ds.read(); assertEquals(1, ns.getSentence().length); assertEquals("<head/>", ns.getSentence()[0]); ns = ds.read(); assertEquals(1, ns.getSentence().length); assertEquals("<body>", ns.getSentence()[0]); ns = ds.read(); assertEquals(1, ns.getSentence().length); assertEquals("<ul>", ns.getSentence()[0]); ns = ds.read(); assertEquals(6, ns.getSentence().length); assertEquals("<li>", ns.getSentence()[0]); ns = ds.read(); assertEquals(7, ns.getSentence().length); assertEquals("<li>", ns.getSentence()[0]); ns = ds.read(); assertEquals(1, ns.getSentence().length); assertEquals("</ul>", ns.getSentence()[0]);
/**
 * Creates a {@link NameSample} stream from command-line arguments.
 *
 * <p>The arguments are parsed into {@code Parameters}; the data file is passed
 * to {@code CmdLineUtil.checkInputFile} and then read line by line using the
 * encoding from the parsed parameters.
 *
 * @param args the command-line arguments describing the data file and encoding
 * @return a name sample stream over the lines of the data file
 */
public ObjectStream<NameSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  CmdLineUtil.checkInputFile("Data", params.getData());

  InputStreamFactory dataFactory = CmdLineUtil.createInputStreamFactory(params.getData());

  ObjectStream<String> dataLines = null;
  try {
    dataLines = new PlainTextByLineStream(dataFactory, params.getEncoding());
  } catch (IOException e) {
    // Error reporting is delegated to the shared command-line helper.
    // NOTE(review): if the helper returns normally, dataLines stays null and
    // is handed to NameSampleDataStream below -- confirm the helper always throws.
    CmdLineUtil.handleCreateObjectStreamError(e);
  }

  return new NameSampleDataStream(dataLines);
}
}
ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/voa1.train")), "UTF-8"));
/**
 * Test that tries to reproduce jira OPENNLP-466.
 *
 * <p>Runs a 2-fold cross validation with a {@link NameEvaluationErrorListener}
 * attached and checks that the listener wrote output and that an F-measure is
 * available afterwards.
 *
 * @throws Exception if reading the corpus or the cross validation fails
 */
@Test
public void testWithNameEvaluationErrorListener() throws Exception {
  InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
      "/opennlp/tools/namefind/AnnotatedSentences.txt");

  TrainingParameters mlParams = new TrainingParameters();
  mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
  mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
  mlParams.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.toString());

  ByteArrayOutputStream out = new ByteArrayOutputStream();
  NameEvaluationErrorListener listener = new NameEvaluationErrorListener(out);

  Map<String, Object> resources = Collections.emptyMap();
  TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("eng",
      TYPE, mlParams, null, resources, listener);

  // try-with-resources closes the corpus stream even if the evaluation throws.
  try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
      new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1))) {
    cv.evaluate(sampleStream, 2);
  }

  Assert.assertTrue(out.size() > 0);
  Assert.assertNotNull(cv.getFMeasure());
}
@Test public void testOnlyWithNamesTypeOverride() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("eng", TYPE_OVERRIDE, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " + "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 2, TYPE_OVERRIDE), names1[0]); Assert.assertEquals(new Span(2, 4, TYPE_OVERRIDE), names1[1]); Assert.assertEquals(new Span(4, 6, TYPE_OVERRIDE), names1[2]); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); }
/** * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithNames() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("eng", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " + "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 2, DEFAULT), names1[0]); Assert.assertEquals(new Span(2, 4, DEFAULT), names1[1]); Assert.assertEquals(new Span(4, 6, DEFAULT), names1[2]); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); }
/**
 * Runs a 2-fold cross validation on {@code AnnotatedSentencesInsufficient.txt}
 * and expects an {@link InsufficientTrainingDataException}.
 *
 * @throws Exception the expected {@link InsufficientTrainingDataException}, or
 *         any other failure while reading the corpus
 */
@Test(expected = InsufficientTrainingDataException.class)
public void testWithInsufficientData() throws Exception {
  InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
      "/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt");

  TrainingParameters mlParams = new TrainingParameters();
  mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
  mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
  mlParams.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.toString());

  TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("eng",
      TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor)null);

  // The evaluation is expected to throw; try-with-resources still closes the
  // corpus stream on that path and lets the exception propagate to JUnit.
  try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
      new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1))) {
    cv.evaluate(sampleStream, 2);
  }
}
/** * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithEntitiesWithTypes() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithEntitiesWithTypes.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT"); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("eng", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = "NATO United States Barack Obama".split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 1, "organization"), names1[0]); // NATO Assert.assertEquals(new Span(1, 3, "location"), names1[1]); // United States Assert.assertEquals("person", names1[2].getType()); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); }
/** * Train NamefinderME using OnlyWithNamesWithTypes.train. * The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithNamesWithTypes() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("eng", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " + "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 2, "person"), names1[0]); Assert.assertEquals(new Span(2, 4, "person"), names1[1]); Assert.assertEquals(new Span(4, 6, "person"), names1[2]); Assert.assertEquals("person", names1[2].getType()); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); }
/**
 * Test that reproduces jira OPENNLP-463.
 *
 * <p>Runs a 2-fold cross validation with a {@code null} feature generator
 * argument and a {@code null} evaluation monitor, then checks that an
 * F-measure is available.
 *
 * @throws Exception if reading the corpus or the cross validation fails
 */
@Test
public void testWithNullResources() throws Exception {
  InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
      "/opennlp/tools/namefind/AnnotatedSentences.txt");

  TrainingParameters mlParams = new TrainingParameters();
  mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
  mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
  mlParams.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.toString());

  TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("eng",
      TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor)null);

  // try-with-resources closes the corpus stream even if the evaluation throws.
  try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
      new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1))) {
    cv.evaluate(sampleStream, 2);
  }

  Assert.assertNotNull(cv.getFMeasure());
}
new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/AnnotatedSentences.txt")), encoding));
ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), encoding));
ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/voa1.train")), "UTF-8"));