opennlp.tools.namefind.NameSampleDataStream java code examples

/**
 * Wraps the given annotated text in a {@link NameSampleDataStream}.
 * The text is encoded to UTF-8 bytes and read back line by line with the same charset.
 *
 * @param sampleText the annotated text to parse
 * @return a stream of {@link NameSample}s backed by {@code sampleText}
 * @throws IOException if the line stream over the text cannot be created
 */
private ObjectStream<NameSample> sampleStream(String sampleText) throws IOException {
 InputStreamFactory textFactory =
   () -> new ByteArrayInputStream(sampleText.getBytes(StandardCharsets.UTF_8));
 ObjectStream<String> lines = new PlainTextByLineStream(textFactory, StandardCharsets.UTF_8);
 return new NameSampleDataStream(lines);
}

  "/opennlp/tools/namefind/voa1.train");
NameSampleDataStream ds = new NameSampleDataStream(
  new PlainTextByLineStream(in, StandardCharsets.UTF_8));
while ((ns = ds.read()) != null) {
 Span[] nameSpans = ns.getNames();
ds.close();

/**
 * Verifies that malformed typed name annotations are rejected: reading each
 * of the inputs below is expected to fail with an {@link IOException}.
 */
@Test
public void testWithNameTypeAndInvalidData() {
 String[] malformedLines = {
  "<START:> Name <END>",
  "<START:street> <START:person> Name <END> <END>"
 };
 for (String malformedLine : malformedLines) {
  try (NameSampleDataStream samples = new NameSampleDataStream(
    ObjectStreamUtils.createObjectStream(malformedLine))) {
   samples.read();
   fail();
  } catch (IOException expected) {
   // reading a malformed line is supposed to throw
  }
 }
}

/**
 * Checks that untyped name annotations with nested or unbalanced
 * {@code <START>}/{@code <END>} tags cause an {@link IOException} to be
 * thrown when the sample is read.
 */
@Test
public void testWithoutNameTypeAndInvalidData() {
 String[] invalidLines = {
  "<START> <START> Name <END>",
  "<START> Name <END> <END>",
  "<START> <START> Person <END> Street <END>"
 };
 for (String invalidLine : invalidLines) {
  try (NameSampleDataStream sampleStream = new NameSampleDataStream(
    ObjectStreamUtils.createObjectStream(invalidLine))) {
   sampleStream.read();
   // name the offending input so a failure is diagnosable without a debugger
   fail("Expected an IOException for invalid input: " + invalidLine);
  } catch (IOException expected) {
   // the read above is expected to throw an exception
  }
 }
}

/**
 * Creates a {@link NameSample} stream from the annotated corpus
 * {@code /opennlp/tools/namefind/AnnotatedSentences.txt}, decoded as ISO-8859-1.
 *
 * @return the stream of name samples read line by line from the corpus
 * @throws IOException if the line stream over the corpus cannot be created
 * @throws URISyntaxException declared in the signature; which call, if any,
 *         raises it is not visible in this method
 */
private static ObjectStream<NameSample> createSample() throws IOException,
  URISyntaxException {
 InputStreamFactory corpus = new ResourceAsStreamFactory(
   DictionaryNameFinderEvaluatorTest.class,
   "/opennlp/tools/namefind/AnnotatedSentences.txt");
 ObjectStream<String> lines = new PlainTextByLineStream(corpus, StandardCharsets.ISO_8859_1);
 return new NameSampleDataStream(lines);
}

  "/opennlp/tools/namefind/AnnotatedSentences.txt");
NameSampleDataStream ds = new NameSampleDataStream(
  new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
NameSample ns = ds.read();
  spans.add(nameSpan);
 ns = ds.read();
ds.close();

/**
 * Trains a name finder model on {@code AnnotatedSentencesWithTypes.txt}
 * (70 iterations, cutoff 1) and serializes it to a temporary file.
 *
 * @return the temporary {@code model*.bin} file holding the serialized model
 * @throws IOException if reading the training data or writing the model fails
 */
private File trainModel() throws IOException {
 MockInputStreamFactory corpus = new MockInputStreamFactory(
   new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt"));
 ObjectStream<String> lines = new PlainTextByLineStream(corpus, StandardCharsets.ISO_8859_1);

 TrainingParameters trainingParams = new TrainingParameters();
 trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
 trainingParams.put(TrainingParameters.CUTOFF_PARAM, 1);

 TokenNameFinderFactory factory = new TokenNameFinderFactory();
 TokenNameFinderModel trained;
 // closing the sample stream also releases the line stream it wraps
 try (ObjectStream<NameSample> samples = new NameSampleDataStream(lines)) {
  trained = NameFinderME.train("eng", null, samples, trainingParams, factory);
 }

 File target = File.createTempFile("model", ".bin");
 try (BufferedOutputStream sink =
      new BufferedOutputStream(new FileOutputStream(target))) {
  trained.serialize(sink);
 }
 return target;
}

NameSampleDataStream ds = new NameSampleDataStream(
  new PlainTextByLineStream(in, StandardCharsets.UTF_8));
NameSample ns = ds.read();
ns = ds.read();
assertEquals(1, ns.getSentence().length);
assertEquals("<head/>", ns.getSentence()[0]);
ns = ds.read();
assertEquals(1, ns.getSentence().length);
assertEquals("<body>", ns.getSentence()[0]);
ns = ds.read();
assertEquals(1, ns.getSentence().length);
assertEquals("<ul>", ns.getSentence()[0]);
ns = ds.read();
assertEquals(6, ns.getSentence().length);
assertEquals("<li>", ns.getSentence()[0]);
ns = ds.read();
assertEquals(7, ns.getSentence().length);
assertEquals("<li>", ns.getSentence()[0]);
ns = ds.read();
assertEquals(1, ns.getSentence().length);
assertEquals("</ul>", ns.getSentence()[0]);

 /**
  * Builds a {@link NameSampleDataStream} over the data file named in the
  * command line arguments.
  *
  * @param args the command line arguments, parsed into {@code Parameters}
  * @return a name sample stream reading the data file line by line in the
  *         encoding given by the parameters
  */
 public ObjectStream<NameSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  // validate the input file before trying to open it
  CmdLineUtil.checkInputFile("Data", params.getData());

  InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());

  ObjectStream<String> lineStream = null;
  try {
   lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
  } catch (IOException ex) {
   // NOTE(review): presumably this helper aborts by throwing; if it ever
   // returns normally, lineStream stays null below - confirm
   CmdLineUtil.handleCreateObjectStreamError(ex);
  }

  return new NameSampleDataStream(lineStream);
 }
}

ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
  new PlainTextByLineStream(new MockInputStreamFactory(
    new File("opennlp/tools/namefind/voa1.train")), "UTF-8"));

/**
 * Test that tries to reproduce jira OPENNLP-466: runs a 2-fold cross
 * validation with a {@link NameEvaluationErrorListener} attached and checks
 * that the listener wrote output and an F-measure is available.
 */
@Test
public void testWithNameEvaluationErrorListener() throws Exception {
 InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
   "/opennlp/tools/namefind/AnnotatedSentences.txt");
 TrainingParameters mlParams = new TrainingParameters();
 mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
 mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
 mlParams.put(TrainingParameters.ALGORITHM_PARAM,
   ModelType.MAXENT.toString());
 ByteArrayOutputStream out = new ByteArrayOutputStream();
 NameEvaluationErrorListener listener = new NameEvaluationErrorListener(out);
 Map<String, Object> resources = Collections.emptyMap();
 TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("eng",
   TYPE, mlParams, null, resources, listener);
 // release the sample stream and the resource behind it even if evaluation fails
 try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
   new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1))) {
  cv.evaluate(sampleStream, 2);
 }
 Assert.assertTrue(out.size() > 0);
 Assert.assertNotNull(cv.getFMeasure());
}

/**
 * Trains a name finder on OnlyWithNames.train with {@code TYPE_OVERRIDE} as
 * the type and checks that the first three detected spans carry that type.
 */
@Test
public void testOnlyWithNamesTypeOverride() throws Exception {
 TrainingParameters params = new TrainingParameters();
 params.put(TrainingParameters.ITERATIONS_PARAM, 70);
 params.put(TrainingParameters.CUTOFF_PARAM, 1);
 // train the name finder; the training stream is closed once the model is built
 TokenNameFinderModel nameFinderModel;
 try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
   new PlainTextByLineStream(new MockInputStreamFactory(
    new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8"))) {
  nameFinderModel = NameFinderME.train("eng", TYPE_OVERRIDE, sampleStream,
    params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
 }
 NameFinderME nameFinder = new NameFinderME(nameFinderModel);
 // now test if it can detect the sample sentences
 String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " +
   "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+");
 Span[] names1 = nameFinder.find(sentence);
 Assert.assertEquals(new Span(0, 2, TYPE_OVERRIDE), names1[0]);
 Assert.assertEquals(new Span(2, 4, TYPE_OVERRIDE), names1[1]);
 Assert.assertEquals(new Span(4, 6, TYPE_OVERRIDE), names1[2]);
 Assert.assertFalse(hasOtherAsOutcome(nameFinderModel));
}

/**
 * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it.
 * This is related to the issue OPENNLP-9
 */
@Test
public void testOnlyWithNames() throws Exception {
 TrainingParameters params = new TrainingParameters();
 params.put(TrainingParameters.ITERATIONS_PARAM, 70);
 params.put(TrainingParameters.CUTOFF_PARAM, 1);
 // train the name finder; the training stream is closed once the model is built
 TokenNameFinderModel nameFinderModel;
 try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
     new PlainTextByLineStream(new MockInputStreamFactory(
      new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8"))) {
  nameFinderModel = NameFinderME.train("eng", null, sampleStream,
      params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
 }
 NameFinderME nameFinder = new NameFinderME(nameFinderModel);
 // now test if it can detect the sample sentences
 String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " +
     "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+");
 Span[] names1 = nameFinder.find(sentence);
 Assert.assertEquals(new Span(0, 2, DEFAULT), names1[0]);
 Assert.assertEquals(new Span(2, 4, DEFAULT), names1[1]);
 Assert.assertEquals(new Span(4, 6, DEFAULT), names1[2]);
 Assert.assertFalse(hasOtherAsOutcome(nameFinderModel));
}

/**
 * Runs a 2-fold cross validation on AnnotatedSentencesInsufficient.txt and
 * expects it to fail with an {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void testWithInsufficientData() throws Exception {
 InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
   "/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt");
 TrainingParameters mlParams = new TrainingParameters();
 mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
 mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
 mlParams.put(TrainingParameters.ALGORITHM_PARAM,
   ModelType.MAXENT.toString());
 TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("eng",
   TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor)null);
 // the expected exception still propagates; the stream is closed on the way out
 try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
   new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1))) {
  cv.evaluate(sampleStream, 2);
 }
}

/**
 * Train NamefinderME using OnlyWithEntitiesWithTypes.train.
 * The goal is to check if the model validator accepts it.
 * This is related to the issue OPENNLP-9
 */
@Test
public void testOnlyWithEntitiesWithTypes() throws Exception {
 TrainingParameters params = new TrainingParameters();
 params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
 params.put(TrainingParameters.ITERATIONS_PARAM, 70);
 params.put(TrainingParameters.CUTOFF_PARAM, 1);
 // train the name finder; the training stream is closed once the model is built
 TokenNameFinderModel nameFinderModel;
 try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
   new PlainTextByLineStream(new MockInputStreamFactory(
    new File("opennlp/tools/namefind/OnlyWithEntitiesWithTypes.train")), "UTF-8"))) {
  nameFinderModel = NameFinderME.train("eng", null, sampleStream,
    params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
 }
 NameFinderME nameFinder = new NameFinderME(nameFinderModel);
 // now test if it can detect the sample sentences
 String[] sentence = "NATO United States Barack Obama".split("\\s+");
 Span[] names1 = nameFinder.find(sentence);
 Assert.assertEquals(new Span(0, 1, "organization"), names1[0]); // NATO
 Assert.assertEquals(new Span(1, 3, "location"), names1[1]); // United States
 Assert.assertEquals("person", names1[2].getType());
 Assert.assertFalse(hasOtherAsOutcome(nameFinderModel));
}

/**
 * Train NamefinderME using OnlyWithNamesWithTypes.train.
 * The goal is to check if the model validator accepts it.
 * This is related to the issue OPENNLP-9
 */
@Test
public void testOnlyWithNamesWithTypes() throws Exception {
 TrainingParameters params = new TrainingParameters();
 params.put(TrainingParameters.ITERATIONS_PARAM, 70);
 params.put(TrainingParameters.CUTOFF_PARAM, 1);
 // train the name finder; the training stream is closed once the model is built
 TokenNameFinderModel nameFinderModel;
 try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
   new PlainTextByLineStream(new MockInputStreamFactory(
    new File("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")), "UTF-8"))) {
  nameFinderModel = NameFinderME.train("eng", null, sampleStream,
    params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
 }
 NameFinderME nameFinder = new NameFinderME(nameFinderModel);
 // now test if it can detect the sample sentences
 String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " +
   "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+");
 Span[] names1 = nameFinder.find(sentence);
 Assert.assertEquals(new Span(0, 2, "person"), names1[0]);
 Assert.assertEquals(new Span(2, 4, "person"), names1[1]);
 Assert.assertEquals(new Span(4, 6, "person"), names1[2]);
 Assert.assertEquals("person", names1[2].getType());
 Assert.assertFalse(hasOtherAsOutcome(nameFinderModel));
}

/**
 * Test that reproduces jira OPENNLP-463: runs a 2-fold cross validation
 * with a {@code null} evaluation monitor and checks that an F-measure is available.
 */
@Test
public void testWithNullResources() throws Exception {
 InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
   "/opennlp/tools/namefind/AnnotatedSentences.txt");
 TrainingParameters mlParams = new TrainingParameters();
 mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
 mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
 mlParams.put(TrainingParameters.ALGORITHM_PARAM,
   ModelType.MAXENT.toString());
 TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("eng",
   TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor)null);
 // release the sample stream and the resource behind it even if evaluation fails
 try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
   new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1))) {
  cv.evaluate(sampleStream, 2);
 }
 Assert.assertNotNull(cv.getFMeasure());
}

new NameSampleDataStream(
  new PlainTextByLineStream(new MockInputStreamFactory(
   new File("opennlp/tools/namefind/AnnotatedSentences.txt")), encoding));

ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
  new PlainTextByLineStream(new MockInputStreamFactory(
   new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), encoding));

ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
  new PlainTextByLineStream(new MockInputStreamFactory(
   new File("opennlp/tools/namefind/voa1.train")), "UTF-8"));

Javadoc

The NameSampleDataStream class converts tagged Strings provided by a DataStream to NameSample objects. It uses text that is one sentence per line and tokenized, with names identified by <START> and <END> tags.

Most used methods

Popular in Java

Making http post requests using okhttp
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
scheduleAtFixedRate (ScheduledExecutorService)
getContentResolver (Context)
FileInputStream (java.io)
An input stream that reads bytes from a file. File file = ...finally if (in != null) in.clos
MessageDigest (java.security)
Uses a one-way hash function to turn an arbitrary number of bytes into a fixed-length byte sequence.
Calendar (java.util)
Calendar is an abstract base class for converting between a Date object and a set of integer fields
ZipFile (java.util.zip)
This class provides random read access to a zip file. You pay more to read the zip file's central di
Menu (java.awt)
Notification (javax.management)
From CI to AI: The AI layer in your organization

How to use NameSampleDataStream in opennlp.tools.namefind

Best Java code snippets using opennlp.tools.namefind.NameSampleDataStream (Showing top 20 results out of 315)

How to use
NameSampleDataStream
in
opennlp.tools.namefind