/**
 * Wraps {@code file} in a markable {@link InputStreamFactory}.
 *
 * @param file the file to read from
 * @return a factory producing markable input streams over {@code file}
 * @throws TerminateToolException if the file does not exist
 */
public static InputStreamFactory createInputStreamFactory(File file) {
  InputStreamFactory factory;
  try {
    factory = new MarkableFileInputStreamFactory(file);
  } catch (FileNotFoundException e) {
    throw new TerminateToolException(-1, "File '" + file + "' cannot be found", e);
  }
  return factory;
}
new MarkableFileInputStreamFactory(sentencesFile), StandardCharsets.UTF_8)) {
/**
 * Opens the given corpus file (resolved against the OpenNLP data directory)
 * as a line-wise stream decoded with ISO-8859-1.
 *
 * @param corpus relative path of the corpus file
 * @return a stream yielding one line of the corpus per read
 * @throws IOException if the file cannot be opened
 */
private static ObjectStream<String> getLineSample(String corpus) throws IOException {
  File corpusFile = new File(getOpennlpDataDir(), corpus);
  return new PlainTextByLineStream(
      new MarkableFileInputStreamFactory(corpusFile), StandardCharsets.ISO_8859_1);
}
/**
 * Builds a line-wise Leipzig sample stream (one sentence per sample) over the
 * bundled English news corpus.
 *
 * @return a stream of {@link LeipzigTestSample}s tokenized with the simple tokenizer
 * @throws IOException if the corpus file cannot be opened
 */
private ObjectStream<LeipzigTestSample> createLineWiseStream() throws IOException {
  File sentences =
      new File(getOpennlpDataDir(), "leipzig/eng_news_2010_300K-sentences.txt");
  return new LeipzigTestSampleStream(
      1, SimpleTokenizer.INSTANCE, new MarkableFileInputStreamFactory(sentences));
}
/**
 * Trains a POS model from CoNLL-X formatted training data.
 *
 * @param trainFile CoNLL-X training file, read as UTF-8
 * @param lang ISO language code passed to the trainer
 * @param params training parameters
 * @return the trained {@link POSModel}
 * @throws IOException if reading the training data fails
 */
private POSModel train(File trainFile, String lang, TrainingParameters params)
    throws IOException {
  // try-with-resources: the original leaked the underlying file stream.
  try (ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
      new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8)) {
    return POSTaggerME.train(lang, samples, params, new POSTaggerFactory());
  }
}
/**
 * Verifies the checksum of the Leipzig sentence corpus once before any test runs,
 * so a corrupted or wrong-version corpus fails fast.
 */
@BeforeClass
public static void verifyTrainingData() throws Exception {
  File sentencesFile =
      new File(getOpennlpDataDir(), "leipzig/eng_news_2010_300K-sentences.txt");
  verifyTrainingData(
      new LeipzigTestSampleStream(25, SimpleTokenizer.INSTANCE,
          new MarkableFileInputStreamFactory(sentencesFile)),
      new BigInteger("172812413483919324675263268750583851712"));
}
// Verifies checksums of the CoNLL-2000 chunking data before any test runs.
// NOTE(review): both verifyTrainingData(...) calls below verify TEST_DATA_FILE against
// the SAME expected checksum; TRAIN_DATA_FILE is assigned but never verified. The second
// call presumably should check TRAIN_DATA_FILE against the training file's own digest —
// confirm and fix once the correct checksum for conll00/train.txt is known.
@BeforeClass public static void verifyTrainingData() throws Exception { TEST_DATA_FILE = new File(getOpennlpDataDir(), "conll00/test.txt"); TRAIN_DATA_FILE = new File(getOpennlpDataDir(), "conll00/train.txt"); verifyTrainingData(new ChunkSampleStream( new PlainTextByLineStream(new MarkableFileInputStreamFactory(TEST_DATA_FILE), StandardCharsets.UTF_8)), new BigInteger("84610235226433393380477662908529306002")); verifyTrainingData(new ChunkSampleStream( new PlainTextByLineStream(new MarkableFileInputStreamFactory(TEST_DATA_FILE), StandardCharsets.UTF_8)), new BigInteger("84610235226433393380477662908529306002")); }
/**
 * Trains a chunker model from CoNLL-2000 formatted training data.
 *
 * @param trainFile training file, read as UTF-8
 * @param params training parameters
 * @return the trained {@link ChunkerModel} for English
 * @throws IOException if reading the training data fails
 */
private static ChunkerModel train(File trainFile, TrainingParameters params)
    throws IOException {
  // try-with-resources: the original leaked the underlying file stream.
  try (ObjectStream<ChunkSample> samples = new ChunkSampleStream(
      new PlainTextByLineStream(
          new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8))) {
    return ChunkerME.train("eng", samples, params, new ChunkerFactory());
  }
}
/**
 * Trains a name finder model from CoNLL-02 formatted training data.
 *
 * @param trainFile CoNLL-02 training file
 * @param lang corpus language
 * @param params training parameters
 * @param types entity-type flags for {@link Conll02NameSampleStream}
 * @return the trained {@link TokenNameFinderModel}
 * @throws IOException if reading the training data fails
 */
private TokenNameFinderModel train(File trainFile, LANGUAGE lang, TrainingParameters params,
    int types) throws IOException {
  // try-with-resources: the original leaked the underlying file stream.
  try (ObjectStream<NameSample> samples = new Conll02NameSampleStream(
      lang, new MarkableFileInputStreamFactory(trainFile), types)) {
    // Locale.ROOT avoids locale-dependent lower-casing (e.g. Turkish dotless i)
    // of the language code under non-English default locales.
    return NameFinderME.train(lang.toString().toLowerCase(java.util.Locale.ROOT), null,
        samples, params, new TokenNameFinderFactory());
  }
}
/**
 * Trains a lemmatizer on the given CoNLL-U file and evaluates it on another.
 *
 * @param lang ISO language code passed to the trainer
 * @param trainFile CoNLL-U training file
 * @param params training parameters
 * @param evalFile CoNLL-U evaluation file
 * @return word accuracy of the trained lemmatizer on {@code evalFile}
 * @throws IOException if reading either file fails
 */
private double trainAndEval(String lang, File trainFile, TrainingParameters params,
    File evalFile) throws IOException {
  ConlluTagset tagset = ConlluTagset.X;
  LemmatizerModel model;
  // try-with-resources: the original leaked both underlying file streams.
  try (ObjectStream<LemmaSample> trainSamples = new ConlluLemmaSampleStream(
      new ConlluStream(new MarkableFileInputStreamFactory(trainFile)), tagset)) {
    model = LemmatizerME.train(lang, trainSamples, params, new LemmatizerFactory());
  }
  LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));
  try (ObjectStream<LemmaSample> evalSamples = new ConlluLemmaSampleStream(
      new ConlluStream(new MarkableFileInputStreamFactory(evalFile)), tagset)) {
    evaluator.evaluate(evalSamples);
  }
  return evaluator.getWordAccuracy();
}
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "conllx/data/danish/ddt/train/danish_ddt_train.conll")), StandardCharsets.UTF_8), new BigInteger("30795670444498617202001550516753630016")); new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "conllx/data/danish/ddt/test/danish_ddt_test.conll")), StandardCharsets.UTF_8), new BigInteger("314104267846430512372780024568104131337")); new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "conllx/data/dutch/alpino/train/dutch_alpino_train.conll")), StandardCharsets.UTF_8), new BigInteger("109328245573060521952850454797286933887")); new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "conllx/data/dutch/alpino/test/dutch_alpino_test.conll")), StandardCharsets.UTF_8), new BigInteger("132343141132816640849897155456916243039")); new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll")), StandardCharsets.UTF_8), new BigInteger("9504382474772307801979515927230835901")); new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll")), StandardCharsets.UTF_8), new BigInteger("175256039869578311901318972681191182910")); new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll")), StandardCharsets.UTF_8), new BigInteger("128378790384268106811747599235147991544"));
LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTrainingFile), Conll02NameSampleStream.GENERATE_PERSON_ENTITIES), new BigInteger("109687424525847313767541246922170457976")); verifyTrainingData(new Conll02NameSampleStream( LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestAFile), Conll02NameSampleStream.GENERATE_PERSON_ENTITIES), new BigInteger("12942966701628852910737840182656846323")); verifyTrainingData(new Conll02NameSampleStream( LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestBFile), Conll02NameSampleStream.GENERATE_PERSON_ENTITIES), new BigInteger("223206987942490952427646331013509976957")); LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTrainingFile), Conll02NameSampleStream.GENERATE_PERSON_ENTITIES), new BigInteger("226089384066775461905386060946810714487")); verifyTrainingData(new Conll02NameSampleStream( LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestAFile), Conll02NameSampleStream.GENERATE_PERSON_ENTITIES), new BigInteger("313879596837181728494732341737647284762")); verifyTrainingData(new Conll02NameSampleStream( LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestBFile), Conll02NameSampleStream.GENERATE_PERSON_ENTITIES), new BigInteger("24037715705115461166858183817622459974"));
/**
 * Runs the pre-built sentence model over the Leipzig corpus and checks the
 * digest of the detected sentences against a known value.
 */
@Test
public void evalSentenceModel() throws Exception {
  SentenceModel model = new SentenceModel(
      new File(getOpennlpDataDir(), "models-sf/en-sent.bin"));
  MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
  SentenceDetector sentenceDetector = new SentenceDetectorME(model);
  StringBuilder text = new StringBuilder();
  File sentencesFile =
      new File(getOpennlpDataDir(), "leipzig/eng_news_2010_300K-sentences.txt");
  // Concatenate the corpus (25-line batches) into one big text blob.
  try (ObjectStream<LeipzigTestSample> lineBatches = new LeipzigTestSampleStream(
      25, SimpleTokenizer.INSTANCE, new MarkableFileInputStreamFactory(sentencesFile))) {
    for (LeipzigTestSample batch = lineBatches.read(); batch != null;
        batch = lineBatches.read()) {
      text.append(String.join(" ", batch.getText())).append(" ");
    }
  }
  for (String sentence : sentenceDetector.sentDetect(text.toString())) {
    digest.update(sentence.getBytes(StandardCharsets.UTF_8));
  }
  Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
      new BigInteger(1, digest.digest()));
}
@Test public void evalTokenModel() throws Exception { // the input stream is currently tokenized, we should detokenize it again, // (or extend to pass in tokenizer, then whitespace tokenizer can be passed) // and then tokenize it here TokenizerModel model = new TokenizerModel( new File(getOpennlpDataDir(), "models-sf/en-token.bin")); MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM); Tokenizer tokenizer = new TokenizerME(model); try (ObjectStream<LeipzigTestSample> lines = new LeipzigTestSampleStream(1, WhitespaceTokenizer.INSTANCE, new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(), "leipzig/eng_news_2010_300K-sentences.txt")))) { LeipzigTestSample line; while ((line = lines.read()) != null) { String[] tokens = tokenizer.tokenize(String.join(" ", line.getText())); for (String token : tokens) { digest.update(token.getBytes(StandardCharsets.UTF_8)); } } } Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"), new BigInteger(1, digest.digest())); }
/**
 * Evaluates a chunker model on the given test data and asserts its F-measure.
 *
 * @param model model under test
 * @param testData CoNLL-2000 formatted test file, read as UTF-8
 * @param expectedFMeasure expected F-measure (asserted to 4 decimal places)
 * @throws IOException if reading the test data fails
 */
private static void eval(ChunkerModel model, File testData, double expectedFMeasure)
    throws IOException {
  // try-with-resources: the original leaked the underlying file stream.
  try (ObjectStream<ChunkSample> samples = new ChunkSampleStream(
      new PlainTextByLineStream(
          new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8))) {
    ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
    evaluator.evaluate(samples);
    Assert.assertEquals(expectedFMeasure, evaluator.getFMeasure().getFMeasure(), 0.0001);
  }
}
/**
 * Evaluates a POS model on the given CoNLL-X test data and asserts its accuracy.
 *
 * @param model model under test
 * @param testData CoNLL-X formatted test file, read as UTF-8
 * @param expectedAccuracy expected word accuracy (asserted to 4 decimal places)
 * @throws IOException if reading the test data fails
 */
private void eval(POSModel model, File testData, double expectedAccuracy)
    throws IOException {
  // try-with-resources: the original leaked the underlying file stream.
  try (ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
      new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8)) {
    POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model));
    evaluator.evaluate(samples);
    Assert.assertEquals(expectedAccuracy, evaluator.getWordAccuracy(), 0.0001);
  }
}
/**
 * Evaluates a name finder model on the given CoNLL-02 test data and asserts its F-measure.
 *
 * @param model model under test
 * @param testData CoNLL-02 formatted test file
 * @param lang corpus language
 * @param types entity-type flags for {@link Conll02NameSampleStream}
 * @param expectedFMeasure expected F-measure (asserted to 4 decimal places)
 * @throws IOException if reading the test data fails
 */
private void eval(TokenNameFinderModel model, File testData, LANGUAGE lang, int types,
    double expectedFMeasure) throws IOException {
  // try-with-resources: the original leaked the underlying file stream.
  try (ObjectStream<NameSample> samples = new Conll02NameSampleStream(
      lang, new MarkableFileInputStreamFactory(testData), types)) {
    TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
    evaluator.evaluate(samples);
    Assert.assertEquals(expectedFMeasure, evaluator.getFMeasure().getFMeasure(), 0.0001);
  }
}
inputStreamFactory = new MarkableFileInputStreamFactory( new File("en-sent.train"));
/**
 * Creates a markable {@link InputStreamFactory} for the given file.
 *
 * @param file the file to read from
 * @return a factory producing markable input streams over {@code file}
 * @throws TerminateToolException if the file does not exist
 */
public static InputStreamFactory createInputStreamFactory(File file) {
  try {
    return new MarkableFileInputStreamFactory(file);
  } catch (FileNotFoundException e) {
    // Surface the missing file as a tool-level failure with exit code -1.
    throw new TerminateToolException(-1, "File '" + file + "' cannot be found", e);
  }
}
/**
 * Builds a markable {@link InputStreamFactory} over {@code file}, translating a
 * missing file into a {@link TerminateToolException}.
 *
 * @param file the file to read from
 * @return a factory producing markable input streams over {@code file}
 * @throws TerminateToolException if the file does not exist
 */
public static InputStreamFactory createInputStreamFactory(File file) {
  InputStreamFactory result;
  try {
    result = new MarkableFileInputStreamFactory(file);
  } catch (FileNotFoundException e) {
    throw new TerminateToolException(-1, "File '" + file + "' cannot be found", e);
  }
  return result;
}