/**
 * Regression check for the English sentence model.
 *
 * <p>Loads {@code models-sf/en-sent.bin} from the OpenNLP data directory, concatenates the
 * text of every sample read from the Leipzig English news corpus (tokens joined by single
 * spaces, one extra space after each sample), runs the sentence detector once over the whole
 * text, and feeds the UTF-8 bytes of every detected sentence into a message digest. The
 * resulting digest, read as a non-negative integer, must equal the recorded reference value.
 *
 * @throws Exception if the model or corpus cannot be read, or the digest algorithm is
 *     unavailable
 */
@Test
public void evalSentenceModel() throws Exception {
  File modelFile = new File(getOpennlpDataDir(), "models-sf/en-sent.bin");
  SentenceModel sentenceModel = new SentenceModel(modelFile);

  MessageDigest hasher = MessageDigest.getInstance(HASH_ALGORITHM);

  SentenceDetector detector = new SentenceDetectorME(sentenceModel);

  StringBuilder corpus = new StringBuilder();

  try (ObjectStream<LeipzigTestSample> samples = new LeipzigTestSampleStream(
      25,
      SimpleTokenizer.INSTANCE,
      new MarkableFileInputStreamFactory(
          new File(getOpennlpDataDir(), "leipzig/eng_news_2010_300K-sentences.txt")))) {

    // read() returning null marks the end of the stream.
    for (LeipzigTestSample sample = samples.read(); sample != null; sample = samples.read()) {
      corpus.append(String.join(" ", sample.getText()));
      corpus.append(" ");
    }
  }

  String[] detected = detector.sentDetect(corpus.toString());

  for (String piece : detected) {
    byte[] encoded = piece.getBytes(StandardCharsets.UTF_8);
    hasher.update(encoded);
  }

  BigInteger expected = new BigInteger("228544068397077998410949364710969159291");
  // Signum 1 interprets the raw digest bytes as a non-negative magnitude.
  BigInteger actual = new BigInteger(1, hasher.digest());

  Assert.assertEquals(expected, actual);
}
/**
 * Runs the OpenNLP {@code SentenceDetector} held in {@code detector} on the given text.
 *
 * @param source the text to split into sentences
 * @return the detected sentences, in the order the detector produced them, as a
 *     {@code Collection<String>}; the collection is the fixed-size list view that
 *     {@link java.util.Arrays#asList} creates over the detector's result array
 */
protected Collection<String> extractCandidates(String source) {
  String[] detected = detector.sentDetect(source);
  return Arrays.asList(detected);
}
}
/**
 * Runs the OpenNLP {@code SentenceDetector} held in {@code detector} on the given text.
 *
 * @param source the text to split into sentences
 * @return the detected sentences, in the order the detector produced them, as a
 *     {@code Collection<String>}; the collection is the fixed-size list view that
 *     {@link java.util.Arrays#asList} creates over the detector's result array
 */
protected Collection<String> extractCandidates(String source) {
  String[] detected = detector.sentDetect(source);
  return Arrays.asList(detected);
}
}
/**
 * Extracts sentences from the given HTML.
 *
 * <p>The markup is stripped with {@code HtmlUtils.removeHtml}, the remaining text is split on
 * newline characters, and the sentence detector obtained from {@code getSentenceDetector()}
 * is run on each trimmed line. Detected sentences that are empty after trimming are dropped;
 * all others are kept exactly as the detector returned them.
 *
 * @param htmlBlock the HTML to extract sentences from
 * @return the non-blank sentences, in document order
 */
@Override
protected Collection<String> extractCandidates(String htmlBlock) {
  String plainText = HtmlUtils.removeHtml(htmlBlock);
  String[] textLines = plainText.split("\n");
  List<String> sentences = new ArrayList<String>();
  SentenceDetector sentenceDetector = getSentenceDetector();

  for (String rawLine : textLines) {
    String[] detected = sentenceDetector.sentDetect(rawLine.trim());
    for (String candidate : detected) {
      // Skip results that consist solely of whitespace.
      if (candidate.trim().isEmpty()) {
        continue;
      }
      sentences.add(candidate);
    }
  }

  return sentences;
}
/**
 * Extracts sentences from the given HTML.
 *
 * <p>The markup is stripped with {@code HtmlUtils.removeHtml}, the remaining text is split on
 * newline characters, and the sentence detector obtained from {@code getSentenceDetector()}
 * is run on each trimmed line. Detected sentences that are empty after trimming are dropped;
 * all others are kept exactly as the detector returned them.
 *
 * @param htmlBlock the HTML to extract sentences from
 * @return the non-blank sentences, in document order
 */
@Override
protected Collection<String> extractCandidates(String htmlBlock) {
  String plainText = HtmlUtils.removeHtml(htmlBlock);
  String[] textLines = plainText.split("\n");
  List<String> sentences = new ArrayList<String>();
  SentenceDetector sentenceDetector = getSentenceDetector();

  for (String rawLine : textLines) {
    String[] detected = sentenceDetector.sentDetect(rawLine.trim());
    for (String candidate : detected) {
      // Skip results that consist solely of whitespace.
      if (candidate.trim().isEmpty()) {
        continue;
      }
      sentences.add(candidate);
    }
  }

  return sentences;
}
String sentences[] = sentenceDetector.sentDetect(s);