/**
 * Runs this extractor over the supplied {@link TextDocument} and returns the
 * document's content afterwards.
 *
 * @param doc the {@link TextDocument} handed to {@code process} before its
 *            content is read
 * @return the value of {@code doc.getContent()} once {@code process(doc)} has
 *         returned
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from {@code process} (confirm against its declaration)
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc);
    final String extracted = doc.getContent();
    return extracted;
}
} // closes the enclosing declaration opened outside this excerpt
/**
 * Runs this extractor over the supplied {@link TextDocument} and returns the
 * document's content afterwards.
 *
 * @param doc the {@link TextDocument} handed to {@code process} before its
 *            content is read
 * @return the value of {@code doc.getContent()} once {@code process(doc)} has
 *         returned
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from {@code process} (confirm against its declaration)
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc);
    final String extracted = doc.getContent();
    return extracted;
}
} // closes the enclosing declaration opened outside this excerpt
/**
 * Runs this extractor over the supplied {@link TextDocument} and returns the
 * document's content afterwards.
 *
 * @param doc the {@link TextDocument} handed to {@code process} before its
 *            content is read
 * @return the value of {@code doc.getContent()} once {@code process(doc)} has
 *         returned
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from {@code process} (confirm against its declaration)
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc);
    final String extracted = doc.getContent();
    return extracted;
}
} // closes the enclosing declaration opened outside this excerpt
/**
 * Runs this extractor over the supplied {@link TextDocument} and returns the
 * document's content afterwards.
 *
 * @param doc the {@link TextDocument} handed to {@code process} before its
 *            content is read
 * @return the value of {@code doc.getContent()} once {@code process(doc)} has
 *         returned
 * @throws BoilerpipeProcessingException declared by this method; presumably
 *         propagated from {@code process} (confirm against its declaration)
 */
public String getText(TextDocument doc) throws BoilerpipeProcessingException {
    process(doc);
    final String extracted = doc.getContent();
    return extracted;
}
} // closes the enclosing declaration opened outside this excerpt
/**
 * Extracts article text from the HTML carried by a tuple and emits it
 * downstream.
 *
 * <p>Reads the {@code url}, {@code html} and {@code date} fields from the
 * tuple. When {@code html} is {@code null} the tuple is logged and acked
 * without emitting anything. Otherwise the HTML is parsed into a
 * {@link TextDocument}, run through {@code ArticleSentencesExtractor}, and a
 * tuple of (extracted content, url, date) is emitted anchored to the input,
 * which is then acked. Any exception raised along the way fails the tuple,
 * is logged, and is reported through the collector.
 *
 * @param input the tuple to process; must expose the fields {@code url},
 *              {@code html} and {@code date}
 */
@Override
public void execute(Tuple input) {
    String url = input.getStringByField("url");
    String html = input.getStringByField("html");
    Object date = input.getValueByField("date");

    // Nothing to extract: ack so the tuple is treated as handled rather than failed.
    if (html == null) {
        logger.error("No content for : {}", url);
        collector.ack(input);
        return;
    }

    try {
        TextDocument td = new BoilerpipeSAXInput(
                new InputSource(new StringReader(html))).getTextDocument();
        ArticleSentencesExtractor.INSTANCE.process(td);
        collector.emit(input, new Values(td.getContent(), url, date));
        collector.ack(input);
        logger.info("extracted text for {}", url);
    } catch (Exception e) {
        collector.fail(input);
        // Exactly one placeholder for the URL; the exception goes last with no
        // placeholder of its own so the logging framework records it as the
        // cause (with stack trace) instead of formatting it into the message.
        logger.error("error extracting text from {}", url, e);
        collector.reportError(e);
    }
}