/**
 * Processes the given {@link TextDocument} together with the original HTML,
 * supplied as a String.
 *
 * <p>The HTML text is wrapped in a {@link StringReader}-backed
 * {@link InputSource} and handed to the {@code process(doc, InputSource)}
 * overload, whose result is returned unchanged.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document, as text.
 * @return A List of enclosed {@link Image}s
 * @throws BoilerpipeProcessingException
 *             declared by this method and by the overload it delegates to
 */
public List<Image> process(final TextDocument doc, final String origHTML)
        throws BoilerpipeProcessingException {
    final StringReader htmlReader = new StringReader(origHTML);
    final InputSource htmlSource = new InputSource(htmlReader);
    return process(doc, htmlSource);
}
/**
 * Processes the given {@link TextDocument} together with the original HTML,
 * supplied as a String.
 *
 * <p>The HTML text is wrapped in a {@link StringReader}-backed
 * {@link InputSource} and handed to the {@code process(doc, InputSource)}
 * overload, whose result is returned unchanged.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document, as text.
 * @return A List of enclosed {@link Image}s
 * @throws BoilerpipeProcessingException
 *             declared by this method and by the overload it delegates to
 */
public List<Image> process(final TextDocument doc, final String origHTML)
        throws BoilerpipeProcessingException {
    final StringReader htmlReader = new StringReader(origHTML);
    final InputSource htmlSource = new InputSource(htmlReader);
    return process(doc, htmlSource);
}
/**
 * Processes the given {@link TextDocument} together with the original HTML,
 * supplied as a String.
 *
 * <p>The HTML text is wrapped in a {@link StringReader}-backed
 * {@link InputSource} and handed to the {@code process(doc, InputSource)}
 * overload, whose result is returned unchanged.
 *
 * @param doc
 *            The processed {@link TextDocument}.
 * @param origHTML
 *            The original HTML document, as text.
 * @return A List of enclosed {@link Image}s
 * @throws BoilerpipeProcessingException
 *             declared by this method and by the overload it delegates to
 */
public List<Image> process(final TextDocument doc, final String origHTML)
        throws BoilerpipeProcessingException {
    final StringReader htmlReader = new StringReader(origHTML);
    final InputSource htmlSource = new InputSource(htmlReader);
    return process(doc, htmlSource);
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The fetched document is first turned into a {@link TextDocument} via
 * {@link BoilerpipeSAXInput}, the extractor is run on it, and the result is
 * then passed, together with a second {@link InputSource} obtained from the
 * fetched document, to the {@code process(doc, InputSource)} overload.
 *
 * @param url
 *            The location of the HTML document to fetch.
 * @param extractor
 *            The extractor to run on the parsed {@link TextDocument}.
 * @return A List of enclosed {@link Image}s
 * @throws IOException
 *             declared by this method; presumably raised when fetching the
 *             URL fails (confirm against {@link HTMLFetcher})
 * @throws BoilerpipeProcessingException
 *             declared by this method and by the overload it delegates to
 * @throws SAXException
 *             declared by this method; presumably raised while parsing the
 *             HTML (confirm against {@link BoilerpipeSAXInput})
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // First pass: build the TextDocument and let the extractor work on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Second pass: toInputSource() is requested again for the delegate call.
    final InputSource htmlSource = fetched.toInputSource();
    return process(textDoc, htmlSource);
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The fetched document is first turned into a {@link TextDocument} via
 * {@link BoilerpipeSAXInput}, the extractor is run on it, and the result is
 * then passed, together with a second {@link InputSource} obtained from the
 * fetched document, to the {@code process(doc, InputSource)} overload.
 *
 * @param url
 *            The location of the HTML document to fetch.
 * @param extractor
 *            The extractor to run on the parsed {@link TextDocument}.
 * @return A List of enclosed {@link Image}s
 * @throws IOException
 *             declared by this method; presumably raised when fetching the
 *             URL fails (confirm against {@link HTMLFetcher})
 * @throws BoilerpipeProcessingException
 *             declared by this method and by the overload it delegates to
 * @throws SAXException
 *             declared by this method; presumably raised while parsing the
 *             HTML (confirm against {@link BoilerpipeSAXInput})
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // First pass: build the TextDocument and let the extractor work on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Second pass: toInputSource() is requested again for the delegate call.
    final InputSource htmlSource = fetched.toInputSource();
    return process(textDoc, htmlSource);
}
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The fetched document is first turned into a {@link TextDocument} via
 * {@link BoilerpipeSAXInput}, the extractor is run on it, and the result is
 * then passed, together with a second {@link InputSource} obtained from the
 * fetched document, to the {@code process(doc, InputSource)} overload.
 *
 * @param url
 *            The location of the HTML document to fetch.
 * @param extractor
 *            The extractor to run on the parsed {@link TextDocument}.
 * @return A List of enclosed {@link Image}s
 * @throws IOException
 *             declared by this method; presumably raised when fetching the
 *             URL fails (confirm against {@link HTMLFetcher})
 * @throws BoilerpipeProcessingException
 *             declared by this method and by the overload it delegates to
 * @throws SAXException
 *             declared by this method; presumably raised while parsing the
 *             HTML (confirm against {@link BoilerpipeSAXInput})
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    // First pass: build the TextDocument and let the extractor work on it.
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // Second pass: toInputSource() is requested again for the delegate call.
    final InputSource htmlSource = fetched.toInputSource();
    return process(textDoc, htmlSource);
}