return new HTMLDocument(data, cs);
return new HTMLDocument(data, cs);
return new HTMLDocument(data, cs);
return new HTMLDocument(data, cs);
/**
 * Parses the media (pictures, videos) out of an HTML document.
 *
 * <p>The HTML string is wrapped in an {@code HTMLDocument}, converted to a
 * {@code TextDocument} via {@code BoilerpipeSAXInput}, run through the given
 * extractor, and then handed to {@code process(TextDocument, InputSource)}
 * to collect the media.
 *
 * @param doc       HTML document to parse the media out of
 * @param extractor extractor applied to the parsed text document before
 *                  the media are collected
 * @return list of extracted media; an empty list (never {@code null} from
 *         this method's own failure path) if parsing or extraction throws
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
    final HTMLDocument htmlDoc = new HTMLDocument(doc);
    try {
        final TextDocument tdoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(tdoc);
        // A second InputSource is requested for the media pass instead of
        // reusing the one given to BoilerpipeSAXInput (presumably because
        // the first one is consumed by parsing -- confirm).
        return process(tdoc, htmlDoc.toInputSource());
    } catch (Exception ignored) {
        // Best-effort extraction: honour the documented contract by handing
        // back an empty list instead of null, so callers can iterate safely.
        return new ArrayList<Media>();
    }
}
/**
 * Parses the media (pictures, videos) out of an HTML document.
 *
 * <p>The HTML string is wrapped in an {@code HTMLDocument}, converted to a
 * {@code TextDocument} via {@code BoilerpipeSAXInput}, run through the given
 * extractor, and then handed to {@code process(TextDocument, InputSource)}
 * to collect the media.
 *
 * @param doc       HTML document to parse the media out of
 * @param extractor extractor applied to the parsed text document before
 *                  the media are collected
 * @return list of extracted media; an empty list (never {@code null} from
 *         this method's own failure path) if parsing or extraction throws
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
    final HTMLDocument htmlDoc = new HTMLDocument(doc);
    try {
        final TextDocument tdoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(tdoc);
        // A second InputSource is requested for the media pass instead of
        // reusing the one given to BoilerpipeSAXInput (presumably because
        // the first one is consumed by parsing -- confirm).
        return process(tdoc, htmlDoc.toInputSource());
    } catch (Exception ignored) {
        // Best-effort extraction: honour the documented contract by handing
        // back an empty list instead of null, so callers can iterate safely.
        return new ArrayList<Media>();
    }
}
/**
 * Parses the media (pictures, videos) out of an HTML document.
 *
 * <p>The HTML string is wrapped in an {@code HTMLDocument}, converted to a
 * {@code TextDocument} via {@code BoilerpipeSAXInput}, run through the given
 * extractor, and then handed to {@code process(TextDocument, InputSource)}
 * to collect the media.
 *
 * @param doc       HTML document to parse the media out of
 * @param extractor extractor applied to the parsed text document before
 *                  the media are collected
 * @return list of extracted media; an empty list (never {@code null} from
 *         this method's own failure path) if parsing or extraction throws
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
    final HTMLDocument htmlDoc = new HTMLDocument(doc);
    try {
        final TextDocument tdoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(tdoc);
        // A second InputSource is requested for the media pass instead of
        // reusing the one given to BoilerpipeSAXInput (presumably because
        // the first one is consumed by parsing -- confirm).
        return process(tdoc, htmlDoc.toInputSource());
    } catch (Exception ignored) {
        // Best-effort extraction: honour the documented contract by handing
        // back an empty list instead of null, so callers can iterate safely.
        return new ArrayList<Media>();
    }
}