/**
 * Loads a file to a Document, parsing it with the HTML parser.
 * <p>
 * The file stream opened by this method is closed before the method returns,
 * including when parsing fails with an exception.
 *
 * @param in file to load
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    // This method opens the stream, so it is responsible for releasing it;
    // try-with-resources closes it on both the normal and the exceptional path.
    try (FileInputStream stream = new FileInputStream(in)) {
        return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
    }
}
/**
 * Parses a Document from an input stream, using the HTML parser.
 *
 * @param in input stream to parse. You will need to close it.
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
    final Parser htmlParser = Parser.htmlParser();
    return parseInputStream(in, charsetName, baseUri, htmlParser);
}
/**
 * Creates a request pre-populated with defaults: a 30 second timeout, a 1MB
 * body size limit, redirects followed, an empty data list, the GET method,
 * the "Accept-Encoding: gzip" and default user-agent headers, and the HTML parser.
 */
Request() {
    // Transfer limits.
    timeoutMilliseconds = 30 * 1000; // 30 seconds
    maxBodySizeBytes = 1 << 20; // 1MB
    followRedirects = true;

    // Request shape.
    method = Method.GET;
    data = new ArrayList<>();

    // Default headers, added in the same order as before.
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);

    parser = Parser.htmlParser();
}
/**
 * Parses the given input stream into a jsoup document.
 * <p>
 * The stream is read using the charset name given by {@code UTF_8.name()},
 * with an empty base URI and the HTML parser.
 *
 * @param html
 *            the stream containing the design
 * @return the parsed jsoup document
 * @throws DesignException
 *             if reading the stream fails; the underlying
 *             {@link IOException} is attached as the cause
 */
private static Document parse(InputStream html) {
    try {
        return Jsoup.parse(html, UTF_8.name(), "", Parser.htmlParser());
    } catch (IOException e) {
        // Keep the original failure as the cause so the reason is not lost.
        throw new DesignException("The html document cannot be parsed.", e);
    }
}
// Parse the content with jsoup's HTML parser; the second argument (the base URI) is left empty.
Document parse = Jsoup.parse(content, "", Parser.htmlParser());
/**
 * Switches this extractor to the HTML parser.
 *
 * @return this extractor, so that calls can be chained
 */
public SelectorExtractor htmlParser() {
    parser = Parser.htmlParser();
    return this;
}
/**
 * Resolves a parser name to the matching JSoup parser.
 * The name "xml" (compared case-insensitively) selects the XML parser;
 * every other value, including {@code null}, selects the HTML parser.
 *
 * @param parser "html" or "xml"
 * @return JSoup parser
 * @since 2.8.0
 */
public static Parser toJSoupParser(String parser) {
    final boolean wantsXml = "xml".equalsIgnoreCase(parser);
    return wantsXml ? Parser.xmlParser() : Parser.htmlParser();
}
/** * 將 HTML 轉化為 Jsoup Document 物件 * * HTML的內容就使用Jsoup原生的 HTML Parser * * @param html Html document * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document htmlToJsoupDoc(String html){ // 將 html(html/html5) 轉為 jsoup Document 物件 Document jsoupDoc = Jsoup.parse(html, "UTF-8", Parser.htmlParser() ); jsoupDoc.charset(StandardCharsets.UTF_8); return jsoupDoc; }
// Obtain an HTML parser with its tracked-error limit set to 0.
Parser parser = Parser.htmlParser().setTrackErrors(0);
// Parse the markup, passing an empty string as the second (base URI) argument.
@Nonnull Document doc = parser.parseInput(html, "");
// Select the elements matching tagName, which is used as the selector query.
@Nonnull Elements tags = doc.select(tagName);
// Parse the input with jsoup's HTML parser, passing the encoding and document IRI through.
return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
/** * Attempt to find a META tag in the HTML that hints at the character set * used to write the document. */ private static String getCharsetFromMeta(byte buffer[], int maxlength) { // convert to UTF-8 String -- which hopefully will not mess up the // characters we're interested in... int len = buffer.length; if (maxlength > 0 && maxlength < len) { len = maxlength; } String html = new String(buffer, 0, len, DEFAULT_CHARSET); Document doc = Parser.htmlParser().parseInput(html, "dummy"); // look for <meta http-equiv="Content-Type" // content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> Elements metaElements = doc .select("meta[http-equiv=content-type], meta[charset]"); String foundCharset = null; for (Element meta : metaElements) { if (meta.hasAttr("http-equiv")) foundCharset = getCharsetFromContentType(meta.attr("content")); if (foundCharset == null && meta.hasAttr("charset")) foundCharset = meta.attr("charset"); if (foundCharset != null) return foundCharset; } return foundCharset; }
.decode(ByteBuffer.wrap(content)).toString(); jsoupDoc = Parser.htmlParser().parseInput(html, url);
/**
 * With the exclusion parameter set to lower-case "style", the content of an
 * upper-case STYLE element is expected to be left out of the extracted text.
 */
@Test
public void testExclusionCase() throws IOException {
    final String html = "<html>the<STYLE>main</STYLE>content of the page</html>";
    final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");

    final Config settings = new Config();
    settings.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
    final TextExtractor textExtractor = new TextExtractor(settings);

    final String extracted = textExtractor.text(parsed.body());
    assertEquals("the content of the page", extracted);
}
/**
 * With the inclusion parameter pointing at the "maincontent" DIV, only the
 * text inside that element is expected in the extracted text.
 */
@Test
public void testMainContent() throws IOException {
    final String html = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>";
    final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");

    final Config settings = new Config();
    settings.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]");
    final TextExtractor textExtractor = new TextExtractor(settings);

    final String extracted = textExtractor.text(parsed.body());
    assertEquals("main content", extracted);
}
/**
 * With the exclusion parameter set to upper-case "STYLE", the content of a
 * lower-case style element is expected to be left out of the extracted text.
 */
@Test
public void testExclusion() throws IOException {
    final String html = "<html>the<style>main</style>content of the page</html>";
    final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");

    final Config settings = new Config();
    settings.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE");
    final TextExtractor textExtractor = new TextExtractor(settings);

    final String extracted = textExtractor.text(parsed.body());
    assertEquals("the content of the page", extracted);
}