org.jsoup.parser.Parser.htmlParser java code examples

/**
 * Loads a file to a Document, parsing it with the HTML parser.
 * <p>
 * The file stream opened by this method is closed before it returns,
 * whether parsing succeeds or fails.
 *
 * @param in file to load
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
  // This method opens the stream, so it is responsible for closing it;
  // previously the file handle was never released.
  try (FileInputStream stream = new FileInputStream(in)) {
    return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
  }
}

/**
 * Parses a Document from an input stream, using the HTML parser.
 *
 * @param in input stream to parse. You will need to close it.
 * @param charsetName character set of input
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
  final Parser htmlParser = Parser.htmlParser();
  return parseInputStream(in, charsetName, baseUri, htmlParser);
}

/**
 * Creates a request populated with defaults: GET method, redirects
 * followed, a 30 second timeout, a 1MB body size limit, an empty data
 * list, gzip accepted, the default user agent, and the HTML parser.
 */
Request() {
  // Behavioural defaults.
  method = Method.GET;
  followRedirects = true;
  data = new ArrayList<>();

  // Limits.
  timeoutMilliseconds = 30_000; // 30 seconds
  maxBodySizeBytes = 1 << 20; // 1MB (1024 * 1024 bytes)

  // Default headers, added in the same order as before.
  addHeader("Accept-Encoding", "gzip");
  addHeader(USER_AGENT, DEFAULT_UA);

  parser = Parser.htmlParser();
}

/**
 * Parses the given input stream into a jsoup document, reading it as
 * UTF-8 with the HTML parser.
 *
 * @param html
 *            the stream containing the design
 * @return the parsed jsoup document
 * @throws DesignException
 *             if reading the stream fails with an {@link IOException};
 *             the original exception is attached as the cause
 */
private static Document parse(InputStream html) {
  try {
    return Jsoup.parse(html, UTF_8.name(), "", Parser.htmlParser());
  } catch (IOException e) {
    // Keep the IOException as the cause so the underlying I/O failure
    // is not lost when the exception is translated.
    throw new DesignException("The html document cannot be parsed.", e);
  }
}

// Parse `content` with jsoup's HTML parser; the second argument
// (presumably the base URI -- confirm against the jsoup API) is empty.
Document parse = Jsoup.parse(content, "", Parser.htmlParser());

/**
 * Switches this extractor to jsoup's HTML parser.
 *
 * @return this extractor, so that calls can be chained
 */
public SelectorExtractor htmlParser() {
  final Parser html = Parser.htmlParser();
  this.parser = html;
  return this;
}

/**
 * Resolves a parser name to the matching JSoup parser.
 * Only the string "xml" (compared case-insensitively) selects the XML
 * parser; every other value, including {@code null}, selects the HTML
 * parser.
 * @param parser "html" or "xml"
 * @return JSoup parser
 * @since 2.8.0
 */
public static Parser toJSoupParser(String parser) {
  final boolean wantsXml = "xml".equalsIgnoreCase(parser);
  return wantsXml ? Parser.xmlParser() : Parser.htmlParser();
}

/**
 * Converts HTML markup into a Jsoup Document object.
 *
 * The markup is parsed with Jsoup's native HTML parser, and the
 * resulting document's charset is set to UTF-8.
 *
 * @param html Html document
 * @return org.jsoup.nodes.Document
 */
public org.jsoup.nodes.Document htmlToJsoupDoc(String html){
  // Convert the html (html/html5) into a jsoup Document object.
  // The second argument of Jsoup.parse(String, String, Parser) is the base
  // URI, not a charset; passing "UTF-8" there made relative links resolve
  // against a bogus base. No base URI is known here, so pass an empty one.
  Document jsoupDoc = Jsoup.parse(html, "", Parser.htmlParser());
  // The charset is applied to the document explicitly instead.
  jsoupDoc.charset(StandardCharsets.UTF_8);
  return jsoupDoc;
}

// Build an HTML parser with error tracking set to 0
// (presumably this disables tracking -- confirm against the jsoup docs).
Parser parser = Parser.htmlParser().setTrackErrors(0);
// Parse the markup; the second argument (presumably the base URI) is empty.
@Nonnull Document doc = parser.parseInput(html, "");
// Select the elements matching the selector held in tagName.
@Nonnull Elements tags = doc.select(tagName);

// Delegate to Jsoup.parse with the HTML parser, forwarding the input,
// its encoding and the document IRI unchanged.
return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());

/**
 * Looks for a META tag in the HTML that hints at the character set used
 * to write the document.
 *
 * @param buffer raw bytes of the document
 * @param maxlength number of leading bytes to inspect; a value that is
 *        not positive, or not smaller than the buffer length, means the
 *        whole buffer is inspected
 * @return the first charset hint found, or {@code null} when no META tag
 *         provides one
 */
private static String getCharsetFromMeta(byte buffer[], int maxlength) {
  // Decode only the inspected prefix, using DEFAULT_CHARSET -- which
  // hopefully will not mess up the characters we're interested in.
  final int limit = (maxlength > 0 && maxlength < buffer.length)
      ? maxlength
      : buffer.length;
  final String markup = new String(buffer, 0, limit, DEFAULT_CHARSET);
  final Document parsed = Parser.htmlParser().parseInput(markup, "dummy");

  // Candidates are <meta http-equiv="Content-Type"
  // content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">.
  for (Element meta : parsed
      .select("meta[http-equiv=content-type], meta[charset]")) {
    String candidate = null;
    if (meta.hasAttr("http-equiv")) {
      candidate = getCharsetFromContentType(meta.attr("content"));
    }
    // Fall back to the HTML5 attribute when http-equiv gave nothing.
    if (candidate == null && meta.hasAttr("charset")) {
      candidate = meta.attr("charset");
    }
    if (candidate != null) {
      return candidate;
    }
  }
  // No META tag carried a usable hint.
  return null;
}

    .decode(ByteBuffer.wrap(content)).toString();
jsoupDoc = Parser.htmlParser().parseInput(html, url);

/**
 * An exclusion selector written in lower case must still drop an
 * upper-case STYLE element from the extracted text.
 */
@Test
public void testExclusionCase() throws IOException {
  final String html = "<html>the<STYLE>main</STYLE>content of the page</html>";

  final Config config = new Config();
  config.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
  final TextExtractor textExtractor = new TextExtractor(config);

  final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");
  final String extracted = textExtractor.text(parsed.body());

  assertEquals("the content of the page", extracted);
}

/**
 * With an inclusion selector configured, only the text inside the
 * matching DIV is extracted.
 */
@Test
public void testMainContent() throws IOException {
  final String html = "<html>the<div id='maincontent'>main<div>content</div></div>of the page</html>";

  final Config config = new Config();
  config.put(TextExtractor.INCLUDE_PARAM_NAME, "DIV[id=\"maincontent\"]");
  final TextExtractor textExtractor = new TextExtractor(config);

  final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");
  final String extracted = textExtractor.text(parsed.body());

  assertEquals("main content", extracted);
}

/**
 * An exclusion selector written in upper case must drop a lower-case
 * style element from the extracted text.
 */
@Test
public void testExclusion() throws IOException {
  final String html = "<html>the<style>main</style>content of the page</html>";

  final Config config = new Config();
  config.put(TextExtractor.EXCLUDE_PARAM_NAME, "STYLE");
  final TextExtractor textExtractor = new TextExtractor(config);

  final Document parsed = Parser.htmlParser().parseInput(html, "http://stormcrawler.net");
  final String extracted = textExtractor.text(parsed.body());

  assertEquals("the content of the page", extracted);
}

Javadoc

Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, based on a knowledge of the semantics of the incoming tags.

Popular methods of Parser

xmlParser
Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, rather creates a simple tree directly from the input.
parseInput
unescapeEntities
Utility method to unescape HTML entities from a string
<init>
Create a new Parser, using the specified TreeBuilder
parse
Parse HTML into a Document.
setTrackErrors
Enable or disable parse error tracking for the next parse.
getErrors
Retrieve the parse errors, if any, from the last parse.
isTrackErrors
Check if parse error tracking is enabled.
parseBodyFragment
Parse a fragment of HTML into the body of a Document.
parseFragment
Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
settings

settings

Popular in Java

Running tasks concurrently on multiple threads
runOnUiThread (Activity)
onRequestPermissionsResult (Fragment)
notifyDataSetChanged (ArrayAdapter)
InputStream (java.io)
A readable source of bytes. Most clients will use input streams that read data from the file system (
URI (java.net)
A Uniform Resource Identifier that identifies an abstract or physical resource, as specified by RFC
Time (java.sql)
Java representation of an SQL TIME value. Provides utilities to format and parse the time's represen
Calendar (java.util)
Calendar is an abstract base class for converting between a Date object and a set of integer fields
GridLayout (java.awt)
The GridLayout class is a layout manager that lays out a container's components in a rectangular gri
Kernel (java.awt.image)
Top plugins for Android Studio

How to use the htmlParser method in org.jsoup.parser.Parser

Best Java code snippets using org.jsoup.parser.Parser.htmlParser (Showing top 15 results out of 315)

How to use
htmlParser
method
in
org.jsoup.parser.Parser