org.jsoup.select.Elements.html Java code examples

/**
 * Returns the combined HTML of the elements obtained from {@code $(selector)}.
 *
 * @param selector selector handed to {@code $(selector)}
 * @param isOuter  {@code true} to return the outer HTML, {@code false} for the inner HTML
 * @return the HTML string, or {@code null} when {@code $(selector)} returns {@code null}
 */
public String $html(String selector, boolean isOuter) {
  final Elements matched = $(selector);
  if (matched == null) {
    return null;
  }
  return isOuter ? matched.outerHtml() : matched.html();
}

/**
 * Parses the HTML read from {@code br}, matches {@code instaJsonData} against the combined
 * content of its {@code <script>} elements, and wraps the matched slice in a {@link Post}.
 *
 * <p>The slice runs from the start of capture group 1 up to and including the first
 * character of capture group 2.
 *
 * @param br  reader supplying the HTML of the page
 * @param url not used by this method
 * @return a JSONArray holding the single resulting {@link Post}; an empty JSONArray when the
 *         page cannot be read or the pattern does not match
 */
public JSONArray scrapeInstagram(BufferedReader br, String url) {
  JSONArray instaProfile = new JSONArray();
  Document htmlPage;
  try {
    htmlPage = Jsoup.parse(this.bufferedReaderToString(br));
  } catch (IOException e) {
    DAO.trace(e);
    // No document could be parsed; continuing would dereference null.
    return instaProfile;
  }
  String script = htmlPage.getElementsByTag("script").html();
  Matcher m = instaJsonData.matcher(script);
  if (!m.find()) {
    // Matcher.start(int) throws IllegalStateException when no match was found.
    return instaProfile;
  }
  int start = m.start(1);
  int end = m.start(2) + 1;
  script = script.substring(start, end);
  //TODO: pre-process the posts captured. At present, complete array of posts are output.
  //Only useful data shall be outputted.
  Post instaObj = new Post(script, this.query);
  instaProfile.put(instaObj);
  return instaProfile;
}

// Read the inner HTML of script#ractive-public, turn the &gt; / &lt; / &amp; entities back
// into their characters, and pass the result to deobfuscateJSON before parsing it as JSON.
String escapedScript = page.select("script#ractive-public").html();
String unescapedScript = escapedScript
    .replace("&gt;", ">")
    .replace("&lt;", "<")
    .replace("&amp;", "&");
String rawJson = deobfuscateJSON(unescapedScript);
JSONObject json = new JSONObject(rawJson);

/**
 * Builds an {@link Article} from the page loaded for {@code url}.
 *
 * <p>Title comes from the {@code og:title} meta tag, time from {@code span.publish-time},
 * author from {@code span.name} and content from {@code div.show-content}. CSS classes are
 * added to {@code <img} and {@code <table>} tags in the content, and the first match of
 * {@code pattern} in that content (group 1) becomes the article image.
 *
 * @param url address of the page; also stored as the article value
 * @return the populated article
 */
@Override
public Article parse(String url) {
  final Article result = new Article();
  result.setProvider(PROVIDER);
  result.setValue(url);

  final Document page = JsoupUtils.getDocWithPC(url);
  result.setTitle(page.select("meta[property=og:title]").attr("content"));
  result.setTime(page.select("span.publish-time").text());
  result.setAuthor(page.select("span.name").text());

  // Decorate images and tables with the CSS classes used for rendering.
  final String body = page.select("div.show-content").html()
      .replace("<img", "<img class=\"ui centered image\" ")
      .replace("<table>", "<table class='ui table'>");
  result.setContent(body);

  final Matcher imageMatcher = pattern.matcher(body);
  if (imageMatcher.find()) {
    result.setImage(imageMatcher.group(1));
  }

  result.setCss("/css/jianshu.css");
  return result;
}

/**
 * Builds an {@link Article} from the page loaded for {@code url}.
 *
 * <p>Title comes from {@code <title>}, time from {@code em#post-date}, author from
 * {@code a#post-user} and content from {@code div#js_content}. Every {@code data-src} in the
 * content is rewritten to {@code width="80%" src}, and the first match of {@code pattern} in
 * the whole document HTML (group 1) becomes the article image.
 *
 * @param url address of the page; also stored as the article value
 * @return the populated article
 */
@Override
public Article parse(String url) {
  final Article result = new Article();
  result.setProvider(PROVIDER);
  result.setValue(url);

  final Document page = JsoupUtils.getDocWithPC(url);
  result.setTitle(page.select("title").text());
  result.setTime(page.select("em#post-date").text());
  result.setAuthor(page.select("a#post-user").text());

  final String body = page.select("div#js_content").html()
      .replace("data-src", "width=\"80%\" src");
  result.setContent(body);

  // The image is looked up in the full document markup, not just the content div.
  final Matcher imageMatcher = pattern.matcher(page.html());
  if (imageMatcher.find()) {
    result.setImage(imageMatcher.group(1));
  }
  return result;
}

// Select every element in the document, print the combined inner HTML,
// and pass the same string to httptest.WriteOnFile.
Elements links = doc.select("*");
String crawlingNode = links.html();
System.out.println(crawlingNode);
httptest.WriteOnFile(writer, crawlingNode);

 // Collect every <cite> element of the document.
 Elements links = doc.getElementsByTag("cite");
 // NOTE(review): crawlingNode is not read by the loop below.
 String crawlingNode = links.html()
     .replaceAll("(?=<).*?(>=?)", ""); //Remove undesired html tags

 // Print the text of each <cite>, followed by a line separator, and pass it to
 // httptest.WriteOnFile.
 for (Element link : links) {
   String linkText = link.text() + System.lineSeparator();
   System.out.println(linkText);
   httptest.WriteOnFile(writer, linkText);
 }

 // Parse the markup as a body fragment and print the inner HTML of its <p> element(s).
 String html = "<html><body><p>Hello</p></body></html>";
 Document doc = Jsoup.parseBodyFragment(html);
 Elements fragment = doc.select("p"); // p tag
 System.out.println(fragment.html());

 // Fetch the page and print the inner HTML of its <title> element(s);
 // any failure is reported by printing the stack trace.
 try {
   Document page = Jsoup.connect("https://angularjs.org/").get();
   String titleHtml = page.select("title").html();
   System.out.println(titleHtml);
 } catch (Exception e) {
   e.printStackTrace();
 }

 // Parse the markup and pick the <div> that holds the text.
 Elements es = Jsoup
   .parse("<html><body><div>a \ntext<br/>is <b>a</a> text</div></html></body")
   .select("div");

 // Mark each <br> by appending an arbitrary placeholder ('~n~') to it.
 es.select("br").append("~n~");
 // Strip every tag; the placeholder text survives the cleaning.
 String clean = Jsoup.clean(es.html(), Whitelist.none());
 // Turn each placeholder into a real newline.
 String disp = clean.replace("~n~", "\n");

// Replace "text" with "word" in the inner HTML of the <p> children of div#content,
// then print the whole document.
String html = "<html> ...";
Document doc = Jsoup.parse(html);
Elements p = doc.select("div#content > p");
String replaced = p.html().replace("text", "word");
p.html(replaced);
System.out.println(doc.toString());

  /**
   * Builds a {@link MovieModel} from {@code document}: the title is the text of
   * {@code div.title_all} and the message is the inner HTML of {@code div#Zoom}.
   *
   * @return the populated model
   */
  public MovieModel getDy2018Detail() {
    final MovieModel detail = new MovieModel();
    final String titleText = document.select("div.title_all").text();
    detail.title = titleText;
    final String messageHtml = document.select("div#Zoom").html();
    detail.message = messageHtml;
    return detail;
  }
}

  /**
   * Builds a {@link MovieModel} from {@code document}: the title is the text of
   * {@code div#show} and the message is the inner HTML of {@code div#showinfo}.
   *
   * @return the populated model
   */
  public MovieModel getDetail() {
    final MovieModel detail = new MovieModel();
    final String titleText = document.select("div#show").text();
    detail.title = titleText;
    final String messageHtml = document.select("div#showinfo").html();
    detail.message = messageHtml;
    return detail;
  }
}

 Document doc = Jsoup.parse(new URL("http://en.wikipedia.org/", 10000);
Elements interestingParts = doc.select("div.interestingClass");

//get the combined HTML fragments as a String
String selectedHtmlAsString = interestingParts.html();

//get all the links
Elements links = interestingParts.select("a[href]");

//filter the document to include certain tags only
Whitelist allowedTags = Whitelist.simpleText().addTags("blockquote","code", "p");
Cleaner cleaner = new Cleaner(allowedTags);
Document filteredDoc = cleaner.clean(doc);

 // Goal: obtain the HTML that sits between the "Get around" and "See" headings of the page.
 // The statements below mutate the parsed document in order, so their sequence matters.
 // NOTE(review): whether `section` ends up holding exactly the in-between elements depends on
 // how Elements.select()/remove() behave in the jsoup version in use — confirm.
 Document doc = Jsoup.connect("http://wikitravel.org/en/San_Francisco").get();
//select all "next siblings" of the "Get around" h2
Elements section = doc.select("h2:contains(Get around) ~ *");
//within the section, select all "next siblings" of the "See" h2 and remove them
section.select("h2:contains(See) ~ *").remove();
//remove the h2 elements still matched within the section (intended: the "See" heading)
section.select("h2").remove();
//intended result: section now contains the elements between "Get around" and "See"
String sectionHtml = section.html();

 // Fetch the page at urls[0].
 Document documentImage2 = Jsoup.connect(urls[0]).get();
 // Take the second element (index 1) matching div[class=content].
 Element div = documentImage2.select("div[class=content]").get(1);
 // Re-parse that div's markup as its own document and collect its <img> elements.
 Document doc_i = Jsoup.parse(div.toString());
 Elements image = doc_i.select("img");
 // html() yields inner HTML, which an <img> element does not have; read the src attribute
 // of the first <img> that carries one instead.
 String imgSrcImage2 = image.attr("src");

/**
 * Invokes {@code AsciidoctorInvoker} with {@code -a linkcss!} on the sample document and
 * checks that the rendered HTML has non-empty {@code <style>} content and empty
 * {@code <link>} content. The rendered file is deleted afterwards.
 */
@Test
public void single_attributes_should_be_interpreted_as_boolean() throws IOException {

  final File source = classpath.getResource("rendersample.asciidoc");
  // Path of the sample with the pwd prefix (and the following separator) cut off.
  final String relativeSource = source.getPath().substring(pwd.length() + 1);
  new AsciidoctorInvoker().invoke("-a", "linkcss!", relativeSource);

  final File rendered = new File(relativeSource.replaceFirst("\\.asciidoc$", ".html"));
  final Document html = Jsoup.parse(rendered, "UTF-8");

  assertThat(html.select("style").html(), is(not("")));
  assertThat(html.select("link").html(), is(""));

  rendered.delete();

}

/**
 * Parses {@code text} as an HTML body fragment, runs {@code removeHtmlComments} and
 * {@code replaceJavadocCodeBlock} on the parsed document, and returns the result of
 * {@code replaceInline} applied to the inner HTML of its {@code <body>}.
 *
 * @param text HTML fragment to process
 * @return the value produced by {@code replaceInline}
 */
protected String asMarkdown(String text) {
  final Document parsed = Jsoup.parseBodyFragment(text);
  removeHtmlComments(parsed);
  replaceJavadocCodeBlock(parsed);
  return replaceInline(parsed.getElementsByTag("body").html());
}

/**
 * Converts the sample document into the test folder without setting any attributes and
 * checks that the rendered HTML has non-empty {@code <style>} content and empty
 * {@code <link>} content.
 */
@Test
public void render_content_without_attributes_should_embed_css_by_default() throws IOException {

  final File outputDir = testFolder.getRoot();
  Options options = options().inPlace(false).safe(SafeMode.UNSAFE)
      .toDir(outputDir).get();
  asciidoctor.convertFile(classpath.getResource("rendersample.asciidoc"), options);

  final File rendered = new File(testFolder.getRoot(), "rendersample.html");
  final Document html = Jsoup.parse(rendered, "UTF-8");

  assertThat(html.select("style").html(), is(not("")));
  assertThat(html.select("link").html(), is(""));

}

/**
 * Converts the sample document with the {@code linkcss!} attribute into the test folder and
 * checks that the rendered HTML has non-empty {@code <style>} content and empty
 * {@code <link>} content.
 */
@Test
public void setting_linkcss_as_false_in_string_should_embed_css_file() throws IOException {
  Attributes attributes = attributes("linkcss!").get();
  final File outputDir = testFolder.getRoot();
  Options options = options().inPlace(false).safe(SafeMode.UNSAFE)
      .toDir(outputDir).attributes(attributes).get();
  asciidoctor.convertFile(classpath.getResource("rendersample.asciidoc"), options);

  final File rendered = new File(testFolder.getRoot(), "rendersample.html");
  final Document html = Jsoup.parse(rendered, "UTF-8");

  assertThat(html.select("style").html(), is(not("")));
  assertThat(html.select("link").html(), is(""));
}

Javadoc

Get the combined inner HTML of all matched elements.

Popular methods of Elements

get
size
first
Get the first matched element.
text
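Get the combined text of all the matched elements. Note that it is possible to get repeats if the matched elements contain both parent elements and their own children, as the Element.text() method returns the combined text of a parent and all its children.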
isEmpty
attr
Set an attribute on all matched elements.
select
Find matching elements within this element list.
remove
stream
last
Get the last matched element.
iterator
<init>

Popular in Java

Parsing JSON documents to java classes using gson
getSharedPreferences (Context)
getContentResolver (Context)
notifyDataSetChanged (ArrayAdapter)
EOFException (java.io)
Thrown when a program encounters the end of a file or stream during an input operation.
FileReader (java.io)
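A specialized Reader that reads from a file in the file system. All read requests made by calling methods in this class are directly forwarded to the equivalent function of the underlying operating system.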
Hashtable (java.util)
A plug-in replacement for JDK1.5 java.util.Hashtable. This version is based on org.cliffc.high_scale
ConcurrentHashMap (java.util.concurrent)
A plug-in replacement for JDK1.5 java.util.concurrent.ConcurrentHashMap. This version is based on or
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within a Web server.
Color (java.awt)
The Color class is used to encapsulate colors in the default sRGB color space or colors in arbitrary color spaces identified by a ColorSpace.
Top Vim plugins

How to use html method in org.jsoup.select.Elements

Best Java code snippets using org.jsoup.select.Elements.html (Showing top 20 results out of 315)

How to use
html
method
in
org.jsoup.select.Elements