/**
 * Returns the markup of the elements matched by {@code selector}.
 *
 * @param selector selector handed to {@code $(selector)}
 * @param isOuter  {@code true} for the outer HTML, {@code false} for the inner HTML
 * @return the requested HTML, or {@code null} when {@code $(selector)} yields {@code null}
 */
public String $html(String selector, boolean isOuter) {
    Elements matched = $(selector);
    if (matched == null) {
        return null;
    }
    return isOuter ? matched.outerHtml() : matched.html();
}
/** * Method to match the given pattern with extracted elements of html page * and parse the result for the posts on the given instagram page * @return instaProfile as a JSONArray object containing all posts and details of viewer */ public JSONArray scrapeInstagram(BufferedReader br, String url) { Document htmlPage = null; Post instaObj = null; JSONArray instaProfile = new JSONArray(); try { htmlPage = Jsoup.parse(this.bufferedReaderToString(br)); } catch (IOException e) { DAO.trace(e); } String script = htmlPage.getElementsByTag("script").html(); Matcher m = instaJsonData.matcher(script); m.find(); int start = m.start(1); int end = m.start(2) + 1; script = script.substring(start, end); //TODO: pre-process the posts captured. At present, complete array of posts are output. //Only useful data shall be outputted. instaObj = new Post(script, this.query); instaProfile.put(instaObj); return instaProfile; }
// Decode the HTML entities in the script text before deobfuscating it.
// The replacements previously mapped ">" to ">", "<" to "<" and "&" to "&",
// i.e. they did nothing. "&amp;" is decoded last so that an escaped entity such
// as "&amp;lt;" is not decoded twice.
String rawJson = deobfuscateJSON(page.select("script#ractive-public").html()
        .replace("&gt;", ">")
        .replace("&lt;", "<")
        .replace("&amp;", "&"));
JSONObject json = new JSONObject(rawJson);
/**
 * Builds an {@code Article} from the page at {@code url}.
 *
 * <p>Title comes from the {@code og:title} meta tag, time from
 * {@code span.publish-time}, author from {@code span.name} and the body from
 * {@code div.show-content}. Image and table tags in the body get extra CSS
 * classes, and the first match of {@code pattern} in the body is used as the
 * article image.
 *
 * @param url address of the page to parse
 * @return the populated article
 */
@Override
public Article parse(String url) {
    final Article article = new Article();
    article.setProvider(PROVIDER);
    article.setValue(url);

    Document document = JsoupUtils.getDocWithPC(url);

    article.setTitle(document.select("meta[property=og:title]").attr("content"));
    article.setTime(document.select("span.publish-time").text());
    article.setAuthor(document.select("span.name").text());

    // Decorate images and tables with the styling classes.
    String content = document.select("div.show-content").html()
            .replace("<img", "<img class=\"ui centered image\" ")
            .replace("<table>", "<table class='ui table'>");
    article.setContent(content);

    Matcher imageMatcher = pattern.matcher(content);
    if (imageMatcher.find()) {
        article.setImage(imageMatcher.group(1));
    }

    article.setCss("/css/jianshu.css");
    return article;
}
/**
 * Builds an {@code Article} from the page at {@code url}.
 *
 * <p>Title comes from the {@code title} element, time from {@code em#post-date},
 * author from {@code a#post-user} and the body from {@code div#js_content}.
 * Every {@code data-src} in the body is rewritten to {@code width="80%" src},
 * and the first match of {@code pattern} in the whole document is used as the
 * article image.
 *
 * @param url address of the page to parse
 * @return the populated article
 */
@Override
public Article parse(String url) {
    final Article article = new Article();
    article.setProvider(PROVIDER);
    article.setValue(url);

    Document document = JsoupUtils.getDocWithPC(url);

    article.setTitle(document.select("title").text());
    article.setTime(document.select("em#post-date").text());
    article.setAuthor(document.select("a#post-user").text());

    String body = document.select("div#js_content").html()
            .replace("data-src", "width=\"80%\" src");
    article.setContent(body);

    // The image is looked up in the full page markup, not only in the body.
    Matcher imageMatcher = pattern.matcher(document.html());
    if (imageMatcher.find()) {
        article.setImage(imageMatcher.group(1));
    }
    return article;
}
// Select every element of the document and take the combined inner HTML.
Elements links = doc.select("*");
String crawlingNode = links.html();

// Echo the markup to stdout, then hand it to the file writer.
System.out.println(crawlingNode);
httptest.WriteOnFile(writer, crawlingNode);
// Collect all <cite> elements of the document.
Elements links = doc.getElementsByTag("cite");

// Combined inner HTML with the tag-like spans stripped by the regex.
String crawlingNode = links.html();
crawlingNode = crawlingNode.replaceAll("(?=<).*?(>=?)", "");

// Print and persist the text of each <cite>, one entry per line.
for (Element link : links) {
    String linkText = link.text() + System.lineSeparator();
    System.out.println(linkText);
    httptest.WriteOnFile(writer, linkText);
}
String html = "<html><body><p>Hello</p></body></html>"; Document doc = Jsoup.parseBodyFragment(html); Elements fragment = doc.select("p"); // p tag System.out.println(fragment.html());
// Fetch the page and print the inner HTML of its <title> element;
// any failure is reported via its stack trace.
try {
    Document doc = Jsoup.connect("https://angularjs.org/").get();
    Elements header = doc.select("title");
    System.out.println(header.html());
} catch (Exception e) {
    e.printStackTrace();
}
// parse the doc and select the element containing the text Elements es = Jsoup .parse("<html><body><div>a \ntext<br/>is <b>a</a> text</div></html></body") .select("div"); // find <br> tags and replace them (using an arbitrary placeholder '~n~') es.select("br").append("~n~"); // clean all tags String clean = Jsoup.clean(es.html(), Whitelist.none()); // replace the placeholder with a real newline String disp = clean.replaceAll("~n~", "\n");
/**
 * Reads the movie detail out of {@code document}.
 *
 * @return a model whose title is the text of {@code div.title_all} and whose
 *         message is the inner HTML of {@code div#Zoom}
 */
public MovieModel getDy2018Detail() {
    final MovieModel detail = new MovieModel();
    detail.title = document.select("div.title_all").text();
    detail.message = document.select("div#Zoom").html();
    return detail;
}
}
/**
 * Reads the movie detail out of {@code document}.
 *
 * @return a model whose title is the text of {@code div#show} and whose
 *         message is the inner HTML of {@code div#showinfo}
 */
public MovieModel getDetail() {
    final MovieModel detail = new MovieModel();
    detail.title = document.select("div#show").text();
    detail.message = document.select("div#showinfo").html();
    return detail;
}
}
Document doc = Jsoup.parse(new URL("http://en.wikipedia.org/", 10000); Elements interestingParts = doc.select("div.interestingClass"); //get the combined HTML fragments as a String String selectedHtmlAsString = interestingParts.html(); //get all the links Elements links = interestingParts.select("a[href]"); //filter the document to include certain tags only Whitelist allowedTags = Whitelist.simpleText().addTags("blockquote","code", "p"); Cleaner cleaner = new Cleaner(allowedTags); Document filteredDoc = cleaner.clean(doc);
Document doc = Jsoup.connect("http://wikitravel.org/en/San_Francisco").get(); //select all "next siblings" of the "Get around" h2 Elements section = doc.select("h2:contains(Get around) ~ *"); //select all "next siblings" of the "See" h2 and remove them section.select("h2:contains(See) ~ *").remove(); //remove the second h2 section.select("h2").remove(); //section now contains the elements between "Get around" and "See" String sectionHtml = section.html();
Document documentImage2 = Jsoup.connect(urls[0]).get();

// Take the second element whose class attribute is exactly "content".
Element div = documentImage2.select("div[class=content]").get(1);

// Re-parse that element's markup as its own document and pick out its images.
Document doc_i = Jsoup.parse(div.toString());
Elements image = doc_i.select("img");

// Read the image source from the src attribute. html() returns inner HTML,
// which is always empty for <img> because the element has no children.
String imgSrcImage2 = image.attr("src");
/**
 * Runs the invoker with {@code -a linkcss!} on the sample document and checks
 * that the generated HTML has non-empty {@code style} content and empty
 * {@code link} content.
 *
 * <p>The generated file is deleted in a {@code finally} block so that a failing
 * assertion does not leave it behind.
 */
@Test
public void single_attributes_should_be_interpreted_as_boolean() throws IOException {
    File inputFile = classpath.getResource("rendersample.asciidoc");
    // Path of the input relative to the working directory.
    String inputPath = inputFile.getPath().substring(pwd.length() + 1);
    File expectedFile = new File(inputPath.replaceFirst("\\.asciidoc$", ".html"));

    try {
        new AsciidoctorInvoker().invoke("-a", "linkcss!", inputPath);

        Document doc = Jsoup.parse(expectedFile, "UTF-8");

        Elements cssStyle = doc.select("style");
        assertThat(cssStyle.html(), is(not("")));

        Elements link = doc.select("link");
        assertThat(link.html(), is("".trim()));
    } finally {
        expectedFile.delete();
    }
}
/**
 * Converts {@code text} by parsing it as an HTML body fragment, running
 * {@code removeHtmlComments} and {@code replaceJavadocCodeBlock} over the
 * parsed document, and passing the resulting body HTML through
 * {@code replaceInline}.
 *
 * @param text the markup to convert
 * @return the result of {@code replaceInline} applied to the processed body HTML
 */
protected String asMarkdown(String text) {
    Document parsed = Jsoup.parseBodyFragment(text);

    removeHtmlComments(parsed);
    replaceJavadocCodeBlock(parsed);

    return replaceInline(parsed.getElementsByTag("body").html());
}
/**
 * Converts the sample document into the test folder without any attributes and
 * checks that the generated HTML has non-empty {@code style} content and empty
 * {@code link} content.
 */
@Test
public void render_content_without_attributes_should_embed_css_by_default() throws IOException {
    Options options = options()
            .inPlace(false)
            .safe(SafeMode.UNSAFE)
            .toDir(testFolder.getRoot())
            .get();

    asciidoctor.convertFile(classpath.getResource("rendersample.asciidoc"), options);

    File rendered = new File(testFolder.getRoot(), "rendersample.html");
    Document doc = Jsoup.parse(rendered, "UTF-8");

    String styleHtml = doc.select("style").html();
    assertThat(styleHtml, is(not("")));

    String linkHtml = doc.select("link").html();
    assertThat(linkHtml, is(""));
}
@Test public void setting_linkcss_as_false_in_string_should_embed_css_file() throws IOException { Attributes attributes = attributes("linkcss!").get(); Options options = options().inPlace(false).safe(SafeMode.UNSAFE) .toDir(testFolder.getRoot()).attributes(attributes).get(); asciidoctor.convertFile(classpath.getResource("rendersample.asciidoc"), options); // String readFull = IOUtils.readFull(new FileInputStream(new // File(testFolder.getRoot(), "rendersample.html"))); Document doc = Jsoup.parse(new File(testFolder.getRoot(), "rendersample.html"), "UTF-8"); Elements cssStyle = doc.select("style"); assertThat(cssStyle.html(), is(not(""))); Elements link = doc.select("link"); assertThat(link.html(), is("".trim())); }