// Fragment (enclosing method not visible in this view): when at least one element
// with an .mp4-style source was found, collect each element's "src" attribute and
// return the best-quality link among them via getBestQualityLink.
// NOTE(review): assumes endingWithMp4 is a Jsoup Elements collection — confirm at the declaration site.
if (!endingWithMp4.isEmpty()) { List<String> list = new ArrayList<>(); endingWithMp4.forEach((e) -> list.add(e.attr("src"))); return getBestQualityLink(list);
// Fragment (enclosing method and the rest of the try block are not visible here):
// parses the HTML content, adds Bootstrap table classes to every <table>, then
// iterates the <p> elements; the try block begins by reading each paragraph's text.
// NOTE(review): the variable is named aElements but the selector is "p" — verify intent upstream.
Document parse = Jsoup.parse(content); Elements tableElements = parse.select("table"); tableElements.forEach(element -> element.addClass("table table-bordered")); Elements aElements = parse.select("p"); if (aElements != null && aElements.size() > 0) { aElements.forEach(element -> { try { String href = element.text();
@Override
public void manipulate(Document document) {
    // Run the paragraph processor over every <p> element in the document.
    document.select("p").forEach(paragraph -> processParagraph(paragraph));
}
@Override
public void manipulate(Document document) {
    // Hand each <p> element in the parsed document to the paragraph processor.
    document.select("p").forEach(para -> processParagraph(para));
}
@Override
public List<String> getAll() {
    // Preserve the existing contract: an absent backing collection yields null
    // (callers apparently distinguish "never selected" from "selected nothing").
    if (allElements == null) {
        return null;
    }
    // Otherwise return the text content of each element, in document order.
    List<String> texts = new ArrayList<>();
    allElements.forEach(element -> texts.add(element.text()));
    return texts;
}
// Prints the text of each breadcrumb link inside the Amazon category ladder.
String source = "<span class=\"zg_hrsr_ladder\">in <a href=\"https://www.amazon.de/gp/bestsellers/books/ref=pd_zg_hrsr_b_1_1\">Bücher</a> > <a href=\"https://www.amazon.de/gp/bestsellers/books/287480/ref=pd_zg_hrsr_b_1_2\">Krimis & Thriller</a> > <b><a href=\"https://www.amazon.de/gp/bestsellers/books/419954031/ref=pd_zg_hrsr_b_1_3_last\">Deutschland</a></b></span>";
// BUG FIX: Jsoup.parse(String, String) takes a *base URI* as its second argument,
// not a charset name — passing "UTF-8" there was a misuse (the string is already
// decoded). The single-argument overload is the correct call for in-memory HTML.
Document htmlDocument = Jsoup.parse(source);
Elements category = htmlDocument.select("span.zg_hrsr_ladder a");
category.forEach(aElement -> {
    System.out.println(aElement.text());
});
/**
 * Inserts an <hr class="pagebreak"> before every page article that is preceded
 * by another page article, so consecutive pages are visually separated.
 */
private void convertBreaksBetweenPages(final Document document) {
    document.select("article.page ~ article.page")
            .forEach(page -> page.before("<hr class=\"pagebreak\" />"));
}
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/** Minimal Jsoup demo: parses an empty HTML shell and prints every element's node name. */
public class JSoup {
    public static void main(String[] args) {
        String fichier = "<html>" + "<head></head>" + "<body></body>" + "</html>";
        // BUG FIX: Jsoup.parse(String, String) expects a *base URI* as its second
        // argument, not a charset — the previous "utf-8" argument was a misuse.
        // The one-argument overload is correct for an already-decoded string.
        Document dc = Jsoup.parse(fichier);
        Elements elements = dc.getAllElements();
        elements.forEach(element -> System.out.println(element.nodeName()));
    }
}
/**
 * Collects candidate dates from <meta> tags: each tag's content is indexed by its
 * lower-cased name (or, failing that, property) attribute, then the known date
 * keys are looked up in order and wrapped as MatchedDate results.
 */
public static List<MatchedDate> extractFromMeta(Document document) {
    Map<String, String> metaValues = Maps.newHashMap();
    document.select("meta").forEach(meta -> {
        String nameAttr = meta.attr("name");
        String propertyAttr = meta.attr("property");
        String contentAttr = meta.attr("content");
        // name wins over property when both are present.
        if (!Strings.isNullOrEmpty(nameAttr)) {
            metaValues.put(nameAttr.toLowerCase(), contentAttr);
        } else if (!Strings.isNullOrEmpty(propertyAttr)) {
            metaValues.put(propertyAttr.toLowerCase(), contentAttr);
        }
    });
    return DATE_META_KEYS.stream()
            .filter(key -> metaValues.get(key) != null)
            .map(key -> new MatchedDate(metaValues.get(key), "META:" + key))
            .collect(Collectors.toList());
}
private void convertSpiltBulletsToLists(final Document document) { document.select("p ~ p").forEach(p -> { final Element previous = p.previousElementSibling(); final String previousText = previous.text().trim(); if (LISTITEM_SYMBOLS.stream().anyMatch(s -> s.equals(previousText))) { // So bullet then paragraph... // Delete prvious and turn the other into a list item previous.remove(); p.tagName("li"); } }); }
@Override public void manipulate(Document document) { // Find elements which need to be spilt up Set<Element> elementsWithBr = new HashSet<>(); document.select("br").forEach(e -> elementsWithBr.add(e.parent())); // For each parent elementsWithBr.forEach( e -> { List<Element> runs = collectRuns(document, e); if (!runs.isEmpty()) { addRunsToDom(e, runs); } }); }
@Override public void manipulate(Document document) { // Find elements which need to be spilt up Set<Element> elementsWithBr = new HashSet<>(); document.select("br").forEach(e -> elementsWithBr.add(e.parent())); // For each parent elementsWithBr.forEach( e -> { List<Element> runs = collectRuns(document, e); if (!runs.isEmpty()) { addRunsToDom(e, runs); } }); }
private void cleanUp(final Document document) { // Often we get p tag under a ul, ol, so lets clean that up as a li document.select("ul,ol > p").forEach(l -> l.select("p").wrap("<li></li>")); // Lots of paragraph tags with content '*', they aren't in the ppt so lets just remove them document.select("p:matches(^\\*$)").remove(); // Remove any specific sections which are empty (as they are just noise) document.select("details:empty,section:empty,aside:empty").remove(); // Remove the classes document.select(".slide-content,.slide-master-content,.slide-notes").removeAttr("class"); // Powerpoint puts a preview embedded in.. which we don't want document.select("div[id=/docProps/thumbnail.jpeg]").remove(); } }
private void convertParagraphsToList(final Document document) { document.select("p,li,td").forEach(p -> { final String text = p.text(); for (final String symbol : LISTITEM_SYMBOLS) { if (text.contains(symbol)) { p.tagName("ul"); p.empty(); for (final String s : text.split(symbol)) { p.appendElement("li").text(s); } // Only do this once! return; } } }); }
/**
 * Extracts article text using the source's configured CSS selectors; when none
 * match, falls back to schema.org articleBody paragraphs, then to every <p>.
 */
private static List<MatchedString> extractTextsWithJsoup(Document document, HttpSource source) {
    List<MatchedString> matches = Lists.newArrayList();
    for (String selector : source.getTextSelectors()) {
        document.select(selector)
                .forEach(el -> matches.add(new MatchedString(el.text(), selector)));
    }
    if (!matches.isEmpty()) {
        return matches;
    }
    // Fallback 1: paragraphs inside a schema.org articleBody container.
    String articleBodyText = document.select("[itemprop*=articleBody] p").text();
    if (articleBodyText != null && !articleBodyText.trim().isEmpty()) {
        return Lists.newArrayList(new MatchedString(articleBodyText, "[itemprop*=articleBody] p"));
    }
    // Fallback 2: every paragraph on the page, one match per element.
    return document.select("p").stream()
            .map(el -> new MatchedString(el.text(), "p"))
            .collect(Collectors.toList());
}
/**
 * Parses the stream with Jsoup using the given charset, cleans the document,
 * copies every named <meta> tag's content into the metadata, and finally
 * streams the DOM to the SAX content handler.
 */
private void process(final InputStream stream, final String charset, final ContentHandler handler, final Metadata metadata) throws IOException, SAXException {
    final Document document = Jsoup.parse(stream, charset, "");
    clean(document);
    // Only meta tags in <head> that carry a name attribute contribute metadata.
    document.head().select("meta").forEach(meta -> {
        final String metaName = meta.attr("name");
        final String metaContent = meta.attr("content");
        if (!Strings.isNullOrEmpty(metaName)) {
            metadata.add(metaName, metaContent);
        }
    });
    document.traverse(new JsoupToSaxVisitor(handler));
}
/**
 * Removes every comment node that is a direct child of any element, except
 * inside <style> and <script> elements, whose contents are left untouched.
 */
public void stripComments(Document doc) {
    List<Node> comments = new ArrayList<>();
    doc.getAllElements().forEach(elem -> {
        // BUG FIX: the original wrote "! elem.equals(\"script\")", comparing the
        // Element itself to a String (always true), so script elements were
        // never actually excluded. Compare tag names for both exclusions.
        String tag = elem.tagName();
        if (!tag.equals("style") && !tag.equals("script")) {
            elem.childNodes().forEach(child -> {
                if (child instanceof Comment) {
                    comments.add(child);
                }
            });
        }
    });
    // Collect first, remove after: removing while iterating childNodes() would
    // mutate the lists mid-traversal.
    comments.forEach(Node::remove);
}
/**
 * Depth-first walk of the element tree: elements located inside the template
 * root are recorded as non-injectable, every element is offered to the custom
 * element collector, then the same inspection recurses into each child.
 */
private void inspectCustomElements(org.jsoup.nodes.Element childElement, org.jsoup.nodes.Element templateRoot) {
    if (isInsideTemplate(childElement, templateRoot)) {
        storeNotInjectableElementId(childElement);
    }
    collectCustomElement(childElement, templateRoot);
    for (org.jsoup.nodes.Element child : childElement.children()) {
        inspectCustomElements(child, templateRoot);
    }
}
/**
 * Routes a crawled response by its task group: blog list pages enqueue a new
 * task per entry link; blog item pages print the article title and author.
 */
@Override
public void store(TaskResponse response) throws Exception {
    if (response.isGroup("oschina.blog")) {
        // List page: push each entry's detail URL onto the crawl queue.
        response.select(".item").forEach(entry -> {
            String href = entry.select(".blog-title-link").attr("href");
            try {
                response.getQueue().push(new Task(href,"oschina.blog.item"));
            } catch (Exception e) {
                // Best-effort enqueue: log and keep processing the other entries.
                e.printStackTrace();
            }
        });
    } else if (response.isGroup("oschina.blog.item")) {
        // Detail page: extract title (stripping the badge text) and author.
        Elements content = response.select(".article-detail");
        String title = content.select(".header").text().replace("顶 原 荐","");
        String autor = content.select("blog-meta > div:nth-child(1) > a").text();
        System.out.println(String.format("文章标题: %s 作者: %s",title,autor));
    }
}
}
/**
 * Builds the list of child URLs discovered on the page: each anchor's absolute
 * href not yet in the visited filter becomes a candidate at depth+1, kept only
 * if the parser approves the visit and the URL matches the crawl pattern.
 * Returns null when the seed's depth limit is reached or the page has no links.
 */
protected List<WebUrl> getUrls(final WebUrl webUrl, final Document document) {
    if (webUrl.getDepth() >= this.seed.getDepth()) {
        return null;
    }
    Elements links = document.getElementsByTag("a");
    if (links.isEmpty()) {
        return null;
    }
    List<WebUrl> children = new ArrayList<>();
    links.forEach(anchor -> {
        String url = anchor.absUrl("href");
        if (!this.filter.contains(url)) {
            WebUrl childUrl = new WebUrl(webUrl.getDepth() + 1, url);
            childUrl.setDepth(webUrl.getDepth() + 1);
            childUrl.setUrl(url);
            if (parser.shouldVisit(webUrl, childUrl) && isMatched(url)) {
                children.add(childUrl);
            }
            // Mark as seen whether or not it was accepted, so it is never revisited.
            this.filter.add(url);
        }
    });
    return children;
}