/**
 * Runs this selector's {@code selectorText} against the given element via
 * {@code element.select(...)} and hands back whatever that call matched.
 *
 * @param element the element the query is evaluated on
 * @return the elements returned by the query
 */
@Override
public List<Element> selectElements(Element element) {
    final List<Element> matches = element.select(selectorText);
    return matches;
}
/**
 * Renders a document as HTML after switching pretty-printing off and setting the indent
 * amount to zero on the document's output settings.
 *
 * <p>NOTE(review): the settings are changed on the object returned by
 * {@code document.outputSettings()}, so the change presumably stays on the document after
 * this call; confirm if callers rely on the previous settings.
 *
 * @param document the document to serialize
 * @return the result of {@code document.html()} with the adjusted settings
 */
public static String toCompactString(Document document) {
    Document.OutputSettings settings = document.outputSettings();
    settings.prettyPrint(false).indentAmount(0);

    final String html = document.html();
    return html;
}
}
/**
 * Removes the attribute stored under {@code key}.
 *
 * <p>{@code ensureAttributes()} is invoked first, then the removal itself is delegated to the
 * superclass implementation.
 *
 * @param key the attribute key to remove
 * @return whatever the superclass {@code removeAttr} returns
 */
@Override
public Node removeAttr(String key) {
    ensureAttributes();
    final Node result = super.removeAttr(key);
    return result;
}
/**
 * Parses the response body as HTML and builds a {@code Page} from the document title and the
 * {@code href} attribute of every element matching {@code a[href]}.
 *
 * @param responseBody the body whose string content is parsed
 * @return a page holding the title and an unmodifiable list of the collected href values
 * @throws IOException declared by this method; the only visible call that can raise it is
 *     reading the body via {@code responseBody.string()}
 */
@Override
public Page convert(ResponseBody responseBody) throws IOException {
    final String html = responseBody.string();
    final Document document = Jsoup.parse(html);

    // Collect the href values in the order the query returns the matching elements.
    final List<String> hrefs = new ArrayList<>();
    for (Element anchor : document.select("a[href]")) {
        final String href = anchor.attr("href");
        hrefs.add(href);
    }

    final String title = document.title();
    return new Page(title, Collections.unmodifiableList(hrefs));
}
}
/**
 * Concatenates the text of the direct {@code TextNode} children of an element.
 *
 * <p>Only the nodes returned by {@code element.childNodes()} are inspected, and children that
 * are not text nodes are skipped.
 *
 * @param element the element whose direct text is collected
 * @return the joined text of the element's text-node children, in child order
 */
protected String getText(Element element) {
    final StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
/**
 * Takes the next element from the iterator and makes sure it is a {@code Document}, because
 * only a document can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 *
 * <p>An element that is not a document is cloned into a newly created {@code Document}, and
 * that document replaces the element at the iterator's current position.
 *
 * @param elementIterator iterator whose next element is checked
 * @return the element itself when it already is a document, otherwise the new wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }

    // ownerDocument() yields null for an element that is not attached to any document;
    // fall back to the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = (owner != null) ? owner.baseUri() : element.baseUri();

    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    elementIterator.set(root);
    return root;
}
/**
 * Returns this element's {@code id} attribute, looked up without regard to the case of the key.
 *
 * @return the id attribute value if present, otherwise an empty string
 */
public String id() {
    final String idValue = attributes().getIgnoreCase("id");
    return idValue;
}
public void tail(Node node, int depth) { // make sure there is a space between block tags and immediately following text nodes <div>One</div>Two should be "One Two". if (node instanceof Element) { Element element = (Element) node; if (element.isBlock() && (node.nextSibling() instanceof TextNode) && !TextNode.lastCharIsWhitespace(accum)) accum.append(' '); } } }, this);
/**
 * Builds a TextNode from HTML encoded (escaped) text by unescaping it first.
 *
 * @param encodedText text containing encoded HTML (e.g. {@code &lt;})
 * @return a TextNode holding the unencoded data (e.g. {@code <})
 */
public static TextNode createFromEncoded(String encodedText) {
    return new TextNode(Entities.unescape(encodedText));
}
/**
 * Sets an attribute (key=value), replacing any existing attribute with that key. Keys are
 * matched <b>case insensitively</b>.
 *
 * @param attributeKey the attribute key
 * @param attributeValue the attribute value
 * @return this node, to allow chaining
 */
public Node attr(String attributeKey, String attributeValue) {
    final Attributes attrs = attributes();
    attrs.putIgnoreCase(attributeKey, attributeValue);
    return this;
}
/**
 * Makes sure {@code value} holds an {@code Attributes} object.
 *
 * <p>When {@code hasAttributes()} is false, the current content of {@code value} is replaced by
 * a new {@code Attributes}; a non-null previous content is stored in it under
 * {@code nodeName()}.
 */
private void ensureAttributes() {
    if (hasAttributes()) {
        return;
    }

    final Object previous = value;
    final Attributes created = new Attributes();
    value = created;
    if (previous != null) {
        // Carry the former core value over so it is not lost by the swap.
        created.put(nodeName(), (String) previous);
    }
}
/**
 * Tests whether this node has an attribute under {@code key}.
 *
 * <p>{@code ensureAttributes()} is invoked first, then the check is delegated to the superclass.
 *
 * @param key the attribute key to look for
 * @return the result of the superclass {@code hasAttr} check
 */
@Override
public boolean hasAttr(String key) {
    ensureAttributes();
    final boolean present = super.hasAttr(key);
    return present;
}
/**
 * Returns this node's attributes, creating an empty {@code Attributes} on first use when
 * {@code hasAttributes()} reports there are none yet.
 *
 * @return the (possibly just created) attributes object held in the {@code attributes} field
 */
@Override
public Attributes attributes() {
    if (hasAttributes()) {
        return attributes;
    }
    attributes = new Attributes();
    return attributes;
}
/** * Create a new Attribute from an unencoded key and a HTML attribute encoded value. * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars. * @param encodedValue HTML attribute encoded value * @return attribute */ public static Attribute createFromEncoded(String unencodedKey, String encodedValue) { String value = Entities.unescape(encodedValue, true); return new Attribute(unencodedKey, value, null); // parent will get set when Put }
/**
 * Resolves the attribute stored under {@code key} through the superclass {@code absUrl}.
 *
 * <p>{@code ensureAttributes()} is invoked first, then the lookup is delegated.
 *
 * @param key the attribute key
 * @return whatever the superclass {@code absUrl} returns for that key
 */
@Override
public String absUrl(String key) {
    ensureAttributes();
    final String resolved = super.absUrl(key);
    return resolved;
}
/**
 * Create a new TextNode from HTML encoded (aka escaped) data.
 *
 * @param encodedText Text containing encoded HTML (e.g. {@code &lt;})
 * @param baseUri Base uri; ignored by this method
 * @return TextNode containing unencoded data (e.g. {@code <})
 * @deprecated use {@link TextNode#createFromEncoded(String)} instead, as LeafNodes don't carry base URIs.
 */
@Deprecated
public static TextNode createFromEncoded(String encodedText, String baseUri) {
    // baseUri is deliberately unused: the node is built from the unescaped text alone.
    return new TextNode(Entities.unescape(encodedText));
}