/**
 * Renames every element held in this collection. As an example, converting each {@code <i>} into an
 * {@code <em>} is written {@code doc.select("i").tagName("em");}
 * @param tagName the tag name to give each matched element
 * @return this, for chaining
 * @see Element#tagName(String)
 */
public Elements tagName(String tagName) {
    // walk the collection with an explicit iterator, renaming each element in turn
    for (java.util.Iterator<Element> cursor = iterator(); cursor.hasNext(); ) {
        cursor.next().tagName(tagName);
    }
    return this;
}
/**
 * Tests whether the candidate element's tag name equals the expected tag name, ignoring letter case.
 * @param root the root of the evaluation (not consulted here)
 * @param element the candidate element
 * @return true when the tag names are equal without regard to case
 */
@Override
public boolean matches(Element root, Element element) {
    final String candidateTag = element.tagName();
    return candidateTag.equalsIgnoreCase(tagName);
}
/**
 * Tests whether the candidate element's tag name ends with the expected suffix (case-sensitive).
 * @param root the root of the evaluation (not consulted here)
 * @param element the candidate element
 * @return true when the element's tag name ends with {@code tagName}
 */
@Override
public boolean matches(Element root, Element element) {
    final String candidateTag = element.tagName();
    return candidateTag.endsWith(tagName);
}
/**
 * Set the value of a form element (input, textarea, etc).
 * <p>
 * For a {@code textarea} the element's text is replaced; for any other element the {@code value} attribute is
 * set. The tag name is compared case-insensitively so that a case-preserved tag such as {@code <TEXTAREA>} is
 * still handled as a textarea.
 * </p>
 * @param value value to set
 * @return this element (for chaining)
 */
public Element val(String value) {
    // constant-first comparison: case-insensitive and tolerant of a null tag name
    if ("textarea".equalsIgnoreCase(tagName()))
        text(value);
    else
        attr("value", value);
    return this;
}
/**
 * Get the value of a form element (input, textarea, etc).
 * <p>
 * For a {@code textarea} the element's text is returned; for any other element the {@code value} attribute is
 * returned. The tag name is compared case-insensitively so that a case-preserved tag such as
 * {@code <TEXTAREA>} is still handled as a textarea.
 * </p>
 * @return the value of the form element, or empty string if not set.
 */
public String val() {
    // constant-first comparison: case-insensitive and tolerant of a null tag name
    if ("textarea".equalsIgnoreCase(tagName()))
        return text();
    else
        return attr("value");
}
private static void parseTtsElements(Element element, List<LocalisedText> textsToRead) { if (element.tagName().equalsIgnoreCase("tts") && element.attr("service").equalsIgnoreCase("android")) { textsToRead.add(new LocalisedText(element.text(), element.attr("voice"))); return; // ignore any children } for (Element child : element.children()) { parseTtsElements(child, textsToRead); } }
/**
 * Appends this element's closing tag to the accumulator, preceded by indentation when pretty printing calls
 * for it. Nothing is written for a childless element whose tag is self closing.
 * @param accum the output being built
 * @param depth the depth passed through to {@code indent}
 * @param out the output settings consulted for pretty printing and outline mode
 * @throws IOException if appending to the accumulator fails
 */
void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
    if (childNodes.isEmpty() && tag.isSelfClosing()) {
        return; // there is no end tag to write
    }

    if (out.prettyPrint() && !childNodes.isEmpty()) {
        boolean indentEndTag = tag.formatAsBlock();
        if (!indentEndTag && out.outline()) {
            // in outline mode, indent unless the only child is a single text node
            final int childCount = childNodes.size();
            indentEndTag = childCount > 1
                    || (childCount == 1 && !(childNodes.get(0) instanceof TextNode));
        }
        if (indentEndTag) {
            indent(accum, depth, out);
        }
    }

    accum.append("</").append(tagName()).append('>');
}
/** * Finds any namespaces defined in this element. Returns any tag prefix. */ private String updateNamespaces(org.jsoup.nodes.Element el) { // scan the element for namespace declarations // like: xmlns="blah" or xmlns:prefix="blah" Attributes attributes = el.attributes(); for (Attribute attr : attributes) { String key = attr.getKey(); String prefix; if (key.equals(xmlnsKey)) { prefix = ""; } else if (key.startsWith(xmlnsPrefix)) { prefix = key.substring(xmlnsPrefix.length()); } else { continue; } namespacesStack.peek().put(prefix, attr.getValue()); } // get the element prefix if any int pos = el.tagName().indexOf(":"); return pos > 0 ? el.tagName().substring(0, pos) : ""; }
/**
 * Adds the ancestors of an element to the given collection, nearest first, stopping when there is no further
 * parent or when a parent whose tag name is {@code #root} is reached (that parent is not added).
 * @param el the element whose ancestors are collected
 * @param parents the collection that receives the ancestors
 */
private static void accumulateParents(Element el, Elements parents) {
    Element ancestor = el.parent();
    while (ancestor != null && !ancestor.tagName().equals("#root")) {
        parents.add(ancestor);
        ancestor = ancestor.parent();
    }
}
public void head(org.jsoup.nodes.Node source, int depth) { namespacesStack.push(new HashMap<>(namespacesStack.peek())); // inherit from above on the stack if (source instanceof org.jsoup.nodes.Element) { org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source; String prefix = updateNamespaces(sourceEl); String namespace = namespacesStack.peek().get(prefix); Element el = doc.createElementNS(namespace, sourceEl.tagName()); copyAttributes(sourceEl, el); if (dest == null) { // sets up the root doc.appendChild(el); } else { dest.appendChild(el); } dest = el; // descend } else if (source instanceof org.jsoup.nodes.TextNode) { org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source; Text text = doc.createTextNode(sourceText.getWholeText()); dest.appendChild(text); } else if (source instanceof org.jsoup.nodes.Comment) { org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source; Comment comment = doc.createComment(sourceComment.getData()); dest.appendChild(comment); } else if (source instanceof org.jsoup.nodes.DataNode) { org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source; Text node = doc.createTextNode(sourceData.getWholeData()); dest.appendChild(node); } else { // unhandled } }
/** * Get a CSS selector that will uniquely select this element. * <p> * If the element has an ID, returns #id; * otherwise returns the parent (if any) CSS selector, followed by {@literal '>'}, * followed by a unique selector for the element (tag.class.class:nth-child(n)). * </p> * * @return the CSS Path that can be used to retrieve the element in a selector. */ public String cssSelector() { if (id().length() > 0) return "#" + id(); // Translate HTML namespace ns:tag to CSS namespace syntax ns|tag String tagName = tagName().replace(':', '|'); StringBuilder selector = new StringBuilder(tagName); String classes = StringUtil.join(classNames(), "."); if (classes.length() > 0) selector.append('.').append(classes); if (parent() == null || parent() instanceof Document) // don't add Document to selector, as will always have a html node return selector.toString(); selector.insert(0, " > "); if (parent().select(selector.toString()).size() > 1) selector.append(String.format( ":nth-child(%d)", elementSiblingIndex() + 1)); return parent().cssSelector() + selector.toString(); }
void insert(Token.Comment commentToken) { Comment comment = new Comment(commentToken.getData()); Node insert = comment; if (commentToken.bogus) { // xml declarations are emitted as bogus comments (which is right for html, but not xml) // so we do a bit of a hack and parse the data as an element to pull the attributes out String data = comment.getData(); if (data.length() > 1 && (data.startsWith("!") || data.startsWith("?"))) { Document doc = Jsoup.parse("<" + data.substring(1, data.length() -1) + ">", baseUri, Parser.xmlParser()); if (doc.childNodeSize() > 0) { Element el = doc.child(0); insert = new XmlDeclaration(settings.normalizeTag(el.tagName()), data.startsWith("!")); insert.attributes().addAll(el.attributes()); } // else, we couldn't parse it as a decl, so leave as a comment } } insertNode(insert); }
void insert(Token.Character characterToken) { Node node; // characters in script and style go in as datanodes, not text nodes final String tagName = currentElement().tagName(); final String data = characterToken.getData(); if (characterToken.isCData()) node = new CDataNode(data); else if (tagName.equals("script") || tagName.equals("style")) node = new DataNode(data); else node = new TextNode(data); currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. }
void outerHtmlHead(final Appendable accum, int depth, final Document.OutputSettings out) throws IOException { if (out.prettyPrint() && (tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock()) || out.outline())) { if (accum instanceof StringBuilder) { if (((StringBuilder) accum).length() > 0) indent(accum, depth, out); } else { indent(accum, depth, out); } } accum.append('<').append(tagName()); if (attributes != null) attributes.html(accum, out); // selfclosing includes unknown tags, isEmpty defines tags that are always empty if (childNodes.isEmpty() && tag.isSelfClosing()) { if (out.syntax() == Document.OutputSettings.Syntax.html && tag.isEmpty()) accum.append('>'); else accum.append(" />"); // <img> in html, <img /> in xml } else accum.append('>'); }
/**
 * Scrapes a GitHub organization page and stores the extracted data in the given Post.
 * <p>
 * The values are read straight from the child elements of each {@code orgnav} element. The document is not
 * modified: {@code Element.tagName(String)} renames an element rather than selecting by tag, so it is
 * deliberately not used here.
 * </p>
 * @param profile the name of the profile being scraped, stored under the {@code user} key
 * @param githubProfile the Post that receives the scraped data
 * @param html the html page to be scraped according to the required attributes
 */
private void scrapeGithubOrg(
    String profile,
    Post githubProfile,
    Document html) {

    githubProfile.put("user", profile);

    String shortDescription = html.getElementsByAttributeValueContaining("class", "TableObject-item TableObject-item--primary").get(0).child(2).text();
    githubProfile.put("short_description", shortDescription);

    String homeLocation = html.getElementsByAttributeValueContaining("itemprop", "location").attr("title");
    githubProfile.put("location", homeLocation);

    Elements navigation = html.getElementsByAttributeValue("class", "orgnav");
    for (Element e : navigation) {
        // first child carries the repositories link, second child the people link and counter
        Element repositoriesEntry = e.child(0);
        Element peopleEntry = e.child(1);

        String orgRepositoriesLink = repositoriesEntry.attr("href");
        githubProfile.put("organization_respositories_link", "https://github.com" + orgRepositoriesLink);

        String orgPeopleLink = peopleEntry.attr("href");
        githubProfile.put("organization_people_link", "https://github.com" + orgPeopleLink);

        String orgPeopleNumber = peopleEntry.child(1).text();
        githubProfile.put("organization_people_number", orgPeopleNumber);
    }
}
/**
 * Matches only {@link PseudoTextElement}s. For any other element, each of its text nodes is replaced in the
 * tree by a new {@link PseudoTextElement} (built from the element's tag name, base URI and attributes) that
 * then receives the text node as its child, and the element itself is reported as not matching.
 * @param root the root of the evaluation (not consulted here)
 * @param element the candidate element
 * @return true only when the element is already a {@link PseudoTextElement}
 */
@Override
public boolean matches(Element root, Element element) {
    if (element instanceof PseudoTextElement) {
        return true;
    }

    for (TextNode text : element.textNodes()) {
        PseudoTextElement wrapper = new PseudoTextElement(
            org.jsoup.parser.Tag.valueOf(element.tagName()),
            element.baseUri(),
            element.attributes());
        // put the wrapper where the text node was, then move the text node inside it
        text.replaceWith(wrapper);
        wrapper.appendChild(text);
    }
    return false;
}
Element insert(Token.StartTag startTag) { // handle empty unknown tags // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. if (startTag.isSelfClosing()) { Element el = insertEmpty(startTag); stack.add(el); tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing return el; } Element el = new Element(Tag.valueOf(startTag.name(), settings), baseUri, settings.normalizeAttributes(startTag.attributes)); insert(el); return el; }
/**
 * Builds a new element with the same tag and base URI as the source, carrying only the source attributes the
 * whitelist accepts, plus the attributes the whitelist enforces for that tag.
 * @param sourceEl the element to copy from
 * @return the new element together with the number of source attributes that were discarded
 */
private ElementMeta createSafeElement(Element sourceEl) {
    final String tagName = sourceEl.tagName();
    final Attributes keptAttrs = new Attributes();
    final Element safeEl = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), keptAttrs);

    int discarded = 0;
    for (Attribute candidate : sourceEl.attributes()) {
        if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
            discarded++;
            continue;
        }
        keptAttrs.put(candidate);
    }

    // enforced attributes are added after the copied ones
    keptAttrs.addAll(whitelist.getEnforcedAttributes(tagName));

    return new ElementMeta(safeEl, discarded);
}
public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
/**
 * Visits a source node on the way down the tree and copies it to the destination unless an element is
 * currently being skipped.
 * <p>
 * Elements with a safe tag are re-created with a clone of their attributes and become the new append target;
 * an unsafe element other than the root is recorded as the element to skip. Text nodes are copied, and data
 * nodes are copied when their parent is a safe tag. Other nodes are ignored.
 * </p>
 * @param source the node being visited
 * @param depth the depth of the node (not used here)
 */
public void head(Node source, int depth) {
    if (elementToSkip != null) {
        return; // nothing is copied while a skipped element is set
    }

    if (source instanceof Element) {
        Element original = (Element) source;
        if (isSafeTag(original)) {
            String tagName = original.tagName();
            Attributes copiedAttributes = original.attributes().clone();
            Element copy = new Element(Tag.valueOf(tagName), original.baseUri(), copiedAttributes);
            destination.appendChild(copy);
            destination = copy;
        } else if (source != root) {
            elementToSkip = original;
        }
        return;
    }

    if (source instanceof TextNode) {
        TextNode originalText = (TextNode) source;
        destination.appendChild(new TextNode(originalText.getWholeText(), source.baseUri()));
        return;
    }

    if (source instanceof DataNode && isSafeTag(source.parent())) {
        DataNode originalData = (DataNode) source;
        destination.appendChild(new DataNode(originalData.getWholeData(), source.baseUri()));
    }
}