/**
 * Concatenates the text of the direct {@link TextNode} children of an element.
 * Child nodes of any other type (including nested elements) are ignored.
 *
 * @param element element whose immediate child nodes are inspected
 * @return the {@code text()} values of the direct text-node children, joined in
 *         child order; an empty string when there are none
 */
protected String getText(Element element) {
    final StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
/**
 * Visits a source node and copies it into the {@code destination} tree when allowed.
 *
 * <ul>
 *   <li>While {@code elementToSkip} is set, every node is ignored.</li>
 *   <li>A safe element is re-created (same tag, base URI and a clone of its attributes),
 *       appended to {@code destination}, and becomes the new {@code destination}.</li>
 *   <li>An unsafe element other than {@code root} is recorded in {@code elementToSkip}.</li>
 *   <li>Text nodes are always copied; data nodes are copied only when their parent is safe.</li>
 * </ul>
 *
 * NOTE(review): {@code elementToSkip} is never cleared here; presumably the matching
 * tail callback does that - confirm.
 *
 * @param source node being entered
 * @param depth  depth of the node in the traversal (unused)
 */
public void head(Node source, int depth) {
    if (elementToSkip != null) {
        return;
    }

    if (source instanceof Element) {
        final Element sourceElement = (Element) source;
        if (!isSafeTag(sourceElement)) {
            // The root itself is never marked for skipping.
            if (source != root) {
                elementToSkip = sourceElement;
            }
            return;
        }
        final String tagName = sourceElement.tagName();
        final Attributes copiedAttributes = sourceElement.attributes().clone();
        final Element copy = new Element(Tag.valueOf(tagName), sourceElement.baseUri(), copiedAttributes);
        destination.appendChild(copy);
        // Descend: children of this element are appended to its copy.
        destination = copy;
        return;
    }

    if (source instanceof TextNode) {
        final TextNode original = (TextNode) source;
        destination.appendChild(new TextNode(original.getWholeText(), source.baseUri()));
        return;
    }

    if (source instanceof DataNode && isSafeTag(source.parent())) {
        final DataNode original = (DataNode) source;
        destination.appendChild(new DataNode(original.getWholeData(), source.baseUri()));
    }
}
/**
 * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the
 * original text up to the offset, and will have a new text node sibling containing the text after the offset.
 * <p>
 * The offset is validated with {@code Validate.isTrue}: it must be at least {@code 0} and strictly less than the
 * current text length, so the tail node is never empty.
 * </p>
 * @param offset string offset point to split node at.
 * @return the newly created text node containing the text after the offset.
 */
public TextNode splitText(int offset) {
    final String text = coreValue();
    Validate.isTrue(offset >= 0, "Split offset must not be negative");
    Validate.isTrue(offset < text.length(), "Split offset must be less than current text length");

    final String head = text.substring(0, offset);
    final String tail = text.substring(offset);
    text(head);

    final TextNode tailNode = new TextNode(tail);
    // A detached node has nowhere to insert the tail; it is still returned to the caller.
    if (parent() != null) {
        parent().addChildren(siblingIndex() + 1, tailNode);
    }
    return tailNode;
}
/**
 * Appends the whole text of a text node to the accumulator.
 * The text is appended verbatim when the node's parent preserves whitespace or the node is a
 * {@link CDataNode}; otherwise it goes through {@code StringUtil.appendNormalisedWhitespace},
 * which is told whether the accumulator currently ends in whitespace.
 *
 * @param accum    buffer receiving the text
 * @param textNode node whose text is appended
 */
private static void appendNormalisedText(StringBuilder accum, TextNode textNode) {
    final String wholeText = textNode.getWholeText();
    final boolean keepVerbatim = preserveWhitespace(textNode.parentNode) || textNode instanceof CDataNode;
    if (keepVerbatim) {
        accum.append(wholeText);
        return;
    }
    final boolean endsInWhitespace = TextNode.lastCharIsWhitespace(accum);
    StringUtil.appendNormalisedWhitespace(accum, wholeText, endsInWhitespace);
}
/**
 * Writes this node's escaped text to the output, preceded by indentation when pretty-printing calls for it.
 * <p>
 * Indentation is emitted only when pretty-printing is on, the node is not blank, and either
 * (a) it is the first child of an element whose tag formats as a block, or
 * (b) outline mode is on and the node has at least one sibling.
 * Whitespace is normalised during escaping when pretty-printing is on and the parent is an
 * element that does not preserve whitespace.
 * </p>
 *
 * @param accum destination for the generated markup
 * @param depth nesting depth passed to {@code indent}
 * @param out   output settings controlling pretty-printing, outline mode and escaping
 * @throws IOException declared for writes to {@code accum}
 */
void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
    if (out.prettyPrint()) {
        final boolean leadsBlockParent = siblingIndex() == 0
            && parentNode instanceof Element
            && ((Element) parentNode).tag().formatAsBlock()
            && !isBlank();
        if (leadsBlockParent || (out.outline() && siblingNodes().size() > 0 && !isBlank())) {
            indent(accum, depth, out);
        }
    }

    final boolean normaliseWhite = out.prettyPrint()
        && parent() instanceof Element
        && !Element.preserveWhitespace(parent());
    Entities.escape(accum, coreValue(), out, false, normaliseWhite, false);
}
if (next != null && BR_TAG.equals(next.tag())) { next.remove(); (previousNode = previousNode.previousSibling()) != null;) { && !((TextNode) previousNode).isBlank()) { break; (previous = previous.previousSibling()) != null;) { child.remove(); paragraph.prependChild(child.clone()); && ((TextNode) next).isBlank())) { break; while ((next = next.nextSibling()) != null) { if (!(next instanceof TextNode && ((TextNode) next).isBlank())) { break; for (Node child : brDiv.childNodes()) { if (child instanceof TextNode) { if (!((TextNode) child).isBlank()) { continue DIV;
for (int i = 1; i <= figRefEls.size(); i++) { Element element = figRefEls.get(i - 1); element.attr("id", "FR-" + Strings.padStart(String.valueOf(i), 4, '0')); element.attr("idref", ReferenceTagger.createFigId(element.select("PDAT").text())); element.tagName("a"); element.addClass("figref"); element.replaceWith(new TextNode("Table-Reference")); newEl.addClass("math"); newEl.attr("format", "mathml"); newEl.appendChild(new TextNode(mathml)); element.replaceWith(newEl); try { String unicode = UnicodeUtil.toSubscript(el.html()); el.replaceWith(new TextNode(unicode)); } catch (ParseException e) { el.tagName("sub"); try { String unicode = UnicodeUtil.toSuperscript(el.html()); el.replaceWith(new TextNode(unicode)); } catch (ParseException e) { el.tagName("sup");
/**
 * Converts a bold figure label that follows a figure reference in a list
 * (e.g. {@code <figref/>, <b>2A</b>} or {@code <figref/> and <b>3</b>}) into a figure reference of its own.
 * <p>
 * The node directly after {@code element} must be a separator whose text is exactly {@code ", "},
 * {@code " and "} or {@code ", and "}. When the node after the separator is a {@code <b>} whose first
 * child is a text node holding one or two digits optionally followed by a single ASCII letter, that
 * {@code <b>} is replaced by a clone of {@code element} rewritten as an {@code <a class="figref">}
 * pointing at the label, and the method recurses from the new element to handle further list items.
 * In every other case the document is left untouched.
 * </p>
 *
 * @param element figure reference whose trailing list items should be converted
 */
public void fixFigrefListItem(Element element) {
    final Node separator = element.nextSibling();
    final String trailingTxt;
    if (separator instanceof TextNode) {
        trailingTxt = ((TextNode) separator).getWholeText();
    } else if (separator instanceof Element) {
        trailingTxt = ((Element) separator).text();
    } else {
        // No following sibling, or one that carries no text (comment, data node, ...).
        return;
    }

    if (!trailingTxt.matches("^(, |,? and )")) {
        return;
    }

    // The separator may be the last node of its parent; nothing to convert then.
    final Node candidate = separator.nextSibling();
    if (candidate == null || !"b".equalsIgnoreCase(candidate.nodeName())) {
        return;
    }

    // An empty <b>, or one starting with a nested element, has no label to read.
    if (candidate.childNodes().isEmpty() || !(candidate.childNode(0) instanceof TextNode)) {
        return;
    }

    final String containedTxt = ((TextNode) candidate.childNode(0)).getWholeText();
    // [A-Za-z] rather than [A-z]: the latter also accepts the punctuation between 'Z' and 'a'.
    if (!containedTxt.matches("[0-9]{1,2}[A-Za-z]?")) {
        return;
    }

    final Element newEl = element.clone();
    newEl.attr("id", "FR-" + Strings.padStart(containedTxt, 4, '0'));
    newEl.attr("idref", ReferenceTagger.createFigId(containedTxt));
    newEl.tagName("a");
    newEl.addClass("figref");
    newEl.text(containedTxt);
    candidate.replaceWith(newEl);

    // Continue with whatever follows the reference that was just created.
    fixFigrefListItem(newEl);
}
public static String textPlus(Element elem) { List<TextNode> textNodes = elem.textNodes(); if (textNodes.isEmpty()) return ""; StringBuilder result = new StringBuilder(); // start at the first text node Node currentNode = textNodes.get(0); while (currentNode != null) { // append deep text of all subsequent nodes if (currentNode instanceof TextNode) { TextNode currentText = (TextNode) currentNode; result.append(currentText.text()); } else if (currentNode instanceof Element) { Element currentElement = (Element) currentNode; result.append(currentElement.text()); } currentNode = currentNode.nextSibling(); } return result.toString(); }
/**
 * Replaces every element with the given tag found via {@code body.getElementsByTag(tag)} by a text node
 * holding the element's text followed by the literal characters {@code <br/>}.
 * <p>
 * Matching elements nested inside a match are processed first through recursion, so the text of an
 * outer match already contains the replacement text of the inner ones. Matches that are null or no
 * longer attached to a parent (for example because a recursive call already replaced them) are skipped.
 * </p>
 *
 * @param body element whose matching elements are replaced
 * @param tag  tag name to look for
 */
protected void cleanNodes(Element body, String tag) {
    for (Element match : body.getElementsByTag(tag)) {
        final boolean attached = match != null && match.parent() != null;
        if (attached) {
            for (Element nested : match.children().select(tag)) {
                cleanNodes(nested, tag);
            }
            final String replacementText = match.text() + "<br/>";
            match.replaceWith(new TextNode(replacementText, ""));
        }
    }
}
} // closes an enclosing scope that was opened before this line
return new TextNode(element.getText(), document.baseUri()); .createElement(element.getTag()); if (element.hasProperty("innerHTML")) { target.html((String) element.getPropertyRaw("innerHTML")); String attributeValue = element.getAttribute(name); if ("".equals(attributeValue)) { target.attr(name, true); } else { target.attr(name, attributeValue);
final String possibleEmoji = img.attr("alt"); img.replaceWith(new TextNode(possibleEmoji)); if(iframe.hasAttr("src")) { String href = iframe.attr("src"); String html = String.format(Locale.US, videoLink, href, href);
for (Element element : body.getAllElements()) { if (Html.isSpanElement(element)) { List<Node> childNodes = element.childNodes(); if (childNodes.isEmpty() && !isHyperlinkWithTarget(element)) { element.remove(); modifiedOne = true; } else { if (node instanceof TextNode) { TextNode textNode = (TextNode) node; String text = textNode.text(); if (text.trim().length() == 0) { textNode.remove(); element.before(textNode); element.remove(); normalizeTextNodes((Element) textNode.parent());
Element element = ((Element) node); StringBuilder accum = new StringBuilder(); accum.append("<").append(element.tagName()); for (Attribute attribute: element.attributes()) { if (!(attribute.getKey().startsWith("_"))) { accum.append(" "); if (element.childNodes().isEmpty() && element.tag().isEmpty()) { accum.append(" />"); } else { return ((TextNode) node).getWholeText(); } else if (node instanceof XmlDeclaration) { if (node.childNodes().isEmpty()) { return ""; return node.outerHtml(); } else if (node instanceof Comment) { } else if (node instanceof DataNode && node.childNodes().isEmpty()) {
/**
 * Builds an HTML digest of the given markup.
 * <p>
 * The markup is parsed as a body fragment and {@code getAllTextNode} gathers nodes from it. For each
 * gathered text node, the outer HTML of its parent is appended and the node's text length is added to
 * a running total; once the total exceeds {@code size}, {@code " ..."} is appended and collection stops.
 * When the markup (parsed a second time as a full document) contains a {@code video} element, the first
 * one is placed in front of the digest, followed by {@code <br/>}.
 * </p>
 *
 * @param str  HTML markup to summarise
 * @param size text-length threshold after which the digest is cut off
 * @return the trimmed digest markup
 */
public static String autoDigest(String str, int size) {
    final Document fragment = Jsoup.parseBodyFragment(str);
    final List<Node> collected = new ArrayList<>();
    getAllTextNode(fragment.childNodes(), collected);

    final StringBuilder html = new StringBuilder();
    int textLength = 0;
    for (Node candidate : collected) {
        if (!(candidate instanceof TextNode)) {
            continue;
        }
        html.append(candidate.parent().outerHtml());
        textLength += ((TextNode) candidate).text().length();
        if (textLength > size) {
            html.append(" ...");
            break;
        }
    }

    final String body = html.toString();
    final Elements videos = Jsoup.parse(str).body().select("video");
    if (videos == null || videos.isEmpty()) {
        return body.trim();
    }
    return (videos.get(0).toString() + "<br/>" + body).trim();
}
List<Node> childNodeList = doc.body().childNodes(); if (childNodeList == null || childNodeList.isEmpty()) { return null; for (int pos = 0; pos != size; pos++) { Node childNode = childNodeList.get(pos); String tagName = childNode.nodeName(); if (tagName.equalsIgnoreCase("h")) { elList.add(new PElement(Html.fromHtml(((Element) childNode).html()))); } else if(tagName.equalsIgnoreCase("h1")){ elList.add(new HElement(((Element) childNode).html())); }else if (tagName.equalsIgnoreCase("img")) { String src = childNode.attr("src"); String width = childNode.attr("width"); String height = childNode.attr("height"); elList.add(new ImgElement(src, YUtils.parseInt(width, 0), YUtils.parseInt(height, 0))); elList.add(new PElement(Html.fromHtml(((Element) childNode).html()))); } else if(childNode instanceof TextNode){ elList.add(new PElement(((TextNode) childNode).text())); }else { elList.add(new PElement(childNode.outerHtml()));
public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
void insert(Token.Character characterToken) { Node node; // characters in script and style go in as datanodes, not text nodes final String tagName = currentElement().tagName(); final String data = characterToken.getData(); if (characterToken.isCData()) node = new CDataNode(data); else if (tagName.equals("script") || tagName.equals("style")) node = new DataNode(data); else node = new TextNode(data); currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. }
/**
 * Visits a source node: wraps it, pushes the wrapper onto the {@code elements} stack, and copies the
 * node into the {@code destination} tree where applicable.
 *
 * <ul>
 *   <li>A {@code body} element is pushed but never copied.</li>
 *   <li>Any other element is copied via {@code createSafeElement} and becomes the new
 *       {@code destination} only when {@code shouldKeepChild} accepts it, given the wrapper that was
 *       on top of the stack before this node (or {@code null} for an empty stack).</li>
 *   <li>Text and data nodes are always copied with the source node's base URI.</li>
 * </ul>
 *
 * @param source node being entered
 * @param depth  depth of the node in the traversal (unused)
 */
public void head(Node source, int depth) {
    final NodeWrapper current = new NodeWrapper(source);
    // Capture the enclosing wrapper before the current one goes on the stack.
    final NodeWrapper enclosing = elements.isEmpty() ? null : elements.peek();
    elements.push(current);

    if (source instanceof Element) {
        final Element sourceEl = (Element) source;
        if (sourceEl.tagName().equals("body")) {
            return;
        }
        if (shouldKeepChild(current, enclosing)) {
            final Element copy = createSafeElement(sourceEl);
            destination.appendChild(copy);
            destination = copy;
        }
        return;
    }

    if (source instanceof TextNode) {
        final String wholeText = ((TextNode) source).getWholeText();
        destination.appendChild(new TextNode(wholeText, source.baseUri()));
        return;
    }

    if (source instanceof DataNode) {
        final String wholeData = ((DataNode) source).getWholeData();
        destination.appendChild(new DataNode(wholeData, source.baseUri()));
    }
}
/**
 * Converts an HTML document read from a stream into plain text.
 * <p>
 * The text of every text node under the document body is trimmed and concatenated in document order,
 * with a newline appended after each {@code p} and {@code br} element. The stream is not closed here.
 * </p>
 *
 * @param html stream containing the HTML; passed to {@code Jsoup.parse} with a {@code null} charset
 *             name and an empty base URI
 * @return the extracted plain text
 * @throws IOException if the stream cannot be read or parsed
 */
public static String htmlToText(InputStream html) throws IOException {
    Document document = Jsoup.parse(html, null, "");
    return buildStringFromNode(document.body()).toString();
}

/**
 * Renders the text of a node and all of its descendants into a new buffer.
 *
 * @param node root of the subtree to render
 * @return buffer holding the rendered text
 */
private static StringBuffer buildStringFromNode(Node node) {
    StringBuffer buffer = new StringBuffer();
    buildStringFromNode(node, buffer);
    return buffer;
}

/**
 * Appends the text of a node and all of its descendants to a shared buffer, so that no intermediate
 * buffer is allocated (and re-copied) for every node in the tree.
 *
 * @param node   root of the subtree to render
 * @param buffer destination receiving the text
 */
private static void buildStringFromNode(Node node, StringBuffer buffer) {
    if (node instanceof TextNode) {
        buffer.append(((TextNode) node).text().trim());
    }
    for (Node childNode : node.childNodes()) {
        buildStringFromNode(childNode, buffer);
    }
    // The line break goes after the element's content, i.e. once all children are written.
    if (node instanceof Element) {
        String tagName = ((Element) node).tagName();
        if ("p".equals(tagName) || "br".equals(tagName)) {
            buffer.append("\n");
        }
    }
}