/** * Reindex children nodes. Must be called on every children addition/removal. * Iterates {@link #childNodes} list and: * <ul> * <li>calculates three different sibling indexes,</li> * <li>calculates total child element node count,</li> * <li>resets child element nodes array (will be init lazy later by @{#initChildElementNodes}.</li> * </ul> */ protected void reindexChildren() { int siblingElementIndex = 0; for (int i = 0, childNodesSize = childNodes.size(); i < childNodesSize; i++) { Node childNode = childNodes.get(i); childNode.siblingIndex = i; childNode.siblingNameIndex = -1; // reset sibling name info if (childNode.getNodeType() == NodeType.ELEMENT) { childNode.siblingElementIndex = siblingElementIndex; siblingElementIndex++; } } childElementNodesCount = siblingElementIndex; childElementNodes = null; // reset child element nodes }
/** Returns <code>true</code> when the parent of the given node is of type DOCUMENT. NOTE(review): the result of getParentNode() is dereferenced without a null check - confirm callers never pass a parentless node. */
@Override public boolean match(final Node node) { return node.getParentNode().getNodeType() == Node.NodeType.DOCUMENT; } }
/**
 * Tells whether the given node is an element named <code>table</code>.
 * The name is lower-cased before the comparison, so the check
 * ignores letter case. Nodes of any non-element type yield <code>false</code>.
 */
protected boolean isTableElement(final Node node) {
    final boolean isElement = node.getNodeType() == Node.NodeType.ELEMENT;

    // name is only inspected for element nodes
    return isElement && "table".equals(node.getNodeName().toLowerCase());
}
/** * Optimized variant of {@link #reindexChildren()} for addition. * Only added children are optimized. */ protected void reindexChildrenOnAdd(final int addedCount) { int childNodesSize = childNodes.size(); int previousSize = childNodes.size() - addedCount; int siblingElementIndex = childElementNodesCount; for (int i = previousSize; i < childNodesSize; i++) { Node childNode = childNodes.get(i); childNode.siblingIndex = i; childNode.siblingNameIndex = -1; // reset sibling name info if (childNode.getNodeType() == NodeType.ELEMENT) { childNode.siblingElementIndex = siblingElementIndex; siblingElementIndex++; } } childElementNodesCount = siblingElementIndex; childElementNodes = null; // reset child element nodes }
/** * Selects single node for single selector and appends it to the results. */ protected void selectAndAdd(final Node node, final CssSelector cssSelector, final List<Node> result) { // ignore all nodes that are not elements if (node.getNodeType() != Node.NodeType.ELEMENT) { return; } boolean matched = cssSelector.accept(node); if (matched) { // check for duplicates if (result.contains(node)) { return; } // no duplicate found, add it to the results result.add(node); } }
/**
 * Returns this node's previous sibling of <b>element</b> type.
 *
 * @return the closest preceding sibling that is an element, or
 *         <code>null</code> if there is no such sibling or this node
 *         has no parent
 */
public Node getPreviousSiblingElement() {
    // a node without a parent has no siblings at all
    if (parentNode == null) {
        return null;
    }

    parentNode.initChildElementNodes();

    if (siblingElementIndex == -1) {
        // this node carries no element index (presumably it is not an
        // element itself), so walk backwards over all siblings
        for (int i = siblingIndex - 1; i >= 0; i--) {
            final Node sibling = parentNode.childNodes.get(i);
            if (sibling.getNodeType() == NodeType.ELEMENT) {
                return sibling;
            }
        }
        return null;
    }

    // element nodes can use the parent's element array directly
    final int index = siblingElementIndex - 1;
    if (index < 0) {
        return null;
    }
    return parentNode.childElementNodes[index];
}
/**
 * Checks whether the given node satisfies the element part of this
 * selector: either the selector element is the star wildcard or it
 * equals the node name. Nodes of non-element types never match.
 */
protected boolean matchElement(final Node node) {
    if (node.getNodeType() != Node.NodeType.ELEMENT) {
        return false;
    }

    final String selectorElement = getElement();
    final String candidateName = node.getNodeName();

    // wildcard matches every element
    if (selectorElement.equals(StringPool.STAR)) {
        return true;
    }
    return selectorElement.equals(candidateName);
}
/**
 * Looks up the first child <b>element</b> carrying the given name.
 * Sibling names of the found child are initialized before it is returned.
 *
 * @param elementName name the child element has to have
 * @return the first matching child element, or <code>null</code> when
 *         there are no children or none of them matches
 */
public Element getFirstChildElement(final String elementName) {
    if (childNodes == null) {
        return null;
    }

    final int total = childNodes.size();
    int position = 0;

    while (position < total) {
        final Node candidate = childNodes.get(position++);

        final boolean isElement = candidate.getNodeType() == NodeType.ELEMENT;
        if (!isElement || !elementName.equals(candidate.getNodeName())) {
            continue;
        }

        candidate.initSiblingNames();
        return (Element) candidate;
    }

    return null;
}
/**
 * Walks from the given node up through its ancestors and returns the
 * first element whose lower-cased name is <code>table</code>. The
 * starting node itself is examined too.
 *
 * @return the nearest table element, or <code>null</code> when the
 *         chain of parents ends without finding one
 */
protected Element findLastTable(final Node node) {
    for (Node current = node; current != null; current = current.getParentNode()) {
        if (current.getNodeType() != Node.NodeType.ELEMENT) {
            continue;
        }
        if ("table".equals(current.getNodeName().toLowerCase())) {
            return (Element) current;
        }
    }
    return null;
}
/**
 * Looks up the last child <b>element</b> carrying the given name.
 * Sibling names of the found child are initialized before it is returned.
 *
 * @param elementName name the child element has to have
 * @return the last matching child element, or <code>null</code> when
 *         there are no children or none of them matches
 */
public Element getLastChildElement(final String elementName) {
    if (childNodes == null) {
        return null;
    }

    // scan from the tail so the first hit is the last matching child
    int position = childNodes.size();

    while (--position >= 0) {
        final Node candidate = childNodes.get(position);

        final boolean isElement = candidate.getNodeType() == NodeType.ELEMENT;
        if (!isElement || !elementName.equals(candidate.getNodeName())) {
            continue;
        }

        candidate.initSiblingNames();
        return (Element) candidate;
    }

    return null;
}
/**
 * Returns this node's next sibling of <b>element</b> type.
 *
 * @return the closest following sibling that is an element, or
 *         <code>null</code> if there is no such sibling or this node
 *         has no parent
 */
public Node getNextSiblingElement() {
    // a node without a parent has no siblings at all
    if (parentNode == null) {
        return null;
    }

    parentNode.initChildElementNodes();

    if (siblingElementIndex == -1) {
        // this node carries no element index (presumably it is not an
        // element itself), so walk forward over the siblings that follow;
        // start after this node so it can never be returned as its own sibling
        final int max = parentNode.getChildNodesCount();
        for (int i = siblingIndex + 1; i < max; i++) {
            final Node sibling = parentNode.childNodes.get(i);
            if (sibling.getNodeType() == NodeType.ELEMENT) {
                return sibling;
            }
        }
        return null;
    }

    // element nodes can use the parent's element array directly
    final int index = siblingElementIndex + 1;
    if (index >= parentNode.childElementNodesCount) {
        return null;
    }
    return parentNode.childElementNodes[index];
}
if (childNode.getNodeType() == NodeType.ELEMENT) { if (childNode.siblingElementIndex != siblingElementIndex) { return false;
/** Accepts only element nodes whose <code>id</code> attribute equals "ema" or "lina"; every other node is rejected. */
@Override public boolean accept(Node node) { if (node.getNodeType() != Node.NodeType.ELEMENT) { return false; } if ("ema".equals(node.getAttribute("id"))) { return true; } if ("lina".equals(node.getAttribute("id"))) { return true; } return false; } });
/**
 * Emits the body of an element. For raw tags, text children have their
 * node value appended to the output as-is while all other children are
 * visited; for every other element the children are visited in the
 * regular way.
 *
 * @param element element whose children are processed
 * @throws IOException declared for the append to the underlying output
 */
protected void elementBody(final Element element) throws IOException {
    final int total = element.getChildNodesCount();

    if (!element.isRawTag()) {
        element.visitChildren(this);
        return;
    }

    for (int index = 0; index < total; index++) {
        final Node child = element.getChild(index);

        if (child.getNodeType() != Node.NodeType.TEXT) {
            child.visit(this);
            continue;
        }

        // raw tag text bypasses the visitor and is written directly
        appendable.append(child.getNodeValue());
    }
}
/**
 * Detaches the last child of the given parent when that child is a
 * blank text node. Nothing happens when the parent is <code>null</code>,
 * has no children, or its last child is not a text node. For a closed
 * tag the child is also kept when it is the parent's only child.
 *
 * @param parentNode node whose last child is examined, may be <code>null</code>
 * @param closedTag  whether the parent tag has been closed
 */
protected void removeLastChildNodeIfEmptyText(final Node parentNode, final boolean closedTag) {
    if (parentNode == null) {
        return;
    }

    final Node tail = parentNode.getLastChild();
    if (tail == null || tail.getNodeType() != Node.NodeType.TEXT) {
        return;
    }

    // a sole text child of a closed tag is left untouched
    if (closedTag && parentNode.getChildNodesCount() == 1) {
        return;
    }

    if (((Text) tail).isBlank()) {
        tail.detachFromParent();
    }
}
if (node.getNodeType() == Node.NodeType.TEXT) { String value = node.getNodeValue(); if (!StringUtil.isBlank(value)) { if (node.getNodeType() == Node.NodeType.ELEMENT) { Element element = (Element) node;
protected void fixText() { for (Text fosterText : fosterTexts) { // find parent table Element lastTable = findLastTable(fosterText); // move foster element above the table fosterText.detachFromParent(); Node tablesPreviousNode = lastTable.getPreviousSibling(); if (tablesPreviousNode.getNodeType() == Node.NodeType.TEXT) { // append to previous text node Text textNode = (Text) tablesPreviousNode; String text = textNode.getNodeValue(); textNode.setNodeValue(text + fosterText.getNodeValue()); } else { // insert text node before the table lastTable.getParentNode().insertBefore(fosterText, lastTable); } } }
/** * Performs the fix for elements. */ protected void fixElements() { for (Element fosterElement : fosterElements) { // find parent table Element lastTable = findLastTable(fosterElement); Node fosterElementParent = fosterElement.getParentNode(); // filter our foster element Node[] fosterChilds = fosterElement.getChildNodes(); for (Node fosterChild : fosterChilds) { if (fosterChild.getNodeType() == Node.NodeType.ELEMENT) { if (isOneOfTableElements((Element) fosterChild)) { // move all child table elements outside // the foster element fosterChild.detachFromParent(); fosterElementParent.insertBefore(fosterChild, fosterElement); } } } // finally, move foster element above the table fosterElement.detachFromParent(); lastTable.getParentNode().insertBefore(fosterElement, lastTable); } }
@Test void testTwoHtml() throws IOException { File file = new File(testDataRoot, "two.html"); String htmlContent = FileUtil.readString(file); Document document = new LagartoDOMBuilder().parse(htmlContent); Node html = new NodeSelector(document).select("html").get(0); assertNotNull(html); Node body = new NodeSelector(html).selectFirst("body"); Element h1 = body.getFirstChildElement(); assertEquals("h1", h1.getNodeName()); Node comment1 = body.getFirstChild().getNextSibling(); assertEquals(Node.NodeType.COMMENT, comment1.getNodeType()); Element p = (Element) new NodeSelector(body).selectFirst("p"); assertEquals(h1, p.getPreviousSiblingElement()); assertEquals(h1, comment1.getNextSiblingElement()); assertNull(comment1.getNextSiblingName()); // check if filter works just for sub elements List<Node> p_ems = new NodeSelector(p).select("em"); assertEquals(1, p_ems.size()); Element script = (Element) new NodeSelector(html).selectFirst("script"); assertEquals("text/javascript", script.getAttribute("type")); assertTrue(document.check()); }
/**
 * Tests the given node against the element name of this selector.
 * An element node matches when the selector element is the star
 * wildcard or is equal to the node name; any non-element node is rejected.
 */
protected boolean matchElement(final Node node) {
    final boolean isElement = node.getNodeType() == Node.NodeType.ELEMENT;
    if (!isElement) {
        return false;
    }

    final String expected = getElement();
    final String actual = node.getNodeName();

    final boolean isWildcard = expected.equals(StringPool.STAR);
    return isWildcard || expected.equals(actual);
}