/**
 * Follows the outgoing edge with the given name from this node.
 *
 * @param edgeName the name of the edge to follow; only {@code "child"} is handled here
 * @return a stream holding one {@link HtmlNode} for every entry of
 *         {@code rootNode.getChildren()} when {@code edgeName} is {@code "child"};
 *         {@code null} for any other edge name
 */
@SuppressWarnings("deprecation")
@Override
public Stream<SimpleStepNode> out(String edgeName) {
  // Any edge other than "child" is not handled by this node: the caller gets null back.
  if (!"child".equals(edgeName)) {
    return null;
  }
  List<SimpleStepNode> wrapped = new ArrayList<>();
  // Each child is wrapped in an HtmlNode that is constructed with this node and the child.
  rootNode.getChildren().forEach(kid -> wrapped.add(new HtmlNode(this, kid)));
  return wrapped.stream();
}
/**
 * Creates a new, empty W3C document, adds a root element named after the given HTML Cleaner
 * root node and hands that node's children to {@code createSubnodes} together with the new
 * document and root element.
 *
 * @param rootNode the HTML Cleaner root node to serialize
 * @return the W3C Document object
 * @throws ParserConfigurationException if the document builder needed to create the document
 *         cannot be obtained
 */
public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
    Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

    // The DOM root element takes its name from the HTML Cleaner root node.
    Element domRoot = dom.createElement(rootNode.getName());
    dom.appendChild(domRoot);

    createSubnodes(dom, domRoot, rootNode.getChildren());
    return dom;
}
/**
 * There's a known limitation (bug?) in HTML Cleaner where if there's a XML declaration specified it'll be copied as
 * the first element of the body. Thus remove it if it's there. See
 * https://sourceforge.net/forum/message.php?msg_id=4657800 and
 * https://sourceforge.net/tracker/index.php?func=detail&aid=2688635&group_id=183053&atid=903696
 * <p>
 * Only the first child of the first {@code body} element is inspected. If the cleaned tree has no {@code body}
 * element, or the body has no children, the tree is left untouched.
 *
 * @param cleanedNode the cleaned node (ie after the HTML cleaning)
 */
private void fixCleanedNodeBug(TagNode cleanedNode) {
    TagNode[] bodies = cleanedNode.getElementsByName("body", false);
    // Without a body element there is nothing to fix; indexing [0] blindly would throw.
    if (bodies == null || bodies.length == 0) {
        return;
    }

    TagNode body = bodies[0];
    if (body.getChildren().isEmpty()) {
        return;
    }

    // instanceof is false for null, so no separate null check is needed.
    Object firstBodyChild = body.getChildren().get(0);
    if (firstBodyChild instanceof ContentToken) {
        ContentToken token = (ContentToken) firstBodyChild;
        if (token.getContent().startsWith("<?xml")) {
            body.removeChild(token);
        }
    }
}
}
/**
 * Walks the descendants of the given node and removes, from every {@link TagNode} found, each
 * attribute whose name is rejected by {@code HtmlExtractUtils.isValidAttribute}.
 * <p>
 * The attributes of {@code parent} itself are not examined; only its children and, recursively,
 * their children are processed. Children that are not {@link TagNode} instances are skipped.
 *
 * @param parent the node whose descendants are cleaned; nothing happens when its child list is
 *        {@code null}
 */
public static void cleanInvalidAttributes(TagNode parent) {
    List<?> children = parent.getChildren();
    if (children == null) {
        return;
    }

    for (Object child : children) {
        if (!(child instanceof TagNode)) {
            continue;
        }
        TagNode tag = (TagNode) child;

        // Collect the offending names first and remove them afterwards, presumably so that the
        // attribute map is not modified while it is being iterated.
        Map<?, ?> attributes = tag.getAttributes();
        Set<String> rejected = new HashSet<String>();
        for (Object key : attributes.keySet()) {
            String attrName = (String) key;
            if (!HtmlExtractUtils.isValidAttribute(attrName)) {
                rejected.add(attrName);
            }
        }
        for (String attrName : rejected) {
            tag.removeAttribute(attrName);
        }

        // Descend into this tag's own children.
        cleanInvalidAttributes(tag);
    }
}
}
createSubnodes(document, subelement, subTagNode.getChildren());