/**
 * Cleans the given HTML with the cleaner returned by {@code getHtmlCleaner()} and picks one node
 * out of the resulting tree.
 *
 * <p>The root of the cleaned tree is returned when {@code tagName} is {@code null}, empty, or equal
 * (ignoring case) to the root node's name. Otherwise the tree is searched with
 * {@code getElementsByName(tagName, true)} and the first match is returned.
 *
 * @param html the HTML source to clean
 * @param tagName the name of the wanted element; {@code null} or empty selects the root node
 * @return the selected node, or {@code null} when the search finds no element named {@code tagName}
 * @throws RuntimeException wrapping any exception raised while cleaning or searching
 */
private static TagNode getTargetTagNode(String html, String tagName) {
    try {
        final TagNode root = getHtmlCleaner().clean(html);

        // No specific tag requested, or the root itself is the requested tag.
        final boolean wantsRoot = tagName == null
                || tagName.length() == 0
                || tagName.equalsIgnoreCase(root.getName());
        if (wantsRoot) {
            return root;
        }

        // NOTE(review): the 'true' flag presumably requests a recursive search - confirm against
        // the HtmlCleaner API.
        final TagNode[] matches = root.getElementsByName(tagName, true);
        return matches.length > 0 ? matches[0] : null;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
/**
 * There's a known limitation (bug?) in HTML Cleaner where, if an XML declaration is specified, it'll be copied as
 * the first element of the body. Thus remove it if it's there. See
 * https://sourceforge.net/forum/message.php?msg_id=4657800 and
 * https://sourceforge.net/tracker/index.php?func=detail&aid=2688635&group_id=183053&atid=903696
 *
 * <p>The node is left untouched when it has no "body" element among the results of
 * {@code getElementsByName("body", false)}, when the body has no children, or when the first child
 * of the body is not a {@code ContentToken} whose content starts with {@code "<?xml"}.
 *
 * @param cleanedNode the cleaned node (ie after the HTML cleaning)
 */
private void fixCleanedNodeBug(TagNode cleanedNode) {
    TagNode[] bodyNodes = cleanedNode.getElementsByName("body", false);
    if (bodyNodes.length == 0) {
        // No body element found: nothing to fix, and indexing [0] would throw.
        return;
    }

    TagNode body = bodyNodes[0];
    if (body.getChildren().size() > 0) {
        Object firstBodyChild = body.getChildren().get(0);
        // instanceof is false for null, so no separate null check is needed.
        if (firstBodyChild instanceof ContentToken) {
            ContentToken token = (ContentToken) firstBodyChild;
            if (token.getContent().startsWith("<?xml")) {
                body.removeChild(token);
            }
        }
    }
}
}
TagNode[] metaData = pageData.getElementsByName("meta", true); for (TagNode metaElement : metaData)
final TagNode rootNode = htmlCleaner.clean(html); final TagNode[] anchorTags = rootNode.getElementsByName("a", true); final TagNode[] imageTags = rootNode.getElementsByName("img", true); final TagNode[] targetNodes = rootNode.getElementsByName("body", false); if (targetNodes.length > 0) { TagNode bodyNode = targetNodes[0];