/**
 * Finds the index of the given node among the children of this tag.
 * Typically used together with digUpStringNode: once the string node is
 * known, its parent can be asked for the string node's position, which
 * avoids running findPositionOf(text) a second time.
 * Only the direct children are examined (no recursion) and nodes are
 * compared by identity, not by equals().
 * @param searchNode The child node to find.
 * @return The zero-based offset of the child or -1 if it was not found.
 */
public int findPositionOf(Node searchNode) {
    SimpleNodeIterator iterator = children();
    for (int index = 0; iterator.hasMoreNodes(); index++) {
        if (iterator.nextNode() == searchNode) {
            return index;
        }
    }
    return -1;
}
/**
 * Returns the position of a child node given the node object itself.
 * Usually called after digUpStringNode, when the string node's parent is
 * used to locate the string node; this is cheaper than repeating
 * findPositionOf(text).
 * The search is linear over the direct children only; nothing is
 * searched recursively. A child matches when it is the very same object.
 * @param searchNode The child node to find.
 * @return The zero-based offset of the child or -1 if it was not found.
 */
public int findPositionOf(Node searchNode) {
    int position = 0;
    SimpleNodeIterator childIterator = children();
    while (childIterator.hasMoreNodes()) {
        Node candidate = childIterator.nextNode();
        if (candidate == searchNode) {
            return position;
        }
        position++;
    }
    return -1;
}
// Select the h3 tags whose id attribute is "h3_" followed by num,
// then print the string text of each match.
AndFilter headingWithId = new AndFilter(
    new TagNameFilter("h3"),
    new HasAttributeFilter("id", "h3_" + num));
NodeList nodes = parser.extractAllNodesThatMatch(headingWithId);
for (SimpleNodeIterator headings = nodes.elements(); headings.hasMoreNodes();) {
    // Every match is cast to HeadingTag; a different node type would fail here.
    HeadingTag heading = (HeadingTag) headings.nextNode();
    System.out.println(heading.getStringText());
}
/**
 * Return the HTML code for the children of this tag.
 * The toHtml() output of every child is concatenated in iteration order.
 * @return A string with the HTML code for the contents of this tag;
 * the empty string when there are no children to iterate.
 */
public String getChildrenHTML() {
    // The buffer never leaves this method, so the unsynchronized
    // StringBuilder is sufficient (StringBuffer's locking buys nothing here).
    StringBuilder html = new StringBuilder();
    for (SimpleNodeIterator e = children(); e.hasMoreNodes();) {
        // Children are expected to be AbstractNode instances; the cast
        // fails with ClassCastException otherwise.
        AbstractNode child = (AbstractNode) e.nextNode();
        html.append(child.toHtml());
    }
    return html.toString();
}
/**
 * Builds the HTML for everything contained in this tag.
 * Each child is rendered with toHtml() and the results are joined in
 * iteration order.
 * @return A string with the HTML code for the contents of this tag.
 */
public String getChildrenHTML() {
    StringBuilder html = new StringBuilder();
    SimpleNodeIterator childIterator = children();
    while (childIterator.hasMoreNodes()) {
        // Children are treated as AbstractNode; the cast is kept as-is.
        AbstractNode child = (AbstractNode) childIterator.nextNode();
        String childHtml = child.toHtml();
        html.append(childHtml);
    }
    return html.toString();
}
/**
 * Return the textual contents of the children of this tag.
 * The toPlainTextString() result of every child is concatenated in
 * iteration order.
 * @return The 'browser' text contents of this tag; the empty string when
 * there are no children to iterate.
 */
public String toPlainTextString() {
    // Method-local buffer: no other thread can see it, so the
    // unsynchronized StringBuilder replaces StringBuffer.
    StringBuilder text = new StringBuilder();
    for (SimpleNodeIterator e = children(); e.hasMoreNodes();) {
        text.append(e.nextNode().toPlainTextString());
    }
    return text.toString();
}
/**
 * Collects the plain text of this tag by joining the plain text of each
 * of its children, in iteration order.
 * @return The 'browser' text contents of this tag.
 */
public String toPlainTextString() {
    StringBuilder plainText = new StringBuilder();
    SimpleNodeIterator childIterator = children();
    while (childIterator.hasMoreNodes()) {
        Node child = childIterator.nextNode();
        plainText.append(child.toPlainTextString());
    }
    return plainText.toString();
}
/** * Add the textual contents of the children of this node to the buffer. * @param sb The buffer to append to. */ protected void putChildrenInto(StringBuilder sb) { Node node; for (SimpleNodeIterator e = children (); e.hasMoreNodes ();) { node = e.nextNode (); // eliminate virtual tags // if (!(node.getStartPosition () == node.getEndPosition ())) sb.append (node.toHtml ()); } }
/** * @see org.htmlparser.nodes.TagNode#accept(org.htmlparser.visitors.NodeVisitor) */ public void accept(NodeVisitor visitor) { // be invisible but show the children (if they like visits) NodeList children = m_decorated.getChildren(); if (children == null) { return; } SimpleNodeIterator itChildren = children.elements(); while (itChildren.hasMoreNodes()) { itChildren.nextNode().accept(visitor); } }
/**
 * Finds the index of the first child whose plain text contains the given text.
 * The index can then be used to reach other children of the composite tag.
 * Both the search text and each child's plain text are converted to
 * uppercase with the supplied locale before comparing, so the match
 * ignores case.
 * @param text The text to search for.
 * @param locale The locale to use in converting to uppercase.
 * @return int The zero-based index in the children list of the node
 * containing the text or -1 if not found.
 */
public int findPositionOf (String text, Locale locale) {
    String needle = text.toUpperCase (locale);
    SimpleNodeIterator iterator = children ();
    for (int index = 0; iterator.hasMoreNodes (); index++) {
        String haystack = iterator.nextNode ().toPlainTextString ().toUpperCase (locale);
        if (haystack.indexOf (needle) >= 0) {
            return index;
        }
    }
    return -1;
}
/**
 * Returns the position of the first child node containing the given text.
 * Useful for indexing into the composite tag to reach neighbouring children.
 * The comparison is case-insensitive: the search text and the plain text
 * of every child are uppercased with the supplied locale first.
 * @param text The text to search for.
 * @param locale The locale to use in converting to uppercase.
 * @return int The zero-based node index in the children list of the node
 * containing the text or -1 if not found.
 */
public int findPositionOf (String text, Locale locale) {
    String upperText = text.toUpperCase (locale);
    int position = 0;
    SimpleNodeIterator childIterator = children ();
    while (childIterator.hasMoreNodes ()) {
        Node child = childIterator.nextNode ();
        String childText = child.toPlainTextString ().toUpperCase (locale);
        boolean containsText = childText.indexOf (upperText) != -1;
        if (containsText) {
            return position;
        }
        position++;
    }
    return -1;
}
/**
 * Searches the direct children for a tag whose NAME attribute equals the
 * given name and returns the first match.
 * Children that are not tags are skipped; the comparison is an exact,
 * case-sensitive equals().
 * @param name Attribute value to match in the tag.
 * @return Tag The first tag matching the name attribute, or
 * <code>null</code> if no child matches.
 */
public Tag searchByName(String name) {
    SimpleNodeIterator iterator = children();
    while (iterator.hasMoreNodes()) {
        Node child = iterator.nextNode();
        if (!(child instanceof Tag)) {
            continue;
        }
        Tag candidate = (Tag) child;
        String nameAttribute = candidate.getAttribute("NAME");
        if (nameAttribute != null && nameAttribute.equals(name)) {
            return candidate;
        }
    }
    return null;
}
/** * @see org.htmlparser.Tag#accept(org.htmlparser.visitors.NodeVisitor) */ public void accept(NodeVisitor visitor) { // be invisible but show the children (if they like visits) NodeList children = m_decorated.getChildren(); if (children == null) { return; } SimpleNodeIterator itChildren = children.elements(); while (itChildren.hasMoreNodes()) { itChildren.nextNode().accept(visitor); } }
/**
 * Looks through the direct children for the first tag carrying a NAME
 * attribute equal to the requested name.
 * Only children that are Tag instances are considered, and the attribute
 * value must match exactly (case-sensitive equals()).
 * @param name Attribute value to match in the tag.
 * @return Tag The first tag whose NAME attribute matches, or
 * <code>null</code> when none does.
 */
public Tag searchByName(String name) {
    for (SimpleNodeIterator childIterator = children(); childIterator.hasMoreNodes();) {
        Node current = childIterator.nextNode();
        if (current instanceof Tag) {
            Tag currentTag = (Tag) current;
            String tagName = currentTag.getAttribute("NAME");
            boolean matches = tagName != null && tagName.equals(name);
            if (matches) {
                return currentTag;
            }
        }
    }
    return null;
}
/** * @see org.htmlparser.Tag#accept(org.htmlparser.visitors.NodeVisitor) */ public void accept(NodeVisitor visitor) { // be invisible but show the children (if they like visits) NodeList children = m_decorated.getChildren(); if (children == null) { return; } SimpleNodeIterator itChildren = children.elements(); while (itChildren.hasMoreNodes()) { itChildren.nextNode().accept(visitor); } }
/**
 * Extracts the href values of all nodes in the page that match nodeFilter.
 * Each href is trimmed and the HTML entity <code>&amp;amp;</code> is decoded
 * to a plain ampersand so the value can be used as a URL.
 * Matching tags that carry no href attribute are skipped.
 * @param pageData The page whose parsed nodes are searched.
 * @return The extracted hrefs in match order; an empty list when the page
 * has no nodes. Never <code>null</code>.
 */
public List<String> extractLinks(PageData pageData) {
    NodeList nodes = pageData.getNodes();
    if (nodes == null) {
        return Collections.emptyList();
    }
    // NOTE(review): the boolean argument presumably requests a recursive
    // search through nested nodes - confirm against the NodeList API.
    NodeList linkNodes = nodes.extractAllNodesThatMatch(nodeFilter, true);
    List<String> links = new ArrayList<String>(linkNodes.size());
    for (SimpleNodeIterator it = linkNodes.elements(); it.hasMoreNodes();) {
        Tag tag = (Tag) it.nextNode();
        String href = tag.getAttribute("href");
        if (href == null) {
            // No href on this tag: nothing to extract, and dereferencing
            // it would throw a NullPointerException.
            continue;
        }
        // Literal replace (no regex needed); the previous "&" -> "&"
        // substitution had no effect.
        links.add(href.trim().replace("&amp;", "&"));
    }
    return links;
}
/**
 * Looks up the textarea tag of this form whose NAME attribute equals the
 * given name. The first match in the list returned by getFormTextareas()
 * wins; the comparison is an exact, case-sensitive equals().
 * @param name Name of the textarea tag to be found within the form.
 * @return The <code>TEXTAREA</code> tag with the matching name, or
 * <code>null</code> if there is none.
 */
public TextareaTag getTextAreaTag(String name) {
    SimpleNodeIterator iterator = getFormTextareas ().elements();
    while (iterator.hasMoreNodes()) {
        TextareaTag candidate = (TextareaTag) iterator.nextNode();
        String candidateName = candidate.getAttribute("NAME");
        if (candidateName != null && candidateName.equals(name)) {
            return candidate;
        }
    }
    return null;
}
// Parse the page, keeping only the IMG tags, and print the src attribute
// of each one.
Parser parser = new Parser("http://www.yahoo.com/");
TagNameFilter imageFilter = new TagNameFilter("IMG");
NodeList list = parser.parse(imageFilter);
for (SimpleNodeIterator images = list.elements(); images.hasMoreNodes();) {
    Tag image = (Tag) images.nextNode();
    String source = image.getAttribute("src");
    System.out.println(source);
}
/**
 * Gets a frame by name.
 * The requested name and each frame's name are converted to uppercase
 * with the supplied locale before being compared, so the lookup ignores
 * case. Entries of getFrames() that are not FrameTag instances are
 * skipped, and the first matching frame is returned.
 * @param name The name of the frame to retrieve.
 * @param locale The locale to use when converting to uppercase.
 * @return The specified frame or <code>null</code> if it wasn't found.
 */
public FrameTag getFrame (String name, Locale locale) {
    String wanted = name.toUpperCase (locale);
    SimpleNodeIterator iterator = getFrames ().elements ();
    while (iterator.hasMoreNodes ()) {
        Node candidate = iterator.nextNode ();
        if (candidate instanceof FrameTag) {
            FrameTag frame = (FrameTag) candidate;
            String frameName = frame.getFrameName ().toUpperCase (locale);
            if (frameName.equals (wanted)) {
                return frame;
            }
        }
    }
    return null;
}
/**
 * Extracts the title from the given HTML.
 * All nodes matching TITLE_FILTER are examined in order and the trimmed
 * title of the last one is kept.
 *
 * @param html The input handed to the Parser constructor.
 * @return The trimmed title of the last matching tag, or an empty string
 * when no node matches.
 * @throws ParserException declared because the parser calls may raise it.
 */
public static String extractTitle(String html) throws ParserException {
    NodeList titleNodes = new Parser(html).extractAllNodesThatMatch(TITLE_FILTER);
    String result = "";
    for (SimpleNodeIterator titles = titleNodes.elements(); titles.hasMoreNodes();) {
        TitleTag titleTag = (TitleTag) titles.nextNode();
        // Later matches overwrite earlier ones.
        result = titleTag.getTitle().trim();
    }
    return result;
}