/**
 * Converts a list of start tags into the list of their elements, keeping only those
 * elements whose <code>end</code> does not exceed this object's <code>end</code>.
 *
 * @param startTags the start tags whose elements are to be collected, must not be <code>null</code>.
 * @return the qualifying elements in the same order as their start tags, or an empty list if <code>startTags</code> is empty.
 */
private List<Element> getAllElements(final List<StartTag> startTags) {
    if (startTags.isEmpty()) {
        return Collections.emptyList();
    }
    // Upper bound on the result size: at most one element per start tag.
    final ArrayList<Element> enclosedElements=new ArrayList<Element>(startTags.size());
    for (final StartTag candidateTag : startTags) {
        final Element candidate=candidateTag.getElement();
        // Skip elements that run past the end of this object.
        if (candidate.end>end) {
            continue;
        }
        enclosedElements.add(candidate);
    }
    return enclosedElements;
}
/**
 * Returns the {@link FormControl} defined by this start tag.
 * <p>
 * The result is obtained by asking the {@linkplain #getElement() element} of this start tag for its
 * {@linkplain Element#getFormControl() form control}.
 *
 * @return the {@link FormControl} defined by this start tag, or <code>null</code> if it is not a <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#form-controls">control</a>.
 */
public FormControl getFormControl() {
    final Element owningElement=getElement();
    return owningElement.getFormControl();
}
/**
 * Finds the first {@link Element} that begins at or after the specified position in the source document.
 * <p>
 * The search is delegated to {@link #getNextStartTag(int) getNextStartTag(pos)}; if a start tag is found,
 * its {@link StartTag#getElement() element} is returned.
 *
 * @param pos the position in the source document from which to start the search, may be out of bounds.
 * @return the {@link Element} beginning at or immediately following the specified position, or <code>null</code> if none exists or the specified position is out of bounds.
 */
public Element getNextElement(final int pos) {
    final StartTag nextStartTag=getNextStartTag(pos);
    if (nextStartTag==null) {
        return null;
    }
    return nextStartTag.getElement();
}
/**
 * Finds the first {@link Element} carrying the specified attribute name/value pair that begins at or after
 * the specified position in the source document.
 * <p>
 * The search is delegated to
 * {@link #getNextStartTag(int,String,String,boolean) getNextStartTag(pos,attributeName,value,valueCaseSensitive)};
 * if a start tag is found, its {@link StartTag#getElement() element} is returned.
 *
 * @param pos the position in the source document from which to start the search, may be out of bounds.
 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param value the value of the specified attribute to search for, must not be <code>null</code>.
 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
 * @return the matching {@link Element} beginning at or immediately following the specified position, or <code>null</code> if none exists or the specified position is out of bounds.
 * @see #getNextElement(int pos, String attributeName, Pattern valueRegexPattern)
 */
public Element getNextElement(final int pos, final String attributeName, final String value, final boolean valueCaseSensitive) {
    final StartTag matchingStartTag=getNextStartTag(pos,attributeName,value,valueCaseSensitive);
    if (matchingStartTag==null) {
        return null;
    }
    return matchingStartTag.getElement();
}
/**
 * Finds the first {@link Element} with the specified class that begins at or after the specified position
 * in the source document.
 * <p>
 * An element matches when its <code>class</code> attribute contains the specified class name, either as the
 * whole attribute value or as one of several class names separated by white space.
 * <p>
 * The search is delegated to {@link #getNextStartTagByClass(int,String) getNextStartTagByClass(pos,className)};
 * if a start tag is found, its {@link StartTag#getElement() element} is returned.
 *
 * @param pos the position in the source document from which to start the search, may be out of bounds.
 * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
 * @return the matching {@link Element} beginning at or immediately following the specified position, or <code>null</code> if none exists or the specified position is out of bounds.
 */
public Element getNextElementByClass(final int pos, final String className) {
    final StartTag matchingStartTag=getNextStartTagByClass(pos,className);
    if (matchingStartTag==null) {
        return null;
    }
    return matchingStartTag.getElement();
}
/**
 * Returns a list of all {@linkplain Element elements} in this source document.
 * <p>
 * Calling this method on the <code>Source</code> object performs a {@linkplain #fullSequentialParse() full sequential parse} automatically.
 * <p>
 * There is exactly one element in the result for each start tag returned by {@link #getAllStartTags()}, in the same order.
 * The list is built on the first call that finds at least one start tag and is reused by later calls.
 *
 * @return a list of all {@linkplain Element elements} in this source document.
 */
public List<Element> getAllElements() {
    // Reuse the list built by an earlier call, if any.
    if (allElements!=null) {
        return allElements;
    }
    final List<StartTag> startTags=getAllStartTags();
    // An empty result is returned directly and is not stored in the cache field.
    if (startTags.isEmpty()) {
        return Collections.emptyList();
    }
    allElements=new ArrayList<Element>(startTags.size());
    for (final StartTag tag : startTags) {
        allElements.add(tag.getElement());
    }
    return allElements;
}
/**
 * Finds the first {@link Element} with the specified attribute name and value pattern that begins at or after
 * the specified position in the source document.
 * <p>
 * Passing <code>null</code> as <code>valueRegexPattern</code> searches on the attribute name only, without regard
 * to the attribute value. This also matches an attribute that {@linkplain Attribute#hasValue() has no value} at all.
 * <p>
 * The search is delegated to
 * {@link #getNextStartTag(int,String,Pattern) getNextStartTag(pos,attributeName,valueRegexPattern)};
 * if a start tag is found, its {@link StartTag#getElement() element} is returned.
 *
 * @param pos the position in the source document from which to start the search, may be out of bounds.
 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
 * @return the matching {@link Element} beginning at or immediately following the specified position, or <code>null</code> if none exists or the specified position is out of bounds.
 * @see #getNextElement(int pos, String attributeName, String value, boolean valueCaseSensitive)
 */
public Element getNextElement(final int pos, final String attributeName, final Pattern valueRegexPattern) {
    final StartTag matchingStartTag=getNextStartTag(pos,attributeName,valueRegexPattern);
    if (matchingStartTag==null) {
        return null;
    }
    return matchingStartTag.getElement();
}
/**
 * Finds the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified
 * {@linkplain Element#getName() name} that begins at or after the specified position in the source document.
 * <p>
 * The search is delegated to {@link #getNextStartTag(int,String) getNextStartTag(pos,name)};
 * if a start tag is found, its {@link StartTag#getElement() element} is returned.
 * <p>
 * Passing <code>null</code> as <code>name</code> is equivalent to {@link #getNextElement(int) getNextElement(pos)}.
 * <p>
 * A <code>name</code> ending in a colon (<code>:</code>) searches for all elements in the specified XML namespace.
 * <p>
 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
 *
 * @param pos the position in the source document from which to start the search, may be out of bounds.
 * @param name the {@linkplain Element#getName() name} of the element to search for.
 * @return the matching {@linkplain StartTagType#NORMAL normal} {@link Element} beginning at or immediately following the specified position, or <code>null</code> if none exists or the specified position is out of bounds.
 */
public Element getNextElement(final int pos, String name) {
    final StartTag namedStartTag=getNextStartTag(pos,name);
    if (namedStartTag==null) {
        return null;
    }
    return namedStartTag.getElement();
}
/**
 * Returns the most nested {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified
 * {@linkplain Element#getName() name} that {@linkplain Segment#encloses(int) encloses} the specified position
 * in the source document.
 * <p>
 * The search walks backwards through the {@linkplain StartTagType#NORMAL normal} start tags with the given name,
 * starting at <code>pos</code>, and returns the element of the first one whose end lies after <code>pos</code>.
 * The position may therefore be anywhere inside the {@linkplain Element#getStartTag() start tag},
 * {@linkplain Element#getEndTag() end tag}, or {@linkplain Element#getContent() content} of the returned element.
 * <p>
 * The name is converted to lower case before searching. A <code>null</code> name is passed through to the start tag search unchanged.
 * <p>
 * See the {@link Tag} class documentation for more details about the behaviour of this method.
 * <p>
 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
 *
 * @param pos the position in the source document, may be out of bounds.
 * @param name the {@linkplain Element#getName() name} of the element to search for.
 * @return the most nested matching {@link Element} that encloses the specified position, or <code>null</code> if none exists or the specified position is out of bounds.
 */
public Element getEnclosingElement(final int pos, String name) {
    if (name!=null) {
        name=name.toLowerCase();
    }
    final boolean isXMLTagName=Tag.isXMLName(name);
    int searchBefore=pos;
    StartTag candidate;
    while ((candidate=StartTag.getPrevious(this,searchBefore,name,StartTagType.NORMAL,isXMLTagName))!=null) {
        final Element candidateElement=candidate.getElement();
        if (pos<candidateElement.end) {
            return candidateElement;
        }
        // This element ends at or before pos, so continue the search before its start tag.
        searchBefore=candidate.begin-1;
    }
    return null;
}
private EndTag getOptionalEndTag(final HTMLElementTerminatingTagNameSets terminatingTagNameSets) { int pos=end; while (pos<source.end) { final Tag tag=Tag.getNextTag(source,pos); if (tag==null) break; Set<String> terminatingTagNameSet; if (tag instanceof EndTag) { if (tag.name==name) return (EndTag)tag; terminatingTagNameSet=terminatingTagNameSets.TerminatingEndTagNameSet; } else { terminatingTagNameSet=terminatingTagNameSets.NonterminatingElementNameSet; if (terminatingTagNameSet!=null && terminatingTagNameSet.contains(tag.name)) { Element nonterminatingElement=((StartTag)tag).getElement(); pos=nonterminatingElement.end; continue; } terminatingTagNameSet=terminatingTagNameSets.TerminatingStartTagNameSet; } if (terminatingTagNameSet!=null && terminatingTagNameSet.contains(tag.name)) return new EndTag(source,tag.begin,tag.begin,EndTagType.NORMAL,name); pos=tag.begin+1; } // Ran out of tags. The only legitimate case of this happening is if the HTML end tag is missing, in which case the end of the element is the end of the source document return new EndTag(source,source.end,source.end,EndTagType.NORMAL,name); }
private CharSequence getStartTagHTML(StartTag startTag) { // tidies and filters out non-approved attributes StringBuilder sb = new StringBuilder(); sb.append('<').append(startTag.getName()); for (Attribute attribute : startTag.getAttributes()) { if (allowedAttributes.contains(attribute.getKey().toLowerCase())) { sb.append(' ').append(attribute.getName()); if (attribute.getValue() != null) { sb.append("=\""); sb.append(CharacterReference.encode(attribute.getValue())); sb.append('"'); } } } if (startTag.getElement().getEndTag() == null && !HTMLElements.getEndTagOptionalElementNames().contains(startTag.getName())) { sb.append(" /"); } sb.append('>'); return sb; }
/** * Returns the {@linkplain Element element} that is ended by this end tag. * <p> * Returns <code>null</code> if this end tag is not properly matched to any {@linkplain StartTag start tag} in the source document. * <p> * This method is much less efficient than the {@link StartTag#getElement()} method. * <p> * IMPLEMENTATION NOTE: The explanation for why this method is relatively inefficient lies in the fact that more than one * {@linkplain StartTagType start tag type} can have the same * {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}, so it is not possible to know for certain * which type of start tag this end tag is matched to (see {@link EndTagType#getCorrespondingStartTagType()} for more explanation). * Because of this uncertainty, the implementation of this method must check every start tag preceding this end tag, calling its * {@link StartTag#getElement()} method to see whether it is terminated by this end tag. * * @return the {@linkplain Element element} that is ended by this end tag. */ public Element getElement() { if (element!=Element.NOT_CACHED) return element; int pos=begin; while (pos!=0) { StartTag startTag=source.getPreviousStartTag(pos-1); if (startTag==null) break; Element foundElement=startTag.getElement(); // this automatically sets foundElement.getEndTag().element cache if (foundElement.getEndTag()==this) return foundElement; // no need to set element as it was already done in previous statement pos=startTag.begin; } return element=null; }
/**
 * Creates a <code>ParserTag</code> describing the element that the given start tag belongs to.
 * <p>
 * The name and attributes are taken from the start tag. The begin/end positions are taken from the element,
 * and the start-tag positions from the element's start tag. When the element has an end tag, its positions
 * are recorded as the end-tag positions; otherwise the start tag's positions are used for the end-tag
 * positions as well.
 *
 * @param tag the start tag of the element to describe, must not be <code>null</code>.
 */
public ParserTag(StartTag tag) {
    setName(tag.getName());
    // The original assigned these two the wrong way round (begin received the element's end and vice versa).
    setBegin(tag.getElement().getBegin());
    setEnd(tag.getElement().getEnd());
    setStartTagBegin(tag.getElement().getStartTag().getBegin());
    setStartTagEnd(tag.getElement().getStartTag().getEnd());
    if (tag.getElement().getEndTag() != null) {
        setEndTagBegin(tag.getElement().getEndTag().getBegin());
        setEndTagEnd(tag.getElement().getEndTag().getEnd());
    } else {
        // No end tag: fall back to the start tag's positions.
        setEndTagBegin(tag.getElement().getStartTag().getBegin());
        setEndTagEnd(tag.getElement().getStartTag().getEnd());
    }
    setAttributes(tag.getAttributes());
}
/**
 * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>,
 * except that the search stops at the first enclosed element and <code>null</code> is returned if no such element exists.
 * <p>
 * If this segment is itself an {@link Element}, this element is returned, not the first child element.
 *
 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final Element getFirstElement() {
    for (StartTag candidate=checkEnclosure(StartTag.getNext(source,begin)); candidate!=null; candidate=checkEnclosure(candidate.getNextStartTag())) {
        final Element candidateElement=candidate.getElement();
        // The start tag lies inside this segment; accept the element only if it also ends inside it.
        if (candidateElement.end<=end) {
            return candidateElement;
        }
    }
    return null;
}
/**
 * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.get(0)</code>,
 * except that the search stops at the first enclosed element and <code>null</code> is returned if no such element exists.
 * <p>
 * If this segment is itself an {@link Element} with the specified class, this element is returned.
 *
 * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final Element getFirstElementByClass(final String className) {
    for (StartTag candidate=checkEnclosure(source.getNextStartTagByClass(begin,className)); candidate!=null; candidate=checkEnclosure(source.getNextStartTagByClass(candidate.begin+1,className))) {
        final Element candidateElement=candidate.getElement();
        // The start tag lies inside this segment; accept the element only if it also ends inside it.
        if (candidateElement.end<=end) {
            return candidateElement;
        }
    }
    return null;
}
/**
 * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
 * except that the search stops at the first enclosed element and <code>null</code> is returned if no such element exists.
 * <p>
 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
 *
 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
 * @return the first matching {@link Element} that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
 */
public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) {
    for (StartTag candidate=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern)); candidate!=null; candidate=checkEnclosure(source.getNextStartTag(candidate.begin+1,attributeName,valueRegexPattern))) {
        final Element candidateElement=candidate.getElement();
        // The start tag lies inside this segment; accept the element only if it also ends inside it.
        if (candidateElement.end<=end) {
            return candidateElement;
        }
    }
    return null;
}
/**
 * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
 * except that the search stops at the first enclosed element and <code>null</code> is returned if no such element exists.
 * <p>
 * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
 *
 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
 * @param value the value of the specified attribute to search for, must not be <code>null</code>.
 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
 * @return the first matching {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
 */
public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) {
    for (StartTag candidate=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive)); candidate!=null; candidate=checkEnclosure(source.getNextStartTag(candidate.begin+1,attributeName,value,valueCaseSensitive))) {
        final Element candidateElement=candidate.getElement();
        // The start tag lies inside this segment; accept the element only if it also ends inside it.
        if (candidateElement.end<=end) {
            return candidateElement;
        }
    }
    return null;
}
/**
 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
 * <p>
 * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>,
 * except that the search stops at the first enclosed element and <code>null</code> is returned if no such element exists.
 * <p>
 * Passing <code>null</code> as <code>name</code> is equivalent to calling {@link #getFirstElement()}.
 * Otherwise the name is converted to lower case before searching.
 * <p>
 * If this segment is itself an {@link Element} with the specified name, this element is returned.
 *
 * @param name the {@linkplain Element#getName() name} of the element to search for.
 * @return the first matching {@linkplain StartTagType#NORMAL normal} {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
 */
public final Element getFirstElement(String name) {
    if (name==null) {
        return getFirstElement();
    }
    // XML-name validity is determined from the name as supplied, before it is lower-cased.
    final boolean isXMLTagName=Tag.isXMLName(name);
    name=name.toLowerCase();
    for (StartTag candidate=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName)); candidate!=null; candidate=checkEnclosure(StartTag.getNext(source,candidate.begin+1,name,StartTagType.NORMAL,isXMLTagName))) {
        final Element candidateElement=candidate.getElement();
        // The start tag lies inside this segment; accept the element only if it also ends inside it.
        if (candidateElement.end<=end) {
            return candidateElement;
        }
    }
    return null;
}
List<StartTag> tags = sourceHtml.getAllStartTags(FORMULA_TAG_NAME); for (StartTag tag : tags) { EndTag endTag = tag.getElement().getEndTag(); if (endTag == null) { logger.warn("Formula element without end tag in " + source); continue; for (StartTag texTag : tag.getElement().getContent().getAllStartTags(TEX_TAG_NAME)) { Element texElement = texTag.getElement(); if (texElement.getEndTag() == null) { logger.warn("Tex element without end tag in " + source);
final StartTag startTag=(StartTag)tag; if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE || excludeElement(startTag) || (excludeNonHTMLElements && !HTMLElements.getElementNames().contains(tag.name))) { nodeIterator.skipToPos(startTag.getElement().getEnd()); continue;