/**
 * Creates a {@link BoilerpipeHTMLParser} backed by a freshly constructed
 * {@link BoilerpipeHTMLContentHandler}.
 */
public BoilerpipeHTMLParser() {
    // Delegate to the handler-accepting constructor with a new handler instance.
    this(new BoilerpipeHTMLContentHandler());
}
/**
 * Start-of-element callback: asks the handler to add whitespace if it
 * considers that necessary. The name and attribute arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @param atts      attributes supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName, final Attributes atts) {
    instance.addWhitespaceIfNecessary();
    return false;
}
/** * Returns a {@link TextDocument} containing the extracted {@link TextBlock} * s. NOTE: Only call this after parsing. * * @return The {@link TextDocument} */ public TextDocument toTextDocument() { // just to be sure flushBlock(); return new TextDocument(getTitle(), getTextBlocks()); }
/**
 * Start-of-element callback: lets the handler add whitespace if it considers
 * that necessary, then registers {@code action} with the handler as a label
 * action. The name and attribute arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @param atts      attributes supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName, final Attributes atts) {
    // Order matters: whitespace handling runs before the label action is added.
    instance.addWhitespaceIfNecessary();
    instance.addLabelAction(action);
    return false;
}
/*
 * NOTE(review): only an excerpt of flushBlock() is visible on this line. Its
 * braces are unbalanced and `token` / `tb` are not declared in the visible
 * text, so the complete control flow cannot be confirmed from here.
 *
 * What the visible text shows: inside the `inBody == 0` check, when the last
 * start tag equals "TITLE" (ignoring case), the trimmed contents of
 * tokenBuffer are passed to setTitle().
 *
 * NOTE(review): the inner `&& inBody == 0` repeats the enclosing condition
 * and looks redundant -- confirm against the full method.
 */
void flushBlock() { if (inBody == 0) { if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) { setTitle(tokenBuffer.toString().trim()); } else if (ANCHOR_TEXT_END.equals(token)) { inAnchorText = false; } else if (isWord(token)) { numTokens++; numWords++; addTextBlock(tb);
/**
 * End-of-element callback: flushes the handler's current block and then
 * decrements its {@code inBody} counter. The name arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean end(final BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName) {
    // Flush first, so the block is emitted before the counter changes.
    instance.flushBlock();
    instance.inBody -= 1;
    return false;
}
/**
 * Returns the {@link TextDocument} produced by the underlying content
 * handler, containing the extracted {@link TextBlock}s.
 * NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
 *
 * @return The {@link TextDocument}
 */
public TextDocument toTextDocument() {
    final TextDocument document = contentHandler.toTextDocument();
    return document;
}
} // closes a scope opened before this line
/**
 * Start-of-element callback: registers {@code action} with the handler as a
 * label action. The name and attribute arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @param atts      attributes supplied by the caller (unused)
 * @return always {@code true}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName, final Attributes atts) {
    instance.addLabelAction(action);
    return true;
}
/*
 * NOTE(review): only an excerpt of flushBlock() is visible on this line. Its
 * braces are unbalanced and `token` / `tb` are not declared in the visible
 * text, so the complete control flow cannot be confirmed from here.
 *
 * What the visible text shows: inside the `inBody == 0` check, when the last
 * start tag equals "TITLE" (ignoring case), the trimmed contents of
 * tokenBuffer are passed to setTitle(). The excerpt ends by assigning -1 to
 * blockTagLevel right after addTextBlock(tb).
 *
 * NOTE(review): the inner `&& inBody == 0` repeats the enclosing condition
 * and looks redundant -- confirm against the full method.
 */
public void flushBlock() { if (inBody == 0) { if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) { setTitle(tokenBuffer.toString().trim()); } else if (ANCHOR_TEXT_END.equals(token)) { inAnchorText = false; } else if (isWord(token)) { numTokens++; numWords++; addTextBlock(tb); blockTagLevel = -1;
/**
 * End-of-element callback: flushes the handler's current block and then
 * decrements its {@code inBody} counter. The name arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean end(final BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName) {
    // Flush first, so the block is emitted before the counter changes.
    instance.flushBlock();
    instance.inBody -= 1;
    return false;
}
}; // closes a declaration opened before this line
/**
 * Start-of-element callback: lets the handler add whitespace if it considers
 * that necessary, then registers {@code action} with the handler as a label
 * action. The name and attribute arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @param atts      attributes supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName, final Attributes atts) {
    // Order matters: whitespace handling runs before the label action is added.
    instance.addWhitespaceIfNecessary();
    instance.addLabelAction(action);
    return false;
}
/**
 * Returns the {@link TextDocument} produced by the underlying content
 * handler, containing the extracted {@link TextBlock}s.
 * NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
 *
 * @return The {@link TextDocument}
 */
public TextDocument toTextDocument() {
    final TextDocument document = contentHandler.toTextDocument();
    return document;
}
} // closes a scope opened before this line
/**
 * Start-of-element callback: registers {@code action} with the handler as a
 * label action. The name and attribute arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @param atts      attributes supplied by the caller (unused)
 * @return always {@code true}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName, final Attributes atts) {
    instance.addLabelAction(action);
    return true;
}
/** * Returns a {@link TextDocument} containing the extracted {@link TextBlock} * s. NOTE: Only call this after parsing. * * @return The {@link TextDocument} */ public TextDocument toTextDocument() { // just to be sure flushBlock(); return new TextDocument(getTitle(), getTextBlocks()); }
/*
 * NOTE(review): only an excerpt of flushBlock() is visible on this line. Its
 * braces are unbalanced and `token` / `tb` are not declared in the visible
 * text, so the complete control flow cannot be confirmed from here.
 *
 * What the visible text shows: inside the `inBody == 0` check, when the last
 * start tag equals "TITLE" (ignoring case), the trimmed contents of
 * tokenBuffer are passed to setTitle(). The excerpt ends by assigning -1 to
 * blockTagLevel right after addTextBlock(tb).
 *
 * NOTE(review): the inner `&& inBody == 0` repeats the enclosing condition
 * and looks redundant -- confirm against the full method.
 */
public void flushBlock() { if (inBody == 0) { if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) { setTitle(tokenBuffer.toString().trim()); } else if (ANCHOR_TEXT_END.equals(token)) { inAnchorText = false; } else if (isWord(token)) { numTokens++; numWords++; addTextBlock(tb); blockTagLevel = -1;
/**
 * End-of-element callback: asks the handler to add whitespace if it
 * considers that necessary. The name arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean end(BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName) {
    instance.addWhitespaceIfNecessary();
    return false;
}
/**
 * Start-of-element callback: flushes the handler's current block and then
 * increments its {@code inBody} counter. The name and attribute arguments
 * are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @param atts      attributes supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean start(final BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName, final Attributes atts) {
    // Flush first, so the block is emitted before the counter changes.
    instance.flushBlock();
    instance.inBody += 1;
    return false;
}
/**
 * Start-of-element callback: lets the handler add whitespace if it considers
 * that necessary, then registers {@code action} with the handler as a label
 * action. The name and attribute arguments are not used.
 *
 * @param instance  the content handler to act on
 * @param localName local name supplied by the caller (unused)
 * @param qName     qualified name supplied by the caller (unused)
 * @param atts      attributes supplied by the caller (unused)
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance, final String localName,
        final String qName, final Attributes atts) {
    // Order matters: whitespace handling runs before the label action is added.
    instance.addWhitespaceIfNecessary();
    instance.addLabelAction(action);
    return false;
}
/**
 * Creates a {@link BoilerpipeHTMLParser} backed by a freshly constructed
 * {@link BoilerpipeHTMLContentHandler}.
 */
public BoilerpipeHTMLParser() {
    // Delegate to the handler-accepting constructor with a new handler instance.
    this(new BoilerpipeHTMLContentHandler());
}
/**
 * Returns the {@link TextDocument} produced by the underlying content
 * handler, containing the extracted {@link TextBlock}s.
 * NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
 *
 * @return The {@link TextDocument}
 */
public TextDocument toTextDocument() {
    final TextDocument document = contentHandler.toTextDocument();
    return document;
}
} // closes a scope opened before this line