de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler java code examples

/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler.
 * <p>
 * Delegates to another constructor of this class, passing it a
 * {@link BoilerpipeHTMLContentHandler} freshly created with that class's
 * no-argument constructor.
 */
public BoilerpipeHTMLParser() {
  this(new BoilerpipeHTMLContentHandler());
}

/**
 * Asks the given handler to add whitespace if necessary and does nothing else.
 * Presumably invoked when an opening tag is encountered (inferred from the
 * method name and the {@code Attributes} parameter) -- confirm against the caller.
 *
 * @param instance  the content handler that is asked to add whitespace
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @param atts      not read by this implementation
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  return false;
}

/**
 * Flushes the current block and then wraps this handler's title and text
 * blocks in a newly created {@link TextDocument}. NOTE: Only call this
 * after parsing.
 *
 * @return the newly created {@link TextDocument}
 */
public TextDocument toTextDocument() {
  // Defensive flush before the document is assembled.
  flushBlock();
  final TextDocument document = new TextDocument(getTitle(), getTextBlocks());
  return document;
}

/**
 * Asks the given handler to add whitespace if necessary, then passes
 * {@code action} to the handler's {@code addLabelAction}. {@code action} is
 * declared outside this excerpt (presumably a field or captured variable of
 * the enclosing class -- confirm).
 *
 * @param instance  the content handler both calls are made on
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @param atts      not read by this implementation
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  instance.addLabelAction(action);
  return false;
}

void flushBlock() {
  if (inBody == 0) {
    if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) {
      setTitle(tokenBuffer.toString().trim());
    } else if (ANCHOR_TEXT_END.equals(token)) {
      inAnchorText = false;
    } else if (isWord(token)) {
      numTokens++;
      numWords++;
  addTextBlock(tb);

/**
 * Flushes the handler's current block and then lowers its {@code inBody}
 * counter by one.
 *
 * @param instance  the content handler whose block is flushed and whose
 *                  {@code inBody} counter is decremented
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @return always {@code false}
 */
public boolean end(final BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName) {
  // Flush first: the flush runs while the counter still holds its old value.
  instance.flushBlock();
  instance.inBody -= 1;
  return false;
}

  /**
   * Hands back the {@link TextDocument} obtained from this parser's content
   * handler. NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
   *
   * @return the {@link TextDocument} returned by the content handler
   */
  public TextDocument toTextDocument() {
    final TextDocument document = contentHandler.toTextDocument();
    return document;
  }
}

/**
 * Passes {@code action} to the given handler's {@code addLabelAction}.
 * {@code action} is declared outside this excerpt (presumably a field or
 * captured variable of the enclosing class -- confirm).
 *
 * @param instance  the content handler that receives the label action
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @param atts      not read by this implementation
 * @return always {@code true}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addLabelAction(action);
  return true;
}

public void flushBlock() {
  if (inBody == 0) {
    if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) {
      setTitle(tokenBuffer.toString().trim());
    } else if (ANCHOR_TEXT_END.equals(token)) {
      inAnchorText = false;
    } else if (isWord(token)) {
      numTokens++;
      numWords++;
  addTextBlock(tb);
  blockTagLevel = -1;

  /**
   * Flushes the handler's current block and then lowers its {@code inBody}
   * counter by one.
   *
   * @param instance  the content handler whose block is flushed and whose
   *                  {@code inBody} counter is decremented
   * @param localName not read by this implementation
   * @param qName     not read by this implementation
   * @return always {@code false}
   */
  public boolean end(final BoilerpipeHTMLContentHandler instance,
      final String localName, final String qName) {
    // Flush first: the flush runs while the counter still holds its old value.
    instance.flushBlock();
    instance.inBody -= 1;
    return false;
  }
};

/**
 * Asks the given handler to add whitespace if necessary, then passes
 * {@code action} to the handler's {@code addLabelAction}. {@code action} is
 * declared outside this excerpt (presumably a field or captured variable of
 * the enclosing class -- confirm).
 *
 * @param instance  the content handler both calls are made on
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @param atts      not read by this implementation
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  instance.addLabelAction(action);
  return false;
}

  /**
   * Hands back the {@link TextDocument} obtained from this parser's content
   * handler. NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
   *
   * @return the {@link TextDocument} returned by the content handler
   */
  public TextDocument toTextDocument() {
    final TextDocument document = contentHandler.toTextDocument();
    return document;
  }
}

/**
 * Passes {@code action} to the given handler's {@code addLabelAction}.
 * {@code action} is declared outside this excerpt (presumably a field or
 * captured variable of the enclosing class -- confirm).
 *
 * @param instance  the content handler that receives the label action
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @param atts      not read by this implementation
 * @return always {@code true}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addLabelAction(action);
  return true;
}

/**
 * Flushes the current block and then wraps this handler's title and text
 * blocks in a newly created {@link TextDocument}. NOTE: Only call this
 * after parsing.
 *
 * @return the newly created {@link TextDocument}
 */
public TextDocument toTextDocument() {
  // Defensive flush before the document is assembled.
  flushBlock();
  final TextDocument document = new TextDocument(getTitle(), getTextBlocks());
  return document;
}

public void flushBlock() {
  if (inBody == 0) {
    if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) {
      setTitle(tokenBuffer.toString().trim());
    } else if (ANCHOR_TEXT_END.equals(token)) {
      inAnchorText = false;
    } else if (isWord(token)) {
      numTokens++;
      numWords++;
  addTextBlock(tb);
  blockTagLevel = -1;

/**
 * Asks the given handler to add whitespace if necessary and does nothing else.
 * Presumably invoked when a closing tag is encountered (inferred from the
 * method name) -- confirm against the caller.
 *
 * @param instance  the content handler that is asked to add whitespace
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @return always {@code false}
 */
public boolean end(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName) {
  instance.addWhitespaceIfNecessary();
  return false;
}

/**
 * Flushes the handler's current block and then raises its {@code inBody}
 * counter by one.
 *
 * @param instance  the content handler whose block is flushed and whose
 *                  {@code inBody} counter is incremented
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @param atts      not read by this implementation
 * @return always {@code false}
 */
public boolean start(final BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  // Flush first: the flush runs while the counter still holds its old value.
  instance.flushBlock();
  instance.inBody += 1;
  return false;
}

/**
 * Asks the given handler to add whitespace if necessary, then passes
 * {@code action} to the handler's {@code addLabelAction}. {@code action} is
 * declared outside this excerpt (presumably a field or captured variable of
 * the enclosing class -- confirm).
 *
 * @param instance  the content handler both calls are made on
 * @param localName not read by this implementation
 * @param qName     not read by this implementation
 * @param atts      not read by this implementation
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  instance.addLabelAction(action);
  return false;
}

/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler.
 * <p>
 * Delegates to another constructor of this class, passing it a
 * {@link BoilerpipeHTMLContentHandler} freshly created with that class's
 * no-argument constructor.
 */
public BoilerpipeHTMLParser() {
  this(new BoilerpipeHTMLContentHandler());
}

  /**
   * Hands back the {@link TextDocument} obtained from this parser's content
   * handler. NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
   *
   * @return the {@link TextDocument} returned by the content handler
   */
  public TextDocument toTextDocument() {
    final TextDocument document = contentHandler.toTextDocument();
    return document;
  }
}

Javadoc

A simple SAX ContentHandler, used by BoilerpipeSAXInput. Can be used by different parser implementations, e.g. NekoHTML and TagSoup.

Most used methods

<init>
Constructs a BoilerpipeHTMLContentHandler using the given TagActionMap.
addTextBlock
addWhitespaceIfNecessary
flushBlock
getTextBlocks
getTitle
isWord
setTitle
toTextDocument
Returns a TextDocument containing the extracted TextBlocks. NOTE: Only call this after parsing.
addLabelAction
characters
endDocument

Popular in Java

Reading from database using SQL prepared statement
getExternalFilesDir (Context)
setContentView (Activity)
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided supplier.
HttpURLConnection (java.net)
A URLConnection for HTTP (RFC 2616, http://tools.ietf.org/html/rfc2616) used to send and receive data over the web.
UnknownHostException (java.net)
Thrown when a hostname can not be resolved.
Comparator (java.util)
A Comparator is used to compare two objects to determine their ordering with respect to each other.
Enumeration (java.util)
A legacy iteration interface. New code should use Iterator instead. Iterator replaces the enumeration interface and adds a way to remove elements from a collection.
HttpServletRequest (javax.servlet.http)
Extends the javax.servlet.ServletRequest interface to provide request information for HTTP servlets.
JComboBox (javax.swing)
CodeWhisperer alternatives

How to use BoilerpipeHTMLContentHandler in de.l3s.boilerpipe.sax

Best Java code snippets using de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler (Showing top 20 results out of 315)

How to use
BoilerpipeHTMLContentHandler
in
de.l3s.boilerpipe.sax