de.l3s.boilerpipe.sax Java code examples

/**
 * Forwards a prefix-mapping start event first to the superclass and then to the delegate.
 *
 * @param prefix the prefix passed through unchanged to both receivers
 * @param uri the URI passed through unchanged to both receivers
 * @throws SAXException if either the superclass or the delegate throws it
 */
@Override
public void startPrefixMapping(String prefix, String uri) throws SAXException {
  // The superclass always sees the event before the delegate does.
  super.startPrefixMapping(prefix, uri);
  delegate.startPrefixMapping(prefix, uri);
}

/**
 * Signals the start of a document to the superclass and the delegate, then resets this
 * handler's per-document state: processing begins in the header, outside the footer, with
 * the header character-run counter at zero.
 *
 * @throws SAXException if either the superclass or the delegate throws it
 */
@Override
public void startDocument() throws SAXException {
  super.startDocument();
  delegate.startDocument();

  // Reset the per-document bookkeeping.
  headerCharOffset = 0;
  inFooter = false;
  inHeader = true;

  // The recording buffer is only (re)created when markup is being kept.
  if (includeMarkup) {
    elements = new ArrayList<>();
  }
}

/**
 * Handles an element start event.
 *
 * <p>After notifying the superclass, the event is routed as follows:
 * <ul>
 *   <li>while in the header: forwarded to the delegate;</li>
 *   <li>while in the footer: dropped;</li>
 *   <li>otherwise, when markup is included: recorded in {@code elements};</li>
 *   <li>otherwise: forwarded to the delegate (per the original note, this is the
 *       {@code <body>} element when markup is not being kept).</li>
 * </ul>
 *
 * @throws SAXException if either the superclass or the delegate throws it
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
  super.startElement(uri, localName, qName, atts);

  if (!inHeader) {
    if (inFooter) {
      // Footer content is ignored.
      return;
    }
    if (includeMarkup) {
      // Body markup is buffered instead of being emitted right away.
      elements.add(new RecordedElement(uri, localName, qName, atts));
      return;
    }
  }

  // Header elements, and the <body> element when markup is not kept, go straight through.
  delegate.startElement(uri, localName, qName, atts);
}

/**
 * Fetches the HTML at {@code url}, builds a {@link TextDocument} from it, runs
 * {@code extractor} over that document, and then hands the document together with a fresh
 * input source of the same fetched HTML to {@code process(TextDocument, InputSource)}.
 *
 * @param url the location of the HTML to fetch
 * @param extractor the extractor applied to the parsed document
 * @return the result of {@code process(TextDocument, InputSource)}
 * @throws IOException propagated from the calls made here
 * @throws BoilerpipeProcessingException propagated from the calls made here
 * @throws SAXException propagated from the calls made here
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // First pass over the HTML: build the text document and let the extractor process it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDocument = saxInput.getTextDocument();
  extractor.process(textDocument);

  // Second pass: a new input source is requested from the same fetched document.
  final InputSource secondPassSource = fetched.toInputSource();
  return process(textDocument, secondPassSource);
}

/**
 * Retrieves the {@link TextDocument} using a newly created default
 * {@link BoilerpipeHTMLParser}.
 *
 * @return the document produced by the parser-accepting overload
 * @throws BoilerpipeProcessingException propagated from the parser-accepting overload
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

  /**
   * Registers {@code action} for {@code tag} without discarding an action that is already
   * registered: if {@code get(tag)} yields an existing action, the two are combined into a
   * {@link CommonTagActions.Chained} (existing first, new second); otherwise {@code action}
   * is registered on its own.
   *
   * @param tag the tag to register the action for
   * @param action the action to add
   */
  protected void addTagAction(final String tag, final TagAction action) {
    final TagAction existing = get(tag);
    if (existing == null) {
      // Nothing registered yet: the new action stands alone.
      setTagAction(tag, action);
      return;
    }
    // Keep the earlier action and append the new one after it.
    setTagAction(tag, new CommonTagActions.Chained(existing, action));
  }
}

/**
 * Start-tag hook: asks the handler to add whitespace if necessary, then passes
 * {@code action} to the handler's {@code addLabelAction}.
 *
 * @param instance the content handler to operate on
 * @param localName not used by this implementation
 * @param qName not used by this implementation
 * @param atts not used by this implementation
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  instance.addLabelAction(action);
  // NOTE(review): the meaning of the returned flag is defined by the caller's contract,
  // which is not visible here — confirm before relying on it.
  return false;
}

/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler.
 *
 * <p>Delegates to the constructor that accepts a content handler, passing a newly created
 * {@link BoilerpipeHTMLContentHandler}.
 */
public BoilerpipeHTMLParser() {
  this(new BoilerpipeHTMLContentHandler());
}

/**
 * Runs the start hooks of both chained actions, {@code t1} first and {@code t2} second,
 * and reports whether at least one of them returned {@code true}.
 *
 * <p>Both hooks are always invoked (unless the first one throws); the second is not
 * skipped when the first already returned {@code true}.
 *
 * @throws SAXException if either chained action throws it
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    String localName, String qName, Attributes atts)
    throws SAXException {
  // Evaluate both results up front so neither action is short-circuited away.
  final boolean firstResult = t1.start(instance, localName, qName, atts);
  final boolean secondResult = t2.start(instance, localName, qName, atts);
  return firstResult || secondResult;
}

/**
 * Start-tag hook whose only effect is to ask the handler to add whitespace if necessary.
 *
 * @param instance the content handler to operate on
 * @param localName not used by this implementation
 * @param qName not used by this implementation
 * @param atts not used by this implementation
 * @return always {@code false}
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  return false;
}

/**
 * Creates a new {@link HTMLHighlighter} that is set up to return the full HTML text,
 * with the extracted text portion <b>highlighted</b>.
 *
 * @return a new highlighter constructed with the flag {@code false}
 */
public static HTMLHighlighter newHighlightingInstance() {
  final HTMLHighlighter highlighter = new HTMLHighlighter(false);
  return highlighter;
}

  /**
   * Returns a {@link TextDocument} containing the extracted {@link TextBlock}s, as
   * produced by the underlying content handler.
   *
   * <p>NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
   *
   * @return The {@link TextDocument}
   */
  public TextDocument toTextDocument() {
    final TextDocument document = contentHandler.toTextDocument();
    return document;
  }
}

/**
 * Handles a run of character data.
 *
 * <p>After notifying the superclass: header text is forwarded to the delegate and the
 * header run counter is advanced by one (once per call, not once per character); footer
 * text is dropped; body text is copied and attached to the most recently recorded element
 * when markup is included, and otherwise ignored here.
 *
 * @param chars the buffer holding the characters
 * @param offset index of the first character of the run within {@code chars}
 * @param length number of characters in the run
 * @throws SAXException if either the superclass or the delegate throws it
 */
@Override
public void characters(char[] chars, int offset, int length) throws SAXException {
  super.characters(chars, offset, length);

  if (inHeader) {
    delegate.characters(chars, offset, length);
    headerCharOffset++;
    return;
  }
  if (inFooter || !includeMarkup) {
    // Nothing to record for footer text or when markup is not kept.
    return;
  }

  final RecordedElement lastRecorded = elements.get(elements.size() - 1);
  // Store a private copy of the run rather than a view into the incoming buffer.
  final char[] runCopy = new char[length];
  System.arraycopy(chars, offset, runCopy, 0, length);
  lastRecorded.getCharacters().add(runCopy);
}

/**
 * Handles an element end event.
 *
 * <p>After notifying the superclass: in the header the event is forwarded to the delegate,
 * and the header ends once the local name is {@code "head"}; in the footer the event is
 * dropped; a closing {@code "body"} switches the handler into the footer; any other body
 * element is recorded when markup is included.
 *
 * @throws SAXException if either the superclass or the delegate throws it
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
  super.endElement(uri, localName, qName);

  if (inHeader) {
    delegate.endElement(uri, localName, qName);
    final boolean closesHead = localName.equals("head");
    inHeader = !closesHead;
    return;
  }
  if (inFooter) {
    // Footer content is ignored.
    return;
  }
  if (localName.equals("body")) {
    inFooter = true;
    return;
  }
  if (includeMarkup) {
    // Record the closing element, followed by an empty entry that acts as the
    // continuation of the previous element.
    elements.add(new RecordedElement(uri, localName, qName));
    elements.add(new RecordedElement());
  }
}

@Override
public void endDocument() throws SAXException {
  super.endDocument();

/**
 * Retrieves the {@link TextDocument} using a newly created default
 * {@link BoilerpipeHTMLParser}.
 *
 * @return the document produced by the parser-accepting overload
 * @throws BoilerpipeProcessingException propagated from the parser-accepting overload
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler.
 *
 * <p>Delegates to the constructor that accepts a content handler, passing a newly created
 * {@link BoilerpipeHTMLContentHandler}.
 */
public BoilerpipeHTMLParser() {
  this(new BoilerpipeHTMLContentHandler());
}

/**
 * Runs the start hooks of both chained actions, {@code t1} first and {@code t2} second,
 * and reports whether at least one of them returned {@code true}.
 *
 * <p>Both hooks are always invoked (unless the first one throws); the second is not
 * skipped when the first already returned {@code true}.
 *
 * @throws SAXException if either chained action throws it
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    String localName, String qName, Attributes atts)
    throws SAXException {
  // Evaluate both results up front so neither action is short-circuited away.
  final boolean firstResult = t1.start(instance, localName, qName, atts);
  final boolean secondResult = t2.start(instance, localName, qName, atts);
  return firstResult || secondResult;
}

/**
 * Retrieves the {@link TextDocument} using a newly created default
 * {@link BoilerpipeHTMLParser}.
 *
 * @return the document produced by the parser-accepting overload
 * @throws BoilerpipeProcessingException propagated from the parser-accepting overload
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

/**
 * Retrieves the {@link TextDocument} using a newly created default
 * {@link BoilerpipeHTMLParser}.
 *
 * @return the document produced by the parser-accepting overload
 * @throws BoilerpipeProcessingException propagated from the parser-accepting overload
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

How to use de.l3s.boilerpipe.sax

Best Java code snippets using de.l3s.boilerpipe.sax (Showing top 20 results out of 315)