Codota Logo
de.l3s.boilerpipe.sax
Code Index | Add Codota to your IDE (free)

How to use de.l3s.boilerpipe.sax

Best Java code snippets using de.l3s.boilerpipe.sax (Showing top 20 results out of 315)

  • Add the Codota plugin to your IDE and get smart completions
private void myMethod () {
List l =
  • Codota Iconnew LinkedList()
  • Codota IconCollections.emptyList()
  • Codota Iconnew ArrayList()
  • Smart code suggestions by Codota
}
origin: apache/tika

/**
 * Forwards a namespace prefix mapping event first to the superclass and then to
 * {@code delegate}.
 *
 * @param prefix the prefix passed through unchanged to both receivers
 * @param uri the URI passed through unchanged to both receivers
 * @throws SAXException if the superclass or {@code delegate} throws it
 */
@Override
public void startPrefixMapping(String prefix, String uri) throws SAXException {
  super.startPrefixMapping(prefix, uri);
  delegate.startPrefixMapping(prefix, uri);
}
origin: apache/tika

/**
 * Forwards the start-of-document event to the superclass and to {@code delegate}, then
 * resets the per-document state: the handler is placed in the header section (not the
 * footer), the header character counter is zeroed, and, when markup is being included,
 * a fresh element list is allocated.
 *
 * @throws SAXException if the superclass or {@code delegate} throws it
 */
@Override
public void startDocument() throws SAXException {
  super.startDocument();
  delegate.startDocument();

  // Every document starts in the header section with nothing counted yet.
  headerCharOffset = 0;
  inFooter = false;
  inHeader = true;

  if (!includeMarkup) {
    return;
  }
  // Markup mode records elements, so start from an empty list.
  elements = new ArrayList<>();
}
origin: apache/tika

/**
 * Handles an element start event. The event always goes to the superclass first; what
 * happens next depends on the current section:
 * <ul>
 *   <li>in the header, the event is forwarded to {@code delegate};</li>
 *   <li>in the footer, it is dropped;</li>
 *   <li>otherwise, when markup is included, it is recorded in {@code elements};</li>
 *   <li>otherwise it is forwarded to {@code delegate}.</li>
 * </ul>
 *
 * @throws SAXException if the superclass or {@code delegate} throws it
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
  super.startElement(uri, localName, qName, atts);

  if (inHeader) {
    delegate.startElement(uri, localName, qName, atts);
    return;
  }
  if (inFooter) {
    // Footer content is intentionally ignored.
    return;
  }
  if (includeMarkup) {
    elements.add(new RecordedElement(uri, localName, qName, atts));
    return;
  }
  // This happens for the <body> element, if we're not doing markup.
  delegate.startElement(uri, localName, qName, atts);
}
origin: de.l3s.boilerpipe/boilerpipe

/**
 * Fetches the HTML at {@code url}, builds a {@link TextDocument} from it, runs
 * {@code extractor} over that document, and then delegates to
 * {@code process(TextDocument, InputSource)} with a second input source obtained from
 * the same fetched document.
 *
 * @param url the location to fetch
 * @param extractor the extractor applied to the parsed document
 * @return whatever {@code process(TextDocument, InputSource)} returns
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);

  // First pass over the fetched HTML: build the text document and run the extractor on it.
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);

  // Second input source over the same fetched document for the delegated call.
  final InputSource secondSource = fetched.toInputSource();
  return process(textDoc, secondSource);
}
origin: de.l3s.boilerpipe/boilerpipe

/**
 * Retrieves the {@link TextDocument} by delegating to the parser-accepting overload
 * with a newly constructed {@link BoilerpipeHTMLParser}.
 *
 * @return the {@link TextDocument} produced by the delegated call
 * @throws BoilerpipeProcessingException if the delegated call throws it
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

origin: de.l3s.boilerpipe/boilerpipe

  /**
   * Registers {@code action} for {@code tag} without discarding an action that is
   * already registered: if one exists, the two are combined into a
   * {@link CommonTagActions.Chained} with the existing action first.
   *
   * @param tag the tag name to register the action under
   * @param action the action to add
   */
  protected void addTagAction(final String tag, final TagAction action) {
    final TagAction existing = get(tag);
    final TagAction effective = (existing != null)
        ? new CommonTagActions.Chained(existing, action)
        : action;
    setTagAction(tag, effective);
  }
}
origin: com.syncthemall/boilerpipe

/**
 * Start-of-tag hook: asks {@code instance} to add whitespace if necessary and then
 * registers this object's {@code action} on it as a label action.
 *
 * <p>{@code localName}, {@code qName} and {@code atts} are not used by this
 * implementation.
 *
 * @param instance the content handler to update
 * @return always {@code false}
 *     (NOTE(review): the meaning of the result is defined by the caller's contract,
 *     which is not visible in this snippet)
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  instance.addLabelAction(action);
  return false;
}
origin: com.syncthemall/boilerpipe

/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler.
 *
 * <p>Delegates to the constructor that accepts a content handler, passing a newly
 * created {@link BoilerpipeHTMLContentHandler}.
 */
public BoilerpipeHTMLParser() {
  this(new BoilerpipeHTMLContentHandler());
}
origin: com.syncthemall/boilerpipe

/**
 * Runs both chained actions, {@code t1} then {@code t2}, on the same start-of-tag event
 * and reports whether either of them returned {@code true}.
 *
 * <p>Both actions are always invoked (unless the first one throws); the results are
 * combined with a non-short-circuit OR.
 *
 * @throws SAXException if either action throws it
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    String localName, String qName, Attributes atts)
    throws SAXException {
  final boolean firstResult = t1.start(instance, localName, qName, atts);
  // Evaluated unconditionally so that t2's side effects happen even when t1 returned true.
  final boolean secondResult = t2.start(instance, localName, qName, atts);
  return firstResult | secondResult;
}
origin: de.l3s.boilerpipe/boilerpipe

/**
 * Start-of-tag hook whose only effect is to ask {@code instance} to add whitespace if
 * necessary.
 *
 * <p>{@code localName}, {@code qName} and {@code atts} are not used by this
 * implementation.
 *
 * @param instance the content handler to update
 * @return always {@code false}
 *     (NOTE(review): the meaning of the result is defined by the caller's contract,
 *     which is not visible in this snippet)
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    final String localName, final String qName,
    final Attributes atts) {
  instance.addWhitespaceIfNecessary();
  return false;
}
origin: com.syncthemall/boilerpipe

/**
 * Creates a new {@link HTMLHighlighter}, which is set-up to return the full
 * HTML text, with the extracted text portion <b>highlighted</b>.
 *
 * <p>Implemented by calling the {@code HTMLHighlighter} constructor with {@code false}.
 *
 * @return a newly constructed {@link HTMLHighlighter}
 */
public static HTMLHighlighter newHighlightingInstance() {
  return new HTMLHighlighter(false);
}
origin: de.l3s.boilerpipe/boilerpipe

  /**
   * Returns a {@link TextDocument} containing the extracted {@link TextBlock}s.
   * NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}.
   *
   * <p>The document is obtained from this parser's {@code contentHandler}.
   *
   * @return The {@link TextDocument}
   */
  public TextDocument toTextDocument() {
    return contentHandler.toTextDocument();
  }
}
origin: apache/tika

/**
 * Handles a character-data event. The event always goes to the superclass first; then:
 * <ul>
 *   <li>in the header, it is forwarded to {@code delegate} and {@code headerCharOffset}
 *       is incremented by one (once per call, not per character);</li>
 *   <li>in the footer, it is dropped;</li>
 *   <li>otherwise, when markup is included, a copy of the character range is attached
 *       to the most recently recorded element.</li>
 * </ul>
 *
 * @param chars the buffer holding the characters
 * @param offset index of the first character in {@code chars}
 * @param length number of characters to read
 * @throws SAXException if the superclass or {@code delegate} throws it
 */
@Override
public void characters(char[] chars, int offset, int length) throws SAXException {
  super.characters(chars, offset, length);

  if (inHeader) {
    delegate.characters(chars, offset, length);
    headerCharOffset++;
    return;
  }
  if (inFooter || !includeMarkup) {
    // Nothing is recorded for footer text or when markup is not being included.
    return;
  }

  final RecordedElement lastRecorded = elements.get(elements.size() - 1);
  // Copy the range: the recorded element keeps its own array rather than a view of chars.
  final char[] copied = new char[length];
  System.arraycopy(chars, offset, copied, 0, length);
  lastRecorded.getCharacters().add(copied);
}
origin: apache/tika

/**
 * Handles an element end event. The event always goes to the superclass first; then:
 * <ul>
 *   <li>in the header, it is forwarded to {@code delegate}, and the header section ends
 *       when the closing element's local name is {@code "head"};</li>
 *   <li>in the footer, it is dropped;</li>
 *   <li>a closing {@code "body"} switches the handler into the footer section;</li>
 *   <li>otherwise, when markup is included, the end element is recorded followed by an
 *       empty continuation element.</li>
 * </ul>
 *
 * @throws SAXException if the superclass or {@code delegate} throws it
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
  super.endElement(uri, localName, qName);

  if (inHeader) {
    delegate.endElement(uri, localName, qName);
    if (localName.equals("head")) {
      inHeader = false;
    }
    return;
  }
  if (inFooter) {
    // Footer content is intentionally ignored.
    return;
  }
  if (localName.equals("body")) {
    inFooter = true;
    return;
  }
  if (includeMarkup) {
    // Add the end element, and the continuation from the previous element
    elements.add(new RecordedElement(uri, localName, qName));
    elements.add(new RecordedElement());
  }
}
origin: apache/tika

@Override
public void endDocument() throws SAXException {
  super.endDocument();
origin: com.syncthemall/boilerpipe

/**
 * Retrieves the {@link TextDocument} by delegating to the parser-accepting overload
 * with a newly constructed {@link BoilerpipeHTMLParser}.
 *
 * @return the {@link TextDocument} produced by the delegated call
 * @throws BoilerpipeProcessingException if the delegated call throws it
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

origin: de.l3s.boilerpipe/boilerpipe

/**
 * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler.
 *
 * <p>Delegates to the constructor that accepts a content handler, passing a newly
 * created {@link BoilerpipeHTMLContentHandler}.
 */
public BoilerpipeHTMLParser() {
  this(new BoilerpipeHTMLContentHandler());
}
origin: de.l3s.boilerpipe/boilerpipe

/**
 * Runs both chained actions, {@code t1} then {@code t2}, on the same start-of-tag event
 * and reports whether either of them returned {@code true}.
 *
 * <p>Both actions are always invoked (unless the first one throws); the results are
 * combined with a non-short-circuit OR.
 *
 * @throws SAXException if either action throws it
 */
public boolean start(BoilerpipeHTMLContentHandler instance,
    String localName, String qName, Attributes atts)
    throws SAXException {
  final boolean firstResult = t1.start(instance, localName, qName, atts);
  // Evaluated unconditionally so that t2's side effects happen even when t1 returned true.
  final boolean secondResult = t2.start(instance, localName, qName, atts);
  return firstResult | secondResult;
}
origin: pvdlg/boilerpipe

/**
 * Retrieves the {@link TextDocument} by delegating to the parser-accepting overload
 * with a newly constructed {@link BoilerpipeHTMLParser}.
 *
 * @return the {@link TextDocument} produced by the delegated call
 * @throws BoilerpipeProcessingException if the delegated call throws it
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

origin: Netbreeze-GmbH/boilerpipe

/**
 * Retrieves the {@link TextDocument} by delegating to the parser-accepting overload
 * with a newly constructed {@link BoilerpipeHTMLParser}.
 *
 * @return the {@link TextDocument} produced by the delegated call
 * @throws BoilerpipeProcessingException if the delegated call throws it
 */
public TextDocument getTextDocument() throws BoilerpipeProcessingException {
  final BoilerpipeHTMLParser defaultParser = new BoilerpipeHTMLParser();
  return getTextDocument(defaultParser);
}

de.l3s.boilerpipe.sax

Most used classes

  • BoilerpipeHTMLContentHandler
    A simple SAX ContentHandler, used by BoilerpipeSAXInput. Can be used by different parser implementat
  • BoilerpipeSAXInput
    Parses an InputSource using SAX and returns a TextDocument.
  • BoilerpipeHTMLParser
    A simple SAX Parser, used by BoilerpipeSAXInput. The parser uses CyberNeko [http://nekohtml.sourcefo
  • CommonTagActions$Chained
  • DefaultTagActionMap
    Default TagActions. Seem to work well.
  • HTMLFetcher
  • HTMLHighlighter$Implementation
  • HTMLHighlighter$TagAction
  • HTMLHighlighter
  • TagAction
  • TagActionMap
  • CommonTagActions$2
  • CommonTagActions$BlockTagLabelAction
  • ImageExtractor$Implementation
  • ImageExtractor$TagAction
  • ImageExtractor
  • MarkupTagAction
  • MediaExtractor$Implementation
  • MediaExtractor$TagAction
Codota Logo
  • Products

    Search for Java code | Search for JavaScript code | Enterprise
  • IDE Plugins

    IntelliJ IDEA | WebStorm | Android Studio | Eclipse | Visual Studio Code | PyCharm | Sublime Text | PhpStorm | Vim | Atom | GoLand | RubyMine | Emacs | Jupyter
  • Company

    About Us | Contact Us | Careers
  • Resources

    FAQ | Blog | Codota Academy | Plugin user guide | Terms of use | Privacy policy | Java Code Index | Javascript Code Index
Get Codota for your IDE now