de.l3s.boilerpipe.sax.HTMLDocument.toInputSource java code examples

/**
 * Fetches the HTML behind {@code url} with {@link HTMLFetcher}, builds a
 * {@link TextDocument} from it, runs {@code extractor} over that document and
 * delegates to {@code process(doc, is)} with a second {@link InputSource}
 * obtained from the same fetched document.
 *
 * @param url the location of the HTML to fetch
 * @param extractor the extractor applied to the parsed document
 * @return whatever {@code process(doc, is)} returns for the processed document
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if one of the boilerpipe calls fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final InputSource parseSource = fetched.toInputSource();
  final TextDocument textDoc = new BoilerpipeSAXInput(parseSource).getTextDocument();
  extractor.process(textDoc);
  // A second InputSource is requested for the follow-up pass over the HTML.
  return process(textDoc, fetched.toInputSource());
}

/**
 * Extracts the media (picture, video) contained in an HTML string.
 *
 * <p>The string is wrapped in an {@link HTMLDocument}, parsed into a
 * {@link TextDocument}, run through {@code extractor}, and finally handed to
 * {@code process(doc, is)} together with a second {@link InputSource} of the
 * same document.
 *
 * @param doc document to parse the media out of
 * @param extractor extractor to use
 * @return the list produced by {@code process(doc, is)} (per the original
 *         contract, of size 0 if no media is found), or {@code null} if any
 *         exception is thrown while parsing or extracting
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
  final HTMLDocument html = new HTMLDocument(doc);
  try {
    final TextDocument textDoc = new BoilerpipeSAXInput(html.toInputSource()).getTextDocument();
    extractor.process(textDoc);
    return process(textDoc, html.toInputSource());
  } catch (Exception ignored) {
    // Every failure is deliberately reported to the caller as null.
    return null;
  }
}

/**
 * Parses the media (picture, video) out of the given HTML string.
 *
 * <p>Steps: wrap {@code doc} in an {@link HTMLDocument}, build a
 * {@link TextDocument} via {@link BoilerpipeSAXInput}, apply {@code extractor},
 * then delegate to {@code process(doc, is)} with a second {@link InputSource}.
 *
 * @param doc document to parse the media out of
 * @param extractor extractor to use
 * @return the media list returned by {@code process(doc, is)} (documented as
 *         size 0 when no media is found); {@code null} when any exception
 *         occurs along the way
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
  final HTMLDocument source = new HTMLDocument(doc);
  try {
    final InputSource firstPass = source.toInputSource();
    final TextDocument parsed = new BoilerpipeSAXInput(firstPass).getTextDocument();
    extractor.process(parsed);
    final InputSource secondPass = source.toInputSource();
    return process(parsed, secondPass);
  } catch (Exception ignored) {
    // Failures of any kind are collapsed into a null result.
    return null;
  }
}

/**
 * Collects the media (picture, video) found in an HTML document given as text.
 *
 * <p>The text is turned into an {@link HTMLDocument}; a {@link TextDocument}
 * is built from it and processed by {@code extractor}; the result is passed,
 * along with another {@link InputSource} of the document, to
 * {@code process(doc, is)}.
 *
 * @param doc document to parse the media out of
 * @param extractor extractor to use
 * @return the list of extracted media from {@code process(doc, is)} (size 0
 *         if none is found, per the original contract), or {@code null} if an
 *         exception is raised by any step
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
  final HTMLDocument wrapped = new HTMLDocument(doc);
  final TextDocument textDocument;
  final List<Media> extracted;
  try {
    textDocument = new BoilerpipeSAXInput(wrapped.toInputSource()).getTextDocument();
    extractor.process(textDocument);
    extracted = process(textDocument, wrapped.toInputSource());
  } catch (Exception ignored) {
    // Any exception is translated into a null return value.
    return null;
  }
  return extracted;
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and a second {@link InputSource} of
 * the original HTML document are then passed to {@code process(doc, is)}.
 *
 * @param url the URL to fetch
 * @param extractor the extractor to run on the parsed document
 * @return the String returned by {@code process(doc, is)} (described by the
 *         original documentation as the highlighted HTML)
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument page = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(page.toInputSource());
  final TextDocument textDoc = saxInput.getTextDocument();
  extractor.process(textDoc);
  return process(textDoc, page.toInputSource());
}

/**
 * Downloads the HTML at the given {@link URL} through {@link HTMLFetcher},
 * applies the supplied {@link BoilerpipeExtractor} to the parsed
 * {@link TextDocument}, and forwards the processed document plus another
 * {@link InputSource} of the original HTML to {@code process(doc, is)}.
 *
 * @param url the URL whose HTML is fetched
 * @param extractor the extractor to apply
 * @return the result of {@code process(doc, is)} (the highlighted HTML,
 *         according to the original documentation)
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument html = HTMLFetcher.fetch(url);
  final InputSource forParsing = html.toInputSource();
  final TextDocument parsed = new BoilerpipeSAXInput(forParsing).getTextDocument();
  extractor.process(parsed);
  final InputSource forHighlighting = html.toInputSource();
  final String result = process(parsed, forHighlighting);
  return result;
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>After extraction, the processed {@link TextDocument} and a second
 * {@link InputSource} of the original HTML document are handed to
 * {@code process(doc, is)}.
 *
 * @param url the URL to fetch
 * @param extractor the extractor to run on the parsed document
 * @return the list returned by {@code process(doc, is)} (per the original
 *         documentation, the enclosed {@link Image}s)
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final TextDocument textDoc =
      new BoilerpipeSAXInput(fetched.toInputSource()).getTextDocument();
  extractor.process(textDoc);
  return process(textDoc, fetched.toInputSource());
}

/**
 * Retrieves the HTML at the given {@link URL} via {@link HTMLFetcher}, parses
 * it into a {@link TextDocument}, runs the specified
 * {@link BoilerpipeExtractor} on it and delegates to {@code process(doc, is)}
 * with another {@link InputSource} of the fetched document.
 *
 * @param url the URL whose HTML is fetched
 * @param extractor the extractor to apply
 * @return the list produced by {@code process(doc, is)} (the enclosed
 *         {@link Image}s, according to the original documentation)
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument page = HTMLFetcher.fetch(url);
  final InputSource parseInput = page.toInputSource();
  final BoilerpipeSAXInput sax = new BoilerpipeSAXInput(parseInput);
  final TextDocument parsed = sax.getTextDocument();
  extractor.process(parsed);
  final InputSource imageInput = page.toInputSource();
  return process(parsed, imageInput);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and a second {@link InputSource} of
 * the original HTML document are then given to {@code process(doc, is)}.
 *
 * @param url
 *            The URL of the HTML document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return The result of {@code process(doc, is)} (described by the original
 *         documentation as the highlighted HTML).
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
public String process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument document = HTMLFetcher.fetch(url);
  final TextDocument extractedText =
      new BoilerpipeSAXInput(document.toInputSource()).getTextDocument();
  extractor.process(extractedText);
  final InputSource original = document.toInputSource();
  return process(extractedText, original);
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, processes the retrieved HTML using the specified
 * {@link BoilerpipeExtractor}, and delegates to {@code process(doc, is)} with the processed
 * {@link TextDocument} and a second {@link InputSource} of the fetched document.
 *
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 *
 * @return the list of {@link Media} returned by {@code process(doc, is)}
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor) throws IOException,
    BoilerpipeProcessingException, SAXException {
  final HTMLDocument fetched = HTMLFetcher.fetch(url);
  final InputSource parseSource = fetched.toInputSource();
  final TextDocument textDoc = new BoilerpipeSAXInput(parseSource).getTextDocument();
  extractor.process(textDoc);
  return process(textDoc, fetched.toInputSource());
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The processed {@link TextDocument} and another {@link InputSource} of
 * the original HTML document are passed on to {@code process(doc, is)}.
 *
 * @param url
 *            The URL of the HTML document to fetch.
 * @param extractor
 *            The extractor applied to the parsed document.
 * @return The list returned by {@code process(doc, is)} (per the original
 *         documentation, the enclosed {@link Image}s).
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
public List<Image> process(final URL url, final BoilerpipeExtractor extractor)
    throws IOException, BoilerpipeProcessingException, SAXException {
  final HTMLDocument html = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(html.toInputSource());
  final TextDocument parsed = saxInput.getTextDocument();
  extractor.process(parsed);
  final List<Image> images = process(parsed, html.toInputSource());
  return images;
}

/**
 * Downloads the document at the given {@link URL} with {@link HTMLFetcher}, applies the specified
 * {@link BoilerpipeExtractor} to the parsed {@link TextDocument}, and returns what {@code process(doc, is)}
 * yields for the processed document and a second {@link InputSource} of the fetched HTML.
 *
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 *
 * @return the list of {@link Media} returned by {@code process(doc, is)}
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor) throws IOException,
    BoilerpipeProcessingException, SAXException {
  final HTMLDocument page = HTMLFetcher.fetch(url);
  final BoilerpipeSAXInput sax = new BoilerpipeSAXInput(page.toInputSource());
  final TextDocument parsed = sax.getTextDocument();
  extractor.process(parsed);
  final InputSource mediaSource = page.toInputSource();
  final List<Media> found = process(parsed, mediaSource);
  return found;
}

/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}, and passes
 * the processed {@link TextDocument} plus a second {@link InputSource} of the
 * fetched document to {@code process(doc, is)}.
 *
 * @param url the url of the document to fetch
 * @param extractor extractor to use
 *
 * @return the list of {@link Media} returned by {@code process(doc, is)}
 * @throws IOException if an I/O error is reported by the underlying calls
 * @throws BoilerpipeProcessingException if a boilerpipe call fails
 * @throws SAXException if a SAX error is reported by the underlying calls
 */
@SuppressWarnings("javadoc")
public List<Media> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument document = HTMLFetcher.fetch(url);
    final InputSource firstPass = document.toInputSource();
    final TextDocument textDocument = new BoilerpipeSAXInput(firstPass).getTextDocument();
    extractor.process(textDocument);
    return process(textDocument, document.toInputSource());
}

/**
 * Extracts text from the HTML code available from the given {@link URL}.
 *
 * <p>The page is fetched with {@link HTMLFetcher} and its {@link InputSource}
 * is passed to {@link #getText(InputSource)}.
 *
 * <p>NOTE: This method is mainly intended for showcase purposes. If you are
 * going to crawl the Web, consider using {@link #getText(InputSource)}
 * instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if extraction fails, or wrapping any
 *         {@link IOException} raised inside this method
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  try {
    final InputSource source = HTMLFetcher.fetch(url).toInputSource();
    return getText(source);
  } catch (IOException ioFailure) {
    // Surface I/O problems through the method's single declared exception type.
    throw new BoilerpipeProcessingException(ioFailure);
  }
}

/**
 * Extracts text from the HTML code available from the given {@link URL} by
 * fetching it with {@link HTMLFetcher} and delegating to
 * {@link #getText(InputSource)}.
 *
 * <p>NOTE: This method is mainly meant for showcase purposes. If you are
 * going to crawl the Web, consider using {@link #getText(InputSource)}
 * instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if the delegate fails, or as a
 *         wrapper around an {@link IOException} raised inside this method
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  final String extracted;
  try {
    final InputSource html = HTMLFetcher.fetch(url).toInputSource();
    extracted = getText(html);
  } catch (IOException cause) {
    throw new BoilerpipeProcessingException(cause);
  }
  return extracted;
}

/**
 * Extracts text from the HTML code available from the given {@link URL}.
 *
 * <p>Implementation: {@link HTMLFetcher#fetch} retrieves the page, and the
 * resulting {@link InputSource} is forwarded to
 * {@link #getText(InputSource)}.
 *
 * <p>NOTE: This method is mainly to be used for showcase purposes. If you
 * are going to crawl the Web, consider using {@link #getText(InputSource)}
 * instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException on extraction failure; an
 *         {@link IOException} raised here is wrapped in this type
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  try {
    final InputSource fetchedHtml = HTMLFetcher.fetch(url).toInputSource();
    final String text = getText(fetchedHtml);
    return text;
  } catch (IOException ex) {
    // Re-thrown as the checked type callers already handle.
    throw new BoilerpipeProcessingException(ex);
  }
}

/**
 * Extracts text from the HTML code available from the given {@link URL}.
 * The HTML is obtained through {@link HTMLFetcher} and handed, as an
 * {@link InputSource}, to {@link #getText(InputSource)}.
 *
 * <p>NOTE: This method is mainly to be used for showcase purposes. If you
 * are going to crawl the Web, consider using {@link #getText(InputSource)}
 * instead.
 *
 * @param url  The URL pointing to the HTML code.
 * @return  The extracted text.
 * @throws BoilerpipeProcessingException if extraction fails; also thrown,
 *         wrapping the cause, when an {@link IOException} occurs here
 */
public String getText(final URL url) throws BoilerpipeProcessingException {
  try {
    final InputSource pageSource = HTMLFetcher.fetch(url).toInputSource();
    return getText(pageSource);
  } catch (IOException wrapped) {
    throw new BoilerpipeProcessingException(wrapped);
  }
}

/**
 * Returns the article from a document with its basic HTML structure.
 *
 * <p>A {@link TextDocument} is built from {@code htmlDoc} and processed by
 * {@code extractor}; an extracting {@link HTMLHighlighter} configured with
 * {@code setOutputHighlightOnly(true)} then renders it against a second
 * {@link InputSource} of the same document. The rendered text is finally
 * passed, together with {@code docUri}, to {@code removeNotAllowedTags}.
 *
 * @param htmlDoc the HTML document to extract the article from
 * @param docUri the URI of the document, forwarded to
 *        {@code removeNotAllowedTags} (per the original documentation, used
 *        for resolving the relative anchors in the document to absolute
 *        anchors)
 * @param extractor the extractor applied to the parsed document
 * @return the result of {@code removeNotAllowedTags}, or {@code null} if any
 *         exception is thrown while parsing, extracting or highlighting
 */
public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) {
  final HTMLHighlighter highlighter = HTMLHighlighter.newExtractingInstance();
  highlighter.setOutputHighlightOnly(true);
  final String articleHtml;
  try {
    final TextDocument textDoc = new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
    extractor.process(textDoc);
    articleHtml = highlighter.process(textDoc, htmlDoc.toInputSource());
  } catch (Exception ignored) {
    // Any failure up to and including highlighting is reported as null.
    return null;
  }
  // Kept outside the try block so exceptions from tag removal still propagate.
  return removeNotAllowedTags(articleHtml, docUri);
}

Popular methods of HTMLDocument

<init>

Popular in Java

Reading from database using SQL prepared statement
getApplicationContext (Context)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
notifyDataSetChanged (ArrayAdapter)
NoSuchElementException (java.util)
Thrown when trying to retrieve an element past the end of an Enumeration or Iterator.
Timer (java.util)
Timers schedule one-shot or recurring TimerTask for execution. Prefer java.util.concurrent.Scheduled
ExecutorService (java.util.concurrent)
An Executor that provides methods to manage termination and methods that can produce a Future for tr
Base64 (org.apache.commons.codec.binary)
Provides Base64 encoding and decoding as defined by RFC 2045. This class implements section 6.8. Base
Rectangle (java.awt)
A Rectangle specifies an area in a coordinate space that is enclosed by the Rectangle object's top-
JOptionPane (javax.swing)
CodeWhisperer alternatives

How to use the toInputSource method in de.l3s.boilerpipe.sax.HTMLDocument

Best Java code snippets using de.l3s.boilerpipe.sax.HTMLDocument.toInputSource (Showing top 18 results out of 315)

How to use
toInputSource
method
in
de.l3s.boilerpipe.sax.HTMLDocument