org.htmlparser.Parser.visitAllNodesWith java code examples

/**
 * Visit every node of the parser's page with this object and return the
 * text that ended up in the buffer.
 * The buffer is replaced by a fresh, empty one before returning.
 * @return The buffer contents after the traversal.
 * @exception ParserException If a parse error occurs.
 */
protected String extractStrings () throws ParserException
{
  mParser.visitAllNodesWith (this);
  final String text = mBuffer.toString ();
  // swap in a new buffer with an initial capacity of 4096
  mBuffer = new StringBuilder (4096);
  return text;
}

/**
 * Visit every node of the parser's page with this object and return the
 * text that ended up in the buffer.
 * The collapse state is set to zero before the traversal and the buffer is
 * replaced by a fresh, empty one afterwards.
 * @return The buffer contents after the traversal.
 * @exception ParserException If a parse error occurs.
 */
protected String extractStrings () throws ParserException
{
  mCollapseState = 0;
  mParser.visitAllNodesWith (this);
  final String text = mBuffer.toString ();
  // swap in a new buffer with an initial capacity of 4096
  mBuffer = new StringBuffer (4096);
  return text;
}

/**
 * Parses the resource at the given URL, lets a {@code PageContentVisitor}
 * visit all of its nodes and returns what the visitor reports as content.
 *
 * @param url the location passed to the {@code Parser} constructor and to the visitor
 * @return the value of the visitor's {@code getContent()} after the traversal
 * @throws IllegalStateException wrapping any {@code ParserException} raised inside the try block
 */
@Override
public PageContent fetchPageContent(String url) {
  logger.debug("Fetching {}", url);
  try {
    final Parser pageParser = new Parser(url);
    final PageContentVisitor collector = new PageContentVisitor(baseUrl, url);
    pageParser.visitAllNodesWith(collector);
    return collector.getContent();
  } catch (ParserException parseFailure) {
    // surface the checked parser failure as an unchecked exception, keeping the cause
    throw new IllegalStateException(parseFailure);
  }
}

/**
 * Reads the complete input into memory, parses it as HTML and lets an
 * {@code HHCFilterVisitor} (constructed with this filter and {@code outfile})
 * visit every node.
 *
 * @param infile reader supplying the document; read until it reports no more characters
 * @param outfile writer handed to the visitor
 * @param fc filter context; not referenced by this method
 * @throws IOException if reading fails, or if the input cannot be held in memory
 *         (in which case the {@code OutOfMemoryError} is attached as the cause)
 * @throws TranslationException part of the declared signature
 */
@Override
public void processFile(BufferedReader infile, BufferedWriter outfile, FilterContext fc) throws IOException,
    TranslationException {
  StringBuilder all = null;
  try {
    all = new StringBuilder();
    char[] cbuf = new char[1000];
    int len;
    while ((len = infile.read(cbuf)) > 0) {
      all.append(cbuf, 0, len);
    }
  } catch (OutOfMemoryError e) {
    // Release the partially filled buffer before building the error report.
    all = null;
    System.gc();
    // Chain the original error so the root cause is not lost to the caller.
    throw new IOException(OStrings.getString("HHC__FILE_TOO_BIG"), e);
  }
  Parser parser = new Parser();
  try {
    parser.setInputHTML(all.toString());
    parser.visitAllNodesWith(new HHCFilterVisitor(this, outfile));
  } catch (ParserException pe) {
    // Best effort: a parse failure is reported on stdout and the method returns normally.
    System.out.println(pe);
  }
}

/**
 * Parses the given HTML with a {@code PostCleanerVisitor} and stores the
 * visitor's text in {@code mText}. If the parser raises a
 * {@code ParserException}, a message is written to stderr and
 * {@code PostCleanerVisitor.simpleProc(html)} supplies the text instead.
 *
 * @param html         the HTML to process
 * @param minCodeChars forwarded to the {@code PostCleanerVisitor} constructor
 * @param excludeCode  forwarded to the {@code PostCleanerVisitor} constructor
 */
public PostCleaner(String html, int minCodeChars, boolean excludeCode) {
 String cleaned;
 try {
  final Parser parser = Parser.createParser(html, "utf8");
  final PostCleanerVisitor visitor = new PostCleanerVisitor(minCodeChars, excludeCode);
  parser.visitAllNodesWith(visitor);
  cleaned = visitor.getText();
 } catch (ParserException e) {
  System.err.println(" Parser exception: " + e + " trying simple conversion");
  // fallback path when the parser gives up
  cleaned = PostCleanerVisitor.simpleProc(html);
 }
 mText = cleaned;
}

// Excerpt: traverse all nodes with this object as the visitor, then hand the
// buffered text to updateStrings.
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());
// Start again with an empty buffer (initial capacity 4096) and a zeroed
// collapse state, then traverse and hand over the buffered text once more.
mBuffer = new StringBuffer (4096);
mCollapseState = 0;
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());

parser.visitAllNodesWith(sb);

// Excerpt: traverse all nodes with this object as the visitor and hand the
// buffered text to updateStrings; the same pair of calls appears twice here.
// NOTE(review): the lines between the two passes are not shown in this excerpt.
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());

parser.visitAllNodesWith(this);

/**
 * Resets the result buffer, parses the given HTML with this object as the
 * node visitor and returns {@code getResult()}.
 *
 * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
 */
public String process(String html, String encoding) throws ParserException {
  m_result = new StringBuffer();

  // wire parser -> lexer -> page; the page is created with the given char set
  final Parser htmlParser = new Parser();
  final Lexer htmlLexer = new Lexer();
  htmlLexer.setPage(new Page(html, encoding));
  htmlParser.setLexer(htmlLexer);

  final boolean hasNoAutoCloseTags = (m_noAutoCloseTags != null) && (m_noAutoCloseTags.size() > 0);
  if (hasNoAutoCloseTags) {
    // Degrade composite tags that have children in the DOM tree to simple
    // single tags, so such a tag can end while other HTML tags are still open
    // without the html parser generating the closing tags.
    htmlLexer.setNodeFactory(configureNoAutoCorrectionTags());
  }

  // run this visitor over the page and hand back what it produced
  htmlParser.visitAllNodesWith(this);
  return getResult();
}

/**
 * Resets the result buffer, parses the given HTML with this object as the
 * node visitor and returns {@code getResult()}.
 *
 * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
 */
public String process(String html, String encoding) throws ParserException {
  m_result = new StringBuffer();

  final Parser nodeParser = new Parser();
  final Lexer nodeLexer = new Lexer();
  // the page is initialized with the given char set
  final Page source = new Page(html, encoding);
  nodeLexer.setPage(source);
  nodeParser.setLexer(nodeLexer);

  if (m_noAutoCloseTags != null) {
    if (m_noAutoCloseTags.size() > 0) {
      // Composite tags that have children in the DOM tree are degraded to
      // simple single tags: the tag can then be finished while HTML tags are
      // still open, and the html parser will not generate the closing tags.
      final PrototypicalNodeFactory nodeFactory = configureNoAutoCorrectionTags();
      nodeLexer.setNodeFactory(nodeFactory);
    }
  }

  // traverse the page with this visitor, then report the result
  nodeParser.visitAllNodesWith(this);
  return getResult();
}

HtmlPage page = new HtmlPage(parser);
try {
  parser.visitAllNodesWith(page);
} catch (ParserException e) {
  log.error("visit page error:", e);

parser.setLexer(lexer);
parser.visitAllNodesWith(this);

/**
 * Extracts the text of an HTML page by running a {@code StringBean} over all of its nodes.<p>
 *
 * @param in the html content input stream
 * @param encoding the encoding of the content
 *
 * @return the text reported by the string bean, or an empty string if it reports <code>null</code>
 * @throws ParserException if the parsing of the HTML failed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {
  final Parser htmlParser = new Parser();
  final Lexer htmlLexer = new Lexer();
  htmlLexer.setPage(new Page(in, encoding));
  htmlParser.setLexer(htmlLexer);

  final StringBean textCollector = new StringBean();
  htmlParser.visitAllNodesWith(textCollector);

  final String text = textCollector.getStrings();
  if (text == null) {
    return "";
  }
  return text;
}

parser.visitAllNodesWith(this);

/**
 * Extracts the text of an HTML page by running a {@code StringBean} over all of its nodes.<p>
 *
 * @param in the html content input stream
 * @param encoding the encoding of the content
 *
 * @return the text reported by the string bean; never <code>null</code>, an empty string is returned instead
 * @throws ParserException if the parsing of the HTML failed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {
  final Parser pageParser = new Parser();
  final Lexer pageLexer = new Lexer();
  final Page source = new Page(in, encoding);
  pageLexer.setPage(source);
  pageParser.setLexer(pageLexer);

  final StringBean collector = new StringBean();
  pageParser.visitAllNodesWith(collector);

  final String extracted = collector.getStrings();
  return (extracted != null) ? extracted : "";
}

/**
 * Extract the text from a HTML page.<p>
 *
 * A {@code StringBean} is run over all nodes of the page and its collected
 * strings are returned.<p>
 *
 * @param in the html content input stream
 * @param encoding the encoding of the content
 *
 * @return the extracted text from the page; an empty string (never <code>null</code>)
 *         if the string bean reports no text
 * @throws ParserException if the parsing of the HTML failed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {
  Parser parser = new Parser();
  Lexer lexer = new Lexer();
  Page page = new Page(in, encoding);
  lexer.setPage(page);
  parser.setLexer(lexer);
  StringBean stringBean = new StringBean();
  parser.visitAllNodesWith(stringBean);
  // Guard against a null result so callers can use the return value without a null check.
  String result = stringBean.getStrings();
  return result == null ? "" : result;
}

  /**
   * Parses "http://synyx.de", collects its {@code LinkTag} nodes with an
   * {@code ObjectFindingVisitor}, checks that at least one was found and
   * prints each link's text and target to stdout.
   */
  @Test
  public void testLinkExtraction() throws ParserException {
    final Parser siteParser = new Parser("http://synyx.de");
    final ObjectFindingVisitor linkFinder = new ObjectFindingVisitor(LinkTag.class);
    siteParser.visitAllNodesWith(linkFinder);

    final Node[] found = linkFinder.getTags();
    // TODO this could use some more meaningful assertions
    assertTrue(found.length > 0);

    for (Node node : found) {
      final LinkTag link = (LinkTag) node;
      System.out.print("\"" + link.getLinkText() + "\" => ");
      System.out.println(link.getLink());
    }
  }
}

parser.visitAllNodesWith(visitor);

parser.visitAllNodesWith(visitor);

Javadoc

Apply the given visitor to the current page. The visitor is passed to the accept() method of each node in the page in a depth-first traversal. The visitor's beginParsing() method is called before the page is processed, and its finishedParsing() method is called after processing completes.

Popular methods of Parser

<init>
Construct a parser using the provided lexer and feedback object. This would be used to create a pars
parse
Parse the given resource, using the filter provided. This can be used to extract information from sp
createParser
Creates the parser on an input string.
extractAllNodesThatMatch
Extract all nodes matching the given filter.
setNodeFactory
Set the current node factory.
setLexer
Set the lexer for this parser. The current NodeFactory is transferred to (set on) the given lexer, s
elements
Returns an iterator (enumeration) over the html nodes. org.htmlparser.nodes can be of three main typ
reset
Reset the parser to start from the beginning again. This assumes support for a reset from the underl
getConnection
Return the current connection.
setInputHTML
Initializes the parser with the given input HTML String.
setURL
Set the URL for this parser. This method creates a new Lexer reading from the given URL. Trying to s
getConnectionManager
Get the connection manager all Parsers use.

Popular in Java

Making http requests using okhttp
onRequestPermissionsResult (Fragment)
getResourceAsStream (ClassLoader)
startActivity (Activity)
ConnectException (java.net)
A ConnectException is thrown if a connection cannot be established to a remote host on a specific po
MessageDigest (java.security)
Uses a one-way hash function to turn an arbitrary number of bytes into a fixed-length byte sequence.
Timestamp (java.sql)
A Java representation of the SQL TIMESTAMP type. It provides the capability of representing the SQL
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
Top plugins for Android Studio

How to use the visitAllNodesWith method in org.htmlparser.Parser

Best Java code snippets using org.htmlparser.Parser.visitAllNodesWith (Showing top 20 results out of 315)

How to use the visitAllNodesWith method in org.htmlparser.Parser