org.htmlparser.Parser.<init> java code examples

/**
 * Constructs a FilterBean in its initial state: no filters, no nodes,
 * the recursive flag set to {@code true}, a fresh no-argument
 * {@code Parser}, and property-change support bound to this bean.
 */
public FilterBean ()
{
  this.mFilters = null;
  this.mNodes = null;
  this.mRecursive = true;
  this.mParser = new Parser ();
  this.mPropertySupport = new PropertyChangeSupport (this);
}

/**
 * Constructs a LinkBean with no links yet, a fresh no-argument
 * {@code Parser}, and property-change support bound to this bean.
 */
public LinkBean ()
{
  this.mLinks = null;
  this.mParser = new Parser ();
  this.mPropertySupport = new PropertyChangeSupport (this);
}

/**
 * Creates a new LinkBean.
 * <p>
 * The bean starts with its link collection unset ({@code null}) and owns
 * a newly created {@code Parser} plus a {@code PropertyChangeSupport}
 * whose source is this bean.
 */
public LinkBean ()
{
  this.mPropertySupport = new PropertyChangeSupport (this);
  this.mParser = new Parser ();
  this.mLinks = null;
}

/**
 * Creates a new FilterBean.
 * <p>
 * The bean starts with its filters and nodes unset ({@code null}) and
 * owns a newly created {@code Parser} plus a {@code PropertyChangeSupport}
 * whose source is this bean.
 */
public FilterBean ()
{
  this.mFilters = null;
  this.mNodes = null;
  this.mParser = new Parser ();
  this.mPropertySupport = new PropertyChangeSupport (this);
}

/**
 * Parses the supplied text with a {@code Parser} constructed from it.
 *
 * @param possibleTable the text handed to the {@code Parser} constructor
 * @return the node list produced by {@code parse(null)}, or {@code null}
 *         when a {@code ParserException} or
 *         {@code StringIndexOutOfBoundsException} is raised
 */
private NodeList parseHtml(String possibleTable) {
 NodeList parsed;
 try {
  parsed = new Parser(possibleTable).parse(null);
 } catch (ParserException | StringIndexOutOfBoundsException ignored) {
  // Best effort: a failure is reported to the caller as "no result".
  parsed = null;
 }
 return parsed;
}

/**
 * Runs the given text through a {@code Parser} and returns every node.
 *
 * @param possibleTable the text handed to the {@code Parser} constructor
 * @return the result of {@code parse(null)}, or {@code null} when a
 *         {@code ParserException} is raised
 */
private NodeList parseHtml(String possibleTable) {
 NodeList result = null;
 try {
  Parser htmlParser = new Parser(possibleTable);
  result = htmlParser.parse(null);
 } catch (ParserException ignored) {
  // Best effort: leave the result as null when parsing fails.
 }
 return result;
}

/**
 * Builds a {@code Parser} for the given URL, walks all of its nodes with a
 * {@code PageContentVisitor}, and returns what the visitor collected.
 *
 * @param url the location passed to the parser and to the visitor
 * @return the content reported by the visitor
 * @throws IllegalStateException wrapping any {@code ParserException}
 */
@Override
public PageContent fetchPageContent(String url) {
  logger.debug("Fetching {}", url);
  try {
    final Parser pageParser = new Parser(url);
    final PageContentVisitor collector = new PageContentVisitor(baseUrl, url);
    pageParser.visitAllNodesWith(collector);
    return collector.getContent();
  } catch (ParserException cause) {
    // Convert the checked parser failure into an unchecked one.
    throw new IllegalStateException(cause);
  }
}

/**
 * Parses the {@code html} field and stores the resulting node list in
 * the {@code nodes} field.
 *
 * @throws ParserException propagated from the underlying parser
 */
public void parse() throws ParserException {
  final Parser htmlParser = new Parser();
  htmlParser.setInputHTML(html);
  // A null filter is passed, so no filtering is requested here.
  nodes = htmlParser.parse(null);
}

/**
 * Collects the link of every {@code LinkTag} found by a {@code Parser}
 * constructed from the given location.
 *
 * @param url the string handed to the {@code Parser} constructor
 * @return the {@code getLink()} value of each matching tag, in the order
 *         the parser returned them
 * @throws ParserException propagated from the parser
 */
public static List<String> getLinks(String url) throws ParserException {
  Parser htmlParser = new Parser(url);
  List<String> links = new LinkedList<String>();
  NodeList anchors = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
  final int total = anchors.size();
  for (int i = 0; i < total; i++) {
    links.add(((LinkTag) anchors.elementAt(i)).getLink());
  }
  return links;
}

/**
 * Builds a parser that reads from an in-memory HTML string.
 * @param html The string containing HTML; must not be <code>null</code>.
 * @param charset <em>Optional</em>. The character set encoding that will
 * be reported by {@link #getEncoding}. If charset is <code>null</code>
 * the default character set is used.
 * @return A parser with the <code>html</code> string as input.
 * @throws IllegalArgumentException if <code>html</code> is <code>null</code>.
 */
public static Parser createParser (String html, String charset)
{
  if (html == null)
  {
    throw new IllegalArgumentException ("html cannot be null");
  }
  return new Parser (new Lexer (new Page (html, charset)));
}

/**
 * Extracts the title from the given HTML.
 * <p>
 * When several nodes match {@code TITLE_FILTER}, the title of the last
 * one wins.
 *
 * @param html the HTML markup to scan; must not be {@code null}
 * @return never null; the trimmed title, or an empty string when no node
 *         matched
 * @throws ParserException if the parser fails while scanning the markup
 * @throws IllegalArgumentException if {@code html} is {@code null}
 */
public static String extractTitle(String html) throws ParserException {
  // Parser.createParser always treats its argument as markup. The
  // Parser(String) constructor takes a resource locator instead, so markup
  // that does not begin with a tag could be opened as a URL or file.
  Parser parser = Parser.createParser(html, null);
  NodeList matches = parser.extractAllNodesThatMatch(TITLE_FILTER);
  String title = "";
  for (SimpleNodeIterator it = matches.elements(); it.hasMoreNodes();) {
    TitleTag node = (TitleTag) it.nextNode();
    title = node.getTitle().trim();
  }
  return title;
}

/**
 * Rewrites the {@code body} field when it contains the text "base64":
 * the body is parsed, every node is visited by an
 * {@code HtmlImageNodeVisitor}, the body is replaced by the re-serialised
 * node list, and the images the visitor collected are added to the
 * multipart. Does nothing when {@code body} is {@code null} or lacks the
 * marker text.
 *
 * @param multipart the multipart that receives the collected images
 * @throws ParserException if the body cannot be parsed
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart) throws ParserException,
    MessagingException, NoSuchAlgorithmException, SMIMEException, java.security.NoSuchProviderException {
  if (null == body || !body.contains("base64")) {
    return;
  }
  // Parser.createParser always treats its argument as markup. The
  // Parser(String) constructor takes a resource locator instead, so a body
  // that does not begin with a tag could be opened as a URL or file.
  Parser parser = Parser.createParser(body, null);
  NodeList nodeList = parser.parse(null);
  HtmlImageNodeVisitor htmlImageNodeVisitor = new HtmlImageNodeVisitor();
  nodeList.visitAllNodesWith(htmlImageNodeVisitor);
  body = nodeList.toHtml();
  addAllBase64ImagesToMimeMultipart(multipart, htmlImageNodeVisitor.getBase64Images());
}

/**
 * Creates a Parser whose input is the given string itself, rather than a
 * URL or a string naming a location.
 * <BR>The string is wrapped in a Page, the Page is set on a new Lexer, and
 * that Lexer is set on a new Parser.
 * @param input The string to use as input.
 * @return The Parser reading from the given string.
 */
public static Parser createParserParsingAnInputString (String input)
  throws ParserException, UnsupportedEncodingException
{
  Parser stringParser = new Parser();
  Lexer stringLexer = new Lexer();
  stringLexer.setPage(new Page(input));
  stringParser.setLexer(stringLexer);
  return stringParser;
}

/**
 * Parses the given page text and scans the resulting node tree for tables.
 * A {@code null} or empty page is replaced by a fixed placeholder fragment.
 *
 * @param page the page text; may be {@code null} or empty
 * @throws SlimError wrapping any {@code ParserException} raised by parsing
 */
public HtmlTableScanner(String page) {
 final String content = (page == null || page.equals(""))
   ? "<i>This page intentionally left blank.</i>"
   : page;
 final NodeList htmlTree;
 try {
  htmlTree = new Parser(new Lexer(new Page(content))).parse(null);
 } catch (ParserException parseFailure) {
  throw new SlimError(parseFailure);
 }
 scanForTables(htmlTree);
}

/**
 * Builds a scanner over the supplied page text.
 * <p>
 * When the text is {@code null} or empty, a placeholder fragment is parsed
 * instead. The parsed node tree is then passed to {@code scanForTables}.
 *
 * @param page the page text; may be {@code null} or empty
 * @throws SlimError if parsing raises a {@code ParserException}
 */
public HtmlTableScanner(String page) {
 String source = page;
 if (source == null || source.equals("")) {
  source = "<i>This page intentionally left blank.</i>";
 }
 NodeList parsedTree;
 try {
  Lexer lexer = new Lexer(new Page(source));
  parsedTree = new Parser(lexer).parse(null);
 } catch (ParserException cause) {
  throw new SlimError(cause);
 }
 scanForTables(parsedTree);
}

/**
 * Parses the HTML reported by {@code examiner} and returns the nodes that
 * satisfy the given filter.
 *
 * @param filter the filter applied to the parsed node list
 * @return the nodes extracted with the filter
 * @throws Exception if obtaining or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
 Lexer lexer = new Lexer(new Page(examiner.html()));
 NodeList parsed = new Parser(lexer).parse(null);
 // NOTE(review): the 'true' flag presumably requests a recursive search - confirm.
 return parsed.extractAllNodesThatMatch(filter, true);
}

/**
 * Parses the HTML of the given test page into a node list.
 *
 * @param pageToTest the page whose {@code getHtml()} text is parsed
 * @return the nodes produced by {@code parse(null)}
 * @throws SlimError wrapping any {@code ParserException} raised by parsing
 */
private NodeList makeNodeList(TestPage pageToTest) {
 Page source = new Page(pageToTest.getHtml());
 Parser htmlParser = new Parser(new Lexer(source));
 try {
  return htmlParser.parse(null);
 } catch (ParserException parseFailure) {
  throw new SlimError(parseFailure);
 }
}

/**
 * Turns the HTML text of a test page into its parsed node list.
 *
 * @param pageToTest supplies the HTML through {@code getHtml()}
 * @return the result of parsing that HTML with a null filter
 * @throws SlimError if the parser raises a {@code ParserException}
 */
private NodeList makeNodeList(TestPage pageToTest) {
 Lexer lexer = new Lexer(new Page(pageToTest.getHtml()));
 Parser nodeParser = new Parser(lexer);
 NodeList parsed;
 try {
  parsed = nodeParser.parse(null);
 } catch (ParserException cause) {
  throw new SlimError(cause);
 }
 return parsed;
}

/**
 * Returns the nodes of the examiner's HTML that pass the given filter.
 *
 * @param filter the filter handed to {@code extractAllNodesThatMatch}
 * @return the matching nodes
 * @throws Exception if obtaining or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
 Page source = new Page(examiner.html());
 Parser htmlParser = new Parser(new Lexer(source));
 NodeList allNodes = htmlParser.parse(null);
 // NOTE(review): the 'true' flag presumably requests a recursive search - confirm.
 return allNodes.extractAllNodesThatMatch(filter, true);
}

  /**
   * Visits every node of the page at http://synyx.de with an
   * {@code ObjectFindingVisitor} for {@code LinkTag}, asserts that at least
   * one tag was found, and prints each link text with its target.
   */
  @Test
  public void testLinkExtraction() throws ParserException {
    Parser pageParser = new Parser("http://synyx.de");
    ObjectFindingVisitor linkFinder = new ObjectFindingVisitor(LinkTag.class);
    pageParser.visitAllNodesWith(linkFinder);
    Node[] links = linkFinder.getTags();
    // TODO this could use some more meaningful assertions
    assertTrue(links.length > 0);
    for (Node found : links) {
      LinkTag linkTag = (LinkTag) found;
      System.out.print("\"" + linkTag.getLinkText() + "\" => ");
      System.out.println(linkTag.getLink());
    }
  }
}

Javadoc

Zero argument constructor. The parser is in a safe but useless state parsing an empty string. Set the lexer or connection using #setLexer or #setConnection.

Popular methods of Parser

parse
Parse the given resource, using the filter provided. This can be used to extract information from sp
visitAllNodesWith
Apply the given visitor to the current page. The visitor is passed to the accept() method of each n
createParser
Creates the parser on an input string.
extractAllNodesThatMatch
Extract all nodes matching the given filter.
setNodeFactory
Set the current node factory.
setLexer
Set the lexer for this parser. The current NodeFactory is transferred to (set on) the given lexer, s
elements
Returns an iterator (enumeration) over the html nodes. org.htmlparser.nodes can be of three main typ
reset
Reset the parser to start from the beginning again. This assumes support for a reset from the underl
getConnection
Return the current connection.
setInputHTML
Initializes the parser with the given input HTML String.
setURL
Set the URL for this parser. This method creates a new Lexer reading from the given URL. Trying to s
getConnectionManager
Get the connection manager all Parsers use.

Popular in Java

Making http requests using okhttp
onRequestPermissionsResult (Fragment)
getResourceAsStream (ClassLoader)
startActivity (Activity)
ConnectException (java.net)
A ConnectException is thrown if a connection cannot be established to a remote host on a specific po
MessageDigest (java.security)
Uses a one-way hash function to turn an arbitrary number of bytes into a fixed-length byte sequence.
Timestamp (java.sql)
A Java representation of the SQL TIMESTAMP type. It provides the capability of representing the SQL
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
From CI to AI: The AI layer in your organization

How to use org.htmlparser.Parser constructor

Best Java code snippets using org.htmlparser.Parser.<init> (Showing top 20 results out of 315)

How to use
org.htmlparser.Parser
constructor