org.htmlparser.Parser.parse java code examples

/**
 * Creates the page from the output of the given parser.
 *
 * @param parser parser that is run to produce this page's nodes
 * @throws ParserException propagated from {@code parser.parse}
 */
private HTMLPage(Parser parser) throws ParserException {
  // parse is called with a null filter; whatever it returns is stored in the nodes field.
  this.nodes = parser.parse(null);
}

/**
 * Parses the given text as HTML.
 *
 * @param possibleTable text handed to the parser
 * @return the parsed nodes, or {@code null} when parsing raises a
 *         {@code ParserException} or a {@code StringIndexOutOfBoundsException}
 */
private NodeList parseHtml(String possibleTable) {
 NodeList parsed;
 try {
  parsed = new Parser(possibleTable).parse(null);
 } catch (ParserException | StringIndexOutOfBoundsException e) {
  // Both failure modes are reported to the caller as "no result".
  parsed = null;
 }
 return parsed;
}

/**
 * Parses the given text as HTML.
 *
 * @param possibleTable text handed to the parser
 * @return the parsed nodes, or {@code null} when a {@code ParserException} is raised
 */
private NodeList parseHtml(String possibleTable) {
 NodeList result = null;
 try {
  Parser htmlParser = new Parser(possibleTable);
  result = htmlParser.parse(null);
 } catch (ParserException e) {
  // Parse failure is signalled by the null result.
 }
 return result;
}

/**
 * Parses the text held in {@code html} and stores the resulting node list in {@code nodes}.
 *
 * @throws ParserException if the parser cannot be initialised with the input or parsing fails
 */
public void parse() throws ParserException {
  Parser htmlParser = new Parser();
  htmlParser.setInputHTML(html);
  // A null filter is passed, so the parser's full output is kept.
  nodes = htmlParser.parse(null);
}

 /**
  * Reads the given classpath resource, parses it as HTML and merges the entries
  * collected by an {@code HtmlDocumentationVisitor} into {@code map}.
  *
  * <p>Failures (missing resource, read errors, parse errors) are not propagated;
  * they are reported through {@code RutaIdeUIPlugin.error} and {@code map} is left
  * untouched.
  *
  * @param documentationFile resource name resolved relative to this class
  * @throws IOException declared for callers; every exception raised inside is caught and reported
  */
 private void fillMap(String documentationFile) throws IOException {
  InputStream resourceAsStream = getClass().getResourceAsStream(documentationFile);
  if (resourceAsStream == null) {
   // Report the missing resource explicitly instead of relying on an incidental NPE.
   RutaIdeUIPlugin.error(new IOException("Documentation resource not found: " + documentationFile));
   return;
  }
  // try-with-resources closes the reader (and the wrapped stream) on every path.
  try (BufferedReader reader = new BufferedReader(new InputStreamReader(resourceAsStream))) {
   StringBuilder sb = new StringBuilder();
   String line;
   while ((line = reader.readLine()) != null) {
    sb.append(line).append('\n');
   }

   String document = sb.toString();

   Parser parser = new Parser(document);
   NodeList list = parser.parse(null);
   HtmlDocumentationVisitor visitor = new HtmlDocumentationVisitor(document);
   list.visitAllNodesWith(visitor);
   map.putAll(visitor.getMap());
  } catch (Exception e) {
   RutaIdeUIPlugin.error(e);
  }

 }
}

/**
 * Rewrites the given HTML when {@code hasFont} reports a match on its parsed nodes.
 *
 * <p>Resets {@code hasData} to {@code false} before parsing. When {@code getNewHtml}
 * produces output, runs of three and then two consecutive {@code </FONT>} end tags
 * in it are collapsed to one.
 *
 * @param content HTML text to process
 * @return the rewritten HTML, or {@code content} unchanged when {@code hasFont}
 *         returns false, no new HTML is produced, or parsing fails
 */
public static String parseFontHTML(String content) {
  hasData = false;
  Parser fontParser = Parser.createParser(content, "UTF-8");
  StringBuilder rebuilt = null;
  try {
    NodeList nodes = fontParser.parse(null);
    if (hasFont(nodes)) {
      rebuilt = getNewHtml(nodes);
    }
  } catch (ParserException e) {
    e.printStackTrace();
  }
  if (rebuilt != null) {
    String html = rebuilt.toString();
    html = html.replace("</FONT></FONT></FONT>", "</FONT>");
    return html.replace("</FONT></FONT>", "</FONT>");
  }
  return content;
}

/**
 * Run the parser and pass its output through every configured filter.
 * The parser is invoked without a filter; each filter is then applied,
 * in order, to the result of the previous step using the
 * <code>mRecursive</code> setting.
 * @return The nodes that survived all filters, or the unfiltered
 * parser output when the filter array is <code>null</code>.
 * @throws ParserException If the parse fails.
 */
protected NodeList applyFilters ()
  throws
    ParserException
{
  NodeList nodes = mParser.parse (null);
  NodeFilter[] filters = getFilters ();
  if (null == filters)
    return (nodes);
  int index = 0;
  while (index < filters.length)
  {
    nodes = nodes.extractAllNodesThatMatch (filters[index], mRecursive);
    index++;
  }
  return (nodes);
}

/**
 * When {@code body} contains the text "base64", parses it, runs an
 * {@code HtmlImageNodeVisitor} over the nodes, replaces {@code body} with the
 * re-serialised HTML and hands the visitor's collected images to
 * {@code addAllBase64ImagesToMimeMultipart}.
 *
 * <p>Does nothing when {@code body} is null or does not contain "base64".
 *
 * @param multipart multipart passed on to {@code addAllBase64ImagesToMimeMultipart}
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart) throws ParserException,
    MessagingException, NoSuchAlgorithmException, SMIMEException, java.security.NoSuchProviderException {
  if (null == body || !body.contains("base64")) {
    return;
  }
  NodeList parsedBody = new Parser(body).parse(null);
  HtmlImageNodeVisitor imageVisitor = new HtmlImageNodeVisitor();
  parsedBody.visitAllNodesWith(imageVisitor);
  // The body is regenerated from the node list after the visitor has run.
  body = parsedBody.toHtml();
  addAllBase64ImagesToMimeMultipart(multipart, imageVisitor.getBase64Images());
}

/**
 * Parses the given page and scans the resulting node tree for tables.
 * A null or empty page is replaced by a fixed placeholder fragment before parsing.
 *
 * @param page HTML text to scan; may be null or empty
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
public HtmlTableScanner(String page) {
 String html = (page == null || page.equals(""))
   ? "<i>This page intentionally left blank.</i>"
   : page;
 NodeList htmlTree;
 try {
  htmlTree = new Parser(new Lexer(new Page(html))).parse(null);
 } catch (ParserException e) {
  throw new SlimError(e);
 }
 scanForTables(htmlTree);
}

/**
 * Builds the scanner by parsing {@code page} and passing the node tree to
 * {@code scanForTables}. When {@code page} is null or empty, a fixed placeholder
 * fragment is parsed instead.
 *
 * @param page HTML text to scan; may be null or empty
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
public HtmlTableScanner(String page) {
 boolean blank = page == null || page.equals("");
 if (blank) {
  page = "<i>This page intentionally left blank.</i>";
 }
 NodeList parsedTree;
 try {
  Lexer lexer = new Lexer(new Page(page));
  parsedTree = new Parser(lexer).parse(null);
 } catch (ParserException e) {
  throw new SlimError(e);
 }
 scanForTables(parsedTree);
}

/**
 * Extracts multiple images from a page.
 *
 * <p>For every {@code div} whose {@code class} attribute is {@code item-container},
 * the {@code data-src} attribute of its third child is collected when that child is
 * an image tag. Containers with fewer than three children, or whose third child
 * is not an image tag, are skipped instead of aborting the whole extraction.
 *
 * @param pageHtml HTML of the page to inspect
 * @return the collected {@code data-src} values in document order (an entry is
 *         {@code null} when the image has no such attribute), or {@code null}
 *         when the page cannot be parsed
 */
public List<String> parseManga(String pageHtml) {
  try {
    List<String> result = new ArrayList<String>();
    Parser parser = new Parser(pageHtml);
    NodeFilter filter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "item-container"));
    NodeList list = parser.parse(filter);
    for (int i = 0; i < list.size(); i++) {
      // getChildren() may return null for a node without children.
      NodeList children = list.elementAt(i).getChildren();
      if (children == null || children.size() <= 2) {
        continue;
      }
      Node candidate = children.elementAt(2);
      // The third child can be a text node or another tag if the markup differs.
      if (candidate instanceof ImageTag) {
        result.add(((ImageTag) candidate).getAttribute("data-src"));
      }
    }
    return result;
  } catch (ParserException e) {
    logger.error(e.getMessage());
  }
  // Kept as null (not an empty list) so existing callers that test for null still work.
  return null;
}

/**
 * Parses the document text of the CAS as HTML, lets an {@code HtmlVisitor} collect
 * annotations, and adds every collected annotation whose begin is strictly before
 * its end to the CAS indexes (first the visitor's annotations, then its annotation stack).
 *
 * @param jcas CAS whose document text is parsed and whose indexes receive the annotations
 * @throws AnalysisEngineProcessException wrapping any {@code ParserException} from parsing
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
 String documentText = jcas.getDocumentText();
 final List<AnnotationFS> collected;
 final List<AnnotationFS> stacked;
 try {
  NodeList nodes = new Parser(documentText).parse(null);
  HtmlVisitor visitor = new HtmlVisitor(jcas, onlyContent);
  nodes.visitAllNodesWith(visitor);
  collected = visitor.getAnnotations();
  stacked = visitor.getAnnotationStack();
 } catch (ParserException e) {
  throw new AnalysisEngineProcessException(e);
 }
 for (AnnotationFS annotation : collected) {
  // Zero-length and inverted spans are not indexed.
  if (annotation.getBegin() >= annotation.getEnd()) {
   continue;
  }
  jcas.addFsToIndexes(annotation);
 }
 for (AnnotationFS annotation : stacked) {
  if (annotation.getBegin() >= annotation.getEnd()) {
   continue;
  }
  jcas.addFsToIndexes(annotation);
 }
}

/**
 * Parses the HTML supplied by {@code examiner} and returns the nodes accepted by the filter.
 *
 * @param filter filter every node is tested against; the search is recursive
 * @return the matching nodes
 * @throws Exception if obtaining or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
 Page page = new Page(examiner.html());
 NodeList allNodes = new Parser(new Lexer(page)).parse(null);
 return allNodes.extractAllNodesThatMatch(filter, true);
}

/**
 * Parses the HTML of the given test page into a node list.
 *
 * @param pageToTest page whose HTML is parsed
 * @return all nodes produced by the parser
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
private NodeList makeNodeList(TestPage pageToTest) {
 Page page = new Page(pageToTest.getHtml());
 Parser htmlParser = new Parser(new Lexer(page));
 NodeList nodes;
 try {
  nodes = htmlParser.parse(null);
 } catch (ParserException e) {
  throw new SlimError(e);
 }
 return nodes;
}

/**
 * Run the configured filters in order.
 * The first filter is handed to the parser itself; every later filter
 * is applied to the node list produced by the step before it.
 * @return The nodes that passed all filters, or an empty list when
 * there are no filters.
 * @throws ParserException If the parse fails.
 */
protected NodeList applyFilters ()
  throws
    ParserException
{
  NodeList ret = new NodeList ();
  if (null != getFilters ())
  {
    int index = 0;
    while (index < getFilters ().length)
    {
      // Only the first filter drives the parse; the rest narrow its output.
      if (0 != index)
        ret = ret.extractAllNodesThatMatch (getFilters ()[index]);
      else
        ret = mParser.parse (getFilters ()[0]);
      index++;
    }
  }
  return (ret);
}

/**
 * Runs the HTML parser over the CAS document text, collects annotations through an
 * {@code HtmlVisitor}, and indexes those whose begin offset is smaller than their end
 * offset. The visitor's annotation list is indexed before its annotation stack.
 *
 * @param jcas CAS providing the document text and receiving the annotations
 * @throws AnalysisEngineProcessException wrapping any {@code ParserException} from parsing
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
 String text = jcas.getDocumentText();
 List<AnnotationFS> found = new ArrayList<AnnotationFS>();
 List<AnnotationFS> pending = new ArrayList<AnnotationFS>();
 try {
  Parser htmlParser = new Parser(text);
  NodeList parsed = htmlParser.parse(null);
  HtmlVisitor htmlVisitor = new HtmlVisitor(jcas, onlyContent);
  parsed.visitAllNodesWith(htmlVisitor);
  found = htmlVisitor.getAnnotations();
  pending = htmlVisitor.getAnnotationStack();
 } catch (ParserException e) {
  throw new AnalysisEngineProcessException(e);
 }
 for (AnnotationFS fs : found) {
  boolean nonEmpty = fs.getBegin() < fs.getEnd();
  if (nonEmpty) {
   jcas.addFsToIndexes(fs);
  }
 }
 for (AnnotationFS fs : pending) {
  boolean nonEmpty = fs.getBegin() < fs.getEnd();
  if (nonEmpty) {
   jcas.addFsToIndexes(fs);
  }
 }
}

/**
 * Turns the HTML of a test page into a parsed node list.
 *
 * @param pageToTest page supplying the HTML
 * @return the nodes returned by the parser for an unfiltered parse
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
private NodeList makeNodeList(TestPage pageToTest) {
 Lexer lexer = new Lexer(new Page(pageToTest.getHtml()));
 Parser pageParser = new Parser(lexer);
 try {
  NodeList parsed = pageParser.parse(null);
  return parsed;
 } catch (ParserException parseFailure) {
  throw new SlimError(parseFailure);
 }
}

/**
 * Returns every node in the examiner's HTML that the filter accepts,
 * searching recursively through the parsed tree.
 *
 * @param filter filter applied to the parsed nodes
 * @return the matching nodes
 * @throws Exception if obtaining or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
 String source = examiner.html();
 Lexer lexer = new Lexer(new Page(source));
 NodeList parsed = new Parser(lexer).parse(null);
 return parsed.extractAllNodesThatMatch(filter, true);
}

/**
 * Extracts a single image.
 *
 * <p>Looks for {@code img} tags whose {@code class} attribute is
 * {@code original-image} and reads the {@code data-src} attribute of the first one.
 *
 * @param pageHtml HTML of the page to inspect
 * @return the {@code data-src} value of the first match, or {@code null} when
 *         nothing matches or the page cannot be parsed
 */
public String parseMedium(String pageHtml) {
  String imageUrl = null;
  try {
    Parser pageParser = new Parser(pageHtml);
    NodeFilter originalImage =
        new AndFilter(new TagNameFilter("img"), new HasAttributeFilter("class", "original-image"));
    NodeList matches = pageParser.parse(originalImage);
    if (matches.size() > 0) {
      ImageTag first = (ImageTag) matches.elementAt(0);
      imageUrl = first.getAttribute("data-src");
    }
  } catch (ParserException e) {
    logger.error(e.getMessage());
  }
  return imageUrl;
}

/**
 * Finds the address of the next page in a search result list.
 *
 * <p>Looks for {@code a} tags whose {@code rel} attribute is {@code next}
 * and returns the link of the first one.
 *
 * @param pageHtml HTML of the search result page
 * @return the link of the first match, or {@code null} when nothing matches
 *         or the page cannot be parsed
 */
public String parseNextPage(String pageHtml) {
  String nextLink = null;
  try {
    Parser pageParser = new Parser(pageHtml);
    NodeFilter nextAnchor =
        new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("rel", "next"));
    NodeList anchors = pageParser.parse(nextAnchor);
    if (anchors.size() > 0) {
      LinkTag first = (LinkTag) anchors.elementAt(0);
      nextLink = first.getLink();
    }
  } catch (ParserException e) {
    logger.error(e.getMessage());
  }
  return nextLink;
}

Javadoc

Parse the given resource, using the filter provided. This can be used to extract information from specific nodes. When used with a null filter it returns an entire page which can then be modified and converted back to HTML (Note: the synthesis use-case is not handled very well; the parser is more often used to extract information from a web page).

For example, to replace the entire contents of the HEAD with a single TITLE tag you could do this:

 
// Replace the entire contents of the HEAD with a single TITLE tag.
NodeList nl = parser.parse (null); // here is your two node list
NodeList heads = nl.extractAllNodesThatMatch (new TagNameFilter ("HEAD"));
if (heads.size () > 0) // there may not be a HEAD tag
{
    // elementAt returns a Node, so the head tag has to be cast.
    HeadTag head = (HeadTag) heads.elementAt (0); // there should be only one
    // NOTE(review): removeAll()/add() are kept from the original example;
    // confirm they exist on the tag type in the HTMLParser version in use.
    head.removeAll (); // clean out the contents
    Tag title = new TitleTag ();
    title.setTagName ("title");
    title.setChildren (new NodeList (new TextNode ("The New Title")));
    Tag title_end = new TitleTag ();
    title_end.setTagName ("/title");
    title.setEndTag (title_end);
    head.add (title);
}
System.out.println (nl.toHtml ()); // output the modified HTML

Popular methods of Parser

<init>
Construct a parser using the provided lexer and feedback object. This would be used to create a pars
visitAllNodesWith
Apply the given visitor to the current page. The visitor is passed to the accept() method of each n
createParser
Creates the parser on an input string.
extractAllNodesThatMatch
Extract all nodes matching the given filter.
setNodeFactory
Set the current node factory.
setLexer
Set the lexer for this parser. The current NodeFactory is transferred to (set on) the given lexer, s
elements
Returns an iterator (enumeration) over the html nodes. org.htmlparser.nodes can be of three main typ
reset
Reset the parser to start from the beginning again. This assumes support for a reset from the underl
getConnection
Return the current connection.
setInputHTML
Initializes the parser with the given input HTML String.
setURL
Set the URL for this parser. This method creates a new Lexer reading from the given URL. Trying to s
getConnectionManager
Get the connection manager all Parsers use.

Popular in Java

Making http requests using okhttp
onRequestPermissionsResult (Fragment)
getResourceAsStream (ClassLoader)
startActivity (Activity)
ConnectException (java.net)
A ConnectException is thrown if a connection cannot be established to a remote host on a specific po
MessageDigest (java.security)
Uses a one-way hash function to turn an arbitrary number of bytes into a fixed-length byte sequence.
Timestamp (java.sql)
A Java representation of the SQL TIMESTAMP type. It provides the capability of representing the SQL
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
GitHub Copilot alternatives

How to use the parse method in org.htmlparser.Parser

Best Java code snippets using org.htmlparser.Parser.parse (Showing top 20 results out of 315)

How to use
parse
method
in
org.htmlparser.Parser