/**
 * Creates a page whose node list is the result of running the given parser
 * with a {@code null} filter argument.
 *
 * @param parser parser to run; its result is stored in {@code nodes}
 * @throws ParserException if {@code parser.parse} fails
 */
private HTMLPage(Parser parser) throws ParserException {
    nodes = parser.parse(null);
}
/**
 * Parses the given text with a freshly created {@code Parser}.
 *
 * @param possibleTable text handed to the parser
 * @return the parsed node list, or {@code null} when parsing throws a
 *         {@code ParserException} or a {@code StringIndexOutOfBoundsException}
 */
private NodeList parseHtml(String possibleTable) {
    NodeList parsed;
    try {
        parsed = new Parser(possibleTable).parse(null);
    } catch (ParserException | StringIndexOutOfBoundsException e) {
        // Both failure kinds are reported to the caller as "no result".
        parsed = null;
    }
    return parsed;
}
/**
 * Runs a new {@code Parser} over the given text.
 *
 * @param possibleTable text handed to the parser
 * @return the parsed node list, or {@code null} when a
 *         {@code ParserException} is thrown
 */
private NodeList parseHtml(String possibleTable) {
    NodeList nodesOrNull = null;
    try {
        Parser htmlParser = new Parser(possibleTable);
        nodesOrNull = htmlParser.parse(null);
    } catch (ParserException e) {
        // Parse failures are reported to the caller as null.
        nodesOrNull = null;
    }
    return nodesOrNull;
}
/**
 * Parses the {@code html} field with a new {@code Parser} and stores the
 * resulting node list in {@code nodes}.
 *
 * @throws ParserException if setting the input or parsing fails
 */
public void parse() throws ParserException {
    Parser htmlParser = new Parser();
    htmlParser.setInputHTML(html);
    nodes = htmlParser.parse(null);
}
/**
 * Loads a documentation resource, parses it as HTML and merges the entries
 * collected by an {@code HtmlDocumentationVisitor} into {@code map}.
 *
 * <p>The resource is looked up relative to this class via
 * {@code getResourceAsStream}. The stream and reader are always closed.
 * A missing resource and any failure while reading or parsing are reported
 * through {@code RutaIdeUIPlugin.error} instead of being thrown.
 *
 * @param documentationFile resource name passed to {@code getResourceAsStream}
 * @throws IOException declared for callers; failures inside the read/parse
 *         section are reported via {@code RutaIdeUIPlugin.error}
 */
private void fillMap(String documentationFile) throws IOException {
    InputStream resourceAsStream = getClass().getResourceAsStream(documentationFile);
    if (resourceAsStream == null) {
        // getResourceAsStream returns null for an unknown resource; report that
        // explicitly rather than as an incidental NullPointerException.
        RutaIdeUIPlugin.error(new IOException("Documentation resource not found: " + documentationFile));
        return;
    }
    // NOTE(review): the reader uses the platform default charset, as before -
    // confirm the encoding of the documentation file.
    try (InputStream in = resourceAsStream;
            BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            // Every line is terminated with '\n', whatever the original line ending was.
            sb.append(line).append('\n');
        }
        String document = sb.toString();
        Parser parser = new Parser(document);
        NodeList list = parser.parse(null);
        HtmlDocumentationVisitor visitor = new HtmlDocumentationVisitor(document);
        list.visitAllNodesWith(visitor);
        map.putAll(visitor.getMap());
    } catch (Exception e) {
        RutaIdeUIPlugin.error(e);
    }
}
// Closing brace carried over from the original line; it appears to end the
// enclosing class, whose header is outside this view.
}
/**
 * Rewrites the given HTML through {@code getNewHtml} when {@code hasFont}
 * reports true for its parsed node list.
 *
 * <p>Resets the static {@code hasData} flag to {@code false} before parsing.
 * In the rewritten markup, runs of three and then two consecutive
 * {@code </FONT>} closing tags are collapsed into one.
 *
 * @param content HTML text to process
 * @return the rewritten HTML, or {@code content} unchanged when
 *         {@code hasFont} is false or parsing fails
 */
public static String parseFontHTML(String content) {
    hasData = false;
    Parser fontParser = Parser.createParser(content, "UTF-8");
    StringBuilder rewritten = null;
    try {
        NodeList nodes = (NodeList) fontParser.parse(null);
        if (hasFont(nodes)) {
            rewritten = getNewHtml(nodes);
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }

    if (rewritten == null) {
        return content;
    }
    // Collapse triple closers first, then any remaining double closers.
    String html = rewritten.toString();
    html = html.replace("</FONT></FONT></FONT>", "</FONT>");
    return html.replace("</FONT></FONT>", "</FONT>");
}
/**
 * Runs the parser and passes its output through every configured filter.
 * The first filter sees the parser output; each later filter sees the
 * output of the filter before it.
 *
 * @return The nodes that survived all filters, or the complete parser
 * output when no filters are set.
 * @throws ParserException If an encoding change occurs
 * or there is some other problem.
 */
protected NodeList applyFilters () throws ParserException
{
    // Parse before reading the filters, matching the original call order.
    NodeList nodes = mParser.parse (null);
    NodeFilter[] filters = getFilters ();
    if (filters == null)
    {
        return (nodes);
    }
    for (NodeFilter filter : filters)
    {
        nodes = nodes.extractAllNodesThatMatch (filter, mRecursive);
    }
    return (nodes);
}
/**
 * Processes the HTML {@code body} when it contains the text "base64".
 *
 * <p>The body is parsed, an {@code HtmlImageNodeVisitor} is run over the
 * nodes, {@code body} is replaced by the re-serialized node list, and the
 * images collected by the visitor are handed to
 * {@code addAllBase64ImagesToMimeMultipart}. Does nothing when {@code body}
 * is {@code null} or does not contain "base64".
 *
 * @param multipart multipart passed on to {@code addAllBase64ImagesToMimeMultipart}
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart) throws ParserException, MessagingException, NoSuchAlgorithmException, SMIMEException, java.security.NoSuchProviderException {
    if (body == null || !body.contains("base64")) {
        return;
    }
    NodeList parsedBody = new Parser(body).parse(null);
    HtmlImageNodeVisitor imageVisitor = new HtmlImageNodeVisitor();
    parsedBody.visitAllNodesWith(imageVisitor);
    // Serialize only after the visitor has run over the nodes.
    body = parsedBody.toHtml();
    addAllBase64ImagesToMimeMultipart(multipart, imageVisitor.getBase64Images());
}
/**
 * Parses the given page and scans the resulting node tree for tables.
 *
 * @param page HTML to scan; {@code null} or an empty string is replaced by
 *        a fixed placeholder fragment
 * @throws SlimError wrapping the {@code ParserException} if parsing fails
 */
public HtmlTableScanner(String page) {
    String content = (page == null || page.equals(""))
        ? "<i>This page intentionally left blank.</i>"
        : page;
    NodeList htmlTree;
    try {
        htmlTree = new Parser(new Lexer(new Page(content))).parse(null);
    } catch (ParserException e) {
        throw new SlimError(e);
    }
    scanForTables(htmlTree);
}
/**
 * Builds a scanner by parsing the page and passing the node tree to
 * {@code scanForTables}.
 *
 * @param page HTML to scan; when {@code null} or empty, a fixed placeholder
 *        fragment is parsed instead
 * @throws SlimError wrapping the {@code ParserException} if parsing fails
 */
public HtmlTableScanner(String page) {
    boolean blank = page == null || page.equals("");
    String markup = page;
    if (blank) {
        markup = "<i>This page intentionally left blank.</i>";
    }

    NodeList htmlTree;
    try {
        Lexer lexer = new Lexer(new Page(markup));
        htmlTree = new Parser(lexer).parse(null);
    } catch (ParserException e) {
        throw new SlimError(e);
    }
    scanForTables(htmlTree);
}
/**
 * Extracts the image addresses of a multi-image page.
 *
 * <p>Selects every {@code div} whose {@code class} attribute is
 * {@code item-container} and reads the {@code data-src} attribute of the
 * child at index 2 of each match, cast to {@code ImageTag}.
 *
 * @param pageHtml HTML of the page to scan
 * @return the collected {@code data-src} values in document order, or
 *         {@code null} if the parser throws a {@code ParserException}
 */
public List<String> parseManga(String pageHtml) {
    try {
        List<String> result = new ArrayList<String>();
        Parser parser = new Parser(pageHtml);
        NodeFilter filter = new AndFilter(new TagNameFilter("div"),
                new HasAttributeFilter("class", "item-container"));
        NodeList list = parser.parse(filter);
        for (int i = 0; i < list.size(); i++) {
            Node item = list.elementAt(i);
            // NOTE(review): relies on the third child of each container being the image tag.
            ImageTag image = (ImageTag) item.getChildren().elementAt(2);
            result.add(image.getAttribute("data-src"));
        }
        return result;
    } catch (ParserException e) {
        // Pass the exception itself so the stack trace and cause reach the log.
        logger.error(e.getMessage(), e);
    }
    return null;
}
/**
 * Parses the CAS document text as HTML, walks the nodes with an
 * {@code HtmlVisitor}, and adds to the CAS indexes every annotation from
 * the visitor's annotation list and annotation stack whose begin offset is
 * strictly smaller than its end offset.
 *
 * @param jcas CAS whose document text is parsed and whose indexes are updated
 * @throws AnalysisEngineProcessException wrapping a {@code ParserException}
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    List<AnnotationFS> collected;
    List<AnnotationFS> stacked;
    try {
        NodeList nodes = new Parser(jcas.getDocumentText()).parse(null);
        HtmlVisitor htmlVisitor = new HtmlVisitor(jcas, onlyContent);
        nodes.visitAllNodesWith(htmlVisitor);
        collected = htmlVisitor.getAnnotations();
        stacked = htmlVisitor.getAnnotationStack();
    } catch (ParserException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Index the regular annotations first, then the ones from the stack.
    for (AnnotationFS annotation : collected) {
        boolean nonEmpty = annotation.getBegin() < annotation.getEnd();
        if (nonEmpty) {
            jcas.addFsToIndexes(annotation);
        }
    }
    for (AnnotationFS annotation : stacked) {
        boolean nonEmpty = annotation.getBegin() < annotation.getEnd();
        if (nonEmpty) {
            jcas.addFsToIndexes(annotation);
        }
    }
}
/**
 * Parses the HTML supplied by {@code examiner} and returns the nodes that
 * match the filter, using {@code true} as the second argument of
 * {@code extractAllNodesThatMatch}.
 *
 * @param filter filter applied to the parsed nodes
 * @return the matching nodes
 * @throws Exception if obtaining or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
    Page page = new Page(examiner.html());
    NodeList allNodes = new Parser(new Lexer(page)).parse(null);
    return allNodes.extractAllNodesThatMatch(filter, true);
}
/**
 * Parses the HTML of the given test page into a node list.
 *
 * @param pageToTest page whose {@code getHtml()} output is parsed
 * @return the parsed nodes
 * @throws SlimError wrapping the {@code ParserException} if parsing fails
 */
private NodeList makeNodeList(TestPage pageToTest) {
    // The parser is built outside the try, as in the original; only parse() is guarded.
    Parser htmlParser = new Parser(new Lexer(new Page(pageToTest.getHtml())));
    NodeList nodes;
    try {
        nodes = htmlParser.parse(null);
    } catch (ParserException e) {
        throw new SlimError(e);
    }
    return nodes;
}
/**
 * Runs the parser with the first filter and narrows the result with each
 * remaining filter in turn.
 *
 * @return The nodes that passed every filter; an empty list when
 * {@code getFilters()} returns {@code null} or an empty array.
 * @throws ParserException If an encoding change occurs
 * or there is some other problem.
 */
protected NodeList applyFilters () throws ParserException
{
    NodeList result = new NodeList ();
    if (getFilters () == null)
    {
        return (result);
    }
    // getFilters() is re-read on every use, exactly as the original did.
    for (int index = 0; index < getFilters ().length; index++)
    {
        result = (index == 0)
            ? mParser.parse (getFilters ()[0])
            : result.extractAllNodesThatMatch (getFilters ()[index]);
    }
    return (result);
}
/**
 * Runs an {@code HtmlVisitor} over the parsed document text of the CAS and
 * indexes the annotations it produced.
 *
 * <p>Annotations come from the visitor's annotation list and then from its
 * annotation stack. Only those with {@code getBegin() < getEnd()} are added
 * to the indexes.
 *
 * @param jcas CAS providing the document text and receiving the annotations
 * @throws AnalysisEngineProcessException wrapping a {@code ParserException}
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    final String text = jcas.getDocumentText();
    List<AnnotationFS> fromList;
    List<AnnotationFS> fromStack;
    try {
        Parser htmlParser = new Parser(text);
        NodeList parsed = htmlParser.parse(null);
        HtmlVisitor walker = new HtmlVisitor(jcas, onlyContent);
        parsed.visitAllNodesWith(walker);
        fromList = walker.getAnnotations();
        fromStack = walker.getAnnotationStack();
    } catch (ParserException e) {
        throw new AnalysisEngineProcessException(e);
    }

    for (AnnotationFS candidate : fromList) {
        // Skip annotations whose begin is not strictly before their end.
        if (candidate.getBegin() >= candidate.getEnd()) {
            continue;
        }
        jcas.addFsToIndexes(candidate);
    }
    for (AnnotationFS candidate : fromStack) {
        if (candidate.getBegin() >= candidate.getEnd()) {
            continue;
        }
        jcas.addFsToIndexes(candidate);
    }
}
/**
 * Turns the HTML of a test page into a parsed node list.
 *
 * @param pageToTest page providing the HTML via {@code getHtml()}
 * @return the nodes produced by the parser
 * @throws SlimError wrapping the {@code ParserException} if parsing fails
 */
private NodeList makeNodeList(TestPage pageToTest) {
    Page page = new Page(pageToTest.getHtml());
    Lexer lexer = new Lexer(page);
    Parser parser = new Parser(lexer);
    try {
        return parser.parse(null);
    } catch (ParserException parseFailure) {
        throw new SlimError(parseFailure);
    }
}
/**
 * Returns the nodes in the examiner's HTML that match the given filter.
 *
 * <p>The HTML from {@code examiner.html()} is parsed in full and the filter
 * is then applied with {@code true} as the second argument of
 * {@code extractAllNodesThatMatch}.
 *
 * @param filter filter applied to the parsed nodes
 * @return the matching nodes
 * @throws Exception if obtaining or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
    String markup = examiner.html();
    Lexer lexer = new Lexer(new Page(markup));
    NodeList parsed = new Parser(lexer).parse(null);
    return parsed.extractAllNodesThatMatch(filter, true);
}
/**
 * Extracts the image address of a single-image page.
 *
 * <p>Selects {@code img} tags whose {@code class} attribute is
 * {@code original-image} and reads the {@code data-src} attribute of the
 * first match.
 *
 * @param pageHtml HTML of the page to scan
 * @return the {@code data-src} value of the first matching image, or
 *         {@code null} when nothing matches or the parser throws a
 *         {@code ParserException}
 */
public String parseMedium(String pageHtml) {
    try {
        Parser parser = new Parser(pageHtml);
        NodeFilter filter = new AndFilter(new TagNameFilter("img"),
                new HasAttributeFilter("class", "original-image"));
        NodeList list = parser.parse(filter);
        if (list.size() > 0) {
            return ((ImageTag) list.elementAt(0)).getAttribute("data-src");
        }
    } catch (ParserException e) {
        // Pass the exception itself so the stack trace and cause reach the log.
        logger.error(e.getMessage(), e);
    }
    return null;
}
/**
 * Finds the address of the next page in a search result list.
 *
 * <p>Selects {@code a} tags whose {@code rel} attribute is {@code next}
 * and returns the link of the first match.
 *
 * @param pageHtml HTML of the search result page
 * @return the link of the first matching anchor, or {@code null} when
 *         nothing matches or the parser throws a {@code ParserException}
 */
public String parseNextPage(String pageHtml) {
    try {
        Parser parser = new Parser(pageHtml);
        NodeFilter filter = new AndFilter(new TagNameFilter("a"),
                new HasAttributeFilter("rel", "next"));
        NodeList list = parser.parse(filter);
        if (list.size() > 0) {
            return ((LinkTag) list.elementAt(0)).getLink();
        }
    } catch (ParserException e) {
        // Pass the exception itself so the stack trace and cause reach the log.
        logger.error(e.getMessage(), e);
    }
    return null;
}