/**
 * Builds a FilterBean in its initial state: a fresh property-change
 * support object bound to this bean, a new parser, no filters, no
 * nodes, and the recursive flag switched on.
 */
public FilterBean ()
{
    this.mPropertySupport = new PropertyChangeSupport (this);
    this.mParser = new Parser ();
    this.mFilters = null;
    this.mNodes = null;
    this.mRecursive = true;
}
/**
 * Builds a LinkBean in its initial state: a fresh property-change
 * support object bound to this bean, no links yet, and a new parser.
 */
public LinkBean ()
{
    this.mPropertySupport = new PropertyChangeSupport (this);
    this.mLinks = null;
    this.mParser = new Parser ();
}
/**
 * Builds a LinkBean in its initial state: a fresh property-change
 * support object bound to this bean, no links yet, and a new parser.
 */
public LinkBean ()
{
    this.mPropertySupport = new PropertyChangeSupport (this);
    this.mLinks = null;
    this.mParser = new Parser ();
}
/**
 * Builds a FilterBean in its initial state: a fresh property-change
 * support object bound to this bean, a new parser, no filters and no
 * nodes.
 */
public FilterBean ()
{
    this.mPropertySupport = new PropertyChangeSupport (this);
    this.mParser = new Parser ();
    this.mFilters = null;
    this.mNodes = null;
}
/**
 * Attempts to parse the given text with the HTML parser.
 *
 * @param possibleTable text handed to the {@code Parser(String)} constructor
 * @return the parsed node list, or {@code null} when the parser raises a
 *         {@code ParserException} or a {@code StringIndexOutOfBoundsException}
 */
private NodeList parseHtml(String possibleTable) {
    NodeList parsed;
    try {
        parsed = new Parser(possibleTable).parse(null);
    } catch (ParserException | StringIndexOutOfBoundsException ignored) {
        // A failed parse is reported to the caller as "no node list".
        parsed = null;
    }
    return parsed;
}
/**
 * Attempts to parse the given text with the HTML parser.
 *
 * @param possibleTable text handed to the {@code Parser(String)} constructor
 * @return the parsed node list, or {@code null} when the parser raises a
 *         {@code ParserException}
 */
private NodeList parseHtml(String possibleTable) {
    NodeList parsed;
    try {
        parsed = new Parser(possibleTable).parse(null);
    } catch (ParserException ignored) {
        // A failed parse is reported to the caller as "no node list".
        parsed = null;
    }
    return parsed;
}
/**
 * Parses the resource at {@code url} and returns whatever the
 * {@code PageContentVisitor} collected while walking its nodes.
 *
 * @param url location handed to the parser and to the visitor
 * @return the content gathered by the visitor
 * @throws IllegalStateException wrapping any {@code ParserException}
 */
@Override
public PageContent fetchPageContent(String url) {
    logger.debug("Fetching {}", url);
    try {
        final Parser pageParser = new Parser(url);
        final PageContentVisitor collector = new PageContentVisitor(baseUrl, url);
        pageParser.visitAllNodesWith(collector);
        return collector.getContent();
    } catch (ParserException cause) {
        // Surface parser failures as unchecked so the interface stays exception-free.
        throw new IllegalStateException(cause);
    }
}
/**
 * Parses the markup held in {@code html} and stores the resulting
 * node list in {@code nodes}.
 *
 * @throws ParserException if the parser rejects the input
 */
public void parse() throws ParserException {
    final Parser htmlParser = new Parser();
    // Feed the markup in directly instead of pointing the parser at a resource.
    htmlParser.setInputHTML(html);
    nodes = htmlParser.parse(null);
}
/**
 * Collects the link target of every {@code LinkTag} found in the
 * resource at the given location.
 *
 * @param url location handed to the parser
 * @return the links in document order
 * @throws ParserException if the resource cannot be opened or parsed
 */
public static List<String> getLinks(String url) throws ParserException {
    Parser pageParser = new Parser(url);
    List<String> collected = new LinkedList<String>();
    NodeList linkTags = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    int index = 0;
    while (index < linkTags.size()) {
        LinkTag tag = (LinkTag) linkTags.elementAt(index);
        collected.add(tag.getLink());
        index++;
    }
    return collected;
}
/**
 * Builds a parser that reads from an in-memory HTML string.
 * @param html The string containing HTML; must not be <code>null</code>.
 * @param charset <em>Optional</em>. The character set encoding that will
 * be reported by {@link #getEncoding}. If charset is <code>null</code>
 * the default character set is used.
 * @return A parser with the <code>html</code> string as input.
 * @exception IllegalArgumentException if <code>html</code> is
 * <code>null</code>.
 */
public static Parser createParser (String html, String charset)
{
    if (html == null)
        throw new IllegalArgumentException ("html cannot be null");

    Page page = new Page (html, charset);
    return new Parser (new Lexer (page));
}
/**
 * Extracts the title from the given HTML.
 * <p>
 * When several nodes match {@code TITLE_FILTER}, the trimmed title of the
 * last one is returned.
 *
 * @param html the HTML markup to search; {@code null} yields an empty string
 * @return never null, just an empty string if no title was found.
 * @throws ParserException if the parser fails while processing the markup
 */
public static String extractTitle(String html) throws ParserException {
    if (html == null) {
        return "";
    }

    // Parser(String) interprets its argument as a URL or file name unless the
    // first non-whitespace character is '<'. Markup with leading text or a BOM
    // would be opened as a resource instead of being parsed. createParser
    // always treats the argument as markup.
    Parser parser = Parser.createParser(html, null);
    NodeList matches = parser.extractAllNodesThatMatch(TITLE_FILTER);

    String title = "";
    SimpleNodeIterator it = matches.elements();
    while (it.hasMoreNodes()) {
        TitleTag node = (TitleTag) it.nextNode();
        title = node.getTitle().trim();
    }
    return title;
}
/**
 * When the body mentions "base64", parses it as HTML, walks the node tree
 * with an {@code HtmlImageNodeVisitor}, replaces {@code body} with the HTML
 * re-rendered from the visited tree and passes the images the visitor
 * collected to {@code addAllBase64ImagesToMimeMultipart}.
 * Does nothing when {@code body} is null or does not contain "base64".
 *
 * @param multipart the multipart that receives the extracted images
 * @throws ParserException if the body cannot be parsed
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart) throws ParserException, MessagingException, NoSuchAlgorithmException, SMIMEException, java.security.NoSuchProviderException {
    if (null == body || !body.contains("base64")) {
        return;
    }

    // Parser(String) interprets its argument as a URL or file name unless the
    // first non-whitespace character is '<'. A body that starts with plain
    // text would be opened as a resource instead of being parsed.
    // createParser always treats the argument as markup.
    Parser parser = Parser.createParser(body, null);
    NodeList nodeList = parser.parse(null);

    HtmlImageNodeVisitor htmlImageNodeVisitor = new HtmlImageNodeVisitor();
    nodeList.visitAllNodesWith(htmlImageNodeVisitor);

    // Re-render after the visit so any changes the visitor made to the nodes
    // end up in the stored body.
    body = nodeList.toHtml();
    addAllBase64ImagesToMimeMultipart(multipart, htmlImageNodeVisitor.getBase64Images());
}
/**
 * Builds a parser whose input is the given string itself, rather than a
 * URL or a string naming a resource location.
 * <BR>The string is wrapped in a {@code Page}, attached to a fresh
 * {@code Lexer}, and that lexer is installed on a fresh {@code Parser}.
 * @param input The string to parse.
 * @return A parser that reads from <code>input</code>.
 */
public static Parser createParserParsingAnInputString (String input) throws ParserException, UnsupportedEncodingException {
    Page inputPage = new Page(input);

    Lexer inputLexer = new Lexer();
    inputLexer.setPage(inputPage);

    Parser result = new Parser();
    result.setLexer(inputLexer);
    return result;
}
/**
 * Parses the given page markup and scans the resulting node tree for tables.
 * A null or empty page is replaced by a short placeholder snippet.
 *
 * @param page the HTML to scan; may be null or empty
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
public HtmlTableScanner(String page) {
    final String markup = (page == null || page.isEmpty())
        ? "<i>This page intentionally left blank.</i>"
        : page;

    final NodeList parsedTree;
    try {
        Lexer lexer = new Lexer(new Page(markup));
        parsedTree = new Parser(lexer).parse(null);
    } catch (ParserException parseFailure) {
        throw new SlimError(parseFailure);
    }
    scanForTables(parsedTree);
}
/**
 * Parses the given page markup and scans the resulting node tree for tables.
 * A null or empty page is replaced by a short placeholder snippet.
 *
 * @param page the HTML to scan; may be null or empty
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
public HtmlTableScanner(String page) {
    final String markup = (page == null || page.isEmpty())
        ? "<i>This page intentionally left blank.</i>"
        : page;

    final NodeList parsedTree;
    try {
        Lexer lexer = new Lexer(new Page(markup));
        parsedTree = new Parser(lexer).parse(null);
    } catch (ParserException parseFailure) {
        throw new SlimError(parseFailure);
    }
    scanForTables(parsedTree);
}
/**
 * Parses the HTML reported by {@code examiner} and returns every node,
 * at any depth, that the filter accepts.
 *
 * @param filter decides which nodes are kept
 * @return the matching nodes
 * @throws Exception if fetching or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
    Page page = new Page(examiner.html());
    Parser parser = new Parser(new Lexer(page));
    NodeList allNodes = parser.parse(null);
    // 'true' requests a recursive search through child nodes.
    return allNodes.extractAllNodesThatMatch(filter, true);
}
/**
 * Parses the HTML of the given test page into a node list.
 *
 * @param pageToTest page whose HTML is parsed
 * @return the parsed nodes
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
private NodeList makeNodeList(TestPage pageToTest) {
    Lexer lexer = new Lexer(new Page(pageToTest.getHtml()));
    Parser htmlParser = new Parser(lexer);

    NodeList parsedNodes;
    try {
        parsedNodes = htmlParser.parse(null);
    } catch (ParserException parseFailure) {
        throw new SlimError(parseFailure);
    }
    return parsedNodes;
}
/**
 * Parses the HTML of the given test page into a node list.
 *
 * @param pageToTest page whose HTML is parsed
 * @return the parsed nodes
 * @throws SlimError wrapping any {@code ParserException} raised while parsing
 */
private NodeList makeNodeList(TestPage pageToTest) {
    Lexer lexer = new Lexer(new Page(pageToTest.getHtml()));
    Parser htmlParser = new Parser(lexer);

    NodeList parsedNodes;
    try {
        parsedNodes = htmlParser.parse(null);
    } catch (ParserException parseFailure) {
        throw new SlimError(parseFailure);
    }
    return parsedNodes;
}
/**
 * Parses the HTML reported by {@code examiner} and returns every node,
 * at any depth, that the filter accepts.
 *
 * @param filter decides which nodes are kept
 * @return the matching nodes
 * @throws Exception if fetching or parsing the HTML fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
    Page page = new Page(examiner.html());
    Parser parser = new Parser(new Lexer(page));
    NodeList allNodes = parser.parse(null);
    // 'true' requests a recursive search through child nodes.
    return allNodes.extractAllNodesThatMatch(filter, true);
}
@Test public void testLinkExtraction() throws ParserException { Parser parser = new Parser("http://synyx.de"); ObjectFindingVisitor visitor = new ObjectFindingVisitor(LinkTag.class); parser.visitAllNodesWith(visitor); Node[] links = visitor.getTags(); // TODO this could use some more meaningful assertions assertTrue(links.length > 0); for (int i = 0; i < links.length; i++) { LinkTag linkTag = (LinkTag) links[i]; System.out.print("\"" + linkTag.getLinkText() + "\" => "); System.out.println(linkTag.getLink()); } } }