/**
 * Collect the text of a page by running this object as a visitor over
 * every node the parser yields.
 * The shared buffer is swapped for a new one (initial capacity 4096)
 * once its contents have been captured.
 * @return Whatever the buffer held after the traversal.
 * @exception ParserException If the traversal reports a parse error.
 */
protected String extractStrings ()
    throws
        ParserException
{
    mParser.visitAllNodesWith (this);

    // capture the collected text before the buffer is replaced
    final String collected = mBuffer.toString ();
    mBuffer = new StringBuilder (4096);

    return collected;
}
/**
 * Collect the text of a page by running this object as a visitor over
 * every node the parser yields.
 * The collapse state is set back to zero before the traversal, and the
 * shared buffer is swapped for a new one (initial capacity 4096) once
 * its contents have been captured.
 * @return Whatever the buffer held after the traversal.
 * @exception ParserException If the traversal reports a parse error.
 */
protected String extractStrings ()
    throws
        ParserException
{
    // start every extraction from a clean collapse state
    mCollapseState = 0;
    mParser.visitAllNodesWith (this);

    // capture the collected text before the buffer is replaced
    final String collected = mBuffer.toString ();
    mBuffer = new StringBuffer (4096);

    return collected;
}
/**
 * Parses the resource at the given URL and returns the content gathered
 * by a {@code PageContentVisitor} while it walks all nodes.
 *
 * @param url the address handed to the parser and to the visitor
 * @return the content reported by the visitor after the traversal
 * @throws IllegalStateException wrapping any {@code ParserException}
 *         raised while creating the parser or visiting the nodes
 */
@Override
public PageContent fetchPageContent(String url) {
    logger.debug("Fetching {}", url);

    try {
        final Parser pageParser = new Parser(url);
        final PageContentVisitor contentCollector = new PageContentVisitor(baseUrl, url);

        pageParser.visitAllNodesWith(contentCollector);

        return contentCollector.getContent();
    } catch (ParserException parseFailure) {
        // callers see an unchecked exception; the original failure stays attached as cause
        throw new IllegalStateException(parseFailure);
    }
}
@Override public void processFile(BufferedReader infile, BufferedWriter outfile, FilterContext fc) throws IOException, TranslationException { StringBuilder all = null; try { all = new StringBuilder(); char[] cbuf = new char[1000]; int len = -1; while ((len = infile.read(cbuf)) > 0) { all.append(cbuf, 0, len); } } catch (OutOfMemoryError e) { // out of memory? all = null; System.gc(); throw new IOException(OStrings.getString("HHC__FILE_TOO_BIG")); } Parser parser = new Parser(); try { parser.setInputHTML(all.toString()); parser.visitAllNodesWith(new HHCFilterVisitor(this, outfile)); } catch (ParserException pe) { System.out.println(pe); } }
/**
 * Builds the cleaned text for the given HTML.
 * The HTML is parsed and walked with a {@code PostCleanerVisitor}; if the
 * parser reports an error, the text comes from
 * {@code PostCleanerVisitor.simpleProc} instead.
 *
 * @param html          markup to clean
 * @param minCodeChars  passed through to the visitor
 * @param excludeCode   passed through to the visitor
 */
public PostCleaner(String html, int minCodeChars, boolean excludeCode) {
    String cleaned;

    try {
        PostCleanerVisitor textVisitor;
        Parser markupParser = Parser.createParser(html, "utf8");

        textVisitor = new PostCleanerVisitor(minCodeChars, excludeCode);
        markupParser.visitAllNodesWith(textVisitor);

        cleaned = textVisitor.getText();
    } catch (ParserException e) {
        System.err.println(" Parser exception: " + e + " trying simple conversion");
        // fallback path: plain conversion without the parser
        cleaned = PostCleanerVisitor.simpleProc(html);
    }

    mText = cleaned;
}
mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); mBuffer = new StringBuffer (4096); mCollapseState = 0; mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ());
mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ()); mParser.visitAllNodesWith (this); updateStrings (mBuffer.toString ());
parser.visitAllNodesWith(this);
/** * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String) */ public String process(String html, String encoding) throws ParserException { m_result = new StringBuffer(); Parser parser = new Parser(); Lexer lexer = new Lexer(); // initialize the page with the given char set Page page = new Page(html, encoding); lexer.setPage(page); parser.setLexer(lexer); if ((m_noAutoCloseTags != null) && (m_noAutoCloseTags.size() > 0)) { // Degrade Composite tags that do have children in the DOM tree // to simple single tags: This allows to finish this tag with opened HTML tags without the effect // that html parser will generate the closing tags. PrototypicalNodeFactory factory = configureNoAutoCorrectionTags(); lexer.setNodeFactory(factory); } // process the page using the given visitor parser.visitAllNodesWith(this); // return the result return getResult(); }
/** * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String) */ public String process(String html, String encoding) throws ParserException { m_result = new StringBuffer(); Parser parser = new Parser(); Lexer lexer = new Lexer(); // initialize the page with the given char set Page page = new Page(html, encoding); lexer.setPage(page); parser.setLexer(lexer); if (m_noAutoCloseTags != null && m_noAutoCloseTags.size() > 0) { // Degrade Composite tags that do have children in the DOM tree // to simple single tags: This allows to finish this tag with opened HTML tags without the effect // that html parser will generate the closing tags. PrototypicalNodeFactory factory = configureNoAutoCorrectionTags(); lexer.setNodeFactory(factory); } // process the page using the given visitor parser.visitAllNodesWith(this); // return the result return getResult(); }
HtmlPage page = new HtmlPage(parser); try { parser.visitAllNodesWith(page); } catch (ParserException e) { log.error("visit page error:", e);
parser.setLexer(lexer); parser.visitAllNodesWith(this);
/**
 * Returns the plain text found in an HTML document.<p>
 *
 * @param in stream delivering the HTML content
 * @param encoding the character encoding of that content
 *
 * @return the text collected from the page, or an empty String if none was produced
 * @throws ParserException if the HTML could not be parsed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {

    Parser htmlParser = new Parser();
    Lexer htmlLexer = new Lexer();
    htmlLexer.setPage(new Page(in, encoding));
    htmlParser.setLexer(htmlLexer);

    StringBean textCollector = new StringBean();
    htmlParser.visitAllNodesWith(textCollector);

    String text = textCollector.getStrings();
    if (text == null) {
        // callers always receive a String, never null
        return "";
    }
    return text;
}
parser.visitAllNodesWith(this);
/**
 * Pulls the textual content out of an HTML page.<p>
 *
 * @param in the input stream providing the HTML
 * @param encoding the encoding used to read the stream
 *
 * @return the page text; an empty String stands in for a <code>null</code> result
 * @throws ParserException if parsing the HTML fails
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {

    final Parser pageParser = new Parser();
    final Lexer pageLexer = new Lexer();
    final Page sourcePage = new Page(in, encoding);

    pageLexer.setPage(sourcePage);
    pageParser.setLexer(pageLexer);

    // the bean gathers the text while the parser walks the nodes
    final StringBean collector = new StringBean();
    pageParser.visitAllNodesWith(collector);

    final String extracted = collector.getStrings();
    return (extracted != null) ? extracted : "";
}
/**
 * Extract the text from a HTML page.<p>
 *
 * @param in the html content input stream
 * @param encoding the encoding of the content
 *
 * @return the extracted text from the page, never <code>null</code>
 *         (an empty String is returned when no text was collected)
 * @throws ParserException if the parsing of the HTML failed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {

    Parser parser = new Parser();
    Lexer lexer = new Lexer();
    Page page = new Page(in, encoding);
    lexer.setPage(page);
    parser.setLexer(lexer);

    StringBean stringBean = new StringBean();
    parser.visitAllNodesWith(stringBean);

    // guard against a null result so callers can use the return value without a null check
    String result = stringBean.getStrings();
    return result == null ? "" : result;
}
@Test public void testLinkExtraction() throws ParserException { Parser parser = new Parser("http://synyx.de"); ObjectFindingVisitor visitor = new ObjectFindingVisitor(LinkTag.class); parser.visitAllNodesWith(visitor); Node[] links = visitor.getTags(); // TODO this could use some more meaningful assertions assertTrue(links.length > 0); for (int i = 0; i < links.length; i++) { LinkTag linkTag = (LinkTag) links[i]; System.out.print("\"" + linkTag.getLinkText() + "\" => "); System.out.println(linkTag.getLink()); } } }
parser.visitAllNodesWith(visitor);
parser.visitAllNodesWith(visitor);