/** * Parse HTML source. * * @return a document handler containing the parsed source */ private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config, NormalizingTagBalancer tagBalancer) throws IOException { HTMLScanner htmlScanner = new HTMLScanner(); tagBalancer.setScanner(htmlScanner); DocumentHandler handler = newDocumentHandler(source); NamespaceBinder namespaceBinder = new NamespaceBinder(); namespaceBinder.setDocumentHandler(handler); namespaceBinder.setDocumentSource(tagBalancer); namespaceBinder.reset(config); tagBalancer.setDocumentHandler(namespaceBinder); // Order of filter is Scanner -> OSMLFilter -> Tag Balancer tagBalancer.setDocumentSource(htmlScanner); htmlScanner.setDocumentHandler(tagBalancer); tagBalancer.reset(config); htmlScanner.reset(config); XMLInputSource inputSource = new XMLInputSource(null, null, null); inputSource.setEncoding("UTF-8"); inputSource.setCharacterStream(new StringReader(source)); htmlScanner.setInputSource(inputSource); htmlScanner.scanDocument(true); return handler; }
/** * Parse HTML source. * * @return a document handler containing the parsed source */ private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config, NormalizingTagBalancer tagBalancer) throws IOException { HTMLScanner htmlScanner = new HTMLScanner(); tagBalancer.setScanner(htmlScanner); DocumentHandler handler = newDocumentHandler(source); NamespaceBinder namespaceBinder = new NamespaceBinder(); namespaceBinder.setDocumentHandler(handler); namespaceBinder.setDocumentSource(tagBalancer); namespaceBinder.reset(config); tagBalancer.setDocumentHandler(namespaceBinder); // Order of filter is Scanner -> OSMLFilter -> Tag Balancer tagBalancer.setDocumentSource(htmlScanner); htmlScanner.setDocumentHandler(tagBalancer); tagBalancer.reset(config); htmlScanner.reset(config); XMLInputSource inputSource = new XMLInputSource(null, null, null); inputSource.setEncoding("UTF-8"); inputSource.setCharacterStream(new StringReader(source)); htmlScanner.setInputSource(inputSource); htmlScanner.scanDocument(true); return handler; }
/** * Parse HTML source. * * @return a document handler containing the parsed source */ private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config, NormalizingTagBalancer tagBalancer) throws IOException { HTMLScanner htmlScanner = new HTMLScanner(); tagBalancer.setScanner(htmlScanner); DocumentHandler handler = newDocumentHandler(source); NamespaceBinder namespaceBinder = new NamespaceBinder(); namespaceBinder.setDocumentHandler(handler); namespaceBinder.setDocumentSource(tagBalancer); namespaceBinder.reset(config); tagBalancer.setDocumentHandler(namespaceBinder); // Order of filter is Scanner -> OSMLFilter -> Tag Balancer tagBalancer.setDocumentSource(htmlScanner); htmlScanner.setDocumentHandler(tagBalancer); tagBalancer.reset(config); htmlScanner.reset(config); XMLInputSource inputSource = new XMLInputSource(null, null, null); inputSource.setEncoding("UTF-8"); inputSource.setCharacterStream(new StringReader(source)); htmlScanner.setInputSource(inputSource); htmlScanner.scanDocument(true); return handler; }