/**
 * Parses {@code source} into a DOM document.
 *
 * <p>The source is run through {@code parseHtmlImpl} with a fresh configuration and a
 * new {@code NormalizingTagBalancer}. The first child named "html" found in the
 * handler's fragment is then appended to the handler's document, and
 * {@code fixNekoWeirdness} is applied to the document before it is returned.
 *
 * @param source HTML text to parse
 * @return the resulting document, or {@code null} when parsing raised an
 *     {@link IOException}
 * @throws GadgetException declared by the overridden signature
 */
@Override
protected Document parseDomImpl(String source) throws GadgetException {
  HTMLConfiguration nekoConfig = newConfiguration();

  DocumentHandler parsed;
  try {
    parsed = parseHtmlImpl(source, nekoConfig, new NormalizingTagBalancer());
  } catch (IOException ioe) {
    // An I/O failure during parsing is reported to the caller as "no document".
    return null;
  }

  Document result = parsed.getDocument();
  result.appendChild(
      DomUtil.getFirstNamedChildNode(parsed.getFragment(), "html"));
  fixNekoWeirdness(result);
  return result;
}
/** * Parse HTML source. * * @return a document handler containing the parsed source */ private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config, NormalizingTagBalancer tagBalancer) throws IOException { HTMLScanner htmlScanner = new HTMLScanner(); tagBalancer.setScanner(htmlScanner); DocumentHandler handler = newDocumentHandler(source); NamespaceBinder namespaceBinder = new NamespaceBinder(); namespaceBinder.setDocumentHandler(handler); namespaceBinder.setDocumentSource(tagBalancer); namespaceBinder.reset(config); tagBalancer.setDocumentHandler(namespaceBinder); // Order of filter is Scanner -> OSMLFilter -> Tag Balancer tagBalancer.setDocumentSource(htmlScanner); htmlScanner.setDocumentHandler(tagBalancer); tagBalancer.reset(config); htmlScanner.reset(config); XMLInputSource inputSource = new XMLInputSource(null, null, null); inputSource.setEncoding("UTF-8"); inputSource.setCharacterStream(new StringReader(source)); htmlScanner.setInputSource(inputSource); htmlScanner.scanDocument(true); return handler; }
/** * Parse HTML source. * * @return a document handler containing the parsed source */ private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config, NormalizingTagBalancer tagBalancer) throws IOException { HTMLScanner htmlScanner = new HTMLScanner(); tagBalancer.setScanner(htmlScanner); DocumentHandler handler = newDocumentHandler(source); NamespaceBinder namespaceBinder = new NamespaceBinder(); namespaceBinder.setDocumentHandler(handler); namespaceBinder.setDocumentSource(tagBalancer); namespaceBinder.reset(config); tagBalancer.setDocumentHandler(namespaceBinder); // Order of filter is Scanner -> OSMLFilter -> Tag Balancer tagBalancer.setDocumentSource(htmlScanner); htmlScanner.setDocumentHandler(tagBalancer); tagBalancer.reset(config); htmlScanner.reset(config); XMLInputSource inputSource = new XMLInputSource(null, null, null); inputSource.setEncoding("UTF-8"); inputSource.setCharacterStream(new StringReader(source)); htmlScanner.setInputSource(inputSource); htmlScanner.scanDocument(true); return handler; }
/**
 * Starts the document via the superclass, then inspects every entry currently on the
 * element stack to record whether anything at all, and specifically an HTML, HEAD or
 * BODY element, has already been seen.
 */
@Override
public void startDocument(XMLLocator locator, String encoding,
    NamespaceContext nscontext, Augmentations augs) throws XNIException {
  super.startDocument(locator, encoding, nscontext, augs);

  // Walk the stack from its top entry down to index 0.
  int idx = fElementStack.top;
  while (--idx >= 0) {
    // Any entry at all means something has been seen.
    fSeenAnything = true;

    if (HTMLElements.HTML == fElementStack.data[idx].element.code) {
      fSeenRootElement = true;
    }
    if (HTMLElements.HEAD == fElementStack.data[idx].element.code) {
      fSeenHeadElement = true;
    }
    if (HTMLElements.BODY == fElementStack.data[idx].element.code) {
      fSeenBodyElement = true;
    }
  }
}
} // closes the enclosing scope that was opened before this method
/**
 * Starts the document via the superclass, then inspects every entry currently on the
 * element stack to record whether anything at all, and specifically an HTML, HEAD or
 * BODY element, has already been seen.
 */
@Override
public void startDocument(XMLLocator locator, String encoding,
    NamespaceContext nscontext, Augmentations augs) throws XNIException {
  super.startDocument(locator, encoding, nscontext, augs);

  // Walk the stack from its top entry down to index 0.
  int idx = fElementStack.top;
  while (--idx >= 0) {
    // Any entry at all means something has been seen.
    fSeenAnything = true;

    if (HTMLElements.HTML == fElementStack.data[idx].element.code) {
      fSeenRootElement = true;
    }
    if (HTMLElements.HEAD == fElementStack.data[idx].element.code) {
      fSeenHeadElement = true;
    }
    if (HTMLElements.BODY == fElementStack.data[idx].element.code) {
      fSeenBodyElement = true;
    }
  }
}
} // closes the enclosing scope that was opened before this method
/** * Parse HTML source. * * @return a document handler containing the parsed source */ private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config, NormalizingTagBalancer tagBalancer) throws IOException { HTMLScanner htmlScanner = new HTMLScanner(); tagBalancer.setScanner(htmlScanner); DocumentHandler handler = newDocumentHandler(source); NamespaceBinder namespaceBinder = new NamespaceBinder(); namespaceBinder.setDocumentHandler(handler); namespaceBinder.setDocumentSource(tagBalancer); namespaceBinder.reset(config); tagBalancer.setDocumentHandler(namespaceBinder); // Order of filter is Scanner -> OSMLFilter -> Tag Balancer tagBalancer.setDocumentSource(htmlScanner); htmlScanner.setDocumentHandler(tagBalancer); tagBalancer.reset(config); htmlScanner.reset(config); XMLInputSource inputSource = new XMLInputSource(null, null, null); inputSource.setEncoding("UTF-8"); inputSource.setCharacterStream(new StringReader(source)); htmlScanner.setInputSource(inputSource); htmlScanner.scanDocument(true); return handler; }
/**
 * Starts the document via the superclass, then inspects every entry currently on the
 * element stack to record whether anything at all, and specifically an HTML, HEAD or
 * BODY element, has already been seen.
 */
@Override
public void startDocument(XMLLocator locator, String encoding,
    NamespaceContext nscontext, Augmentations augs) throws XNIException {
  super.startDocument(locator, encoding, nscontext, augs);

  // Walk the stack from its top entry down to index 0.
  int idx = fElementStack.top;
  while (--idx >= 0) {
    // Any entry at all means something has been seen.
    fSeenAnything = true;

    if (HTMLElements.HTML == fElementStack.data[idx].element.code) {
      fSeenRootElement = true;
    }
    if (HTMLElements.HEAD == fElementStack.data[idx].element.code) {
      fSeenHeadElement = true;
    }
    if (HTMLElements.BODY == fElementStack.data[idx].element.code) {
      fSeenBodyElement = true;
    }
  }
}
} // closes the enclosing scope that was opened before this method
/**
 * Parses {@code source} into a DOM document.
 *
 * <p>The source is run through {@code parseHtmlImpl} with a fresh configuration and a
 * new {@code NormalizingTagBalancer}. The first child named "html" found in the
 * handler's fragment is then appended to the handler's document, and
 * {@code fixNekoWeirdness} is applied to the document before it is returned.
 *
 * @param source HTML text to parse
 * @return the resulting document, or {@code null} when parsing raised an
 *     {@link IOException}
 * @throws GadgetException declared by the overridden signature
 */
@Override
protected Document parseDomImpl(String source) throws GadgetException {
  HTMLConfiguration nekoConfig = newConfiguration();

  DocumentHandler parsed;
  try {
    parsed = parseHtmlImpl(source, nekoConfig, new NormalizingTagBalancer());
  } catch (IOException ioe) {
    // An I/O failure during parsing is reported to the caller as "no document".
    return null;
  }

  Document result = parsed.getDocument();
  result.appendChild(
      DomUtil.getFirstNamedChildNode(parsed.getFragment(), "html"));
  fixNekoWeirdness(result);
  return result;
}
/**
 * Parses {@code source} into a DOM document.
 *
 * <p>The source is run through {@code parseHtmlImpl} with a fresh configuration and a
 * new {@code NormalizingTagBalancer}. The first child named "html" found in the
 * handler's fragment is then appended to the handler's document, and
 * {@code fixNekoWeirdness} is applied to the document before it is returned.
 *
 * @param source HTML text to parse
 * @return the resulting document, or {@code null} when parsing raised an
 *     {@link IOException}
 * @throws GadgetException declared by the overridden signature
 */
@Override
protected Document parseDomImpl(String source) throws GadgetException {
  HTMLConfiguration nekoConfig = newConfiguration();

  DocumentHandler parsed;
  try {
    parsed = parseHtmlImpl(source, nekoConfig, new NormalizingTagBalancer());
  } catch (IOException ioe) {
    // An I/O failure during parsing is reported to the caller as "no document".
    return null;
  }

  Document result = parsed.getDocument();
  result.appendChild(
      DomUtil.getFirstNamedChildNode(parsed.getFragment(), "html"));
  fixNekoWeirdness(result);
  return result;
}