/**
 * Creates a tokenizer that reads its content from the given reader.
 *
 * @param cleaner the cleaner this tokenizer belongs to; its properties and
 *        transformations are looked up once here and kept on the tokenizer
 * @param reader source of the content; it is wrapped in a {@link BufferedReader}
 * @param cleanTimeValues per-clean state object, stored as given
 */
public HtmlTokenizer(HtmlCleaner cleaner, Reader reader, final CleanTimeValues cleanTimeValues) {
    // Input side.
    this._reader = new BufferedReader(reader);
    this.cleanTimeValues = cleanTimeValues;

    // Configuration borrowed from the owning cleaner.
    this.cleaner = cleaner;
    this.props = cleaner.getProperties();
    this.transformations = cleaner.getTransformations();
}
/**
 * Builds a {@link Serializer} by reflectively invoking a constructor of the
 * injected serializer class with the configured argument array.
 * <p>
 * When no arguments have been configured yet, the array is initialised to the
 * properties of the supplied cleaner and kept for later calls.
 *
 * @param cleaner cleaner whose properties are used as the default constructor argument
 * @return the newly constructed serializer
 * @throws Exception if the constructor cannot be found or invoked
 */
public Serializer createSerializer(HtmlCleaner cleaner) throws Exception {
    if (null == arguments) {
        // Default to a single-argument constructor taking the cleaner's properties.
        arguments = new Object[] { cleaner.getProperties() };
    }

    final Object instance = ConstructorUtils.invokeConstructor(serializerClass, arguments);
    return (Serializer) instance;
}
}
/**
 * Creates a tokenizer that reads its content from the given reader.
 *
 * @param cleaner the cleaner this tokenizer belongs to; its properties and
 *        transformations are looked up once here and kept on the tokenizer
 * @param reader source of the content; it is wrapped in a {@link BufferedReader}
 * @param cleanTimeValues per-clean state object, stored as given
 */
public HtmlTokenizer(HtmlCleaner cleaner, Reader reader, final CleanTimeValues cleanTimeValues) {
    // Input side.
    this._reader = new BufferedReader(reader);
    this.cleanTimeValues = cleanTimeValues;

    // Configuration borrowed from the owning cleaner.
    this.cleaner = cleaner;
    this.props = cleaner.getProperties();
    this.transformations = cleaner.getTransformations();
}
/**
 * Builds a new {@code HtmlCleaner} that prunes {@code script} and {@code style}
 * tags, omits unknown tags, treats unknown tags as content, and does not use
 * CDATA for script and style content.
 *
 * @return a freshly configured cleaner
 */
private HtmlCleaner getHtmlCleaner() {
    final HtmlCleaner configured = new HtmlCleaner();

    // Tag handling.
    configured.getProperties().setPruneTags("script,style");
    configured.getProperties().setOmitUnknownTags(true);
    configured.getProperties().setTreatUnknownTagsAsContent(true);

    // Output formatting.
    configured.getProperties().setUseCdataForScriptAndStyle(false);

    return configured;
}
// Open the page with a browser-like User-Agent header.
URL urlSB = new URL("https://www.groupon.com/browse/chicago?z=skip");
URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:5.0) Gecko/20100101 Firefox/25.0");
urlConnection.connect();

// Configure the cleaner before parsing; namespace awareness is switched off.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);

// Close the response stream even when cleaning fails part-way through.
// (Fully qualified so this fragment needs no additional import.)
TagNode tagNodeRoot;
try (java.io.InputStream pageStream = urlConnection.getInputStream()) {
    tagNodeRoot = cleaner.clean(pageStream);
}

// Serialize the cleaned tree as pretty-printed XML.
new PrettyXmlSerializer(props).writeToFile(tagNodeRoot, "cleaned.xml", "utf-8");
// Open the page with a browser-like User-Agent header.
final URL urlSB = new URL("http://www.groupon.com/browse/chicago?z=skip");
final URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0");
urlConnection.connect();

// Configure the cleaner before parsing; namespace awareness is switched off.
final HtmlCleaner cleaner = new HtmlCleaner();
final CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);

// Close the response stream even when cleaning fails part-way through.
// (Fully qualified so this fragment needs no additional import.)
final TagNode tagNodeRoot;
try (java.io.InputStream pageStream = urlConnection.getInputStream()) {
    tagNodeRoot = cleaner.clean(pageStream);
}
// Destination for the serialized markup.
StringWriter writer = new StringWriter();

// Clean the literal input into a TagNode tree.
final HtmlCleaner cleaner = new HtmlCleaner();
TagNode node = cleaner.clean("hello world");

// Serialize the tree with a serializer built from the cleaner's own properties.
final CleanerProperties properties = cleaner.getProperties();
final Serializer serializer = new SimpleHtmlSerializer(properties);
serializer.write(node, writer, "UTF-8");

// println(Object) prints writer.toString().
System.out.println(writer);
/**
 * Sets the {@link CleanerTransformations} on the properties of the underlying
 * <code>HtmlCleaner</code>.
 * <p>
 * The cleaner is obtained through {@code getHtmlCleaner()} and the value is
 * passed straight through to its properties object.
 *
 * @param cleanerTransformations the transformations to store on the cleaner's properties
 */
public void setCleanerTransformations(CleanerTransformations cleanerTransformations) {
    getHtmlCleaner().getProperties().setCleanerTransformations(cleanerTransformations);
}
/**
 * Factory method for the <code>HtmlCleaner</code> instance.
 * <p>
 * The returned cleaner has these properties preset:
 * <ul>
 * <li>omitDoctypeDeclaration : true</li>
 * <li>omitXmlDeclaration : true</li>
 * </ul>
 *
 * @return a new cleaner with the defaults listed above applied
 */
protected HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner created = new HtmlCleaner();
    created.getProperties().setOmitDoctypeDeclaration(true);
    created.getProperties().setOmitXmlDeclaration(true);
    return created;
}
}
/**
 * Looks up the {@link CleanerTransformations} held by the properties of the
 * underlying <code>HtmlCleaner</code>.
 *
 * @return whatever the cleaner's properties currently report as their transformations
 */
public CleanerTransformations getCleanerTransformations() {
    final CleanerTransformations current =
            getHtmlCleaner().getProperties().getCleanerTransformations();
    return current;
}
private class cleanHtml extends AsyncTask<Void, Void, Void>{ @Override protected Void doInBackground(Void... arg0) { try { HtmlCleaner cleaner = new HtmlCleaner(); String url = "https://www.easistent.com/urniki/263/razredi/16515"; TagNode node = cleaner.clean(new URL(url)); CleanerProperties props = cleaner.getProperties(); String fileName = Environment.getExternalStorageDirectory().getPath() + "/Android/data/com.whizzapps.stpsurniki/cleaned.html"; new PrettyXmlSerializer(props).writeToFile(node, fileName, "utf-8"); Log.i("TAG", "AsyncTask done!"); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } }
// Clean the markup held in `html` into a TagNode tree.
HtmlCleaner cleaner = new HtmlCleaner();
TagNode node = cleaner.clean(html);
// Build a DOM serializer from the same cleaner's properties and convert the
// TagNode tree into a Document.
DomSerializer ser = new DomSerializer(cleaner.getProperties());
Document myW3cDoc = ser.createDOM(node);
/**
 * Creates and configures the shared cleaner the first time it is called;
 * later calls return immediately. The method is synchronized on the class.
 * <p>
 * The cleaner is set up to omit comments and the XML declaration.
 */
private static synchronized void initCleaner() {
    if (htmlCleanerInitialized) {
        return;
    }

    cleaner = new HtmlCleaner();
    final CleanerProperties settings = cleaner.getProperties();
    settings.setOmitXmlDeclaration(true);
    settings.setOmitComments(true);

    // Flip the flag last, once the cleaner is fully configured.
    htmlCleanerInitialized = true;
}
// Create a cleaner and switch off namespace awareness before parsing.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
// Clean the markup held in `htmlString` into a TagNode tree.
TagNode mainNode = cleaner.clean(htmlString);
/**
 * Cleans an HTML string with HtmlCleaner and converts the result to a
 * {@link Document}.
 * <p>
 * The cleaner allows HTML inside attributes and multi-word attributes,
 * recognizes unicode characters and omits comments. The same configured
 * properties are used for both cleaning and DOM serialization.
 *
 * @param source the HTML markup to clean
 * @return the resulting document, or {@code null} if DOM creation fails with a
 *         {@link ParserConfigurationException}
 * @throws InterruptedException declared for callers; not thrown by this body
 * @throws IOException declared for callers
 */
public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);

    try {
        // Use the configured cleaner and its properties; previously a second,
        // unconfigured cleaner and a blank CleanerProperties were used here,
        // so none of the settings above took effect.
        TagNode tagNode = cleaner.clean(source);
        return new DomSerializer(props).createDOM(tagNode);
    } catch (ParserConfigurationException ex) {
        ex.printStackTrace();
        return null;
    }
}
/**
 * Converts a value to its string form.
 * <ul>
 * <li>{@code null} becomes the empty string;</li>
 * <li>a {@code TagNode} is serialized with a {@code PrettyHtmlSerializer}
 *     built from the given cleaner's properties, after setting
 *     omitXmlDeclaration to true on those properties;</li>
 * <li>anything else goes through {@code TypeUtils.castToString}.</li>
 * </ul>
 *
 * @param text value to convert, may be {@code null}
 * @param htmlCleaner cleaner whose properties drive serialization of tag nodes
 * @return the string form of {@code text}
 */
private String wrap(Object text, HtmlCleaner htmlCleaner) {
    if (text == null) {
        return "";
    }
    if (!(text instanceof TagNode)) {
        return TypeUtils.castToString(text);
    }

    final CleanerProperties props = htmlCleaner.getProperties();
    props.setOmitXmlDeclaration(true);
    final HtmlSerializer serializer = new PrettyHtmlSerializer(props);
    return serializer.getAsString((TagNode) text);
}
}
/**
 * Creates the function with its own {@code HtmlCleaner}, configured to omit
 * comments, translate special entities, emit reserved characters as numeric
 * character references, and prune a fixed list of tags.
 */
public CleanHtmlFunction() {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    final CleanerProperties settings = htmlCleaner.getProperties();

    // remove all tags that contain uninteresting content
    settings.setPruneTags("style,script,form,object,audio,video");

    settings.setOmitComments(true);
    settings.setTransResCharsToNCR(true);
    settings.setTranslateSpecialEntities(true);

    this.cleaner = htmlCleaner;
}
/**
 * Cleans the given HTML and reports whether the cleaned result is empty, as
 * decided by {@code isEmpty}.
 * <p>
 * The cleaner omits the XML declaration, the HTML envelope and comments, is
 * not namespace aware, and deserializes entities.
 *
 * @param html the markup to validate
 * @return a mutable set containing {@code ValidatorMessages.HTML_IS_EMPTY}
 *         when the cleaned markup is empty, otherwise an empty set
 */
public Set<String> validateNonEmpty(String html) {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    final CleanerProperties settings = htmlCleaner.getProperties();
    settings.setOmitXmlDeclaration(true);
    settings.setOmitHtmlEnvelope(true);
    settings.setOmitComments(true);
    settings.setNamespacesAware(false);
    settings.setDeserializeEntities(true);

    final Set<String> messages = new HashSet<>();
    final boolean nothingLeft = isEmpty(htmlCleaner.clean(html));
    if (nothingLeft) {
        messages.add(ValidatorMessages.HTML_IS_EMPTY);
    }
    return messages;
}
// Configure the cleaner: allow HTML inside attributes and multi-word
// attributes, recognize unicode characters, and omit comments.
final HtmlCleaner mCleaner = new HtmlCleaner();
CleanerProperties props = mCleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);

/* URL from which the data is fetched, and the XPath selecting the wanted nodes */
String mSiteUrl="http://www.example.com";
String mXPath="//div";

// TagNode holding the cleaned document received from the URL
final TagNode mGetDataFromUrl;

// Establish the connection and clean the response
URL url=new URL(mSiteUrl);
final URLConnection mCCon=url.openConnection();
mGetDataFromUrl=mCleaner.clean(new InputStreamReader(mCCon.getInputStream()));

// Evaluate the XPath against the cleaned document
Object[] mPageData=mGetDataFromUrl.evaluateXPath(mXPath);

// Only read the first match when the XPath returned something
if(mPageData.length>0) {
    TagNode mXPathParsedData = (TagNode) mPageData[0];
    // All text in the div is in mData. getText() does not return a String,
    // so convert before trimming.
    String mData=mXPathParsedData.getText().toString().trim();
}
private void init() { // Initialize HTMLCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); props.setNamespacesAware(false); // Initialize DomSerializer domSerializer = new DomSerializer(props); // Initialize xml parser try { DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); documentBuilder = documentBuilderFactory.newDocumentBuilder(); } catch (ParserConfigurationException e) { // THIS CAN NEVER HAPPEN } }