/**
 * Parses an HTML string with HtmlCleaner and converts the result into a W3C DOM document.
 *
 * <p>The cleaner is configured with the allowHtmlInsideAttributes, allowMultiWordAttributes,
 * recognizeUnicodeChars and omitComments options switched on, and the same properties are
 * handed to the {@code DomSerializer} that builds the DOM.
 *
 * @param source the HTML markup to parse
 * @return the DOM built from {@code source}, or {@code null} if the serializer throws a
 *         {@link ParserConfigurationException} (the stack trace is printed in that case)
 * @throws InterruptedException declared for callers; kept for interface compatibility
 * @throws IOException declared for callers; kept for interface compatibility
 */
public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);

    try {
        // Clean with the configured instance and serialize with the same properties;
        // creating a fresh HtmlCleaner/CleanerProperties here would silently discard
        // every option set above.
        TagNode tagNode = cleaner.clean(source);
        return new DomSerializer(props).createDOM(tagNode);
    } catch (ParserConfigurationException ex) {
        ex.printStackTrace();
        return null;
    }
}
// Switch on four options on the cleaner properties object cp:
// allowHtmlInsideAttributes, allowMultiWordAttributes, recognizeUnicodeChars and omitComments.
cp.setAllowHtmlInsideAttributes(true); cp.setAllowMultiWordAttributes(true); cp.setRecognizeUnicodeChars(true); cp.setOmitComments(true);
public static String snapFromHtmlWithCookies(Context context, String xPath, String attrToSnap, String urlString, String cookies) throws IOException, XPatherException { String snap = ""; // create an instance of HtmlCleaner HtmlCleaner cleaner = new HtmlCleaner(); // take default cleaner properties CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); URL url = new URL(urlString); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setDoOutput(true); // optional cookies connection.setRequestProperty(context.getString(R.string.cookie_prefix), cookies); connection.connect(); // use the cleaner to "clean" the HTML and return it as a TagNode object TagNode root = cleaner.clean(new InputStreamReader(connection.getInputStream())); Object[] foundNodes = root.evaluateXPath(xPath); if (foundNodes.length > 0) { TagNode foundNode = (TagNode) foundNodes[0]; snap = foundNode.getAttributeByName(attrToSnap); } return snap; }
// Switch on the recognizeUnicodeChars and omitComments options on the cleaner properties.
props.setRecognizeUnicodeChars(true); props.setOmitComments(true);
// Fetch a page, clean it with HtmlCleaner and read the text of the first node matching an XPath.
final HtmlCleaner mCleaner = new HtmlCleaner();
CleanerProperties props = mCleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);

/* URL from where the data is to be fetched */
String mSiteUrl = "http://www.example.com";
String mXPath = "//div";

// Establish connection
URL url = new URL(mSiteUrl);
final URLConnection mCCon = url.openConnection();

// TagNode for storing the data received from the URL; the reader is closed once
// parsing is done so the underlying stream is not leaked
final TagNode mGetDataFromUrl;
InputStreamReader mReader = new InputStreamReader(mCCon.getInputStream());
try {
    mGetDataFromUrl = mCleaner.clean(mReader);
} finally {
    mReader.close();
}

// Evaluate the XPath that selects the data to retrieve
Object[] mPageData = mGetDataFromUrl.evaluateXPath(mXPath);

// Only read the result when the XPath matched something
if (mPageData.length > 0) {
    TagNode mXPathParsedData = (TagNode) mPageData[0];
    // all text in the matched div is in mData
    String mData = mXPathParsedData.getText().trim();
}
private void init() { // Initialize HTMLCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); props.setNamespacesAware(false); // Initialize DomSerializer domSerializer = new DomSerializer(props); // Initialize xml parser try { DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); documentBuilder = documentBuilderFactory.newDocumentBuilder(); } catch (ParserConfigurationException e) { // THIS CAN NEVER HAPPEN } }
// Switch on the allowHtmlInsideAttributes, allowMultiWordAttributes, recognizeUnicodeChars
// and omitComments options on the cleaner properties, then open a try block whose body
// continues beyond this fragment.
props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); try {
// Set the recognizeUnicodeChars option from unicodeChars, passed through toBoolean
// (helper defined elsewhere; presumably converts a configured value to boolean — confirm).
props.setRecognizeUnicodeChars( toBoolean(unicodeChars) );
// Set the recognizeUnicodeChars option from unicodeChars, passed through toBoolean
// (helper defined elsewhere; presumably converts a configured value to boolean — confirm).
props.setRecognizeUnicodeChars(toBoolean(unicodeChars));
// Copy this object's configuration fields onto the cleaner properties:
// usecdata, specialentities, unicodechars, omitunknowntags and treatunknowntagsascontent
// each feed the property setter of the corresponding name.
props.setUseCdataForScriptAndStyle(this.usecdata); props.setTranslateSpecialEntities(this.specialentities); props.setRecognizeUnicodeChars(this.unicodechars); props.setOmitUnknownTags(this.omitunknowntags); props.setTreatUnknownTagsAsContent(this.treatunknowntagsascontent);
// Configure the cleaner properties: allowHtmlInsideAttributes is switched off here, while
// allowMultiWordAttributes, recognizeUnicodeChars and omitComments are switched on.
props.setAllowHtmlInsideAttributes(false); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true);
// Configure the cleaner properties: omit the doctype and XML declarations, switch off
// recognizeUnicodeChars and translateSpecialEntities, and switch on advancedXmlEscape.
props.setOmitDoctypeDeclaration(true); props.setOmitXmlDeclaration(true); props.setRecognizeUnicodeChars(false); props.setAdvancedXmlEscape(true); props.setTranslateSpecialEntities(false);
// Take the properties object of the existing cleaner and adjust it: switch off
// useCdataForScriptAndStyle, and switch on recognizeUnicodeChars, useEmptyElementTags
// and advancedXmlEscape.
CleanerProperties props = cleaner.getProperties(); props.setUseCdataForScriptAndStyle(false); props.setRecognizeUnicodeChars(true); props.setUseEmptyElementTags(true); props.setAdvancedXmlEscape(true);