org.htmlcleaner.HtmlCleaner.getProperties java code examples

/**
 * Creates a tokenizer that reads its content through a buffered wrapper
 * around the supplied reader.
 *
 * @param cleaner cleaner whose properties and transformations are cached on this tokenizer
 * @param reader source of the content; it is wrapped in a {@code BufferedReader}
 * @param cleanTimeValues value holder stored on this tokenizer as-is
 */
public HtmlTokenizer(HtmlCleaner cleaner, Reader reader, final CleanTimeValues cleanTimeValues) {
  // Plain references first, then everything derived from the arguments.
  this.cleanTimeValues = cleanTimeValues;
  this.cleaner = cleaner;
  this._reader = new BufferedReader(reader);
  this.transformations = cleaner.getTransformations();
  this.props = cleaner.getProperties();
}

  /**
   * Builds a {@link Serializer} by invoking a constructor of the configured
   * serializer class through {@code ConstructorUtils}.
   * <p>
   * When {@code arguments} has not been set, a one-element array holding the
   * given cleaner's properties is stored in it first; the stored array is then
   * passed to the constructor.
   *
   * @param cleaner cleaner whose properties serve as the default constructor argument
   * @return the newly constructed serializer
   * @throws Exception propagated from the constructor invocation
   */
  public Serializer createSerializer(HtmlCleaner cleaner) throws Exception
  {
    final boolean argumentsMissing = (arguments == null);
    if (argumentsMissing)
    {
      final Object[] defaults = { cleaner.getProperties() };
      arguments = defaults;
    }

    final Object created = ConstructorUtils.invokeConstructor(serializerClass, arguments);
    return (Serializer) created;
  }
}

/**
 * Constructor - creates instance of the parser with specified content.
 *
 * @param cleaner
 * @param reader
 */
public HtmlTokenizer(HtmlCleaner cleaner, Reader reader, final CleanTimeValues cleanTimeValues) {
  this._reader = new BufferedReader(reader);
  this.cleaner = cleaner;
  this.props = cleaner.getProperties();
  this.transformations = cleaner.getTransformations();
  this.cleanTimeValues = cleanTimeValues;
}

/**
 * Builds a new {@link HtmlCleaner} and applies four settings to its properties:
 * CDATA for script/style is switched off, {@code script} and {@code style} tags
 * are pruned, unknown tags are treated as content, and unknown tags are omitted.
 *
 * @return the newly created, configured cleaner
 */
private HtmlCleaner getHtmlCleaner() {
  final HtmlCleaner configured = new HtmlCleaner();

  configured.getProperties().setUseCdataForScriptAndStyle(false);
  configured.getProperties().setPruneTags("script,style");
  configured.getProperties().setTreatUnknownTagsAsContent(true);
  configured.getProperties().setOmitUnknownTags(true);

  return configured;
}

 // Fetch the page with a browser-like User-Agent, clean it, and write the result as pretty-printed XML.
 URL urlSB = new URL("https://www.groupon.com/browse/chicago?z=skip");
URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:5.0) Gecko/20100101 Firefox/25.0");
urlConnection.connect();
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
TagNode tagNodeRoot;
// The response stream is opened here, so it is closed here once parsing is done.
try (java.io.InputStream pageStream = urlConnection.getInputStream()) {
  tagNodeRoot = cleaner.clean(pageStream);
}
new PrettyXmlSerializer(props).writeToFile(tagNodeRoot, "cleaned.xml", "utf-8");

// Fetch the page with a browser-like User-Agent and parse it into a TagNode tree.
final URL urlSB = new URL("http://www.groupon.com/browse/chicago?z=skip");
final URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0");
urlConnection.connect();
final HtmlCleaner cleaner = new HtmlCleaner();
final CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
final TagNode tagNodeRoot;
// The response stream is opened here, so it is closed here once parsing is done.
try (java.io.InputStream pageStream = urlConnection.getInputStream()) {
  tagNodeRoot = cleaner.clean(pageStream);
}

// Clean a plain string and print the serialized HTML to standard output.
final HtmlCleaner cleaner = new HtmlCleaner();
final CleanerProperties properties = cleaner.getProperties();

// Parse first; the serializer and its target writer are only needed afterwards.
TagNode node = cleaner.clean("hello world");

final Serializer serializer = new SimpleHtmlSerializer(properties);
StringWriter writer = new StringWriter();
serializer.write(node, writer, "UTF-8");

final String serialized = writer.toString();
System.out.println(serialized);

/**
 * Sets the {@link CleanerTransformations} on the properties of the cleaner
 * returned by {@code getHtmlCleaner()}.
 *
 * @param cleanerTransformations the transformations passed on to the cleaner's properties
 */
public void setCleanerTransformations(CleanerTransformations cleanerTransformations)
{
  getHtmlCleaner().getProperties().setCleanerTransformations(cleanerTransformations);
}

  /**
   * Factory method for the <code>HtmlCleaner</code> instance.
   * The returned cleaner has these properties set:
   * <ul>
   *   <li>omitXmlDeclaration : true</li>
   *   <li>omitDoctypeDeclaration : true</li>
   * </ul>
   *
   * @return a new cleaner with the two properties above applied
   */
  protected HtmlCleaner createHtmlCleaner()
  {
    final HtmlCleaner created = new HtmlCleaner();

    // Both declarations are left out of the output.
    created.getProperties().setOmitXmlDeclaration(true);
    created.getProperties().setOmitDoctypeDeclaration(true);

    return created;
  }
}

/**
 * Returns the {@link CleanerTransformations} held by the properties of the
 * cleaner returned by {@code getHtmlCleaner()}.
 *
 * @return the transformations reported by the cleaner's properties
 */
public CleanerTransformations getCleanerTransformations()
{
  return getHtmlCleaner().getProperties().getCleanerTransformations();
}

 /**
  * Background task that cleans a fixed page with HtmlCleaner and writes the
  * pretty-printed XML result to a file under the external storage directory.
  * Failures are logged and the task still completes with {@code null}.
  */
 private class cleanHtml extends AsyncTask<Void, Void, Void>{

  /**
   * Downloads, cleans and stores the page.
   *
   * @param arg0 unused
   * @return always {@code null}
   */
  @Override
  protected Void doInBackground(Void... arg0) {
    final String url = "https://www.easistent.com/urniki/263/razredi/16515";
    final String fileName = Environment.getExternalStorageDirectory().getPath() + "/Android/data/com.whizzapps.stpsurniki/cleaned.html";
    try {
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode node = cleaner.clean(new URL(url));
      CleanerProperties props = cleaner.getProperties();
      new PrettyXmlSerializer(props).writeToFile(node, fileName, "utf-8");
      Log.i("TAG", "AsyncTask done!");
    } catch (IOException e) {
      // MalformedURLException is an IOException, so one handler covers both the
      // URL construction and the download/write failures.
      Log.e("TAG", "Failed to clean " + url + " into " + fileName, e);
    }
    return null;
  }
}

 HtmlCleaner cleaner = new HtmlCleaner();
// The DOM serializer is built from the same properties the cleaner uses.
DomSerializer ser = new DomSerializer(cleaner.getProperties());
// Clean the markup, then convert the resulting tree to a W3C document.
TagNode node = cleaner.clean(html);
Document myW3cDoc = ser.createDOM(node);

/**
 * Creates and configures the shared cleaner the first time it is called;
 * later calls return immediately because the initialized flag is set.
 * The cleaner is configured to omit comments and the XML declaration.
 */
private static synchronized void initCleaner() {
  if (htmlCleanerInitialized) {
    return;
  }

  cleaner = new HtmlCleaner();
  final CleanerProperties settings = cleaner.getProperties();
  settings.setOmitComments(true);
  settings.setOmitXmlDeclaration(true);

  htmlCleanerInitialized = true;
}

 // Clean an HTML string with namespace awareness switched off.
 HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
// Must be set before clean() is called so that it applies to this parse.
props.setNamespacesAware(false);
TagNode mainNode = cleaner.clean(htmlString);

 /**
  * Cleans an HTML string with HtmlCleaner and converts the result to a W3C DOM document.
  * <p>
  * The cleaner is configured to allow HTML inside attributes, allow multi-word
  * attributes, recognize Unicode characters and omit comments. The same
  * properties are used for the DOM serialization.
  *
  * @param source the HTML markup to clean
  * @return the DOM document, or {@code null} if the DOM could not be created
  *         because of a {@code ParserConfigurationException}
  * @throws InterruptedException declared for callers; not raised by the visible code
  * @throws IOException declared for callers of this method
  */
 public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
  HtmlCleaner cleaner = new HtmlCleaner();
  CleanerProperties props = cleaner.getProperties();
  props.setAllowHtmlInsideAttributes(true);
  props.setAllowMultiWordAttributes(true);
  props.setRecognizeUnicodeChars(true);
  props.setOmitComments(true);

  try {
    // Parse with the cleaner configured above (not a fresh, default one) and
    // serialize with its properties so the settings actually take effect.
    TagNode tagNode = cleaner.clean(source);
    return new DomSerializer(props).createDOM(tagNode);
  } catch (ParserConfigurationException ex) {
    ex.printStackTrace();
    return null;
  }
}

  /**
   * Converts a value to its string form.
   * <ul>
   *   <li>{@code null} becomes the empty string;</li>
   *   <li>a {@code TagNode} is rendered with a {@code PrettyHtmlSerializer} built
   *       from the given cleaner's properties, after the XML declaration has
   *       been switched off on those properties;</li>
   *   <li>anything else goes through {@code TypeUtils.castToString}.</li>
   * </ul>
   *
   * @param text value to convert; may be {@code null}
   * @param htmlCleaner cleaner whose properties are used (and modified) for serialization
   * @return the string form of {@code text}
   */
  private String wrap(Object text, HtmlCleaner htmlCleaner) {
    if (text == null) {
      return "";
    }
    if (!(text instanceof TagNode)) {
      return TypeUtils.castToString(text);
    }

    final CleanerProperties serializerProps = htmlCleaner.getProperties();
    serializerProps.setOmitXmlDeclaration(true);
    final HtmlSerializer prettyPrinter = new PrettyHtmlSerializer(serializerProps);
    return prettyPrinter.getAsString((TagNode) text);
  }
}

/**
 * Creates the function together with its own {@code HtmlCleaner}. The cleaner
 * is set to omit comments, translate special entities, convert reserved
 * characters to NCR, and prune a fixed list of tags.
 */
public CleanHtmlFunction() {
  this.cleaner = new HtmlCleaner();

  final CleanerProperties settings = this.cleaner.getProperties();
  settings.setOmitComments(true);
  settings.setTranslateSpecialEntities(true);
  settings.setTransResCharsToNCR(true);

  // These tags carry nothing of interest, so they are dropped together with their content.
  settings.setPruneTags("style,script,form,object,audio,video");
}

/**
 * Cleans the given HTML and reports whether the cleaned result is empty.
 *
 * @param html markup to check
 * @return a new set that contains {@code ValidatorMessages.HTML_IS_EMPTY} when
 *         {@code isEmpty} accepts the cleaned markup, and is empty otherwise
 */
public Set<String> validateNonEmpty(String html) {
  final HtmlCleaner htmlCleaner = new HtmlCleaner();

  final CleanerProperties settings = htmlCleaner.getProperties();
  settings.setOmitXmlDeclaration(true);
  settings.setOmitHtmlEnvelope(true);
  settings.setOmitComments(true);
  settings.setNamespacesAware(false);
  settings.setDeserializeEntities(true);

  final boolean nothingLeft = isEmpty(htmlCleaner.clean(html));

  final Set<String> messages = new HashSet<>();
  if (nothingLeft) {
    messages.add(ValidatorMessages.HTML_IS_EMPTY);
  }
  return messages;
}

 // Fetch a page, clean it, and read the text of the first node matched by an XPath expression.
 final HtmlCleaner mCleaner = new HtmlCleaner();
CleanerProperties props = mCleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);

// URL from which the data is fetched
String mSiteUrl = "http://www.example.com";
// XPath expression selecting the nodes to read
String mXPath = "//div";

// Establish the connection and parse the response into a TagNode tree.
// NOTE(review): the reader uses the platform default charset — confirm the page encoding.
URL url = new URL(mSiteUrl);
final URLConnection mCCon = url.openConnection();
final TagNode mGetDataFromUrl = mCleaner.clean(new InputStreamReader(mCCon.getInputStream()));

// Evaluate the XPath against the cleaned tree
Object[] mPageData = mGetDataFromUrl.evaluateXPath(mXPath);

// Only read the result when the expression matched something
if (mPageData.length > 0) {
  TagNode mXPathParsedData = (TagNode) mPageData[0];
  // All text inside the matched div; getText() does not return a String,
  // so convert before trimming.
  String mData = mXPathParsedData.getText().toString().trim();
}

/**
 * Sets up the HTML cleaner, the DOM serializer that shares the cleaner's
 * properties, and the XML document builder.
 *
 * @throws IllegalStateException if a default {@code DocumentBuilder} cannot be created
 */
private void init() {

  // Initialize HTMLCleaner
  cleaner = new HtmlCleaner();
  CleanerProperties props = cleaner.getProperties();
  props.setAllowHtmlInsideAttributes(true);
  props.setAllowMultiWordAttributes(true);
  props.setRecognizeUnicodeChars(true);
  props.setOmitComments(true);
  props.setNamespacesAware(false);

  // Initialize DomSerializer
  domSerializer = new DomSerializer(props);

  // Initialize xml parser
  try {
    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
    documentBuilder = documentBuilderFactory.newDocumentBuilder();
  } catch (ParserConfigurationException e) {
    // Not expected with a default factory configuration. If it does happen,
    // fail here with the cause instead of leaving documentBuilder unset.
    throw new IllegalStateException("Unable to create a default DocumentBuilder", e);
  }
}

Popular methods of HtmlCleaner

clean
<init>
getInnerHtml
addAttributesToTag
Add attributes from specified map to the specified tag. If some attribute already exist it is preser
addIfNeededToPruneSet
addPossibleHeadCandidate
Checks if specified tag with specified info is candidate for moving to head section.
addPruneNode
calculateRootNode
Assigns root node to internal variable and adds necessary xmlns attributes if cleaner is namespaces
closeAll
Close all unclosed tags if there are any.
closeSnippet
Forced closing
createDocumentNodes
createTagNode

Popular in Java

Making http post requests using okhttp
onCreateOptionsMenu (Activity)
getSharedPreferences (Context)
notifyDataSetChanged (ArrayAdapter)
File (java.io)
An "abstract" representation of a file system entity identified by a pathname. The pathname may be a
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal.A value is represented by an arbitrary-precision "un
SecureRandom (java.security)
This class generates cryptographically secure pseudo-random numbers. It is best to invoke SecureRand
Date (java.sql)
A class which can consume and produce dates in SQL Date format. Dates are represented in SQL as yyyy
Font (java.awt)
The Font class represents fonts, which are used to render text in a visible way. A font provides the
ImageIO (javax.imageio)
Top plugins for WebStorm

How to use getProperties method in org.htmlcleaner.HtmlCleaner

Best Java code snippets using org.htmlcleaner.HtmlCleaner.getProperties (Showing top 20 results out of 315)

How to use
getProperties
method
in
org.htmlcleaner.HtmlCleaner