com.gargoylesoftware.htmlunit.util.EncodingSniffer java code examples

/**
 * Tells whether the given HTTP response headers describe an XML response.
 *
 * <p>The decision is delegated to {@code contentTypeEndsWith}, which is asked whether the
 * <code>Content-Type</code> ends with one of {@code text/xml}, {@code application/xml},
 * {@code text/vnd.wap.wml} or {@code +xml}.</p>
 *
 * @param headers the HTTP response headers
 * @return {@code true} if the specified HTTP response headers indicate an XML response
 */
static boolean isXml(final List<NameValuePair> headers) {
  final boolean xmlContentType =
      contentTypeEndsWith(headers, "text/xml", "application/xml", "text/vnd.wap.wml", "+xml");
  return xmlContentType;
}

if (isHtml(headers)) {
  return sniffHtmlEncoding(headers, content);
else if (isXml(headers)) {
  return sniffXmlEncoding(headers, content);
  return sniffUnknownContentTypeEncoding(headers, content);

/**
 * <p>Determines the encoding of XML content with a custom algorithm that consults, in order: the HTTP
 * headers, a Unicode Byte Order Mark at the start of the content, and finally the XML declaration.
 * The first source that yields an encoding wins.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method returns
 * {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the XML content to be sniffed
 * @param content the XML content to be sniffed; when {@code null} only the headers are consulted
 * @return the encoding sniffed from the specified XML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static Charset sniffXmlEncoding(final List<NameValuePair> headers, final InputStream content)
  throws IOException {
  // A charset declared in the headers takes precedence over anything in the content.
  final Charset fromHeaders = sniffEncodingFromHttpHeaders(headers);
  if (fromHeaders != null) {
    return fromHeaders;
  }
  // Without a stream there is nothing left to inspect.
  if (content == null) {
    return null;
  }

  // Try to read up to three bytes and look for a Byte Order Mark.
  final byte[] bomCandidate = read(content, 3);
  final Charset fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
  if (fromBom != null) {
    return fromBom;
  }

  // The bytes already consumed are put back in front of the next chunk before scanning the declaration.
  final byte[] head = readAndPrepend(content, SIZE_OF_XML_CONTENT_SNIFFED, bomCandidate);
  return sniffEncodingFromXmlDeclaration(head);
}

/**
 * <p>Determines the encoding of content whose type is unknown. Only two sources are consulted: the
 * <code>Content-Type</code> information in the HTTP headers and, failing that, a
 * <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a> at the start of the content.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method returns
 * {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the content to be sniffed
 * @param content the content to be sniffed; when {@code null} only the headers are consulted
 * @return the encoding sniffed from the specified content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static Charset sniffUnknownContentTypeEncoding(final List<NameValuePair> headers, final InputStream content)
  throws IOException {
  final Charset fromHeaders = sniffEncodingFromHttpHeaders(headers);
  // Either the headers settled it, or there is no stream to look at (result is then null).
  if (fromHeaders != null || content == null) {
    return fromHeaders;
  }
  // Fall back to a Byte Order Mark check on (up to) the first three bytes.
  final byte[] bomCandidate = read(content, 3);
  return sniffEncodingFromUnicodeBom(bomCandidate);
}

/**
 * <p>Determines the encoding of HTML content following the
 * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#determining-the-character-encoding">HTML5
 * encoding sniffing algorithm</a>. The HTTP headers are consulted first, then a Unicode Byte Order Mark,
 * and finally the meta tags found at the beginning of the content.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method returns
 * {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the HTML content to be sniffed
 * @param content the HTML content to be sniffed; when {@code null} only the headers are consulted
 * @return the encoding sniffed from the specified HTML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static String sniffHtmlEncoding(final List<NameValuePair> headers, final InputStream content)
  throws IOException {
  // A charset declared in the headers takes precedence over anything in the content.
  final String fromHeaders = sniffEncodingFromHttpHeaders(headers);
  if (fromHeaders != null) {
    return fromHeaders;
  }
  // Without a stream there is nothing left to inspect.
  if (content == null) {
    return null;
  }

  // Try to read up to three bytes and look for a Byte Order Mark.
  final byte[] bomCandidate = read(content, 3);
  final String fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
  if (fromBom != null) {
    return fromBom;
  }

  // The bytes already consumed are put back in front of the next chunk before scanning for meta tags.
  final byte[] head = readAndPrepend(content, SIZE_OF_HTML_CONTENT_SNIFFED, bomCandidate);
  return sniffEncodingFromMetaTag(head);
}

if (matches(bytes, i, COMMENT_START)) {
  i = indexOfSubArray(bytes, new byte[] {'-', '-', '>'}, i);
  if (i == -1) {
    break;
else if (matches(bytes, i, META_START)) {
  i += META_START.length;
  for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
    i = att.getUpdatedIndex();
    final String name = att.getName();
        charset = extractEncodingFromContentType(value);
        if (charset == null) {
          continue;
      if (isSupportedCharset(charset)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Encoding found in meta tag: '" + charset + "'.");
  i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
  if (i == -1) {
    break;
  while ((att = getAttribute(bytes, i)) != null) {
    i = att.getUpdatedIndex();
  i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
  if (i == -1) {
    break;

if (matches(bytes, i, COMMENT_START)) {
  i = indexOfSubArray(bytes, new byte[] {'-', '-', '>'}, i);
  if (i == -1) {
    break;
else if (matches(bytes, i, META_START)) {
  i += META_START.length;
  for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
    i = att.getUpdatedIndex();
    final String name = att.getName();
      Charset charset = null;
      if ("charset".equals(name)) {
        charset = toCharset(value);
        charset = extractEncodingFromContentType(value);
        if (charset == null) {
          continue;
  i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
  if (i == -1) {
    break;
  while ((att = getAttribute(bytes, i)) != null) {
    i = att.getUpdatedIndex();
  i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
  if (i == -1) {
    break;

int i;
for (i = 0; i < bytes.length; i++) {
  if (matches(bytes, i, CHARSET_START)) {
    i += CHARSET_START.length;
    break;
  return isSupportedCharset(charset) ? charset : null;
  return isSupportedCharset(charset) ? charset : null;
int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
if (end == -1) {
  end = bytes.length;
return isSupportedCharset(charset) ? charset : null;

int i;
for (i = 0; i < bytes.length; i++) {
  if (matches(bytes, i, CHARSET_START)) {
    i += CHARSET_START.length;
    break;
  return toCharset(charsetName);
  return toCharset(charsetName);
int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
if (end == -1) {
  end = bytes.length;
return toCharset(charsetName);

/**
 * Tries to read {@code size} bytes from the given stream and returns them with {@code prefix} placed in
 * front. Fewer than {@code size} bytes may actually be read; the returned array is always exactly as long
 * as the prefix plus the number of bytes that were read.
 *
 * @param content the input stream to read from
 * @param size the number of bytes to try to read
 * @param prefix the byte array to prepend to the bytes read from the specified input stream
 * @return the bytes read from the specified input stream, prefixed by the specified prefix
 * @throws IOException if an IO error occurs
 */
static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
  final byte[] tail = read(content, size);
  final int headLength = prefix.length;
  final int tailLength = tail.length;

  final byte[] combined = new byte[headLength + tailLength];
  // The two copies target disjoint regions of the result, so their order does not matter.
  System.arraycopy(tail, 0, combined, headLength, tailLength);
  System.arraycopy(prefix, 0, combined, 0, headLength);
  return combined;
}

/**
 * Looks for an encoding in the given HTTP headers. Every header whose name matches
 * {@code HttpHeader.CONTENT_TYPE_LC} (ignoring case) is examined in order, and the first one from which
 * an encoding can be extracted decides the result.
 *
 * @param headers the HTTP headers to examine
 * @return the encoding sniffed from the specified HTTP headers, or {@code null} if the encoding
 *         could not be determined
 */
public static Charset sniffEncodingFromHttpHeaders(final List<NameValuePair> headers) {
  for (final NameValuePair header : headers) {
    // Only Content-Type headers can carry a charset.
    if (!HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(header.getName())) {
      continue;
    }
    final Charset charset = extractEncodingFromContentType(header.getValue());
    if (charset == null) {
      // This header did not yield an encoding; keep looking at the remaining ones.
      continue;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Encoding found in HTTP headers: '" + charset + "'.");
    }
    return charset;
  }
  return null;
}

/**
 * <p>Determines the encoding of HTML content following the
 * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding">HTML5
 * encoding sniffing algorithm</a>. The HTTP headers are consulted first, then a Unicode Byte Order Mark,
 * and finally the meta tags found at the beginning of the content.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method returns
 * {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the HTML content to be sniffed
 * @param content the HTML content to be sniffed; when {@code null} only the headers are consulted
 * @return the encoding sniffed from the specified HTML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static Charset sniffHtmlEncoding(final List<NameValuePair> headers, final InputStream content)
  throws IOException {
  // A charset declared in the headers takes precedence over anything in the content.
  final Charset fromHeaders = sniffEncodingFromHttpHeaders(headers);
  if (fromHeaders != null) {
    return fromHeaders;
  }
  // Without a stream there is nothing left to inspect.
  if (content == null) {
    return null;
  }

  // Try to read up to three bytes and look for a Byte Order Mark.
  final byte[] bomCandidate = read(content, 3);
  final Charset fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
  if (fromBom != null) {
    return fromBom;
  }

  // The bytes already consumed are put back in front of the next chunk before scanning for meta tags.
  final byte[] head = readAndPrepend(content, SIZE_OF_HTML_CONTENT_SNIFFED, bomCandidate);
  return sniffEncodingFromMetaTag(head);
}

if (matches(bytes, i, COMMENT_START)) {
  i = indexOfSubArray(bytes, new byte[] {'-', '-', '>'}, i);
  if (i == -1) {
    break;
else if (matches(bytes, i, META_START)) {
  i += META_START.length;
  for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
    i = att.getUpdatedIndex();
    final String name = att.getName();
        charset = extractEncodingFromContentType(value);
        if (charset == null) {
          continue;
      if (isSupportedCharset(charset)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Encoding found in meta tag: '" + charset + "'.");
  i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
  if (i == -1) {
    break;
  while ((att = getAttribute(bytes, i)) != null) {
    i = att.getUpdatedIndex();
  i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
  if (i == -1) {
    break;

if (matches(bytes, i, COMMENT_START)) {
  i = indexOfSubArray(bytes, COMMENT_END, i);
  if (i == -1) {
    break;
else if (matches(bytes, i, META_START)) {
  i += META_START.length;
  for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) {
    i = att.getUpdatedIndex();
    final String name = att.getName();
      Charset charset = null;
      if ("charset".equals(name)) {
        charset = toCharset(value);
        charset = extractEncodingFromContentType(value);
        if (charset == null) {
          continue;
  i = skipToAnyOf(bytes, i, WHITESPACE);
  if (i == -1) {
    break;
  while ((att = getAttribute(bytes, i)) != null) {
    i = att.getUpdatedIndex();
  i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E});
  if (i == -1) {
    break;

/**
 * <p>Determines the encoding of content whose type is unknown. Only two sources are consulted: the
 * <code>Content-Type</code> information in the HTTP headers and, failing that, a
 * <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a> at the start of the content.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method returns
 * {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the content to be sniffed
 * @param content the content to be sniffed; when {@code null} only the headers are consulted
 * @return the encoding sniffed from the specified content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static String sniffUnknownContentTypeEncoding(final List<NameValuePair> headers, final InputStream content)
  throws IOException {
  final String fromHeaders = sniffEncodingFromHttpHeaders(headers);
  // Either the headers settled it, or there is no stream to look at (result is then null).
  if (fromHeaders != null || content == null) {
    return fromHeaders;
  }
  // Fall back to a Byte Order Mark check on (up to) the first three bytes.
  final byte[] bomCandidate = read(content, 3);
  return sniffEncodingFromUnicodeBom(bomCandidate);
}

int i;
for (i = 0; i < bytes.length; i++) {
  if (matches(bytes, i, CHARSET_START)) {
    i += CHARSET_START.length;
    break;
  return isSupportedCharset(charset) ? charset : null;
  return isSupportedCharset(charset) ? charset : null;
int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
if (end == -1) {
  end = bytes.length;
return isSupportedCharset(charset) ? charset : null;

int i;
for (i = 0; i < bytes.length; i++) {
  if (matches(bytes, i, CHARSET_START)) {
    i += CHARSET_START.length;
    break;
  return toCharset(charsetName);
  return toCharset(charsetName);
int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B});
if (end == -1) {
  end = bytes.length;
return toCharset(charsetName);

/**
 * Tries to read {@code size} bytes from the given stream and returns them with {@code prefix} placed in
 * front. Fewer than {@code size} bytes may actually be read; the returned array is always exactly as long
 * as the prefix plus the number of bytes that were read.
 *
 * @param content the input stream to read from
 * @param size the number of bytes to try to read
 * @param prefix the byte array to prepend to the bytes read from the specified input stream
 * @return the bytes read from the specified input stream, prefixed by the specified prefix
 * @throws IOException if an IO error occurs
 */
static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix) throws IOException {
  final byte[] freshlyRead = read(content, size);
  final int prefixLength = prefix.length;
  final int readLength = freshlyRead.length;

  final byte[] result = new byte[prefixLength + readLength];
  // Prefix occupies [0, prefixLength); the newly read bytes follow immediately after it.
  System.arraycopy(freshlyRead, 0, result, prefixLength, readLength);
  System.arraycopy(prefix, 0, result, 0, prefixLength);
  return result;
}

/**
 * Looks for an encoding in the given HTTP headers. Every header whose name matches
 * {@code HttpHeader.CONTENT_TYPE_LC} (ignoring case) is examined in order, and the first one from which
 * an encoding can be extracted decides the result.
 *
 * @param headers the HTTP headers to examine
 * @return the encoding sniffed from the specified HTTP headers, or {@code null} if the encoding
 *         could not be determined
 */
public static Charset sniffEncodingFromHttpHeaders(final List<NameValuePair> headers) {
  for (final NameValuePair header : headers) {
    final boolean isContentType = HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(header.getName());
    if (!isContentType) {
      // Other headers cannot carry a charset.
      continue;
    }
    final Charset found = extractEncodingFromContentType(header.getValue());
    if (found == null) {
      // Nothing usable in this header; a later Content-Type header may still provide one.
      continue;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Encoding found in HTTP headers: '" + found + "'.");
    }
    return found;
  }
  return null;
}

/**
 * <p>Determines the encoding of XML content with a custom algorithm that consults, in order: the HTTP
 * headers, a Unicode Byte Order Mark at the start of the content, and finally the XML declaration.
 * The first source that yields an encoding wins.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method returns
 * {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the XML content to be sniffed
 * @param content the XML content to be sniffed; when {@code null} only the headers are consulted
 * @return the encoding sniffed from the specified XML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static String sniffXmlEncoding(final List<NameValuePair> headers, final InputStream content)
  throws IOException {
  // A charset declared in the headers takes precedence over anything in the content.
  final String fromHeaders = sniffEncodingFromHttpHeaders(headers);
  if (fromHeaders != null) {
    return fromHeaders;
  }
  // Without a stream there is nothing left to inspect.
  if (content == null) {
    return null;
  }

  // Try to read up to three bytes and look for a Byte Order Mark.
  final byte[] bomCandidate = read(content, 3);
  final String fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
  if (fromBom != null) {
    return fromBom;
  }

  // The bytes already consumed are put back in front of the next chunk before scanning the declaration.
  final byte[] head = readAndPrepend(content, SIZE_OF_XML_CONTENT_SNIFFED, bomCandidate);
  return sniffEncodingFromXmlDeclaration(head);
}

Javadoc

Sniffs encoding settings from HTML, XML or other content. The HTML encoding sniffing algorithm is based on the HTML5 encoding sniffing algorithm.

Most used methods

contentTypeEndsWith
Returns true if the specified HTTP response headers contain a Content-Type that ends with one of the
extractEncodingFromContentType
Extracts an encoding from the specified Content-Type value using the IETF algorithm [http://ietfrepor
getAttribute
Extracts an attribute from the specified byte array, starting at the specified index, using the HTML5
indexOfSubArray
Finds the first index of the specified sub-array inside the specified array, starting at the specifi
isHtml
Returns true if the specified HTTP response headers indicate an HTML response.
isXml
Returns true if the specified HTTP response headers indicate an XML response.
matches
Returns true if the byte in the specified byte array at the specified index matches one of the speci
read
Attempts to read size bytes from the specified input stream. Note that this method is not guaranteed
readAndPrepend
Attempts to read size bytes from the specified input stream and then prepends the specified prefix t
skipToAnyOf
Skips ahead to the first occurrence of any of the specified targets within the specified array, star
sniffEncoding
If the specified content is HTML content, this method sniffs encoding settings from the specified HT
sniffEncodingFromHttpHeaders
Attempts to sniff an encoding from the specified HTTP headers.

Popular in Java

Creating JSON documents from java classes using gson
setScale (BigDecimal)
getApplicationContext (Context)
runOnUiThread (Activity)
String (java.lang)
ConnectException (java.net)
A ConnectException is thrown if a connection cannot be established to a remote host on a specific po
URLConnection (java.net)
A connection to a URL for reading or writing. For HTTP connections, see HttpURLConnection for docume
Random (java.util)
This class provides methods that return pseudo-random values. It is dangerous to seed Random with the
ExecutorService (java.util.concurrent)
An Executor that provides methods to manage termination and methods that can produce a Future for tr
Menu (java.awt)
Top Sublime Text plugins

How to use EncodingSniffer in com.gargoylesoftware.htmlunit.util

Best Java code snippets using com.gargoylesoftware.htmlunit.util.EncodingSniffer (Showing top 20 results out of 315)

How to use
EncodingSniffer
in
com.gargoylesoftware.htmlunit.util