/**
 * Tells whether the given HTTP response headers describe an XML response.
 *
 * <p>The decision is delegated to {@code contentTypeEndsWith}, which is handed the endings
 * {@code text/xml}, {@code application/xml}, {@code text/vnd.wap.wml} and {@code +xml}.</p>
 *
 * @param headers the HTTP response headers
 * @return {@code true} if the specified HTTP response headers indicate an XML response
 */
static boolean isXml(final List<NameValuePair> headers) {
    final boolean xmlContentType = contentTypeEndsWith(
            headers,
            "text/xml",
            "application/xml",
            "text/vnd.wap.wml",
            "+xml");
    return xmlContentType;
}
if (isHtml(headers)) { return sniffHtmlEncoding(headers, content); else if (isXml(headers)) { return sniffXmlEncoding(headers, content); return sniffUnknownContentTypeEncoding(headers, content);
/**
 * <p>Determines the encoding of XML content by consulting, in order, the HTTP headers, a Unicode
 * Byte Order Mark in the first three bytes of the content, and finally the XML declaration found
 * within the first {@code SIZE_OF_XML_CONTENT_SNIFFED} bytes that follow.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method
 * returns {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the XML content to be sniffed
 * @param content the XML content to be sniffed; may be {@code null}, in which case only the headers
 *        are consulted
 * @return the encoding sniffed from the specified XML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static Charset sniffXmlEncoding(final List<NameValuePair> headers, final InputStream content)
        throws IOException {
    // The HTTP headers take precedence over anything found in the content.
    final Charset fromHeaders = sniffEncodingFromHttpHeaders(headers);
    if (fromHeaders != null) {
        return fromHeaders;
    }
    if (content == null) {
        return null;
    }

    // A BOM, if present, sits in the first three bytes.
    final byte[] bomCandidate = read(content, 3);
    final Charset fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
    if (fromBom != null) {
        return fromBom;
    }

    // The bytes already consumed are put back in front of the next chunk before it is scanned.
    final byte[] head = readAndPrepend(content, SIZE_OF_XML_CONTENT_SNIFFED, bomCandidate);
    return sniffEncodingFromXmlDeclaration(head);
}
/**
 * <p>Determines the encoding of content whose type is unknown. The {@code Content-Type}
 * information in the HTTP headers is consulted first; failing that, the first three bytes of the
 * content are inspected for a
 * <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method
 * returns {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the content to be sniffed
 * @param content the content to be sniffed; may be {@code null}, in which case only the headers
 *        are consulted
 * @return the encoding sniffed from the specified content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static Charset sniffUnknownContentTypeEncoding(final List<NameValuePair> headers,
        final InputStream content) throws IOException {
    final Charset headerCharset = sniffEncodingFromHttpHeaders(headers);
    if (headerCharset != null) {
        return headerCharset;
    }
    if (content == null) {
        return null;
    }

    // Only the BOM is examined for unknown content types; nothing beyond three bytes is read.
    final byte[] leadingBytes = read(content, 3);
    return sniffEncodingFromUnicodeBom(leadingBytes);
}
/**
 * <p>Determines the encoding of HTML content following the
 * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding">HTML5
 * encoding sniffing algorithm</a>: the HTTP headers are consulted first, then a Unicode Byte Order
 * Mark in the first three bytes of the content, and finally the meta tags found within the first
 * {@code SIZE_OF_HTML_CONTENT_SNIFFED} bytes that follow.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method
 * returns {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the HTML content to be sniffed
 * @param content the HTML content to be sniffed; may be {@code null}, in which case only the headers
 *        are consulted
 * @return the encoding sniffed from the specified HTML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static String sniffHtmlEncoding(final List<NameValuePair> headers, final InputStream content)
        throws IOException {
    // The HTTP headers take precedence over anything found in the content.
    final String fromHeaders = sniffEncodingFromHttpHeaders(headers);
    if (fromHeaders != null) {
        return fromHeaders;
    }
    if (content == null) {
        return null;
    }

    // A BOM, if present, sits in the first three bytes.
    final byte[] bomCandidate = read(content, 3);
    final String fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
    if (fromBom != null) {
        return fromBom;
    }

    // The bytes already consumed are put back in front of the next chunk before it is scanned.
    final byte[] head = readAndPrepend(content, SIZE_OF_HTML_CONTENT_SNIFFED, bomCandidate);
    return sniffEncodingFromMetaTag(head);
}
if (matches(bytes, i, COMMENT_START)) { i = indexOfSubArray(bytes, new byte[] {'-', '-', '>'}, i); if (i == -1) { break; else if (matches(bytes, i, META_START)) { i += META_START.length; for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) { i = att.getUpdatedIndex(); final String name = att.getName(); charset = extractEncodingFromContentType(value); if (charset == null) { continue; if (isSupportedCharset(charset)) { if (LOG.isDebugEnabled()) { LOG.debug("Encoding found in meta tag: '" + charset + "'."); i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E}); if (i == -1) { break; while ((att = getAttribute(bytes, i)) != null) { i = att.getUpdatedIndex(); i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E}); if (i == -1) { break;
if (matches(bytes, i, COMMENT_START)) { i = indexOfSubArray(bytes, new byte[] {'-', '-', '>'}, i); if (i == -1) { break; else if (matches(bytes, i, META_START)) { i += META_START.length; for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) { i = att.getUpdatedIndex(); final String name = att.getName(); Charset charset = null; if ("charset".equals(name)) { charset = toCharset(value); charset = extractEncodingFromContentType(value); if (charset == null) { continue; i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E}); if (i == -1) { break; while ((att = getAttribute(bytes, i)) != null) { i = att.getUpdatedIndex(); i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E}); if (i == -1) { break;
int i; for (i = 0; i < bytes.length; i++) { if (matches(bytes, i, CHARSET_START)) { i += CHARSET_START.length; break; return isSupportedCharset(charset) ? charset : null; return isSupportedCharset(charset) ? charset : null; int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B}); if (end == -1) { end = bytes.length; return isSupportedCharset(charset) ? charset : null;
int i; for (i = 0; i < bytes.length; i++) { if (matches(bytes, i, CHARSET_START)) { i += CHARSET_START.length; break; return toCharset(charsetName); return toCharset(charsetName); int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B}); if (end == -1) { end = bytes.length; return toCharset(charsetName);
/**
 * Tries to read {@code size} bytes from the given input stream and returns them with the given
 * prefix placed in front. Reading {@code size} bytes is not guaranteed; the returned array is
 * nevertheless always exactly as long as the prefix plus the number of bytes actually read.
 *
 * @param content the input stream to read from
 * @param size the number of bytes to try to read
 * @param prefix the byte array to prepend to the bytes read from the specified input stream
 * @return the bytes read from the specified input stream, prefixed by the specified prefix
 * @throws IOException if an IO error occurs
 */
static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix)
        throws IOException {
    final byte[] tail = read(content, size);

    final int prefixLength = prefix.length;
    final int tailLength = tail.length;

    // Lay out the result as [prefix | tail].
    final byte[] combined = new byte[prefixLength + tailLength];
    System.arraycopy(prefix, 0, combined, 0, prefixLength);
    System.arraycopy(tail, 0, combined, prefixLength, tailLength);
    return combined;
}
/**
 * Looks for an encoding in the given HTTP headers. Every header whose name matches
 * {@code HttpHeader.CONTENT_TYPE_LC} (ignoring case) is examined in order, and the first one whose
 * value yields an encoding determines the result.
 *
 * @param headers the HTTP headers to examine
 * @return the encoding sniffed from the specified HTTP headers, or {@code null} if the encoding
 *         could not be determined
 */
public static Charset sniffEncodingFromHttpHeaders(final List<NameValuePair> headers) {
    for (final NameValuePair header : headers) {
        final String headerName = header.getName();
        if (!HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(headerName)) {
            continue;
        }

        final Charset charset = extractEncodingFromContentType(header.getValue());
        if (charset == null) {
            // This content type header carries no usable charset; keep looking.
            continue;
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Encoding found in HTTP headers: '" + charset + "'.");
        }
        return charset;
    }
    return null;
}
/**
 * <p>Determines the encoding of HTML content following the
 * <a href="http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding">HTML5
 * encoding sniffing algorithm</a>: the HTTP headers are consulted first, then a Unicode Byte Order
 * Mark in the first three bytes of the content, and finally the meta tags found within the first
 * {@code SIZE_OF_HTML_CONTENT_SNIFFED} bytes that follow.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method
 * returns {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the HTML content to be sniffed
 * @param content the HTML content to be sniffed; may be {@code null}, in which case only the headers
 *        are consulted
 * @return the encoding sniffed from the specified HTML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static Charset sniffHtmlEncoding(final List<NameValuePair> headers, final InputStream content)
        throws IOException {
    // The HTTP headers take precedence over anything found in the content.
    final Charset fromHeaders = sniffEncodingFromHttpHeaders(headers);
    if (fromHeaders != null) {
        return fromHeaders;
    }
    if (content == null) {
        return null;
    }

    // A BOM, if present, sits in the first three bytes.
    final byte[] bomCandidate = read(content, 3);
    final Charset fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
    if (fromBom != null) {
        return fromBom;
    }

    // The bytes already consumed are put back in front of the next chunk before it is scanned.
    final byte[] head = readAndPrepend(content, SIZE_OF_HTML_CONTENT_SNIFFED, bomCandidate);
    return sniffEncodingFromMetaTag(head);
}
if (matches(bytes, i, COMMENT_START)) { i = indexOfSubArray(bytes, new byte[] {'-', '-', '>'}, i); if (i == -1) { break; else if (matches(bytes, i, META_START)) { i += META_START.length; for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) { i = att.getUpdatedIndex(); final String name = att.getName(); charset = extractEncodingFromContentType(value); if (charset == null) { continue; if (isSupportedCharset(charset)) { if (LOG.isDebugEnabled()) { LOG.debug("Encoding found in meta tag: '" + charset + "'."); i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E}); if (i == -1) { break; while ((att = getAttribute(bytes, i)) != null) { i = att.getUpdatedIndex(); i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E}); if (i == -1) { break;
if (matches(bytes, i, COMMENT_START)) { i = indexOfSubArray(bytes, COMMENT_END, i); if (i == -1) { break; else if (matches(bytes, i, META_START)) { i += META_START.length; for (Attribute att = getAttribute(bytes, i); att != null; att = getAttribute(bytes, i)) { i = att.getUpdatedIndex(); final String name = att.getName(); Charset charset = null; if ("charset".equals(name)) { charset = toCharset(value); charset = extractEncodingFromContentType(value); if (charset == null) { continue; i = skipToAnyOf(bytes, i, WHITESPACE); if (i == -1) { break; while ((att = getAttribute(bytes, i)) != null) { i = att.getUpdatedIndex(); i = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3E}); if (i == -1) { break;
/**
 * <p>Determines the encoding of content whose type is unknown. The {@code Content-Type}
 * information in the HTTP headers is consulted first; failing that, the first three bytes of the
 * content are inspected for a
 * <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method
 * returns {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the content to be sniffed
 * @param content the content to be sniffed; may be {@code null}, in which case only the headers
 *        are consulted
 * @return the encoding sniffed from the specified content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static String sniffUnknownContentTypeEncoding(final List<NameValuePair> headers,
        final InputStream content) throws IOException {
    final String headerEncoding = sniffEncodingFromHttpHeaders(headers);
    if (headerEncoding != null) {
        return headerEncoding;
    }
    if (content == null) {
        return null;
    }

    // Only the BOM is examined for unknown content types; nothing beyond three bytes is read.
    final byte[] leadingBytes = read(content, 3);
    return sniffEncodingFromUnicodeBom(leadingBytes);
}
int i; for (i = 0; i < bytes.length; i++) { if (matches(bytes, i, CHARSET_START)) { i += CHARSET_START.length; break; return isSupportedCharset(charset) ? charset : null; return isSupportedCharset(charset) ? charset : null; int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B}); if (end == -1) { end = bytes.length; return isSupportedCharset(charset) ? charset : null;
int i; for (i = 0; i < bytes.length; i++) { if (matches(bytes, i, CHARSET_START)) { i += CHARSET_START.length; break; return toCharset(charsetName); return toCharset(charsetName); int end = skipToAnyOf(bytes, i, new byte[] {0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x3B}); if (end == -1) { end = bytes.length; return toCharset(charsetName);
/**
 * Tries to read {@code size} bytes from the given input stream and returns them with the given
 * prefix placed in front. Reading {@code size} bytes is not guaranteed; the returned array is
 * nevertheless always exactly as long as the prefix plus the number of bytes actually read.
 *
 * @param content the input stream to read from
 * @param size the number of bytes to try to read
 * @param prefix the byte array to prepend to the bytes read from the specified input stream
 * @return the bytes read from the specified input stream, prefixed by the specified prefix
 * @throws IOException if an IO error occurs
 */
static byte[] readAndPrepend(final InputStream content, final int size, final byte[] prefix)
        throws IOException {
    final byte[] tail = read(content, size);

    final int prefixLength = prefix.length;
    final int tailLength = tail.length;

    // Lay out the result as [prefix | tail].
    final byte[] combined = new byte[prefixLength + tailLength];
    System.arraycopy(prefix, 0, combined, 0, prefixLength);
    System.arraycopy(tail, 0, combined, prefixLength, tailLength);
    return combined;
}
/**
 * Looks for an encoding in the given HTTP headers. Every header whose name matches
 * {@code HttpHeader.CONTENT_TYPE_LC} (ignoring case) is examined in order, and the first one whose
 * value yields an encoding determines the result.
 *
 * @param headers the HTTP headers to examine
 * @return the encoding sniffed from the specified HTTP headers, or {@code null} if the encoding
 *         could not be determined
 */
public static Charset sniffEncodingFromHttpHeaders(final List<NameValuePair> headers) {
    for (final NameValuePair header : headers) {
        final String headerName = header.getName();
        if (!HttpHeader.CONTENT_TYPE_LC.equalsIgnoreCase(headerName)) {
            continue;
        }

        final Charset charset = extractEncodingFromContentType(header.getValue());
        if (charset == null) {
            // This content type header carries no usable charset; keep looking.
            continue;
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Encoding found in HTTP headers: '" + charset + "'.");
        }
        return charset;
    }
    return null;
}
/**
 * <p>Determines the encoding of XML content by consulting, in order, the HTTP headers, a Unicode
 * Byte Order Mark in the first three bytes of the content, and finally the XML declaration found
 * within the first {@code SIZE_OF_XML_CONTENT_SNIFFED} bytes that follow.</p>
 *
 * <p>Note that if an encoding is found but it is not supported on the current platform, this method
 * returns {@code null}, as if no encoding had been found.</p>
 *
 * @param headers the HTTP response headers sent back with the XML content to be sniffed
 * @param content the XML content to be sniffed; may be {@code null}, in which case only the headers
 *        are consulted
 * @return the encoding sniffed from the specified XML content and/or the corresponding HTTP headers,
 *         or {@code null} if the encoding could not be determined
 * @throws IOException if an IO error occurs
 */
public static String sniffXmlEncoding(final List<NameValuePair> headers, final InputStream content)
        throws IOException {
    // The HTTP headers take precedence over anything found in the content.
    final String fromHeaders = sniffEncodingFromHttpHeaders(headers);
    if (fromHeaders != null) {
        return fromHeaders;
    }
    if (content == null) {
        return null;
    }

    // A BOM, if present, sits in the first three bytes.
    final byte[] bomCandidate = read(content, 3);
    final String fromBom = sniffEncodingFromUnicodeBom(bomCandidate);
    if (fromBom != null) {
        return fromBom;
    }

    // The bytes already consumed are put back in front of the next chunk before it is scanned.
    final byte[] head = readAndPrepend(content, SIZE_OF_XML_CONTENT_SNIFFED, bomCandidate);
    return sniffEncodingFromXmlDeclaration(head);
}