/**
 * Guesses the character encoding of a file with the ICU {@code CharsetDetector}.
 *
 * @param requiredEncoding encoding handed to the detector as its declared-encoding hint
 * @param file             file whose bytes are analysed
 * @param log              logger; not used by this implementation
 * @return the name of the first match reported by {@code detectAll()}, or {@code null}
 *         when the detector reports no match at all
 * @throws IOException if the file cannot be opened or read
 */
protected String getEncoding( String requiredEncoding, File file, Log log )
    throws IOException
{
    FileInputStream fis = null;
    try
    {
        fis = new FileInputStream( file );

        CharsetDetector detector = new CharsetDetector();
        detector.setDeclaredEncoding( requiredEncoding );
        // The stream is wrapped in a BufferedInputStream, which supports mark/reset.
        detector.setText( new BufferedInputStream( fis ) );

        CharsetMatch[] charsets = detector.detectAll();
        // Guard against an empty result as well as null: indexing [0] on an
        // empty array would throw ArrayIndexOutOfBoundsException.
        if ( charsets == null || charsets.length == 0 )
        {
            return null;
        }
        return charsets[0].getName();
    }
    finally
    {
        IOUtil.close( fis );
    }
}
public static String detect(InputStream in, String format, String encoding) throws IOException { // the input stream must support marks if (!in.markSupported()) { throw new IOException("Mark not supported by input stream"); } String result = null; if (format != null) { result = checkFormat(format, in); if (result != null) { return result; } } // in case of HTML or XML check whether there is a charset // specification; might be too fragile CharsetDetector detector = new CharsetDetector(); if (encoding != null) { detector.setDeclaredEncoding(encoding); } detector.setText(in); CharsetMatch found = detector.detect(); result = found.getName(); LOG.debug("Encoding: " + result); return result; }
public static String detect(InputStream in, String format, String encoding) throws IOException { // the input stream must support marks if (!in.markSupported()) { throw new IOException("Mark not supported by input stream"); } String result = null; if (format != null) { result = checkFormat(format, in); if (result != null) { return result; } } // in case of HTML or XML check whether there is a charset // specification; might be too fragile CharsetDetector detector = new CharsetDetector(); if (encoding != null) { detector.setDeclaredEncoding(encoding); } detector.setText(in); CharsetMatch found = detector.detect(); result = found.getName(); LOG.debug("Encoding: " + result); return result; }
/** * Use a third party library as last resort to guess the charset from the * bytes. */ private static String getCharsetFromText(byte[] content, String declaredCharset, int maxLengthCharsetDetection) { String charset = null; // filter HTML tags CharsetDetector charsetDetector = new CharsetDetector(); charsetDetector.enableInputFilter(true); // give it a hint if (declaredCharset != null) charsetDetector.setDeclaredEncoding(declaredCharset); // trim the content of the text for the detection byte[] subContent = content; if (maxLengthCharsetDetection != -1 && content.length > maxLengthCharsetDetection) { subContent = Arrays.copyOfRange(content, 0, maxLengthCharsetDetection); } charsetDetector.setText(subContent); try { CharsetMatch charsetMatch = charsetDetector.detect(); charset = validateCharset(charsetMatch.getName()); } catch (Exception e) { charset = null; } return charset; }