/**
 * Reads the whole file into memory and decodes it with the charset guessed by
 * {@code CharsetDetector}.
 *
 * <p>If no charset is detected, or the detected charset is not supported by the
 * JVM, the bytes are decoded with the platform default charset instead.
 *
 * @param filePath path of the file to read
 * @return the decoded file contents, or {@code null} if any exception occurred
 *         while reading or detecting (the exception is logged, not rethrown)
 */
public static String readFileAsStringGuessEncoding(String filePath) {
    String s = null;
    try {
        File file = new File(filePath);
        // The length is narrowed to int, so the whole file must fit in one array.
        byte[] fileData = new byte[(int) file.length()];
        DataInputStream dis = new DataInputStream(new FileInputStream(file));
        try {
            dis.readFully(fileData);
        } finally {
            // Close on every path; previously a failed readFully() leaked the stream.
            dis.close();
        }
        CharsetMatch match = new CharsetDetector().setText(fileData).detect();
        if (match != null) {
            try {
                Lt.d("For file: " + filePath + " guessed enc: " + match.getName() + " conf: " + match.getConfidence());
                s = new String(fileData, match.getName());
            } catch (UnsupportedEncodingException ue) {
                // Detected charset is unknown to this JVM: fall through to the default decoding.
                s = null;
            }
        }
        if (s == null) {
            s = new String(fileData);
        }
    } catch (Exception e) {
        // Best effort by design: report the problem and return null.
        Lt.e("Exception in readFileAsStringGuessEncoding(): " + e);
        e.printStackTrace();
    }
    return s;
}
// Sample input: a short ASCII phrase encoded as ISO-8859-1 bytes.
byte[] thisAppCanBreak = "this app can break".getBytes("ISO-8859-1");

// Feed the bytes to a fresh detector.
CharsetDetector detector = new CharsetDetector().setText(thisAppCanBreak);

// One row layout shared by the header and every candidate line.
String tableTemplate = "%10s %10s %8s%n";
System.out.printf(tableTemplate, "CONFIDENCE", "CHARSET", "LANGUAGE");

// Print every candidate the detector reports, in the order it returns them.
CharsetMatch[] candidates = detector.detectAll();
for (int i = 0; i < candidates.length; i++) {
    CharsetMatch candidate = candidates[i];
    System.out.printf(tableTemplate, candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
}
/**
 * Prints a table of every charset candidate detected for the file named by the
 * first command-line argument.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to inspect
 * @throws IOException if the file cannot be opened, read, or closed
 */
public static void main(String[] args) throws IOException {
    InputStream input = new FileInputStream(args[0]);
    try {
        // NOTE(review): presumably buffered so the detector can mark/reset the stream - confirm against ICU docs.
        input = new BufferedInputStream(input);

        final String rowFormat = "%10s %10s %8s%n";
        final CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(input);

        System.out.printf(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");
        final CharsetMatch[] candidates = charsetDetector.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            final CharsetMatch candidate = candidates[i];
            System.out.printf(rowFormat, candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
        }
    } finally {
        input.close();
    }
}
/**
 * Suggests a charset name for the given bytes using {@code CharsetDetector}.
 *
 * @param bytes raw data whose encoding should be guessed
 * @return the name of the best matching charset, or {@code null} when the
 *         detector reports no match
 */
protected String suggestEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() yields null when nothing matches; avoid a NullPointerException below.
        logger.info("CharsetMatch: no matching charset detected");
        return null;
    }
    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    return charSet;
}
/** * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection * Return the file contents as a String. */ public static String fileAnyEncodingToString(File f) throws IOException { byte[] byteData = IOUtils.toByteArray(new FileInputStream(f)); CharsetDetector detector = new CharsetDetector(); String unicodeData = detector.getString(byteData, null); // Add to newline at the end of the file otherwise the subtitle parser library can get confused by EOF unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator"); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 60) { LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName()); if (match.getLanguage() != null) { LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage()); } } return unicodeData; }
String charset = "ISO-8859-1"; //Default chartset, put whatever you want byte[] fileContent = null; FileInputStream fin = null; //create FileInputStream object fin = new FileInputStream(file.getPath()); /* * Create byte array large enough to hold the content of the file. * Use File.length to determine size of the file in bytes. */ fileContent = new byte[(int) file.length()]; /* * To read content of the file in byte array, use * int read(byte[] byteArray) method of java FileInputStream class. * */ fin.read(fileContent); byte[] data = fileContent; CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch cm = detector.detect(); if (cm != null) { int confidence = cm.getConfidence(); System.out.println("Encoding: " + cm.getName() + " - Confidence: " + confidence + "%"); //Here you have the encode name and the confidence //In my case if the confidence is > 50 I return the encode, else I return the default value if (confidence > 50) { charset = cm.getName(); } }
/**
 * Detects the charset of the given bytes with {@code CharsetDetector} and
 * passes the detected name to {@code setSelectedItem}.
 *
 * @param bytes raw data whose encoding should be guessed
 * @return the name of the best matching charset, or {@code null} when the
 *         detector reports no match (the selection is left untouched in that case)
 */
public String autoDetectEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() yields null when nothing matches; avoid a NullPointerException below.
        logger.info("CharsetMatch: no matching charset detected");
        return null;
    }
    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
    return charSet;
}
/**
 * Detects the charset of the given stream, printing the best match and every
 * candidate with its confidence to standard output.
 *
 * @param data stream to inspect; NOTE(review): presumably must support
 *             mark/reset for the detector - confirm against ICU docs
 * @return the name of the best matching charset, or {@code null} when the
 *         detector reports no match
 * @throws IOException if reading the stream fails
 */
public static String getEncode(InputStream data) throws IOException {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch match = detector.detect();
    if (match == null) {
        // detect() yields null when nothing matches; avoid a NullPointerException below.
        System.out.println("No charset match found");
        return null;
    }
    String encoding = match.getName();
    System.out.println("The Content in " + match.getName());
    CharsetMatch[] matches = detector.detectAll();
    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
}
/**
 * Detects the charset of the given bytes, printing the best match and every
 * candidate with its confidence to standard output.
 *
 * @param data raw bytes to inspect
 * @return the name of the best matching charset, or {@code null} when the
 *         detector reports no match
 */
public static String getEncode(byte[] data) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch match = detector.detect();
    if (match == null) {
        // detect() yields null when nothing matches; avoid a NullPointerException below.
        System.out.println("No charset match found");
        return null;
    }
    String encoding = match.getName();
    System.out.println("The Content in " + match.getName());
    CharsetMatch[] matches = detector.detectAll();
    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
String charset = "ISO-8859-1"; //Default chartset, put whatever you want byte[] fileContent = null; FileInputStream fin = null; //create FileInputStream object fin = new FileInputStream(file.getPath()); /* * Create byte array large enough to hold the content of the file. * Use File.length to determine size of the file in bytes. */ fileContent = new byte[(int) file.length()]; /* * To read content of the file in byte array, use * int read(byte[] byteArray) method of java FileInputStream class. * */ fin.read(fileContent); byte[] data = fileContent; CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch cm = detector.detect(); if (cm != null) { int confidence = cm.getConfidence(); System.out.println("Encoding: " + cm.getName() + " - Confidence: " + confidence + "%"); //Here you have the encode name and the confidence //In my case if the confidence is > 50 I return the encode, else I return the default value if (confidence > 50) { charset = cm.getName(); } }
public String detect(InputStream fin, byte[] fileContent) throws IOException { String charset = "ISO-8859-1"; fin.read(fileContent); byte[] data = fileContent; CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch cm = detector.detect(); if (cm != null) { int confidence = cm.getConfidence(); //System.out.println("Encoding: " + cm.getName() + " - Confidence: " + confidence + "%"); if (confidence > 50) { charset = cm.getName(); } } return charset; } }
(cm==null?"null":Integer.toString(cm.getConfidence())));
.detect(); if (match != null && match.getConfidence() > 50) { contentType = contentType.withCharset(match.getName());