Refine search
// Sniff the encoding of `input` with ICU, then expose a Reader over the bytes together with
// the detected charset name.
// The stream is wrapped in a BufferedInputStream before detection; presumably the detector
// needs to rewind the stream after sampling it (TODO confirm against the ICU documentation).
BufferedInputStream bis = new BufferedInputStream(input);
CharsetDetector cd = new CharsetDetector();
cd.setText(bis);
CharsetMatch cm = cd.detect();
if (cm == null) {
    // NOTE(review): assumes java.nio.charset.UnsupportedCharsetException, whose only
    // constructor takes a charset name. No real name exists when detection fails, so a
    // descriptive placeholder is passed instead.
    throw new UnsupportedCharsetException("undetected");
}
reader = cm.getReader();
charset = cm.getName();
// Demo: run ICU charset detection over a short ISO-8859-1 encoded sample and print every
// candidate as one row of a three-column table (confidence, charset name, language).
final byte[] sample = "this app can break".getBytes("ISO-8859-1");

final CharsetDetector sniffer = new CharsetDetector();
sniffer.setText(sample);

final String rowFormat = "%10s %10s %8s%n";
System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");

// The header is printed before detection runs, one row per candidate follows.
final CharsetMatch[] candidates = sniffer.detectAll();
for (int i = 0; i < candidates.length; i++) {
    final CharsetMatch candidate = candidates[i];
    System.out.format(rowFormat, candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
}
/**
 * Detects the candidate character sets of a byte stream using the ICU charset detector.
 *
 * <p>Every match reported by ICU whose name the running JVM can resolve is returned.
 * Matches naming an encoding this JVM does not provide are skipped rather than aborting
 * the whole detection.
 *
 * @param source the stream to inspect; it is wrapped in a {@link BufferedInputStream}
 *               before being handed to the detector
 * @return the set of JVM-supported charsets that plausibly match the stream, possibly empty
 * @throws CharsetDetectorException if reading the stream fails; the underlying
 *                                  {@link IOException} is kept as the cause
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
    Set<Charset> set = new HashSet<Charset>();
    com.ibm.icu.text.CharsetDetector charsetDetector = new com.ibm.icu.text.CharsetDetector();
    try {
        charsetDetector.setText(new BufferedInputStream(source));
        for (CharsetMatch match : charsetDetector.detectAll()) {
            String name = match.getName();
            // Charset.forName throws an unchecked UnsupportedCharsetException for names the
            // JVM does not know, which would discard the candidates that ARE usable.
            if (Charset.isSupported(name)) {
                set.add(Charset.forName(name));
            }
        }
    } catch (IOException e) {
        throw new CharsetDetectorException(e.getMessage(), e);
    }
    return set;
}
/** * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection * Return the file contents as a String. */ public static String fileAnyEncodingToString(File f) throws IOException { byte[] byteData = IOUtils.toByteArray(new FileInputStream(f)); CharsetDetector detector = new CharsetDetector(); String unicodeData = detector.getString(byteData, null); // Add to newline at the end of the file otherwise the subtitle parser library can get confused by EOF unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator"); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 60) { LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName()); if (match.getLanguage() != null) { LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage()); } } return unicodeData; }
// Minimal charset-detection recipe: hand the raw bytes to a CharsetDetector, then ask it for the best match.
// `byte[] byteData = ...` is a placeholder for however the caller obtains the bytes.
// NOTE(review): detect() presumably returns null when nothing matches — confirm before dereferencing `match`.
CharsetDetector detector; CharsetMatch match; byte[] byteData = ...; detector = new CharsetDetector(); detector.setText(byteData); match = detector.detect();
/**
 * Reads a whole file into memory and decodes it with the charset guessed by the ICU detector.
 *
 * <p>If no charset is detected, or the detected charset is not supported by the runtime,
 * the bytes are decoded with the platform default charset. Any failure (missing file,
 * read error, ...) is logged and results in {@code null}.
 *
 * @param filePath path of the file to read
 * @return the decoded file contents, or {@code null} if the file could not be read
 */
public static String readFileAsStringGuessEncoding(String filePath) {
    String s = null;
    try {
        File file = new File(filePath);
        // NOTE(review): the length is narrowed to int, so files of 2 GiB or more are not handled.
        byte[] fileData = new byte[(int) file.length()];

        // try-with-resources closes the stream even when readFully fails part-way.
        try (DataInputStream dis = new DataInputStream(new FileInputStream(file))) {
            dis.readFully(fileData);
        }

        CharsetMatch match = new CharsetDetector().setText(fileData).detect();
        if (match != null) {
            try {
                Lt.d("For file: " + filePath + " guessed enc: " + match.getName() + " conf: " + match.getConfidence());
                s = new String(fileData, match.getName());
            } catch (UnsupportedEncodingException ue) {
                // Detected charset is unknown to this runtime; fall through to the default decoding.
                s = null;
            }
        }
        if (s == null) {
            s = new String(fileData);
        }
    } catch (Exception e) {
        Lt.e("Exception in readFileAsStringGuessEncoding(): " + e);
        e.printStackTrace();
    }
    return s;
}
public static void XMLtoString(File file) { String encoding = ""; String str = ""; try { // detect the encoding of the file CharsetDetector cd = new CharsetDetector().setText(new BufferedInputStream(new FileInputStream(file))); encoding = cd.detect().getName(); // to avoid the BOM ("byte order mark") being added to the String, encoding is specified as a parameter str = FileUtils.readFileToString(file, encoding); } catch (IOException e) { System.err.println("Caught IOException: " + e.getMessage()); } }
/**
 * Loads the next resource into the given CAS as its document text.
 *
 * <p>The resource stream is obtained through {@code CompressionUtils.getInputStream} and
 * buffered. When the configured source encoding equals {@code ENCODING_AUTO} the encoding
 * is detected with a {@link CharsetDetector}; otherwise {@code sourceEncoding} is used as is.
 *
 * @param aJCas the CAS to initialise and fill with the document text
 * @throws IOException if the resource cannot be read or its encoding cannot be detected
 * @throws CollectionException declared by the overridden method
 */
@Override
public void getNext(CAS aJCas) throws IOException, CollectionException {
    Resource res = nextFile();
    initCas(aJCas, res);

    try (InputStream is = new BufferedInputStream(
            CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) {
        String text;
        if (ENCODING_AUTO.equals(sourceEncoding)) {
            CharsetDetector detector = new CharsetDetector();
            java.io.Reader detected = detector.getReader(is, null);
            // A missing reader means no usable encoding was found; fail with a message
            // naming the resource instead of a NullPointerException inside IOUtils.
            if (detected == null) {
                throw new IOException("Unable to detect encoding of [" + res.getLocation() + "]");
            }
            text = IOUtils.toString(detected);
        } else {
            text = IOUtils.toString(is, sourceEncoding);
        }
        aJCas.setDocumentText(text);
    }
}
}
/**
 * Lists the names of every character encoding that the <code>detectEncoding</code> methods
 * are able to recognise.
 * The Java runtime is not guaranteed to provide a charset for each of the returned names.
 *
 * @return the names of all encodings the <code>detectEncoding</code> methods can detect.
 */
public static String[] getDetectableEncodings() {
    final String[] detectable = CharsetDetector.getAllDetectableCharsets();
    return detectable;
}
CharsetMatch matches[] = detectAll();
/**
 * Prints a table of every charset candidate detected for the file named by the first
 * command-line argument: one row per candidate with confidence, charset name and language.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to inspect
 * @throws IOException if the file cannot be opened, read or closed
 */
public static void main(String[] args) throws IOException {
    InputStream in = new FileInputStream(args[0]);
    try {
        in = new BufferedInputStream(in);

        final CharsetDetector sniffer = new CharsetDetector();
        sniffer.setText(in);

        final String rowFormat = "%10s %10s %8s%n";
        System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");

        final CharsetMatch[] candidates = sniffer.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            final CharsetMatch candidate = candidates[i];
            System.out.format(rowFormat, candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
        }
    } finally {
        // Closes whichever stream is current (the buffered wrapper once it has been created).
        in.close();
    }
}
/**
 * Detects the most likely encoding of a file.
 *
 * @param requiredEncoding the encoding the file is declared to have; passed to the detector
 *                         as its declared encoding
 * @param file the file to inspect
 * @param log not used by this method
 * @return the name of the best matching charset, or {@code null} when the detector
 *         reports no match
 * @throws IOException if the file cannot be opened or read
 */
protected String getEncoding( String requiredEncoding, File file, Log log )
    throws IOException
{
    FileInputStream fis = null;
    try
    {
        fis = new FileInputStream( file );
        CharsetDetector detector = new CharsetDetector();
        detector.setDeclaredEncoding( requiredEncoding );
        detector.setText( new BufferedInputStream( fis ) );
        CharsetMatch[] charsets = detector.detectAll();
        // An empty result must be handled as well as null: indexing [0] on an empty
        // array would throw ArrayIndexOutOfBoundsException.
        if ( charsets == null || charsets.length == 0 )
        {
            return null;
        }
        return charsets[0].getName();
    }
    finally
    {
        IOUtil.close( fis );
    }
}
/**
 * Prints the name of every charset the detector can recognise, one per line.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    final String[] names = CharsetDetector.getAllDetectableCharsets();
    for (String name : names) {
        System.out.println(name);
    }
}
/**
 * Suggests the most likely character encoding for the given bytes.
 *
 * @param bytes the raw data to inspect
 * @return the name of the best matching charset, or {@code null} when the detector
 *         reports no match
 */
protected String suggestEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    final CharsetMatch charsetMatch = cd.detect();
    // Guard against "no match": dereferencing a missing match would throw a NullPointerException.
    if (charsetMatch == null) {
        logger.info("CharsetMatch: none");
        return null;
    }
    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    return charSet;
}
/**
 * Writes each charset name known to the detector to standard output, one name per line.
 *
 * @param args not used
 */
public static void main(String[] args) {
    for (String detectableName : CharsetDetector.getAllDetectableCharsets()) {
        System.out.println(detectableName);
    }
}
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
/**
 * Dumps the list of detectable charset names to standard output, one entry per line.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String[] all = CharsetDetector.getAllDetectableCharsets();
    int index = 0;
    while (index < all.length) {
        System.out.println(all[index]);
        index++;
    }
}
}
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }