Refine search
// Detect the charset of `input` with ICU and obtain a Reader that decodes it.
// The stream is wrapped in a BufferedInputStream so that it supports mark/reset
// when handed to the detector.
BufferedInputStream bis = new BufferedInputStream(input);
CharsetDetector cd = new CharsetDetector();
cd.setText(bis);
CharsetMatch cm = cd.detect();
if (cm != null) {
    reader = cm.getReader();
    charset = cm.getName();
} else {
    // UnsupportedCharsetException has no no-arg constructor: it takes the name of
    // the charset that could not be used. No name is available when detection fails.
    throw new UnsupportedCharsetException("undetected");
}
// Demo: run ICU charset detection over a short ISO-8859-1 encoded sample and
// print one table row per candidate charset.
String rowFormat = "%10s %10s %8s%n";
byte[] sampleBytes = "this app can break".getBytes("ISO-8859-1");

CharsetDetector icuDetector = new CharsetDetector();
icuDetector.setText(sampleBytes);

// Header row first, then every candidate returned by detectAll().
System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");
CharsetMatch[] candidates = icuDetector.detectAll();
for (int i = 0; i < candidates.length; i++) {
    CharsetMatch candidate = candidates[i];
    System.out.format(rowFormat, candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
}
/**
 * Collects every charset that ICU reports as a candidate for the given stream.
 *
 * @param source the stream to inspect; it is wrapped in a BufferedInputStream before
 *               being handed to the detector
 * @return the set of candidate charsets that are available on this JVM (possibly empty)
 * @throws CharsetDetectorException if reading the stream fails
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
    Set<Charset> set = new HashSet<Charset>();
    com.ibm.icu.text.CharsetDetector charsetDetector = new com.ibm.icu.text.CharsetDetector();
    try {
        charsetDetector.setText(new BufferedInputStream(source));
        CharsetMatch[] charsetMatchs = charsetDetector.detectAll();
        for (CharsetMatch match : charsetMatchs) {
            try {
                set.add(Charset.forName(match.getName()));
            } catch (IllegalArgumentException ignored) {
                // Charset.forName throws IllegalCharsetNameException/UnsupportedCharsetException
                // (both IllegalArgumentException) for names this JVM does not provide, which
                // ICU may report (e.g. "IBM424_rtl"). Skip the candidate instead of losing
                // all the valid ones.
            }
        }
    } catch (IOException e) {
        throw new CharsetDetectorException(e.getMessage(), e);
    }
    return set;
}
/** * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection * Return the file contents as a String. */ public static String fileAnyEncodingToString(File f) throws IOException { byte[] byteData = IOUtils.toByteArray(new FileInputStream(f)); CharsetDetector detector = new CharsetDetector(); String unicodeData = detector.getString(byteData, null); // Add to newline at the end of the file otherwise the subtitle parser library can get confused by EOF unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator"); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 60) { LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName()); if (match.getLanguage() != null) { LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage()); } } return unicodeData; }
/**
 * Reads a whole file into memory and decodes it using the charset guessed by ICU.
 * If no charset is guessed, or the guessed one is not supported by the JVM, the
 * bytes are decoded with the platform default charset.
 *
 * @param filePath path of the file to read
 * @return the decoded contents, or null if reading or detection threw an exception
 */
public static String readFileAsStringGuessEncoding(String filePath) {
    String s = null;
    try {
        File file = new File(filePath);
        // NOTE(review): the int cast limits this to files below 2 GB.
        byte[] fileData = new byte[(int) file.length()];
        DataInputStream dis = new DataInputStream(new FileInputStream(file));
        try {
            dis.readFully(fileData);
        } finally {
            // Close even when readFully fails; previously the stream leaked on error.
            dis.close();
        }
        CharsetMatch match = new CharsetDetector().setText(fileData).detect();
        if (match != null) {
            try {
                Lt.d("For file: " + filePath + " guessed enc: " + match.getName() + " conf: " + match.getConfidence());
                s = new String(fileData, match.getName());
            } catch (UnsupportedEncodingException ue) {
                // Guessed charset is not available on this JVM; use the fallback below.
                s = null;
            }
        }
        if (s == null) {
            s = new String(fileData);
        }
    } catch (Exception e) {
        Lt.e("Exception in readFileAsStringGuessEncoding(): " + e);
        e.printStackTrace();
    }
    return s;
}
/**
 * Builds a java.io.Reader that yields the Unicode characters for the original
 * byte data given to the charset detect operation, decoded with this match's charset.
 * <p>
 * CAUTION: when the byte data came from an InputStream, only one Reader (for one
 * matching charset) can be created through this method. To try several charsets,
 * the caller has to reset the InputStream and build InputStreamReaders itself
 * from the charset names.
 *
 * @return the Reader for the Unicode character data, or null if resetting the
 *         stream or creating the reader raises an IOException
 *
 * @stable ICU 3.4
 */
public Reader getReader() {
    // Use the caller's stream when there is one, otherwise replay the raw byte buffer.
    InputStream source = (fInputStream != null)
            ? fInputStream
            : new ByteArrayInputStream(fRawInput, 0, fRawLength);
    try {
        source.reset();
        return new InputStreamReader(source, getName());
    } catch (IOException e) {
        return null;
    }
}
/**
 * Converts the original byte data supplied to the charset detect operation into
 * a Java String of Unicode characters.
 * <p>
 * Delegates to {@code getString(int)} with an argument of -1.
 *
 * @return a String created from the converted input data.
 * @throws java.io.IOException propagated from {@code getString(int)}
 *
 * @stable ICU 3.4
 */
public String getString() throws java.io.IOException {
    // NOTE(review): -1 presumably means "no length limit" — confirm against getString(int).
    final int maxLength = -1;
    return getString(maxLength);
}
/**
 * Command-line demo: prints a table of every charset ICU considers a candidate
 * for the file named by the first argument.
 *
 * @param args args[0] is the path of the file to inspect
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    final String rowFormat = "%10s %10s %8s%n";
    InputStream in = new FileInputStream(args[0]);
    try {
        in = new BufferedInputStream(in);

        CharsetDetector icu = new CharsetDetector();
        icu.setText(in);

        System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");
        CharsetMatch[] candidates = icu.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            CharsetMatch candidate = candidates[i];
            System.out.format(rowFormat, candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
        }
    } finally {
        // Closes the buffered wrapper (and with it the file) or the raw file stream.
        in.close();
    }
}
/**
 * Detects the encoding of a file with ICU, using the required encoding as a hint.
 *
 * @param requiredEncoding encoding passed to the detector as the declared encoding
 * @param file the file to inspect
 * @param log not used by this method
 * @return the name of the first detected charset, or null if none was detected
 * @throws IOException if the file cannot be opened or read
 */
protected String getEncoding( String requiredEncoding, File file, Log log ) throws IOException {
    FileInputStream fis = null;
    try {
        fis = new FileInputStream( file );
        CharsetDetector detector = new CharsetDetector();
        detector.setDeclaredEncoding( requiredEncoding );
        detector.setText( new BufferedInputStream( fis ) );
        CharsetMatch[] charsets = detector.detectAll();
        // An empty result must be handled too: indexing [0] on it would throw
        // ArrayIndexOutOfBoundsException.
        if ( charsets == null || charsets.length == 0 ) {
            return null;
        }
        return charsets[0].getName();
    } finally {
        IOUtil.close( fis );
    }
}
/**
 * Suggests an encoding for the given bytes using ICU charset detection and logs
 * the match together with its confidence.
 *
 * @param bytes the raw data to inspect
 * @return the name of the best matching charset, or null if ICU found no match
 */
protected String suggestEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() yields null when nothing matches; report that instead of failing with an NPE.
        logger.info("CharsetMatch: no charset could be detected");
        return null;
    }
    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    return charSet;
}
/**
 * Detects the encoding of the given bytes with ICU and prints the best match
 * followed by every candidate and its confidence to standard output.
 *
 * @param data the raw bytes to inspect
 * @return the name of the best matching charset, or null if ICU found no match
 */
public static String getEncode(byte[] data){
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    // One detectAll() call replaces the former detect() + detectAll() pair; the
    // array is ordered best match first, so element 0 is the match detect() would return.
    CharsetMatch[] matches = detector.detectAll();
    if (matches == null || matches.length == 0) {
        // No candidate at all: previously this case failed with a NullPointerException.
        return null;
    }
    String encoding = matches[0].getName();
    System.out.println("The Content in " + encoding);
    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
/**
 * Detects the encoding of the given stream with ICU and prints the best match
 * followed by every candidate and its confidence to standard output.
 *
 * @param data the stream to inspect; wrapped in a BufferedInputStream when it
 *             does not support mark/reset
 * @return the name of the best matching charset, or null if ICU found no match
 * @throws IOException if reading the stream fails
 */
public static String getEncode(InputStream data) throws IOException{
    // The detector needs a stream that supports mark/reset; plain streams such as
    // FileInputStream do not, so wrap them.
    InputStream markable = data.markSupported() ? data : new java.io.BufferedInputStream(data);
    CharsetDetector detector = new CharsetDetector();
    detector.setText(markable);
    // One detectAll() call replaces the former detect() + detectAll() pair; the
    // array is ordered best match first, so element 0 is the match detect() would return.
    CharsetMatch[] matches = detector.detectAll();
    if (matches == null || matches.length == 0) {
        // No candidate at all: previously this case failed with a NullPointerException.
        return null;
    }
    String encoding = matches[0].getName();
    System.out.println("The Content in " + encoding);
    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
}
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
/**
 * Detects the encoding of the given bytes with ICU, logs the match with its
 * confidence and makes the detected charset the selected item.
 *
 * @param bytes the raw data to inspect
 * @return the name of the best matching charset, or null if ICU found no match
 *         (in which case the current selection is left unchanged)
 */
public String autoDetectEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() yields null when nothing matches; report that instead of failing with an NPE.
        logger.info("CharsetMatch: no charset could be detected");
        return null;
    }
    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
    return charSet;
}
public String detect(InputStream fin, byte[] fileContent) throws IOException { String charset = "ISO-8859-1"; fin.read(fileContent); byte[] data = fileContent; CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch cm = detector.detect(); if (cm != null) { int confidence = cm.getConfidence(); //System.out.println("Encoding: " + cm.getName() + " - Confidence: " + confidence + "%"); if (confidence > 50) { charset = cm.getName(); } } return charset; } }
protected String detectEncoding(InputStream in) throws IOException, ConversionException { if (!in.markSupported()) { // detector.setText requires mark in = new BufferedInputStream(in); } CharsetDetector detector = new CharsetDetector(); detector.setText(in); CharsetMatch charsetMatch = detector.detect(); if (charsetMatch == null) { throw new ConversionException("Cannot detect source charset."); } return charsetMatch.getName(); }