// Detect the charset of `input`, then expose a decoding reader and the charset name.
// The stream is wrapped in a BufferedInputStream because ICU's setText(InputStream)
// relies on mark/reset.
BufferedInputStream bis = new BufferedInputStream(input);
CharsetDetector cd = new CharsetDetector();
cd.setText(bis);
CharsetMatch cm = cd.detect();
if (cm == null) {
    // detect() returns null when no candidate reaches ICU's detection threshold.
    // UnsupportedCharsetException only has a (String charsetName) constructor.
    throw new UnsupportedCharsetException("Unknown");
}
reader = cm.getReader();
charset = cm.getName();
/**
 * Detects the most likely character encoding of {@code data} with ICU and prints
 * every candidate (name and confidence) to standard output.
 *
 * @param data raw bytes to analyse
 * @return the name of the best-matching charset, or {@code null} when ICU reports no match
 */
public static String getEncode(byte[] data) {
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);

    // detectAll() is ordered best match first, so its first element is the match
    // detect() would return; asking once avoids running the detection twice.
    CharsetMatch[] matches = detector.detectAll();
    if (matches == null) {
        matches = new CharsetMatch[0];
    }

    String encoding = null;
    if (matches.length > 0) {
        encoding = matches[0].getName();
        System.out.println("The Content in " + encoding);
    }

    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
/**
 * Detects the most likely character encoding of the bytes available from {@code data}
 * with ICU and prints every candidate (name and confidence) to standard output.
 *
 * @param data stream to analyse; it is wrapped in a buffered stream when it does not
 *             support mark/reset itself
 * @return the name of the best-matching charset, or {@code null} when ICU reports no match
 * @throws IOException if reading from the stream fails
 */
public static String getEncode(InputStream data) throws IOException {
    // ICU's setText(InputStream) marks the stream, reads a sample and resets it,
    // so a stream without mark support has to be buffered first.
    InputStream markable = data.markSupported() ? data : new java.io.BufferedInputStream(data);

    CharsetDetector detector = new CharsetDetector();
    detector.setText(markable);

    // detectAll() is ordered best match first, so its first element is the match
    // detect() would return; asking once avoids running the detection twice.
    CharsetMatch[] matches = detector.detectAll();
    if (matches == null) {
        matches = new CharsetMatch[0];
    }

    String encoding = null;
    if (matches.length > 0) {
        encoding = matches[0].getName();
        System.out.println("The Content in " + encoding);
    }

    System.out.println("All possibilities");
    for (CharsetMatch m : matches) {
        System.out.println("CharsetName:" + m.getName() + " Confidence:" + m.getConfidence());
    }
    return encoding;
}
}
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
public Charset detectEncoding(byte[] input) { // Fall back to the incredibly slow ICU. It might be better to just skip this entirely. CharsetDetector detector = new CharsetDetector(); detector.setText(input); CharsetMatch match = detector.detect(); return Charset.forName(match.getName().toUpperCase()); } }
/**
 * Collects every charset ICU considers a possible match for the given stream.
 *
 * @param source stream to analyse; it is wrapped in a {@link BufferedInputStream} before
 *               being handed to ICU
 * @return the candidate charsets that this JVM can provide, iterated in ICU's order
 *         (best match first); empty when there is none
 * @throws CharsetDetectorException if reading from the stream fails
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
    // Insertion-ordered so that iterating the result yields ICU's best match first.
    Set<Charset> candidates = new java.util.LinkedHashSet<Charset>();
    com.ibm.icu.text.CharsetDetector icuDetector = new com.ibm.icu.text.CharsetDetector();
    try {
        icuDetector.setText(new BufferedInputStream(source));
        for (CharsetMatch match : icuDetector.detectAll()) {
            try {
                candidates.add(Charset.forName(match.getName()));
            } catch (IllegalArgumentException ignored) {
                // Charset.forName rejects names the JVM considers illegal or unsupported
                // (both exceptions extend IllegalArgumentException). One unusable
                // candidate must not discard the remaining matches, so skip it.
            }
        }
    } catch (IOException e) {
        throw new CharsetDetectorException(e.getMessage(), e);
    }
    return candidates;
}
protected String detectEncoding(InputStream in) throws IOException, ConversionException { if (!in.markSupported()) { // detector.setText requires mark in = new BufferedInputStream(in); } CharsetDetector detector = new CharsetDetector(); detector.setText(in); CharsetMatch charsetMatch = detector.detect(); if (charsetMatch == null) { throw new ConversionException("Cannot detect source charset."); } return charsetMatch.getName(); }
// Encode a short sample as ISO-8859-1, run ICU detection on it and print one table row
// (confidence, charset name, language) per candidate match.
byte[] thisAppCanBreak = "this app can break".getBytes("ISO-8859-1");
CharsetDetector detector = new CharsetDetector();
detector.setText(thisAppCanBreak);

String tableTemplate = "%10s %10s %8s%n";
System.out.format(tableTemplate, "CONFIDENCE", "CHARSET", "LANGUAGE");

CharsetMatch[] candidates = detector.detectAll();
for (int i = 0; i < candidates.length; i++) {
    CharsetMatch candidate = candidates[i];
    System.out.format(tableTemplate,
            candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
}
/** * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection * Return the file contents as a String. */ public static String fileAnyEncodingToString(File f) throws IOException { byte[] byteData = IOUtils.toByteArray(new FileInputStream(f)); CharsetDetector detector = new CharsetDetector(); String unicodeData = detector.getString(byteData, null); // Add to newline at the end of the file otherwise the subtitle parser library can get confused by EOF unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator"); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 60) { LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName()); if (match.getLanguage() != null) { LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage()); } } return unicodeData; }
/**
 * Prints a table of every charset ICU considers a possible match for a file.
 *
 * @param args {@code args[0]} is the path of the file to analyse
 * @throws IOException if the file cannot be opened, read or closed
 */
public static void main(String[] args) throws IOException {
    final InputStream in = new BufferedInputStream(new FileInputStream(args[0]));
    try {
        final CharsetDetector icu = new CharsetDetector();
        icu.setText(in);

        final String rowFormat = "%10s %10s %8s%n";
        System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");

        final CharsetMatch[] candidates = icu.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            final CharsetMatch candidate = candidates[i];
            System.out.format(rowFormat,
                    candidate.getConfidence(), candidate.getName(), candidate.getLanguage());
        }
    } finally {
        in.close();
    }
}
/**
 * Runs ICU charset detection over the text currently on the system clipboard.
 *
 * NOTE(review): the clipboard text is already a Java String and is re-encoded with the
 * platform default charset before detection, so the result presumably reflects that
 * re-encoding rather than the text's original encoding. Confirm this is intended.
 *
 * @return the name of the charset ICU considers the best match
 * @throws UnsupportedCharsetException if the clipboard holds no string data or ICU
 *         returns no match
 * @throws UnsupportedFlavorException propagated from the clipboard data transfer
 * @throws IOException propagated from the clipboard data transfer
 */
public static String getClipboardCharset()
        throws UnsupportedCharsetException, UnsupportedFlavorException, IOException {
    final Transferable contents =
            Toolkit.getDefaultToolkit().getSystemClipboard().getContents(null);

    String clipText = null;
    if (contents != null && contents.isDataFlavorSupported(DataFlavor.stringFlavor)) {
        clipText = (String) contents.getTransferData(DataFlavor.stringFlavor);
    }
    if (clipText == null) {
        throw new UnsupportedCharsetException("Unknown");
    }

    final CharsetDetector icu = new CharsetDetector();
    icu.setText(clipText.getBytes());
    final CharsetMatch best = icu.detect();
    if (best == null) {
        throw new UnsupportedCharsetException("Unknown");
    }
    return best.getName();
}
/**
 * Suggests an encoding for the given bytes using ICU charset detection and logs the
 * match together with its confidence.
 *
 * @param bytes raw bytes to analyse
 * @return the name of the best-matching charset, or {@code null} when ICU reports no match
 */
protected String suggestEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() returns null when no candidate reaches ICU's detection threshold.
        logger.info("CharsetMatch: no match found");
        return null;
    }
    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    return charSet;
}
/**
 * Guesses the character set of a region of a byte buffer using ICU.
 * Any failure during detection or charset lookup results in {@code null}.
 *
 * @param buffer The buffer
 * @param offset index of the first byte to analyse; must not be negative
 * @param length number of bytes to analyse; must be positive
 * @return The detected charset or null
 */
public static Charset detect(@NonNull byte[] buffer, int offset, int length) {
    Preconditions.checkArgument(length > 0);
    Preconditions.checkArgument(offset >= 0);

    final com.ibm.icu.text.CharsetDetector icu = new com.ibm.icu.text.CharsetDetector();
    try {
        final ByteArrayInputStream region = new ByteArrayInputStream(buffer, offset, length);
        icu.setText(region);
        final com.ibm.icu.text.CharsetMatch best = icu.detect();
        if (best == null) {
            return null;
        }
        final String name = best.getName();
        return Charset.forName(name);
    } catch (Exception e) {
        return null;
    }
}
/**
 * Detects the encoding of the given bytes with ICU, logs the match with its confidence
 * and passes the detected name to {@code setSelectedItem}.
 *
 * @param bytes raw bytes to analyse
 * @return the name of the best-matching charset, or {@code null} when ICU reports no
 *         match (in which case {@code setSelectedItem} is not called)
 */
public String autoDetectEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);
    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // detect() returns null when no candidate reaches ICU's detection threshold.
        logger.info("CharsetMatch: no match found");
        return null;
    }
    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
    return charSet;
}
public String detect(InputStream fin, byte[] fileContent) throws IOException { String charset = "ISO-8859-1"; fin.read(fileContent); byte[] data = fileContent; CharsetDetector detector = new CharsetDetector(); detector.setText(data); CharsetMatch cm = detector.detect(); if (cm != null) { int confidence = cm.getConfidence(); //System.out.println("Encoding: " + cm.getName() + " - Confidence: " + confidence + "%"); if (confidence > 50) { charset = cm.getName(); } } return charset; } }
@Override public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream, String extension, final String mimeType, final ParserResultBuilder resultBuilder) throws IOException { resultBuilder.metas().set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault)); // Trying to detect the CHARSET of the stream final CharsetDetector detector = new CharsetDetector(); try (BufferedInputStream bis = new BufferedInputStream(inputStream)) { detector.setText(bis); final CharsetMatch match = detector.detect(); final ParserFieldsBuilder result = resultBuilder.newDocument(); final String content; if (match != null) { content = match.getString(); result.add(CHARSET_DETECTION, match.getName()); } else { bis.reset(); content = IOUtils.toString(bis, Charset.defaultCharset()); } result.add(CONTENT, content); result.add(LANG_DETECTION, languageDetection(result, CONTENT, 10000)); } }
/**
 * Runs ICU charset detection over this stream and maps the best match to an
 * {@code Encoding}, substituting {@code getActualHtmlEncoding()} when that is non-null.
 *
 * @return the resulting encoding when it is an ASCII superset other than
 *         {@code Encoding.WINDOWS1252}; {@code null} otherwise, including when detection
 *         or the encoding lookup fails for any reason
 * @throws IOException declared by the signature; failures inside the body are caught and
 *         reported as {@code null}
 */
public Encoding sniff() throws IOException {
    try {
        final CharsetDetector icu = new CharsetDetector();
        icu.setText(this);

        final CharsetMatch best = icu.detect();
        if (best == null) {
            return null;
        }

        final Encoding detected = Encoding.forName(best.getName());
        final Encoding htmlEncoding = detected.getActualHtmlEncoding();
        final Encoding effective = (htmlEncoding != null) ? htmlEncoding : detected;

        if (effective == Encoding.WINDOWS1252 || !effective.isAsciiSuperset()) {
            return null;
        }
        return effective;
    } catch (Exception e) {
        return null;
    }
}
/**
 * Runs ICU charset detection over this stream and maps the best match to an
 * {@code Encoding}, substituting {@code getActualHtmlEncoding()} when that is non-null.
 *
 * @return the resulting encoding when it is an ASCII superset other than
 *         {@code Encoding.WINDOWS1252}; {@code null} otherwise, including when detection
 *         or the encoding lookup fails for any reason
 * @throws IOException declared by the signature; failures inside the body are caught and
 *         reported as {@code null}
 */
public Encoding sniff() throws IOException {
    try {
        final CharsetDetector icu = new CharsetDetector();
        icu.setText(this);

        final CharsetMatch best = icu.detect();
        if (best == null) {
            return null;
        }

        final Encoding detected = Encoding.forName(best.getName());
        final Encoding htmlEncoding = detected.getActualHtmlEncoding();
        final Encoding effective = (htmlEncoding != null) ? htmlEncoding : detected;

        if (effective == Encoding.WINDOWS1252 || !effective.isAsciiSuperset()) {
            return null;
        }
        return effective;
    } catch (Exception e) {
        return null;
    }
}
/**
 * Runs ICU charset detection over this stream and maps the best match to an
 * {@code Encoding}, substituting {@code getActualHtmlEncoding()} when that is non-null.
 *
 * @return the resulting encoding when it is an ASCII superset other than
 *         {@code Encoding.WINDOWS1252}; {@code null} otherwise, including when detection
 *         or the encoding lookup fails for any reason
 * @throws IOException declared by the signature; failures inside the body are caught and
 *         reported as {@code null}
 */
public Encoding sniff() throws IOException {
    try {
        final CharsetDetector icu = new CharsetDetector();
        icu.setText(this);

        final CharsetMatch best = icu.detect();
        if (best == null) {
            return null;
        }

        final Encoding detected = Encoding.forName(best.getName());
        final Encoding htmlEncoding = detected.getActualHtmlEncoding();
        final Encoding effective = (htmlEncoding != null) ? htmlEncoding : detected;

        if (effective == Encoding.WINDOWS1252 || !effective.isAsciiSuperset()) {
            return null;
        }
        return effective;
    } catch (Exception e) {
        return null;
    }
}