CharsetDetector detector = new CharsetDetector(markLimit); String cleaned = CharsetUtils.clean(incomingCharset); if (cleaned != null) { detector.setDeclaredEncoding(cleaned); } else { detector.enableInputFilter(true); detector.setText(input); for (CharsetMatch match : detector.detectAll()) { try { return CharsetUtils.forName(match.getName());
/**
 * Detects the charset of a byte array and returns its content converted to a String.
 * <p>
 * The declared encoding is stored first, the bytes are installed as the input
 * text, and the match returned by {@code detect()} performs the conversion.
 * <p>
 * No exception is propagated when nothing matches: the method returns
 * {@code null} in that case, and also when an {@link IOException} is raised
 * while the work is carried out.
 *
 * @param in The source of the byte data in the unknown charset.
 * @param declaredEncoding A declared encoding for the data, if available,
 *           or null or an empty string if none is available.
 * @return the converted text, or {@code null} if there is no match or an
 *         {@link IOException} occurred
 * @stable ICU 3.4
 */
public String getString(byte[] in, String declaredEncoding) {
    fDeclaredEncoding = declaredEncoding;

    try {
        setText(in);
        final CharsetMatch best = detect();
        return (best != null) ? best.getString(-1) : null;
    } catch (IOException e) {
        // Failures are reported to the caller as "no result".
        return null;
    }
}
/**
 * Installs a byte buffer as the raw input of this detector.
 * <p>
 * Stores the array and the supplied length, then runs {@code MungeInput()}
 * before returning this instance so that calls can be chained.
 *
 * @param in     the raw bytes to analyse
 * @param length the length value recorded alongside {@code in}
 * @return this detector
 */
private CharsetDetector setText(byte[] in, int length) {
    this.fRawInput = in;
    this.fRawLength = length;

    MungeInput();

    return this;
}
/**
 * Guesses the character encoding of the data available from a stream.
 * <p>
 * The stream is handed to a {@code CharsetDetector} with input filtering
 * enabled. A stream that is not already a {@link BufferedInputStream} is
 * wrapped in one first.
 * NOTE(review): when a wrapper is created, bytes it buffers are consumed from
 * the caller's stream - confirm callers do not keep reading from {@code is}.
 *
 * @param is the stream whose encoding is to be guessed
 * @return the name of the best matching charset, or {@code null} when the
 *         detector reports no match
 * @throws IOException if reading from the stream fails
 */
public String guessEncoding(InputStream is) throws IOException {
    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.setText(
            is instanceof BufferedInputStream ? is : new BufferedInputStream(is));
    charsetDetector.enableInputFilter(true);

    CharsetMatch cm = charsetDetector.detect();
    // detect() yields null when nothing matches; guard instead of throwing an NPE.
    if (cm == null) {
        return null;
    }
    return cm.getName();
}
CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) {
mime.equals(org.apache.tika.mime.MediaType.TEXT_HTML.toString()) || mime.equals(org.apache.tika.mime.MediaType.TEXT_PLAIN.toString())){ CharsetDetector charsetDetector = new CharsetDetector(); charsetDetector.setText(bytes); String fileString = charsetDetector.getString(bytes, null); bytes = fileString.getBytes(charsetDetector.detect().getName());
/**
 * Decodes the raw value of a KLV element into {@code value}.
 * <p>
 * The bytes are run through a {@code CharsetDetector}; the first detected
 * match whose name is contained in {@code possibleCharsets} decides the
 * charset. When no such match exists, or the name cannot be resolved to a
 * {@link Charset}, {@code utf8} is used.
 *
 * @param klv the element whose value bytes are decoded
 */
@Override
protected void decodeValue(Klv klv) {
    final byte[] bytes = klv.getValue();

    final CharsetDetector detector = new CharsetDetector();
    detector.setText(bytes);

    // Pick the first candidate, in the detector's order, that is allowed.
    CharsetMatch accepted = null;
    for (CharsetMatch candidate : detector.detectAll()) {
        if (possibleCharsets.contains(candidate.getName())) {
            accepted = candidate;
            break;
        }
    }

    Charset decodeWith = utf8;
    if (accepted != null) {
        try {
            decodeWith = Charset.forName(accepted.getName());
        } catch (IllegalArgumentException e) {
            LOGGER.trace("Unsupported encoding, falling back to default encoding");
        }
    }

    value = new String(bytes, decodeWith);
}
/**
 * Supplies a complete byte array as the text whose charset is to be detected.
 * <p>
 * Delegates to the two-argument overload, passing the full length of the array.
 *
 * @param in the input text of unknown encoding; a null array fails with a
 *           {@link NullPointerException} when its length is read
 * @return This CharsetDetector
 * @stable ICU 3.4
 */
public CharsetDetector setText(byte[] in) {
    final int wholeLength = in.length;
    return setText(in, wholeLength);
}
CharsetMatch matches[] = detectAll();
/**
 * Records the encoding that the input text has been declared to use.
 * <p>
 * A declared encoding is one obtained from an HTTP header, an XML
 * declaration or a similar source, and serves as additional information
 * for the charset detector. When a detected encoding agrees with the
 * declared one, the quality of that detected encoding is raised by a small
 * delta and the agreement is listed as a "reason" for the match.
 * <p>
 * A declared encoding that is incompatible with the input data being
 * analyzed will not be added to the list of possible encodings.
 * <p>
 * The value is passed on to {@code setCanonicalDeclaredEncoding}.
 *
 * @param encoding The declared encoding
 * @return this detector, to allow call chaining
 * @stable ICU 3.4
 */
public CharsetDetector setDeclaredEncoding(String encoding) {
    this.setCanonicalDeclaredEncoding(encoding);

    return this;
}
// Value is rounded up, so zero really means zero occurrences.
/**
 * Detects the most likely charset of a byte array.
 *
 * @param data the bytes whose encoding is unknown
 * @param hint a declared encoding handed to the detector when non-null
 * @return the name of the best matching charset, or {@code null} when the
 *         detector reports no match
 */
@Override
public String detect(byte[] data, String hint) {
    CharsetDetector detector = new CharsetDetector();
    if (hint != null) {
        detector.setDeclaredEncoding(hint);
    }
    detector.setText(data);

    CharsetMatch match = detector.detect();
    // detect() yields null when nothing matches; guard instead of throwing an NPE.
    return (match == null) ? null : match.getName();
}
}
CharsetDetector detector = new CharsetDetector(); content = edit.streamContent(); if (content.markSupported()) detector.setText(content); } else { content.read(contentBytes); detector.setText(contentBytes); CharsetMatch match = detector.detect();
CharsetMatch matches[] = detectAll();
/**
 * Records the encoding that the input text has been declared to use.
 * <p>
 * A declared encoding is one obtained from an HTTP header, an XML
 * declaration or a similar source, and serves as additional information
 * for the charset detector. When a detected encoding agrees with the
 * declared one, the quality of that detected encoding is raised by a small
 * delta and the agreement is listed as a "reason" for the match.
 * <p>
 * A declared encoding that is incompatible with the input data being
 * analyzed will not be added to the list of possible encodings.
 * <p>
 * The value is passed on to {@code setCanonicalDeclaredEncoding}.
 *
 * @param encoding The declared encoding
 * @return this detector, to allow call chaining
 * @stable ICU 3.4
 */
public CharsetDetector setDeclaredEncoding(String encoding) {
    this.setCanonicalDeclaredEncoding(encoding);

    return this;
}
// Value is rounded up, so zero really means zero occurrences.
/**
 * Detects the character encoding of a string. When the character
 * encoding of what the input is supposed to be is known, specifying
 * it as a declared encoding will influence the detection result.
 * <p>
 * The input is converted to UTF-8 bytes and analysed by a
 * {@code CharsetDetector} with input filtering enabled.
 *
 * @param input the input to detect encoding on
 * @param declaredEncoding declared input encoding, if known
 * @return the character encoding official name, or <code>null</code>
 *         if the input is null or blank or no encoding could be detected
 * @throws IOException if there is a problem finding the character encoding
 */
public static String detectCharset(
        String input, String declaredEncoding) throws IOException {
    if (StringUtils.isBlank(input)) {
        return null;
    }

    CharsetDetector cd = new CharsetDetector();
    if (StringUtils.isNotBlank(declaredEncoding)) {
        cd.setDeclaredEncoding(declaredEncoding);
    }
    cd.enableInputFilter(true);
    // Charset constant instead of a name lookup: no checked
    // UnsupportedEncodingException and no risk of a misspelled name.
    cd.setText(input.getBytes(java.nio.charset.StandardCharsets.UTF_8));

    CharsetMatch match = cd.detect();
    // detect() yields null when nothing matches; guard instead of throwing an NPE.
    if (match == null) {
        return null;
    }

    String charset = match.getName();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Detected encoding: " + charset);
    }
    return charset;
}
CharsetDetector detector = new CharsetDetector(); detector.setText(text.getRawValue()); CharsetMatch match = detector.detect(); if (match != null && match.getConfidence() > 35 && tryToSet7BitEncoding(msg, match.getName())) {
CharsetDetector detector = new CharsetDetector(markLimit); String cleaned = CharsetUtils.clean(incomingCharset); if (cleaned != null) { detector.setDeclaredEncoding(cleaned); } else { detector.enableInputFilter(true); detector.setText(input); for (CharsetMatch match : detector.detectAll()) { try { return CharsetUtils.forName(match.getName());
/**
 * Supplies a complete byte array as the text whose charset is to be detected.
 * <p>
 * Delegates to the two-argument overload, passing the full length of the array.
 *
 * @param in the input text of unknown encoding; a null array fails with a
 *           {@link NullPointerException} when its length is read
 * @return This CharsetDetector
 * @stable ICU 3.4
 */
public CharsetDetector setText(byte[] in) {
    final int wholeLength = in.length;
    return setText(in, wholeLength);
}