org.apache.tika.parser.txt.CharsetDetector java code examples

CharsetDetector detector = new CharsetDetector(markLimit);
  String cleaned = CharsetUtils.clean(incomingCharset);
  if (cleaned != null) {
    detector.setDeclaredEncoding(cleaned);
  } else {
detector.enableInputFilter(true);
detector.setText(input);
for (CharsetMatch match : detector.detectAll()) {
  try {
    return CharsetUtils.forName(match.getName());

/**
 * Detect the charset of a byte array and return the bytes converted
 * to a String.
 * <p>
 * The declared encoding is stored as given, the input is installed with
 * {@link #setText(byte[])}, and the match returned by <code>detect()</code>
 * is asked for its String form.
 * <p>
 * No exception is raised on failure: <code>null</code> is returned when
 * <code>detect()</code> yields no match, or when an IOException occurs
 * while the text is being converted.
 *
 * @param in               The source of the byte data in the unknown charset.
 * @param declaredEncoding A declared encoding for the data, if available,
 *                         or null or an empty string if none is available.
 * @return the converted text, or <code>null</code> if no charset matched
 *         or the conversion failed
 * @stable ICU 3.4
 */
public String getString(byte[] in, String declaredEncoding) {
  fDeclaredEncoding = declaredEncoding;
  try {
    setText(in);
    final CharsetMatch best = detect();
    // A missing match is reported as null, not as an exception.
    return (best != null) ? best.getString(-1) : null;
  } catch (IOException ignored) {
    // Best effort by design: a failed conversion is reported as null.
    return null;
  }
}

/**
 * Install the raw input buffer together with the number of usable bytes,
 * then run MungeInput() so the detector state reflects the new input.
 *
 * @param in     the input bytes of unknown encoding
 * @param length the number of bytes of <code>in</code> to use
 * @return This CharsetDetector
 */
private CharsetDetector setText(byte[] in, int length) {
  this.fRawInput = in;
  this.fRawLength = length;
  // Input preparation happens here, at set time.
  this.MungeInput();
  return this;
}

/**
 * Guess the character encoding of the data in the given stream.
 * <p>
 * The stream is wrapped in a BufferedInputStream unless it already is one.
 *
 * @param is the stream whose encoding is to be guessed
 * @return the name of the best matching charset, or <code>null</code> if
 *         the detector found no match
 * @throws IOException if the stream cannot be read
 */
public String guessEncoding(InputStream is) throws IOException {
  CharsetDetector charsetDetector = new CharsetDetector();
  // Enable the filter before supplying the text: this detector's setText()
  // runs MungeInput() straight away, so a flag set afterwards would
  // presumably be ignored for this input (NOTE(review): confirm against
  // MungeInput()).
  charsetDetector.enableInputFilter(true);
  InputStream buffered = is instanceof BufferedInputStream ? is : new BufferedInputStream(is);
  charsetDetector.setText(buffered);
  CharsetMatch cm = charsetDetector.detect();
  // detect() can come back empty; report that as null rather than an NPE.
  return cm == null ? null : cm.getName();
}

CharsetDetector detector = new CharsetDetector();
detector.setText(text.getRawValue());
CharsetMatch match = detector.detect();
if (match != null && match.getConfidence() > 35 &&
    tryToSet7BitEncoding(msg, match.getName())) {

mime.equals(org.apache.tika.mime.MediaType.TEXT_HTML.toString()) || 
mime.equals(org.apache.tika.mime.MediaType.TEXT_PLAIN.toString())){
CharsetDetector charsetDetector = new CharsetDetector();
charsetDetector.setText(bytes);
String fileString = charsetDetector.getString(bytes, null);
bytes = fileString.getBytes(charsetDetector.detect().getName());

/**
 * Decode the raw bytes of the given KLV into the string <code>value</code>.
 * <p>
 * The detector's candidates are scanned in order and the first one whose
 * name is contained in <code>possibleCharsets</code> is used. If there is
 * no such candidate, or the JVM rejects its name, <code>utf8</code> is used.
 *
 * @param klv the KLV whose value bytes are decoded
 */
@Override
protected void decodeValue(Klv klv) {
  final byte[] rawBytes = klv.getValue();

  final CharsetDetector sniffer = new CharsetDetector();
  sniffer.setText(rawBytes);

  // First detected candidate that is on the allow-list, if any.
  CharsetMatch accepted = null;
  for (CharsetMatch candidate : sniffer.detectAll()) {
    if (possibleCharsets.contains(candidate.getName())) {
      accepted = candidate;
      break;
    }
  }

  Charset decoder = utf8;
  if (accepted != null) {
    try {
      decoder = Charset.forName(accepted.getName());
    } catch (IllegalArgumentException e) {
      // Name is illegal or unsupported on this JVM: keep the default.
      LOGGER.trace("Unsupported encoding, falling back to default encoding");
    }
  }
  value = new String(rawBytes, decoder);
}

/**
 * Set the input text (byte) data whose charset is to be detected.
 * <p>
 * The whole array is used: this delegates to the two-argument form with
 * the array's full length.
 *
 * @param in the input text of unknown encoding
 * @return This CharsetDetector
 * @stable ICU 3.4
 */
public CharsetDetector setText(byte[] in) {
  final int byteCount = in.length;
  return setText(in, byteCount);
}

CharsetMatch matches[] = detectAll();

/**
 * Set the declared encoding for charset detection.
 * <p>
 * The declared encoding of an input text is an encoding obtained from an
 * http header, an xml declaration or a similar source, and is handed to
 * the detector as additional information. When a detected encoding
 * matches the declared one, the quality of that detected encoding is
 * raised by a small delta and the match is also reported as a "reason".
 * <p>
 * A declared encoding that is incompatible with the input data being
 * analyzed will not be added to the list of possible encodings.
 *
 * @param encoding The declared encoding
 * @return This CharsetDetector
 * @stable ICU 3.4
 */
public CharsetDetector setDeclaredEncoding(String encoding) {
  // Delegates to setCanonicalDeclaredEncoding rather than storing the name directly.
  this.setCanonicalDeclaredEncoding(encoding);
  return this;
}
//   Value is rounded up, so zero really means zero occurrences.

  /**
   * Detect the charset of the given bytes.
   *
   * @param data the bytes whose charset is to be detected
   * @param hint a declared encoding passed to the detector as additional
   *             information, or <code>null</code> if none is known
   * @return the name of the best matching charset, or <code>null</code> if
   *         the detector found no match
   */
  @Override
  public String detect(byte[] data, String hint) {
    CharsetDetector detector = new CharsetDetector();
    if (hint != null) {
      detector.setDeclaredEncoding(hint);
    }
    detector.setText(data);
    CharsetMatch match = detector.detect();
    // The detector may find no plausible charset; report that as null
    // instead of failing with a NullPointerException.
    return match == null ? null : match.getName();
  }
}

CharsetDetector detector = new CharsetDetector();
content = edit.streamContent();
if (content.markSupported()) 
  detector.setText(content);
} else {
   content.read(contentBytes);
   detector.setText(contentBytes);
CharsetMatch match = detector.detect();

  return setText(new byte[0]);
} else if ( kBufSize > bytesRead) {
  return setText(inputBytes, (int)bytesRead);
} else {
  return setText(inputBytes);

CharsetMatch matches[] = detectAll();

/**
 * Set the declared encoding for charset detection.
 * <p>
 * The declared encoding of an input text is an encoding obtained from an
 * http header, an xml declaration or a similar source, and is handed to
 * the detector as additional information. When a detected encoding
 * matches the declared one, the quality of that detected encoding is
 * raised by a small delta and the match is also reported as a "reason".
 * <p>
 * A declared encoding that is incompatible with the input data being
 * analyzed will not be added to the list of possible encodings.
 *
 * @param encoding The declared encoding
 * @return This CharsetDetector
 * @stable ICU 3.4
 */
public CharsetDetector setDeclaredEncoding(String encoding) {
  // Delegates to setCanonicalDeclaredEncoding rather than storing the name directly.
  this.setCanonicalDeclaredEncoding(encoding);
  return this;
}
//   Value is rounded up, so zero really means zero occurrences.

/**
 * Detects the character encoding of a string. When the character
 * encoding of what the input is supposed to be is known, specifying
 * it as a declared encoding will influence the detection result.
 * <p>
 * Detection runs over the UTF-8 bytes of the input, with the detector's
 * input filter enabled.
 *
 * @param input the input to detect encoding on
 * @param declaredEncoding declared input encoding, if known
 * @return the character encoding official name, or <code>null</code>
 *         if the input is null or blank or no encoding could be detected
 * @throws IOException if there is a problem finding the character encoding
 */
public static String detectCharset(
    String input, String declaredEncoding) throws IOException {
  if (StringUtils.isBlank(input)) {
    return null;
  }
  CharsetDetector cd = new CharsetDetector();
  if (StringUtils.isNotBlank(declaredEncoding)) {
    cd.setDeclaredEncoding(declaredEncoding);
  }
  cd.enableInputFilter(true);
  // Charset constant instead of a name lookup: no string to mistype and
  // no UnsupportedEncodingException path.
  cd.setText(input.getBytes(java.nio.charset.StandardCharsets.UTF_8));
  CharsetMatch match = cd.detect();
  if (match == null) {
    // No plausible charset found; avoid dereferencing a null match.
    return null;
  }
  String charset = match.getName();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Detected encoding: " + charset);
  }
  return charset;
}

CharsetDetector detector = new CharsetDetector();
detector.setText(text.getRawValue());
CharsetMatch match = detector.detect();
if (match != null && match.getConfidence() > 35 &&
    tryToSet7BitEncoding(msg, match.getName())) {

CharsetDetector detector = new CharsetDetector(markLimit);
  String cleaned = CharsetUtils.clean(incomingCharset);
  if (cleaned != null) {
    detector.setDeclaredEncoding(cleaned);
  } else {
detector.enableInputFilter(true);
detector.setText(input);
for (CharsetMatch match : detector.detectAll()) {
  try {
    return CharsetUtils.forName(match.getName());

setText(in);
CharsetMatch match = detect();

/**
 * Set the input text (byte) data whose charset is to be detected.
 * <p>
 * The whole array is used: this delegates to the two-argument form with
 * the array's full length.
 *
 * @param in the input text of unknown encoding
 * @return This CharsetDetector
 * @stable ICU 3.4
 */
public CharsetDetector setText(byte[] in) {
  final int byteCount = in.length;
  return setText(in, byteCount);
}

Javadoc

CharsetDetector provides a facility for detecting the charset or encoding of character data in an unknown format. The input data can either be from an input stream or an array of bytes. The result of the detection operation is a list of possibly matching charsets, or, for simple use, you can just ask for a Java Reader that will work over the input data.

Character set detection is at best an imprecise operation. The detection process will attempt to identify the charset that best matches the characteristics of the byte data, but the process is partly statistical in nature, and the results can not be guaranteed to always be correct.

For best accuracy in charset detection, the input data should be primarily in a single language, and a minimum of a few hundred bytes worth of plain text in the language are needed. The detection process will attempt to ignore html or xml style markup that could otherwise obscure the content.

Most used methods

<init>
setText
detect
Return the charset that best matches the supplied input data. Note though, that because the detectio
enableInputFilter
Enable filtering of input text. If filtering is enabled, text within angle brackets ("<" and ">") wi
setDeclaredEncoding
Set the declared encoding for charset detection. The declared encoding of an input text is an encodi
detectAll
Return an array of all charsets that appear to be plausible matches with the input data. The array i
MungeInput
setCanonicalDeclaredEncoding
Try to set fDeclaredEncoding to the canonical name for the given encoding, if it exists.
getString
Autodetect the charset of an inputStream, and return a String containing the converted input data. T

Popular in Java

Updating database using SQL prepared statement
setContentView (Activity)
notifyDataSetChanged (ArrayAdapter)
getResourceAsStream (ClassLoader)
HttpServer (com.sun.net.httpserver)
This class implements a simple HTTP server. A HttpServer is bound to an IP address and port number a
ConnectException (java.net)
A ConnectException is thrown if a connection cannot be established to a remote host on a specific po
ZipFile (java.util.zip)
This class provides random read access to a zip file. You pay more to read the zip file's central di
Font (java.awt)
The Font class represents fonts, which are used to render text in a visible way. A font provides the
Annotation (javassist.bytecode.annotation)
The annotation structure. An instance of this class is returned by getAnnotations() in AnnotationsAttr
Project (org.apache.tools.ant)
Central representation of an Ant project. This class defines an Ant project with all of its targets,
CodeWhisperer alternatives

How to use CharsetDetector in org.apache.tika.parser.txt

Best Java code snippets using org.apache.tika.parser.txt.CharsetDetector (Showing top 20 results out of 315)

How to use
CharsetDetector
in
org.apache.tika.parser.txt