com.ibm.icu.text.CharsetDetector java code examples

Refine search

CharsetMatch

 // Detect the charset of `input` and expose both a Reader over it and the charset name.
 BufferedInputStream bis = new BufferedInputStream(input);
 CharsetDetector cd = new CharsetDetector();
 cd.setText(bis);
 CharsetMatch cm = cd.detect();

 // A null match means nothing was detected.
 if (cm == null) {
   // NOTE(review): assumes java.nio.charset.UnsupportedCharsetException, whose only
   // constructor takes the charset name; there is no real name here, so a marker is passed.
   throw new UnsupportedCharsetException("undetected");
 }
 reader = cm.getReader();
 charset = cm.getName();

// Sample input: a short sentence encoded as ISO-8859-1 bytes.
final byte[] thisAppCanBreak = "this app can break".getBytes("ISO-8859-1");
final String tableTemplate = "%10s %10s %8s%n";

final CharsetDetector detector = new CharsetDetector();
detector.setText(thisAppCanBreak);

// Header row first, then one row per candidate returned by detectAll().
System.out.format(tableTemplate, "CONFIDENCE", "CHARSET", "LANGUAGE");
for (CharsetMatch match : detector.detectAll()) {
  int confidence = match.getConfidence();
  String name = match.getName();
  String language = match.getLanguage();
  System.out.format(tableTemplate, confidence, name, language);
}

/**
 * Detects the candidate charsets of the given stream using ICU's CharsetDetector.
 *
 * <p>Candidates whose encoding name has no {@link Charset} on the running JVM are
 * skipped instead of aborting the whole detection.
 *
 * @param source the stream to inspect; it is wrapped in a BufferedInputStream
 * @return the supported charsets among the detected candidates, possibly empty
 * @throws CharsetDetectorException if reading the stream fails
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
  Set<Charset> set = new HashSet<Charset>();
  com.ibm.icu.text.CharsetDetector charsetDetector = new com.ibm.icu.text.CharsetDetector();
  try {
    charsetDetector.setText(new BufferedInputStream(source));
    for (CharsetMatch match : charsetDetector.detectAll()) {
      String name = match.getName();
      // Charset.forName throws UnsupportedCharsetException for names the JVM lacks,
      // so check availability first and keep the remaining candidates.
      if (Charset.isSupported(name)) {
        set.add(Charset.forName(name));
      }
    }
  } catch (IOException e) {
    throw new CharsetDetectorException(e.getMessage(), e);
  }
  return set;
}

/**
 * Read a text file, detecting its encoding with ICU
 * (http://userguide.icu-project.org/conversion/detection), and return the file
 * contents as a String with two line separators appended.
 *
 * @param f the file to read
 * @return the decoded file contents followed by two line separators
 * @throws IOException if the file cannot be read or its encoding cannot be detected
 */
public static String fileAnyEncodingToString(File f) throws IOException {
 byte[] byteData;
 // Close the stream explicitly once the bytes are read.
 try (FileInputStream in = new FileInputStream(f)) {
  byteData = IOUtils.toByteArray(in);
 }
 CharsetDetector detector = new CharsetDetector();
 String unicodeData = detector.getString(byteData, null);
 if (unicodeData == null) {
  // Without this guard the concatenation below would return the literal text "null".
  throw new IOException("Unable to detect the encoding of " + f);
 }
 // Add two newlines at the end of the file, otherwise the subtitle parser library can get confused by EOF
 unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator");
 // The match is only used for debug logging; it does not affect the returned text.
 CharsetMatch match = detector.detect();
 if (match != null && match.getConfidence() > 60) {
  LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName());
  if (match.getLanguage() != null) {
   LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage());
  }
 }
 return unicodeData;
}

 // Raw bytes whose encoding is unknown.
 byte[] byteData = ...;

 // Hand the bytes to a fresh detector and take the result of detect().
 CharsetDetector detector = new CharsetDetector();
 detector.setText(byteData);
 CharsetMatch match = detector.detect();

 /**
  * Reads a whole file into memory and decodes it with the encoding guessed by
  * CharsetDetector, falling back to the platform default charset when no match
  * is found or the guessed encoding is not supported.
  *
  * @param filePath path of the file to read
  * @return the decoded contents, or null if reading or detection threw an exception
  */
 public static String readFileAsStringGuessEncoding(String filePath)
 {
   String s = null;
   try {
     File file = new File(filePath);
     byte[] fileData = new byte[(int) file.length()];
     // try-with-resources closes the stream even when readFully() throws.
     try (DataInputStream dis = new DataInputStream(new FileInputStream(file))) {
       dis.readFully(fileData);
     }

     CharsetMatch match = new CharsetDetector().setText(fileData).detect();

     if (match != null) {
       try {
         Lt.d("For file: " + filePath + " guessed enc: " + match.getName() + " conf: " + match.getConfidence());
         s = new String(fileData, match.getName());
       } catch (UnsupportedEncodingException ue) {
         // The guessed encoding is unknown to this JVM; use the default-charset fallback below.
         s = null;
       }
     }
     if (s == null) {
       s = new String(fileData);
     }
   } catch (Exception e) {
     Lt.e("Exception in readFileAsStringGuessEncoding(): " + e);
     e.printStackTrace();
   }
   return s;
 }

  setText(in);
  CharsetMatch match = detect();
  return match.getString(-1);
} catch (IOException e) {
  return null;

  setText(in);
  CharsetMatch match = detect();
  return match.getReader();
} catch (IOException e) {
  return null;

 /**
  * Detects the encoding of the given file and reads the file into a String using it.
  * The resulting String is held in a local variable only; nothing is returned.
  * An IOException is reported on System.err rather than propagated.
  *
  * @param file the file to read
  */
 public static void XMLtoString(File file) {

   String encoding = "";
   String str = "";

   // try-with-resources closes the stream opened for detection.
   try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(file))) {
     // detect the encoding of the file
     CharsetDetector cd = new CharsetDetector().setText(in);
     com.ibm.icu.text.CharsetMatch match = cd.detect();
     if (match == null) {
       // No match: report it instead of failing with a NullPointerException.
       System.err.println("Could not detect the encoding of " + file);
       return;
     }
     encoding = match.getName();

     // to avoid the BOM ("byte order mark") being added to the String, encoding is specified as a parameter
     str = FileUtils.readFileToString(file, encoding);
   }
   catch (IOException e) {
     System.err.println("Caught IOException: " + e.getMessage());
   }
 }

  /**
   * Fetches the next file, initialises the CAS for it and sets the file's text as
   * the document text. When the source encoding equals ENCODING_AUTO the text is
   * read through the Reader supplied by CharsetDetector; otherwise it is decoded
   * with the configured source encoding.
   */
  @Override
  public void getNext(CAS aJCas)
    throws IOException, CollectionException
  {
    Resource res = nextFile();
    initCas(aJCas, res);

    try (InputStream is = new BufferedInputStream(
        CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) {
      final String text = ENCODING_AUTO.equals(sourceEncoding)
          ? IOUtils.toString(new CharsetDetector().getReader(is, null))
          : IOUtils.toString(is, sourceEncoding);

      aJCas.setDocumentText(text);
    }
  }
}

/**
 * Lists the encodings that the <code>detectEncoding</code> methods are able to detect.
 * Some of the listed character encodings may be unavailable on the Java runtime.
 *
 * @return the names of every encoding the <code>detectEncoding</code> methods can detect.
 */
public static String[] getDetectableEncodings() {
  final String[] detectable = CharsetDetector.getAllDetectableCharsets();
  return detectable;
}

CharsetMatch matches[] = detectAll();

/**
 * Prints a table of every charset candidate (confidence, name, language) that
 * CharsetDetector reports for the file named by the first command-line argument.
 *
 * @param args command-line arguments; args[0] is the path of the file to inspect
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if no file path is supplied
 */
public static void main(String[] args) throws IOException {
 if (args.length == 0) {
  // Fail with a clear message instead of an ArrayIndexOutOfBoundsException.
  throw new IllegalArgumentException("Expected the path of the file to inspect as the first argument");
 }
 try (InputStream file = new BufferedInputStream(new FileInputStream(args[0]))) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(file);
  String tableTemplate = "%10s %10s %8s%n";
  System.out.format(tableTemplate, "CONFIDENCE", "CHARSET", "LANGUAGE");
  for (CharsetMatch match : detector.detectAll()) {
   System.out.format(tableTemplate, match.getConfidence(), match.getName(), match.getLanguage());
  }
 }
}

/**
 * Detects the encoding of a file, passing the required encoding to the detector
 * as the declared encoding.
 *
 * @param requiredEncoding the encoding handed to the detector via setDeclaredEncoding
 * @param file the file to inspect
 * @param log not used by this method
 * @return the name of the first detected charset, or null when nothing was detected
 * @throws IOException if the file cannot be opened or read
 */
protected String getEncoding( String requiredEncoding, File file, Log log )
  throws IOException
{
  FileInputStream fis = null;
  try
  {
    fis = new FileInputStream( file );
    CharsetDetector detector = new CharsetDetector();
    detector.setDeclaredEncoding( requiredEncoding );
    detector.setText( new BufferedInputStream( fis ) );
    CharsetMatch[] charsets = detector.detectAll();
    // An empty array must be treated like null, otherwise charsets[0] throws
    // ArrayIndexOutOfBoundsException.
    if ( charsets == null || charsets.length == 0 )
    {
      return null;
    }
    return charsets[0].getName();
  }
  finally
  {
    IOUtil.close( fis );
  }
}

/** Prints the name of every charset CharsetDetector reports as detectable, one per line. */
public static void main(String[] args) {
  for (String charset : CharsetDetector.getAllDetectableCharsets()) {
    System.out.println(charset);
  }
}

/**
 * Suggests an encoding name for the given bytes using CharsetDetector and logs
 * the match together with its confidence.
 *
 * @param bytes the data whose encoding should be guessed
 * @return the name of the best matching charset, or null if no charset matched
 */
protected String suggestEncoding(final byte[] bytes) {
  final CharsetDetector cd = new CharsetDetector();
  cd.setText(bytes);
  final CharsetMatch charsetMatch = cd.detect();
  if (charsetMatch == null) {
    // No match: report it instead of failing with a NullPointerException.
    logger.warn("CharsetMatch: no charset detected");
    return null;
  }
  final String charSet = charsetMatch.getName();
  final int confidence = charsetMatch.getConfidence();
  logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
  return charSet;
}

/** Prints the name of every charset CharsetDetector reports as detectable, one per line. */
public static void main(String[] args) {
  for (String charset : CharsetDetector.getAllDetectableCharsets()) {
    System.out.println(charset);
  }
}

 /**
  * Detects the charset of the given bytes with ICU's CharsetDetector.
  *
  * @param input the bytes to inspect
  * @return the Charset named by the best match
  * @throws IllegalArgumentException if no charset matched, or (from Charset.forName)
  *         if the matched name is illegal or unsupported on this JVM
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   // Same exception family as Charset.forName failures, instead of a NullPointerException.
   throw new IllegalArgumentException("Unable to detect the charset of the input");
  }
  // Charset names are case-insensitive, so no upper-casing is needed; the default-locale
  // toUpperCase() could corrupt the name (e.g. dotted capital I under a Turkish locale).
  return Charset.forName(match.getName());
 }
}

  /** Prints the name of every charset CharsetDetector reports as detectable, one per line. */
  public static void main(String[] args) {
    for (String charset : CharsetDetector.getAllDetectableCharsets()) {
      System.out.println(charset);
    }
  }
}

 /**
  * Detects the charset of the given bytes with ICU's CharsetDetector.
  *
  * @param input the bytes to inspect
  * @return the Charset named by the best match
  * @throws IllegalArgumentException if no charset matched, or (from Charset.forName)
  *         if the matched name is illegal or unsupported on this JVM
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   // Same exception family as Charset.forName failures, instead of a NullPointerException.
   throw new IllegalArgumentException("Unable to detect the charset of the input");
  }
  // Charset names are case-insensitive, so no upper-casing is needed; the default-locale
  // toUpperCase() could corrupt the name (e.g. dotted capital I under a Turkish locale).
  return Charset.forName(match.getName());
 }
}

Javadoc

CharsetDetector provides a facility for detecting the charset or encoding of character data in an unknown format. The input data can either be from an input stream or an array of bytes. The result of the detection operation is a list of possibly matching charsets, or, for simple use, you can just ask for a Java Reader that will work over the input data.

Character set detection is at best an imprecise operation. The detection process will attempt to identify the charset that best matches the characteristics of the byte data, but the process is partly statistical in nature, and the results can not be guaranteed to always be correct.

For best accuracy in charset detection, the input data should be primarily in a single language, and a minimum of a few hundred bytes worth of plain text in the language are needed. The detection process will attempt to ignore html or xml style markup that could otherwise obscure the content.

Most used methods

<init>
Constructor
setText
Set the input text (byte) data whose charset is to be detected.
detect
Return the charset that best matches the supplied input data. Note though, that because the detectio
detectAll
Return an array of all charsets that appear to be plausible matches with the input data. The array i
getAllDetectableCharsets
Get the names of all charsets supported by CharsetDetector class. Note: Multiple different charset en
setDeclaredEncoding
Set the declared encoding for charset detection. The declared encoding of an input text is an encodi
enableInputFilter
Enable filtering of input text. If filtering is enabled, text within angle brackets ("<" and ">") wi
MungeInput
getReader
Autodetect the charset of an inputStream, and return a Java Reader to access the converted input dat
getString
Autodetect the charset of an inputStream, and return a String containing the converted input data. T

Popular in Java

Reactive rest calls using spring rest template
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
putExtra (Intent)
getApplicationContext (Context)
String (java.lang)
MalformedURLException (java.net)
This exception is thrown when a program attempts to create an URL from an incorrect specification.
URL (java.net)
A Uniform Resource Locator that identifies the location of an Internet resource as specified by RFC
Hashtable (java.util)
A plug-in replacement for JDK1.5 java.util.Hashtable. This version is based on org.cliffc.high_scale
Iterator (java.util)
An iterator over a sequence of objects, such as a collection. If a collection has been changed since
Font (java.awt)
The Font class represents fonts, which are used to render text in a visible way. A font provides the
Top Sublime Text plugins

How to use CharsetDetector in com.ibm.icu.text

Best Java code snippets using com.ibm.icu.text.CharsetDetector (Showing top 20 results out of 315)

Refine search

How to use
CharsetDetector
in
com.ibm.icu.text