com.ibm.icu.text.CharsetMatch java code examples

Refine search

CharsetDetector

 // Buffer the input: the detector's setText(InputStream) relies on mark/reset support.
 BufferedInputStream bis = new BufferedInputStream(input);
 CharsetDetector cd = new CharsetDetector();
 cd.setText(bis);
 CharsetMatch cm = cd.detect();

 // No best match was reported, so there is nothing to build a reader from.
 if (cm == null) {
   // UnsupportedCharsetException has no no-arg constructor; it requires a charset name.
   throw new UnsupportedCharsetException("unknown");
 }
 reader = cm.getReader();
 charset = cm.getName();

// Encode a sample string as ISO-8859-1 and print every candidate ICU reports for it.
byte[] sampleBytes = "this app can break".getBytes("ISO-8859-1");
CharsetDetector charsetDetector = new CharsetDetector();
charsetDetector.setText(sampleBytes);

// One header row, then one row per candidate match.
String rowFormat = "%10s %10s %8s%n";
System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");
CharsetMatch[] candidates = charsetDetector.detectAll();
for (int i = 0; i < candidates.length; i++) {
  CharsetMatch candidate = candidates[i];
  int confidence = candidate.getConfidence();
  String charsetName = candidate.getName();
  String language = candidate.getLanguage();
  System.out.format(rowFormat, confidence, charsetName, language);
}

/**
 * Collects the charsets ICU considers possible for the given stream.
 *
 * <p>Only candidates that the running JVM can resolve through
 * {@link Charset#forName(String)} are returned. ICU may report names the JVM has
 * no {@code Charset} for (for example directional variants such as
 * {@code IBM424_rtl}); those candidates are skipped instead of aborting the
 * whole detection.
 *
 * @param source the bytes to analyse; wrapped in a {@code BufferedInputStream}
 *               before being handed to the detector
 * @return the resolvable candidate charsets, possibly empty, never {@code null}
 * @throws CharsetDetectorException if reading from {@code source} fails
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
  Set<Charset> set = new HashSet<Charset>();
  com.ibm.icu.text.CharsetDetector charsetDetector = new com.ibm.icu.text.CharsetDetector();
  try {
    charsetDetector.setText(new BufferedInputStream(source));
    CharsetMatch[] charsetMatchs = charsetDetector.detectAll();
    for (CharsetMatch match : charsetMatchs) {
      try {
        set.add(Charset.forName(match.getName()));
      } catch (IllegalArgumentException ignored) {
        // Covers both IllegalCharsetNameException and UnsupportedCharsetException:
        // the JVM cannot represent this candidate, so it is of no use to callers.
      }
    }
  } catch (IOException e) {
    throw new CharsetDetectorException(e.getMessage(), e);
  }
  return set;
}

/**
 * Read a text file, detecting its encoding with ICU
 * (http://userguide.icu-project.org/conversion/detection), and return the
 * contents as a String followed by two line separators.
 *
 * <p>If ICU cannot decode the data (its {@code getString} returns {@code null}),
 * the bytes are decoded with the platform default charset instead of producing
 * the literal text "null".
 *
 * @param f the file to read
 * @return the decoded file contents with two trailing line separators
 * @throws IOException if the file cannot be opened or read
 */
public static String fileAnyEncodingToString(File f) throws IOException {
  byte[] byteData;
  FileInputStream in = new FileInputStream(f);
  try {
    byteData = IOUtils.toByteArray(in);
  } finally {
    // The stream is opened here, so it is closed here as well.
    in.close();
  }

  CharsetDetector detector = new CharsetDetector();
  String unicodeData = detector.getString(byteData, null);
  if (unicodeData == null) {
    // Without this guard the concatenation below would yield "null" + separators.
    unicodeData = new String(byteData);
  }

  // Add two newlines at the end of the file, otherwise the subtitle parser library can get confused by EOF
  String lineSeparator = System.getProperty("line.separator");
  unicodeData += lineSeparator + lineSeparator;

  // Detection is repeated only to log what was found; it does not affect the result.
  CharsetMatch match = detector.detect();
  if (match != null && match.getConfidence() > 60) {
    LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName());
    if (match.getLanguage() != null) {
      LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage());
    }
  }
  return unicodeData;
}

 /**
  * Reads a whole file into memory and decodes it with the charset ICU guesses for it.
  *
  * <p>If no charset is guessed, or the guessed name is not supported by the JVM,
  * the bytes are decoded with the platform default charset. Any failure is logged
  * and results in {@code null} being returned rather than an exception.
  *
  * @param filePath path of the file to read
  * @return the decoded contents, or {@code null} if the file could not be read
  */
 public static String readFileAsStringGuessEncoding(String filePath) {
   String s = null;
   try {
     File file = new File(filePath);
     byte[] fileData = new byte[(int) file.length()];
     DataInputStream dis = new DataInputStream(new FileInputStream(file));
     try {
       dis.readFully(fileData);
     } finally {
       // Close even when readFully fails, so the file handle is not leaked.
       dis.close();
     }

     CharsetMatch match = new CharsetDetector().setText(fileData).detect();

     if (match != null) {
       try {
         Lt.d("For file: " + filePath + " guessed enc: " + match.getName() + " conf: " + match.getConfidence());
         s = new String(fileData, match.getName());
       } catch (UnsupportedEncodingException ue) {
         // The JVM does not know the guessed charset; use the default decoding below.
         s = null;
       }
     }
     if (s == null) {
       s = new String(fileData);
     }
   } catch (Exception e) {
     Lt.e("Exception in readFileAsStringGuessEncoding(): " + e);
     e.printStackTrace();
   }
   return s;
 }

  setText(in);
  CharsetMatch match = detect();
  return match.getString(-1);
} catch (IOException e) {
  return null;

StringBuilder sb = new StringBuilder();
char[] buffer = new char[1024];
Reader reader = getReader();
int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
int bytesRead = 0;
String name = getName();

/**
 * Returns a java.io.Reader that decodes the bytes originally given to the
 * detector, using this match's charset name.
 * <p>
 * The bytes come from the stored InputStream when there is one, otherwise from
 * the stored raw byte buffer. The stream is reset before the Reader is created.
 * <p>
 * CAUTION: when the bytes came from an InputStream, only one matching charset
 * can be read this way. To try several charsets, the caller has to reset the
 * InputStream and build its own InputStreamReaders from the charset names.
 *
 * @return the Reader for the Unicode character data, or null if resetting the
 *         stream or creating the Reader fails with an IOException.
 *
 * @stable ICU 3.4
 */
public Reader getReader() {
  InputStream source = (fInputStream != null)
      ? fInputStream
      : new ByteArrayInputStream(fRawInput, 0, fRawLength);
  try {
    source.reset();
    return new InputStreamReader(source, getName());
  } catch (IOException e) {
    return null;
  }
}

/**
 * Returns the original byte data, as supplied to the Charset detect operation,
 * converted to a Java String.
 * <p>
 * Delegates to {@code getString(int)} with a negative length.
 *
 * @return a String created from the converted input data.
 * @throws java.io.IOException if the delegate call fails with one.
 *
 * @stable ICU 3.4
 */
public String getString()  throws java.io.IOException {
  final int negativeLength = -1;
  return getString(negativeLength);
}

/**
 * Prints a table of every charset ICU detects for the file named by the first
 * command-line argument, one row per candidate with its confidence and language.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to analyse
 * @throws IOException if the file cannot be opened, read or closed
 */
public static void main(String[] args) throws IOException {
  InputStream in = new BufferedInputStream(new FileInputStream(args[0]));
  try {
    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.setText(in);

    String rowFormat = "%10s %10s %8s%n";
    System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");

    CharsetMatch[] candidates = charsetDetector.detectAll();
    for (int i = 0; i < candidates.length; i++) {
      CharsetMatch candidate = candidates[i];
      int confidence = candidate.getConfidence();
      String charsetName = candidate.getName();
      String language = candidate.getLanguage();
      System.out.format(rowFormat, confidence, charsetName, language);
    }
  } finally {
    in.close();
  }
}

/**
 * Detects the most likely encoding of a file with ICU.
 *
 * @param requiredEncoding passed to the detector as the declared encoding
 * @param file the file whose content is analysed
 * @param log not used by this method
 * @return the name of the first charset ICU reports, or {@code null} when ICU
 *         reports no candidate at all
 * @throws IOException if the file cannot be opened or read
 */
protected String getEncoding( String requiredEncoding, File file, Log log )
  throws IOException
{
  FileInputStream fis = null;
  try
  {
    fis = new FileInputStream( file );
    CharsetDetector detector = new CharsetDetector();
    detector.setDeclaredEncoding( requiredEncoding );
    detector.setText( new BufferedInputStream( fis ) );
    CharsetMatch[] charsets = detector.detectAll();
    // An empty result array must be handled too: indexing [0] on it would throw.
    if ( charsets == null || charsets.length == 0 )
    {
      return null;
    }
    return charsets[0].getName();
  }
  finally
  {
    IOUtil.close( fis );
  }
}

/**
 * Suggests an encoding for the given bytes using ICU's charset detection.
 *
 * @param bytes the raw data to analyse
 * @return the name of the best matching charset, or {@code null} when ICU
 *         reports no match
 */
protected String suggestEncoding(final byte[] bytes) {
  final CharsetDetector cd = new CharsetDetector();
  cd.setText(bytes);
  final CharsetMatch charsetMatch = cd.detect();
  if (charsetMatch == null) {
    // No best match was reported; dereferencing it would throw a NullPointerException.
    logger.info("CharsetMatch: none");
    return null;
  }
  final String charSet = charsetMatch.getName();
  final int confidence = charsetMatch.getConfidence();
  logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
  return charSet;
}

/**
 * Detects the encoding of a byte array with ICU and prints the best match and
 * every candidate to standard output.
 *
 * @param data the raw bytes to analyse
 * @return the name of the best matching charset, or {@code null} when ICU
 *         reports no match
 */
public static String getEncode(byte[] data){
  CharsetDetector detector = new CharsetDetector();
  detector.setText(data);
  CharsetMatch match = detector.detect();
  // Guard the best match: without it a missing match is a NullPointerException.
  String encoding = (match == null) ? null : match.getName();
  if (encoding != null) {
    System.out.println("The Content in " + encoding);
  }
  CharsetMatch[] matches = detector.detectAll();
  System.out.println("All possibilities");
  for (CharsetMatch m : matches) {
    System.out.println("CharsetName:" + m.getName() + " Confidence:"
        + m.getConfidence());
  }
  return encoding;
}

/**
 * Detects the encoding of a stream with ICU and prints the best match and
 * every candidate to standard output.
 *
 * <p>The detector needs a stream that supports mark/reset, so a stream without
 * that support is wrapped in a {@code BufferedInputStream} first.
 *
 * @param data the stream to analyse
 * @return the name of the best matching charset, or {@code null} when ICU
 *         reports no match
 * @throws IOException if reading from the stream fails
 */
public static String getEncode(InputStream data) throws IOException{
  InputStream markable = data.markSupported() ? data : new java.io.BufferedInputStream(data);
  CharsetDetector detector = new CharsetDetector();
  detector.setText(markable);
  CharsetMatch match = detector.detect();
  // Guard the best match: without it a missing match is a NullPointerException.
  String encoding = (match == null) ? null : match.getName();
  if (encoding != null) {
    System.out.println("The Content in " + encoding);
  }
  CharsetMatch[] matches = detector.detectAll();
  System.out.println("All possibilities");
  for (CharsetMatch m : matches) {
    System.out.println("CharsetName:" + m.getName() + " Confidence:"
        + m.getConfidence());
  }
  return encoding;
}
}

 /**
  * Detects the charset of the given bytes with ICU's {@code CharsetDetector}.
  *
  * @param input the raw bytes to analyse
  * @return the JVM {@link Charset} for the best match ICU reports
  * @throws IllegalStateException if ICU reports no match at all
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   throw new IllegalStateException("ICU could not detect a charset for the given input");
  }
  // Charset.forName ignores case, so no upper-casing is needed. The previous
  // default-locale toUpperCase() could corrupt names (e.g. 'i' under a Turkish locale).
  return Charset.forName(match.getName());
 }
}

 /**
  * Detects the charset of the given bytes with ICU's {@code CharsetDetector}.
  *
  * @param input the raw bytes to analyse
  * @return the JVM {@link Charset} for the best match ICU reports
  * @throws IllegalStateException if ICU reports no match at all
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   throw new IllegalStateException("ICU could not detect a charset for the given input");
  }
  // Charset.forName ignores case, so no upper-casing is needed. The previous
  // default-locale toUpperCase() could corrupt names (e.g. 'i' under a Turkish locale).
  return Charset.forName(match.getName());
 }
}

 /**
  * Detects the charset of the given bytes with ICU's {@code CharsetDetector}.
  *
  * @param input the raw bytes to analyse
  * @return the JVM {@link Charset} for the best match ICU reports
  * @throws IllegalStateException if ICU reports no match at all
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = detector.detect();
  if (match == null) {
   throw new IllegalStateException("ICU could not detect a charset for the given input");
  }
  // Charset.forName ignores case, so no upper-casing is needed. The previous
  // default-locale toUpperCase() could corrupt names (e.g. 'i' under a Turkish locale).
  return Charset.forName(match.getName());
 }
}

/**
 * Detects the encoding of the given bytes with ICU and selects it via
 * {@code setSelectedItem}.
 *
 * <p>When ICU reports no match, the current selection is left untouched.
 *
 * @param bytes the raw data to analyse
 * @return the name of the best matching charset, or {@code null} when ICU
 *         reports no match
 */
public String autoDetectEncoding(final byte[] bytes) {
  final CharsetDetector cd = new CharsetDetector();
  cd.setText(bytes);
  final CharsetMatch charsetMatch = cd.detect();
  if (charsetMatch == null) {
    // No best match was reported; dereferencing it would throw a NullPointerException.
    logger.info("CharsetMatch: none");
    return null;
  }
  final String charSet = charsetMatch.getName();
  final int confidence = charsetMatch.getConfidence();
  logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
  setSelectedItem(charSet);
  return charSet;
}

  /**
   * Fills {@code fileContent} from {@code fin} and detects the charset of the
   * bytes that were actually read.
   *
   * <p>A single {@code read} call may return fewer bytes than the buffer holds,
   * so reading continues until the buffer is full or the stream ends. Only the
   * bytes read are given to the detector, so unread buffer positions cannot
   * influence the result.
   *
   * @param fin the stream to read from
   * @param fileContent caller-supplied buffer that receives the bytes read
   * @return the detected charset name when ICU's confidence is above 50,
   *         otherwise {@code "ISO-8859-1"}
   * @throws IOException if reading from {@code fin} fails
   */
  public String detect(InputStream fin, byte[] fileContent) throws IOException
  {
    String charset = "ISO-8859-1";

    int filled = 0;
    while (filled < fileContent.length) {
      int count = fin.read(fileContent, filled, fileContent.length - filled);
      if (count < 0) {
        break; // end of stream
      }
      filled += count;
    }

    byte[] data = fileContent;
    if (filled < fileContent.length) {
      // Short input: analyse only the bytes read, not the rest of the buffer.
      data = new byte[filled];
      System.arraycopy(fileContent, 0, data, 0, filled);
    }

    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();

    if (cm != null) {
      int confidence = cm.getConfidence();
      // Keep the default unless ICU is reasonably sure of its guess.
      if (confidence > 50) {
        charset = cm.getName();
      }
    }
    return charset;
  }
}

/**
 * Detects the charset of a stream with ICU and returns its name.
 *
 * @param in the stream to analyse; wrapped in a {@code BufferedInputStream}
 *           when it does not support mark
 * @return the name of the best matching charset
 * @throws IOException if reading from the stream fails
 * @throws ConversionException if ICU reports no matching charset
 */
protected String detectEncoding(InputStream in) throws IOException, ConversionException {
  // detector.setText requires mark
  InputStream markable = in.markSupported() ? in : new BufferedInputStream(in);

  CharsetDetector icuDetector = new CharsetDetector();
  icuDetector.setText(markable);
  CharsetMatch best = icuDetector.detect();
  if (best != null) {
    return best.getName();
  }
  throw new ConversionException("Cannot detect source charset.");
}

Javadoc

This class represents a charset that has been identified by a CharsetDetector as a possible encoding for a set of input data. From an instance of this class, you can ask for a confidence level in the charset identification, or for Java Reader or String to access the original byte data in Unicode form.

Instances of this class are created only by CharsetDetectors.

Note: this class has a natural ordering that is inconsistent with equals. The natural ordering is based on the match confidence value.

Most used methods

getName
Get the name of the detected charset. The name will be one that can be used with other APIs on the platform that accept charset names.
getConfidence
Get an indication of the confidence in the charset detected. Confidence values range from 0-100, with larger numbers indicating a better match of the input data to the characteristics of the charset.
getString
Create a Java String from Unicode character data corresponding to the original byte data supplied to the Charset detect operation.
getLanguage
Get the ISO code for the language of the detected charset.
getReader
Create a java.io.Reader for reading the Unicode character data corresponding to the original byte data supplied to the Charset detect operation.
<init>

Popular in Java

Updating database using SQL prepared statement
getContentResolver (Context)
getExternalFilesDir (Context)
getApplicationContext (Context)
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
Collections (java.util)
This class consists exclusively of static methods that operate on or return collections. It contains
GregorianCalendar (java.util)
GregorianCalendar is a concrete subclass of Calendar and provides the standard calendar used by most
ConcurrentHashMap (java.util.concurrent)
A plug-in replacement for JDK1.5 java.util.concurrent.ConcurrentHashMap. This version is based on or
Logger (org.slf4j)
The org.slf4j.Logger interface is the main user entry point of SLF4J API. It is expected that loggin
LoggerFactory (org.slf4j)
The LoggerFactory is a utility class producing Loggers for various logging APIs, most notably for lo
CodeWhisperer alternatives

How to use CharsetMatch in com.ibm.icu.text

Best Java code snippets using com.ibm.icu.text.CharsetMatch (Showing top 20 results out of 315)

Refine search

How to use
CharsetMatch
in
com.ibm.icu.text