com.ibm.icu.text.CharsetMatch.getName java code examples

 // Wrap the raw stream in a BufferedInputStream before handing it to the detector
 // (presumably for mark/reset support — confirm against the ICU setText(InputStream) docs).
 BufferedInputStream bis = new BufferedInputStream(input);
 CharsetDetector cd = new CharsetDetector();
 cd.setText(bis);
 CharsetMatch cm = cd.detect();

 if (cm != null) {
   // Decoding reader and charset name both come from the same best match.
   reader = cm.getReader();
   charset = cm.getName();
 } else {
   // UnsupportedCharsetException has no no-arg constructor; it takes the charset name.
   throw new UnsupportedCharsetException("Unknown");
 }

/**
 * Guesses the charset of {@code data} with ICU and prints every candidate to stdout.
 *
 * @param data raw bytes whose encoding should be detected
 * @return the name of the best match, or {@code null} when ICU reports no match at all
 */
public static String getEncode(byte[] data){
  CharsetDetector detector = new CharsetDetector();
  detector.setText(data);
  // detectAll() is ordered best match first, so one detection pass provides both the
  // winner and the full candidate list.
  CharsetMatch[] matches = detector.detectAll();
  if (matches == null || matches.length == 0) {
    return null;
  }
  String encoding = matches[0].getName();
  System.out.println("The Content in " + encoding);
  System.out.println("All possibilities");
  for (CharsetMatch m : matches) {
    System.out.println("CharsetName:" + m.getName() + " Confidence:"
        + m.getConfidence());
  }
  return encoding;
}

/**
 * Guesses the charset of the bytes available from {@code data} with ICU and prints every
 * candidate to stdout.
 *
 * @param data stream to inspect; it is wrapped in a buffered stream when it does not
 *             support mark/reset itself
 * @return the name of the best match, or {@code null} when ICU reports no match at all
 * @throws IOException if reading from the stream fails
 */
public static String getEncode(InputStream data) throws IOException{
  // The detector needs a stream that supports mark/reset; plain streams (e.g. FileInputStream)
  // do not, so wrap them.
  InputStream markable = data.markSupported() ? data : new java.io.BufferedInputStream(data);
  CharsetDetector detector = new CharsetDetector();
  detector.setText(markable);
  // detectAll() is ordered best match first, so one detection pass provides both the
  // winner and the full candidate list.
  CharsetMatch[] matches = detector.detectAll();
  if (matches == null || matches.length == 0) {
    return null;
  }
  String encoding = matches[0].getName();
  System.out.println("The Content in " + encoding);
  System.out.println("All possibilities");
  for (CharsetMatch m : matches) {
    System.out.println("CharsetName:" + m.getName() + " Confidence:"
        + m.getConfidence());
  }
  return encoding;
}
}

 /**
  * Detects the charset of {@code input} using ICU's {@code CharsetDetector}.
  *
  * @param input raw bytes to inspect
  * @return the JVM {@link Charset} named by ICU's best match
  * @throws NullPointerException if ICU reports no match for the input
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = java.util.Objects.requireNonNull(
    detector.detect(), "ICU could not detect a charset for the input");
  // Locale.ROOT keeps the upper-casing independent of the default locale
  // (a Turkish default locale would otherwise turn 'i' into a dotted capital I).
  return Charset.forName(match.getName().toUpperCase(java.util.Locale.ROOT));
 }
}

 /**
  * Detects the charset of {@code input} using ICU's {@code CharsetDetector}.
  *
  * @param input raw bytes to inspect
  * @return the JVM {@link Charset} named by ICU's best match
  * @throws NullPointerException if ICU reports no match for the input
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = java.util.Objects.requireNonNull(
    detector.detect(), "ICU could not detect a charset for the input");
  // Locale.ROOT keeps the upper-casing independent of the default locale
  // (a Turkish default locale would otherwise turn 'i' into a dotted capital I).
  return Charset.forName(match.getName().toUpperCase(java.util.Locale.ROOT));
 }
}

 /**
  * Detects the charset of {@code input} using ICU's {@code CharsetDetector}.
  *
  * @param input raw bytes to inspect
  * @return the JVM {@link Charset} named by ICU's best match
  * @throws NullPointerException if ICU reports no match for the input
  */
 public Charset detectEncoding(byte[] input) {
  // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
  CharsetDetector detector = new CharsetDetector();
  detector.setText(input);
  CharsetMatch match = java.util.Objects.requireNonNull(
    detector.detect(), "ICU could not detect a charset for the input");
  // Locale.ROOT keeps the upper-casing independent of the default locale
  // (a Turkish default locale would otherwise turn 'i' into a dotted capital I).
  return Charset.forName(match.getName().toUpperCase(java.util.Locale.ROOT));
 }
}

/**
 * Collects every charset ICU considers a plausible match for the given stream.
 *
 * @param source stream to inspect; it is wrapped in a {@link BufferedInputStream} before detection
 * @return the candidate charsets available in this JVM, in the order ICU reported them
 * @throws CharsetDetectorException if reading the stream fails
 */
@Override
public Set<Charset> detect(InputStream source) throws CharsetDetectorException {
  // An insertion-ordered set keeps the order in which detectAll() returned the matches.
  Set<Charset> set = new java.util.LinkedHashSet<Charset>();
  com.ibm.icu.text.CharsetDetector charsetDetector = new com.ibm.icu.text.CharsetDetector();
  try {
    charsetDetector.setText(new BufferedInputStream(source));
    for (CharsetMatch match : charsetDetector.detectAll()) {
      String name = match.getName();
      // Charset availability is JVM-dependent; skip names this JVM cannot resolve so one
      // unsupported candidate does not discard all the others.
      if (Charset.isSupported(name)) {
        set.add(Charset.forName(name));
      }
    }
  } catch (IOException e) {
    throw new CharsetDetectorException(e.getMessage(), e);
  }
  return set;
}

/**
 * Determines the charset name of the data available from {@code in}.
 *
 * @param in stream to inspect; wrapped in a {@link BufferedInputStream} when it lacks mark support
 * @return the name of the detected charset
 * @throws IOException if the detector fails to read the stream
 * @throws ConversionException if the detector yields no match
 */
protected String detectEncoding(InputStream in) throws IOException, ConversionException {
  // detector.setText requires mark
  final InputStream markable = in.markSupported() ? in : new BufferedInputStream(in);
  final CharsetDetector icu = new CharsetDetector();
  icu.setText(markable);
  final CharsetMatch best = icu.detect();
  if (best != null) {
    return best.getName();
  }
  throw new ConversionException("Cannot detect source charset.");
}

// Encode a sample sentence, then print one table row per charset candidate ICU reports.
byte[] thisAppCanBreak = "this app can break".getBytes("ISO-8859-1");
CharsetDetector icuDetector = new CharsetDetector();
icuDetector.setText(thisAppCanBreak);
String rowFormat = "%10s %10s %8s%n";
System.out.format(rowFormat, "CONFIDENCE", "CHARSET", "LANGUAGE");
CharsetMatch[] candidates = icuDetector.detectAll();
for (int i = 0; i < candidates.length; i++) {
 CharsetMatch candidate = candidates[i];
 int confidence = candidate.getConfidence();
 String charsetName = candidate.getName();
 String language = candidate.getLanguage();
 System.out.format(rowFormat, confidence, charsetName, language);
}

/**
 * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection
 * Return the file contents as a String, followed by two line separators.
 *
 * @param f the file to read
 * @return the decoded file contents with two trailing line separators appended
 * @throws IOException if the file cannot be opened or read
 */
public static String fileAnyEncodingToString(File f) throws IOException {
 byte[] byteData;
 // Close the file handle deterministically once the bytes have been read.
 try (FileInputStream in = new FileInputStream(f)) {
  byteData = IOUtils.toByteArray(in);
 }
 CharsetDetector detector = new CharsetDetector();
 String unicodeData = detector.getString(byteData, null);
 if (unicodeData == null) {
  // No usable detection result: without this guard the concatenation below would yield
  // the literal text "null". Decoding as UTF-8 is a best-effort fallback.
  unicodeData = new String(byteData, java.nio.charset.StandardCharsets.UTF_8);
 }
 // Add two newlines at the end of the file, otherwise the subtitle parser library can get confused by EOF
 unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator");
 // Run detection again only to log the encoding/language when confidence is high enough.
 CharsetMatch match = detector.detect();
 if (match != null && match.getConfidence() > 60) {
  LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName());
  if (match.getLanguage() != null) {
   LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage());
  }
 }
 return unicodeData;
}

/**
 * Prints a table of every charset candidate ICU detects for the file named by the first argument.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to inspect
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
 if (args.length < 1) {
  // Report the missing argument instead of failing with ArrayIndexOutOfBoundsException.
  System.err.println("Usage: <program> <file>");
  return;
 }
 // try-with-resources closes the stream even when detection throws.
 try (InputStream file = new BufferedInputStream(new FileInputStream(args[0]))) {
  CharsetDetector detector = new CharsetDetector();
  detector.setText(file);
  String tableTemplate = "%10s %10s %8s%n";
  System.out.format(tableTemplate, "CONFIDENCE",
    "CHARSET", "LANGUAGE");
  for (CharsetMatch match : detector.detectAll()) {
   System.out.format(tableTemplate, match
     .getConfidence(), match.getName(), match
     .getLanguage());
  }
 }
}

 /**
  * Reports the charset ICU detects for the text currently on the system clipboard.
  * The clipboard string is converted with {@code String.getBytes()} before detection.
  *
  * @return the name of the detected charset
  * @throws UnsupportedCharsetException if the clipboard holds no string or nothing is detected
  * @throws UnsupportedFlavorException if the clipboard data cannot be fetched as a string
  * @throws IOException if the clipboard data is no longer available
  */
 public static String getClipboardCharset () throws UnsupportedCharsetException, UnsupportedFlavorException, IOException {
   final Clipboard systemClipboard = Toolkit.getDefaultToolkit().getSystemClipboard();
   final Transferable payload = systemClipboard.getContents(null);

   String text = null;
   if (payload != null && payload.isDataFlavorSupported(DataFlavor.stringFlavor)) {
     text = (String) payload.getTransferData(DataFlavor.stringFlavor);
   }

   // Without clipboard text there is nothing to analyse.
   if (payload == null || text == null) {
     throw new UnsupportedCharsetException("Unknown");
   }

   final CharsetDetector icu = new CharsetDetector();
   icu.setText(text.getBytes());
   final CharsetMatch best = icu.detect();
   if (best == null) {
     throw new UnsupportedCharsetException("Unknown");
   }
   return best.getName();
 }

/**
 * Suggests a charset name for {@code bytes} using ICU and logs the match with its confidence.
 *
 * @param bytes raw bytes to inspect
 * @return the name of the best match, or {@code null} when ICU reports no match
 */
protected String suggestEncoding(final byte[] bytes) {
  final CharsetDetector cd = new CharsetDetector();
  cd.setText(bytes);
  final CharsetMatch charsetMatch = cd.detect();
  if (charsetMatch == null) {
    // detect() may yield no match; avoid dereferencing null below.
    logger.info("CharsetMatch: no match");
    return null;
  }
  final String charSet = charsetMatch.getName();
  final int confidence = charsetMatch.getConfidence();
  logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
  return charSet;
}

/**
 * Guesses the character set of a slice of the given buffer using ICU.
 *
 * @param buffer the bytes to inspect
 * @param offset index of the first byte to consider; must be {@code >= 0}
 * @param length number of bytes to consider; must be {@code > 0}
 * @return the detected charset, or {@code null} when detection or charset lookup throws
 */
public static Charset detect(@NonNull byte[] buffer, int offset, int length) {
 Preconditions.checkArgument(length > 0);
 Preconditions.checkArgument(offset >= 0);
 final com.ibm.icu.text.CharsetDetector icu = new com.ibm.icu.text.CharsetDetector();
 try {
   final ByteArrayInputStream slice = new ByteArrayInputStream(buffer, offset, length);
   icu.setText(slice);
   final String detectedName = icu.detect().getName();
   return Charset.forName(detectedName);
 } catch (Exception e) {
   // Any failure (including a missing match) is reported to the caller as "unknown".
   return null;
 }
}

/**
 * Detects the charset of {@code bytes} with ICU, logs it and passes it to
 * {@code setSelectedItem}.
 *
 * @param bytes raw bytes to inspect
 * @return the name of the best match, or {@code null} when ICU reports no match
 *         (in which case the selection is left untouched)
 */
public String autoDetectEncoding(final byte[] bytes) {
  final CharsetDetector cd = new CharsetDetector();
  cd.setText(bytes);
  final CharsetMatch charsetMatch = cd.detect();
  if (charsetMatch == null) {
    // detect() may yield no match; avoid dereferencing null below.
    logger.info("CharsetMatch: no match");
    return null;
  }
  final String charSet = charsetMatch.getName();
  final int confidence = charsetMatch.getConfidence();
  logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
  setSelectedItem(charSet);
  return charSet;
}

  /**
   * Reads up to {@code fileContent.length} bytes from {@code fin} into {@code fileContent}
   * and guesses their charset with ICU.
   *
   * @param fin stream to read the sample from
   * @param fileContent buffer that receives the sample; its length bounds the sample size
   * @return the detected charset name when confidence exceeds 50, otherwise "ISO-8859-1"
   * @throws IOException if reading from the stream fails
   */
  public String detect(InputStream fin, byte[] fileContent) throws IOException
  {
    String charset = "ISO-8859-1";

    // A single read() may deliver fewer bytes than requested, so keep reading until the
    // buffer is full or the stream ends.
    int filled = 0;
    while (filled < fileContent.length) {
      int count = fin.read(fileContent, filled, fileContent.length - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }

    // Give the detector only the bytes actually read, not the unfilled tail of the buffer.
    byte[] data = (filled == fileContent.length)
        ? fileContent
        : java.util.Arrays.copyOf(fileContent, filled);

    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch cm = detector.detect();

    if (cm != null) {
      int confidence = cm.getConfidence();
      // Only trust the guess when ICU is reasonably confident; otherwise keep the default.
      if (confidence > 50) {
        charset = cm.getName();
      }
    }
    return charset;
  }
}

/**
 * Records the MIME type, then decodes the stream as text using ICU charset detection and
 * stores the content, the detected charset (when any) and the detected language.
 * When no charset is detected, the stream is reset and decoded with the default charset.
 */
@Override
public void parseContent(final MultivaluedMap<String, String> parameters, final InputStream inputStream,
    String extension, final String mimeType, final ParserResultBuilder resultBuilder) throws IOException {
  resultBuilder.metas().set(MIME_TYPE, findMimeType(extension, mimeType, this::findMimeTypeUsingDefault));
  // Trying to detect the CHARSET of the stream
  final CharsetDetector charsetDetector = new CharsetDetector();
  try (BufferedInputStream buffered = new BufferedInputStream(inputStream)) {
    charsetDetector.setText(buffered);
    final CharsetMatch bestMatch = charsetDetector.detect();
    final ParserFieldsBuilder document = resultBuilder.newDocument();
    final String text;
    if (bestMatch == null) {
      // No detection result: rewind and decode with the platform default charset.
      buffered.reset();
      text = IOUtils.toString(buffered, Charset.defaultCharset());
    } else {
      text = bestMatch.getString();
      document.add(CHARSET_DETECTION, bestMatch.getName());
    }
    document.add(CONTENT, text);
    document.add(LANG_DETECTION, languageDetection(document, CONTENT, 10000));
  }
}

/**
 * Sniffs the encoding of this stream with ICU.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when one exists) if it
 *         is an ASCII superset other than windows-1252; {@code null} otherwise or when
 *         anything in the detection throws
 */
public Encoding sniff() throws IOException {
  try {
    CharsetDetector icu = new CharsetDetector();
    icu.setText(this);
    String detectedName = icu.detect().getName();
    Encoding candidate = Encoding.forName(detectedName);
    Encoding htmlEquivalent = candidate.getActualHtmlEncoding();
    Encoding resolved = (htmlEquivalent != null) ? htmlEquivalent : candidate;
    boolean usable = resolved != Encoding.WINDOWS1252 && resolved.isAsciiSuperset();
    return usable ? resolved : null;
  } catch (Exception e) {
    return null;
  }
}

/**
 * Sniffs the encoding of this stream with ICU.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when one exists) if it
 *         is an ASCII superset other than windows-1252; {@code null} otherwise or when
 *         anything in the detection throws
 */
public Encoding sniff() throws IOException {
  try {
    CharsetDetector icu = new CharsetDetector();
    icu.setText(this);
    String detectedName = icu.detect().getName();
    Encoding candidate = Encoding.forName(detectedName);
    Encoding htmlEquivalent = candidate.getActualHtmlEncoding();
    Encoding resolved = (htmlEquivalent != null) ? htmlEquivalent : candidate;
    boolean usable = resolved != Encoding.WINDOWS1252 && resolved.isAsciiSuperset();
    return usable ? resolved : null;
  } catch (Exception e) {
    return null;
  }
}

/**
 * Sniffs the encoding of this stream with ICU.
 *
 * @return the detected encoding (replaced by its actual HTML encoding when one exists) if it
 *         is an ASCII superset other than windows-1252; {@code null} otherwise or when
 *         anything in the detection throws
 */
public Encoding sniff() throws IOException {
  try {
    CharsetDetector icu = new CharsetDetector();
    icu.setText(this);
    String detectedName = icu.detect().getName();
    Encoding candidate = Encoding.forName(detectedName);
    Encoding htmlEquivalent = candidate.getActualHtmlEncoding();
    Encoding resolved = (htmlEquivalent != null) ? htmlEquivalent : candidate;
    boolean usable = resolved != Encoding.WINDOWS1252 && resolved.isAsciiSuperset();
    return usable ? resolved : null;
  } catch (Exception e) {
    return null;
  }
}

Javadoc

Get the name of the detected charset. The name will be one that can be used with other APIs on the platform that accept charset names. It is the "Canonical name" as defined by the class java.nio.charset.Charset; for charsets that are registered with the IANA charset registry, this is the MIME-preferred registered name.

Popular methods of CharsetMatch

getConfidence
Get an indication of the confidence in the charset detected. Confidence values range from 0-100, wit
getString
Create a Java String from Unicode character data corresponding to the original byte data supplied to
getLanguage
Get the ISO code for the language of the detected charset.
getReader
Create a java.io.Reader for reading the Unicode character data corresponding to the original byte da
<init>

Popular in Java

Start an intent from android
notifyDataSetChanged (ArrayAdapter)
getSupportFragmentManager (FragmentActivity)
setRequestProperty (URLConnection)
NoSuchElementException (java.util)
Thrown when trying to retrieve an element past the end of an Enumeration or Iterator.
PriorityQueue (java.util)
A PriorityQueue holds elements on a priority heap, which orders the elements according to their natu
SortedMap (java.util)
A map that has its keys ordered. The sorting is according to either the natural ordering of its keys
BufferedImage (java.awt.image)
The BufferedImage subclass describes an java.awt.Image with an accessible buffer of image data. All
ImageIO (javax.imageio)
JOptionPane (javax.swing)
CodeWhisperer alternatives

How to use the getName method in com.ibm.icu.text.CharsetMatch

Best Java code snippets using com.ibm.icu.text.CharsetMatch.getName (Showing top 20 results out of 315)

How to use
getName
method
in
com.ibm.icu.text.CharsetMatch