groovy.util.CharsetToolkit java code examples

/**
 * Opens a buffered reader on the given file by wrapping it in a
 * <code>CharsetToolkit</code> and returning that toolkit's reader.
 *
 * @param file a File
 * @return a BufferedReader
 * @throws IOException if an IOException occurs.
 * @since 1.0
 */
public static BufferedReader newReader(File file) throws IOException {
  return new CharsetToolkit(file).getReader();
}

/**
 * Returns a <code>BufferedReader</code> (concretely a <code>LineNumberReader</code>) over the
 * <code>File</code> held by this toolkit, decoding it with the charset returned by
 * <code>getCharset()</code>. When a UTF-8, UTF-16LE or UTF-16BE byte order marker is reported,
 * one character is read and discarded before the reader is returned.
 *
 * @return a <code>BufferedReader</code>
 * @throws FileNotFoundException if the file is not found.
 */
public BufferedReader getReader() throws FileNotFoundException {
  FileInputStream stream = new FileInputStream(file);
  InputStreamReader decoder = new InputStreamReader(stream, getCharset());
  LineNumberReader reader = new LineNumberReader(decoder);

  boolean startsWithBom = hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom();
  if (!startsWithBom) {
    return reader;
  }

  try {
    // skip one character -- presumably the decoded marker itself
    reader.read();
  } catch (IOException ignored) {
    // should never happen, as a file with no content
    // but with a BOM has at least one char
  }
  return reader;
}

/**
 * Sets the default <code>Charset</code> of this toolkit. A <code>null</code> argument
 * selects the value of <code>getDefaultSystemCharset()</code> instead.
 *
 * @param defaultCharset the default <code>Charset</code> to store, or <code>null</code>
 * to fall back to the system default charset.
 */
public void setDefaultCharset(Charset defaultCharset) {
  this.defaultCharset = (defaultCharset == null) ? getDefaultSystemCharset() : defaultCharset;
}

if (hasUTF8Bom())
  return Charset.forName("UTF-8");
if (hasUTF16LEBom())
  return Charset.forName("UTF-16LE");
if (hasUTF16BEBom())
  return Charset.forName("UTF-16BE");
    if (isTwoBytesSequence(b0)) {
      if (!isContinuationChar(b1))
        validU8Char = false;
      else
    else if (isThreeBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2)))
        validU8Char = false;
      else
    else if (isFourBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
        validU8Char = false;
      else
    else if (isFiveBytesSequence(b0)) {
      if (!(isContinuationChar(b1)
        && isContinuationChar(b2)

  /**
   * Runs the "testFileEncoding" target, locates the generated DocumentedClass.html
   * in the test files package directory and checks that its detected charset is UTF-16LE.
   */
  public void testFileEncoding() throws Exception {
    executeTarget("testFileEncoding");

    final File testfilesPackageDir = new File(tmpDir, "org/codehaus/groovy/tools/groovydoc/testfiles");
    System.err.println("testfilesPackageDir = " + testfilesPackageDir);

    // only the generated page for DocumentedClass is of interest
    final FilenameFilter documentedClassOnly = new FilenameFilter() {
      public boolean accept(File dir, String fileName) {
        return fileName.equals("DocumentedClass.html");
      }
    };
    final String[] matches = testfilesPackageDir.list(documentedClassOnly);

    final File htmlDoc = new File(testfilesPackageDir, matches[0]);
    final CharsetToolkit toolkit = new CharsetToolkit(htmlDoc);

    final Charset expected = Charset.forName("UTF-16LE");
    assertEquals("The generated groovydoc must be in 'UTF-16LE' file encoding.'", expected, toolkit.getCharset());
  }
}

/**
 * Returns the charset held by this toolkit, computing it with
 * <code>guessEncoding()</code> the first time it is requested and storing the result.
 *
 * @return the stored or freshly guessed <code>Charset</code>
 */
public Charset getCharset() {
  if (this.charset != null) {
    return this.charset;
  }
  this.charset = guessEncoding();
  return this.charset;
}

updateEncodingsScores(encodingsScores, new CharsetToolkit(data).guessEncoding().displayName());

if (hasUTF8Bom())
  return Charset.forName("UTF-8");
if (hasUTF16LEBom())
  return Charset.forName("UTF-16LE");
if (hasUTF16BEBom())
  return Charset.forName("UTF-16BE");
    if (isTwoBytesSequence(b0)) {
      if (!isContinuationChar(b1))
        validU8Char = false;
      else
    else if (isThreeBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2)))
        validU8Char = false;
      else
    else if (isFourBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
        validU8Char = false;
      else
    else if (isFiveBytesSequence(b0)) {
      if (!(isContinuationChar(b1)
        && isContinuationChar(b2)

/**
 * Returns the charset held by this toolkit, computing it with
 * <code>guessEncoding()</code> the first time it is requested and storing the result.
 *
 * @return the stored or freshly guessed <code>Charset</code>
 */
public Charset getCharset() {
  if (this.charset != null) {
    return this.charset;
  }
  this.charset = guessEncoding();
  return this.charset;
}

if (hasUTF8Bom())
  return Charset.forName("UTF-8");
if (hasUTF16LEBom())
  return Charset.forName("UTF-16LE");
if (hasUTF16BEBom())
  return Charset.forName("UTF-16BE");
    if (isTwoBytesSequence(b0)) {
      if (!isContinuationChar(b1))
        validU8Char = false;
      else
    else if (isThreeBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2)))
        validU8Char = false;
      else
    else if (isFourBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
        validU8Char = false;
      else
    else if (isFiveBytesSequence(b0)) {
      if (!(isContinuationChar(b1)
        && isContinuationChar(b2)

/**
 * Returns a <code>BufferedReader</code> (concretely a <code>LineNumberReader</code>) over the
 * <code>File</code> held by this toolkit, decoding it with the charset returned by
 * <code>getCharset()</code>. When a UTF-8, UTF-16LE or UTF-16BE byte order marker is reported,
 * one character is read and discarded before the reader is returned.
 *
 * @return a <code>BufferedReader</code>
 * @throws FileNotFoundException if the file is not found.
 */
public BufferedReader getReader() throws FileNotFoundException {
  FileInputStream stream = new FileInputStream(file);
  InputStreamReader decoder = new InputStreamReader(stream, getCharset());
  LineNumberReader reader = new LineNumberReader(decoder);

  boolean startsWithBom = hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom();
  if (!startsWithBom) {
    return reader;
  }

  try {
    // skip one character -- presumably the decoded marker itself
    reader.read();
  } catch (IOException ignored) {
    // should never happen, as a file with no content
    // but with a BOM has at least one char
  }
  return reader;
}

/**
 * Opens a buffered reader on the given file by wrapping it in a
 * <code>CharsetToolkit</code> and returning that toolkit's reader.
 *
 * @param file a File
 * @return a BufferedReader
 * @throws IOException if an IOException occurs.
 * @since 1.0
 */
public static BufferedReader newReader(File file) throws IOException {
  return new CharsetToolkit(file).getReader();
}

/**
 * Creates a code source for the given file by delegating to the two-argument
 * constructor, passing the name of the charset returned by
 * <code>CharsetToolkit.getDefaultSystemCharset()</code> as the second argument.
 *
 * @param infile the file to create a GroovyCodeSource for.
 * @throws IOException if an issue arises opening and reading the file.
 */
public GroovyCodeSource(final File infile) throws IOException {
  this(infile, CharsetToolkit.getDefaultSystemCharset().name());
}

/**
 * Returns the charset held by this toolkit, computing it with
 * <code>guessEncoding()</code> the first time it is requested and storing the result.
 *
 * @return the stored or freshly guessed <code>Charset</code>
 */
public Charset getCharset() {
  if (this.charset != null) {
    return this.charset;
  }
  this.charset = guessEncoding();
  return this.charset;
}

if (hasUTF8Bom())
  return Charset.forName("UTF-8");
if (hasUTF16LEBom())
  return Charset.forName("UTF-16LE");
if (hasUTF16BEBom())
  return Charset.forName("UTF-16BE");
    if (isTwoBytesSequence(b0)) {
      if (!isContinuationChar(b1))
        validU8Char = false;
      else
    else if (isThreeBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2)))
        validU8Char = false;
      else
    else if (isFourBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
        validU8Char = false;
      else
    else if (isFiveBytesSequence(b0)) {
      if (!(isContinuationChar(b1)
        && isContinuationChar(b2)

/**
 * Returns a <code>BufferedReader</code> (concretely a <code>LineNumberReader</code>) over the
 * <code>File</code> held by this toolkit, decoding it with the charset returned by
 * <code>getCharset()</code>. When a UTF-8, UTF-16LE or UTF-16BE byte order marker is reported,
 * one character is read and discarded before the reader is returned.
 *
 * @return a <code>BufferedReader</code>
 * @throws FileNotFoundException if the file is not found.
 */
public BufferedReader getReader() throws FileNotFoundException {
  FileInputStream stream = new FileInputStream(file);
  InputStreamReader decoder = new InputStreamReader(stream, getCharset());
  LineNumberReader reader = new LineNumberReader(decoder);

  boolean startsWithBom = hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom();
  if (!startsWithBom) {
    return reader;
  }

  try {
    // skip one character -- presumably the decoded marker itself
    reader.read();
  } catch (IOException ignored) {
    // should never happen, as a file with no content
    // but with a BOM has at least one char
  }
  return reader;
}

/**
 * Opens a buffered reader on the given file by wrapping it in a
 * <code>CharsetToolkit</code> and returning that toolkit's reader.
 *
 * @param file a File
 * @return a BufferedReader
 * @throws IOException if an IOException occurs.
 */
public static BufferedReader newReader(File file) throws IOException {
  return new CharsetToolkit(file).getReader();
}

/**
 * Reads the content of the given URL and returns it as a String, delegating to the
 * two-argument <code>getText</code> with the name of the charset returned by
 * <code>CharsetToolkit.getDefaultSystemCharset()</code>.
 *
 * @param url URL to read content from
 * @return the text from that URL
 * @throws IOException if an IOException occurs.
 * @since 1.0
 */
public static String getText(URL url) throws IOException {
  String systemCharsetName = CharsetToolkit.getDefaultSystemCharset().name();
  return getText(url, systemCharsetName);
}

/**
 * Returns the charset held by this toolkit, computing it with
 * <code>guessEncoding()</code> the first time it is requested and storing the result.
 *
 * @return the stored or freshly guessed <code>Charset</code>
 */
public Charset getCharset() {
  if (this.charset != null) {
    return this.charset;
  }
  this.charset = guessEncoding();
  return this.charset;
}

if (hasUTF8Bom())
  return Charset.forName("UTF-8");
if (hasUTF16LEBom())
  return Charset.forName("UTF-16LE");
if (hasUTF16BEBom())
  return Charset.forName("UTF-16BE");
    if (isTwoBytesSequence(b0)) {
      if (!isContinuationChar(b1))
        validU8Char = false;
      else
    else if (isThreeBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2)))
        validU8Char = false;
      else
    else if (isFourBytesSequence(b0)) {
      if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
        validU8Char = false;
      else
    else if (isFiveBytesSequence(b0)) {
      if (!(isContinuationChar(b1)
        && isContinuationChar(b2)

Javadoc

Utility class to guess the encoding of a given text file.

Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer is wide enough, the charset should also be discovered.

A byte buffer of 4KB is usually sufficient to be able to guess the encoding.

Usage:

 
// guess the encoding 
Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096); 
// create a reader with the correct charset 
CharsetToolkit toolkit = new CharsetToolkit(file); 
BufferedReader reader = toolkit.getReader(); 
// read the file content 
String line; 
while ((line = reader.readLine()) != null) 
{ 
System.out.println(line); 
}

Most used methods

<init>
Constructor of the CharsetToolkit utility class.
getCharset
getDefaultSystemCharset
Retrieve the default charset of the system.
getReader
Gets a BufferedReader (indeed a LineNumberReader) from the File specified in the constructor of Char
guessEncoding
Guess the encoding of the provided buffer. If Byte Order Markers are encountered at the beginning o
hasUTF16BEBom
Has a Byte Order Marker for UTF-16 Big Endian (utf-16 and ucs-2).
hasUTF16LEBom
Has a Byte Order Marker for UTF-16 Little Endian (ucs-2le, ucs-4le, and ucs-16le).
hasUTF8Bom
Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
isContinuationChar
If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character;
isFiveBytesSequence
If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
isFourBytesSequence
If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
isSixBytesSequence
If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.

Popular in Java

Parsing JSON documents to java classes using gson
setScale (BigDecimal)
getExternalFilesDir (Context)
getContentResolver (Context)
URLEncoder (java.net)
This class is used to encode a string using the format required by application/x-www-form-urlencoded
BitSet (java.util)
The BitSet class implements a bit array [http://en.wikipedia.org/wiki/Bit_array]. Each element is eit
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
SAXParseException (org.xml.sax)
Encapsulate an XML parse error or warning. This module, both source code and documentation, is in t
GridLayout (java.awt)
The GridLayout class is a layout manager that lays out a container's components in a rectangular gri
Reference (javax.naming)
Top 12 Jupyter Notebook extensions

How to use CharsetToolkit in groovy.util

Best Java code snippets using groovy.util.CharsetToolkit (Showing top 20 results out of 315)

How to use
CharsetToolkit
in
groovy.util