import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.logging.Logger;
/**
*
* {@code SmartEncodingInputStream} extends {@code InputStream} with special
* constructors and a special method for dealing with text files encoded in different charsets.
*
*
* It wraps a normal {@code InputStream}, whatever it may be ({@code FileInputStream}, ...). It reads a
* buffer of a defined length. Then, with this byte buffer, it uses the class
* {@code CharsetToolkit} to parse the buffer and guess what the encoding is. All these steps
* are done within the constructor. From then on, you can call the method {@code getReader()} to retrieve a
* {@code Reader} created with the right charset, as guessed while parsing the first bytes of the file. This
* {@code Reader} reads from the {@code SmartEncodingInputStream}: it first consumes
* the internal buffer, then, once the end of the buffer is reached, the underlying InputStream is read with the
* default read method.
*
*
* Usage:
*
*
*
* FileInputStream fis = new FileInputStream("utf-8.txt");
* SmartEncodingInputStream smartIS = new SmartEncodingInputStream(fis);
* Reader reader = smartIS.getReader();
* BufferedReader bufReader = new BufferedReader(reader);
*
* String line;
* while ((line = bufReader.readLine()) != null) {
* System.out.println(line);
* }
*
*
* Date: 23 juil. 2002
*
* @author Guillaume Laforge
*/
public class SmartEncodingInputStream
        extends InputStream {

    /** Detection buffer length of 2 KB. */
    public static final int BUFFER_LENGTH_2KB = 2048;

    /** Detection buffer length of 4 KB. */
    public static final int BUFFER_LENGTH_4KB = 4096;

    /** Detection buffer length of 8 KB (the default used by {@link #SmartEncodingInputStream(InputStream)}). */
    public static final int BUFFER_LENGTH_8KB = 8192;

    /** The wrapped stream; read once in the constructor and again after the prefetched bytes are consumed. */
    private final InputStream is;

    /** Bytes prefetched from {@link #is}; the whole array is handed to the charset detection. */
    private final byte[] buffer;

    /** Number of valid bytes in {@link #buffer} (0 when the wrapped stream was already at its end). */
    private final int bufferLength;

    /** Index of the next byte of {@link #buffer} to hand out. */
    private int counter;

    /** Charset guessed from the prefetched bytes. */
    private final Charset charset;

    /**
     * Creates the stream, prefetches up to {@code bufferLength} bytes from {@code is} with a single
     * {@code read} call, and guesses the charset from them. The larger the buffer, the more reliable
     * the guess.
     *
     * @param is
     *            the {@code InputStream} for which a {@code Reader} with the guessed encoding is wanted.
     * @param bufferLength
     *            the length of the buffer that is used to guess the encoding.
     * @param defaultCharset
     *            the default {@code Charset} to use when an 8-bit {@code Charset} is guessed. May be
     *            {@code null}, in which case {@code CharsetToolkit} falls back to the default system charset.
     * @param enforce8Bit
     *            if {@code true}, the default {@code Charset} is used instead of US-ASCII when only
     *            7-bit bytes are seen.
     * @throws IOException
     *             if reading from {@code is} fails.
     * @throws NullPointerException
     *             if {@code is} is {@code null}.
     */
    public SmartEncodingInputStream(final InputStream is, final int bufferLength, final Charset defaultCharset,
            final boolean enforce8Bit) throws IOException {
        if (is == null) {
            throw new NullPointerException("is");
        }
        this.is = is;
        this.buffer = new byte[bufferLength];
        this.counter = 0;

        // One read only, exactly as before: looping until the buffer is full could block forever on
        // interactive streams. read() reports -1 on an exhausted stream; store that as "0 bytes buffered".
        final int prefetched = is.read(buffer);
        this.bufferLength = Math.max(prefetched, 0);

        // The full (zero-padded) array is analyzed; zero bytes are plain 7-bit values for the detector.
        final CharsetToolkit charsetToolkit = new CharsetToolkit(buffer, defaultCharset);
        charsetToolkit.setEnforce8Bit(enforce8Bit);
        this.charset = charsetToolkit.guessEncoding();
    }

    /**
     * Same as the four-argument constructor with {@code enforce8Bit} set to {@code true}.
     *
     * @param is
     *            the {@code InputStream} for which a {@code Reader} with the guessed encoding is wanted.
     * @param bufferLength
     *            the length of the buffer that is used to guess the encoding.
     * @param defaultCharset
     *            the default {@code Charset} to use when an 8-bit {@code Charset} is guessed. May be
     *            {@code null}, in which case {@code CharsetToolkit} falls back to the default system charset.
     * @throws IOException
     *             if reading from {@code is} fails.
     */
    public SmartEncodingInputStream(final InputStream is, final int bufferLength, final Charset defaultCharset)
            throws IOException {
        this(is, bufferLength, defaultCharset, true);
    }

    /**
     * Same as the four-argument constructor with a {@code null} default charset (the default system
     * charset is then used when an 8-bit encoding is guessed) and {@code enforce8Bit} set to {@code true}.
     *
     * @param is
     *            the {@code InputStream} for which a {@code Reader} with the guessed encoding is wanted.
     * @param bufferLength
     *            the length of the buffer that is used to guess the encoding.
     * @throws IOException
     *             if reading from {@code is} fails.
     */
    public SmartEncodingInputStream(final InputStream is, final int bufferLength) throws IOException {
        this(is, bufferLength, null, true);
    }

    /**
     * Same as the four-argument constructor with a buffer of {@link #BUFFER_LENGTH_8KB} bytes, a
     * {@code null} default charset and {@code enforce8Bit} set to {@code true}.
     *
     * @param is
     *            the {@code InputStream} for which a {@code Reader} with the guessed encoding is wanted.
     * @throws IOException
     *             if reading from {@code is} fails.
     */
    public SmartEncodingInputStream(final InputStream is) throws IOException {
        this(is, SmartEncodingInputStream.BUFFER_LENGTH_8KB, null, true);
    }

    /**
     * Reads the next byte. The bytes prefetched by the constructor are returned first; once they are
     * used up, the call is forwarded to the underlying {@code InputStream}.
     *
     * @return the next byte as a value in the range 0-255, or -1 if the end of the stream has been reached.
     * @throws IOException
     *             if the underlying stream fails.
     */
    @Override
    public int read()
            throws IOException {
        if (counter < bufferLength) {
            // Mask to an unsigned value: a raw (signed) byte would turn 0x80-0xFF into negative
            // numbers, and 0xFF into -1, which callers interpret as end of stream.
            return buffer[counter++] & 0xFF;
        }
        return is.read();
    }

    /**
     * Reads up to {@code len} bytes into {@code b}. Prefetched bytes are copied out first; a single call
     * never mixes prefetched bytes with bytes from the underlying stream, so it may return fewer bytes
     * than requested.
     *
     * @param b
     *            the destination array.
     * @param off
     *            the start offset in {@code b}.
     * @param len
     *            the maximum number of bytes to read.
     * @return the number of bytes read, 0 if {@code len} is 0, or -1 if the end of the stream has been reached.
     * @throws IOException
     *             if the underlying stream fails.
     * @throws NullPointerException
     *             if {@code b} is {@code null}.
     * @throws IndexOutOfBoundsException
     *             if {@code off} or {@code len} is negative, or {@code len} exceeds {@code b.length - off}.
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        if (b == null) {
            throw new NullPointerException("b");
        }
        if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException("off=" + off + ", len=" + len + ", b.length=" + b.length);
        }
        if (len == 0) {
            return 0;
        }
        final int buffered = bufferLength - counter;
        if (buffered > 0) {
            final int copied = Math.min(buffered, len);
            System.arraycopy(buffer, counter, b, off, copied);
            counter += copied;
            return copied;
        }
        return is.read(b, off, len);
    }

    /**
     * Closes the underlying {@code InputStream}, so that closing this stream (or the {@code Reader}
     * returned by {@link #getReader()}) releases the wrapped resource.
     *
     * @throws IOException
     *             if closing the underlying stream fails.
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Gets a {@code Reader} over this stream that decodes with the charset guessed in the constructor.
     *
     * @return a new {@code InputStreamReader} reading from this stream with the guessed charset.
     */
    public Reader getReader() {
        return new InputStreamReader(this, this.charset);
    }

    /**
     * Retrieves the {@code Charset} guessed from the beginning of the underlying {@code InputStream}.
     *
     * @return the guessed {@code Charset}.
     */
    public Charset getEncoding() {
        return this.charset;
    }
}
/**
*
* Utility class to guess the encoding of a given byte array. The guess is
* unfortunately not 100% sure. Especially for 8-bit charsets. It's not possible
* to know which 8-bit charset is used. Except through statistical analysis. We
* will then infer that the charset encountered is the same as the default
* standard charset.
*
*
* On the other hand, Unicode files encoded in UTF-16 (little or big endian) or
* UTF-8 files with a Byte Order Mark are easy to detect. For UTF-8 files with
* no BOM, if the buffer is wide enough, it's easy to guess.
*
*
* Tested against a complicated UTF-8 file, Sun's implementation does not render
* bad UTF-8 constructs as expected by the specification. But with a buffer wide
* enough, the method guessEncoding() did behave correctly and recognized the
* UTF-8 charset.
*
*
* A byte buffer of 4KB or 8KB is sufficient to be able to guess the encoding.
*
*
* Usage:
*
*
*
* // guess the encoding
* Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
*
* // create a reader with the charset we've just discovered
* FileInputStream fis = new FileInputStream(file);
* InputStreamReader isr = new InputStreamReader(fis, guessedCharset);
* BufferedReader br = new BufferedReader(isr);
*
* // read the file content
* String line;
* while ((line = br.readLine()) != null) {
* System.out.println(line);
* }
*
*
* Date: 18 juil. 2002
*
*
* @author Guillaume LAFORGE
*/
class CharsetToolkit {

    // These four charsets are required to be available on every Java platform.
    private static final Charset UTF_8 = Charset.forName("UTF-8");
    private static final Charset UTF_16LE = Charset.forName("UTF-16LE");
    private static final Charset UTF_16BE = Charset.forName("UTF-16BE");
    private static final Charset US_ASCII = Charset.forName("US-ASCII");

    /** The bytes to analyze; must not be {@code null} when {@link #guessEncoding()} is called. */
    private final byte[] buffer;

    /** Charset returned when the content looks like an 8-bit encoding (never {@code null}). */
    private Charset defaultCharset;

    /** When {@code true}, pure 7-bit content yields {@link #defaultCharset} instead of US-ASCII. */
    private boolean enforce8Bit = false;

    /**
     * Creates a toolkit that uses the default system charset as its 8-bit fallback.
     *
     * @param buffer
     *            the byte buffer of which we want to know the encoding.
     */
    public CharsetToolkit(final byte[] buffer) {
        this.buffer = buffer;
        this.defaultCharset = getDefaultSystemCharset();
    }

    /**
     * Creates a toolkit with an explicit 8-bit fallback charset.
     *
     * @param buffer
     *            the byte buffer of which we want to know the encoding.
     * @param defaultCharset
     *            the default Charset to use in case an 8-bit charset is
     *            recognized; {@code null} selects the default system charset.
     */
    public CharsetToolkit(final byte[] buffer, final Charset defaultCharset) {
        this.buffer = buffer;
        // Assigned directly rather than through the overridable setter.
        this.defaultCharset = defaultCharset != null ? defaultCharset : getDefaultSystemCharset();
    }

    /**
     * Defines the default {@code Charset} used in case the buffer represents
     * an 8-bit {@code Charset}.
     *
     * @param defaultCharset
     *            the default {@code Charset} to be returned by
     *            {@code guessEncoding()} if an 8-bit {@code Charset} is
     *            encountered; {@code null} selects the default system charset.
     */
    public void setDefaultCharset(final Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather
     * than US-ASCII. It might be a file without any special character in the
     * range 128-255, but that may be or become a file encoded with the default
     * charset rather than US-ASCII.
     *
     * @param enforce
     *            {@code true} to return the default charset instead of US-ASCII.
     */
    public void setEnforce8Bit(final boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII
     * encoding.
     *
     * @return {@code true} if the default charset is returned instead of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default {@code Charset}.
     *
     * @return the charset returned by {@link #guessEncoding()} for 8-bit content.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * Guesses the encoding of the provided buffer.
     * <p>
     * If a Byte Order Mark is found at the beginning of the buffer, the charset
     * implied by this BOM is returned immediately. Otherwise the whole buffer
     * is scanned: if every byte with its high-order bit set is part of a
     * well-formed UTF-8 multi-byte sequence, UTF-8 is returned; if a malformed
     * sequence is found, the default charset is returned; if no byte has its
     * high-order bit set, US-ASCII is returned (or the default charset when the
     * enforce8Bit flag is set).
     * <p>
     * A multi-byte sequence cut off by the end of the buffer is accepted as
     * long as the bytes that are present are well-formed, because the buffer is
     * usually only the beginning of a longer stream.
     * <p>
     * Only the lead/continuation byte structure below is checked (including the
     * obsolete five- and six-byte forms); overlong encodings are not rejected.
     *
     * <pre>
     * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F   0xxxxxxx
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     *
     * With UTF-8, 0xFE and 0xFF never appear.
     *
     * @return the Charset recognized.
     */
    public Charset guessEncoding() {
        // A BOM settles the question immediately.
        if (hasUTF8Bom(buffer)) {
            return UTF_8;
        }
        if (hasUTF16LEBom(buffer)) {
            return UTF_16LE;
        }
        if (hasUTF16BEBom(buffer)) {
            return UTF_16BE;
        }

        // Set as soon as a byte >= 0x80 is met: the content is then UTF-8 or an 8-bit encoding.
        boolean highOrderBit = false;
        final int length = buffer.length;
        int i = 0;
        while (i < length) {
            final byte lead = buffer[i++];
            if (lead >= 0) {
                continue; // plain 7-bit byte
            }
            highOrderBit = true;

            final int expected = continuationBytesExpected(lead);
            if (expected < 0) {
                // Stray continuation byte, 0xFE or 0xFF: not UTF-8, so assume the 8-bit default.
                return this.defaultCharset;
            }
            // Check only the continuation bytes that are actually in the buffer (see Javadoc).
            final int available = Math.min(expected, length - i);
            for (int k = 0; k < available; k++) {
                if (!isContinuationChar(buffer[i + k])) {
                    return this.defaultCharset;
                }
            }
            i += available;
        }

        // No high-order bit at all: US-ASCII (it might have been UTF-7, but this encoding is
        // usually internally used only by mail systems).
        if (!highOrderBit) {
            return this.enforce8Bit ? this.defaultCharset : US_ASCII;
        }
        // High-order bytes were seen and all of them formed valid UTF-8 sequences.
        return UTF_8;
    }

    /**
     * Guesses the encoding of the beginning of a file, using the default
     * system charset as the 8-bit fallback.
     *
     * @param f
     *            the file to inspect.
     * @param bufferLength
     *            the maximum number of leading bytes to analyze.
     * @return the Charset recognized.
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading.
     * @throws IOException
     *             if reading the file fails.
     */
    public static Charset guessEncoding(final File f, final int bufferLength)
            throws FileNotFoundException, IOException {
        return guessEncoding(f, bufferLength, null);
    }

    /**
     * Guesses the encoding of the beginning of a file.
     *
     * @param f
     *            the file to inspect.
     * @param bufferLength
     *            the maximum number of leading bytes to analyze.
     * @param defaultCharset
     *            the 8-bit fallback charset; {@code null} selects the default
     *            system charset.
     * @return the Charset recognized.
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading.
     * @throws IOException
     *             if reading the file fails.
     */
    public static Charset guessEncoding(final File f, final int bufferLength,
            final Charset defaultCharset) throws FileNotFoundException,
            IOException {
        final CharsetToolkit toolkit = new CharsetToolkit(readHead(f, bufferLength));
        toolkit.setDefaultCharset(defaultCharset);
        return toolkit.guessEncoding();
    }

    /**
     * Reads up to {@code bufferLength} leading bytes of a file into a
     * zero-padded array of exactly that length, always closing the file.
     */
    private static byte[] readHead(final File f, final int bufferLength)
            throws FileNotFoundException, IOException {
        final byte[] head = new byte[bufferLength];
        final FileInputStream fis = new FileInputStream(f);
        try {
            int filled = 0;
            while (filled < head.length) {
                final int n = fis.read(head, filled, head.length - filled);
                if (n <= 0) {
                    break; // end of file (or a stream that makes no progress)
                }
                filled += n;
            }
        } finally {
            fis.close();
        }
        return head;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a
     * multiple byte character.
     *
     * @param b
     *            a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(final byte b) {
        return (b & 0xC0) == 0x80;
    }

    /**
     * Maps a byte with its high-order bit set to the number of continuation
     * bytes that must follow it.
     *
     * @param lead
     *            the first byte of a presumed multi-byte sequence.
     * @return 1 to 5 for lead bytes of the forms 110xxxxx, 1110xxxx, 11110xxx,
     *         111110xx and 1111110x respectively; -1 for any other byte.
     */
    private static int continuationBytesExpected(final byte lead) {
        if ((lead & 0xE0) == 0xC0) {
            return 1;
        }
        if ((lead & 0xF0) == 0xE0) {
            return 2;
        }
        if ((lead & 0xF8) == 0xF0) {
            return 3;
        }
        if ((lead & 0xFC) == 0xF8) {
            return 4;
        }
        if ((lead & 0xFE) == 0xFC) {
            return 5;
        }
        return -1;
    }

    /**
     * Retrieves the default charset of the system, as named by the system
     * property "file.encoding". If that property is missing or does not name a
     * supported charset, {@code Charset.defaultCharset()} is returned instead.
     *
     * @return the default {@code Charset}.
     */
    public static Charset getDefaultSystemCharset() {
        final String name = System.getProperty("file.encoding");
        if (name != null) {
            try {
                return Charset.forName(name);
            } catch (final IllegalArgumentException ignored) {
                // Illegal or unsupported charset name (both exceptions extend
                // IllegalArgumentException): fall back to the JVM default below.
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Has a Byte Order Mark for UTF-8 (used by Microsoft's Notepad and other
     * editors).
     *
     * @param bom
     *            a buffer.
     * @return true if the buffer starts with the bytes EF BB BF.
     */
    private static boolean hasUTF8Bom(final byte[] bom) {
        return bom.length >= 3 && bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
    }

    /**
     * Has a Byte Order Mark for UTF-16 Little Endian.
     *
     * @param bom
     *            a buffer.
     * @return true if the buffer starts with the bytes FF FE.
     */
    private static boolean hasUTF16LEBom(final byte[] bom) {
        return bom.length >= 2 && bom[0] == -1 && bom[1] == -2;
    }

    /**
     * Has a Byte Order Mark for UTF-16 Big Endian.
     *
     * @param bom
     *            a buffer.
     * @return true if the buffer starts with the bytes FE FF.
     */
    private static boolean hasUTF16BEBom(final byte[] bom) {
        return bom.length >= 2 && bom[0] == -2 && bom[1] == -1;
    }

    /**
     * Retrieves all the available {@code Charset}s on the platform, among
     * which the default charset.
     *
     * @return an array of {@code Charset}s.
     */
    public static Charset[] getAvailableCharsets() {
        final Collection<Charset> charsets = Charset.availableCharsets().values();
        return charsets.toArray(new Charset[charsets.size()]);
    }
}