/*
* $Id: XmlReader.java,v 1.1 2004/08/19 05:30:22 aslom Exp $
*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Crimson" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, Sun Microsystems, Inc.,
* http://www.sun.com. For more information on the Apache Software
* Foundation, please see <http://www.apache.org/>.
*/
import java.io.*;
import java.util.Hashtable;
import java.util.Locale;
/**
* This handles several XML-related tasks that normal java.io Readers
* don't support, including use of IETF standard encoding names and
* automatic detection of most XML encodings. The former is needed
* for interoperability; the latter is needed to conform with the XML
* spec. This class also optimizes reading some common encodings by
* providing low-overhead unsynchronized Reader support.
*
* Note that the autodetection facility should be used only on
* data streams which have an unknown character encoding. For example,
* it should never be used on MIME text/xml entities.
*
*
* Note that XML processors are only required to support UTF-8 and
* UTF-16 character encodings. Autodetection permits the underlying Java
* implementation to provide support for many other encodings, such as
* US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
*
* @author David Brownell
* @version $Revision: 1.1 $
*/
final public class XmlReader extends Reader
{
// Capacity, in bytes, of the pushback buffer that the constructor wraps
// around the caller's input stream.
private static final int MAXPUSHBACK = 512;
// NOTE(review): presumably the reader that all character I/O is delegated
// to; its assignment is not visible in this part of the file -- confirm.
private Reader in;
// Encoding name reported by getEncoding().
private String assignedEncoding;
// NOTE(review): presumably set once this reader has been closed; its use
// is not visible in this part of the file -- confirm.
private boolean closed;
//
// This class always delegates I/O to a reader, which gets
// its data from the very beginning of the XML text. It needs
// to use a pushback stream since (a) autodetection can read
// partial UTF-8 characters which need to be fully processed,
// (b) the "Unicode" readers swallow characters that they think
// are byte order marks, so tests fail if they don't see the
// real byte order mark.
//
// It has to do this efficiently: character I/O is solidly on the
// critical path. (So keep buffer length over 2 Kbytes to avoid
// excess buffering. Many URL handlers stuff a BufferedInputStream
// between here and the real data source, and larger buffers keep
// that from slowing you down.)
//
/**
 * Factory for a reader whose character encoding is autodetected from
 * the stream itself, following the heuristic given in the XML 1.0
 * recommendation.
 *
 * @param in the input stream the reader will draw its bytes from
 * @return a reader over <code>in</code>
 * @exception IOException on error, such as unrecognized encoding
 */
public static Reader createReader (InputStream in) throws IOException
{
    final Reader autodetecting = new XmlReader (in);
    return autodetecting;
}
/**
 * Factory for a reader that decodes the stream with the named
 * encoding. Standard (IETF) encoding names are translated to the
 * names understood by Java where the two differ.
 *
 * @param in the input stream the reader will draw its bytes from
 * @param encoding the IETF standard name of the encoding to use;
 *	if null, autodetection is used.
 * @return a reader over <code>in</code> for the requested encoding
 * @exception IOException on error, including unrecognized encoding
 */
public static Reader createReader (InputStream in, String encoding)
throws IOException
{
    if (encoding == null) {
        // Nothing declared by the caller: fall back to autodetection.
        return new XmlReader (in);
    } else if (encoding.equalsIgnoreCase ("UTF-8")
            || encoding.equalsIgnoreCase ("UTF8")) {
        return new Utf8Reader (in);
    } else if (encoding.equalsIgnoreCase ("US-ASCII")
            || encoding.equalsIgnoreCase ("ASCII")) {
        return new AsciiReader (in);
    } else if (encoding.equalsIgnoreCase ("ISO-8859-1")) {
        // (ISO-8859-1 has numerous aliases that are not recognized here.)
        return new Iso8859_1Reader (in);
    }

    // Ideally there would be an administrable resource mapping encoding
    // names/aliases to reader class names -- for example a property
    // file resource, "readers/mapping.props", together with a set of
    // readers in that (sub)package -- with the generic JDK reader
    // below used only when no better choice is available.
    final String javaName = std2java (encoding);
    return new InputStreamReader (in, javaName);
}
// The JDK does not recognize every standard encoding name; in
// particular it knows none of the EBCDIC names that IANA defines
// (and which IBM encourages). This table maps such standard names,
// in upper case, to the encoding names the JDK does accept.
private static final Hashtable charsets = new Hashtable (31);
static {
    // Each row is { standard name, JDK encoding name }.
    final String [][] aliases = {
        { "UTF-16", "Unicode" },
        { "ISO-10646-UCS-2", "Unicode" },
        // NOTE: no support for ISO-10646-UCS-4 yet.
        { "EBCDIC-CP-US", "cp037" },
        { "EBCDIC-CP-CA", "cp037" },
        { "EBCDIC-CP-NL", "cp037" },
        { "EBCDIC-CP-WT", "cp037" },
        { "EBCDIC-CP-DK", "cp277" },
        { "EBCDIC-CP-NO", "cp277" },
        { "EBCDIC-CP-FI", "cp278" },
        { "EBCDIC-CP-SE", "cp278" },
        { "EBCDIC-CP-IT", "cp280" },
        { "EBCDIC-CP-ES", "cp284" },
        { "EBCDIC-CP-GB", "cp285" },
        { "EBCDIC-CP-FR", "cp297" },
        { "EBCDIC-CP-AR1", "cp420" },
        { "EBCDIC-CP-HE", "cp424" },
        { "EBCDIC-CP-BE", "cp500" },
        { "EBCDIC-CP-CH", "cp500" },
        { "EBCDIC-CP-ROECE", "cp870" },
        { "EBCDIC-CP-YU", "cp870" },
        { "EBCDIC-CP-IS", "cp871" },
        { "EBCDIC-CP-AR2", "cp918" }
        // IANA also defines two that JDK 1.2 doesn't handle:
        //	EBCDIC-CP-GR --> CP423
        //	EBCDIC-CP-TR --> CP905
    };
    for (int i = 0; i < aliases.length; i++) {
        charsets.put (aliases [i][0], aliases [i][1]);
    }
}
/**
 * Translates a standard encoding name into the name the JDK uses for
 * it, for the cases listed in the <code>charsets</code> table (names
 * required by the XML spec plus the IANA EBCDIC names).
 *
 * @param encoding the standard encoding name, in any letter case;
 *	must not be null
 * @return the JDK encoding name registered for <code>encoding</code>,
 *	or <code>encoding</code> itself, unchanged, when the table has
 *	no entry for it
 */
private static String std2java (String encoding)
{
    // Encoding names are ASCII, and the table keys are upper-case ASCII.
    // Fold case with a fixed locale: the default-locale toUpperCase()
    // turns 'i' into a dotted capital I under e.g. a Turkish locale, so
    // a lower-case "iso-10646-ucs-2" would never match its table key.
    String key = encoding.toUpperCase (Locale.ENGLISH);
    String javaName = (String) charsets.get (key);
    return (javaName != null) ? javaName : encoding;
}
/**
 * Reports the standard name of the character encoding in use.
 *
 * @return the encoding name currently held in
 *	<code>assignedEncoding</code>
 */
public String getEncoding ()
{
    final String current = this.assignedEncoding;
    return current;
}
private XmlReader (InputStream stream) throws IOException
{
super (stream);
PushbackInputStream pb;
byte buf [];
int len;
/*if (stream instanceof PushbackInputStream)
pb = (PushbackInputStream) stream;
else*/
/**
* Commented out the above code to make sure it works when the
* document is accessed using http. URL connection in the code uses
* a PushbackInputStream with size 7 and when we try to push back
* MAXPUSHBACK bytes (512 by default) we get an exception. So
* that's why we need to wrap the stream irrespective of what type
* of stream we start off with.
*/
pb = new PushbackInputStream (stream, MAXPUSHBACK);
//
// See if we can figure out the character encoding used
// in this file by peeking at the first few bytes.
//
buf = new byte [4];
len = pb.read (buf);
if (len > 0)
pb.unread (buf, 0, len);
if (len == 4) switch (buf [0] & 0x0ff) {
case 0:
// 00 3c 00 3f == illegal UTF-16 big-endian
if (buf [1] == 0x3c && buf [2] == 0x00 && buf [3] == 0x3f) {
setEncoding (pb, "UnicodeBig");
return;
}
// else it's probably UCS-4
break;
case '<': // 0x3c: the most common cases!
switch (buf [1] & 0x0ff) {
// First character is '<'; could be XML without
// an XML directive such as "", "