欢迎访问悦橙教程(wld5.com),关注java教程。悦橙教程  java问答|  每日更新
页面导航 : > > 文章正文

UnicodeReader:从 BOM 识别字符集

来源: javaer 分享于  点击 24042 次 点评:238

UnicodeReader:从 BOM 识别字符集


/*
 * Version history:
 *  - changed BOM recognition ordering (longer BOMs first)
 *
 * Original pseudocode   : Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 * BOMs:
 *   00 00 FE FF    = UTF-32, big-endian
 *   FF FE 00 00    = UTF-32, little-endian
 *   EF BB BF       = UTF-8,
 *   FE FF          = UTF-16, big-endian
 *   FF FE          = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 */
import java.io.*;

/**
 * Generic unicode text reader, which uses the BOM mark to identify the
 * encoding to be used. If no BOM is found, a given default encoding (or the
 * system encoding when the default is {@code null}) is used instead.
 *
 * <p>The BOM bytes themselves are consumed and never appear in the decoded
 * character stream.
 */
public class UnicodeReader extends Reader {
   /** Raw byte source; the pushback buffer lets non-BOM look-ahead bytes be returned. */
   PushbackInputStream internalIn;

   /** Decoding reader, created lazily by {@link #init()}; {@code null} until then. */
   InputStreamReader   internalIn2 = null;

   /** Encoding used when the stream carries no BOM; {@code null} means system default. */
   String              defaultEnc;

   /** Length of the longest recognized BOM, and therefore of the look-ahead buffer. */
   private static final int BOM_SIZE = 4;

   /**
    * Creates a reader over the given byte stream. Nothing is read from the
    * stream until {@link #init()} or {@link #read(char[], int, int)} is called.
    *
    * @param in  inputstream to be read
    * @param defaultEnc default encoding if stream does not have
    *                   BOM marker. Give NULL to use system-level default.
    */
   public UnicodeReader(InputStream in, String defaultEnc) {
      internalIn = new PushbackInputStream(in, BOM_SIZE);
      this.defaultEnc = defaultEnc;
   }

   /**
    * Returns the default encoding given to the constructor.
    *
    * @return the default encoding name, possibly {@code null}
    */
   public String getDefaultEncoding() {
      return defaultEnc;
   }

   /**
    * Get stream encoding or NULL if stream is uninitialized.
    * Call init() or read() method to initialize it.
    *
    * @return the encoding name reported by the underlying
    *         {@link InputStreamReader}, or {@code null} before initialization
    */
   public String getEncoding() {
      if (internalIn2 == null) {
         return null;
      }
      return internalIn2.getEncoding();
   }

   /**
    * Reads ahead up to four bytes and checks them for a BOM mark. Extra bytes
    * are unread back to the stream; only BOM bytes are skipped. Does nothing
    * if the reader has already been initialized.
    *
    * @throws IOException if reading the look-ahead bytes fails or the
    *                     selected encoding is not supported
    */
   protected void init() throws IOException {
      if (internalIn2 != null) {
         return;
      }

      // A single read() may legally return fewer bytes than requested, so keep
      // reading until the look-ahead buffer is full or the stream ends.
      byte[] bom = new byte[BOM_SIZE];
      int n = 0;
      while (n < BOM_SIZE) {
         int count = internalIn.read(bom, n, BOM_SIZE - n);
         if (count < 0) {
            break;
         }
         n += count;
      }

      // Longer signatures are tested first because the UTF-32LE BOM starts
      // with the UTF-16LE BOM. Every test is bounded by the number of bytes
      // actually read so unfilled (zero) slots can never complete a match.
      String encoding;
      int bomLength;
      if (hasPrefix(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
         encoding = "UTF-32BE";
         bomLength = 4;
      } else if (hasPrefix(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
         encoding = "UTF-32LE";
         bomLength = 4;
      } else if (hasPrefix(bom, n, 0xEF, 0xBB, 0xBF)) {
         encoding = "UTF-8";
         bomLength = 3;
      } else if (hasPrefix(bom, n, 0xFE, 0xFF)) {
         encoding = "UTF-16BE";
         bomLength = 2;
      } else if (hasPrefix(bom, n, 0xFF, 0xFE)) {
         encoding = "UTF-16LE";
         bomLength = 2;
      } else {
         // Unicode BOM mark not found, unread all bytes
         encoding = defaultEnc;
         bomLength = 0;
      }

      // Push back everything that followed the BOM so it is decoded normally.
      int unread = n - bomLength;
      if (unread > 0) {
         internalIn.unread(bom, bomLength, unread);
      }

      // Use given encoding
      if (encoding == null) {
         internalIn2 = new InputStreamReader(internalIn);
      } else {
         internalIn2 = new InputStreamReader(internalIn, encoding);
      }
   }

   /** Tells whether the first {@code len} valid bytes of {@code buf} begin with {@code signature}. */
   private static boolean hasPrefix(byte[] buf, int len, int... signature) {
      if (len < signature.length) {
         return false;
      }
      for (int i = 0; i < signature.length; i++) {
         if (buf[i] != (byte) signature[i]) {
            return false;
         }
      }
      return true;
   }

   /**
    * Closes the reader and the underlying stream. Closing a reader that was
    * never read from does not touch the stream contents: no BOM detection is
    * attempted, so a failing or blocking source cannot prevent the close.
    *
    * @throws IOException if closing the underlying stream fails
    */
   @Override
   public void close() throws IOException {
      if (internalIn2 != null) {
         internalIn2.close();
      } else {
         internalIn.close();
      }
   }

   /**
    * Reads decoded characters into a portion of an array, initializing the
    * reader (BOM detection) on first use.
    *
    * @param cbuf destination buffer
    * @param off  offset at which to start storing characters
    * @param len  maximum number of characters to read
    * @return the number of characters read, or -1 at end of stream
    * @throws IOException if an I/O error occurs
    */
   @Override
   public int read(char[] cbuf, int off, int len) throws IOException {
      init();
      return internalIn2.read(cbuf, off, len);
   }
}
// Snippet source: http://byrx.net
相关栏目:

用户点评