首先,带有字节顺序标记(BOM)的文本文件,可以根据文件开头的两到三个字节来识别其编码格式。定义如下:
ANSI: 无格式定义; Unicode: 前两个字节为FFFE; Unicode big endian: 前两字节为FEFF; UTF-8: 前两字节为EFBB; 知道了各种编码格式的区别,写代码就容易了.1 package charset; 2 3 import java.io.BufferedInputStream; 4 import java.io.File; 5 import java.io.FileInputStream; 6 7 public class demo { 8 public static void main(String[] args) { 9 String txtCharset=get_charset(new File("e:/1.TXT"));10 System.out.println(txtCharset);11 }12 public static String get_charset(File file) {13 String charset = "GBK";14 byte[] first3Bytes = new byte[3];15 try {16 boolean checked=false;17 BufferedInputStream bis = new BufferedInputStream(18 new FileInputStream(file));19 bis.mark(0);20 int read = bis.read(first3Bytes, 0, 3);21 if (read == -1)22 return charset;23 if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {24 charset = "UTF-16LE";25 checked = true;26 } else if (first3Bytes[0] == (byte) 0xFE27 && first3Bytes[1] == (byte) 0xFF) {28 charset = "UTF-16BE";29 checked = true;30 } else if (first3Bytes[0] == (byte) 0xEF31 && first3Bytes[1] == (byte) 0xBB32 && first3Bytes[2] == (byte) 0xBF) {33 charset = "UTF-8";34 checked = true;35 }36 bis.reset();37 if (!checked) {38 // int len = 0;39 int loc = 0;40 41 while ((read = bis.read()) != -1) {42 loc++;43 if (read >= 0xF0)44 break;45 if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK46 break;47 if (0xC0 <= read && read <= 0xDF) {48 read = bis.read();49 if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)50 // (0x8051 // - 0xBF),也可能在GB编码内52 continue;53 else54 break;55 } else if (0xE0 <= read && read <= 0xEF) { // 也有可能出错,但是几率较小56 read = bis.read();57 if (0x80 <= read && read <= 0xBF) {58 read = bis.read();59 if (0x80 <= read && read <= 0xBF) {60 charset = "UTF-8";61 break;62 } else63 break;64 } else65 break;66 }67 }68 // System.out.println( loc + " " + Integer.toHexString( read )69 // );70 }71 72 bis.close();73 } catch (Exception e) {74 e.printStackTrace();75 }76 77 return charset;78 }79 }