Mht格式文本提取,mht文本提取
分享于 点击 23385 次 点评:242
Mht格式文本提取,mht文本提取
protected String getText(InputStream is) { String mhts = super.readString(is);//读取文件数据 int a1 = mhts.indexOf("<HTML"); int a2 = mhts.indexOf("</HTML>"); String html = mhts.substring(a1, a2 + 7); html = decodeQuotedPrintable(html, "GBK");//todo 实际编码 return super.dumpText(is);//使用Jsoup读取html中的文本}public static String decodeQuotedPrintable(String str, String encoding) { if (str == null) { return null; } try { //str = str.replaceAll("=\\n", "");//?? byte[] bytes = str.getBytes("US-ASCII"); ByteArrayOutputStream buffer = new ByteArrayOutputStream(); for (int i = 0; i < bytes.length; i++) { int b = bytes[i]; if (b == '=') { int u = Character.digit((char) bytes[++i], 16); int l = Character.digit((char) bytes[++i], 16); if (u == -1 || l == -1) {//?? continue; } buffer.write((char) ((u << 4) + l)); } else { buffer.write(b); } } return buffer.toString(encoding); } catch (Exception e) { e.printStackTrace(); return str; }}//该片段来自于http://byrx.net
用户点评