使用Java基于数据流直接抽取ppt文本,java数据流抽取ppt,如下代码是直接基于数据流
分享于 点击 11948 次 点评:54
使用Java基于数据流直接抽取ppt文本,java数据流抽取ppt,如下代码是直接基于数据流
如下代码直接基于数据流进行文本抽取,支持 PowerPoint 97 至 PowerPoint 2003 版本(.ppt 二进制格式)。之后的版本(.pptx)实际上都是 XML 压缩包,抽取文本非常简单,因此在此处不再说明。代码仅供研究学习使用,禁止用于商业用途。
public class PPTExtractor { public static StringBuilder logBytes = new StringBuilder(); public static int getPPTDcoument(byte[] ogiBytes, Stream stream, int dirSect1) { for(int i=0;i<8;i++) { int offsetEntry = (dirSect1 + 1)*512 + i*128; StringBuilder content = new StringBuilder(); bytesToString(ogiBytes, content, offsetEntry, 64, 0); if(content.toString().indexOf("PowerPoint Document") > -1) { return offsetEntry; } } return 0; } public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc) { byte[] bytes = new byte[length]; System.arraycopy(ogiBytes, start, bytes, 0, length); if(fc == 0) { for(int i=0;i<bytes.length;i++) { if(i == bytes.length - 1) { return; } String hexStr = Integer.toHexString(bytes[i+1] & 0xFF) + Integer.toHexString(bytes[i] & 0xFF); int ch = Integer.valueOf(hexStr, 16); content.append( (char)ch ); i++; } } else { for(int i=0;i<bytes.length;i++) { int ch = bytes[i] & 0xFF; content.append( (char)ch ); } } } public static void printLogBytes(List<Byte> legaled) throws Exception { logBytes = new StringBuilder(); logBytes.append("\\n========================================================"); for(int a=0;a<legaled.size();a++) { if(a % 16 == 0) { logBytes.append("\\n"); } logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF) +" "); } logBytes.append("\\n========================================================"); FileUtil.writeAscFile("E:\\\\bytes.txt", logBytes.toString()); } public static int findTextRecords(Stream stream, byte[] bytes, int start, StringBuilder content, int[] offset) { byte opt = bytes[start]; int container = opt & 0x0f; if(container == 0x0f) { return start+8; } offset[0] = start + 2; int type = stream.getShort(offset); offset[0] = start + 4; int len = stream.getInteger(offset); if(type == 0x0FA8) { bytesToString(bytes, content, start+8, len, 1); System.out.println("Text Bytes Atom found!"); } if(type == 0x0FA0) { bytesToString(bytes, content, start+8, len, 0); System.out.println("Text 
Chars Atom found!"); } int newStart = start + 8 + len; if(newStart > bytes.length - 8) { newStart = -1; } return newStart; } public static void main(String[] args) throws Exception { byte[] ogiBytes = FileUtil.readBinFile("D:\\\\tools\\\\oletest\\\\cn-t.ppt"); System.out.println("Total bytes: "+ ogiBytes.length); if( ogiBytes.length < 8 || (ogiBytes[0] & 0xFF) != 208 || (ogiBytes[1] & 0xFF) != 207 || (ogiBytes[2] & 0xFF) != 17 || (ogiBytes[3] & 0xFF) != 224 || (ogiBytes[4] & 0xFF) != 161 || (ogiBytes[5] & 0xFF) != 177 || (ogiBytes[6] & 0xFF) != 26 || (ogiBytes[7] & 0xFF) != 225 ){ System.out.println("Not the ppt file!"); return; } Stream stream = new Stream(ogiBytes); int[] offset = new int[1]; offset[0] = 48; int dirSect1 = stream.getInteger(offset); int pptDocument = getPPTDcoument(ogiBytes, stream, dirSect1); if(pptDocument <= 0) { System.out.println("This version of ppt can not be parsed!"); return; } offset[0] = pptDocument + 116; int startSect = stream.getInteger(offset); int docStart = (startSect + 1)*512; int docLength = stream.getInteger(offset); byte[] bytes = new byte[docLength]; System.arraycopy(ogiBytes, docStart, bytes, 0, docLength); stream = new Stream(bytes); StringBuilder content = new StringBuilder(); int start = 0; while(start != -1) { start = findTextRecords(stream, bytes, start, content, offset); } FileUtil.writeAscFile("E:\\\\output.txt", content.toString(), false); System.out.println("Done!"); }}//该片段来自于http://byrx.net
用户点评