欢迎访问悦橙教程(wld5.com),关注java教程。悦橙教程  java问答|  每日更新
页面导航 : > > 文章正文

使用 Java 基于数据流直接抽取 PPT 文本(java 数据流抽取 ppt):如下代码直接基于数据流进行文本抽取

来源: javaer 分享于  点击 11948 次 点评:54

使用 Java 基于数据流直接抽取 PPT 文本(java 数据流抽取 ppt):如下代码直接基于数据流进行文本抽取


如下代码直接基于数据流进行文本抽取,支持 PowerPoint 97 至 PowerPoint 2003 版本(.ppt)。之后的版本(.pptx)实际上是 ZIP 压缩的 XML 文件,抽取文本非常简单,因此此处不再说明。代码仅供研究学习使用,禁止用于商业用途。

/**
 * Extracts plain text from a legacy binary PowerPoint file (PowerPoint 97-2003)
 * by walking the raw bytes of the compound file instead of using a document library.
 *
 * <p>The extractor locates the directory entry named "PowerPoint Document", copies the
 * bytes of that stream and scans its records for the two text record types
 * {@code 0x0FA8} (one byte per character) and {@code 0x0FA0} (two bytes per character).
 *
 * <p>NOTE(review): the stream is read as one contiguous run of 512-byte sectors starting at
 * the entry's start sector; sector chains are not followed. Per the compound-file
 * specification, fragmented streams and streams small enough to live in the mini stream
 * would need extra handling - confirm against the files this is used on.
 */
public class PPTExtractor {

    /** Size in bytes of one sector; the file header occupies the first sector. */
    private static final int SECTOR_SIZE = 512;

    /** Size in bytes of one directory entry. */
    private static final int DIR_ENTRY_SIZE = 128;

    /** Number of leading bytes of a directory entry that hold its UTF-16LE name. */
    private static final int DIR_ENTRY_NAME_SIZE = 64;

    /** Number of directory entries inspected when searching for the document stream. */
    private static final int DIR_ENTRIES_TO_SCAN = 8;

    /** Offset, inside a directory entry, of the stream's starting sector number. */
    private static final int DIR_ENTRY_START_SECTOR_OFFSET = 116;

    /** Offset, inside a directory entry, of the stream's size in bytes. */
    private static final int DIR_ENTRY_STREAM_SIZE_OFFSET = 120;

    /** Offset, inside the file header, of the first directory sector number. */
    private static final int HEADER_FIRST_DIR_SECTOR_OFFSET = 48;

    /** Size in bytes of a record header: 2 bytes version/instance, 2 bytes type, 4 bytes length. */
    private static final int RECORD_HEADER_SIZE = 8;

    /** Record type whose payload stores one byte per character. */
    private static final int TEXT_BYTES_ATOM = 0x0FA8;

    /** Record type whose payload stores two bytes (little-endian) per character. */
    private static final int TEXT_CHARS_ATOM = 0x0FA0;

    /** The eight signature bytes every compound file starts with. */
    private static final int[] COMPOUND_FILE_SIGNATURE = {208, 207, 17, 224, 161, 177, 26, 225};

    // The doubled path separators below are kept exactly as found; they look like an
    // escaping artifact but are harmless, so the values are not normalized here.
    private static final String DEFAULT_INPUT_PATH = "D:\\\\tools\\\\oletest\\\\cn-t.ppt";
    private static final String DEFAULT_OUTPUT_PATH = "E:\\\\output.txt";
    private static final String BYTES_LOG_PATH = "E:\\\\bytes.txt";

    /** Buffer holding the most recent hex dump produced by {@link #printLogBytes(List)}. */
    public static StringBuilder logBytes = new StringBuilder();

    /**
     * Searches the first directory entries for the one named "PowerPoint Document".
     *
     * @param ogiBytes the complete file content
     * @param stream   unused; kept so existing callers keep compiling
     * @param dirSect1 sector number of the first directory sector, as read from the header
     * @return the byte offset of the matching directory entry inside {@code ogiBytes},
     *         or {@code 0} when no entry matches or the directory lies outside the file
     */
    public static int getPPTDcoument(byte[] ogiBytes, Stream stream, int dirSect1)
    {
        for (int i = 0; i < DIR_ENTRIES_TO_SCAN; i++)
        {
            // Sector n starts at (n + 1) * 512 because the header occupies the first 512 bytes.
            long offsetEntry = ((long) dirSect1 + 1) * SECTOR_SIZE + (long) i * DIR_ENTRY_SIZE;
            // Stop as soon as a whole entry no longer fits inside the file (or the sector
            // number was negative); callers read fields up to the end of the entry.
            if (offsetEntry < 0 || offsetEntry > (long) ogiBytes.length - DIR_ENTRY_SIZE)
            {
                break;
            }
            StringBuilder entryName = new StringBuilder();
            bytesToString(ogiBytes, entryName, (int) offsetEntry, DIR_ENTRY_NAME_SIZE, 0);
            if (entryName.indexOf("PowerPoint Document") > -1)
            {
                return (int) offsetEntry;
            }
        }
        return 0;
    }

    /**
     * Decodes a byte range into characters and appends them to {@code content}.
     *
     * <p>The requested range is clamped to the array, so a length that runs past the end
     * (or is not positive) decodes only the bytes that actually exist instead of throwing.
     *
     * @param ogiBytes source bytes
     * @param content  receives the decoded characters
     * @param start    index of the first byte to decode
     * @param length   number of bytes to decode
     * @param fc       {@code 0} to read two bytes per character, low byte first (a trailing
     *                 odd byte is ignored); any other value to read one byte per character
     */
    public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)
    {
        if (length <= 0 || start < 0 || start >= ogiBytes.length)
        {
            return;
        }
        int end = (int) Math.min((long) start + length, (long) ogiBytes.length);
        if (fc == 0)
        {
            for (int i = start; i + 1 < end; i += 2)
            {
                int low = ogiBytes[i] & 0xFF;
                int high = ogiBytes[i + 1] & 0xFF;
                // Combine numerically: gluing two unpadded hex strings together drops the
                // leading zero of a low byte below 0x10 and yields the wrong character.
                content.append((char) ((high << 8) | low));
            }
        }
        else
        {
            for (int i = start; i < end; i++)
            {
                content.append((char) (ogiBytes[i] & 0xFF));
            }
        }
    }

    /**
     * Writes a hex dump of the given bytes, 16 values per line between two separator
     * lines, into {@link #logBytes} and then to the bytes log file.
     *
     * @param legaled the bytes to dump
     * @throws Exception if writing the log file fails
     */
    public static void printLogBytes(List<Byte> legaled) throws Exception
    {
        logBytes = new StringBuilder();
        // Real line breaks: the escaped form "\\n" would put a literal backslash-n in the log.
        logBytes.append("\n========================================================");
        for (int a = 0; a < legaled.size(); a++)
        {
            if (a % 16 == 0)
            {
                logBytes.append("\n");
            }
            logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF)).append(" ");
        }
        logBytes.append("\n========================================================");
        FileUtil.writeAscFile(BYTES_LOG_PATH, logBytes.toString());
    }

    /**
     * Examines the record whose header starts at {@code start} and, if it is a text record,
     * appends its text to {@code content}.
     *
     * <p>A record whose low version nibble is {@code 0x0F} is treated as a container: only
     * its header is skipped so that the records nested inside it are visited next.
     *
     * @param stream  reader over {@code bytes}, used for the type and length header fields
     * @param bytes   the document stream content
     * @param start   offset of the record header to examine
     * @param content receives any text found
     * @param offset  one-element scratch cursor handed to {@code stream}; overwritten here
     * @return the offset of the next record header, or {@code -1} when fewer than eight
     *         bytes remain or the record length is invalid
     */
    public static int findTextRecords(Stream stream, byte[] bytes, int start, StringBuilder content, int[] offset)
    {
        // A complete 8-byte header must be available before anything is read.
        if (start < 0 || start > bytes.length - RECORD_HEADER_SIZE)
        {
            return -1;
        }
        byte opt = bytes[start];
        if ((opt & 0x0f) == 0x0f)
        {
            // Container: step over the header only, but still stop at the end of the data.
            return nextHeaderOrEnd((long) start + RECORD_HEADER_SIZE, bytes.length);
        }
        offset[0] = start + 2;
        int type = stream.getShort(offset);
        offset[0] = start + 4;
        int len = stream.getInteger(offset);
        if (len < 0)
        {
            // A negative length would move the cursor backwards and could loop forever.
            return -1;
        }
        if (type == TEXT_BYTES_ATOM)
        {
            bytesToString(bytes, content, start + RECORD_HEADER_SIZE, len, 1);
            System.out.println("Text Bytes Atom found!");
        }
        else if (type == TEXT_CHARS_ATOM)
        {
            bytesToString(bytes, content, start + RECORD_HEADER_SIZE, len, 0);
            System.out.println("Text Chars Atom found!");
        }
        return nextHeaderOrEnd((long) start + RECORD_HEADER_SIZE + len, bytes.length);
    }

    /** Returns {@code candidate} if a whole record header fits there, otherwise {@code -1}. */
    private static int nextHeaderOrEnd(long candidate, int totalLength)
    {
        return candidate > (long) totalLength - RECORD_HEADER_SIZE ? -1 : (int) candidate;
    }

    /** Tells whether {@code data} holds a full header sector starting with the signature. */
    private static boolean hasCompoundFileHeader(byte[] data)
    {
        // The header is read beyond the signature, so require the whole header sector.
        if (data.length < SECTOR_SIZE)
        {
            return false;
        }
        for (int i = 0; i < COMPOUND_FILE_SIGNATURE.length; i++)
        {
            if ((data[i] & 0xFF) != COMPOUND_FILE_SIGNATURE[i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads a .ppt file, extracts its text and writes the text to an output file.
     *
     * @param args optional: {@code args[0]} is the input file and {@code args[1]} the output
     *             file; the built-in default paths are used for whichever is missing
     * @throws Exception if reading the input or writing the output fails
     */
    public static void main(String[] args) throws Exception
    {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        byte[] ogiBytes = FileUtil.readBinFile(inputPath);
        System.out.println("Total bytes: " + ogiBytes.length);
        if (!hasCompoundFileHeader(ogiBytes))
        {
            System.out.println("Not the ppt file!");
            return;
        }

        Stream stream = new Stream(ogiBytes);
        int[] offset = new int[1];
        offset[0] = HEADER_FIRST_DIR_SECTOR_OFFSET;
        int dirSect1 = stream.getInteger(offset);
        int pptDocument = getPPTDcoument(ogiBytes, stream, dirSect1);
        if (pptDocument <= 0)
        {
            System.out.println("This version of ppt can not be parsed!");
            return;
        }

        // Position the cursor explicitly before each field rather than relying on the
        // reader having advanced it after the previous read.
        offset[0] = pptDocument + DIR_ENTRY_START_SECTOR_OFFSET;
        int startSect = stream.getInteger(offset);
        offset[0] = pptDocument + DIR_ENTRY_STREAM_SIZE_OFFSET;
        int docLength = stream.getInteger(offset);

        long docStart = ((long) startSect + 1) * SECTOR_SIZE;
        if (startSect < 0 || docLength < 0 || docStart > ogiBytes.length)
        {
            System.out.println("This version of ppt can not be parsed!");
            return;
        }
        // Never copy more than the file actually contains after the stream start.
        int available = (int) Math.min((long) docLength, (long) ogiBytes.length - docStart);
        byte[] bytes = new byte[available];
        System.arraycopy(ogiBytes, (int) docStart, bytes, 0, available);

        stream = new Stream(bytes);
        StringBuilder content = new StringBuilder();
        int start = 0;
        while (start != -1)
        {
            start = findTextRecords(stream, bytes, start, content, offset);
        }
        FileUtil.writeAscFile(outputPath, content.toString(), false);
        System.out.println("Done!");
    }
}
// This snippet comes from http://byrx.net
相关栏目:

用户点评