欢迎访问悦橙教程(wld5.com),关注java教程。悦橙教程  java问答|  每日更新
页面导航 : > > 文章正文

使用Java不借助框架直接抽取rtf文本,java框架抽取rtf,如下代码是直接基于字符对

来源: javaer 分享于  点击 19029 次 点评:159

使用Java不借助框架直接抽取rtf文本,java框架抽取rtf,如下代码是直接基于字符对


如下代码直接基于字符对 RTF 文件进行文本抽取。网上找了一些例子,大多需要依赖第三方的包,或者借助 Swing 里面的 API 来抽取,感觉有点庞杂。代码仅供研究学习使用,禁止用于商业用途。

public class RTFExtractor{    private static Map<String, String> include = new HashMap<String, String>();    static    {        include.put("par",             " ");        include.put("bullet",     "?");        include.put("emdash",     "—");        include.put("emspace",    "?");        include.put("endash",     "–");        include.put("enspace",    "?");        include.put("ldblquote",  "“");        include.put("lquote",     "‘");        include.put("ltrmark",    "?");        include.put("rdblquote",  "”");        include.put("rquote",     "’");        include.put("rtlmark",    "?");        include.put("tab",        " ");        include.put("zwj",        "?");        include.put("zwnj",       "?");    }    public static boolean isRTFFormat(byte[] ogiBytes)    {        if(ogiBytes == null || ogiBytes.length < 8)        {            return false;        }        if(            (ogiBytes[0] & 0xFF) == 0x7B ||            (ogiBytes[1] & 0xFF) == 0x5C ||            (ogiBytes[2] & 0xFF) == 0x72 ||            (ogiBytes[3] & 0xFF) == 0x74 ||            (ogiBytes[4] & 0xFF) == 0x66 ||            (ogiBytes[5] & 0xFF) == 0x31 ||            (ogiBytes[6] & 0xFF) == 0x5C ||            (ogiBytes[7] & 0xFF) == 0x61        ){            return true;        }        short lines = 0;        for(int i = 0; i < ogiBytes.length; i++)        {            if(ogiBytes[i] != 0x0A && ogiBytes[i] != 0x0D)            {                continue;            }            if(lines > 2)            {                break;            }            if(i+1+8 >= ogiBytes.length)            {                break;            }            if(                (ogiBytes[i+1] & 0xFF) == 0x7B ||                (ogiBytes[i+2] & 0xFF) == 0x5C ||                (ogiBytes[i+3] & 0xFF) == 0x72 ||                (ogiBytes[i+4] & 0xFF) == 0x74 ||                (ogiBytes[i+5] & 0xFF) == 0x66 ||                (ogiBytes[i+6] & 0xFF) == 0x31 ||                (ogiBytes[i+7] & 0xFF) == 0x5C ||                
(ogiBytes[i+8] & 0xFF) == 0x61            ){                return true;            }            lines++;        }        return false;    }    public static String extract(String file, int[] err, int read)    {        String content = file;        if(read == 0)        {            try {                content = FileUtil.readAscFile(file);            } catch (Exception e) {                // TODO Auto-generated catch block                e.printStackTrace();            }        }        System.out.println("Total length: "+ content.length());        if(!isRTFFormat(content.getBytes()))        {            System.out.println("Not the rtf file!");            err[0] = ExtractorUtil.NOT_RTF_FORMAT;            return "";        }        int line1 = content.indexOf("}}");        if(line1 > -1)        {            int line2 = content.indexOf("}}", line1+2);            if(line2 > -1)            {                content = content.substring(line2+2);            }            else            {                content = content.substring(line1+2);            }        }        content = content.replaceAll("\\\\{\\\\\\\\\\\\*[^\\\\}]*?\\\\}", "");        content = content.replaceAll("\\n|\\r", "");        StringBuilder builder = new StringBuilder();        String[] buffers = content.split("\\\\\\\\");        for(int i=0;i<buffers.length;i++)        {            String buffer = buffers[i].trim();            String value = include.get(buffer);            if(value != null)            {                builder.append(value);                continue;            }            if(buffer.startsWith("'"))            {                if(i == buffers.length - 1)                {                    break;                }                String a = buffer.replaceAll("\\\\}\\\\{", "");                String b = buffers[i+1].replaceAll("\\\\}\\\\{", "");                if(!b.startsWith("'"))                {                    continue;                }                if(a.length() < 3 || 
b.length() < 3)                {                    continue;                }                if(a.length() > 3)                {                    builder.append( a.substring(3) );                    a = a.substring(0, 3);                }                if(b.length() > 3)                {                    builder.append( b.substring(3) );                    b = b.substring(0, 3);                }                a = a.replace("'", "");                b = b.replace("'", "");                if(a.length() != 2 || a.replaceAll("[A-Fa-f0-9]", "").length() != 0)                {                    continue;                }                if(b.length() != 2 || b.replaceAll("[A-Fa-f0-9]", "").length() != 0)                {                    continue;                }                int ch = Integer.valueOf(b+a, 16);                byte[] temp = new byte[2];                temp[0] = (byte) ch;                temp[1] = (byte) (ch >> 8);                builder.append( new String(temp) );                i++;                continue;            }            int spaceOffset = buffer.indexOf(" ");            if(spaceOffset > -1)            {                String rest = buffer.substring(spaceOffset+1);                if(buffer.startsWith("f0 "))                {                    builder.append( rest.replaceAll("\\\\}\\\\{", "") );                    continue;                }                if("{".equals(rest) || "}".equals(rest) || rest.indexOf("}{") > -1 || rest.indexOf("}}") > -1)                {                    continue;                }                if(rest.startsWith("{"))                {                    builder.append( rest.substring(1) );                    continue;                }                if(rest.endsWith("}"))                {                    builder.append( rest.substring(0, rest.length()-1) );                    continue;                }                builder.append( rest );                continue;            }        }        err[0] 
= ExtractorUtil.EXTRACTING_DONE;        return builder.toString();    }    public static void main(String[] args) throws Exception    {        String text = RTFExtractor.extract("E:\\en.rtf", new int[1], 0);        FileUtil.writeAscFile("E:\\output.txt", text, false);        System.out.println("Done!");    }}//该片段来自于http://byrx.net
相关栏目:

用户点评