欢迎访问悦橙教程(wld5.com),关注java教程。悦橙教程  java问答|  每日更新
页面导航 : > > 文章正文

Java 判断 UTF-8 字符包含几个字节(utf8mb4 判断)

来源: javaer 分享于  点击 29679 次 点评:60

Java 判断 UTF-8 字符包含几个字节(utf8mb4 判断)


MySQL 的 utf8 字符集(例如排序规则为 utf8_general_ci 时)每个字符最多只占 3 个字节,无法存储 4 字节的 UTF-8 字符(例如 emoji);向这样的列插入 4 字节字符时会报错或被截断。为了避免此类问题,可以在写入之前先对字符串做判断或替换,代码片段如下:

import java.nio.charset.Charset;

/**
 * Detects and replaces characters that need four bytes in UTF-8.
 *
 * <p>Written for the situation where MySQL is configured with the "utf8 general ci"
 * setting while the application may still receive utf8mb4 (4-byte) input.
 *
 * <p>A character is encoded with four UTF-8 bytes exactly when it is a supplementary
 * code point, and a Java {@code String} stores such a code point as a high surrogate
 * immediately followed by a low surrogate. Both methods therefore scan the string for
 * valid surrogate pairs instead of encoding it to bytes first. An unpaired surrogate
 * is not a 4-byte character and is left untouched.
 *
 * <p>Created by zhaoyukai on 2017/9/28.
 */
public class UTF8MB4 {

    /**
     * Tells whether the string contains at least one 4-byte UTF-8 character.
     *
     * @param input the string to inspect; may be {@code null}
     * @return {@code true} if a 4-byte character is present; {@code false} otherwise,
     *         including when {@code input} is {@code null} or empty
     */
    public static boolean containsMb4Char(String input) {
        if (input == null) {
            return false;
        }
        // A pair needs two chars, so the last index can never start one.
        for (int i = 0, last = input.length() - 1; i < last; i++) {
            if (Character.isSurrogatePair(input.charAt(i), input.charAt(i + 1))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces every 4-byte UTF-8 character in the string with {@code replacement}.
     *
     * <p>All other characters, including unpaired surrogates, are copied unchanged.
     * A {@code null} replacement is appended as the literal text {@code "null"},
     * following {@link StringBuilder#append(String)}; pass {@code ""} to drop the
     * characters instead.
     *
     * @param input       the string to process; must not be {@code null}
     * @param replacement the text written in place of each 4-byte character
     * @return the string with every 4-byte character replaced
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public static String replaceMb4Char(String input, String replacement) {
        if (input == null) {
            throw new IllegalArgumentException("input can not be null when replaceMb4Char");
        }
        int length = input.length();
        StringBuilder sb = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            char c = input.charAt(i);
            if (i + 1 < length && Character.isSurrogatePair(c, input.charAt(i + 1))) {
                // One 4-byte character occupies two chars in the string: skip both.
                sb.append(replacement);
                i += 2;
            } else {
                sb.append(c);
                i++;
            }
        }
        return sb.toString();
    }
}

如下是单元测试代码:

import org.apache.commons.io.IOUtils;import org.junit.Assert;import org.junit.Test;import org.springframework.util.StringUtils;import java.io.IOException;import java.io.InputStream;import java.nio.charset.Charset;import java.util.List;/** * Created by zhaoyukai on 2017/9/27. */public class Utf8Mb4Test {    private final static Charset UTF8 = Charset.forName("UTF-8");    @Test    public void testReplacement() {        String input = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94哈哈哈";        String output = UTF8MB4.replaceMb4Char(input, "");        String expect = "A啊中哈哈哈";        Assert.assertEquals(expect, output);    }    @Test    public void testContainsMb4() {        testFalse("呵呵呵");        testFalse("AAAA");        testFalse(",,,");        testFalse("中国。。,&………………");        testFalse("我们mmm他们..你们abcdddd牛m");    }    @Test    public void testReplacePerf() {        long startMm = System.currentTimeMillis();        int times = 10000000;        while (times > 0) {            UTF8MB4.replaceMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941", "");            times--;        }        long end = System.currentTimeMillis();        long used = end - startMm;        System.out.println(used);    }    @Test    public void testContainsPerf() {        long startMm = System.currentTimeMillis();        int times = 10000000;        while (times > 0) {            UTF8MB4.containsMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941");            times--;        }        long end = System.currentTimeMillis();        long used = end - startMm;        System.out.println(used);    }    @Test    public void testContainsMb4True() throws IOException {        String input = IOUtils.toString(getClass().getClassLoader().getResourceAsStream("contains.txt"), UTF8);        testTrue(input);        testTrue("\"A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94\"");    }    @Test    public void testContainsCharsInFile() throws IOException {        InputStream stream = null;        try {            stream = 
getClass().getClassLoader().getResourceAsStream("contains.txt");            List<String> lines = IOUtils.readLines(stream, UTF8);            for (String line : lines) {                if (StringUtils.isEmpty(line)) {                    continue;                }                char first = line.charAt(0);                String last = line.substring(1);                boolean expectContains = first == '1';                boolean actualContains = UTF8MB4.containsMb4Char(last);                Assert.assertEquals(String.format("%s is %s but %s", last, expectContains, actualContains),                        expectContains, actualContains);            }        } finally {            IOUtils.closeQuietly(stream);        }    }    @Test    public void testReplaceCharsInFile() throws IOException {        InputStream stream = null;        try {            stream = getClass().getClassLoader().getResourceAsStream("replace.txt");            List<String> lines = IOUtils.readLines(stream, UTF8);            for (String line : lines) {                if (StringUtils.isEmpty(line)) {                    continue;                }                int idxEq = line.indexOf('=');                if (idxEq == -1) {                    throw new RuntimeException("测试文本错误, 未按=分隔");                }                String afterReplace = line.substring(0, idxEq);                String beforeReplace = line.substring(idxEq+1);                String real = UTF8MB4.replaceMb4Char(beforeReplace, "");                String expect = afterReplace;                Assert.assertEquals(expect, real);            }        } finally {            IOUtils.closeQuietly(stream);        }    }    void testTrue(String input) {        boolean contains = UTF8MB4.containsMb4Char(input);        Assert.assertTrue(contains);    }    void testFalse(String input) {        boolean contains = UTF8MB4.containsMb4Char(input);        Assert.assertFalse(contains);    }    @Test    public void testMb4() {        String chs = 
"A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94";//        byte[] bytes = chs.getBytes(UTF8);        byte[] masks = {                (byte) 0X00,                (byte) 0X80,                (byte) 0XC0,                (byte) 0XE0,        };        char c = 'ﭾ';        int count = 1;        System.out.println(String.format("%02X", (int) c));        while (count < 110000) {            char nc = (char) ((int) c + count);            System.out.print(nc);            byte[] bytes = String.valueOf(nc).getBytes(UTF8);            System.out.print(String.format("%02X", bytes[0]));            count++;            if (count % 50 == 0) {                System.out.println();            }        }    }}
相关栏目:

用户点评