java判断utf8字符包含几个字节(utf8mb4判断),,在mysql中设置utf
分享于 点击 29679 次 点评:60
java判断utf8字符包含几个字节(utf8mb4判断),,在mysql中设置utf
MySQL 的 utf8 字符集(例如使用排序规则 utf8_general_ci 的库表)每个字符最多存储 3 个字节, 无法存储 4 字节的 UTF-8 字符(如 emoji); 向这样的列插入 4 字节字符时会出问题。为了避免此类问题, 可以提前对字符串做判断或替换, 判断方法的代码片段如下:
import java.nio.charset.Charset;

/**
 * Helpers for applications that accept arbitrary UTF-8 text but persist it in a MySQL
 * column configured as {@code utf8} / {@code utf8_general_ci}, which cannot store
 * characters whose UTF-8 encoding is four bytes long.
 *
 * <p>A character needs four UTF-8 bytes exactly when it is a supplementary code point
 * (U+10000 and above). A Java {@link String} stores such a code point as a high
 * surrogate immediately followed by a low surrogate, so both methods look for
 * surrogate pairs instead of encoding the string to bytes first.
 *
 * <p>Unpaired surrogates are not treated as four-byte characters.
 *
 * <p>Created by zhaoyukai on 2017/9/28.
 */
public class UTF8MB4 {

    /**
     * Tells whether the string contains at least one character that UTF-8 encodes
     * with four bytes.
     *
     * @param input the string to inspect; may be {@code null}
     * @return {@code true} if a four-byte character is present; {@code false} otherwise,
     *         including when {@code input} is {@code null} or empty
     */
    public static boolean containsMb4Char(String input) {
        if (input == null) {
            return false;
        }
        final int length = input.length();
        // A pair needs two chars, so the last index can never start one.
        for (int i = 0; i + 1 < length; i++) {
            if (Character.isHighSurrogate(input.charAt(i))
                    && Character.isLowSurrogate(input.charAt(i + 1))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces every character that UTF-8 encodes with four bytes by {@code replacement}.
     * All other characters, including unpaired surrogates, are copied through unchanged.
     *
     * @param input       the string to clean; must not be {@code null}
     * @param replacement the text substituted for each four-byte character; {@code null}
     *                    is treated as the empty string, i.e. the character is removed
     * @return the string with every four-byte character replaced
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public static String replaceMb4Char(String input, String replacement) {
        if (input == null) {
            throw new IllegalArgumentException("input can not be null when replaceMb4Char");
        }
        // StringBuilder.append((String) null) would write the text "null" into the result.
        final String substitute = replacement == null ? "" : replacement;
        final int length = input.length();
        final StringBuilder sb = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            if (Character.isSupplementaryCodePoint(input.codePointAt(i))) {
                sb.append(substitute);
                // A supplementary code point occupies two chars in the string.
                i += 2;
            } else {
                sb.append(input.charAt(i));
                i++;
            }
        }
        return sb.toString();
    }
}
如下是单元测试代码:
import org.apache.commons.io.IOUtils;import org.junit.Assert;import org.junit.Test;import org.springframework.util.StringUtils;import java.io.IOException;import java.io.InputStream;import java.nio.charset.Charset;import java.util.List;/** * Created by zhaoyukai on 2017/9/27. */public class Utf8Mb4Test { private final static Charset UTF8 = Charset.forName("UTF-8"); @Test public void testReplacement() { String input = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94哈哈哈"; String output = UTF8MB4.replaceMb4Char(input, ""); String expect = "A啊中哈哈哈"; Assert.assertEquals(expect, output); } @Test public void testContainsMb4() { testFalse("呵呵呵"); testFalse("AAAA"); testFalse(",,,"); testFalse("中国。。,&………………"); testFalse("我们mmm他们..你们abcdddd牛m"); } @Test public void testReplacePerf() { long startMm = System.currentTimeMillis(); int times = 10000000; while (times > 0) { UTF8MB4.replaceMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941", ""); times--; } long end = System.currentTimeMillis(); long used = end - startMm; System.out.println(used); } @Test public void testContainsPerf() { long startMm = System.currentTimeMillis(); int times = 10000000; while (times > 0) { UTF8MB4.containsMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941"); times--; } long end = System.currentTimeMillis(); long used = end - startMm; System.out.println(used); } @Test public void testContainsMb4True() throws IOException { String input = IOUtils.toString(getClass().getClassLoader().getResourceAsStream("contains.txt"), UTF8); testTrue(input); testTrue("\"A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94\""); } @Test public void testContainsCharsInFile() throws IOException { InputStream stream = null; try { stream = getClass().getClassLoader().getResourceAsStream("contains.txt"); List<String> lines = IOUtils.readLines(stream, UTF8); for (String line : lines) { if (StringUtils.isEmpty(line)) { continue; } char first = line.charAt(0); String last = line.substring(1); boolean expectContains = first == '1'; boolean 
actualContains = UTF8MB4.containsMb4Char(last); Assert.assertEquals(String.format("%s is %s but %s", last, expectContains, actualContains), expectContains, actualContains); } } finally { IOUtils.closeQuietly(stream); } } @Test public void testReplaceCharsInFile() throws IOException { InputStream stream = null; try { stream = getClass().getClassLoader().getResourceAsStream("replace.txt"); List<String> lines = IOUtils.readLines(stream, UTF8); for (String line : lines) { if (StringUtils.isEmpty(line)) { continue; } int idxEq = line.indexOf('='); if (idxEq == -1) { throw new RuntimeException("测试文本错误, 未按=分隔"); } String afterReplace = line.substring(0, idxEq); String beforeReplace = line.substring(idxEq+1); String real = UTF8MB4.replaceMb4Char(beforeReplace, ""); String expect = afterReplace; Assert.assertEquals(expect, real); } } finally { IOUtils.closeQuietly(stream); } } void testTrue(String input) { boolean contains = UTF8MB4.containsMb4Char(input); Assert.assertTrue(contains); } void testFalse(String input) { boolean contains = UTF8MB4.containsMb4Char(input); Assert.assertFalse(contains); } @Test public void testMb4() { String chs = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94";// byte[] bytes = chs.getBytes(UTF8); byte[] masks = { (byte) 0X00, (byte) 0X80, (byte) 0XC0, (byte) 0XE0, }; char c = 'ﭾ'; int count = 1; System.out.println(String.format("%02X", (int) c)); while (count < 110000) { char nc = (char) ((int) c + count); System.out.print(nc); byte[] bytes = String.valueOf(nc).getBytes(UTF8); System.out.print(String.format("%02X", bytes[0])); count++; if (count % 50 == 0) { System.out.println(); } } }}
用户点评