中文字符,GBK乱码

背景

  • 一些行业的文件数据交互都是用GBK作为字符集
  • GBK字符集之外的字符(例如数学减号 −)写入文件后会变成乱码
  • 文件的字段会被要求定长,不够要用空格补充
  • 哪些字符在GBK编码下占1个字节,哪些字符占2个字节,也是一个需要准确判断的问题

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/**
 * Legacy padding helper for fixed-width fields that are written to a file in GBK.
 *
 * <p>Appends spaces so that (character count + number of double-byte characters)
 * reaches {@code length}. The input is never truncated, and it is returned as-is
 * when its character count is already {@code >= length}, so the result may still
 * exceed {@code length} GBK bytes. {@link #jointSpace(int, String)} measures the
 * encoded byte length directly and is the more accurate variant.
 *
 * @param length target field width
 * @param str    text to pad; {@code null} is treated as an empty string
 * @return the padded text
 */
public static String jointSpaceOld(int length, String str) {
    // Treat null as an empty field instead of failing with a NullPointerException.
    String value = str == null ? "" : str;
    if (value.length() >= length) {
        return value;
    }

    // Every character that encodes to two bytes consumes one extra column.
    int doubleByteCount = 0;
    for (int i = 0; i < value.length(); i++) {
        char c = value.charAt(i);
        try {
            if (Character.toString(c).getBytes(GBK).length == 2) {
                doubleByteCount++;
            }
        } catch (UnsupportedEncodingException e) {
            // The charset is unavailable: fall back to a Unicode-block heuristic,
            // which only approximates the real encoded width.
            if (isAllChineseByBlock(c) || isLatin1Supplement(c)) {
                doubleByteCount++;
            }
        }
    }

    int padding = length - value.length() - doubleByteCount;
    return value + " ".repeat(Math.max(0, padding));
}

/**
 * Fits a string into a fixed-width field measured in encoded bytes, for files written in GBK.
 *
 * <p>Trailing characters are dropped while the encoded form is longer than
 * {@code length}; the result is then right-padded with spaces up to exactly
 * {@code length} bytes (dropping a double-byte character can leave the text one
 * byte short, which the padding fills).
 *
 * @param length target width in bytes; negative values are treated as 0
 * @param str    text to fit; {@code null} is treated as an empty string
 * @return the truncated and/or padded text; a string of {@code length} spaces if
 *         the charset named by {@code GBK} is not supported
 */
public static String jointSpace(int length, String str) {
    // Clamp first: with a negative width the truncation loop used to run on an
    // empty builder and throw StringIndexOutOfBoundsException.
    int target = Math.max(0, length);
    StringBuilder sb = new StringBuilder(StringUtils.defaultString(str));
    try {
        int byteLen = sb.toString().getBytes(GBK).length;
        while (byteLen > target) {
            // Drop the last character and re-measure; widths differ per character.
            sb.setLength(sb.length() - 1);
            byteLen = sb.toString().getBytes(GBK).length;
        }
        if (byteLen < target) {
            // A space is a single byte, so the shortfall equals the number of spaces.
            sb.append(" ".repeat(target - byteLen));
        }
    } catch (UnsupportedEncodingException e) {
        // Encoding is unavailable: emit a blank field of the requested width.
        sb = new StringBuilder(" ".repeat(target));
    }
    return sb.toString();
}

/**
 * Reports whether the character is accepted by either {@code isChineseByBlock}
 * or {@code isChinesePunctuation}.
 *
 * @param c character to classify
 * @return {@code true} if either check matches
 */
public static boolean isAllChineseByBlock(char c) {
    if (isChineseByBlock(c)) {
        return true;
    }
    return isChinesePunctuation(c);
}

/**
 * Decides whether a character is a Chinese ideograph by looking at its Unicode block.
 *
 * <p>Note: the Extension B/C/D and Compatibility Ideographs Supplement blocks
 * lie outside the Basic Multilingual Plane, so a single {@code char} (one UTF-16
 * code unit) never falls into them; they are kept for completeness.
 *
 * @param c character to classify
 * @return {@code true} if the character's block is one of the CJK ideograph blocks
 */
public static boolean isChineseByBlock(char c) {
    final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);

    // Blocks reachable from a single UTF-16 code unit.
    if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        return true;
    }
    if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
        return true;
    }
    if (block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS) {
        return true;
    }

    // Supplementary-plane blocks.
    return block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
            || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C
            || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D
            || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
}

/**
 * Decides whether a character is Chinese by looking at its Unicode script.
 *
 * @param c character to classify
 * @return {@code true} if the character's script is {@code HAN}
 */
public static boolean isChineseByScript(char c) {
    return Character.UnicodeScript.HAN == Character.UnicodeScript.of(c);
}


/**
 * Decides whether a character counts as Chinese punctuation by looking at its Unicode block.
 *
 * @param c character to classify
 * @return {@code true} if the character's block is one of the punctuation/form blocks listed below
 */
public static boolean isChinesePunctuation(char c) {
    final Character.UnicodeBlock actual = Character.UnicodeBlock.of(c);
    final Character.UnicodeBlock[] punctuationBlocks = {
        Character.UnicodeBlock.GENERAL_PUNCTUATION,
        Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION,
        Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
        Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS,
        Character.UnicodeBlock.VERTICAL_FORMS,
    };
    for (Character.UnicodeBlock candidate : punctuationBlocks) {
        if (candidate == actual) {
            return true;
        }
    }
    return false;
}


/**
 * Decides whether a character belongs to the Latin-1 Supplement Unicode block.
 *
 * @param c character to classify
 * @return {@code true} if the character's block is {@code LATIN_1_SUPPLEMENT}
 */
public static boolean isLatin1Supplement(char c) {
    return Character.UnicodeBlock.LATIN_1_SUPPLEMENT == Character.UnicodeBlock.of(c);
}


/**
 * Decides whether a character belongs to the Basic Latin Unicode block.
 *
 * @param c character to classify
 * @return {@code true} if the character's block is {@code BASIC_LATIN}
 */
public static boolean isBasic(char c) {
    final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
    if (block != Character.UnicodeBlock.BASIC_LATIN) {
        return false;
    }
    return true;
}

测试类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
@Test
void gbk() throws UnsupportedEncodingException {
    // Pad a name containing a middle dot to a fixed width of 10 GBK bytes.
    // Each of its three characters encodes to 2 bytes, so the padded field must
    // measure exactly 10 bytes (padding by character count would get this wrong).
    String spaceStr = "张·三";
    String padded = GbkUtils.jointSpace(10, spaceStr);
    log.debug(padded);
    assertTrue(padded.getBytes("GBK").length == 10);

    // Sample covering: mathematical minus, underscore, hyphen, middle dot in a name,
    // Chinese punctuation, full-width digits and traditional characters.
    // Full-width characters are written as Unicode escapes (U+FF0C, U+FF1B,
    // U+FF11..U+FF13) so they cannot be silently folded to their ASCII look-alikes.
    String str = "减法10−2=8,下划线1_2,中横线1-2,少数民族名张·三,中文符号\uFF0C。\uFF1B,全角\uFF11\uFF12\uFF13,繁體張三";
    StringBuilder newStr = new StringBuilder();
    for (int i = 0; i < str.length(); i++) {
        char c = str.charAt(i);
        log.debug("{},gbk_length={},block={},chinese={},latin={}", c, Character.toString(c).getBytes("GBK").length
                , Character.UnicodeBlock.of(c), GbkUtils.isAllChineseByBlock(c), GbkUtils.isLatin1Supplement(c));
        if (GbkUtils.isAllChineseByBlock(c) || GbkUtils.isLatin1Supplement(c) || GbkUtils.isBasic(c)) {
            newStr.append(c);
        } else {
            // Replace characters that none of the checks recognise with '.'
            newStr.append(".");
        }
    }
    // Mathematical minus sign is not treated as Chinese.
    assertFalse(GbkUtils.isAllChineseByBlock("−".charAt(0)));
    assertTrue(GbkUtils.isAllChineseByBlock("中".charAt(0)));
    assertTrue(GbkUtils.isAllChineseByBlock("體".charAt(0)));
    assertTrue(GbkUtils.isAllChineseByBlock("。".charAt(0)));
    // Full-width digit one (U+FF11); the ASCII digit '1' is BASIC_LATIN and would fail here.
    assertTrue(GbkUtils.isAllChineseByBlock("\uFF11".charAt(0)));
    // Basic Latin
    assertTrue(GbkUtils.isBasic("a".charAt(0)));
    log.debug("\n str={}\nGBK的乱码替换为.\nnewStr={}", str, newStr);
}

测试结果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
25: 张·三end
26: 张·三 end
27: 张·三 end
32: 减,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 法,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 1,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 0,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: −,gbk_length=1,block=MATHEMATICAL_OPERATORS,chinese=false,latin=false
32: 2,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: =,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 8,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: ,,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 下,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 划,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 线,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 1,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: _,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 2,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: ,,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 中,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 横,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 线,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 1,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: -,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 2,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: ,,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 少,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 数,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 民,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 族,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 名,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 张,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: ·,gbk_length=2,block=LATIN_1_SUPPLEMENT,chinese=false,latin=true
32: 三,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: ,,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 中,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 文,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 符,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 号,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: ,,gbk_length=2,block=HALFWIDTH_AND_FULLWIDTH_FORMS,chinese=true,latin=false
32: 。,gbk_length=2,block=CJK_SYMBOLS_AND_PUNCTUATION,chinese=true,latin=false
32: ;,gbk_length=2,block=HALFWIDTH_AND_FULLWIDTH_FORMS,chinese=true,latin=false
32: ,,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 全,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 角,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 1,gbk_length=2,block=HALFWIDTH_AND_FULLWIDTH_FORMS,chinese=true,latin=false
32: 2,gbk_length=2,block=HALFWIDTH_AND_FULLWIDTH_FORMS,chinese=true,latin=false
32: 3,gbk_length=2,block=HALFWIDTH_AND_FULLWIDTH_FORMS,chinese=true,latin=false
32: ,,gbk_length=1,block=BASIC_LATIN,chinese=false,latin=false
32: 繁,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 體,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 張,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
32: 三,gbk_length=2,block=CJK_UNIFIED_IDEOGRAPHS,chinese=true,latin=false
49:
str=减法10−2=8,下划线1_2,中横线1-2,少数民族名张·三,中文符号,。;,全角123,繁體張三
GBK的乱码替换为.
newStr=减法10.2=8,下划线1_2,中横线1-2,少数民族名张·三,中文符号,。;,全角123,繁體張三