|
| 1 | +>From 67219f0130ec7c876ac0b299046460fad31caabf Mon Sep 17 00:00:00 2001 |
| 2 | +From: Rich Felker <dalias@aerifal.cx> |
| 3 | +Date: Mon, 30 Mar 2026 16:00:50 -0400 |
| 4 | +Subject: [PATCH] fix pathological slowness & incorrect mappings in iconv |
| 5 | + gb18030 decoder |
| 6 | + |
| 7 | +in order to implement the "UTF" aspect of gb18030 (ability to |
| 8 | +represent arbitrary unicode characters not present in the 2-byte |
| 9 | +mapping), we have to apply the index obtained from the encoded 4-byte |
| 10 | +sequence into the set of unmapped characters. this was done by |
| 11 | +scanning repeatedly over the table of mapped characters and counting |
| 12 | +off mapped characters below a running index by which to adjust the |
| 13 | +running index on each iteration. this iterative process eventually |
| 14 | +leaves us with the value of the Nth unmapped character replacing the |
| 15 | +index, but depending on which particular character that is, the number |
| 16 | +of iterations needed to find it can be in the tens of thousands, and |
| 17 | +each iteration traverses the whole 126x190 table in the inner loop. |
| 18 | +this can lead to run times exceeding an entire second per character on |
| 19 | +moderate-speed machines. |
| 20 | + |
| 21 | +on top of that, the transformation logic produced wrong results for |
| 22 | +BMP characters above the surrogate range, as a result of not |
| 23 | +correctly accounting for it being excluded, and for characters outside |
| 24 | +the BMP, as a result of a misunderstanding of how gb18030 encodes |
| 25 | +them. |
| 26 | + |
| 27 | +this patch replaces the unmapped character lookup with a single linear |
| 28 | +search of a list of unmapped ranges. there are only 206 such ranges, |
| 29 | +and these are permanently assigned and unchangeable as a consequence |
| 30 | +of the character encoding having to be stable, so a simple array of |
| 31 | +16-bit start/length values for each range consumes only 824 bytes, a |
| 32 | +very reasonable size cost here. |
| 33 | + |
| 34 | +this new table accounts for the previously-incorrect surrogate |
| 35 | +handling, and non-BMP characters are handled correctly by a single |
| 36 | +offset, without the need for any unmapped-range search. |
| 37 | + |
| 38 | +there are still a small number of mappings that are incorrect due to |
| 39 | +late changes made in the definition of gb18030, swapping PUA |
| 40 | +codepoints with proper Unicode characters. correcting these requires a |
| 41 | +postprocessing step that will be added later. |
| 42 | +--- |
| 43 | + src/locale/gb18030utf.h | 206 ++++++++++++++++++++++++++++++++++++++++ |
| 44 | + src/locale/iconv.c | 33 +++++-- |
| 45 | + 2 files changed, 230 insertions(+), 9 deletions(-) |
| 46 | + create mode 100644 src/locale/gb18030utf.h |
| 47 | + |
| 48 | +diff --git a/src/locale/gb18030utf.h b/src/locale/gb18030utf.h |
| 49 | +new file mode 100644 |
| 50 | +index 00000000..322a2440 |
| 51 | +--- /dev/null |
| 52 | ++++ b/src/locale/gb18030utf.h |
| 53 | +@@ -0,0 +1,206 @@ |
| 54 | ++{ 0x80, 36 }, |
| 55 | ++{ 0xa5, 2 }, |
| 56 | ++{ 0xa9, 7 }, |
| 57 | ++{ 0xb2, 5 }, |
| 58 | ++{ 0xb8, 31 }, |
| 59 | ++{ 0xd8, 8 }, |
| 60 | ++{ 0xe2, 6 }, |
| 61 | ++{ 0xeb, 1 }, |
| 62 | ++{ 0xee, 4 }, |
| 63 | ++{ 0xf4, 3 }, |
| 64 | ++{ 0xf8, 1 }, |
| 65 | ++{ 0xfb, 1 }, |
| 66 | ++{ 0xfd, 4 }, |
| 67 | ++{ 0x102, 17 }, |
| 68 | ++{ 0x114, 7 }, |
| 69 | ++{ 0x11c, 15 }, |
| 70 | ++{ 0x12c, 24 }, |
| 71 | ++{ 0x145, 3 }, |
| 72 | ++{ 0x149, 4 }, |
| 73 | ++{ 0x14e, 29 }, |
| 74 | ++{ 0x16c, 98 }, |
| 75 | ++{ 0x1cf, 1 }, |
| 76 | ++{ 0x1d1, 1 }, |
| 77 | ++{ 0x1d3, 1 }, |
| 78 | ++{ 0x1d5, 1 }, |
| 79 | ++{ 0x1d7, 1 }, |
| 80 | ++{ 0x1d9, 1 }, |
| 81 | ++{ 0x1db, 1 }, |
| 82 | ++{ 0x1dd, 28 }, |
| 83 | ++{ 0x1fa, 87 }, |
| 84 | ++{ 0x252, 15 }, |
| 85 | ++{ 0x262, 101 }, |
| 86 | ++{ 0x2c8, 1 }, |
| 87 | ++{ 0x2cc, 13 }, |
| 88 | ++{ 0x2da, 183 }, |
| 89 | ++{ 0x3a2, 1 }, |
| 90 | ++{ 0x3aa, 7 }, |
| 91 | ++{ 0x3c2, 1 }, |
| 92 | ++{ 0x3ca, 55 }, |
| 93 | ++{ 0x402, 14 }, |
| 94 | ++{ 0x450, 1 }, |
| 95 | ++{ 0x452, 7102 }, |
| 96 | ++{ 0x2011, 2 }, |
| 97 | ++{ 0x2017, 1 }, |
| 98 | ++{ 0x201a, 2 }, |
| 99 | ++{ 0x201e, 7 }, |
| 100 | ++{ 0x2027, 9 }, |
| 101 | ++{ 0x2031, 1 }, |
| 102 | ++{ 0x2034, 1 }, |
| 103 | ++{ 0x2036, 5 }, |
| 104 | ++{ 0x203c, 112 }, |
| 105 | ++{ 0x20ad, 86 }, |
| 106 | ++{ 0x2104, 1 }, |
| 107 | ++{ 0x2106, 3 }, |
| 108 | ++{ 0x210a, 12 }, |
| 109 | ++{ 0x2117, 10 }, |
| 110 | ++{ 0x2122, 62 }, |
| 111 | ++{ 0x216c, 4 }, |
| 112 | ++{ 0x217a, 22 }, |
| 113 | ++{ 0x2194, 2 }, |
| 114 | ++{ 0x219a, 110 }, |
| 115 | ++{ 0x2209, 6 }, |
| 116 | ++{ 0x2210, 1 }, |
| 117 | ++{ 0x2212, 3 }, |
| 118 | ++{ 0x2216, 4 }, |
| 119 | ++{ 0x221b, 2 }, |
| 120 | ++{ 0x2221, 2 }, |
| 121 | ++{ 0x2224, 1 }, |
| 122 | ++{ 0x2226, 1 }, |
| 123 | ++{ 0x222c, 2 }, |
| 124 | ++{ 0x222f, 5 }, |
| 125 | ++{ 0x2238, 5 }, |
| 126 | ++{ 0x223e, 10 }, |
| 127 | ++{ 0x2249, 3 }, |
| 128 | ++{ 0x224d, 5 }, |
| 129 | ++{ 0x2253, 13 }, |
| 130 | ++{ 0x2262, 2 }, |
| 131 | ++{ 0x2268, 6 }, |
| 132 | ++{ 0x2270, 37 }, |
| 133 | ++{ 0x2296, 3 }, |
| 134 | ++{ 0x229a, 11 }, |
| 135 | ++{ 0x22a6, 25 }, |
| 136 | ++{ 0x22c0, 82 }, |
| 137 | ++{ 0x2313, 333 }, |
| 138 | ++{ 0x246a, 10 }, |
| 139 | ++{ 0x249c, 100 }, |
| 140 | ++{ 0x254c, 4 }, |
| 141 | ++{ 0x2574, 13 }, |
| 142 | ++{ 0x2590, 3 }, |
| 143 | ++{ 0x2596, 10 }, |
| 144 | ++{ 0x25a2, 16 }, |
| 145 | ++{ 0x25b4, 8 }, |
| 146 | ++{ 0x25be, 8 }, |
| 147 | ++{ 0x25c8, 3 }, |
| 148 | ++{ 0x25cc, 2 }, |
| 149 | ++{ 0x25d0, 18 }, |
| 150 | ++{ 0x25e6, 31 }, |
| 151 | ++{ 0x2607, 2 }, |
| 152 | ++{ 0x260a, 54 }, |
| 153 | ++{ 0x2641, 1 }, |
| 154 | ++{ 0x2643, 2110 }, |
| 155 | ++{ 0x2e82, 2 }, |
| 156 | ++{ 0x2e85, 3 }, |
| 157 | ++{ 0x2e89, 2 }, |
| 158 | ++{ 0x2e8d, 10 }, |
| 159 | ++{ 0x2e98, 15 }, |
| 160 | ++{ 0x2ea8, 2 }, |
| 161 | ++{ 0x2eab, 3 }, |
| 162 | ++{ 0x2eaf, 4 }, |
| 163 | ++{ 0x2eb4, 2 }, |
| 164 | ++{ 0x2eb8, 3 }, |
| 165 | ++{ 0x2ebc, 14 }, |
| 166 | ++{ 0x2ecb, 293 }, |
| 167 | ++{ 0x2ffc, 4 }, |
| 168 | ++{ 0x3004, 1 }, |
| 169 | ++{ 0x3018, 5 }, |
| 170 | ++{ 0x301f, 2 }, |
| 171 | ++{ 0x302a, 20 }, |
| 172 | ++{ 0x303f, 2 }, |
| 173 | ++{ 0x3094, 7 }, |
| 174 | ++{ 0x309f, 2 }, |
| 175 | ++{ 0x30f7, 5 }, |
| 176 | ++{ 0x30ff, 6 }, |
| 177 | ++{ 0x312a, 246 }, |
| 178 | ++{ 0x322a, 7 }, |
| 179 | ++{ 0x3232, 113 }, |
| 180 | ++{ 0x32a4, 234 }, |
| 181 | ++{ 0x3390, 12 }, |
| 182 | ++{ 0x339f, 2 }, |
| 183 | ++{ 0x33a2, 34 }, |
| 184 | ++{ 0x33c5, 9 }, |
| 185 | ++{ 0x33cf, 2 }, |
| 186 | ++{ 0x33d3, 2 }, |
| 187 | ++{ 0x33d6, 113 }, |
| 188 | ++{ 0x3448, 43 }, |
| 189 | ++{ 0x3474, 298 }, |
| 190 | ++{ 0x359f, 111 }, |
| 191 | ++{ 0x360f, 11 }, |
| 192 | ++{ 0x361b, 765 }, |
| 193 | ++{ 0x3919, 85 }, |
| 194 | ++{ 0x396f, 96 }, |
| 195 | ++{ 0x39d1, 14 }, |
| 196 | ++{ 0x39e0, 147 }, |
| 197 | ++{ 0x3a74, 218 }, |
| 198 | ++{ 0x3b4f, 287 }, |
| 199 | ++{ 0x3c6f, 113 }, |
| 200 | ++{ 0x3ce1, 885 }, |
| 201 | ++{ 0x4057, 264 }, |
| 202 | ++{ 0x4160, 471 }, |
| 203 | ++{ 0x4338, 116 }, |
| 204 | ++{ 0x43ad, 4 }, |
| 205 | ++{ 0x43b2, 43 }, |
| 206 | ++{ 0x43de, 248 }, |
| 207 | ++{ 0x44d7, 373 }, |
| 208 | ++{ 0x464d, 20 }, |
| 209 | ++{ 0x4662, 193 }, |
| 210 | ++{ 0x4724, 5 }, |
| 211 | ++{ 0x472a, 82 }, |
| 212 | ++{ 0x477d, 16 }, |
| 213 | ++{ 0x478e, 441 }, |
| 214 | ++{ 0x4948, 50 }, |
| 215 | ++{ 0x497b, 2 }, |
| 216 | ++{ 0x497e, 4 }, |
| 217 | ++{ 0x4984, 1 }, |
| 218 | ++{ 0x4987, 20 }, |
| 219 | ++{ 0x499c, 3 }, |
| 220 | ++{ 0x49a0, 22 }, |
| 221 | ++{ 0x49b8, 703 }, |
| 222 | ++{ 0x4c78, 39 }, |
| 223 | ++{ 0x4ca4, 111 }, |
| 224 | ++{ 0x4d1a, 148 }, |
| 225 | ++{ 0x4daf, 81 }, |
| 226 | ++{ 0x9fa6, 14426 }, |
| 227 | ++{ 0xe76c, 1 }, |
| 228 | ++{ 0xe7c8, 1 }, |
| 229 | ++{ 0xe7e7, 13 }, |
| 230 | ++{ 0xe815, 1 }, |
| 231 | ++{ 0xe819, 5 }, |
| 232 | ++{ 0xe81f, 7 }, |
| 233 | ++{ 0xe827, 4 }, |
| 234 | ++{ 0xe82d, 4 }, |
| 235 | ++{ 0xe833, 8 }, |
| 236 | ++{ 0xe83c, 7 }, |
| 237 | ++{ 0xe844, 16 }, |
| 238 | ++{ 0xe856, 14 }, |
| 239 | ++{ 0xe865, 4295 }, |
| 240 | ++{ 0xf92d, 76 }, |
| 241 | ++{ 0xf97a, 27 }, |
| 242 | ++{ 0xf996, 81 }, |
| 243 | ++{ 0xf9e8, 9 }, |
| 244 | ++{ 0xf9f2, 26 }, |
| 245 | ++{ 0xfa10, 1 }, |
| 246 | ++{ 0xfa12, 1 }, |
| 247 | ++{ 0xfa15, 3 }, |
| 248 | ++{ 0xfa19, 6 }, |
| 249 | ++{ 0xfa22, 1 }, |
| 250 | ++{ 0xfa25, 2 }, |
| 251 | ++{ 0xfa2a, 1030 }, |
| 252 | ++{ 0xfe32, 1 }, |
| 253 | ++{ 0xfe45, 4 }, |
| 254 | ++{ 0xfe53, 1 }, |
| 255 | ++{ 0xfe58, 1 }, |
| 256 | ++{ 0xfe67, 1 }, |
| 257 | ++{ 0xfe6c, 149 }, |
| 258 | ++{ 0xff5f, 129 }, |
| 259 | ++{ 0xffe6, 26 }, |
| 260 | +diff --git a/src/locale/iconv.c b/src/locale/iconv.c |
| 261 | +index 52178950..4151411d 100644 |
| 262 | +--- a/src/locale/iconv.c |
| 263 | ++++ b/src/locale/iconv.c |
| 264 | +@@ -74,6 +74,10 @@ static const unsigned short gb18030[126][190] = { |
| 265 | + #include "gb18030.h" |
| 266 | + }; |
| 267 | + |
| 268 | ++static const unsigned short gb18030utf[][2] = { |
| 269 | ++#include "gb18030utf.h" |
| 270 | ++}; |
| 271 | ++ |
| 272 | + static const unsigned short big5[89][157] = { |
| 273 | + #include "big5.h" |
| 274 | + }; |
| 275 | +@@ -224,6 +228,8 @@ static unsigned uni_to_jis(unsigned c) |
| 276 | + } |
| 277 | + } |
| 278 | + |
| 279 | ++#define countof(a) (sizeof (a) / sizeof *(a)) |
| 280 | ++ |
| 281 | + size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb) |
| 282 | + { |
| 283 | + size_t x=0; |
| 284 | +@@ -430,15 +436,24 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri |
| 285 | + d = *((unsigned char *)*in + 3); |
| 286 | + if (d-'0'>9) goto ilseq; |
| 287 | + c += d-'0'; |
| 288 | +- c += 128; |
| 289 | +- for (d=0; d<=c; ) { |
| 290 | +- k = 0; |
| 291 | +- for (int i=0; i<126; i++) |
| 292 | +- for (int j=0; j<190; j++) |
| 293 | +- if (gb18030[i][j]-d <= c-d) |
| 294 | +- k++; |
| 295 | +- d = c+1; |
| 296 | +- c += k; |
| 297 | ++ /* Starting at 90 30 81 30 (189000), mapping is |
| 298 | ++ * linear without gaps, to U+10000 and up. */ |
| 299 | ++ if (c >= 189000) { |
| 300 | ++ c -= 189000; |
| 301 | ++ c += 0x10000; |
| 302 | ++ if (c >= 0x110000) goto ilseq; |
| 303 | ++ break; |
| 304 | ++ } |
| 305 | ++ /* Otherwise we must process an index into set |
| 306 | ++ * of characters unmapped by 2-byte table. */ |
| 307 | ++ for (int i=0; ; i++) { |
| 308 | ++ if (i==countof(gb18030utf)) |
| 309 | ++ goto ilseq; |
| 310 | ++ if (c<gb18030utf[i][1]) { |
| 311 | ++ c += gb18030utf[i][0]; |
| 312 | ++ break; |
| 313 | ++ } |
| 314 | ++ c -= gb18030utf[i][1]; |
| 315 | + } |
| 316 | + break; |
| 317 | + } |
| 318 | +-- |
| 319 | +2.21.0 |
| 320 | + |
| 321 | + |
0 commit comments