/*
 * Copyright (C) 2002 by Red Hat, Incorporated. All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this software
 * is freely granted, provided that this notice is preserved.
 *
 * Tests gleaned from Markus Kuhn's UTF-8 and Unicode FAQ,
 * and specifically, his UTF-8-test.txt decoder stress test file.
 */
---|
| 10 | |
---|
| 11 | #include <stdio.h> |
---|
| 12 | #include <stdlib.h> |
---|
| 13 | #include <locale.h> |
---|
| 14 | |
---|
/* Byte limit handed to mbtowc() by the tests that decode a complete
   sequence; it is deliberately larger than any row of the tables below.  */
#define MAX_BYTES 65

/* Scans LEN bytes starting at S one offset at a time with mbtowc().
   Yields the number of rejected offsets, or -1 as soon as mbtowc()
   accepts anything.  Defined at the end of this file.  */
int num_invalid(const char *s, int len);
---|
| 18 | |
---|
/* Test 2.1: first possible sequence of each length.  Row N holds the
   (N + 1)-byte sequence; bytes not listed are zero-filled.  */
char first[6][6] = {
  [0] = { 0x0 },                                  /* U-00000000 */
  [1] = { 0xc2, 0x80 },                           /* U-00000080 */
  [2] = { 0xe0, 0xa0, 0x80 },                     /* U-00000800 */
  [3] = { 0xf0, 0x90, 0x80, 0x80 },               /* U-00010000 */
  [4] = { 0xf8, 0x88, 0x80, 0x80, 0x80 },         /* U-00200000 */
  [5] = { 0xfc, 0x84, 0x80, 0x80, 0x80, 0x80 }    /* U-04000000 */
};
---|
| 27 | |
---|
/* Test 2.2: last possible sequence of each length.  Row N holds the
   (N + 1)-byte sequence; bytes not listed are zero-filled.  */
char last[6][6] = {
  [0] = { 0x7f },                                 /* U-0000007F */
  [1] = { 0xdf, 0xbf },                           /* U-000007FF */
  [2] = { 0xef, 0xbf, 0xbf },                     /* U-0000FFFF */
  [3] = { 0xf7, 0xbf, 0xbf, 0xbf },               /* U-001FFFFF */
  [4] = { 0xfb, 0xbf, 0xbf, 0xbf, 0xbf },         /* U-03FFFFFF */
  [5] = { 0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf }    /* U-7FFFFFFF */
};
---|
| 36 | |
---|
/* Test 2.3: other boundary conditions.  The first three rows are
   three-byte sequences, the last two are four-byte sequences.  */
char boundary[5][6] = {
  [0] = { 0xed, 0x9f, 0xbf },          /* U-0000D7FF */
  [1] = { 0xee, 0x80, 0x80 },          /* U-0000E000 */
  [2] = { 0xef, 0xbf, 0xbd },          /* U-0000FFFD */
  [3] = { 0xf4, 0x8f, 0xbf, 0xbf },    /* U-0010FFFF */
  [4] = { 0xf4, 0x90, 0x80, 0x80 }     /* U-00110000 */
};
---|
| 44 | |
---|
/* Test 3.1: unexpected continuation bytes.  Rows 0 and 1 hold a single
   continuation byte each; row N (N >= 2) holds N bytes alternating
   between 0x80 and 0xbf.  Bytes not listed are zero-filled.  */
char continuation_bytes[8][7] = {
  [0] = { 0x80 },
  [1] = { 0xbf },
  [2] = { 0x80, 0xbf },
  [3] = { 0x80, 0xbf, 0x80 },
  [4] = { 0x80, 0xbf, 0x80, 0xbf },
  [5] = { 0x80, 0xbf, 0x80, 0xbf, 0x80 },
  [6] = { 0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf },
  [7] = { 0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0x80 }
};
---|
| 55 | |
---|
/* Scratch buffers filled by main() at run time; each one is sized to
   hold every byte value of one lead-byte / continuation-byte range.  */

/* Test 3.1.9: the continuation bytes 0x80..0xbf.  */
char all_continuation_bytes[0xc0 - 0x80];

/* Test 3.2: lonely start bytes, one buffer per sequence length.  */
char all_two_byte_seq[0xe0 - 0xc0];      /* 0xc0..0xdf */
char all_three_byte_seq[0xf0 - 0xe0];    /* 0xe0..0xef */
char all_four_byte_seq[0xf8 - 0xf0];     /* 0xf0..0xf7 */
char all_five_byte_seq[0xfc - 0xf8];     /* 0xf8..0xfb */
char all_six_byte_seq[0xfe - 0xfc];      /* 0xfc..0xfd */
---|
| 64 | |
---|
/* Test 3.3: sequences with the last continuation byte missing.
   Rows 0-4 and rows 5-9 each hold 1, 2, 3, 4 and 5 bytes respectively;
   the trailing comment names the code point associated with each row.  */
char incomplete_seq[10][6] = {
  [0] = { 0xc2 },                            /* U-00000080 */
  [1] = { 0xe0, 0x80 },                      /* U-00000800 */
  [2] = { 0xf0, 0x80, 0x80 },                /* U-00010000 */
  [3] = { 0xf8, 0x80, 0x80, 0x80 },          /* U-00200000 */
  [4] = { 0xfc, 0x80, 0x80, 0x80, 0x80 },    /* U-04000000 */
  [5] = { 0xdf },                            /* U-000007FF */
  [6] = { 0xef, 0xbf },                      /* U-0000FFFF */
  [7] = { 0xf7, 0xbf, 0xbf },                /* U-001FFFFF */
  [8] = { 0xfb, 0xbf, 0xbf, 0xbf },          /* U-03FFFFFF */
  [9] = { 0xfd, 0xbf, 0xbf, 0xbf, 0xbf }     /* U-7FFFFFFF */
};
---|
| 77 | |
---|
/* Buffer reserved for test 3.4 (concatenation of incomplete sequences).
   The size equals the total number of bytes listed in incomplete_seq:
   two groups of 1 + 2 + 3 + 4 + 5 bytes.  */
char incomplete_seq_concat[2 * (1 + 2 + 3 + 4 + 5)];
---|
| 79 | |
---|
/* Test 3.5: impossible bytes -- 0xfe alone, 0xff alone, and the
   four-byte run fe fe ff ff.  Bytes not listed are zero-filled.  */
char impossible_bytes[3][4] = {
  [0] = { 0xfe },
  [1] = { 0xff },
  [2] = { 0xfe, 0xfe, 0xff, 0xff }
};
---|
| 85 | |
---|
/* Test 4.1: overlong encodings of an ASCII character.  Each row pads the
   payload 0x2f out to 2, 3, 4, 5 and 6 bytes respectively.  */
char overlong[5][6] = {
  [0] = { 0xc0, 0xaf },
  [1] = { 0xe0, 0x80, 0xaf },
  [2] = { 0xf0, 0x80, 0x80, 0xaf },
  [3] = { 0xf8, 0x80, 0x80, 0x80, 0xaf },
  [4] = { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf }
};
---|
| 93 | |
---|
/* Test 4.2: maximum overlong sequences -- for each length from 2 to 6
   bytes, the largest value that still fits in a shorter encoding.  */
char overlong_max[5][6] = {
  [0] = { 0xc1, 0xbf },                           /* 0x0000007f */
  [1] = { 0xe0, 0x9f, 0xbf },                     /* 0x000007ff */
  [2] = { 0xf0, 0x8f, 0xbf, 0xbf },               /* 0x0000ffff */
  [3] = { 0xf8, 0x87, 0xbf, 0xbf, 0xbf },         /* 0x001fffff */
  [4] = { 0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf }    /* 0x03ffffff */
};
---|
| 101 | |
---|
/* Test 4.3: overlong representations of the NUL character, using
   2, 3, 4, 5 and 6 bytes respectively.  */
char overlong_nul[5][6] = {
  [0] = { 0xc0, 0x80 },
  [1] = { 0xe0, 0x80, 0x80 },
  [2] = { 0xf0, 0x80, 0x80, 0x80 },
  [3] = { 0xf8, 0x80, 0x80, 0x80, 0x80 },
  [4] = { 0xfc, 0x80, 0x80, 0x80, 0x80, 0x80 }
};
---|
| 109 | |
---|
/* Test 5.1: single UTF-16 surrogates, each written as a three-byte
   sequence.  Rows are exactly three bytes wide, so no terminator follows.  */
char single_surrogates[7][3] = {
  [0] = { 0xed, 0xa0, 0x80 },    /* D800 */
  [1] = { 0xed, 0xad, 0xbf },    /* DB7F */
  [2] = { 0xed, 0xae, 0x80 },    /* DB80 */
  [3] = { 0xed, 0xaf, 0xbf },    /* DBFF */
  [4] = { 0xed, 0xb0, 0x80 },    /* DC00 */
  [5] = { 0xed, 0xbe, 0x80 },    /* DF80 */
  [6] = { 0xed, 0xbf, 0xbf }     /* DFFF */
};
---|
| 119 | |
---|
/* Test 5.2: paired UTF-16 surrogates.  Every row is a high surrogate
   followed by a low surrogate, each written as a three-byte sequence.  */
char paired_surrogates[8][6] = {
  [0] = { 0xed, 0xa0, 0x80,  0xed, 0xb0, 0x80 },    /* D800 DC00 */
  [1] = { 0xed, 0xa0, 0x80,  0xed, 0xbf, 0xbf },    /* D800 DFFF */
  [2] = { 0xed, 0xad, 0xbf,  0xed, 0xb0, 0x80 },    /* DB7F DC00 */
  [3] = { 0xed, 0xad, 0xbf,  0xed, 0xbf, 0xbf },    /* DB7F DFFF */
  [4] = { 0xed, 0xae, 0x80,  0xed, 0xb0, 0x80 },    /* DB80 DC00 */
  [5] = { 0xed, 0xae, 0x80,  0xed, 0xbf, 0xbf },    /* DB80 DFFF */
  [6] = { 0xed, 0xaf, 0xbf,  0xed, 0xb0, 0x80 },    /* DBFF DC00 */
  [7] = { 0xed, 0xaf, 0xbf,  0xed, 0xbf, 0xbf }     /* DBFF DFFF */
};
---|
| 130 | |
---|
/* Test 5.3: other illegal code positions.  Each row lists two bytes and
   is zero-filled to three.
   NOTE(review): these rows hold the raw bytes ff fe / ff ff, not the
   three-byte UTF-8 forms ef bf be / ef bf bf of U+FFFE / U+FFFF --
   confirm which of the two the test is meant to exercise.  */
char illegal_pos[2][3] = {
  [0] = { 0xff, 0xfe },
  [1] = { 0xff, 0xff }
};
---|
| 135 | |
---|
| 136 | int main() |
---|
| 137 | { |
---|
| 138 | wchar_t wchar; |
---|
| 139 | int retval; |
---|
| 140 | int i; |
---|
| 141 | |
---|
| 142 | if (!setlocale(LC_CTYPE, "C-UTF-8")) |
---|
| 143 | { |
---|
| 144 | printf("Failed to set C-UTF-8 locale.\n"); |
---|
| 145 | return 1; |
---|
| 146 | } |
---|
| 147 | else |
---|
| 148 | printf("Set C-UTF-8 locale.\n"); |
---|
| 149 | |
---|
| 150 | /* 2 Boundary condition test cases */ |
---|
| 151 | /* 2.1 First possible sequence of a certain length */ |
---|
| 152 | retval = mbtowc(&wchar, first[0], MAX_BYTES); |
---|
| 153 | if (retval == 0) |
---|
| 154 | printf("2.1.1: U-%08d\n", wchar); |
---|
| 155 | else |
---|
| 156 | printf("2.1.1: Invalid\n"); |
---|
| 157 | |
---|
| 158 | for (i = 2; i < 7; i++) |
---|
| 159 | { |
---|
| 160 | retval = mbtowc (&wchar, first[i-1], MAX_BYTES); |
---|
| 161 | if (retval == i) |
---|
| 162 | printf("2.1.%d: U-%08x\n", i, wchar); |
---|
| 163 | else |
---|
| 164 | printf("2.1.%d: Invalid\n", i); |
---|
| 165 | } |
---|
| 166 | |
---|
| 167 | /* 2.2 Last possible sequence of a certain length */ |
---|
| 168 | for (i = 1; i < 7; i++) |
---|
| 169 | { |
---|
| 170 | retval = mbtowc (&wchar, last[i-1], MAX_BYTES); |
---|
| 171 | if (retval == i) |
---|
| 172 | printf("2.2.%d: U-%08x\n", i, wchar); |
---|
| 173 | else |
---|
| 174 | printf("2.2.%d: Invalid\n", i); |
---|
| 175 | } |
---|
| 176 | |
---|
| 177 | /* 2.3 Other boundary conditions */ |
---|
| 178 | for (i = 1; i < 6; i++) |
---|
| 179 | { |
---|
| 180 | retval = mbtowc (&wchar, boundary[i-1], MAX_BYTES); |
---|
| 181 | if ((i < 4 && retval == 3) || (i > 3 && retval == 4)) |
---|
| 182 | printf("2.3.%d: U-%08x\n", i, wchar); |
---|
| 183 | else |
---|
| 184 | printf("2.3.%d: Invalid\n", i); |
---|
| 185 | } |
---|
| 186 | |
---|
| 187 | /* 3 Malformed sequences */ |
---|
| 188 | /* 3.1 Unexpected continuation bytes */ |
---|
| 189 | retval = mbtowc (&wchar, continuation_bytes[0], MAX_BYTES); |
---|
| 190 | if (retval == 1) |
---|
| 191 | printf("3.1.1: U-%08x\n", wchar); |
---|
| 192 | else |
---|
| 193 | printf("3.1.1: 1 Invalid\n"); |
---|
| 194 | |
---|
| 195 | retval = mbtowc (&wchar, continuation_bytes[1], MAX_BYTES); |
---|
| 196 | if (retval == 1) |
---|
| 197 | printf("3.1.2: U-%08x\n", wchar); |
---|
| 198 | else |
---|
| 199 | printf("3.1.2: 1 Invalid\n"); |
---|
| 200 | |
---|
| 201 | for(i=2; i< 8; i++) |
---|
| 202 | { |
---|
| 203 | retval = num_invalid(continuation_bytes[i], i); |
---|
| 204 | if (retval == -1) |
---|
| 205 | printf("3.1.%d: Valid Character Found\n", i+1); |
---|
| 206 | else |
---|
| 207 | printf("3.1.%d: %d Invalid\n", i+1, retval); |
---|
| 208 | } |
---|
| 209 | |
---|
| 210 | for(i = 0x80; i < 0xc0; i++) |
---|
| 211 | all_continuation_bytes[i-0x80] = i; |
---|
| 212 | |
---|
| 213 | retval = num_invalid(all_continuation_bytes, 0xc0 - 0x80); |
---|
| 214 | if (retval == -1) |
---|
| 215 | printf("3.1.9: Valid Character Found\n"); |
---|
| 216 | else |
---|
| 217 | printf("3.1.9: %d Invalid\n", retval); |
---|
| 218 | |
---|
| 219 | /* 3.2 Lonely start characters */ |
---|
| 220 | for(i = 0xc0; i < 0xe0; i++) |
---|
| 221 | all_two_byte_seq[i-0xc0] = i; |
---|
| 222 | |
---|
| 223 | retval = num_invalid(all_two_byte_seq, 0xe0 - 0xc0); |
---|
| 224 | if (retval == -1) |
---|
| 225 | printf("3.2.1: Valid Character Found\n"); |
---|
| 226 | else |
---|
| 227 | printf("3.2.1: %d Invalid\n", retval); |
---|
| 228 | |
---|
| 229 | for(i = 0xe0; i < 0xf0; i++) |
---|
| 230 | all_three_byte_seq[i-0xe0] = i; |
---|
| 231 | |
---|
| 232 | retval = num_invalid(all_three_byte_seq, 0xf0 - 0xe0); |
---|
| 233 | if (retval == -1) |
---|
| 234 | printf("3.2.2: Valid Character Found\n"); |
---|
| 235 | else |
---|
| 236 | printf("3.2.2: %d Invalid\n", retval); |
---|
| 237 | |
---|
| 238 | for(i = 0xf0; i < 0xf8; i++) |
---|
| 239 | all_four_byte_seq[i-0xf0] = i; |
---|
| 240 | |
---|
| 241 | retval = num_invalid(all_four_byte_seq, 0xf8 - 0xf0); |
---|
| 242 | if (retval == -1) |
---|
| 243 | printf("3.2.3: Valid Character Found\n"); |
---|
| 244 | else |
---|
| 245 | printf("3.2.3: %d Invalid\n", retval); |
---|
| 246 | |
---|
| 247 | for(i = 0xf8; i < 0xfc; i++) |
---|
| 248 | all_five_byte_seq[i-0xf8] = i; |
---|
| 249 | |
---|
| 250 | retval = num_invalid(all_five_byte_seq, 0xfc - 0xf8); |
---|
| 251 | if (retval == -1) |
---|
| 252 | printf("3.2.4: Valid Character Found\n"); |
---|
| 253 | else |
---|
| 254 | printf("3.2.4: %d Invalid\n", retval); |
---|
| 255 | |
---|
| 256 | for(i = 0xfc; i < 0xfe; i++) |
---|
| 257 | all_six_byte_seq[i-0xfc] = i; |
---|
| 258 | |
---|
| 259 | retval = num_invalid(all_six_byte_seq, 0xfe - 0xfc); |
---|
| 260 | if (retval == -1) |
---|
| 261 | printf("3.2.5: Valid Character Found\n"); |
---|
| 262 | else |
---|
| 263 | printf("3.2.5: %d Invalid\n", retval); |
---|
| 264 | |
---|
| 265 | /* 3.3 Sequences with last continuation byte missing */ |
---|
| 266 | for(i = 1; i < 6; i++) |
---|
| 267 | { |
---|
| 268 | retval = mbtowc(&wchar, incomplete_seq[i-1], i); |
---|
| 269 | if(retval == -1) |
---|
| 270 | printf("3.3.%d: 1 Invalid\n", i); |
---|
| 271 | else |
---|
| 272 | printf("3.3.%d: Valid Character Found\n", i); |
---|
| 273 | } |
---|
| 274 | |
---|
| 275 | for(i = 6; i < 11; i++) |
---|
| 276 | { |
---|
| 277 | retval = mbtowc(&wchar, incomplete_seq[i-1], i - 5); |
---|
| 278 | if(retval == -1) |
---|
| 279 | printf("3.3.%d: 1 Invalid\n", i); |
---|
| 280 | else |
---|
| 281 | printf("3.3.%d: Valid Character Found\n", i); |
---|
| 282 | } |
---|
| 283 | |
---|
| 284 | /* 3.4 Concatenation of incomplete sequences */ |
---|
| 285 | /* This test is excluded because the mbtowc function does not return the |
---|
| 286 | number of bytes read in an invalid multi-byte sequence. */ |
---|
| 287 | |
---|
| 288 | /* 3.5 Impossible bytes */ |
---|
| 289 | retval = mbtowc(&wchar, impossible_bytes[0], 1); |
---|
| 290 | if(retval == -1) |
---|
| 291 | printf("3.5.1: 1 Invalid\n"); |
---|
| 292 | else |
---|
| 293 | printf("3.5.1: Valid Character Found\n"); |
---|
| 294 | |
---|
| 295 | retval = mbtowc(&wchar, impossible_bytes[1], 1); |
---|
| 296 | if(retval == -1) |
---|
| 297 | printf("3.5.2: 1 Invalid\n"); |
---|
| 298 | else |
---|
| 299 | printf("3.5.2: Valid Character Found\n"); |
---|
| 300 | |
---|
| 301 | retval = mbtowc(&wchar, impossible_bytes[2], 4); |
---|
| 302 | if(retval == -1) |
---|
| 303 | printf("3.5.3: 1 Invalid\n"); |
---|
| 304 | else |
---|
| 305 | printf("3.5.3: Valid Character Found\n"); |
---|
| 306 | |
---|
| 307 | /* 4 Overlong sequences */ |
---|
| 308 | /* 4.1 Examples of an overlong ASCII character */ |
---|
| 309 | for(i = 2; i < 7; i++) |
---|
| 310 | { |
---|
| 311 | retval = mbtowc(&wchar, overlong[i-2], i); |
---|
| 312 | if(retval == -1) |
---|
| 313 | printf("4.1.%d: 1 Invalid\n", i-1); |
---|
| 314 | else |
---|
| 315 | printf("4.1.%d: Valid Character Found\n", i-1); |
---|
| 316 | } |
---|
| 317 | |
---|
| 318 | /* 4.2 Maximum overlong sequences */ |
---|
| 319 | for(i = 2; i < 7; i++) |
---|
| 320 | { |
---|
| 321 | retval = mbtowc(&wchar, overlong_max[i-2], i); |
---|
| 322 | if(retval == -1) |
---|
| 323 | printf("4.2.%d: 1 Invalid\n", i-1); |
---|
| 324 | else |
---|
| 325 | printf("4.2.%d: Valid Character Found\n", i-1); |
---|
| 326 | } |
---|
| 327 | |
---|
| 328 | /* 4.3 Overlong representation of the NUL character */ |
---|
| 329 | for(i = 2; i < 7; i++) |
---|
| 330 | { |
---|
| 331 | retval = mbtowc(&wchar, overlong_nul[i-2], i); |
---|
| 332 | if(retval == -1) |
---|
| 333 | printf("4.3.%d: 1 Invalid\n", i-1); |
---|
| 334 | else |
---|
| 335 | printf("4.3.%d: Valid Character Found\n", i-1); |
---|
| 336 | } |
---|
| 337 | |
---|
| 338 | /* 5 Illegal code positions */ |
---|
| 339 | /* 5.1 Single UTF-16 surrogates */ |
---|
| 340 | for (i = 1; i < 8; i++) |
---|
| 341 | { |
---|
| 342 | retval = mbtowc(&wchar, single_surrogates[i-1], 3); |
---|
| 343 | if(retval == -1) |
---|
| 344 | printf("5.1.%d: 1 Invalid\n", i); |
---|
| 345 | else |
---|
| 346 | printf("5.1.%d: Valid Character Found\n", i); |
---|
| 347 | } |
---|
| 348 | |
---|
| 349 | /* 5.2 Paired UTF-16 surrogates */ |
---|
| 350 | for (i = 1; i < 8; i++) |
---|
| 351 | { |
---|
| 352 | retval = mbtowc(&wchar, paired_surrogates[i-1], 6); |
---|
| 353 | if(retval == -1) |
---|
| 354 | printf("5.2.%d: 1 Invalid\n", i); |
---|
| 355 | else |
---|
| 356 | printf("5.2.%d: Valid Character Found\n", i); |
---|
| 357 | } |
---|
| 358 | |
---|
| 359 | /* 5.3 Other illegal code positions */ |
---|
| 360 | retval = mbtowc(&wchar, illegal_pos[0], 3); |
---|
| 361 | if(retval == -1) |
---|
| 362 | printf("5.3.1: 1 Invalid\n"); |
---|
| 363 | else |
---|
| 364 | printf("5.3.1: Valid Character Found\n"); |
---|
| 365 | |
---|
| 366 | retval = mbtowc(&wchar, illegal_pos[1], 3); |
---|
| 367 | if(retval == -1) |
---|
| 368 | printf("5.3.2: 1 Invalid\n"); |
---|
| 369 | else |
---|
| 370 | printf("5.3.2: Valid Character Found\n"); |
---|
| 371 | |
---|
| 372 | return 0; |
---|
| 373 | } |
---|
| 374 | |
---|
/* Probe every byte offset of the LEN-byte buffer S with mbtowc(), giving
   it the bytes that remain from that offset onward.

   Returns the number of offsets probed when mbtowc() returned -1 at each
   of them (0 when LEN is not positive).  Returns -1 as soon as mbtowc()
   returns anything other than -1, i.e. a valid character was found.  */
int
num_invalid(const char *s, int len)
{
  const char *cursor = s;
  int remaining = len;
  int rejected = 0;
  wchar_t decoded;

  while (remaining > 0)
    {
      if (mbtowc (&decoded, cursor, (size_t) remaining) != -1)
        return -1;

      /* Rejected here: count it and retry one byte further along.  */
      rejected++;
      cursor++;
      remaining--;
    }

  return rejected;
}
---|