@@ -396,4 +396,110 @@ void ToLowerCase(std::string *in_out) {
396
396
[](unsigned char c) { return std::tolower (c); });
397
397
}
398
398
399
// Returns true when `x` lies inside the closed interval [low, high].
// If low > high the interval is empty and the result is always false.
static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) {
  // "Not outside the interval" is the De Morgan form of
  // (low <= x && x <= high).
  return !(x < low || high < x);
}
402
+
403
/*
Please see
https://stackoverflow.com/questions/6555015/check-for-invalid-utf8

Table 3-7. Well-Formed UTF-8 Byte Sequences

Code Points          First Byte  Second Byte  Third Byte  Fourth Byte
U+0000..U+007F       00..7F
U+0080..U+07FF       C2..DF      80..BF
U+0800..U+0FFF       E0          A0..BF       80..BF
U+1000..U+CFFF       E1..EC      80..BF       80..BF
U+D000..U+D7FF       ED          80..9F       80..BF
U+E000..U+FFFF       EE..EF      80..BF       80..BF
U+10000..U+3FFFF     F0          90..BF       80..BF      80..BF
U+40000..U+FFFFF     F1..F3      80..BF       80..BF      80..BF
U+100000..U+10FFFF   F4          80..8F       80..BF      80..BF
*/
421
+ std::string RemoveInvalidUtf8Sequences (const std::string &text,
422
+ bool show_debug_msg /* = false*/ ) {
423
+ int32_t n = static_cast <int32_t >(text.size ());
424
+
425
+ std::string ans;
426
+ ans.reserve (n);
427
+
428
+ int32_t i = 0 ;
429
+ const uint8_t *p = reinterpret_cast <const uint8_t *>(text.data ());
430
+ while (i < n) {
431
+ if (p[i] <= 0x7f ) {
432
+ ans.append (text, i, 1 );
433
+ i += 1 ;
434
+ continue ;
435
+ }
436
+
437
+ if (InRange (p[i], 0xc2 , 0xdf ) && i + 1 < n &&
438
+ InRange (p[i + 1 ], 0x80 , 0xbf )) {
439
+ ans.append (text, i, 2 );
440
+ i += 2 ;
441
+ continue ;
442
+ }
443
+
444
+ if (p[i] == 0xe0 && i + 2 < n && InRange (p[i + 1 ], 0xa0 , 0xbf ) &&
445
+ InRange (p[i + 2 ], 0x80 , 0xbf )) {
446
+ ans.append (text, i, 3 );
447
+ i += 3 ;
448
+ continue ;
449
+ }
450
+
451
+ if (InRange (p[i], 0xe1 , 0xec ) && i + 2 < n &&
452
+ InRange (p[i + 1 ], 0x80 , 0xbf ) && InRange (p[i + 2 ], 0x80 , 0xbf )) {
453
+ ans.append (text, i, 3 );
454
+ i += 3 ;
455
+ continue ;
456
+ }
457
+
458
+ if (p[i] == 0xed && i + 2 < n && InRange (p[i + 1 ], 0x80 , 0x9f ) &&
459
+ InRange (p[i + 2 ], 0x80 , 0xbf )) {
460
+ ans.append (text, i, 3 );
461
+ i += 3 ;
462
+ continue ;
463
+ }
464
+
465
+ if (InRange (p[i], 0xee , 0xef ) && i + 2 < n &&
466
+ InRange (p[i + 1 ], 0x80 , 0xbf ) && InRange (p[i + 2 ], 0x80 , 0xbf )) {
467
+ ans.append (text, i, 3 );
468
+ i += 3 ;
469
+ continue ;
470
+ }
471
+
472
+ if (p[i] == 0xf0 && i + 3 < n && InRange (p[i + 1 ], 0x90 , 0xbf ) &&
473
+ InRange (p[i + 2 ], 0x80 , 0xbf ) && InRange (p[i + 3 ], 0x80 , 0xbf )) {
474
+ ans.append (text, i, 4 );
475
+ i += 4 ;
476
+ continue ;
477
+ }
478
+
479
+ if (InRange (p[i], 0xf1 , 0xf3 ) && i + 3 < n &&
480
+ InRange (p[i + 1 ], 0x80 , 0xbf ) && InRange (p[i + 2 ], 0x80 , 0xbf ) &&
481
+ InRange (p[i + 3 ], 0x80 , 0xbf )) {
482
+ ans.append (text, i, 4 );
483
+ i += 4 ;
484
+ continue ;
485
+ }
486
+
487
+ if (p[i] == 0xf4 && i + 3 < n && InRange (p[i + 1 ], 0x80 , 0x8f ) &&
488
+ InRange (p[i + 2 ], 0x80 , 0xbf ) && InRange (p[i + 3 ], 0x80 , 0xbf )) {
489
+ ans.append (text, i, 4 );
490
+ i += 4 ;
491
+ continue ;
492
+ }
493
+
494
+ if (show_debug_msg) {
495
+ SHERPA_ONNX_LOGE (" Ignore invalid utf8 sequence at pos: %d, value: %02x" ,
496
+ i, p[i]);
497
+ }
498
+
499
+ i += 1 ;
500
+ }
501
+
502
+ return ans;
503
+ }
504
+
399
505
} // namespace sherpa_onnx
0 commit comments