feat(ast): implement utf8 to utf16 span converter (#8687)

Boshen · Boshen · commit b7f13e636ea5 · 2025-01-24T16:57:44.000Z
closes #8629
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -45,7 +45,7 @@ jobs:
           save-cache: ${{ github.ref_name == 'main' }}
           cache-key: warm
       - run: cargo ck
-      - run: cargo test
+      - run: cargo test --all-features
       - run: git diff --exit-code # Must commit everything
 
   test-windows:
@@ -96,7 +96,7 @@ jobs:
         shell: bash
         run: |
           # cargo ck # no need to check because it's already checked in linux
-          cargo test --workspace
+          cargo test --all-features
 
   test-wasm32-wasip1-threads:
     name: Test wasm32-wasip1-threads
diff --git a/crates/oxc_ast/src/lib.rs b/crates/oxc_ast/src/lib.rs
@@ -53,6 +53,8 @@ mod ast_impl;
 mod ast_kind_impl;
 pub mod precedence;
 mod trivia;
+#[cfg(feature = "serialize")]
+pub mod utf8_to_utf16;
 
 mod generated {
     #![allow(missing_docs)]
diff --git a/crates/oxc_ast/src/utf8_to_utf16.rs b/crates/oxc_ast/src/utf8_to_utf16.rs
@@ -0,0 +1,248 @@
+//! Convert UTF-8 span offsets to UTF-16.
+
+use oxc_span::Span;
+
+use crate::{ast::Program, visit::VisitMut};
+
+/// Convert UTF-8 span offsets to UTF-16.
+pub struct Utf8ToUtf16 {
+    translations: Vec<Translation>,
+}
+
+#[derive(Clone, Copy)]
+#[repr(align(8))]
+struct Translation {
+    // UTF-8 byte offset
+    utf8_offset: u32,
+    // Number to subtract from UTF-8 byte offset to get UTF-16 char offset
+    // for offsets *at or after* `utf8_offset` (until the next entry in the table)
+    utf16_difference: u32,
+}
+
+impl Utf8ToUtf16 {
+    /// Create new `Utf8ToUtf16` converter.
+    #[expect(clippy::new_without_default)]
+    pub fn new() -> Self {
+        let mut translations = Vec::with_capacity(16);
+        translations.push(Translation { utf8_offset: 0, utf16_difference: 0 });
+        Self { translations }
+    }
+
+    /// Convert all spans in the AST and its comments from UTF-8 byte offsets to UTF-16 offsets.
+    pub fn convert(mut self, program: &mut Program<'_>) {
+        self.build_table(program.source_text);
+        // Skip if source is entirely ASCII
+        if self.translations.len() == 1 {
+            return;
+        }
+        self.visit_program(program);
+        for comment in &mut program.comments {
+            self.convert_span(&mut comment.span);
+        }
+    }
+
+    #[allow(clippy::cast_possible_truncation)]
+    fn build_table(&mut self, source_text: &str) {
+        // Translation from UTF-8 byte offset to UTF-16 char offset:
+        //
+        // * 1-byte UTF-8 sequence
+        //   = 1st byte 0xxxxxxx (0 - 0x7F)
+        //   -> 1 x UTF-16 char
+        //   UTF-16 len = UTF-8 len
+        // * 2-byte UTF-8 sequence
+        //   = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+        //   -> 1 x UTF-16 char
+        //   UTF-16 len = UTF-8 len - 1
+        // * 3-byte UTF-8 sequence
+        //   = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
+        //   -> 1 x UTF-16 char
+        //   UTF-16 len = UTF-8 len - 2
+        // * 4-byte UTF-8 sequence
+        //   = 1st byte 11110xxx (0xF0 - 0xF7), remaining bytes 10xxxxxx (0x80 - 0xBF)
+        //   -> 2 x UTF-16 chars
+        //   UTF-16 len = UTF-8 len - 2
+        //
+        // So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`
+        let mut utf16_difference = 0;
+        for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() {
+            if byte >= 0xC0 {
+                let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
+                utf16_difference += difference_for_this_byte;
+                // Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this
+                // Unicode character that need to be shifted
+                self.translations
+                    .push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference });
+            }
+        }
+    }
+
+    fn convert_span(&self, span: &mut Span) {
+        span.start = self.convert_offset(span.start);
+        span.end = self.convert_offset(span.end);
+    }
+
+    fn convert_offset(&self, utf8_offset: u32) -> u32 {
+        // Find the first entry in table *after* the UTF-8 offset.
+        // The difference we need to subtract is recorded in the entry prior to it.
+        let index =
+            self.translations.partition_point(|translation| translation.utf8_offset <= utf8_offset);
+        // First entry in table is `0, 0`. `partition_point` finds the first entry where
+        // `utf8_offset < translation.utf8_offset` (or `translations.len()` if none exists).
+        // So guaranteed `index > 0`, and `index <= translations.len()`.
+        // Therefore `index - 1` cannot wrap around, and cannot be out of bounds.
+        let translation = self.translations[index - 1];
+        utf8_offset - translation.utf16_difference
+    }
+}
+
+impl VisitMut<'_> for Utf8ToUtf16 {
+    fn visit_span(&mut self, span: &mut Span) {
+        self.convert_span(span);
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use oxc_allocator::Allocator;
+    use oxc_span::{GetSpan, SourceType, Span};
+
+    use crate::{
+        ast::{Expression, Statement},
+        AstBuilder, Comment, CommentKind,
+    };
+
+    use super::Utf8ToUtf16;
+
+    #[test]
+    fn translate_ast() {
+        let allocator = Allocator::new();
+        let ast = AstBuilder::new(&allocator);
+
+        let mut program = ast.program(
+            Span::new(0, 15),
+            SourceType::default(),
+            ";'🤨' // 🤨",
+            ast.vec1(Comment::new(8, 15, CommentKind::Line)),
+            None,
+            ast.vec(),
+            ast.vec_from_array([
+                ast.statement_empty(Span::new(0, 1)),
+                ast.statement_expression(
+                    Span::new(1, 7),
+                    ast.expression_string_literal(Span::new(1, 7), "🤨", None),
+                ),
+            ]),
+        );
+
+        Utf8ToUtf16::new().convert(&mut program);
+        assert_eq!(program.span, Span::new(0, 11));
+        assert_eq!(program.body[1].span(), Span::new(1, 5));
+        let Statement::ExpressionStatement(expr_stmt) = &program.body[1] else { unreachable!() };
+        let Expression::StringLiteral(s) = &expr_stmt.expression else { unreachable!() };
+        assert_eq!(s.span, Span::new(1, 5));
+        assert_eq!(program.comments[0].span, Span::new(6, 11));
+    }
+
+    #[test]
+    fn translate_offsets() {
+        assert_eq!('_'.len_utf8(), 1);
+        assert_eq!('_'.len_utf16(), 1);
+        assert_eq!('£'.len_utf8(), 2);
+        assert_eq!('£'.len_utf16(), 1);
+        assert_eq!('ऊ'.len_utf8(), 3);
+        assert_eq!('ऊ'.len_utf16(), 1);
+        assert_eq!('🤨'.len_utf8(), 4);
+        assert_eq!('🤨'.len_utf16(), 2);
+
+        let cases: &[(&str, &[(u32, u32)])] = &[
+            // 1-byte
+            ("_", &[(0, 0), (1, 1)]),
+            // 2-byte
+            ("£", &[(0, 0), (2, 1)]),
+            ("£_", &[(0, 0), (2, 1), (3, 2)]),
+            ("_£", &[(0, 0), (1, 1), (3, 2)]),
+            ("_£_", &[(0, 0), (1, 1), (3, 2), (4, 3)]),
+            ("_££_", &[(0, 0), (1, 1), (3, 2), (5, 3), (6, 4)]),
+            ("_£_£_", &[(0, 0), (1, 1), (3, 2), (4, 3), (6, 4), (7, 5)]),
+            // 3-byte
+            ("ऊ", &[(0, 0), (3, 1)]),
+            ("ऊ_", &[(0, 0), (3, 1), (4, 2)]),
+            ("_ऊ", &[(0, 0), (1, 1), (4, 2)]),
+            ("_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3)]),
+            ("_ऊऊ_", &[(0, 0), (1, 1), (4, 2), (7, 3), (8, 4)]),
+            ("_ऊ_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3), (8, 4), (9, 5)]),
+            // 4-byte
+            ("🤨", &[(0, 0), (4, 2)]),
+            ("🤨_", &[(0, 0), (4, 2), (5, 3)]),
+            ("_🤨", &[(0, 0), (1, 1), (5, 3)]),
+            ("_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4)]),
+            ("_🤨🤨_", &[(0, 0), (1, 1), (5, 3), (9, 5), (10, 6)]),
+            ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
+        ];
+
+        for (text, translations) in cases {
+            let mut converter = Utf8ToUtf16::new();
+            converter.build_table(text);
+            for &(utf8_offset, expected_utf16_offset) in *translations {
+                assert_eq!(converter.convert_offset(utf8_offset), expected_utf16_offset);
+            }
+        }
+    }
+
+    // Check assumptions about how many UTF-16 chars result from different UTF-8 character sequences,
+    // which are relied on by `build_table`
+    #[test]
+    fn char_lengths() {
+        macro_rules! assert_utf8_bytes_eq {
+            ($c:expr, $bytes:expr) => {{
+                let mut buffer = [0; 4];
+                let bytes = $c.encode_utf8(&mut buffer).as_bytes();
+                assert!($bytes == bytes);
+            }};
+        }
+
+        // All 1-byte UTF-8 character sequences = 1 x UTF-16 character.
+        // First byte is 0x00 - 0x7F.
+        let min_1_byte_char = char::from_u32(0).unwrap();
+        assert_eq!(min_1_byte_char.len_utf8(), 1);
+        assert_eq!(min_1_byte_char.len_utf16(), 1);
+        assert_utf8_bytes_eq!(min_1_byte_char, [0x00]);
+        let max_1_byte_char = char::from_u32(0x7F).unwrap();
+        assert_eq!(max_1_byte_char.len_utf8(), 1);
+        assert_eq!(max_1_byte_char.len_utf16(), 1);
+        assert_utf8_bytes_eq!(max_1_byte_char, [0x7F]);
+
+        // All 2-byte UTF-8 character sequences = 1 x UTF-16 character
+        // First byte is 0xC2 - 0xDF.
+        let min_2_byte_char = char::from_u32(0x80).unwrap();
+        assert_eq!(min_2_byte_char.len_utf8(), 2);
+        assert_eq!(min_2_byte_char.len_utf16(), 1);
+        assert_utf8_bytes_eq!(min_2_byte_char, [0xC2, 0x80]);
+        let max_2_byte_char = char::from_u32(0x7FF).unwrap();
+        assert_eq!(max_2_byte_char.len_utf8(), 2);
+        assert_eq!(max_2_byte_char.len_utf16(), 1);
+        assert_utf8_bytes_eq!(max_2_byte_char, [0xDF, 0xBF]);
+
+        // All 3-byte UTF-8 character sequences = 1 x UTF-16 character
+        // First byte is 0xE0 - 0xEF.
+        let min_3_byte_char = char::from_u32(0x800).unwrap();
+        assert_eq!(min_3_byte_char.len_utf8(), 3);
+        assert_eq!(min_3_byte_char.len_utf16(), 1);
+        assert_utf8_bytes_eq!(min_3_byte_char, [0xE0, 0xA0, 0x80]);
+        let max_3_byte_char = char::from_u32(0xFFFF).unwrap();
+        assert_eq!(max_3_byte_char.len_utf8(), 3);
+        assert_eq!(max_3_byte_char.len_utf16(), 1);
+        assert_utf8_bytes_eq!(max_3_byte_char, [0xEF, 0xBF, 0xBF]);
+
+        // All 4-byte UTF-8 character sequences = 2 x UTF-16 characters
+        // First byte is 0xF0 - 0xF4.
+        let min_4_byte_char = char::from_u32(0x10000).unwrap();
+        assert_eq!(min_4_byte_char.len_utf8(), 4);
+        assert_eq!(min_4_byte_char.len_utf16(), 2);
+        assert_utf8_bytes_eq!(min_4_byte_char, [0xF0, 0x90, 0x80, 0x80]);
+        let max_4_byte_char = char::MAX;
+        assert_eq!(max_4_byte_char.len_utf8(), 4);
+        assert_eq!(max_4_byte_char.len_utf16(), 2);
+        assert_utf8_bytes_eq!(max_4_byte_char, [0xF4, 0x8F, 0xBF, 0xBF]);
+    }
+}
diff --git a/crates/oxc_parser/examples/parser.rs b/crates/oxc_parser/examples/parser.rs
@@ -2,19 +2,21 @@
 use std::{fs, path::Path};
 
 use oxc_allocator::Allocator;
+use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
 use oxc_parser::{ParseOptions, Parser};
 use oxc_span::SourceType;
 use pico_args::Arguments;
 
 // Instruction:
 // create a `test.js`,
 // run `cargo run -p oxc_parser --example parser`
-// or `cargo watch -x "run -p oxc_parser --example parser"`
+// or `just watch "cargo run -p oxc_parser --example parser"`
 
 fn main() -> Result<(), String> {
     let mut args = Arguments::from_env();
 
     let show_ast = args.contains("--ast");
+    let show_estree = args.contains("--estree");
     let show_comments = args.contains("--comments");
     let name = args.free_from_str().unwrap_or_else(|_| "test.js".to_string());
 
@@ -26,20 +28,24 @@ fn main() -> Result<(), String> {
     let ret = Parser::new(&allocator, &source_text, source_type)
         .with_options(ParseOptions { parse_regular_expression: true, ..ParseOptions::default() })
         .parse();
-
-    if show_ast {
-        println!("AST:");
-        println!("{}", serde_json::to_string_pretty(&ret.program).unwrap());
-    }
+    let mut program = ret.program;
 
     if show_comments {
         println!("Comments:");
-        for comment in ret.program.comments {
+        for comment in &program.comments {
             let s = comment.content_span().source_text(&source_text);
             println!("{s}");
         }
     }
 
+    if show_ast || show_estree {
+        println!("AST:");
+        if show_estree {
+            Utf8ToUtf16::new().convert(&mut program);
+        }
+        println!("{}", serde_json::to_string_pretty(&program).unwrap());
+    }
+
     if ret.errors.is_empty() {
         println!("Parsed Successfully.");
     } else {
diff --git a/tasks/benchmark/Cargo.toml b/tasks/benchmark/Cargo.toml
@@ -65,7 +65,7 @@ bench = false
 # with only the crates it needs, to speed up the builds
 [dependencies]
 oxc_allocator = { workspace = true, optional = true }
-oxc_ast = { workspace = true, optional = true }
+oxc_ast = { workspace = true, optional = true, features = ["serialize"] }
 oxc_codegen = { workspace = true, optional = true }
 oxc_isolated_declarations = { workspace = true, optional = true }
 oxc_linter = { workspace = true, optional = true }
@@ -106,7 +106,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"]
 # Features for running each benchmark separately with minimum dependencies that benchmark needs.
 # e.g. `cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser`
 lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
-parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
+parser = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
 transformer = [
   "dep:oxc_allocator",
   "dep:oxc_parser",
diff --git a/tasks/benchmark/benches/parser.rs b/tasks/benchmark/benches/parser.rs
@@ -1,4 +1,5 @@
 use oxc_allocator::Allocator;
+use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
 use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion};
 use oxc_parser::{ParseOptions, Parser};
 use oxc_span::SourceType;
@@ -29,5 +30,33 @@ fn bench_parser(criterion: &mut Criterion) {
     group.finish();
 }
 
-criterion_group!(parser, bench_parser);
+fn bench_estree(criterion: &mut Criterion) {
+    let mut group = criterion.benchmark_group("estree");
+    for file in TestFiles::complicated().files().iter().take(1) {
+        let id = BenchmarkId::from_parameter(&file.file_name);
+        let source_text = file.source_text.as_str();
+        let source_type = SourceType::from_path(&file.file_name).unwrap();
+        let mut allocator = Allocator::default();
+        group.bench_function(id, |b| {
+            b.iter_with_setup_wrapper(|runner| {
+                allocator.reset();
+                let mut program = Parser::new(&allocator, source_text, source_type)
+                    .with_options(ParseOptions {
+                        parse_regular_expression: true,
+                        ..ParseOptions::default()
+                    })
+                    .parse()
+                    .program;
+                runner.run(|| {
+                    Utf8ToUtf16::new().convert(&mut program);
+                    program.to_json();
+                    program
+                });
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(parser, bench_parser, bench_estree);
 criterion_main!(parser);