Skip to content

Commit b7f13e6

Browse files
committed
feat(ast): implement utf8 to utf16 span converter (#8687)
closes #8629
1 parent 10e5920 commit b7f13e6

File tree

6 files changed

+297
-12
lines changed

6 files changed

+297
-12
lines changed

.github/workflows/ci.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
save-cache: ${{ github.ref_name == 'main' }}
4646
cache-key: warm
4747
- run: cargo ck
48-
- run: cargo test
48+
- run: cargo test --all-features
4949
- run: git diff --exit-code # Must commit everything
5050

5151
test-windows:
@@ -96,7 +96,7 @@ jobs:
9696
shell: bash
9797
run: |
9898
# cargo ck # no need to check because it's already checked in linux
99-
cargo test --workspace
99+
cargo test --all-features
100100
101101
test-wasm32-wasip1-threads:
102102
name: Test wasm32-wasip1-threads

crates/oxc_ast/src/lib.rs

+2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ mod ast_impl;
5353
mod ast_kind_impl;
5454
pub mod precedence;
5555
mod trivia;
56+
#[cfg(feature = "serialize")]
57+
pub mod utf8_to_utf16;
5658

5759
mod generated {
5860
#![allow(missing_docs)]

crates/oxc_ast/src/utf8_to_utf16.rs

+248
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
//! Convert UTF-8 span offsets to UTF-16.
2+
3+
use oxc_span::Span;
4+
5+
use crate::{ast::Program, visit::VisitMut};
6+
7+
/// Convert UTF-8 span offsets to UTF-16.
///
/// Holds a lookup table, built from the source text by `convert`, which records how much
/// must be subtracted from a UTF-8 byte offset to obtain the equivalent UTF-16 offset.
pub struct Utf8ToUtf16 {
    // Table entries in ascending `utf8_offset` order.
    // The first entry is always `0, 0` (pushed by `new`), so lookups always find an entry.
    translations: Vec<Translation>,
}
11+
12+
// One entry in the UTF-8 -> UTF-16 offset translation table.
// NOTE(review): `align(8)` presumably lets the 8-byte entry be copied as a single aligned unit -
// confirm the intent.
#[derive(Clone, Copy)]
#[repr(align(8))]
struct Translation {
    // UTF-8 byte offset
    utf8_offset: u32,
    // Number to subtract from UTF-8 byte offset to get UTF-16 char offset.
    // Applies to offsets at or after `utf8_offset` (the lookup in `convert_offset` uses `<=`),
    // up to the next entry in the table.
    utf16_difference: u32,
}
21+
22+
impl Utf8ToUtf16 {
23+
/// Create new `Utf8ToUtf16` converter.
24+
#[expect(clippy::new_without_default)]
25+
pub fn new() -> Self {
26+
let mut translations = Vec::with_capacity(16);
27+
translations.push(Translation { utf8_offset: 0, utf16_difference: 0 });
28+
Self { translations }
29+
}
30+
31+
/// Convert all spans in the AST to UTF-16.
32+
pub fn convert(mut self, program: &mut Program<'_>) {
33+
self.build_table(program.source_text);
34+
// Skip if source is entirely ASCII
35+
if self.translations.len() == 1 {
36+
return;
37+
}
38+
self.visit_program(program);
39+
for comment in &mut program.comments {
40+
self.convert_span(&mut comment.span);
41+
}
42+
}
43+
44+
#[allow(clippy::cast_possible_truncation)]
45+
fn build_table(&mut self, source_text: &str) {
46+
// Translation from UTF-8 byte offset to UTF-16 char offset:
47+
//
48+
// * 1-byte UTF-8 sequence
49+
// = 1st byte 0xxxxxxx (0 - 0x7F)
50+
// -> 1 x UTF-16 char
51+
// UTF-16 len = UTF-8 len
52+
// * 2-byte UTF-8 sequence
53+
// = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF)
54+
// -> 1 x UTF-16
55+
// UTF-16 len = UTF-8 len - 1
56+
// * 3-byte UTF-8 sequence
57+
// = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF)
58+
// -> 1 x UTF-16
59+
// UTF-16 len = UTF-8 len - 2
60+
// * 4-byte UTF-8 sequence
61+
// = 1st byte 1111xxxx (0xF0 - 0xFF), remaining bytes 10xxxxxx (0x80 - 0xBF)
62+
// -> 2 x UTF-16
63+
// UTF-16 len = UTF-8 len - 2
64+
//
65+
// So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0`
66+
let mut utf16_difference = 0;
67+
for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() {
68+
if byte >= 0xC0 {
69+
let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
70+
utf16_difference += difference_for_this_byte;
71+
// Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this
72+
// Unicode character that need to be shifted
73+
self.translations
74+
.push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference });
75+
}
76+
}
77+
}
78+
79+
fn convert_span(&self, span: &mut Span) {
80+
span.start = self.convert_offset(span.start);
81+
span.end = self.convert_offset(span.end);
82+
}
83+
84+
fn convert_offset(&self, utf8_offset: u32) -> u32 {
85+
// Find the first entry in table *after* the UTF-8 offset.
86+
// The difference we need to subtract is recorded in the entry prior to it.
87+
let index =
88+
self.translations.partition_point(|translation| translation.utf8_offset <= utf8_offset);
89+
// First entry in table is `0, 0`. `partition_point` finds the first entry where
90+
// `utf8_offset < translation.utf8_offset` (or `translations.len()` if none exists).
91+
// So guaranteed `index > 0`, and `index <= translations.len()`.
92+
// Therefore `index - 1` cannot wrap around, and cannot be out of bounds.
93+
let translation = self.translations[index - 1];
94+
utf8_offset - translation.utf16_difference
95+
}
96+
}
97+
98+
// Hook the converter into the AST traversal: each `Span` passed to `visit_span`
// is rewritten in place from UTF-8 offsets to UTF-16 offsets.
impl VisitMut<'_> for Utf8ToUtf16 {
    fn visit_span(&mut self, span: &mut Span) {
        self.convert_span(span);
    }
}
103+
104+
#[cfg(test)]
105+
mod test {
106+
use oxc_allocator::Allocator;
107+
use oxc_span::{GetSpan, SourceType, Span};
108+
109+
use crate::{
110+
ast::{Expression, Statement},
111+
AstBuilder, Comment, CommentKind,
112+
};
113+
114+
use super::Utf8ToUtf16;
115+
116+
// End-to-end check: build a small program by hand with UTF-8 spans, run the converter,
// and verify the program, statement, expression and comment spans are all in UTF-16 offsets.
#[test]
fn translate_ast() {
    let allocator = Allocator::new();
    let ast = AstBuilder::new(&allocator);

    // Source text layout (UTF-8 byte offsets -> UTF-16 offsets):
    //   `;`   0       -> 0
    //   `'`   1       -> 1
    //   `🤨`  2..6    -> 2..4
    //   `'`   6       -> 4
    //   ` `   7       -> 5
    //   `//`  8..10   -> 6..8
    //   ` `   10      -> 8
    //   `🤨`  11..15  -> 9..11
    let mut program = ast.program(
        Span::new(0, 15),
        SourceType::default(),
        ";'🤨' // 🤨",
        ast.vec1(Comment::new(8, 15, CommentKind::Line)),
        None,
        ast.vec(),
        ast.vec_from_array([
            ast.statement_empty(Span::new(0, 1)),
            ast.statement_expression(
                Span::new(1, 7),
                ast.expression_string_literal(Span::new(1, 7), "🤨", None),
            ),
        ]),
    );

    Utf8ToUtf16::new().convert(&mut program);
    // Each 4-byte emoji shrinks by 2 in UTF-16, so the end of the program moves from 15 to 11
    assert_eq!(program.span, Span::new(0, 11));
    assert_eq!(program.body[1].span(), Span::new(1, 5));
    let Statement::ExpressionStatement(expr_stmt) = &program.body[1] else { unreachable!() };
    let Expression::StringLiteral(s) = &expr_stmt.expression else { unreachable!() };
    assert_eq!(s.span, Span::new(1, 5));
    // Comment spans are converted separately from the visitor pass in `convert`
    assert_eq!(program.comments[0].span, Span::new(6, 11));
}
145+
146+
// Check offset translation at every character boundary of short strings which mix ASCII
// with 2-byte, 3-byte and 4-byte UTF-8 characters.
#[test]
fn translate_offsets() {
    // Confirm the encoded lengths of the sample characters used in the cases below
    for (sample, utf8_len, utf16_len) in [('_', 1, 1), ('£', 2, 1), ('ऊ', 3, 1), ('🤨', 4, 2)] {
        assert_eq!(sample.len_utf8(), utf8_len);
        assert_eq!(sample.len_utf16(), utf16_len);
    }

    // Each case is a source text, plus `(UTF-8 offset, expected UTF-16 offset)` pairs
    let cases: &[(&str, &[(u32, u32)])] = &[
        // 1-byte
        ("_", &[(0, 0), (1, 1)]),
        // 2-byte
        ("£", &[(0, 0), (2, 1)]),
        ("£_", &[(0, 0), (2, 1), (3, 2)]),
        ("_£", &[(0, 0), (1, 1), (3, 2)]),
        ("_£_", &[(0, 0), (1, 1), (3, 2), (4, 3)]),
        ("_££_", &[(0, 0), (1, 1), (3, 2), (5, 3), (6, 4)]),
        ("_£_£_", &[(0, 0), (1, 1), (3, 2), (4, 3), (6, 4), (7, 5)]),
        // 3-byte
        ("ऊ", &[(0, 0), (3, 1)]),
        ("ऊ_", &[(0, 0), (3, 1), (4, 2)]),
        ("_ऊ", &[(0, 0), (1, 1), (4, 2)]),
        ("_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3)]),
        ("_ऊऊ_", &[(0, 0), (1, 1), (4, 2), (7, 3), (8, 4)]),
        ("_ऊ_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3), (8, 4), (9, 5)]),
        // 4-byte
        ("🤨", &[(0, 0), (4, 2)]),
        ("🤨_", &[(0, 0), (4, 2), (5, 3)]),
        ("_🤨", &[(0, 0), (1, 1), (5, 3)]),
        ("_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4)]),
        ("_🤨🤨_", &[(0, 0), (1, 1), (5, 3), (9, 5), (10, 6)]),
        ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
    ];

    for &(source_text, offset_pairs) in cases {
        let mut converter = Utf8ToUtf16::new();
        converter.build_table(source_text);
        offset_pairs.iter().for_each(|&(utf8_offset, utf16_offset)| {
            assert_eq!(converter.convert_offset(utf8_offset), utf16_offset);
        });
    }
}
191+
192+
// Check assumptions about how many UTF-16 chars result from different UTF-8 character sequences,
// which are relied on by `build_table`
#[test]
fn char_lengths() {
    // The smallest and largest character of each UTF-8 sequence length:
    // `(code point, UTF-8 length, UTF-16 length, UTF-8 encoding)`
    let boundary_chars: [(u32, usize, usize, &[u8]); 8] = [
        // All 1-byte UTF-8 character sequences = 1 x UTF-16 character.
        // First byte is 0x00 - 0x7F.
        (0, 1, 1, &[0x00]),
        (0x7F, 1, 1, &[0x7F]),
        // All 2-byte UTF-8 character sequences = 1 x UTF-16 character.
        // First byte is 0xC2 - 0xDF.
        (0x80, 2, 1, &[0xC2, 0x80]),
        (0x7FF, 2, 1, &[0xDF, 0xBF]),
        // All 3-byte UTF-8 character sequences = 1 x UTF-16 character.
        // First byte is 0xE0 - 0xEF.
        (0x800, 3, 1, &[0xE0, 0xA0, 0x80]),
        (0xFFFF, 3, 1, &[0xEF, 0xBF, 0xBF]),
        // All 4-byte UTF-8 character sequences = 2 x UTF-16 characters.
        // First byte is 0xF0 - 0xF4.
        (0x10000, 4, 2, &[0xF0, 0x90, 0x80, 0x80]),
        (char::MAX as u32, 4, 2, &[0xF4, 0x8F, 0xBF, 0xBF]),
    ];

    for &(code_point, utf8_len, utf16_len, utf8_bytes) in &boundary_chars {
        let c = char::from_u32(code_point).unwrap();
        assert_eq!(c.len_utf8(), utf8_len);
        assert_eq!(c.len_utf16(), utf16_len);

        let mut buffer = [0; 4];
        let encoded = c.encode_utf8(&mut buffer).as_bytes();
        assert!(utf8_bytes == encoded);
    }
}
248+
}

crates/oxc_parser/examples/parser.rs

+13-7
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,21 @@
22
use std::{fs, path::Path};
33

44
use oxc_allocator::Allocator;
5+
use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
56
use oxc_parser::{ParseOptions, Parser};
67
use oxc_span::SourceType;
78
use pico_args::Arguments;
89

910
// Instruction:
1011
// create a `test.js`,
1112
// run `cargo run -p oxc_parser --example parser`
12-
// or `cargo watch -x "run -p oxc_parser --example parser"`
13+
// or `just watch "cargo run -p oxc_parser --example parser"`
1314

1415
fn main() -> Result<(), String> {
1516
let mut args = Arguments::from_env();
1617

1718
let show_ast = args.contains("--ast");
19+
let show_estree = args.contains("--estree");
1820
let show_comments = args.contains("--comments");
1921
let name = args.free_from_str().unwrap_or_else(|_| "test.js".to_string());
2022

@@ -26,20 +28,24 @@ fn main() -> Result<(), String> {
2628
let ret = Parser::new(&allocator, &source_text, source_type)
2729
.with_options(ParseOptions { parse_regular_expression: true, ..ParseOptions::default() })
2830
.parse();
29-
30-
if show_ast {
31-
println!("AST:");
32-
println!("{}", serde_json::to_string_pretty(&ret.program).unwrap());
33-
}
31+
let mut program = ret.program;
3432

3533
if show_comments {
3634
println!("Comments:");
37-
for comment in ret.program.comments {
35+
for comment in &program.comments {
3836
let s = comment.content_span().source_text(&source_text);
3937
println!("{s}");
4038
}
4139
}
4240

41+
if show_ast || show_estree {
42+
println!("AST:");
43+
if show_estree {
44+
Utf8ToUtf16::new().convert(&mut program);
45+
}
46+
println!("{}", serde_json::to_string_pretty(&program).unwrap());
47+
}
48+
4349
if ret.errors.is_empty() {
4450
println!("Parsed Successfully.");
4551
} else {

tasks/benchmark/Cargo.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ bench = false
6565
# with only the crates it needs, to speed up the builds
6666
[dependencies]
6767
oxc_allocator = { workspace = true, optional = true }
68-
oxc_ast = { workspace = true, optional = true }
68+
oxc_ast = { workspace = true, optional = true, features = ["serialize"] }
6969
oxc_codegen = { workspace = true, optional = true }
7070
oxc_isolated_declarations = { workspace = true, optional = true }
7171
oxc_linter = { workspace = true, optional = true }
@@ -106,7 +106,7 @@ codspeed_napi = ["criterion2/codspeed", "dep:serde", "dep:serde_json"]
106106
# Features for running each benchmark separately with minimum dependencies that benchmark needs.
107107
# e.g. `cargo build --release -p oxc_benchmark --bench parser --no-default-features --features parser`
108108
lexer = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
109-
parser = ["dep:oxc_allocator", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
109+
parser = ["dep:oxc_allocator", "dep:oxc_ast", "dep:oxc_parser", "dep:oxc_span", "dep:oxc_tasks_common"]
110110
transformer = [
111111
"dep:oxc_allocator",
112112
"dep:oxc_parser",

tasks/benchmark/benches/parser.rs

+30-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use oxc_allocator::Allocator;
2+
use oxc_ast::utf8_to_utf16::Utf8ToUtf16;
23
use oxc_benchmark::{criterion_group, criterion_main, BenchmarkId, Criterion};
34
use oxc_parser::{ParseOptions, Parser};
45
use oxc_span::SourceType;
@@ -29,5 +30,33 @@ fn bench_parser(criterion: &mut Criterion) {
2930
group.finish();
3031
}
3132

32-
criterion_group!(parser, bench_parser);
33+
fn bench_estree(criterion: &mut Criterion) {
34+
let mut group = criterion.benchmark_group("estree");
35+
for file in TestFiles::complicated().files().iter().take(1) {
36+
let id = BenchmarkId::from_parameter(&file.file_name);
37+
let source_text = file.source_text.as_str();
38+
let source_type = SourceType::from_path(&file.file_name).unwrap();
39+
let mut allocator = Allocator::default();
40+
group.bench_function(id, |b| {
41+
b.iter_with_setup_wrapper(|runner| {
42+
allocator.reset();
43+
let mut program = Parser::new(&allocator, source_text, source_type)
44+
.with_options(ParseOptions {
45+
parse_regular_expression: true,
46+
..ParseOptions::default()
47+
})
48+
.parse()
49+
.program;
50+
runner.run(|| {
51+
Utf8ToUtf16::new().convert(&mut program);
52+
program.to_json();
53+
program
54+
});
55+
});
56+
});
57+
}
58+
group.finish();
59+
}
60+
61+
criterion_group!(parser, bench_parser, bench_estree);
3362
criterion_main!(parser);

0 commit comments

Comments
 (0)