Skip to content

Commit 8d237c4

Browse files
committed
fix(transformer): JSX source calculate correct column when Unicode chars (#3615)
Fix the column number in the JSX source transform, and add tests. The column number was correct in all cases except when a Unicode character with a code point above `0xFFFF` appears earlier on the line. Such characters are:
* 4 bytes in UTF-8.
* 2 characters (code units) in UTF-16.
* 1 `char` in Rust.

Babel (which we're trying to match) uses the count of UTF-16 characters for the column number, whereas we were using the count of Rust `char`s.
1 parent 9e8f4d6 commit 8d237c4

File tree

2 files changed

+88
-8
lines changed

2 files changed

+88
-8
lines changed

crates/oxc_transformer/src/react/jsx_source.rs

+3
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ impl<'a> ReactJsxSource<'a> {
8585
let key = JSXAttributeName::Identifier(
8686
self.ctx.ast.alloc(self.ctx.ast.jsx_identifier(SPAN, SOURCE.into())),
8787
);
88+
// TODO: We shouldn't calculate line + column from scratch each time as it's expensive.
89+
// Build a table of byte indexes of each line's start on first usage, and save it.
90+
// Then calculate line and column from that.
8891
let (line, column) = get_line_column(elem.span.start, self.ctx.source_text);
8992
let object = self.get_source_object(line, column, ctx);
9093
let expr = self.ctx.ast.jsx_expression_container(SPAN, JSXExpression::from(object));
+85-8
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,91 @@
11
use ropey::Rope;
22

3-
/// Get line and column from offset and source text
3+
/// Get line and column from offset and source text.
4+
///
5+
/// Line number starts at 1.
6+
/// Column number is in UTF-16 characters, and starts at 1.
7+
///
8+
/// This matches Babel's output.
49
pub fn get_line_column(offset: u32, source_text: &str) -> (usize, usize) {
510
let offset = offset as usize;
611
let rope = Rope::from_str(source_text);
7-
let line = rope.byte_to_line(offset);
8-
let first_char_of_line = rope.line_to_char(line);
9-
// Original offset is byte, but Rope uses char offset
10-
let offset = rope.byte_to_char(offset);
11-
let column = offset - first_char_of_line;
12-
// line and column is zero-indexed, but we want 1-indexed
13-
(line + 1, column + 1)
12+
// Get line number and byte offset of start of line
13+
let line_index = rope.byte_to_line(offset);
14+
let line_offset = rope.line_to_byte(line_index);
15+
// Get column number
16+
let column_index = source_text[line_offset..offset].encode_utf16().count();
17+
// line and column are zero-indexed, but we want 1-indexed
18+
(line_index + 1, column_index + 1)
19+
}
20+
21+
#[test]
22+
fn empty_file() {
23+
assert_eq!(get_line_column(0, ""), (1, 1));
24+
}
25+
26+
#[test]
27+
fn first_line_start() {
28+
assert_eq!(get_line_column(0, "foo\nbar\n"), (1, 1));
29+
}
30+
31+
#[test]
32+
fn first_line_middle() {
33+
assert_eq!(get_line_column(5, "blahblahblah\noops\n"), (1, 6));
34+
}
35+
36+
#[test]
37+
fn later_line_start() {
38+
assert_eq!(get_line_column(8, "foo\nbar\nblahblahblah"), (3, 1));
39+
}
40+
41+
#[test]
42+
fn later_line_middle() {
43+
assert_eq!(get_line_column(12, "foo\nbar\nblahblahblah"), (3, 5));
44+
}
45+
46+
#[test]
47+
fn after_2_byte_unicode() {
48+
assert_eq!("£".len(), 2);
49+
assert_eq!(utf16_len("£"), 1);
50+
assert_eq!(get_line_column(4, "£abc"), (1, 4));
51+
}
52+
53+
#[test]
54+
fn after_3_byte_unicode() {
55+
assert_eq!("अ".len(), 3);
56+
assert_eq!(utf16_len("अ"), 1);
57+
assert_eq!(get_line_column(5, "अabc"), (1, 4));
58+
}
59+
60+
#[test]
61+
fn after_4_byte_unicode() {
62+
assert_eq!("🍄".len(), 4);
63+
assert_eq!(utf16_len("🍄"), 2);
64+
assert_eq!(get_line_column(6, "🍄abc"), (1, 5));
65+
}
66+
67+
#[test]
68+
fn after_2_byte_unicode_on_previous_line() {
69+
assert_eq!("£".len(), 2);
70+
assert_eq!(utf16_len("£"), 1);
71+
assert_eq!(get_line_column(4, "£\nabc"), (2, 2));
72+
}
73+
74+
#[test]
75+
fn after_3_byte_unicode_on_previous_line() {
76+
assert_eq!("अ".len(), 3);
77+
assert_eq!(utf16_len("अ"), 1);
78+
assert_eq!(get_line_column(5, "अ\nabc"), (2, 2));
79+
}
80+
81+
#[test]
82+
fn after_4_byte_unicode_on_previous_line() {
83+
assert_eq!("🍄".len(), 4);
84+
assert_eq!(utf16_len("🍄"), 2);
85+
assert_eq!(get_line_column(6, "🍄\nabc"), (2, 2));
86+
}
87+
88+
#[cfg(test)]
89+
fn utf16_len(s: &str) -> usize {
90+
s.encode_utf16().count()
1491
}

0 commit comments

Comments
 (0)