fix(transformer): JSX source: calculate correct column when line contains Unicode chars (#3615)

overlookmotel · overlookmotel · commit 8d237c49a97d · 2024-06-11T06:41:18.000Z
Fix column number in JSX source transform, and add tests.

The column number was already correct in all cases except when a Unicode character with a code point above `0xFFFF` appears earlier on the same line.

Such characters are:

* 4 bytes in UTF-8.
* 2 code units in UTF-16 (a surrogate pair).
* 1 `char` in Rust.

Babel (which we're trying to match) uses the count of UTF-16 code units for the column number, whereas we were using the count of Rust `char`s.
diff --git a/crates/oxc_transformer/src/react/jsx_source.rs b/crates/oxc_transformer/src/react/jsx_source.rs
@@ -85,6 +85,9 @@ impl<'a> ReactJsxSource<'a> {
         let key = JSXAttributeName::Identifier(
             self.ctx.ast.alloc(self.ctx.ast.jsx_identifier(SPAN, SOURCE.into())),
         );
+        // TODO: We shouldn't calculate line + column from scratch each time as it's expensive.
+        // Build a table of byte indexes of each line's start on first usage, and save it.
+        // Then calculate line and column from that.
         let (line, column) = get_line_column(elem.span.start, self.ctx.source_text);
         let object = self.get_source_object(line, column, ctx);
         let expr = self.ctx.ast.jsx_expression_container(SPAN, JSXExpression::from(object));
diff --git a/crates/oxc_transformer/src/react/utils.rs b/crates/oxc_transformer/src/react/utils.rs
@@ -1,14 +1,91 @@
 use ropey::Rope;
 
-/// Get line and column from offset and source text
+/// Get line and column from offset and source text.
+///
+/// Line number starts at 1.
+/// Column number is in UTF-16 code units, and starts at 1.
+///
+/// This matches Babel's output.
 pub fn get_line_column(offset: u32, source_text: &str) -> (usize, usize) {
     let offset = offset as usize;
     let rope = Rope::from_str(source_text);
-    let line = rope.byte_to_line(offset);
-    let first_char_of_line = rope.line_to_char(line);
-    // Original offset is byte, but Rope uses char offset
-    let offset = rope.byte_to_char(offset);
-    let column = offset - first_char_of_line;
-    // line and column is zero-indexed, but we want 1-indexed
-    (line + 1, column + 1)
+    // Get line number and byte offset of start of line
+    let line_index = rope.byte_to_line(offset);
+    let line_offset = rope.line_to_byte(line_index);
+    // Get column number
+    let column_index = source_text[line_offset..offset].encode_utf16().count();
+    // line and column are zero-indexed, but we want 1-indexed
+    (line_index + 1, column_index + 1)
+}
+
+#[test]
+fn empty_file() {
+    assert_eq!(get_line_column(0, ""), (1, 1));
+}
+
+#[test]
+fn first_line_start() {
+    assert_eq!(get_line_column(0, "foo\nbar\n"), (1, 1));
+}
+
+#[test]
+fn first_line_middle() {
+    assert_eq!(get_line_column(5, "blahblahblah\noops\n"), (1, 6));
+}
+
+#[test]
+fn later_line_start() {
+    assert_eq!(get_line_column(8, "foo\nbar\nblahblahblah"), (3, 1));
+}
+
+#[test]
+fn later_line_middle() {
+    assert_eq!(get_line_column(12, "foo\nbar\nblahblahblah"), (3, 5));
+}
+
+#[test]
+fn after_2_byte_unicode() {
+    assert_eq!("£".len(), 2);
+    assert_eq!(utf16_len("£"), 1);
+    assert_eq!(get_line_column(4, "£abc"), (1, 4));
+}
+
+#[test]
+fn after_3_byte_unicode() {
+    assert_eq!("अ".len(), 3);
+    assert_eq!(utf16_len("अ"), 1);
+    assert_eq!(get_line_column(5, "अabc"), (1, 4));
+}
+
+#[test]
+fn after_4_byte_unicode() {
+    assert_eq!("🍄".len(), 4);
+    assert_eq!(utf16_len("🍄"), 2);
+    assert_eq!(get_line_column(6, "🍄abc"), (1, 5));
+}
+
+#[test]
+fn after_2_byte_unicode_on_previous_line() {
+    assert_eq!("£".len(), 2);
+    assert_eq!(utf16_len("£"), 1);
+    assert_eq!(get_line_column(4, "£\nabc"), (2, 2));
+}
+
+#[test]
+fn after_3_byte_unicode_on_previous_line() {
+    assert_eq!("अ".len(), 3);
+    assert_eq!(utf16_len("अ"), 1);
+    assert_eq!(get_line_column(5, "अ\nabc"), (2, 2));
+}
+
+#[test]
+fn after_4_byte_unicode_on_previous_line() {
+    assert_eq!("🍄".len(), 4);
+    assert_eq!(utf16_len("🍄"), 2);
+    assert_eq!(get_line_column(6, "🍄\nabc"), (2, 2));
+}
+
+#[cfg(test)]
+fn utf16_len(s: &str) -> usize {
+    s.encode_utf16().count()
 }