|
| 1 | +//! Convert UTF-8 span offsets to UTF-16. |
| 2 | +
|
| 3 | +use oxc_span::Span; |
| 4 | + |
| 5 | +use crate::{ast::Program, visit::VisitMut}; |
| 6 | + |
| 7 | +/// Convert UTF-8 span offsets to UTF-16. |
| 8 | +pub struct Utf8ToUtf16 { |
| 9 | + translations: Vec<Translation>, |
| 10 | +} |
| 11 | + |
/// One entry in the UTF-8 -> UTF-16 offset translation table.
// NOTE(review): `align(8)` presumably lets the two `u32` fields be handled as a single
// 8-byte unit - confirm the intent with the author.
#[derive(Clone, Copy)]
#[repr(align(8))]
struct Translation {
    /// UTF-8 byte offset from which this entry applies.
    /// An entry is used for every offset `>=` this value, up to the next entry's offset.
    utf8_offset: u32,
    /// Amount to subtract from a UTF-8 byte offset covered by this entry
    /// to obtain the corresponding UTF-16 offset.
    utf16_difference: u32,
}
| 21 | + |
| 22 | +impl Utf8ToUtf16 { |
| 23 | + /// Create new `Utf8ToUtf16` converter. |
| 24 | + #[expect(clippy::new_without_default)] |
| 25 | + pub fn new() -> Self { |
| 26 | + let mut translations = Vec::with_capacity(16); |
| 27 | + translations.push(Translation { utf8_offset: 0, utf16_difference: 0 }); |
| 28 | + Self { translations } |
| 29 | + } |
| 30 | + |
| 31 | + /// Convert all spans in the AST to UTF-16. |
| 32 | + pub fn convert(mut self, program: &mut Program<'_>) { |
| 33 | + self.build_table(program.source_text); |
| 34 | + // Skip if source is entirely ASCII |
| 35 | + if self.translations.len() == 1 { |
| 36 | + return; |
| 37 | + } |
| 38 | + self.visit_program(program); |
| 39 | + for comment in &mut program.comments { |
| 40 | + self.convert_span(&mut comment.span); |
| 41 | + } |
| 42 | + } |
| 43 | + |
| 44 | + #[allow(clippy::cast_possible_truncation)] |
| 45 | + fn build_table(&mut self, source_text: &str) { |
| 46 | + // Translation from UTF-8 byte offset to UTF-16 char offset: |
| 47 | + // |
| 48 | + // * 1-byte UTF-8 sequence |
| 49 | + // = 1st byte 0xxxxxxx (0 - 0x7F) |
| 50 | + // -> 1 x UTF-16 char |
| 51 | + // UTF-16 len = UTF-8 len |
| 52 | + // * 2-byte UTF-8 sequence |
| 53 | + // = 1st byte 110xxxxx (0xC0 - 0xDF), remaining bytes 10xxxxxx (0x80 - 0xBF) |
| 54 | + // -> 1 x UTF-16 |
| 55 | + // UTF-16 len = UTF-8 len - 1 |
| 56 | + // * 3-byte UTF-8 sequence |
| 57 | + // = 1st byte 1110xxxx (0xE0 - 0xEF), remaining bytes 10xxxxxx (0x80 - 0xBF) |
| 58 | + // -> 1 x UTF-16 |
| 59 | + // UTF-16 len = UTF-8 len - 2 |
| 60 | + // * 4-byte UTF-8 sequence |
| 61 | + // = 1st byte 1111xxxx (0xF0 - 0xFF), remaining bytes 10xxxxxx (0x80 - 0xBF) |
| 62 | + // -> 2 x UTF-16 |
| 63 | + // UTF-16 len = UTF-8 len - 2 |
| 64 | + // |
| 65 | + // So UTF-16 offset = UTF-8 offset - count of bytes `>= 0xC0` - count of bytes `>= 0xE0` |
| 66 | + let mut utf16_difference = 0; |
| 67 | + for (utf8_offset, &byte) in source_text.as_bytes().iter().enumerate() { |
| 68 | + if byte >= 0xC0 { |
| 69 | + let difference_for_this_byte = u32::from(byte >= 0xE0) + 1; |
| 70 | + utf16_difference += difference_for_this_byte; |
| 71 | + // Record `utf8_offset + 1` not `utf8_offset`, because it's only offsets *after* this |
| 72 | + // Unicode character that need to be shifted |
| 73 | + self.translations |
| 74 | + .push(Translation { utf8_offset: utf8_offset as u32 + 1, utf16_difference }); |
| 75 | + } |
| 76 | + } |
| 77 | + } |
| 78 | + |
| 79 | + fn convert_span(&self, span: &mut Span) { |
| 80 | + span.start = self.convert_offset(span.start); |
| 81 | + span.end = self.convert_offset(span.end); |
| 82 | + } |
| 83 | + |
| 84 | + fn convert_offset(&self, utf8_offset: u32) -> u32 { |
| 85 | + // Find the first entry in table *after* the UTF-8 offset. |
| 86 | + // The difference we need to subtract is recorded in the entry prior to it. |
| 87 | + let index = |
| 88 | + self.translations.partition_point(|translation| translation.utf8_offset <= utf8_offset); |
| 89 | + // First entry in table is `0, 0`. `partition_point` finds the first entry where |
| 90 | + // `utf8_offset < translation.utf8_offset` (or `translations.len()` if none exists). |
| 91 | + // So guaranteed `index > 0`, and `index <= translations.len()`. |
| 92 | + // Therefore `index - 1` cannot wrap around, and cannot be out of bounds. |
| 93 | + let translation = self.translations[index - 1]; |
| 94 | + utf8_offset - translation.utf16_difference |
| 95 | + } |
| 96 | +} |
| 97 | + |
| 98 | +impl VisitMut<'_> for Utf8ToUtf16 { |
| 99 | + fn visit_span(&mut self, span: &mut Span) { |
| 100 | + self.convert_span(span); |
| 101 | + } |
| 102 | +} |
| 103 | + |
#[cfg(test)]
mod test {
    use oxc_allocator::Allocator;
    use oxc_span::{GetSpan, SourceType, Span};

    use crate::{
        ast::{Expression, Statement},
        AstBuilder, Comment, CommentKind,
    };

    use super::Utf8ToUtf16;

    /// Spans of the program, a statement, a nested expression and a comment
    /// are all translated by `convert`.
    #[test]
    fn translate_ast() {
        let allocator = Allocator::new();
        let ast = AstBuilder::new(&allocator);

        // Hand-built AST for the source text below. All spans are UTF-8 byte offsets.
        let comments = ast.vec1(Comment::new(8, 15, CommentKind::Line));
        let string_literal = ast.expression_string_literal(Span::new(1, 7), "🤨", None);
        let body = ast.vec_from_array([
            ast.statement_empty(Span::new(0, 1)),
            ast.statement_expression(Span::new(1, 7), string_literal),
        ]);
        let mut program = ast.program(
            Span::new(0, 15),
            SourceType::default(),
            ";'🤨' // 🤨",
            comments,
            None,
            ast.vec(),
            body,
        );

        Utf8ToUtf16::new().convert(&mut program);

        assert_eq!(program.span, Span::new(0, 11));

        let statement = &program.body[1];
        assert_eq!(statement.span(), Span::new(1, 5));
        let Statement::ExpressionStatement(expr_stmt) = statement else { unreachable!() };
        let Expression::StringLiteral(literal) = &expr_stmt.expression else { unreachable!() };
        assert_eq!(literal.span, Span::new(1, 5));

        assert_eq!(program.comments[0].span, Span::new(6, 11));
    }

    /// Offsets on character boundaries translate correctly for 1, 2, 3 and 4-byte characters.
    #[test]
    fn translate_offsets() {
        // Sanity-check the encoded lengths of the sample characters used in the cases below
        for (c, utf8_len, utf16_len) in [('_', 1, 1), ('£', 2, 1), ('ऊ', 3, 1), ('🤨', 4, 2)] {
            assert_eq!(c.len_utf8(), utf8_len);
            assert_eq!(c.len_utf16(), utf16_len);
        }

        // Each case is `(source text, [(UTF-8 offset, expected UTF-16 offset)])`
        let cases: &[(&str, &[(u32, u32)])] = &[
            // 1-byte
            ("_", &[(0, 0), (1, 1)]),
            // 2-byte
            ("£", &[(0, 0), (2, 1)]),
            ("£_", &[(0, 0), (2, 1), (3, 2)]),
            ("_£", &[(0, 0), (1, 1), (3, 2)]),
            ("_£_", &[(0, 0), (1, 1), (3, 2), (4, 3)]),
            ("_££_", &[(0, 0), (1, 1), (3, 2), (5, 3), (6, 4)]),
            ("_£_£_", &[(0, 0), (1, 1), (3, 2), (4, 3), (6, 4), (7, 5)]),
            // 3-byte
            ("ऊ", &[(0, 0), (3, 1)]),
            ("ऊ_", &[(0, 0), (3, 1), (4, 2)]),
            ("_ऊ", &[(0, 0), (1, 1), (4, 2)]),
            ("_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3)]),
            ("_ऊऊ_", &[(0, 0), (1, 1), (4, 2), (7, 3), (8, 4)]),
            ("_ऊ_ऊ_", &[(0, 0), (1, 1), (4, 2), (5, 3), (8, 4), (9, 5)]),
            // 4-byte
            ("🤨", &[(0, 0), (4, 2)]),
            ("🤨_", &[(0, 0), (4, 2), (5, 3)]),
            ("_🤨", &[(0, 0), (1, 1), (5, 3)]),
            ("_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4)]),
            ("_🤨🤨_", &[(0, 0), (1, 1), (5, 3), (9, 5), (10, 6)]),
            ("_🤨_🤨_", &[(0, 0), (1, 1), (5, 3), (6, 4), (10, 6), (11, 7)]),
        ];

        for &(text, expected) in cases {
            let mut converter = Utf8ToUtf16::new();
            converter.build_table(text);
            for &(utf8_offset, utf16_offset) in expected {
                assert_eq!(converter.convert_offset(utf8_offset), utf16_offset);
            }
        }
    }

    // Check assumptions about how many UTF-16 chars result from different UTF-8 character
    // sequences, which are relied on by `build_table`
    #[test]
    fn char_lengths() {
        /// Assert that `c` encodes to exactly `utf8_bytes` in UTF-8,
        /// and to `utf16_len` characters in UTF-16.
        fn check(c: char, utf8_bytes: &[u8], utf16_len: usize) {
            assert_eq!(c.len_utf8(), utf8_bytes.len());
            assert_eq!(c.len_utf16(), utf16_len);
            let mut buffer = [0; 4];
            let encoded = c.encode_utf8(&mut buffer).as_bytes();
            assert!(utf8_bytes == encoded);
        }

        // Smallest and largest 1-byte characters: 1 x UTF-16 character each.
        // First byte is 0x00 - 0x7F.
        check(char::from_u32(0).unwrap(), &[0x00], 1);
        check(char::from_u32(0x7F).unwrap(), &[0x7F], 1);

        // Smallest and largest 2-byte characters: 1 x UTF-16 character each.
        // First byte is 0xC2 - 0xDF.
        check(char::from_u32(0x80).unwrap(), &[0xC2, 0x80], 1);
        check(char::from_u32(0x7FF).unwrap(), &[0xDF, 0xBF], 1);

        // Smallest and largest 3-byte characters: 1 x UTF-16 character each.
        // First byte is 0xE0 - 0xEF.
        check(char::from_u32(0x800).unwrap(), &[0xE0, 0xA0, 0x80], 1);
        check(char::from_u32(0xFFFF).unwrap(), &[0xEF, 0xBF, 0xBF], 1);

        // Smallest and largest 4-byte characters: 2 x UTF-16 characters each.
        // First byte is 0xF0 - 0xF4.
        check(char::from_u32(0x10000).unwrap(), &[0xF0, 0x90, 0x80, 0x80], 2);
        check(char::MAX, &[0xF4, 0x8F, 0xBF, 0xBF], 2);
    }
}
0 commit comments