diff --git a/Cargo.lock b/Cargo.lock
index 4407504f2a34..6c4e21af5ca2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -482,6 +482,23 @@ dependencies = [
  "biome_rowan",
 ]

+[[package]]
+name = "biome_html_parser"
+version = "0.0.0"
+dependencies = [
+ "biome_console",
+ "biome_diagnostics",
+ "biome_html_factory",
+ "biome_html_syntax",
+ "biome_parser",
+ "biome_rowan",
+ "biome_test_utils",
+ "biome_unicode_table",
+ "insta",
+ "tests_macros",
+ "tracing",
+]
+
 [[package]]
 name = "biome_html_syntax"
 version = "0.5.7"
diff --git a/Cargo.toml b/Cargo.toml
index 3b4c55e50eb6..c9486eea4719 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -116,6 +116,7 @@ biome_graphql_syntax = { version = "0.1.0", path = "./crates/biome_graph
 biome_grit_factory = { version = "0.5.7", path = "./crates/biome_grit_factory" }
 biome_grit_parser = { version = "0.1.0", path = "./crates/biome_grit_parser" }
 biome_grit_syntax = { version = "0.5.7", path = "./crates/biome_grit_syntax" }
+biome_html_factory = { version = "0.5.7", path = "./crates/biome_html_factory" }
 biome_html_syntax = { version = "0.5.7", path = "./crates/biome_html_syntax" }
 biome_js_analyze = { version = "0.5.7", path = "./crates/biome_js_analyze" }
 biome_js_factory = { version = "0.5.7", path = "./crates/biome_js_factory" }
diff --git a/crates/biome_css_parser/Cargo.toml b/crates/biome_css_parser/Cargo.toml
index 762b3cf75038..5499aebcd3a5 100644
--- a/crates/biome_css_parser/Cargo.toml
+++ b/crates/biome_css_parser/Cargo.toml
@@ -27,9 +27,5 @@ quickcheck = { workspace = true }
 quickcheck_macros = { workspace = true }
 tests_macros = { path = "../tests_macros" }

-# cargo-workspaces metadata
-[package.metadata.workspaces]
-independent = true
-
 [lints]
 workspace = true
diff --git a/crates/biome_css_parser/src/lexer/mod.rs b/crates/biome_css_parser/src/lexer/mod.rs
index 3d747472d650..e6631ed9e8d2 100644
--- a/crates/biome_css_parser/src/lexer/mod.rs
+++ b/crates/biome_css_parser/src/lexer/mod.rs
@@ -5,7 +5,9 @@ mod tests;
 use crate::CssParserOptions;
 use biome_css_syntax::{CssSyntaxKind, CssSyntaxKind::*, TextLen, TextSize, T};
 use biome_parser::diagnostic::ParseDiagnostic;
-use biome_parser::lexer::{LexContext, Lexer, LexerCheckpoint, TokenFlags};
+use biome_parser::lexer::{
+    LexContext, Lexer, LexerCheckpoint, LexerWithCheckpoint, ReLexer, TokenFlags,
+};
 use biome_unicode_table::{
     is_css_id_continue, is_css_id_start, lookup_byte, Dispatch, Dispatch::*,
 };
@@ -78,6 +80,9 @@ pub(crate) struct CssLexer<'src> {
 }

 impl<'src> Lexer<'src> for CssLexer<'src> {
+    const NEWLINE: Self::Kind = NEWLINE;
+
+    const WHITESPACE: Self::Kind = WHITESPACE;
     type Kind = CssSyntaxKind;
     type LexContext = CssLexContext;
     type ReLexContext = CssReLexContext;
@@ -102,18 +107,6 @@ impl<'src> Lexer<'src> for CssLexer<'src> {
         self.diagnostics.push(diagnostic);
     }

-    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind> {
-        LexerCheckpoint {
-            position: TextSize::from(self.position as u32),
-            current_start: self.current_start,
-            current_flags: self.current_flags,
-            current_kind: self.current_kind,
-            after_line_break: self.after_newline,
-            unicode_bom_length: self.unicode_bom_length,
-            diagnostics_pos: self.diagnostics.len() as u32,
-        }
-    }
-
     fn next_token(&mut self, context: Self::LexContext) -> Self::Kind {
         self.current_start = self.text_position();
         self.current_flags = TokenFlags::empty();
@@ -140,25 +133,6 @@ impl<'src> Lexer<'src> for CssLexer<'src> {
         kind
     }

-    fn re_lex(&mut self, _context: Self::ReLexContext) -> Self::Kind {
-        let old_position = self.position;
-        self.position = u32::from(self.current_start) as usize;
-
-        let re_lexed_kind = match self.current_byte() {
-            Some(current) => self.consume_selector_token(current),
-            None => EOF,
-        };
-
-        if self.current() == re_lexed_kind {
-            // Didn't re-lex anything. Return existing token again
-            self.position = old_position;
-        } else {
-            self.current_kind = re_lexed_kind;
-        }
-
-        re_lexed_kind
-    }
-
     fn has_preceding_line_break(&self) -> bool {
         self.current_flags.has_preceding_line_break()
     }
@@ -197,20 +171,6 @@ impl<'src> Lexer<'src> for CssLexer<'src> {
         self.current_flags
     }

-    /// Consume one newline or all whitespace until a non-whitespace or a newline is found.
-    ///
-    /// ## Safety
-    /// Must be called at a valid UT8 char boundary
-    fn consume_newline_or_whitespaces(&mut self) -> Self::Kind {
-        if self.consume_newline() {
-            self.after_newline = true;
-            NEWLINE
-        } else {
-            self.consume_whitespaces();
-            WHITESPACE
-        }
-    }
-
     #[inline]
     fn advance_char_unchecked(&mut self) {
         let c = self.current_char_unchecked();
@@ -314,7 +274,13 @@ impl<'src> CssLexer<'src> {
         let dispatched = lookup_byte(current);

         match dispatched {
-            WHS => self.consume_newline_or_whitespaces(),
+            WHS => {
+                let kind = self.consume_newline_or_whitespaces();
+                if kind == Self::NEWLINE {
+                    self.after_newline = true;
+                }
+                kind
+            }
             QOT => self.consume_string_literal(current),
             SLH => self.consume_slash(),
@@ -1268,6 +1234,42 @@ impl<'src> CssLexer<'src> {
             }
         }
     }
 }
+
+impl<'src> ReLexer<'src> for CssLexer<'src> {
+    fn re_lex(&mut self, _context: Self::ReLexContext) -> Self::Kind {
+        let old_position = self.position;
+        self.position = u32::from(self.current_start) as usize;
+
+        let re_lexed_kind = match self.current_byte() {
+            Some(current) => self.consume_selector_token(current),
+            None => EOF,
+        };
+
+        if self.current() == re_lexed_kind {
+            // Didn't re-lex anything. Return existing token again
+            self.position = old_position;
+        } else {
+            self.current_kind = re_lexed_kind;
+        }
+
+        re_lexed_kind
+    }
+}
+
+impl<'src> LexerWithCheckpoint<'src> for CssLexer<'src> {
+    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind> {
+        LexerCheckpoint {
+            position: TextSize::from(self.position as u32),
+            current_start: self.current_start,
+            current_flags: self.current_flags,
+            current_kind: self.current_kind,
+            after_line_break: self.after_newline,
+            unicode_bom_length: self.unicode_bom_length,
+            diagnostics_pos: self.diagnostics.len() as u32,
+        }
+    }
+}
+
 #[derive(Copy, Clone, Debug)]
 enum LexStringState {
     /// String that contains an invalid escape sequence
diff --git a/crates/biome_grit_parser/src/lexer/mod.rs b/crates/biome_grit_parser/src/lexer/mod.rs
index 0cfec0c48ff6..690f9f9835d4 100644
--- a/crates/biome_grit_parser/src/lexer/mod.rs
+++ b/crates/biome_grit_parser/src/lexer/mod.rs
@@ -34,6 +34,8 @@ pub(crate) struct GritLexer<'src> {
 }

 impl<'src> Lexer<'src> for GritLexer<'src> {
+    const NEWLINE: Self::Kind = NEWLINE;
+    const WHITESPACE: Self::Kind = WHITESPACE;

     type Kind = GritSyntaxKind;
     type LexContext = ();
@@ -51,10 +53,6 @@ impl<'src> Lexer<'src> for GritLexer<'src> {
         self.current_start
     }

-    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind> {
-        unimplemented!("Grit lexer doesn't support checkpoints");
-    }
-
     fn next_token(&mut self, _context: Self::LexContext) -> Self::Kind {
         self.current_start = self.text_position();

@@ -79,11 +77,6 @@ impl<'src> Lexer<'src> for GritLexer<'src> {

         kind
     }
-
-    fn re_lex(&mut self, _context: Self::ReLexContext) -> Self::Kind {
-        unimplemented!("Grit lexer doesn't support re-lexing");
-    }
-
     fn has_preceding_line_break(&self) -> bool {
         self.after_newline
     }
@@ -108,16 +101,6 @@ impl<'src> Lexer<'src> for GritLexer<'src> {
         self.diagnostics.push(diagnostic);
     }

-    fn consume_newline_or_whitespaces(&mut self) -> Self::Kind {
-        if self.consume_newline() {
-            self.after_newline = true;
-            NEWLINE
-        } else {
-            self.consume_whitespaces();
-            WHITESPACE
-        }
-    }
-
     #[inline]
     fn advance_char_unchecked(&mut self) {
         let c = self.current_char_unchecked();
diff --git a/crates/biome_grit_parser/src/parser/mod.rs b/crates/biome_grit_parser/src/parser/mod.rs
index 3bfe0b52a524..64a29527f97b 100644
--- a/crates/biome_grit_parser/src/parser/mod.rs
+++ b/crates/biome_grit_parser/src/parser/mod.rs
@@ -78,7 +78,7 @@ impl<'source> Parser for GritParser<'source> {
     }
 }

-pub(crate) fn parse_root(p: &mut GritParser) -> CompletedMarker {
+pub(crate) fn parse_root(p: &mut GritParser) {
     let m = p.start();

     p.eat(UNICODE_BOM);
@@ -90,7 +90,7 @@ pub(crate) fn parse_root(p: &mut GritParser) -> CompletedMarker {

     p.expect(EOF);

-    m.complete(p, GRIT_ROOT)
+    m.complete(p, GRIT_ROOT);
 }

 fn parse_version(p: &mut GritParser) -> ParsedSyntax {
diff --git a/crates/biome_html_factory/src/generated/node_factory.rs b/crates/biome_html_factory/src/generated/node_factory.rs
index ebb5167507cf..778699f90bd4 100644
--- a/crates/biome_html_factory/src/generated/node_factory.rs
+++ b/crates/biome_html_factory/src/generated/node_factory.rs
@@ -64,18 +64,64 @@ pub fn html_closing_element(
 pub fn html_directive(
     l_angle_token: SyntaxToken,
     excl_token: SyntaxToken,
-    content: HtmlString,
+    doctype_token: SyntaxToken,
     r_angle_token: SyntaxToken,
-) -> HtmlDirective {
-    HtmlDirective::unwrap_cast(SyntaxNode::new_detached(
-        HtmlSyntaxKind::HTML_DIRECTIVE,
-        [
-            Some(SyntaxElement::Token(l_angle_token)),
-            Some(SyntaxElement::Token(excl_token)),
-            Some(SyntaxElement::Node(content.into_syntax())),
-            Some(SyntaxElement::Token(r_angle_token)),
-        ],
-    ))
+) -> HtmlDirectiveBuilder {
+    HtmlDirectiveBuilder {
+        l_angle_token,
+        excl_token,
+        doctype_token,
+        r_angle_token,
+        html_token: None,
+        quirk_token: None,
+        public_id_token: None,
+        system_id_token: None,
+    }
+}
+pub struct HtmlDirectiveBuilder {
+    l_angle_token: SyntaxToken,
+    excl_token: SyntaxToken,
+    doctype_token: SyntaxToken,
+    r_angle_token: SyntaxToken,
+    html_token: Option<SyntaxToken>,
+    quirk_token: Option<SyntaxToken>,
+    public_id_token: Option<SyntaxToken>,
+    system_id_token: Option<SyntaxToken>,
+}
+impl HtmlDirectiveBuilder {
+    pub fn with_html_token(mut self, html_token: SyntaxToken) -> Self {
+        self.html_token = Some(html_token);
+        self
+    }
+    pub fn with_quirk_token(mut self, quirk_token: SyntaxToken) -> Self {
+        self.quirk_token = Some(quirk_token);
+        self
+    }
+    pub fn with_public_id_token(mut self, public_id_token: SyntaxToken) -> Self {
+        self.public_id_token = Some(public_id_token);
+        self
+    }
+    pub fn with_system_id_token(mut self, system_id_token: SyntaxToken) -> Self {
+        self.system_id_token = Some(system_id_token);
+        self
+    }
+    pub fn build(self) -> HtmlDirective {
+        HtmlDirective::unwrap_cast(SyntaxNode::new_detached(
+            HtmlSyntaxKind::HTML_DIRECTIVE,
+            [
+                Some(SyntaxElement::Token(self.l_angle_token)),
+                Some(SyntaxElement::Token(self.excl_token)),
+                Some(SyntaxElement::Token(self.doctype_token)),
+                self.html_token.map(|token| SyntaxElement::Token(token)),
+                self.quirk_token.map(|token| SyntaxElement::Token(token)),
+                self.public_id_token
+                    .map(|token| SyntaxElement::Token(token)),
+                self.system_id_token
+                    .map(|token| SyntaxElement::Token(token)),
+                Some(SyntaxElement::Token(self.r_angle_token)),
+            ],
+        ))
+    }
 }
 pub fn html_element(
     opening_element: HtmlOpeningElement,
@@ -113,36 +159,42 @@ pub fn html_opening_element(
         ],
     ))
 }
-pub fn html_root(
-    directive: HtmlDirective,
-    tags: HtmlElementList,
-    eof_token: SyntaxToken,
-) -> HtmlRootBuilder {
+pub fn html_root(eof_token: SyntaxToken) -> HtmlRootBuilder {
     HtmlRootBuilder {
-        directive,
-        tags,
         eof_token,
         bom_token: None,
+        directive: None,
+        html: None,
     }
 }
 pub struct HtmlRootBuilder {
-    directive: HtmlDirective,
-    tags: HtmlElementList,
     eof_token: SyntaxToken,
     bom_token: Option<SyntaxToken>,
+    directive: Option<HtmlDirective>,
+    html: Option<HtmlElement>,
 }
 impl HtmlRootBuilder {
     pub fn with_bom_token(mut self, bom_token: SyntaxToken) -> Self {
         self.bom_token = Some(bom_token);
         self
     }
+    pub fn with_directive(mut self, directive: HtmlDirective) -> Self {
+        self.directive = Some(directive);
+        self
+    }
+    pub fn with_html(mut self, html: HtmlElement) -> Self {
+        self.html = Some(html);
+        self
+    }
     pub fn build(self) -> HtmlRoot {
         HtmlRoot::unwrap_cast(SyntaxNode::new_detached(
             HtmlSyntaxKind::HTML_ROOT,
             [
                 self.bom_token.map(|token| SyntaxElement::Token(token)),
-                Some(SyntaxElement::Node(self.directive.into_syntax())),
-                Some(SyntaxElement::Node(self.tags.into_syntax())),
+                self.directive
+                    .map(|token| SyntaxElement::Node(token.into_syntax())),
+                self.html
+                    .map(|token| SyntaxElement::Node(token.into_syntax())),
                 Some(SyntaxElement::Token(self.eof_token)),
             ],
         ))
diff --git a/crates/biome_html_factory/src/generated/syntax_factory.rs b/crates/biome_html_factory/src/generated/syntax_factory.rs
index beab1db6a5ba..95b0b19a24c6 100644
--- a/crates/biome_html_factory/src/generated/syntax_factory.rs
+++ b/crates/biome_html_factory/src/generated/syntax_factory.rs
@@ -109,7 +109,7 @@ impl SyntaxFactory for HtmlSyntaxFactory {
             }
             HTML_DIRECTIVE => {
                 let mut elements = (&children).into_iter();
-                let mut slots: RawNodeSlots<4usize> = RawNodeSlots::default();
+                let mut slots: RawNodeSlots<8usize> = RawNodeSlots::default();
                 let mut current_element = elements.next();
                 if let Some(element) = &current_element {
                     if element.kind() == T ! [<] {
@@ -126,7 +126,35 @@ impl SyntaxFactory for HtmlSyntaxFactory {
                 }
                 slots.next_slot();
                 if let Some(element) = &current_element {
-                    if HtmlString::can_cast(element.kind()) {
+                    if element.kind() == T![doctype] {
+                        slots.mark_present();
+                        current_element = elements.next();
+                    }
+                }
+                slots.next_slot();
+                if let Some(element) = &current_element {
+                    if element.kind() == T![html] {
+                        slots.mark_present();
+                        current_element = elements.next();
+                    }
+                }
+                slots.next_slot();
+                if let Some(element) = &current_element {
+                    if element.kind() == HTML_LITERAL {
+                        slots.mark_present();
+                        current_element = elements.next();
+                    }
+                }
+                slots.next_slot();
+                if let Some(element) = &current_element {
+                    if element.kind() == HTML_STRING_LITERAL {
+                        slots.mark_present();
+                        current_element = elements.next();
+                    }
+                }
+                slots.next_slot();
+                if let Some(element) = &current_element {
+                    if element.kind() == HTML_STRING_LITERAL {
                         slots.mark_present();
                         current_element = elements.next();
                     }
@@ -258,7 +286,7 @@ impl SyntaxFactory for HtmlSyntaxFactory {
                 }
                 slots.next_slot();
                 if let Some(element) = &current_element {
-                    if HtmlElementList::can_cast(element.kind()) {
+                    if HtmlElement::can_cast(element.kind()) {
                         slots.mark_present();
                         current_element = elements.next();
                     }
diff --git a/crates/biome_html_parser/Cargo.toml b/crates/biome_html_parser/Cargo.toml
new file mode 100644
index 000000000000..7ae7af1b9e74
--- /dev/null
+++ b/crates/biome_html_parser/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+authors.workspace = true
+categories.workspace = true
+description = ""
+edition.workspace = true
+homepage.workspace = true
+keywords.workspace = true
+license.workspace = true
+name = "biome_html_parser"
+repository.workspace = true
+version = "0.0.0"
+
+[lints]
+workspace = true
+
+[dependencies]
+biome_console = { workspace = true }
+biome_diagnostics = { workspace = true }
+biome_html_factory = { workspace = true }
+biome_html_syntax = { workspace = true }
+biome_parser = { workspace = true }
+biome_rowan = { workspace = true }
+biome_unicode_table = { workspace = true }
+tracing = { workspace = true }
+
+
+[dev-dependencies]
+biome_test_utils = { path = "../biome_test_utils" }
+insta = { workspace = true }
+tests_macros = { path = "../tests_macros" }
diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs
new file mode 100644
index 000000000000..aaf4afcfac81
--- /dev/null
+++ b/crates/biome_html_parser/src/lexer/mod.rs
@@ -0,0 +1,161 @@
+mod tests;
+
+use biome_html_syntax::HtmlSyntaxKind::{
+    EOF, ERROR_TOKEN, NEWLINE, TOMBSTONE, UNICODE_BOM, WHITESPACE,
+};
+use biome_html_syntax::{HtmlSyntaxKind, TextLen, TextSize, T};
+use biome_parser::diagnostic::ParseDiagnostic;
+use biome_parser::lexer::{Lexer, LexerCheckpoint, TokenFlags};
+
+pub(crate) struct HtmlLexer<'src> {
+    /// Source text
+    source: &'src str,
+
+    /// The start byte position in the source text of the next token.
+    position: usize,
+
+    current_kind: HtmlSyntaxKind,
+
+    current_start: TextSize,
+
+    diagnostics: Vec<ParseDiagnostic>,
+
+    current_flags: TokenFlags,
+
+    preceding_line_break: bool,
+    after_newline: bool,
+}
+
+impl<'src> HtmlLexer<'src> {
+    pub fn from_str(string: &'src str) -> Self {
+        Self {
+            source: string,
+            position: 0,
+            diagnostics: vec![],
+            current_start: TextSize::from(0),
+            current_kind: TOMBSTONE,
+            preceding_line_break: false,
+            after_newline: false,
+            current_flags: TokenFlags::empty(),
+        }
+    }
+}
+
+impl<'src> HtmlLexer<'src> {
+    fn consume_token(&mut self, current: u8) -> HtmlSyntaxKind {
+        match current {
+            b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
+            b'<' => self.consume_byte(T![<]),
+            b'>' => self.consume_byte(T![>]),
+            b'/' => self.consume_byte(T![/]),
+            b'!' => self.consume_byte(T![!]),
+            _ if self.position == 0 && self.consume_potential_bom(UNICODE_BOM).is_some() => {
+                UNICODE_BOM
+            }
+            _ => self.consume_unexpected_character(),
+        }
+    }
+
+    /// Bumps the current byte and creates a lexed token of the passed in kind.
+    #[inline]
+    fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
+        self.advance(1);
+        tok
+    }
+
+    fn consume_unexpected_character(&mut self) -> HtmlSyntaxKind {
+        self.assert_at_char_boundary();
+
+        let char = self.current_char_unchecked();
+        let err = ParseDiagnostic::new(
+            format!("Unexpected character `{}`", char),
+            self.text_position()..self.text_position() + char.text_len(),
+        );
+        self.diagnostics.push(err);
+        self.advance(char.len_utf8());
+
+        ERROR_TOKEN
+    }
+
+    /// Asserts that the lexer is at a UTF8 char boundary
+    #[inline]
+    fn assert_at_char_boundary(&self) {
+        debug_assert!(self.source.is_char_boundary(self.position));
+    }
+}
+
+impl<'src> Lexer<'src> for HtmlLexer<'src> {
+    const NEWLINE: Self::Kind = NEWLINE;
+    const WHITESPACE: Self::Kind = WHITESPACE;
+    type Kind = HtmlSyntaxKind;
+    type LexContext = ();
+    type ReLexContext = ();
+
+    fn source(&self) -> &'src str {
+        self.source
+    }
+
+    fn current(&self) -> Self::Kind {
+        self.current_kind
+    }
+
+    fn current_start(&self) -> TextSize {
+        self.current_start
+    }
+
+    fn next_token(&mut self, _context: Self::LexContext) -> Self::Kind {
+        self.current_start = TextSize::from(self.position as u32);
+        self.current_flags = TokenFlags::empty();
+
+        let kind = if self.is_eof() {
+            EOF
+        } else {
+            match self.current_byte() {
+                Some(current) => self.consume_token(current),
+                None => EOF,
+            }
+        };
+
+        self.current_flags
+            .set(TokenFlags::PRECEDING_LINE_BREAK, self.after_newline);
+        self.current_kind = kind;
+
+        if !kind.is_trivia() {
+            self.after_newline = false;
+        }
+
+        kind
+    }
+    fn has_preceding_line_break(&self) -> bool {
+        self.preceding_line_break
+    }
+
+    fn has_unicode_escape(&self) -> bool {
+        self.current_flags.has_unicode_escape()
+    }
+
+    fn rewind(&mut self, _checkpoint: LexerCheckpoint<Self::Kind>) {
+        unreachable!("no need")
+    }
+
+    fn finish(self) -> Vec<ParseDiagnostic> {
+        self.diagnostics
+    }
+
+    fn position(&self) -> usize {
+        self.position
+    }
+
+    fn push_diagnostic(&mut self, diagnostic: ParseDiagnostic) {
+        self.diagnostics.push(diagnostic);
+    }
+
+    fn advance_char_unchecked(&mut self) {
+        let c = self.current_char_unchecked();
+        self.position += c.len_utf8();
+    }
+
+    fn advance(&mut self, n: usize) {
+        self.position += n;
+    }
+}
diff --git a/crates/biome_html_parser/src/lexer/tests.rs b/crates/biome_html_parser/src/lexer/tests.rs
new file mode 100644
index 000000000000..2dadd18ceea1
--- /dev/null
+++ b/crates/biome_html_parser/src/lexer/tests.rs
@@ -0,0 +1,88 @@
+#![cfg(test)]
+#![allow(unused_mut, unused_variables, unused_assignments)]
+
+use super::{HtmlLexer, TextSize};
+use biome_html_syntax::HtmlSyntaxKind::{self, *};
+use biome_parser::lexer::Lexer;
+use biome_rowan::TextRange;
+
+pub struct Token {
+    kind: HtmlSyntaxKind,
+    range: TextRange,
+}
+
+impl Iterator for HtmlLexer<'_> {
+    type Item = Token;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let kind = self.next_token(());
+        if kind == EOF {
+            None
+        } else {
+            Some(Token {
+                kind,
+                range: self.current_range(),
+            })
+        }
+    }
+}
+
+// Assert the result of lexing a piece of source code,
+// and make sure the tokens yielded are fully lossless and the source can be reconstructed from only the tokens
+macro_rules! assert_lex {
+    ($src:expr, $($kind:ident:$len:expr $(,)?)*) => {{
+        let mut lexer = HtmlLexer::from_str($src);
+        let mut idx = 0;
+        let mut tok_idx = TextSize::default();
+
+        let mut new_str = String::with_capacity($src.len());
+        let tokens: Vec<_> = lexer.collect();
+
+        $(
+            assert_eq!(
+                tokens[idx].kind,
+                HtmlSyntaxKind::$kind,
+                "expected token kind {}, but found {:?}",
+                stringify!($kind),
+                tokens[idx].kind,
+            );
+
+            assert_eq!(
+                tokens[idx].range.len(),
+                TextSize::from($len),
+                "expected token length of {}, but found {:?} for token {:?}",
+                $len,
+                tokens[idx].range.len(),
+                tokens[idx].kind,
+            );
+
+            new_str.push_str(&$src[tokens[idx].range]);
+            tok_idx += tokens[idx].range.len();
+
+            idx += 1;
+        )*
+
+        if idx < tokens.len() {
+            panic!(
+                "expected {} tokens but lexer returned {}, first unexpected token is '{:?}'",
+                idx,
+                tokens.len(),
+                tokens[idx].kind
+            );
+        } else {
+            assert_eq!(idx, tokens.len());
+        }
+
+        assert_eq!($src, new_str, "Failed to reconstruct input");
+    }};
+}
+
+// TODO: fix
+#[test]
+#[ignore = "currently not handled"]
+fn doctype() {
+    assert_lex! {
+        "doctype",
+        ERROR_TOKEN:1,
+    }
+}
diff --git a/crates/biome_html_parser/src/lib.rs b/crates/biome_html_parser/src/lib.rs
new file mode 100644
index 000000000000..5e1b8522186a
--- /dev/null
+++ b/crates/biome_html_parser/src/lib.rs
@@ -0,0 +1,93 @@
+mod lexer;
+mod parser;
+mod syntax;
+mod token_source;
+
+use crate::parser::{HtmlLosslessTreeSink, HtmlParser};
+use crate::syntax::parse_root;
+use biome_html_syntax::{HtmlRoot, HtmlSyntaxNode};
+use biome_parser::diagnostic::ParseDiagnostic;
+use biome_rowan::{AstNode, NodeCache};
+
+/// Parses the provided string as an HTML program using the provided node cache.
+pub fn parse_html_with_cache(source: &str, cache: &mut NodeCache) -> HtmlParse {
+    tracing::debug_span!("Parsing phase").in_scope(move || {
+        let mut parser = HtmlParser::new(source);
+
+        parse_root(&mut parser);
+
+        let (events, diagnostics, trivia) = parser.finish();
+
+        let mut tree_sink = HtmlLosslessTreeSink::with_cache(source, &trivia, cache);
+        biome_parser::event::process(&mut tree_sink, events, diagnostics);
+        let (green, diagnostics) = tree_sink.finish();
+
+        HtmlParse::new(green, diagnostics)
+    })
+}
+pub fn parse_html(source: &str) -> HtmlParse {
+    let mut cache = NodeCache::default();
+    parse_html_with_cache(source, &mut cache)
+}
+
+/// A utility struct for managing the result of a parser job
+#[derive(Debug)]
+pub struct HtmlParse {
+    root: HtmlSyntaxNode,
+    diagnostics: Vec<ParseDiagnostic>,
+}
+
+impl HtmlParse {
+    pub fn new(root: HtmlSyntaxNode, diagnostics: Vec<ParseDiagnostic>) -> Self {
+        Self { root, diagnostics }
+    }
+
+    /// The syntax node represented by this Parse result
+    ///
+    /// ```
+    /// # use biome_html_parser::parse_html;
+    /// # use biome_html_syntax::HtmlSyntaxKind;
+    /// # use biome_rowan::{AstNode, AstNodeList, SyntaxError};
+    ///
+    /// # fn main() -> Result<(), SyntaxError> {
+    /// use biome_html_syntax::HtmlSyntaxKind;
+    /// // let parse = parse_html(r#""#);
+    ///
+    /// // Get the root value
+    /// // let root_value = parse.tree().html()?;
+    ///
+    /// // assert_eq!(root_value.syntax().kind(), HtmlSyntaxKind::HTML_ELEMENT);
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn syntax(&self) -> HtmlSyntaxNode {
+        self.root.clone()
+    }
+
+    /// Get the diagnostics which occurred when parsing
+    pub fn diagnostics(&self) -> &[ParseDiagnostic] {
+        &self.diagnostics
+    }
+
+    /// Get the diagnostics which occurred when parsing
+    pub fn into_diagnostics(self) -> Vec<ParseDiagnostic> {
+        self.diagnostics
+    }
+
+    /// Returns [true] if the parser encountered some errors during parsing.
+    pub fn has_errors(&self) -> bool {
+        self.diagnostics
+            .iter()
+            .any(|diagnostic| diagnostic.is_error())
+    }
+
+    /// Convert this parse result into a typed AST node.
+    ///
+    /// # Panics
+    ///
+    /// It panics if the node represented by this parse result mismatches.
+    pub fn tree(&self) -> HtmlRoot {
+        HtmlRoot::unwrap_cast(self.syntax())
+    }
+}
diff --git a/crates/biome_html_parser/src/parser.rs b/crates/biome_html_parser/src/parser.rs
new file mode 100644
index 000000000000..0c535d04d2d5
--- /dev/null
+++ b/crates/biome_html_parser/src/parser.rs
@@ -0,0 +1,61 @@
+use crate::token_source::HtmlTokenSource;
+use biome_html_factory::HtmlSyntaxFactory;
+use biome_html_syntax::{HtmlLanguage, HtmlSyntaxKind};
+use biome_parser::diagnostic::{merge_diagnostics, ParseDiagnostic};
+use biome_parser::event::Event;
+use biome_parser::prelude::*;
+use biome_parser::tree_sink::LosslessTreeSink;
+use biome_parser::{Parser, ParserContext};
+
+pub(crate) type HtmlLosslessTreeSink<'source> =
+    LosslessTreeSink<'source, HtmlLanguage, HtmlSyntaxFactory>;
+
+pub(crate) struct HtmlParser<'source> {
+    context: ParserContext<HtmlSyntaxKind>,
+    source: HtmlTokenSource<'source>,
+}
+
+impl<'source> HtmlParser<'source> {
+    pub fn new(source: &'source str) -> Self {
+        Self {
+            context: ParserContext::default(),
+            source: HtmlTokenSource::from_str(source),
+        }
+    }
+
+    pub fn finish(
+        self,
+    ) -> (
+        Vec<Event<HtmlSyntaxKind>>,
+        Vec<ParseDiagnostic>,
+        Vec<Trivia>,
+    ) {
+        let (trivia, lexer_diagnostics) = self.source.finish();
+        let (events, parse_diagnostics) = self.context.finish();
+
+        let diagnostics = merge_diagnostics(lexer_diagnostics, parse_diagnostics);
+
+        (events, diagnostics, trivia)
+    }
+}
+
+impl<'src> Parser for HtmlParser<'src> {
+    type Kind = HtmlSyntaxKind;
+    type Source = HtmlTokenSource<'src>;
+
+    fn context(&self) -> &ParserContext<Self::Kind> {
+        &self.context
+    }
+
+    fn context_mut(&mut self) -> &mut ParserContext<Self::Kind> {
+        &mut self.context
+    }
+
+    fn source(&self) -> &Self::Source {
+        &self.source
+    }
+
+    fn source_mut(&mut self) -> &mut Self::Source {
+        &mut self.source
+    }
+}
diff --git a/crates/biome_html_parser/src/syntax/mod.rs b/crates/biome_html_parser/src/syntax/mod.rs
new file mode 100644
index 000000000000..759a5c085b9f
--- /dev/null
+++ b/crates/biome_html_parser/src/syntax/mod.rs
@@ -0,0 +1,34 @@
+use crate::parser::HtmlParser;
+use biome_html_syntax::HtmlSyntaxKind::{HTML_DIRECTIVE, HTML_ROOT, UNICODE_BOM};
+use biome_html_syntax::T;
+use biome_parser::prelude::ParsedSyntax::Absent;
+use biome_parser::prelude::*;
+use biome_parser::Parser;
+
+pub(crate) fn parse_root(p: &mut HtmlParser) {
+    let m = p.start();
+
+    p.eat(UNICODE_BOM);
+
+    parse_doc_type(p).ok();
+
+    m.complete(p, HTML_ROOT);
+}
+
+fn parse_doc_type(p: &mut HtmlParser) -> ParsedSyntax {
+    if !p.at(T![<]) {
+        return Absent;
+    }
+
+    let m = p.start();
+    p.eat(T![<]);
+    p.bump(T![!]);
+
+    if p.at(T![doctype]) {
+        p.eat(T![doctype]);
+    }
+
+    p.eat(T![>]);
+
+    ParsedSyntax::Present(m.complete(p, HTML_DIRECTIVE))
+}
diff --git a/crates/biome_html_parser/src/token_source.rs b/crates/biome_html_parser/src/token_source.rs
new file mode 100644
index 000000000000..c3deace10579
--- /dev/null
+++ b/crates/biome_html_parser/src/token_source.rs
@@ -0,0 +1,129 @@
+use crate::lexer::HtmlLexer;
+use biome_html_syntax::HtmlSyntaxKind::{EOF, TOMBSTONE};
+use biome_html_syntax::{HtmlSyntaxKind, TextRange};
+use biome_parser::diagnostic::ParseDiagnostic;
+use biome_parser::lexer::{LexContext, Lexer};
+use biome_parser::token_source::{TokenSource, Trivia};
+use biome_rowan::TriviaPieceKind;
+
+pub(crate) struct HtmlTokenSource<'source> {
+    lexer: HtmlLexer<'source>,
+    trivia: Vec<Trivia>,
+    current: HtmlSyntaxKind,
+    current_range: TextRange,
+    preceding_line_break: bool,
+}
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Default)]
+pub enum HtmlLexContext {
+    /// Default context: no particular rules are applied to the lexer logic.
+    #[default]
+    Regular,
+}
+
+impl HtmlLexContext {
+    const fn is_regular(&self) -> bool {
+        matches!(self, Self::Regular)
+    }
+}
+
+impl LexContext for HtmlLexContext {
+    fn is_regular(&self) -> bool {
+        self.is_regular()
+    }
+}
+
+impl<'source> HtmlTokenSource<'source> {
+    pub fn from_str(source: &'source str) -> Self {
+        let lexer = HtmlLexer::from_str(source);
+
+        let mut source = Self {
+            lexer,
+            trivia: Vec::new(),
+            current: TOMBSTONE,
+            current_range: TextRange::default(),
+            preceding_line_break: false,
+        };
+
+        source.next_non_trivia_token(true);
+        source
+    }
+
+    fn next_non_trivia_token(&mut self, first_token: bool) {
+        let mut trailing = !first_token;
+        self.preceding_line_break = false;
+
+        loop {
+            let token = self.lexer.next_token(());
+            if token == EOF {
+                break;
+            } else {
+                let trivia_kind = TriviaPieceKind::try_from(token);
+
+                match trivia_kind {
+                    Err(_) => {
+                        self.current = token;
+                        self.current_range = self.lexer.current_range();
+                        // Not trivia
+                        break;
+                    }
+
+                    Ok(trivia_kind) => {
+                        if trivia_kind.is_newline() {
+                            trailing = false;
+                            self.preceding_line_break = true;
+                        }
+
+                        self.trivia.push(Trivia::new(
+                            trivia_kind,
+                            self.lexer.current_range(),
+                            trailing,
+                        ));
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<'src> TokenSource for HtmlTokenSource<'src> {
+    type Kind = HtmlSyntaxKind;
+
+    fn current(&self) -> Self::Kind {
+        self.current
+    }
+
+    fn current_range(&self) -> TextRange {
+        self.current_range
+    }
+
+    fn text(&self) -> &str {
+        self.lexer.source()
+    }
+
+    fn has_preceding_line_break(&self) -> bool {
+        self.lexer.has_preceding_line_break()
+    }
+
+    fn bump(&mut self) {
+        if self.current != EOF {
+            self.next_non_trivia_token(false)
+        }
+    }
+
+    fn skip_as_trivia(&mut self) {
+        if self.current() != EOF {
+            self.trivia.push(Trivia::new(
+                TriviaPieceKind::Skipped,
+                self.current_range(),
+                false,
+            ));
+
+            self.next_non_trivia_token(false)
+        }
+    }
+
+    fn finish(self) -> (Vec<Trivia>, Vec<ParseDiagnostic>) {
+        (self.trivia, self.lexer.finish())
+    }
+}
diff --git a/crates/biome_html_parser/tests/html_specs/ok/ok.html b/crates/biome_html_parser/tests/html_specs/ok/ok.html
new file mode 100644
index 000000000000..8b137891791f
--- /dev/null
+++ b/crates/biome_html_parser/tests/html_specs/ok/ok.html
@@ -0,0 +1 @@
+
diff --git a/crates/biome_html_parser/tests/html_specs/ok/ok.html.snap b/crates/biome_html_parser/tests/html_specs/ok/ok.html.snap
new file mode 100644
index 000000000000..2d7fc5b1a66c
--- /dev/null
+++ b/crates/biome_html_parser/tests/html_specs/ok/ok.html.snap
@@ -0,0 +1,33 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: missing (optional),
+    html: missing (optional),
+    eof_token: EOF@0..1 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..1
+  0: (empty)
+  1: (empty)
+  2: (empty)
+  3: EOF@0..1 "" [Newline("\n")] []
+
+```
diff --git a/crates/biome_html_parser/tests/spec_test.rs b/crates/biome_html_parser/tests/spec_test.rs
new file mode 100644
index 000000000000..e56948f05628
--- /dev/null
+++ b/crates/biome_html_parser/tests/spec_test.rs
@@ -0,0 +1,128 @@
+use biome_console::fmt::{Formatter, Termcolor};
+use biome_console::markup;
+use biome_diagnostics::{termcolor, DiagnosticExt, PrintDiagnostic};
+use biome_html_parser::parse_html;
+use biome_rowan::SyntaxKind;
+use biome_test_utils::has_bogus_nodes_or_empty_slots;
+use std::fmt::Write;
+use std::fs;
+use std::path::Path;
+
+#[derive(Copy, Clone)]
+pub enum ExpectedOutcome {
+    Pass,
+    Fail,
+    Undefined,
+}
+
+pub fn run(test_case: &str, _snapshot_name: &str, test_directory: &str, outcome_str: &str) {
+    let outcome = match outcome_str {
+        "ok" => ExpectedOutcome::Pass,
+        "error" => ExpectedOutcome::Fail,
+        "undefined" => ExpectedOutcome::Undefined,
+        _ => panic!("Invalid expected outcome {outcome_str}"),
+    };
+
+    let test_case_path = Path::new(test_case);
+
+    let file_name = test_case_path
+        .file_name()
+        .expect("Expected test to have a file name")
+        .to_str()
+        .expect("File name to be valid UTF8");
+
+    let content = fs::read_to_string(test_case_path)
+        .expect("Expected test path to be a readable file in UTF8 encoding");
+
+    let parsed = parse_html(&content);
+    let formatted_ast = format!("{:#?}", parsed.tree());
+
+    let mut snapshot = String::new();
+    writeln!(snapshot, "\n## Input\n\n```html\n{content}\n```\n\n").unwrap();
+
+    writeln!(
+        snapshot,
+        r#"## AST
+
+```
+{formatted_ast}
+```
+
+## CST
+
+```
+{:#?}
+```
+"#,
+        parsed.syntax()
+    )
+    .unwrap();
+
+    let diagnostics = parsed.diagnostics();
+    if !diagnostics.is_empty() {
+        let mut diagnostics_buffer = termcolor::Buffer::no_color();
+
+        let termcolor = &mut Termcolor(&mut diagnostics_buffer);
+        let mut formatter = Formatter::new(termcolor);
+
+        for diagnostic in diagnostics {
+            let error = diagnostic
+                .clone()
+                .with_file_path(file_name)
+                .with_file_source_code(&content);
+
+            formatter
+                .write_markup(markup! {
+                    {PrintDiagnostic::verbose(&error)}
+                })
+                .expect("failed to emit diagnostic");
+        }
+
+        let formatted_diagnostics =
+            std::str::from_utf8(diagnostics_buffer.as_slice()).expect("non utf8 in error buffer");
+
+        if matches!(outcome, ExpectedOutcome::Pass) {
+            panic!("Expected no errors to be present in a test case that is expected to pass but the following diagnostics are present:\n{formatted_diagnostics}")
+        }
+
+        writeln!(snapshot, "## Diagnostics\n\n```").unwrap();
+        snapshot.write_str(formatted_diagnostics).unwrap();
+
+        writeln!(snapshot, "```\n").unwrap();
+    }
+
+    match outcome {
+        ExpectedOutcome::Pass => {
+            let missing_required = formatted_ast.contains("missing (required)");
+            if missing_required
+                || parsed
+                    .syntax()
+                    .descendants()
+                    .any(|node| node.kind().is_bogus())
+            {
+                panic!("Parsed tree of an 'OK' test case should not contain any missing required children or bogus nodes: \n {formatted_ast:#?} \n\n {}", formatted_ast);
+            }
+
+            let syntax = parsed.syntax();
+            if has_bogus_nodes_or_empty_slots(&syntax) {
+                panic!(
+                    "modified tree has bogus nodes or empty slots:\n{syntax:#?} \n\n {}",
+                    syntax
+                )
+            }
+        }
+        ExpectedOutcome::Fail => {
+            if parsed.diagnostics().is_empty() {
+                panic!("Failing test must have diagnostics");
+            }
+        }
+        _ => {}
+    }
+
+    insta::with_settings!({
+        prepend_module_to_snapshot => false,
+        snapshot_path => &test_directory,
+    }, {
+        insta::assert_snapshot!(file_name, snapshot);
+    });
+}
diff --git a/crates/biome_html_parser/tests/spec_tests.rs b/crates/biome_html_parser/tests/spec_tests.rs
new file mode 100644
index 000000000000..f835579ffd31
--- /dev/null
+++ b/crates/biome_html_parser/tests/spec_tests.rs
@@ -0,0 +1,8 @@
+#![allow(non_snake_case)]
+
+mod spec_test;
+
+mod ok {
+    tests_macros::gen_tests! {"tests/html_specs/ok/**/*.html", crate::spec_test::run, "ok"}
+    tests_macros::gen_tests! {"tests/html_specs/error/**/*.html", crate::spec_test::run, "error"}
+}
{"tests/html_specs/error/**/*.html", crate::spec_test::run, "error"} +} diff --git a/crates/biome_html_syntax/src/generated/kind.rs b/crates/biome_html_syntax/src/generated/kind.rs index 3e25d00222e1..07a16de7e642 100644 --- a/crates/biome_html_syntax/src/generated/kind.rs +++ b/crates/biome_html_syntax/src/generated/kind.rs @@ -21,7 +21,10 @@ pub enum HtmlSyntaxKind { NULL_KW, TRUE_KW, FALSE_KW, + DOCTYPE_KW, + HTML_KW, HTML_STRING_LITERAL, + HTML_LITERAL, ERROR_TOKEN, NEWLINE, WHITESPACE, @@ -55,7 +58,7 @@ impl HtmlSyntaxKind { } pub const fn is_literal(self) -> bool { match self { - HTML_STRING_LITERAL => true, + HTML_STRING_LITERAL | HTML_LITERAL => true, _ => false, } } @@ -70,6 +73,8 @@ impl HtmlSyntaxKind { "null" => NULL_KW, "true" => TRUE_KW, "false" => FALSE_KW, + "doctype" => DOCTYPE_KW, + "html" => HTML_KW, _ => return None, }; Some(kw) @@ -84,6 +89,8 @@ impl HtmlSyntaxKind { NULL_KW => "null", TRUE_KW => "true", FALSE_KW => "false", + DOCTYPE_KW => "doctype", + HTML_KW => "html", HTML_STRING_LITERAL => "string literal", _ => return None, }; @@ -92,4 +99,4 @@ impl HtmlSyntaxKind { } #[doc = r" Utility macro for creating a SyntaxKind through simple macro syntax"] #[macro_export] -macro_rules ! T { [<] => { $ crate :: HtmlSyntaxKind :: L_ANGLE } ; [>] => { $ crate :: HtmlSyntaxKind :: R_ANGLE } ; [/] => { $ crate :: HtmlSyntaxKind :: SLASH } ; [=] => { $ crate :: HtmlSyntaxKind :: EQ } ; [!] => { $ crate :: HtmlSyntaxKind :: BANG } ; [null] => { $ crate :: HtmlSyntaxKind :: NULL_KW } ; [true] => { $ crate :: HtmlSyntaxKind :: TRUE_KW } ; [false] => { $ crate :: HtmlSyntaxKind :: FALSE_KW } ; [ident] => { $ crate :: HtmlSyntaxKind :: IDENT } ; [EOF] => { $ crate :: HtmlSyntaxKind :: EOF } ; [UNICODE_BOM] => { $ crate :: HtmlSyntaxKind :: UNICODE_BOM } ; [#] => { $ crate :: HtmlSyntaxKind :: HASH } ; } +macro_rules ! T { [<] => { $ crate :: HtmlSyntaxKind :: L_ANGLE } ; [>] => { $ crate :: HtmlSyntaxKind :: R_ANGLE } ; [/] => { $ crate :: HtmlSyntaxKind :: SLASH } ; [=] => { $ crate :: HtmlSyntaxKind :: EQ } ; [!] 
diff --git a/crates/biome_html_syntax/src/generated/nodes.rs b/crates/biome_html_syntax/src/generated/nodes.rs
index 430859542684..1fa5742d55fd 100644
--- a/crates/biome_html_syntax/src/generated/nodes.rs
+++ b/crates/biome_html_syntax/src/generated/nodes.rs
@@ -175,7 +175,11 @@ impl HtmlDirective {
         HtmlDirectiveFields {
             l_angle_token: self.l_angle_token(),
             excl_token: self.excl_token(),
-            content: self.content(),
+            doctype_token: self.doctype_token(),
+            html_token: self.html_token(),
+            quirk_token: self.quirk_token(),
+            public_id_token: self.public_id_token(),
+            system_id_token: self.system_id_token(),
             r_angle_token: self.r_angle_token(),
         }
     }
@@ -185,11 +189,23 @@ impl HtmlDirective {
     pub fn excl_token(&self) -> SyntaxResult<SyntaxToken> {
         support::required_token(&self.syntax, 1usize)
     }
-    pub fn content(&self) -> SyntaxResult<HtmlString> {
-        support::required_node(&self.syntax, 2usize)
+    pub fn doctype_token(&self) -> SyntaxResult<SyntaxToken> {
+        support::required_token(&self.syntax, 2usize)
+    }
+    pub fn html_token(&self) -> Option<SyntaxToken> {
+        support::token(&self.syntax, 3usize)
+    }
+    pub fn quirk_token(&self) -> Option<SyntaxToken> {
+        support::token(&self.syntax, 4usize)
+    }
+    pub fn public_id_token(&self) -> Option<SyntaxToken> {
+        support::token(&self.syntax, 5usize)
+    }
+    pub fn system_id_token(&self) -> Option<SyntaxToken> {
+        support::token(&self.syntax, 6usize)
     }
     pub fn r_angle_token(&self) -> SyntaxResult<SyntaxToken> {
-        support::required_token(&self.syntax, 3usize)
+        support::required_token(&self.syntax, 7usize)
     }
 }
 #[cfg(feature = "serde")]
@@ -205,7 +221,11 @@ impl Serialize for HtmlDirective {
 pub struct HtmlDirectiveFields {
     pub l_angle_token: SyntaxResult<SyntaxToken>,
     pub excl_token: SyntaxResult<SyntaxToken>,
-    pub content: SyntaxResult<HtmlString>,
+    pub doctype_token: SyntaxResult<SyntaxToken>,
+    pub html_token: Option<SyntaxToken>,
+    pub quirk_token: Option<SyntaxToken>,
+    pub public_id_token: Option<SyntaxToken>,
+    pub system_id_token: Option<SyntaxToken>,
     pub r_angle_token: SyntaxResult<SyntaxToken>,
 }
 #[derive(Clone, PartialEq, Eq, Hash)]
@@ -359,18 +379,18 @@ impl HtmlRoot {
         HtmlRootFields {
             bom_token: self.bom_token(),
             directive: self.directive(),
-            tags: self.tags(),
+            html: self.html(),
             eof_token: self.eof_token(),
         }
     }
     pub fn bom_token(&self) -> Option<SyntaxToken> {
         support::token(&self.syntax, 0usize)
     }
-    pub fn directive(&self) -> SyntaxResult<HtmlDirective> {
-        support::required_node(&self.syntax, 1usize)
+    pub fn directive(&self) -> Option<HtmlDirective> {
+        support::node(&self.syntax, 1usize)
     }
-    pub fn tags(&self) -> HtmlElementList {
-        support::list(&self.syntax, 2usize)
+    pub fn html(&self) -> Option<HtmlElement> {
+        support::node(&self.syntax, 2usize)
     }
     pub fn eof_token(&self) -> SyntaxResult<SyntaxToken> {
         support::required_token(&self.syntax, 3usize)
@@ -388,8 +408,8 @@ impl Serialize for HtmlRoot {
 #[cfg_attr(feature = "serde", derive(Serialize))]
 pub struct HtmlRootFields {
     pub bom_token: Option<SyntaxToken>,
-    pub directive: SyntaxResult<HtmlDirective>,
-    pub tags: HtmlElementList,
+    pub directive: Option<HtmlDirective>,
+    pub html: Option<HtmlElement>,
     pub eof_token: SyntaxResult<SyntaxToken>,
 }
 #[derive(Clone, PartialEq, Eq, Hash)]
@@ -664,7 +684,26 @@ impl std::fmt::Debug for HtmlDirective {
                 &support::DebugSyntaxResult(self.l_angle_token()),
             )
             .field("excl_token", &support::DebugSyntaxResult(self.excl_token()))
&support::DebugSyntaxResult(self.excl_token())) - .field("content", &support::DebugSyntaxResult(self.content())) + .field( + "doctype_token", + &support::DebugSyntaxResult(self.doctype_token()), + ) + .field( + "html_token", + &support::DebugOptionalElement(self.html_token()), + ) + .field( + "quirk_token", + &support::DebugOptionalElement(self.quirk_token()), + ) + .field( + "public_id_token", + &support::DebugOptionalElement(self.public_id_token()), + ) + .field( + "system_id_token", + &support::DebugOptionalElement(self.system_id_token()), + ) .field( "r_angle_token", &support::DebugSyntaxResult(self.r_angle_token()), @@ -844,8 +883,11 @@ impl std::fmt::Debug for HtmlRoot { "bom_token", &support::DebugOptionalElement(self.bom_token()), ) - .field("directive", &support::DebugSyntaxResult(self.directive())) - .field("tags", &self.tags()) + .field( + "directive", + &support::DebugOptionalElement(self.directive()), + ) + .field("html", &support::DebugOptionalElement(self.html())) .field("eof_token", &support::DebugSyntaxResult(self.eof_token())) .finish() } diff --git a/crates/biome_html_syntax/src/generated/nodes_mut.rs b/crates/biome_html_syntax/src/generated/nodes_mut.rs index 21dfbdf5b624..29bb21edcabc 100644 --- a/crates/biome_html_syntax/src/generated/nodes_mut.rs +++ b/crates/biome_html_syntax/src/generated/nodes_mut.rs @@ -70,16 +70,40 @@ impl HtmlDirective { .splice_slots(1usize..=1usize, once(Some(element.into()))), ) } - pub fn with_content(self, element: HtmlString) -> Self { + pub fn with_doctype_token(self, element: SyntaxToken) -> Self { Self::unwrap_cast( self.syntax - .splice_slots(2usize..=2usize, once(Some(element.into_syntax().into()))), + .splice_slots(2usize..=2usize, once(Some(element.into()))), + ) + } + pub fn with_html_token(self, element: Option) -> Self { + Self::unwrap_cast( + self.syntax + .splice_slots(3usize..=3usize, once(element.map(|element| element.into()))), + ) + } + pub fn with_quirk_token(self, element: Option) -> Self { + Self::unwrap_cast( + self.syntax + .splice_slots(4usize..=4usize, once(element.map(|element| element.into()))), + ) + } + pub fn with_public_id_token(self, element: Option) -> Self { + Self::unwrap_cast( + self.syntax + .splice_slots(5usize..=5usize, once(element.map(|element| element.into()))), + ) + } + pub fn with_system_id_token(self, element: Option) -> Self { + Self::unwrap_cast( + self.syntax + .splice_slots(6usize..=6usize, once(element.map(|element| element.into()))), ) } pub fn with_r_angle_token(self, element: SyntaxToken) -> Self { Self::unwrap_cast( self.syntax - .splice_slots(3usize..=3usize, once(Some(element.into()))), + .splice_slots(7usize..=7usize, once(Some(element.into()))), ) } } @@ -144,17 +168,17 @@ impl HtmlRoot { .splice_slots(0usize..=0usize, once(element.map(|element| element.into()))), ) } - pub fn with_directive(self, element: HtmlDirective) -> Self { - Self::unwrap_cast( - self.syntax - .splice_slots(1usize..=1usize, once(Some(element.into_syntax().into()))), - ) + pub fn with_directive(self, element: Option) -> Self { + Self::unwrap_cast(self.syntax.splice_slots( + 1usize..=1usize, + once(element.map(|element| element.into_syntax().into())), + )) } - pub fn with_tags(self, element: HtmlElementList) -> Self { - Self::unwrap_cast( - self.syntax - .splice_slots(2usize..=2usize, once(Some(element.into_syntax().into()))), - ) + pub fn with_html(self, element: Option) -> Self { + Self::unwrap_cast(self.syntax.splice_slots( + 2usize..=2usize, + once(element.map(|element| element.into_syntax().into())), + )) 
     }
     pub fn with_eof_token(self, element: SyntaxToken) -> Self {
         Self::unwrap_cast(
diff --git a/crates/biome_js_parser/src/lexer/mod.rs b/crates/biome_js_parser/src/lexer/mod.rs
index 935eb69bab54..b4f52ebb4799 100644
--- a/crates/biome_js_parser/src/lexer/mod.rs
+++ b/crates/biome_js_parser/src/lexer/mod.rs
@@ -22,7 +22,9 @@ mod tests;
 use biome_js_syntax::JsSyntaxKind::*;
 pub use biome_js_syntax::*;
 use biome_parser::diagnostic::ParseDiagnostic;
-use biome_parser::lexer::{LexContext, Lexer, LexerCheckpoint, TokenFlags};
+use biome_parser::lexer::{
+    LexContext, Lexer, LexerCheckpoint, LexerWithCheckpoint, ReLexer, TokenFlags,
+};
 use biome_unicode_table::{
     is_js_id_continue, is_js_id_start, lookup_byte,
     Dispatch::{self, *},
@@ -131,6 +133,9 @@ pub(crate) struct JsLexer<'src> {
 }

 impl<'src> Lexer<'src> for JsLexer<'src> {
+    const NEWLINE: Self::Kind = NEWLINE;
+    const WHITESPACE: Self::Kind = WHITESPACE;
+
     type Kind = JsSyntaxKind;
     type LexContext = JsLexContext;
     type ReLexContext = JsReLexContext;
@@ -158,18 +163,6 @@ impl<'src> Lexer<'src> for JsLexer<'src> {
         self.current_start
     }

-    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind> {
-        LexerCheckpoint {
-            position: TextSize::from(self.position as u32),
-            current_start: self.current_start,
-            current_flags: self.current_flags,
-            current_kind: self.current_kind,
-            after_line_break: self.after_newline,
-            unicode_bom_length: self.unicode_bom_length,
-            diagnostics_pos: self.diagnostics.len() as u32,
-        }
-    }
-
     fn next_token(&mut self, context: Self::LexContext) -> Self::Kind {
         self.current_start = TextSize::from(self.position as u32);
         self.current_flags = TokenFlags::empty();
@@ -196,29 +189,6 @@ impl<'src> Lexer<'src> for JsLexer<'src> {
         kind
     }

-    fn re_lex(&mut self, context: Self::ReLexContext) -> Self::Kind {
-        let old_position = self.position;
-        self.position = u32::from(self.current_start) as usize;
-
-        let re_lexed_kind = match context {
-            JsReLexContext::Regex if matches!(self.current(), T![/] | T![/=]) => self.read_regex(),
-            JsReLexContext::BinaryOperator => self.re_lex_binary_operator(),
-            JsReLexContext::TypeArgumentLessThan => self.re_lex_type_argument_less_than(),
-            JsReLexContext::JsxIdentifier => self.re_lex_jsx_identifier(old_position),
-            JsReLexContext::JsxChild if !self.is_eof() => self.lex_jsx_child_token(),
-            _ => self.current(),
-        };
-
-        if self.current() == re_lexed_kind {
-            // Didn't re-lex anything. Return existing token again
-            self.position = old_position;
-        } else {
-            self.current_kind = re_lexed_kind;
-        }
-
-        re_lexed_kind
-    }
-
     fn has_preceding_line_break(&self) -> bool {
         self.current_flags.has_preceding_line_break()
     }
@@ -287,6 +257,45 @@ impl<'src> Lexer<'src> for JsLexer<'src> {
     }
 }

+impl<'src> ReLexer<'src> for JsLexer<'src> {
+    fn re_lex(&mut self, context: Self::ReLexContext) -> Self::Kind {
+        let old_position = self.position;
+        self.position = u32::from(self.current_start) as usize;
+
+        let re_lexed_kind = match context {
+            JsReLexContext::Regex if matches!(self.current(), T![/] | T![/=]) => self.read_regex(),
+            JsReLexContext::BinaryOperator => self.re_lex_binary_operator(),
+            JsReLexContext::TypeArgumentLessThan => self.re_lex_type_argument_less_than(),
+            JsReLexContext::JsxIdentifier => self.re_lex_jsx_identifier(old_position),
+            JsReLexContext::JsxChild if !self.is_eof() => self.lex_jsx_child_token(),
+            _ => self.current(),
+        };
+
+        if self.current() == re_lexed_kind {
+            // Didn't re-lex anything. Return existing token again
+            self.position = old_position;
+        } else {
+            self.current_kind = re_lexed_kind;
+        }
+
+        re_lexed_kind
+    }
+}
+
+impl<'src> LexerWithCheckpoint<'src> for JsLexer<'src> {
+    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind> {
+        LexerCheckpoint {
+            position: TextSize::from(self.position as u32),
+            current_start: self.current_start,
+            current_flags: self.current_flags,
+            current_kind: self.current_kind,
+            after_line_break: self.after_newline,
+            unicode_bom_length: self.unicode_bom_length,
+            diagnostics_pos: self.diagnostics.len() as u32,
+        }
+    }
+}
+
 impl<'src> JsLexer<'src> {
     /// Make a new lexer from a str, this is safe because strs are valid utf8
     pub fn from_str(source: &'src str) -> Self {
@@ -1771,7 +1780,13 @@ impl<'src> JsLexer<'src> {
         let dispatched = lookup_byte(byte);

         match dispatched {
-            WHS => self.consume_newline_or_whitespaces(),
+            WHS => {
+                let kind = self.consume_newline_or_whitespaces();
+                if kind == Self::NEWLINE {
+                    self.after_newline = true;
+                }
+                kind
+            }
             EXL => self.resolve_bang(),
             HAS => self.read_shebang(),
             PRC => self.bin_or_assign(T![%], T![%=]),
@@ -1876,7 +1891,11 @@ impl<'src> JsLexer<'src> {
         if is_linebreak(chr)
             || (UNICODE_WHITESPACE_STARTS.contains(&byte) && UNICODE_SPACES.contains(&chr))
         {
-            self.consume_newline_or_whitespaces()
+            let kind = self.consume_newline_or_whitespaces();
+            if kind == Self::NEWLINE {
+                self.after_newline = true;
+            }
+            kind
         } else {
             self.advance(chr.len_utf8() - 1);
             if is_js_id_start(chr) {
diff --git a/crates/biome_parser/src/lexer.rs b/crates/biome_parser/src/lexer.rs
index 72539b070cb1..195696fb67bb 100644
--- a/crates/biome_parser/src/lexer.rs
+++ b/crates/biome_parser/src/lexer.rs
@@ -9,6 +9,12 @@ use unicode_bom::Bom;
 /// `Lexer` trait defines the necessary methods a lexer must implement.
 /// Lexer is responsible for dividing the source code into meaningful parsing units (kinds).
 pub trait Lexer<'src> {
+    /// The kind of the newline
+    const NEWLINE: Self::Kind;
+
+    /// The kind of the space
+    const WHITESPACE: Self::Kind;
+
     /// A kind of syntax, as identified by the lexer.
     type Kind: SyntaxKind;
     /// The specific context in which the lexer operates.
@@ -32,17 +38,9 @@ pub trait Lexer<'src> {

     fn current_start(&self) -> TextSize;

-    /// Creating a checkpoint of the lexer's current state.
-    /// `rewind` can be used later to restore the lexer to the checkpoint's state.
-    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind>;
-
     /// Tokenizes the next kind into a single coherent token within the given lexing context.
     fn next_token(&mut self, context: Self::LexContext) -> Self::Kind;

-    /// Re-lexes the current kind under a different lexing context.
-    /// Useful when a token can have different interpretations based on context.
-    fn re_lex(&mut self, context: Self::ReLexContext) -> Self::Kind;
-
     /// Returns `true` if the current kind is preceded by a line break.
     fn has_preceding_line_break(&self) -> bool;
@@ -69,7 +67,14 @@ pub trait Lexer<'src> {
     ///
     /// ## Safety
     /// Must be called at a valid UT8 char boundary
-    fn consume_newline_or_whitespaces(&mut self) -> Self::Kind;
+    fn consume_newline_or_whitespaces(&mut self) -> Self::Kind {
+        if self.consume_newline() {
+            Self::NEWLINE
+        } else {
+            self.consume_whitespaces();
+            Self::WHITESPACE
+        }
+    }

     /// Consumes all whitespace until a non-whitespace or a newline is found.
     ///
@@ -261,6 +266,18 @@ pub trait Lexer<'src> {
     }
 }

+pub trait ReLexer<'src>: Lexer<'src> + LexerWithCheckpoint<'src> {
+    /// Re-lexes the current kind under a different lexing context.
+    /// Useful when a token can have different interpretations based on context.
+    fn re_lex(&mut self, context: Self::ReLexContext) -> Self::Kind;
+}
+
+pub trait LexerWithCheckpoint<'src>: Lexer<'src> {
+    /// Creating a checkpoint of the lexer's current state.
+    /// `rewind` can be used later to restore the lexer to the checkpoint's state.
+    fn checkpoint(&self) -> LexerCheckpoint<Self::Kind>;
+}
+
 /// `LexContext` is a trait that represents the context in
 /// which a parser is currently operating. It is used to specify
 /// and handle the variations in parsing requirements depending
@@ -407,16 +424,6 @@ impl<'l, Lex: Lexer<'l>> BufferedLexer<'l, Lex> {
         self.inner.source()
     }

-    /// Creates a checkpoint representing the current lexer state. Allows rewinding
-    /// the lexer to this position later on.
-    pub fn checkpoint(&self) -> LexerCheckpoint<Lex::Kind> {
-        if let Some(current) = &self.current {
-            current.clone()
-        } else {
-            self.inner.checkpoint()
-        }
-    }
-
     /// Rewinds the lexer to the state stored in the checkpoint.
     pub fn rewind(&mut self, checkpoint: LexerCheckpoint<Lex::Kind>) {
         self.inner.rewind(checkpoint);
@@ -431,8 +438,23 @@ impl<'l, Lex: Lexer<'l>> BufferedLexer<'l, Lex> {
         }
     }

+    /// Returns an iterator over the tokens following the current token to perform lookahead.
+    /// For example, what's the 3rd token after the current token?
+    #[inline(always)]
+    pub fn lookahead<'s>(&'s mut self) -> LookaheadIterator<'s, 'l, Lex> {
+        LookaheadIterator::new(self)
+    }
+
+    /// Consumes the buffered lexer and returns the lexing diagnostics
+    pub fn finish(self) -> Vec<ParseDiagnostic> {
+        self.inner.finish()
+    }
+}
+
+impl<'l, Lex> BufferedLexer<'l, Lex>
+where
+    Lex: ReLexer<'l>,
+{
     /// Re-lex the current token in the given context
-    /// See [Lexer::re_lex]
     pub fn re_lex(&mut self, context: Lex::ReLexContext) -> Lex::Kind {
         let current_kind = self.current();
         let current_checkpoint = self.inner.checkpoint();
@@ -455,17 +477,18 @@ impl<'l, Lex: Lexer<'l>> BufferedLexer<'l, Lex> {
         new_kind
     }
+}

-    /// Returns an iterator over the tokens following the current token to perform lookahead.
-    /// For example, what's the 3rd token after the current token?
-    #[inline(always)]
-    pub fn lookahead<'s>(&'s mut self) -> LookaheadIterator<'s, 'l, Lex> {
-        LookaheadIterator::new(self)
-    }
-
-    /// Consumes the buffered lexer and returns the lexing diagnostics
-    pub fn finish(self) -> Vec<ParseDiagnostic> {
-        self.inner.finish()
+impl<'l, Lex> BufferedLexer<'l, Lex>
+where
+    Lex: LexerWithCheckpoint<'l>,
+{
+    pub fn checkpoint(&self) -> LexerCheckpoint<Lex::Kind> {
+        if let Some(current) = &self.current {
+            current.clone()
+        } else {
+            self.inner.checkpoint()
+        }
     }
 }

@@ -484,7 +507,7 @@ impl<'l, 't, Lex: Lexer<'t>> LookaheadIterator<'l, 't, Lex> {
     }
 }

-impl<'l, 't, Lex: Lexer<'t>> Iterator for LookaheadIterator<'l, 't, Lex> {
+impl<'l, 't, Lex: LexerWithCheckpoint<'t>> Iterator for LookaheadIterator<'l, 't, Lex> {
     type Item = LookaheadToken<Lex::Kind>;

     #[inline]
@@ -524,7 +547,7 @@ impl<'l, 't, Lex: Lexer<'t>> Iterator for LookaheadIterator<'l, 't, Lex> {
     }
 }

-impl<'l, 't, Lex: Lexer<'t>> FusedIterator for LookaheadIterator<'l, 't, Lex> {}
+impl<'l, 't, Lex: LexerWithCheckpoint<'t>> FusedIterator for LookaheadIterator<'l, 't, Lex> {}

 #[derive(Debug)]
 pub struct LookaheadToken<Kind> {
@@ -553,7 +576,7 @@ impl<Kind> From<&LexerCheckpoint<Kind>> for LookaheadToken<Kind> {
 /// Stores the state of the lexer so that it may later be restored to that position.
 #[derive(Debug, Clone)]
 pub struct LexerCheckpoint<Kind> {
     pub position: TextSize,
     pub current_start: TextSize,
     pub current_kind: Kind,
diff --git a/crates/biome_parser/src/prelude.rs b/crates/biome_parser/src/prelude.rs
index f91f48823b28..02b140765705 100644
--- a/crates/biome_parser/src/prelude.rs
+++ b/crates/biome_parser/src/prelude.rs
@@ -1,6 +1,6 @@
 pub use crate::diagnostic::{ParseDiagnostic, ToDiagnostic};
 pub use crate::marker::{CompletedMarker, Marker};
 pub use crate::parsed_syntax::ParsedSyntax;
-pub use crate::token_source::{BumpWithContext, NthToken, TokenSource};
+pub use crate::token_source::{BumpWithContext, NthToken, TokenSource, Trivia};
 pub use crate::{token_set, TokenSet};
 pub use crate::{Parser, SyntaxFeature};
diff --git a/knope.toml b/knope.toml
index 8bd5af58f2e3..b7216a5a0fe2 100644
--- a/knope.toml
+++ b/knope.toml
@@ -135,3 +135,7 @@ changelog = "crates/biome_graphql_syntax/CHANGELOG.md"
 [packages.biome_graphql_factory]
 versioned_files = ["crates/biome_graphql_factory/Cargo.toml"]
 changelog = "crates/biome_graphql_factory/CHANGELOG.md"
+
+[packages.biome_html_parser]
+versioned_files = ["crates/biome_html_parser/Cargo.toml"]
+changelog = "crates/biome_html_parser/CHANGELOG.md"
diff --git a/xtask/codegen/html.ungram b/xtask/codegen/html.ungram
index 7c2ae13a5566..94d69d01a255 100644
--- a/xtask/codegen/html.ungram
+++ b/xtask/codegen/html.ungram
@@ -40,8 +40,8 @@ HtmlBogus = SyntaxElement*

 HtmlRoot =
 	bom: 'UNICODE_BOM'?
-	directive: HtmlDirective
-	tags: HtmlElementList
+	directive: HtmlDirective?
+	html: HtmlElement?
 	eof: 'EOF'

 //
@@ -50,7 +50,11 @@ HtmlRoot =
 HtmlDirective =
 	'<'
 	'!'
-	content: HtmlString
+	doctype: 'doctype'
+	html: 'html'?
+	quirk: 'html_literal'?
+	public_id: 'html_string_literal'?
+	system_id: 'html_string_literal'?
 	'>'

 // ==================================
diff --git a/xtask/codegen/src/html_kinds_src.rs b/xtask/codegen/src/html_kinds_src.rs
index e1bbc332a6e7..2748bf14918d 100644
--- a/xtask/codegen/src/html_kinds_src.rs
+++ b/xtask/codegen/src/html_kinds_src.rs
@@ -8,8 +8,8 @@ pub const HTML_KINDS_SRC: KindsSrc = KindsSrc {
         ("=", "EQ"),
         ("!", "BANG"),
     ],
-    keywords: &["null", "true", "false"],
-    literals: &["HTML_STRING_LITERAL"],
+    keywords: &["null", "true", "false", "doctype", "html"],
+    literals: &["HTML_STRING_LITERAL", "HTML_LITERAL"],
     tokens: &[
        "ERROR_TOKEN",
        "NEWLINE",
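
Reviewer note: a minimal sketch of how the entry point added by this patch is meant to be driven, written against only the API surface visible in the diff (`parse_html`, `HtmlParse::{diagnostics, has_errors, tree, syntax}`). The input mirrors the `tests/html_specs/ok/ok.html` fixture (a single newline); everything else here is illustrative, not part of the patch.

```rust
use biome_html_parser::parse_html;

fn main() {
    // Same input as the ok.html fixture: a document containing a single newline.
    let parse = parse_html("\n");

    // Lexer and parser diagnostics are merged into a single list by HtmlParser::finish.
    assert!(!parse.has_errors(), "{} diagnostics", parse.diagnostics().len());

    // `tree` unwrap-casts the root and panics on a kind mismatch;
    // `syntax` returns the untyped HtmlSyntaxNode instead.
    let root = parse.tree();
    println!("{root:#?}"); // HtmlRoot { bom_token: missing (optional), ... }
}
```

For repeated parses that should share interned nodes, `parse_html_with_cache` accepts a `&mut NodeCache`; `parse_html` is a thin wrapper that constructs a fresh `NodeCache::default()` on every call.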