Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle carriage returns in the lexer #770

Merged
merged 1 commit into from
Jan 20, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 29 additions & 22 deletions compiler/surface/lexer.cppo.ml
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,12 @@ let hspace = [%sedlex.regexp? Sub (white_space, Chars "\n\r")]
(** Operator explicit typing suffix chars *)
let op_kind_re = [%sedlex.regexp? "" | MR_MONEY_OP_SUFFIX | Chars "!.@^"]

(** Regexp matching every character except newlines *)
let any_but_eol = [%sedlex.regexp? (Sub (any, Chars "\n\r"))]

(** Regexp matching newlines *)
let eol = [%sedlex.regexp? Opt '\r', '\n' ]

let op_kind = function
| "" -> Ast.KPoly
| "!" -> Ast.KInt
Expand All @@ -376,7 +382,7 @@ let rec lex_code (lexbuf : lexbuf) : token =
(* Whitespaces *)
L.update_acc lexbuf;
lex_code lexbuf
| '#', Star (Compl '\n'), '\n' ->
| '#', Star any_but_eol, eol ->
(* Comments *)
L.update_acc lexbuf;
lex_code lexbuf
Expand Down Expand Up @@ -737,7 +743,7 @@ let rec lex_directive_args (lexbuf : lexbuf) : token =
| MR_EXTERNAL -> MODULE_EXTERNAL
| Plus (Compl white_space) -> DIRECTIVE_ARG (Utf8.lexeme lexbuf)
| Plus hspace -> lex_directive_args lexbuf
| '\n' | eof ->
| eol | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
Expand All @@ -753,7 +759,7 @@ let rec lex_directive (lexbuf : lexbuf) : token =
| ":" ->
L.context := Directive_args;
COLON
| '\n' | eof ->
| eol | eof ->
L.context := Law;
END_DIRECTIVE
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme
Expand All @@ -765,19 +771,20 @@ let lex_raw (lexbuf : lexbuf) : token =
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```", Star hspace, ('\n' | eof) ->
| "```", Star hspace, (eol | eof) ->
L.context := Law;
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star (any_but_eol), (eol | eof) ->
LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

(** Main lexing function used outside code blocks *)
Expand All @@ -788,10 +795,10 @@ let lex_law (lexbuf : lexbuf) : token =
if at_bol then
match%sedlex lexbuf with
| eof -> EOF
| "```catala", Star white_space, ('\n' | eof) ->
| "```catala", Star white_space, (eol | eof) ->
L.context := Code;
BEGIN_CODE
| "```catala-metadata", Star white_space, ('\n' | eof) ->
| "```catala-metadata", Star white_space, (eol | eof) ->
L.context := Code;
BEGIN_METADATA
| "```", Star (idchar | '-') ->
Expand All @@ -800,18 +807,18 @@ let lex_law (lexbuf : lexbuf) : token =
| '>' ->
L.context := Directive;
BEGIN_DIRECTIVE
| Plus '#', Star hspace, Plus (Compl '\n'), Star hspace, ('\n' | eof) ->
| Plus '#', Star hspace, Plus any_but_eol, Star hspace, (eol | eof) ->
L.get_law_heading lexbuf
| _ -> (
(* Nested match for lower priority; `_` matches length 0 so we effectively retry the
sub-match at the same point *)
match%sedlex lexbuf with
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme)
else
match%sedlex lexbuf with
| eof -> EOF
| Star (Compl '\n'), ('\n' | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| Star any_but_eol, (eol | eof) -> LAW_TEXT (Utf8.lexeme lexbuf)
| _ -> L.raise_lexer_error (Pos.from_lpos prev_pos) prev_lexeme

(** Entry point of the lexer, distributes to {!val: lex_code} or {!val:lex_law}
Expand Down Expand Up @@ -860,9 +867,9 @@ let line_dir_arg_upcase_re =
let lex_line (lexbuf : lexbuf) : (string * L.line_token) option =
match%sedlex lexbuf with
| eof -> None
| "```catala-test-inline", Star hspace, ('\n' | eof) ->
| "```catala-test-inline", Star hspace, (eol | eof) ->
Some (Utf8.lexeme lexbuf, LINE_INLINE_TEST)
| "```catala-test", Star (Compl '\n'), ('\n' | eof) ->
| "```catala-test", Star (any_but_eol), (eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let id = Re.Group.get (Re.exec line_test_id_re str) 1 in
Expand All @@ -872,36 +879,36 @@ let lex_line (lexbuf : lexbuf) : (string * L.line_token) option =
"Ignored invalid test section, must have an explicit \
`{ id = \"name\" }` specification";
Some (str, LINE_ANY))
| "```", Star hspace, ('\n' | eof) ->
| "```", Star hspace, (eol | eof) ->
Some (Utf8.lexeme lexbuf, LINE_BLOCK_END)
| '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus (Compl '\n'),
('\n' | eof) ->
| '>', Star hspace, MR_LAW_INCLUDE, Star hspace, ':', Plus any_but_eol,
(eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let file = Re.Group.get (Re.exec line_dir_arg_re str) 1 in
Some (str, LINE_INCLUDE file)
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_DEF, Plus hspace,
uppercase, Star (Compl white_space), Plus hspace,
MR_EXTERNAL, Star hspace, ('\n' | eof) ->
MR_EXTERNAL, Star hspace, (eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_DEF (mdl, true))
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star (Compl '\n'),
('\n' | eof) ->
| '>', Star hspace, MR_MODULE_DEF, Plus hspace, uppercase, Star any_but_eol,
(eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_DEF (mdl, false))
with Not_found -> Some (str, LINE_ANY))
| '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (Compl '\n'),
('\n' | eof) ->
| '>', Star hspace, MR_MODULE_USE, Plus hspace, uppercase, Star (any_but_eol),
(eol | eof) ->
let str = Utf8.lexeme lexbuf in
(try
let mdl = Re.Group.get (Re.exec line_dir_arg_upcase_re str) 1 in
Some (str, LINE_MODULE_USE mdl)
with Not_found -> Some (str, LINE_ANY))
| Star (Compl '\n'), ('\n' | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY)
| Star any_but_eol, (eol | eof) -> Some (Utf8.lexeme lexbuf, LINE_ANY)
| _ -> assert false