Skip to content

Commit 58a2288

Browse files
committed
Also tokenize filenames in French
1 parent 3c40ba3 commit 58a2288

File tree

4 files changed

+10261
-6255
lines changed

4 files changed

+10261
-6255
lines changed

src/edu/stanford/nlp/international/french/process/FrenchLexer.flex

+8-5
Original file line number | Diff line number | Diff line change
@@ -286,11 +286,9 @@ SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
286286
SPAMP = &
287287
SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
288288
SPLET = &[aeiouAEIOU](acute|grave|uml);
289-
/* \u3000 is ideographic space */
290-
SPACE = [ \t\u00A0\u2000-\u200A\u3000]
291-
SPACES = {SPACE}+
292-
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
293-
SPACENL = ({SPACE}|{NEWLINE})
289+
290+
%include ../../../process/LexCommon.tokens
291+
294292
SENTEND = {SPACENL}({SPACENL}|([A-Z]|{SGML}))
295293
HYPHEN = [-_\u058A\u2010\u2011]
296294
HYPHENS = \-+
@@ -452,6 +450,11 @@ MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\
452450
return getNext();
453451
}
454452
}
453+
454+
/* TODO: not using LexCommon.productions because there are no PerLine settings */
455+
/* we might want to add those settings to the other tokenizers anyway */
456+
{FILENAME}/({SPACE}|[.?!,\"'<()]) { return getNext(); }
457+
455458
{ORDINAL}/{SPACE} { return getNext(); }
456459
{SPAMP} { return getNormalizedAmpNext(); }
457460
{SPPUNC} |

0 commit comments

Comments
 (0)