-
Notifications
You must be signed in to change notification settings - Fork 184
/
Copy pathscanner.js
218 lines (181 loc) · 5.75 KB
/
scanner.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/**
The scanner provides an interface that takes a string of text as input, and
outputs an array of token instances that can be used for easy URL parsing.
@module linkify
@submodule scanner
@main scanner
*/
import {CharacterState as State, stateify} from './state';
import * as TOKENS from './tokens/text';
import {
DOMAIN,
LOCALHOST,
NUM,
PROTOCOL,
TLD,
WS,
AT,
DOT,
PLUS,
POUND,
QUERY,
SLASH,
UNDERSCORE,
COLON,
OPENBRACE,
OPENBRACKET,
OPENANGLEBRACKET,
OPENPAREN,
CLOSEBRACE,
CLOSEBRACKET,
CLOSEANGLEBRACKET,
CLOSEPAREN,
PUNCTUATION,
NL,
SYM
} from './tokens/text';
// Top-level domain list. `__TLDS__` is a macro, see gulpfile.js
const tlds = __TLDS__;

// Character sets used as transition symbols
const NUMBERS = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
const ALPHANUM = NUMBERS.concat('abcdefghijklmnopqrstuvwxyz'.split(''));
// Whitespace characters, excluding line breaks ('\n' is wired separately)
const WHITESPACE = [' ', '\f', '\r', '\t', '\v', '\u00a0', '\u1680', '\u180e'];

// Collects the states that jump to DOMAIN on /[a-z0-9]/
const domainStates = [];

// Shorthand for constructing a State with an optional token class
const makeState = function (tokenClass) {
	return new State(tokenClass);
};

// States that are referenced repeatedly while wiring the scanner
const S_START = makeState();
const S_NUM = makeState(NUM);
const S_DOMAIN = makeState(DOMAIN);
// Reached from a domain after one or more hyphen characters
const S_DOMAIN_HYPHEN = makeState();
const S_WS = makeState(WS);
// States for special URL symbols.
// Each entry pairs a symbol (or list of symbols) with the token class of the
// state it leads to. Entries are registered on S_START in the order listed.
const SYMBOL_TOKENS = [
	['@', AT],
	['.', DOT],
	['+', PLUS],
	['#', POUND],
	['?', QUERY],
	['/', SLASH],
	['_', UNDERSCORE],
	[':', COLON],
	['{', OPENBRACE],
	['[', OPENBRACKET],
	['<', OPENANGLEBRACKET],
	['(', OPENPAREN],
	['}', CLOSEBRACE],
	[']', CLOSEBRACKET],
	['>', CLOSEANGLEBRACKET],
	[')', CLOSEPAREN],
	[[',', ';', '!', '"', '\''], PUNCTUATION]
];

// Keep calling `.on` on the value returned by the previous `.on`, exactly as
// a fluent chain starting at S_START would.
let symbolTarget = S_START;
for (let i = 0; i < SYMBOL_TOKENS.length; i++) {
	symbolTarget = symbolTarget.on(SYMBOL_TOKENS[i][0], makeState(SYMBOL_TOKENS[i][1]));
}
// Whitespace jumps
// A newline leads to its own NL state; every other whitespace character
// leads to the shared WS state.
const S_NL = makeState(NL);
S_START.on('\n', S_NL).on(WHITESPACE, S_WS);

// WS loops back onto itself on more non-newline whitespace, so tokens made
// only of such whitespace can be arbitrarily long.
S_WS.on(WHITESPACE, S_WS);
// Build the states for every top-level domain and remember them so they can
// be given a jump to DOMAIN later on.
// Note that this is most accurate when tlds are in alphabetical order
let tldIndex = 0;
while (tldIndex < tlds.length) {
	const tldStates = stateify(tlds[tldIndex], S_START, TLD, DOMAIN);
	Array.prototype.push.apply(domainStates, tldStates);
	tldIndex++;
}
// Collect the states generated by the different protocols.
// Each call passes the protocol word, S_START, and DOMAIN for both token-class
// arguments; the result is used below as an array (push.apply / pop).
let partialProtocolFileStates = stateify('file', S_START, DOMAIN, DOMAIN);
let partialProtocolFtpStates = stateify('ftp', S_START, DOMAIN, DOMAIN);
let partialProtocolHttpStates = stateify('http', S_START, DOMAIN, DOMAIN);
// Add the states to the array of generic states that jump to DOMAIN.
// This copy is made before the pop() calls below, so domainStates also
// receives the last state of each protocol array.
domainStates.push.apply(domainStates, partialProtocolFileStates);
domainStates.push.apply(domainStates, partialProtocolFtpStates);
domainStates.push.apply(domainStates, partialProtocolHttpStates);
// Protocol states: the last element of each stateify result.
// NOTE(review): presumably this is the state reached after the whole protocol
// word has been read — confirm against stateify in ./state.
let S_PROTOCOL_FILE = partialProtocolFileStates.pop();
let S_PROTOCOL_FTP = partialProtocolFtpStates.pop();
let S_PROTOCOL_HTTP = partialProtocolHttpStates.pop();
// Reached from the ftp/http protocol states on a trailing 's'
let S_PROTOCOL_SECURE = makeState(DOMAIN);
let S_FULL_PROTOCOL = makeState(PROTOCOL); // Full protocol ends with COLON
// Secure protocols (end with 's')
S_PROTOCOL_FTP
.on('s', S_PROTOCOL_SECURE)
.on(':', S_FULL_PROTOCOL);
S_PROTOCOL_HTTP
.on('s', S_PROTOCOL_SECURE)
.on(':', S_FULL_PROTOCOL);
// S_PROTOCOL_SECURE was created with makeState rather than stateify, so it
// has to be added to domainStates explicitly.
domainStates.push(S_PROTOCOL_SECURE);
// Become protocol tokens after a COLON
S_PROTOCOL_FILE.on(':', S_FULL_PROTOCOL);
S_PROTOCOL_SECURE.on(':', S_FULL_PROTOCOL);
// Localhost
// Generate the states for the word "localhost" and register them with the
// other states that receive a jump to DOMAIN.
const partialLocalhostStates = stateify('localhost', S_START, LOCALHOST, DOMAIN);
Array.prototype.push.apply(domainStates, partialLocalhostStates);
// Everything else
// Number and character transitions; DOMAINs make more DOMAINs

// Shared wiring: a hyphen leads to S_DOMAIN_HYPHEN, then any alphanumeric
// character leads to S_DOMAIN (registered in that order).
const linkToDomain = function (state) {
	return state
		.on('-', S_DOMAIN_HYPHEN)
		.on(ALPHANUM, S_DOMAIN);
};

// Digits from the start state lead to NUM
S_START.on(NUMBERS, S_NUM);

// NUM stays NUM on digits (registered before the alphanumeric jump) and
// becomes DOMAIN on any other alphanumeric character
S_NUM.on('-', S_DOMAIN_HYPHEN).on(NUMBERS, S_NUM).on(ALPHANUM, S_DOMAIN);

linkToDomain(S_DOMAIN);

// All the generated states should have a jump to DOMAIN
domainStates.forEach((generatedState) => {
	linkToDomain(generatedState);
});

// Hyphens keep the machine in S_DOMAIN_HYPHEN until an alphanumeric
// character returns it to DOMAIN
S_DOMAIN_HYPHEN.on('-', S_DOMAIN_HYPHEN).on(NUMBERS, S_DOMAIN).on(ALPHANUM, S_DOMAIN);

// Set default transition
S_START.defaultTransition = makeState(SYM);
/**
	Given a string, returns an array of TOKEN instances representing the
	composition of that string.

	The string is scanned left to right. For each token the state machine is
	restarted at the start state and advanced one character at a time until no
	transition is available; the token then ends at the last position where an
	accepting state was seen. Token text is taken from the original string, so
	the input's casing is preserved.

	@method run
	@param {String} str Input string to scan
	@return {Array} Array of TOKEN instances
*/
const run = function (str) {
	// The state machine only looks at lowercase strings.
	// This selective `toLowerCase` is used because lowercasing the entire
	// string causes the length and character position to vary in some
	// non-English strings. This happens only on V8-based runtimes.
	const lowerStr = str.replace(/[A-Z]/g, (c) => c.toLowerCase());
	const len = str.length;
	const tokens = []; // return value
	let cursor = 0;

	// Tokenize the string
	while (cursor < len) {
		const tokenStart = cursor;
		let state = S_START;
		let nextState = null;
		let latestAccepting = null;
		// Characters consumed since the latest accepting state;
		// stays -1 until an accepting state has been seen
		let sinceAccepts = -1;

		while (cursor < len && (nextState = state.next(lowerStr[cursor]))) {
			state = nextState;

			// Keep track of the latest accepting state
			if (state.accepts()) {
				sinceAccepts = 0;
				latestAccepting = state;
			} else if (sinceAccepts >= 0) {
				sinceAccepts++;
			}

			cursor++;
		}

		if (sinceAccepts < 0) {
			// No accepting state was reached. Should never happen, but if it
			// does, the consumed characters are dropped. When nothing was
			// consumed at all, step over one character so the outer loop
			// always makes progress instead of spinning on the same position.
			if (cursor === tokenStart) {
				cursor++;
			}
			continue;
		}

		// Roll back to the latest accepting state
		cursor -= sinceAccepts;

		// Get the class for the new token
		const TOKEN = latestAccepting.emit();

		// No more jumps, just make a new token from the original-case text
		tokens.push(new TOKEN(str.slice(tokenStart, cursor)));
	}

	return tokens;
};
// Expose the scanner's start state under the exported name `start`
const start = S_START;
export {State, TOKENS, run, start};