Skip to content

Commit

Permalink
start implementing tokenizer & tokenize
Browse files Browse the repository at this point in the history
  • Loading branch information
anonrig committed Dec 4, 2024
1 parent b08176a commit 5967d62
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 38 deletions.
168 changes: 131 additions & 37 deletions include/ada/url_pattern.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,43 +11,6 @@

namespace ada {

// Declarations of the URLPattern canonicalization helpers. Each helper takes
// the raw component text and yields an optional string.
// NOTE(review): presumably std::nullopt signals that canonicalization failed;
// confirm against the definitions in src/url_pattern.cpp.
namespace url_pattern {

enum class errors { type_error };

// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol
std::optional<std::string> canonicalize_protocol(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-username
std::optional<std::string> canonicalize_username(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-password
std::optional<std::string> canonicalize_password(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-hostname
std::optional<std::string> canonicalize_hostname(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-an-ipv6-hostname
std::optional<std::string> canonicalize_ipv6_hostname(std::string_view input);

// `protocol` defaults to the placeholder scheme "fake" when the caller does
// not supply one.
// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
std::optional<std::string> canonicalize_port(
std::string_view input, std::string_view protocol = "fake");

// @see https://wicg.github.io/urlpattern/#canonicalize-a-pathname
std::optional<std::string> canonicalize_pathname(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-an-opaque-pathname
std::optional<std::string> canonicalize_opaque_pathname(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-search
std::optional<std::string> canonicalize_search(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-hash
std::optional<std::string> canonicalize_hash(std::string_view input);

} // namespace url_pattern

// URLPattern is a Web Platform standard API for matching URLs against a
// pattern syntax (think of it as a regular expression for URLs). It is
// defined in https://wicg.github.io/urlpattern.
Expand Down Expand Up @@ -175,6 +138,137 @@ class URLPattern {
bool ignore_case_ = false;
};

namespace url_pattern {

// Error codes reported by the URLPattern routines.
// NOTE(review): type_error appears to stand in for the spec steps that
// "throw a TypeError" -- confirm against the call sites.
enum class errors { type_error };

// Groups the enumerations used by the pattern tokenizer. At this point the
// struct carries no data members, only the nested enums below.
// @see https://urlpattern.spec.whatwg.org/#tokens
struct Token {
  // How the tokenizer is asked to behave ("strict" or "lenient").
  // @see https://urlpattern.spec.whatwg.org/#tokenize-policy
  enum Policy {
    STRICT = 0,
    LENIENT = 1,
  };

  // The kind of a token. The numeric values are spelled out explicitly and
  // match the implicit numbering of the declaration order.
  // @see https://urlpattern.spec.whatwg.org/#token
  enum Type {
    INVALID_CHAR = 0,
    OPEN = 1,
    CLOSE = 2,
    REGEXP = 3,
    NAME = 4,
    CHAR = 5,
    ESCAPED_CHAR = 6,
    OTHER_MODIFIER = 7,
    ASTERISK = 8,
    END = 9,
  };
};

// State carried while tokenizing a pattern string.
// @see https://urlpattern.spec.whatwg.org/#tokenizer
struct Tokenizer {
  // Creates a tokenizer that owns a copy of `input` and uses `policy`.
  // All remaining fields keep their in-class defaults.
  //
  // The constructor needs a body: a member-initializer list may only appear
  // on a definition, so a bare declaration ending in `;` does not compile.
  explicit Tokenizer(std::string_view input, Token::Policy policy)
      : input(input), policy(policy) {}

  // has an associated input, a pattern string, initially the empty string.
  std::string input{};
  // has an associated policy, a tokenize policy, initially "strict".
  Token::Policy policy = Token::Policy::STRICT;
  // has an associated token list, a token list, initially an empty list.
  std::vector<Token> token_list{};
  // has an associated index, a number, initially 0.
  size_t index = 0;
  // has an associated next index, a number, initially 0.
  size_t next_index = 0;
  // has an associated code point, a Unicode code point, initially null.
  char* code_point = nullptr;
};

// State carried while parsing a URLPattern constructor string. Only the
// constructor is public; every field and the State enum are private.
// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser
struct ConstructorStringParser {
// Declared here, defined elsewhere.
// NOTE(review): token_list is taken by non-const reference while the member
// of the same name below is held by value -- presumably it is copied; confirm
// in the definition.
explicit ConstructorStringParser(std::string_view input,
std::vector<Token>& token_list);

private:
// The states the parser can be in; `state` below starts at INIT.
// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser-state
enum State {
INIT,
PROTOCOL,
AUTHORITY,
PASSWORD,
HOSTNAME,
PORT,
PATHNAME,
SEARCH,
HASH,
DONE,
};
// has an associated input, a string, which must be set upon creation.
std::string input;
// has an associated token list, a token list, which must be set upon
// creation.
std::vector<Token> token_list;
// has an associated result, a URLPatternInit, initially set to a new
// URLPatternInit.
URLPattern::Init result{};
// has an associated component start, a number, initially set to 0.
size_t component_start = 0;
// has an associated token index, a number, initially set to 0.
size_t token_index = 0;
// has an associated token increment, a number, initially set to 1.
size_t token_increment = 1;
// has an associated group depth, a number, initially set to 0.
size_t group_depth = 0;
// has an associated hostname IPv6 bracket depth, a number, initially set to
// 0.
size_t hostname_ipv6_bracket_depth = 0;
// has an associated protocol matches a special scheme flag, a boolean,
// initially set to false.
bool protocol_matches_a_special_scheme_flag = false;
// has an associated state, initially set to "init". It must be one of the
// enumerators of State declared above.
State state = INIT;
};

// Canonicalization helpers: each takes the raw component text and yields an
// optional string.
// NOTE(review): presumably std::nullopt signals that canonicalization failed;
// confirm against the definitions in src/url_pattern.cpp.

// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol
std::optional<std::string> canonicalize_protocol(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-username
std::optional<std::string> canonicalize_username(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-password
std::optional<std::string> canonicalize_password(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-hostname
std::optional<std::string> canonicalize_hostname(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-an-ipv6-hostname
std::optional<std::string> canonicalize_ipv6_hostname(std::string_view input);

// `protocol` defaults to the placeholder scheme "fake" when the caller does
// not supply one.
// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
std::optional<std::string> canonicalize_port(
std::string_view input, std::string_view protocol = "fake");

// @see https://wicg.github.io/urlpattern/#canonicalize-a-pathname
std::optional<std::string> canonicalize_pathname(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-an-opaque-pathname
std::optional<std::string> canonicalize_opaque_pathname(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-search
std::optional<std::string> canonicalize_search(std::string_view input);

// @see https://wicg.github.io/urlpattern/#canonicalize-a-hash
std::optional<std::string> canonicalize_hash(std::string_view input);

// Turns a constructor string into a URLPattern::Init.
// @see https://urlpattern.spec.whatwg.org/#parse-a-constructor-string
URLPattern::Init parse_constructor_string(std::string_view input);

// Tokenizes `input` under the given policy.
// NOTE(review): the Tokenizer struct accumulates a token list, yet this
// function is declared to return std::string -- confirm the intended return
// type once the implementation is complete.
// @see https://urlpattern.spec.whatwg.org/#tokenize
std::string tokenize(std::string_view input, Token::Policy policy);

} // namespace url_pattern

} // namespace ada

#endif
39 changes: 38 additions & 1 deletion src/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,44 @@ result_type parse_url_impl(std::string_view user_input,
// Builds a URLPattern from `input`, following the steps quoted from the
// URLPattern specification in the comments below.
//
// input    - either a constructor string or an already-populated
//            URLPattern::Init.
// base_url - optional base URL (nullptr when absent). It is only accepted
//            together with a string input; combining it with an Init input
//            is rejected.
// options  - not consulted by the code written so far.
//
// Returns errors::type_error when the init validation fails. The remaining
// steps are not implemented yet, so the function currently also ends in
// errors::type_error on the success path.
tl::expected<ada::URLPattern, ada::url_pattern::errors> parse_url_pattern(
    std::variant<std::string_view, URLPattern::Init> input,
    const std::string_view* base_url, const ada::URLPattern::Options* options) {
  // Let init be null.
  URLPattern::Init init;

  // If input is a scalar value string then:
  if (std::holds_alternative<std::string_view>(input)) {
    // Set init to the result of running parse a constructor string given input.
    init = url_pattern::parse_constructor_string(
        std::get<std::string_view>(input));

    // If baseURL is null and init["protocol"] does not exist, then throw a
    // TypeError.
    if (base_url == nullptr && !init.protocol.has_value()) {
      return tl::unexpected(url_pattern::errors::type_error);
    }

    // If baseURL is not null, set init["baseURL"] to baseURL.
    if (base_url != nullptr) {
      init.base_url = std::string(*base_url);
    }
  } else {
    // Assert: input is a URLPatternInit.
    ADA_ASSERT_TRUE(std::holds_alternative<URLPattern::Init>(input));
    // If baseURL is not null, then throw a TypeError. A separate base URL
    // argument is only meaningful for string inputs; an Init carries its own.
    if (base_url != nullptr) {
      return tl::unexpected(url_pattern::errors::type_error);
    }
    // Optimization: Avoid copy by moving the input value.
    // Set init to input.
    init = std::move(std::get<URLPattern::Init>(input));
  }

  // Let processedInit be the result of process a URLPatternInit given init,
  // "pattern", null, null, null, null, null, null, null, and null.
  // TODO: Implement this

  // For each componentName of « "protocol", "username", "password", "hostname",
  // "port", "pathname", "search", "hash" If processedInit[componentName] does
  // not exist, then set processedInit[componentName] to "*".
  return tl::unexpected(url_pattern::errors::type_error);
}

Expand Down
22 changes: 22 additions & 0 deletions src/url_pattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,28 @@ std::optional<std::string> canonicalize_hash(std::string_view input) {
return std::string(hash.substr(1));
}

// Placeholder for "parse a constructor string": the parser is not written
// yet, so a default-constructed URLPattern::Init is handed back whatever the
// input is.
URLPattern::Init parse_constructor_string(std::string_view input) {
  // TODO: Implement this. Planned first step: create a constructor string
  // parser whose input is `input` and whose token list is the result of
  // running tokenize given `input` and "lenient".
  URLPattern::Init empty_init{};
  return empty_init;
}

// Work-in-progress implementation of the spec's "tokenize" algorithm.
//
// input  - the pattern string to tokenize.
// policy - the tokenize policy handed to the tokenizer.
//
// Returns an empty string for now; no tokens are produced yet. The loop
// consumes the input one unit at a time so that the function terminates
// (previously the index never advanced, so any non-empty input looped
// forever).
std::string tokenize(std::string_view input, Token::Policy policy) {
  // Let tokenizer be a new tokenizer.
  // Set tokenizer's input to input.
  // Set tokenizer's policy to policy.
  auto tokenizer = Tokenizer(input, policy);
  // While tokenizer's index is less than tokenizer's input's code point length:
  // NOTE(review): size() counts bytes of the stored std::string, not code
  // points -- revisit once code point decoding is implemented.
  while (tokenizer.index < tokenizer.input.size()) {
    // Run seek and get the next code point given tokenizer and tokenizer's
    // index.
    // TODO: decode the actual code point. Until then, step over exactly one
    // unit so that next index always ends up ahead of index.
    tokenizer.next_index = tokenizer.index + 1;

    // TODO: classify the code point and add the matching token.

    // Every token-adding step finishes by moving index to next index; doing
    // it here guarantees forward progress on each iteration.
    tokenizer.index = tokenizer.next_index;
  }
  // TODO: Implement this
  return "";
}

} // namespace url_pattern

URLPattern::Component::Component(std::string_view pattern_,
Expand Down

0 comments on commit 5967d62

Please sign in to comment.