diff --git a/.clang-format b/.clang-format index 1acba5a7b..59d0684df 100644 --- a/.clang-format +++ b/.clang-format @@ -1,2 +1,2 @@ BasedOnStyle: Google -SortIncludes: false +SortIncludes: Never diff --git a/.editorconfig b/.editorconfig index 0b3779e53..6c2eeb87e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -3,3 +3,5 @@ root = true [*] end_of_line = lf insert_final_newline = true +indent_size = 2 +indent_style = space diff --git a/.github/workflows/macos_install.yml b/.github/workflows/macos_install.yml index 3f9570eec..fe1b0e1b0 100644 --- a/.github/workflows/macos_install.yml +++ b/.github/workflows/macos_install.yml @@ -21,12 +21,12 @@ concurrency: cancel-in-progress: true jobs: - ubuntu-build: + macos-build: runs-on: macos-latest strategy: matrix: include: - shared: [ON, OFF] + - shared: OFF steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Prepare diff --git a/.gitignore b/.gitignore index 6f397c905..00d0f053e 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ benchmarks/competitors/servo-url/target #ignore VScode .vscode/ +.idea # bazel output bazel-* diff --git a/fuzz/build.sh b/fuzz/build.sh index ee78c20c2..0f5c65d19 100755 --- a/fuzz/build.sh +++ b/fuzz/build.sh @@ -37,6 +37,14 @@ $CXX $CFLAGS $CXXFLAGS \ $CXX $CFLAGS $CXXFLAGS $LIB_FUZZING_ENGINE url_search_params.o \ -o $OUT/url_search_params +$CXX $CFLAGS $CXXFLAGS \ + -std=c++20 \ + -I build/singleheader \ + -c fuzz/url_pattern.cc -o url_pattern.o + +$CXX $CFLAGS $CXXFLAGS $LIB_FUZZING_ENGINE url_pattern.o \ + -o $OUT/url_pattern + $CXX $CFLAGS $CXXFLAGS \ -std=c++20 \ -I build/singleheader \ diff --git a/fuzz/url_pattern.cc b/fuzz/url_pattern.cc new file mode 100644 index 000000000..cbc1c8c06 --- /dev/null +++ b/fuzz/url_pattern.cc @@ -0,0 +1,44 @@ +#include + +#include +#include + +#include "ada.cpp" +#include "ada.h" + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + FuzzedDataProvider fdp(data, size); + std::string source = fdp.ConsumeRandomLengthString(256); + std::string base_source = fdp.ConsumeRandomLengthString(256); + + // Without base or options + auto result = ada::parse_url_pattern(source, nullptr, nullptr); + (void)result; + + // Testing with base_url + std::string_view base_source_view(base_source.data(), base_source.length()); + auto result_with_base = + ada::parse_url_pattern(source, &base_source_view, nullptr); + (void)result_with_base; + + // Testing with base_url and options + ada::url_pattern_options options{.ignore_case = true}; + auto result_with_base_and_options = + ada::parse_url_pattern(source, &base_source_view, &options); + (void)result_with_base_and_options; + + // Testing with url_pattern_init and base url. + ada::url_pattern_init init{.protocol = source, + .username = source, + .password = source, + .hostname = source, + .port = source, + .pathname = source, + .search = source, + .hash = source}; + auto result_with_init = + ada::parse_url_pattern(init, &base_source_view, nullptr); + (void)result_with_init; + + return 0; +} diff --git a/include/ada.h b/include/ada.h index c5d0946ec..7c579d95d 100644 --- a/include/ada.h +++ b/include/ada.h @@ -26,6 +26,10 @@ #include "ada/url_aggregator-inl.h" #include "ada/url_search_params.h" #include "ada/url_search_params-inl.h" +#include "ada/url_pattern.h" +#include "ada/url_pattern-inl.h" +#include "ada/url_pattern_helpers.h" +#include "ada/url_pattern_helpers-inl.h" // Public API #include "ada/ada_version.h" diff --git a/include/ada/ada_idna.h b/include/ada/ada_idna.h index eb0537726..83532b59c 100644 --- a/include/ada/ada_idna.h +++ b/include/ada/ada_idna.h @@ -1,4 +1,4 @@ -/* auto-generated on 2024-12-05 20:44:13 -0500. Do not edit! */ +/* auto-generated on 2024-12-18 09:44:34 -0500. Do not edit! */ /* begin file include/idna.h */ #ifndef ADA_IDNA_H #define ADA_IDNA_H @@ -141,6 +141,29 @@ std::string to_unicode(std::string_view input); #endif // ADA_IDNA_TO_UNICODE_H /* end file include/ada/idna/to_unicode.h */ +/* begin file include/ada/idna/identifier.h */ +#ifndef ADA_IDNA_IDENTIFIER_H +#define ADA_IDNA_IDENTIFIER_H + +#include +#include + +namespace ada::idna { + +// Access the first code point of the input string. +// Verify if it is valid name code point given a Unicode code point and a +// boolean first: If first is true return the result of checking if code point +// is contained in the IdentifierStart set of code points. Otherwise return the +// result of checking if code point is contained in the IdentifierPart set of +// code points. Returns false if the input is empty or the code point is not +// valid. There is minimal Unicode error handling: the input should be valid +// UTF-8. https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point +bool valid_name_code_point(char32_t input, bool first); + +} // namespace ada::idna + +#endif +/* end file include/ada/idna/identifier.h */ #endif /* end file include/idna.h */ diff --git a/include/ada/checkers.h b/include/ada/checkers.h index 6b50915ef..e69cb4dd4 100644 --- a/include/ada/checkers.h +++ b/include/ada/checkers.h @@ -7,8 +7,8 @@ #include "ada/common_defs.h" -#include #include +#include /** * These functions are not part of our public API and may diff --git a/include/ada/common_defs.h b/include/ada/common_defs.h index cbac9029a..30a1ae99b 100644 --- a/include/ada/common_defs.h +++ b/include/ada/common_defs.h @@ -5,6 +5,10 @@ #ifndef ADA_COMMON_DEFS_H #define ADA_COMMON_DEFS_H +// https://en.cppreference.com/w/cpp/feature_test#Library_features +// detect C++20 features +#include + #ifdef _MSC_VER #define ADA_VISUAL_STUDIO 1 /** @@ -250,4 +254,11 @@ namespace ada { #define ada_lifetime_bound #endif +#ifdef __cpp_lib_format +#if __cpp_lib_format >= 202110L +#include +#define ADA_HAS_FORMAT 1 +#endif +#endif + #endif // ADA_COMMON_DEFS_H diff --git a/include/ada/helpers.h b/include/ada/helpers.h index d20473f0c..96c4b5e15 100644 --- a/include/ada/helpers.h +++ b/include/ada/helpers.h @@ -139,7 +139,7 @@ ada_really_inline std::pair get_host_delimiter_location( * Removes leading and trailing C0 control and whitespace characters from * string. */ -ada_really_inline void trim_c0_whitespace(std::string_view& input) noexcept; +void trim_c0_whitespace(std::string_view& input) noexcept; /** * @private diff --git a/include/ada/implementation.h b/include/ada/implementation.h index 295ea03a9..33bf67978 100644 --- a/include/ada/implementation.h +++ b/include/ada/implementation.h @@ -17,7 +17,7 @@ #include "ada/url_aggregator.h" namespace ada { -enum class errors { generic_error }; +enum class errors : uint8_t { type_error }; template using result = tl::expected; @@ -49,6 +49,20 @@ extern template ada::result parse( bool can_parse(std::string_view input, const std::string_view* base_input = nullptr); +/** + * Implementation of the URL pattern parsing algorithm. + * @see https://urlpattern.spec.whatwg.org + * + * @param input valid UTF-8 string or URLPatternInit struct + * @param base_url an optional valid UTF-8 string + * @param options an optional url_pattern_options struct + * @return url_pattern instance + */ +ada_warn_unused tl::expected parse_url_pattern( + std::variant input, + const std::string_view* base_url = nullptr, + const url_pattern_options* options = nullptr); + /** * Computes a href string from a file path. The function assumes * that the input is a valid ASCII or UTF-8 string. diff --git a/include/ada/parser.h b/include/ada/parser.h index 5bb148c89..80e97decc 100644 --- a/include/ada/parser.h +++ b/include/ada/parser.h @@ -5,12 +5,10 @@ #ifndef ADA_PARSER_H #define ADA_PARSER_H -#include #include +#include -#include "ada/encoding_type.h" #include "ada/expected.h" -#include "ada/state.h" /** * @private @@ -18,6 +16,10 @@ namespace ada { struct url_aggregator; struct url; +class url_pattern; +struct url_pattern_options; +struct url_pattern_init; +enum class errors : uint8_t; } // namespace ada /** @@ -31,7 +33,7 @@ namespace ada::parser { * parameter that can be used to resolve relative URLs. If the base_url is * provided, the user_input is resolved against the base_url. */ -template +template result_type parse_url(std::string_view user_input, const result_type* base_url = nullptr); @@ -40,7 +42,7 @@ extern template url_aggregator parse_url( extern template url parse_url(std::string_view user_input, const url* base_url); -template +template result_type parse_url_impl(std::string_view user_input, const result_type* base_url = nullptr); @@ -48,6 +50,11 @@ extern template url_aggregator parse_url_impl( std::string_view user_input, const url_aggregator* base_url); extern template url parse_url_impl(std::string_view user_input, const url* base_url); + +tl::expected parse_url_pattern_impl( + std::variant input, + const std::string_view* base_url, const url_pattern_options* options); + } // namespace ada::parser #endif // ADA_PARSER_H diff --git a/include/ada/unicode-inl.h b/include/ada/unicode-inl.h index 4a7d4114d..cd9339e6a 100644 --- a/include/ada/unicode-inl.h +++ b/include/ada/unicode-inl.h @@ -4,8 +4,8 @@ */ #ifndef ADA_UNICODE_INL_H #define ADA_UNICODE_INL_H -#include #include "ada/unicode.h" +#include "ada/character_sets-inl.h" /** * Unicode operations. These functions are not part of our public API and may diff --git a/include/ada/unicode.h b/include/ada/unicode.h index 53b484139..c380908b0 100644 --- a/include/ada/unicode.h +++ b/include/ada/unicode.h @@ -124,6 +124,20 @@ ada_really_inline constexpr bool is_alnum_plus(char c) noexcept; */ ada_really_inline constexpr bool is_ascii_hex_digit(char c) noexcept; +/** + * @private + * An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), + * inclusive. + */ +ada_really_inline constexpr bool is_ascii_digit(char c) noexcept; + +/** + * @private + * @details If a char is between U+0000 and U+007F inclusive, then it's an ASCII + * character. + */ +ada_really_inline constexpr bool is_ascii(char32_t c) noexcept; + /** * @private * Checks if the input is a C0 control or space character. diff --git a/include/ada/url.h b/include/ada/url.h index bbcac47e1..09279f447 100644 --- a/include/ada/url.h +++ b/include/ada/url.h @@ -20,6 +20,7 @@ #include "ada/unicode.h" #include "ada/url_base.h" #include "ada/url_components.h" +#include "ada/helpers.h" namespace ada { diff --git a/include/ada/url_aggregator-inl.h b/include/ada/url_aggregator-inl.h index 2bca0d196..2012b79d4 100644 --- a/include/ada/url_aggregator-inl.h +++ b/include/ada/url_aggregator-inl.h @@ -7,9 +7,8 @@ #include "ada/character_sets.h" #include "ada/character_sets-inl.h" -#include "ada/checkers-inl.h" +#include "ada/checkers.h" #include "ada/helpers.h" -#include "ada/unicode.h" #include "ada/unicode-inl.h" #include "ada/url_aggregator.h" #include "ada/url_components.h" @@ -269,7 +268,6 @@ inline void url_aggregator::update_base_pathname(const std::string_view input) { const bool begins_with_dashdash = input.starts_with("//"); if (!begins_with_dashdash && has_dash_dot()) { - ada_log("url_aggregator::update_base_pathname has /.: \n", to_diagram()); // We must delete the ./ delete_dash_dot(); } @@ -292,8 +290,6 @@ inline void url_aggregator::update_base_pathname(const std::string_view input) { if (components.hash_start != url_components::omitted) { components.hash_start += difference; } - ada_log("url_aggregator::update_base_pathname end '", input, "' [", - input.size(), " bytes] \n", to_diagram()); ADA_ASSERT_TRUE(validate()); } @@ -413,7 +409,7 @@ inline void url_aggregator::append_base_username(const std::string_view input) { } constexpr void url_aggregator::clear_password() { - ada_log("url_aggregator::clear_password ", to_string(), "\n", to_diagram()); + ada_log("url_aggregator::clear_password ", to_string()); ADA_ASSERT_TRUE(validate()); if (!has_password()) { return; diff --git a/include/ada/url_aggregator.h b/include/ada/url_aggregator.h index dad4750c8..66f7991c3 100644 --- a/include/ada/url_aggregator.h +++ b/include/ada/url_aggregator.h @@ -8,9 +8,11 @@ #include #include +#include "ada/url_pattern.h" #include "ada/common_defs.h" #include "ada/url_base.h" #include "ada/url_components.h" +#include "ada/parser.h" namespace ada { @@ -168,7 +170,7 @@ struct url_aggregator : url_base { * @see * https://github.com/servo/rust-url/blob/b65a45515c10713f6d212e6726719a020203cc98/url/src/quirks.rs#L31 */ - [[nodiscard]] ada_really_inline const ada::url_components &get_components() + [[nodiscard]] ada_really_inline const url_components &get_components() const noexcept; /** * Returns a string representation of this URL. @@ -208,15 +210,21 @@ struct url_aggregator : url_base { inline void clear_search() override; private: - friend ada::url_aggregator ada::parser::parse_url( - std::string_view, const ada::url_aggregator *); - friend void ada::helpers::strip_trailing_spaces_from_opaque_path< - ada::url_aggregator>(ada::url_aggregator &url) noexcept; - friend ada::url_aggregator ada::parser::parse_url_impl< - ada::url_aggregator, true>(std::string_view, const ada::url_aggregator *); - friend ada::url_aggregator - ada::parser::parse_url_impl( - std::string_view, const ada::url_aggregator *); + // helper methods + friend void helpers::strip_trailing_spaces_from_opaque_path( + url_aggregator &url) noexcept; + // parse_url methods + friend url_aggregator parser::parse_url( + std::string_view, const url_aggregator *); + + friend url_aggregator parser::parse_url_impl( + std::string_view, const url_aggregator *); + friend url_aggregator parser::parse_url_impl( + std::string_view, const url_aggregator *); + // url_pattern methods + friend tl::expected parse_url_pattern_impl( + std::variant input, + const std::string_view *base_url, const url_pattern_options *options); std::string buffer{}; url_components components{}; @@ -277,7 +285,7 @@ struct url_aggregator : url_base { ada_really_inline bool parse_host(std::string_view input); inline void update_base_authority(std::string_view base_buffer, - const ada::url_components &base); + const url_components &base); inline void update_unencoded_base_hash(std::string_view input); inline void update_base_hostname(std::string_view input); inline void update_base_search(std::string_view input); @@ -317,7 +325,7 @@ struct url_aggregator : url_base { }; // url_aggregator -inline std::ostream &operator<<(std::ostream &out, const ada::url &u); +inline std::ostream &operator<<(std::ostream &out, const url &u); } // namespace ada #endif diff --git a/include/ada/url_pattern-inl.h b/include/ada/url_pattern-inl.h new file mode 100644 index 000000000..2ad9e0af4 --- /dev/null +++ b/include/ada/url_pattern-inl.h @@ -0,0 +1,154 @@ +/** + * @file url_pattern-inl.h + * @brief Declaration for the URLPattern inline functions. + */ +#ifndef ADA_URL_PATTERN_INL_H +#define ADA_URL_PATTERN_INL_H + +#include "ada/common_defs.h" +#include "ada/url_pattern.h" + +#include + +namespace ada { + +inline bool url_pattern_init::operator==(const url_pattern_init& other) const { + return protocol == other.protocol && username == other.username && + password == other.password && hostname == other.hostname && + port == other.port && search == other.search && hash == other.hash && + pathname == other.pathname; +} + +inline bool url_pattern_component_result::operator==( + const url_pattern_component_result& other) const { + return input == other.input && groups == other.groups; +} + +inline std::string url_pattern_component::to_string() const { +#ifdef ADA_HAS_FORMAT + return std::format(R"({{"pattern": "{}", "has_regexp_groups": {}}})", pattern, + has_regexp_groups ? "true" : "false" //, + ); +#else + return ""; +#endif +} + +inline url_pattern_component_result +url_pattern_component::create_component_match_result( + std::string_view input, const std::smatch& exec_result) { + // Let result be a new URLPatternComponentResult. + // Set result["input"] to input. + // Let groups be a record. + auto result = + url_pattern_component_result{.input = std::string(input), .groups = {}}; + + // If input is empty, then groups will always be empty. + if (input.empty()) { + return result; + } + + // Optimization: Let's reserve the size. + result.groups.reserve(exec_result.size() - 1); + + size_t group_index = 0; + // Let index be 1. + // While index is less than Get(execResult, "length"): + for (size_t index = 1; index < exec_result.size(); index++) { + // Let name be component’s group name list[index - 1]. + // Let value be Get(execResult, ToString(index)). + // Set groups[name] to value. + auto exec = exec_result[index]; + if (!exec.matched) continue; + result.groups.insert({ + group_name_list[group_index], + exec.str(), + }); + + group_index++; + } + return result; +} + +inline std::string url_pattern::to_string() const { +#ifdef ADA_HAS_FORMAT + return std::format( + R"({{"protocol_component": "{}", "username_component": {}, "password_component": {}, "hostname_component": {}, "port_component": {}, "pathname_component": {}, "search_component": {}, "hash_component": {}, "ignore_case": {}}})", + protocol_component.to_string(), username_component.to_string(), + password_component.to_string(), hostname_component.to_string(), + port_component.to_string(), pathname_component.to_string(), + search_component.to_string(), hash_component.to_string(), + ignore_case_ ? "true" : "false"); +#else + return ""; +#endif +} + +inline std::string_view url_pattern::get_protocol() const ada_lifetime_bound { + // Return this's associated URL pattern's protocol component's pattern string. + return protocol_component.pattern; +} +inline std::string_view url_pattern::get_username() const ada_lifetime_bound { + // Return this's associated URL pattern's username component's pattern string. + return username_component.pattern; +} +inline std::string_view url_pattern::get_password() const ada_lifetime_bound { + // Return this's associated URL pattern's password component's pattern string. + return password_component.pattern; +} +inline std::string_view url_pattern::get_hostname() const ada_lifetime_bound { + // Return this's associated URL pattern's hostname component's pattern string. + return hostname_component.pattern; +} +inline std::string_view url_pattern::get_port() const ada_lifetime_bound { + // Return this's associated URL pattern's port component's pattern string. + return port_component.pattern; +} +inline std::string_view url_pattern::get_pathname() const ada_lifetime_bound { + // Return this's associated URL pattern's pathname component's pattern string. + return pathname_component.pattern; +} +inline std::string_view url_pattern::get_search() const ada_lifetime_bound { + // Return this's associated URL pattern's search component's pattern string. + return search_component.pattern; +} +inline std::string_view url_pattern::get_hash() const ada_lifetime_bound { + // Return this's associated URL pattern's hash component's pattern string. + return hash_component.pattern; +} + +inline bool url_pattern::ignore_case() const { return ignore_case_; } + +inline bool url_pattern::has_regexp_groups() const { + // If this's associated URL pattern's has regexp groups, then return true. + return protocol_component.has_regexp_groups || + username_component.has_regexp_groups || + password_component.has_regexp_groups || + hostname_component.has_regexp_groups || + port_component.has_regexp_groups || + pathname_component.has_regexp_groups || + search_component.has_regexp_groups || hash_component.has_regexp_groups; +} + +inline bool url_pattern_part::is_regexp() const noexcept { + return type == url_pattern_part_type::REGEXP; +} + +inline std::string_view url_pattern_compile_component_options::get_delimiter() + const { + if (delimiter) { + return {&delimiter.value(), 1}; + } + return {}; +} + +inline std::string_view url_pattern_compile_component_options::get_prefix() + const { + if (prefix) { + return {&prefix.value(), 1}; + } + return {}; +} +} // namespace ada + +#endif diff --git a/include/ada/url_pattern.h b/include/ada/url_pattern.h new file mode 100644 index 000000000..4c0b897be --- /dev/null +++ b/include/ada/url_pattern.h @@ -0,0 +1,341 @@ +/** + * @file url_pattern.h + * @brief Declaration for the URLPattern implementation. + */ +#ifndef ADA_URL_PATTERN_H +#define ADA_URL_PATTERN_H + +#include "ada/implementation.h" +#include "ada/expected.h" + +#include +#include +#include +#include +#include + +namespace ada { + +namespace parser { +template +tl::expected parse_url_pattern_impl( + std::variant input, + const std::string_view* base_url, const url_pattern_options* options); +} + +// Important: C++20 allows us to use concept rather than `using` or `typedef +// and allows functions with second argument, which is optional (using either +// std::nullopt or a parameter with default value) +template +concept url_pattern_encoding_callback = requires(F f, std::string_view sv) { + { f(sv) } -> std::same_as>; +}; + +// A structure providing matching patterns for individual components +// of a URL. When a URLPattern is created, or when a URLPattern is +// used to match or test against a URL, the input can be given as +// either a string or a URLPatternInit struct. If a string is given, +// it will be parsed to create a URLPatternInit. The URLPatternInit +// API is defined as part of the URLPattern specification. +struct url_pattern_init { + // @see https://urlpattern.spec.whatwg.org/#process-a-urlpatterninit + static tl::expected process( + url_pattern_init init, std::string_view type, + std::optional protocol = std::nullopt, + std::optional username = std::nullopt, + std::optional password = std::nullopt, + std::optional hostname = std::nullopt, + std::optional port = std::nullopt, + std::optional pathname = std::nullopt, + std::optional search = std::nullopt, + std::optional hash = std::nullopt); + + // @see https://urlpattern.spec.whatwg.org/#process-protocol-for-init + static tl::expected process_protocol( + std::string_view value, std::string_view type); + + // @see https://urlpattern.spec.whatwg.org/#process-username-for-init + static tl::expected process_username( + std::string_view value, std::string_view type); + + // @see https://urlpattern.spec.whatwg.org/#process-password-for-init + static tl::expected process_password( + std::string_view value, std::string_view type); + + // @see https://urlpattern.spec.whatwg.org/#process-hostname-for-init + static tl::expected process_hostname( + std::string_view value, std::string_view type); + + // @see https://urlpattern.spec.whatwg.org/#process-port-for-init + static tl::expected process_port( + std::string_view port, std::string_view protocol, std::string_view type); + + // @see https://urlpattern.spec.whatwg.org/#process-pathname-for-init + static tl::expected process_pathname( + std::string_view value, std::string_view protocol, std::string_view type); + + // @see https://urlpattern.spec.whatwg.org/#process-search-for-init + static tl::expected process_search( + std::string_view value, std::string_view type); + + // @see https://urlpattern.spec.whatwg.org/#process-hash-for-init + static tl::expected process_hash(std::string_view value, + std::string_view type); + + [[nodiscard]] std::string to_string() const; + + bool operator==(const url_pattern_init&) const; + + std::optional protocol{}; + std::optional username{}; + std::optional password{}; + std::optional hostname{}; + std::optional port{}; + std::optional pathname{}; + std::optional search{}; + std::optional hash{}; + std::optional base_url{}; +}; + +enum class url_pattern_part_type : uint8_t { + // The part represents a simple fixed text string. + FIXED_TEXT, + // The part represents a matching group with a custom regular expression. + REGEXP, + // The part represents a matching group that matches code points up to the + // next separator code point. This is typically used for a named group like + // ":foo" that does not have a custom regular expression. + SEGMENT_WILDCARD, + // The part represents a matching group that greedily matches all code points. + // This is typically used for the "*" wildcard matching group. + FULL_WILDCARD, +}; + +enum class url_pattern_part_modifier : uint8_t { + // The part does not have a modifier. + NONE, + // The part has an optional modifier indicated by the U+003F (?) code point. + OPTIONAL, + // The part has a "zero or more" modifier indicated by the U+002A (*) code + // point. + ZERO_OR_MORE, + // The part has a "one or more" modifier indicated by the U+002B (+) code + // point. + ONE_OR_MORE, +}; + +// @see https://urlpattern.spec.whatwg.org/#part +class url_pattern_part { + public: + url_pattern_part(url_pattern_part_type _type, std::string&& _value, + url_pattern_part_modifier _modifier) + : type(_type), value(_value), modifier(_modifier) {} + + url_pattern_part(url_pattern_part_type _type, std::string&& _value, + url_pattern_part_modifier _modifier, std::string&& _name, + std::string&& _prefix, std::string&& _suffix) + : type(_type), + value(_value), + modifier(_modifier), + name(_name), + prefix(_prefix), + suffix(_suffix) {} + // A part has an associated type, a string, which must be set upon creation. + url_pattern_part_type type; + // A part has an associated value, a string, which must be set upon creation. + std::string value; + // A part has an associated modifier a string, which must be set upon + // creation. + url_pattern_part_modifier modifier; + // A part has an associated name, a string, initially the empty string. + std::string name{}; + // A part has an associated prefix, a string, initially the empty string. + std::string prefix{}; + // A part has an associated suffix, a string, initially the empty string. + std::string suffix{}; + + inline bool is_regexp() const noexcept; +}; + +// @see https://urlpattern.spec.whatwg.org/#options-header +struct url_pattern_compile_component_options { + url_pattern_compile_component_options() = default; + explicit url_pattern_compile_component_options( + std::optional new_delimiter = std::nullopt, + std::optional new_prefix = std::nullopt) + : delimiter(new_delimiter), prefix(new_prefix) {} + + std::string_view get_delimiter() const ada_warn_unused; + std::string_view get_prefix() const ada_warn_unused; + + // @see https://urlpattern.spec.whatwg.org/#options-ignore-case + bool ignore_case = false; + + static url_pattern_compile_component_options DEFAULT; + static url_pattern_compile_component_options HOSTNAME; + static url_pattern_compile_component_options PATHNAME; + + private: + // @see https://urlpattern.spec.whatwg.org/#options-delimiter-code-point + std::optional delimiter{}; + // @see https://urlpattern.spec.whatwg.org/#options-prefix-code-point + std::optional prefix{}; +}; + +// A struct providing the URLPattern matching results for a single +// URL component. The URLPatternComponentResult is only ever used +// as a member attribute of a URLPatternResult struct. The +// URLPatternComponentResult API is defined as part of the URLPattern +// specification. +struct url_pattern_component_result { + std::string input; + std::unordered_map groups; + + bool operator==(const url_pattern_component_result&) const; + +#if ADA_TESTING + friend void PrintTo(const url_pattern_component_result& result, + std::ostream* os) { + *os << "input: '" << result.input << "', group: "; + for (const auto& group : result.groups) { + *os << "(" << group.first << ", " << group.second << ") "; + } + } +#endif // ADA_TESTING +}; + +class url_pattern_component { + public: + url_pattern_component() = default; + + // This function explicitly takes a std::string because it is moved. + // To avoid unnecessary copy, move each value while calling the constructor. + url_pattern_component(std::string&& new_pattern, std::regex&& new_regexp, + std::regex_constants::syntax_option_type new_flags, + std::vector&& new_group_name_list, + bool new_has_regexp_groups) + : regexp(std::move(new_regexp)), + pattern(std::move(new_pattern)), + flags(new_flags), + group_name_list(new_group_name_list), + has_regexp_groups(new_has_regexp_groups) {} + + // @see https://urlpattern.spec.whatwg.org/#compile-a-component + template + static tl::expected compile( + std::string_view input, F& encoding_callback, + url_pattern_compile_component_options& options); + + // @see https://urlpattern.spec.whatwg.org/#create-a-component-match-result + url_pattern_component_result create_component_match_result( + std::string_view input, const std::smatch& exec_result); + + std::string to_string() const; + + std::regex regexp{}; + std::string pattern{}; + std::regex_constants::syntax_option_type flags = std::regex::ECMAScript; + std::vector group_name_list{}; + bool has_regexp_groups = false; +}; + +using url_pattern_input = std::variant; + +// A struct providing the URLPattern matching results for all +// components of a URL. The URLPatternResult API is defined as +// part of the URLPattern specification. +struct url_pattern_result { + std::vector inputs; + url_pattern_component_result protocol; + url_pattern_component_result username; + url_pattern_component_result password; + url_pattern_component_result hostname; + url_pattern_component_result port; + url_pattern_component_result pathname; + url_pattern_component_result search; + url_pattern_component_result hash; +}; + +struct url_pattern_options { + bool ignore_case = false; + + std::string to_string() const; +}; + +// URLPattern is a Web Platform standard API for matching URLs against a +// pattern syntax (think of it as a regular expression for URLs). It is +// defined in https://wicg.github.io/urlpattern. +// More information about the URL Pattern syntax can be found at +// https://developer.mozilla.org/en-US/docs/Web/API/URL_Pattern_API +class url_pattern { + public: + url_pattern() = default; + explicit url_pattern(std::optional&& input, + std::optional&& base_url, + std::optional&& options); + + /** + * @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-exec + */ + result> exec(const url_pattern_input& input, + std::string_view* base_url); + + /** + * @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-test + */ + result test(const url_pattern_input& input, std::string_view* base_url); + + /** + * @see https://urlpattern.spec.whatwg.org/#url-pattern-match + * This function expects a valid UTF-8 string if input is a string. + */ + result> match( + const url_pattern_input& input, std::string_view* base_url_string); + + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-protocol + std::string_view get_protocol() const ada_lifetime_bound; + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-username + std::string_view get_username() const ada_lifetime_bound; + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-password + std::string_view get_password() const ada_lifetime_bound; + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hostname + std::string_view get_hostname() const ada_lifetime_bound; + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-port + std::string_view get_port() const ada_lifetime_bound; + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-pathname + std::string_view get_pathname() const ada_lifetime_bound; + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-search + std::string_view get_search() const ada_lifetime_bound; + // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hash + std::string_view get_hash() const ada_lifetime_bound; + + // If ignoreCase is true, the JavaScript regular expression created for each + // pattern must use the `vi` flag. Otherwise, they must use the `v` flag. + bool ignore_case() const; + + // @see https://urlpattern.spec.whatwg.org/#url-pattern-has-regexp-groups + bool has_regexp_groups() const; + + std::string to_string() const; + + url_pattern_component protocol_component{}; + url_pattern_component username_component{}; + url_pattern_component password_component{}; + url_pattern_component hostname_component{}; + url_pattern_component port_component{}; + url_pattern_component pathname_component{}; + url_pattern_component search_component{}; + url_pattern_component hash_component{}; + bool ignore_case_ = false; + + template + friend tl::expected parser::parse_url_pattern_impl( + std::variant input, + const std::string_view* base_url, const url_pattern_options* options); +}; + +} // namespace ada + +#endif diff --git a/include/ada/url_pattern_helpers-inl.h b/include/ada/url_pattern_helpers-inl.h new file mode 100644 index 000000000..37311bb2b --- /dev/null +++ b/include/ada/url_pattern_helpers-inl.h @@ -0,0 +1,767 @@ +/** + * @file url_pattern_helpers-inl.h + * @brief Declaration for the URLPattern helpers. + */ +#ifndef ADA_URL_PATTERN_HELPERS_INL_H +#define ADA_URL_PATTERN_HELPERS_INL_H + +#include "ada/common_defs.h" +#include "ada/expected.h" +#include "ada/url_pattern.h" +#include "ada/url_pattern_helpers.h" +#include "ada/implementation.h" + +namespace ada::url_pattern_helpers { +inline std::string to_string(token_type type) { + switch (type) { + case token_type::INVALID_CHAR: + return "INVALID_CHAR"; + case token_type::OPEN: + return "OPEN"; + case token_type::CLOSE: + return "CLOSE"; + case token_type::REGEXP: + return "REGEXP"; + case token_type::NAME: + return "NAME"; + case token_type::CHAR: + return "CHAR"; + case token_type::ESCAPED_CHAR: + return "ESCAPED_CHAR"; + case token_type::OTHER_MODIFIER: + return "OTHER_MODIFIER"; + case token_type::ASTERISK: + return "ASTERISK"; + case token_type::END: + return "END"; + default: + ada::unreachable(); + } +} + +inline void constructor_string_parser::rewind() { + // Set parser’s token index to parser’s component start. + token_index = component_start; + // Set parser’s token increment to 0. + token_increment = 0; +} + +inline bool constructor_string_parser::is_hash_prefix() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index and "#". + return is_non_special_pattern_char(token_index, "#"); +} + +inline bool constructor_string_parser::is_search_prefix() { + // If result of running is a non-special pattern char given parser, parser’s + // token index and "?" is true, then return true. + if (is_non_special_pattern_char(token_index, "?")) { + return true; + } + + // If parser’s token list[parser’s token index]'s value is not "?", then + // return false. + if (token_list[token_index].value != "?") { + return false; + } + + // If previous index is less than 0, then return true. + if (token_index == 0) return true; + // Let previous index be parser’s token index − 1. + auto previous_index = token_index - 1; + // Let previous token be the result of running get a safe token given parser + // and previous index. + auto previous_token = get_safe_token(previous_index); + ADA_ASSERT_TRUE(previous_token); + // If any of the following are true, then return false: + // - previous token’s type is "name". + // - previous token’s type is "regexp". + // - previous token’s type is "close". + // - previous token’s type is "asterisk". + return !(previous_token->type == token_type::NAME || + previous_token->type == token_type::REGEXP || + previous_token->type == token_type::CLOSE || + previous_token->type == token_type::ASTERISK); +} + +inline bool constructor_string_parser::is_non_special_pattern_char( + size_t index, std::string_view value) { + // Let token be the result of running get a safe token given parser and index. + auto token = get_safe_token(index); + ADA_ASSERT_TRUE(token); + + // If token’s value is not value, then return false. + if (token->value != value) { + return false; + } + + // If any of the following are true: + // - token’s type is "char"; + // - token’s type is "escaped-char"; or + // - token’s type is "invalid-char", + // - then return true. + return token->type == token_type::CHAR || + token->type == token_type::ESCAPED_CHAR || + token->type == token_type::INVALID_CHAR; +} + +inline const Token* constructor_string_parser::get_safe_token(size_t index) { + // If index is less than parser’s token list's size, then return parser’s + // token list[index]. + if (index < token_list.size()) [[likely]] { + return &token_list[index]; + } + + // Assert: parser’s token list's size is greater than or equal to 1. + ADA_ASSERT_TRUE(!token_list.empty()); + + // Let token be parser’s token list[last index]. + // Assert: token’s type is "end". + ADA_ASSERT_TRUE(token_list.back().type == token_type::END); + + // Return token. + return &token_list.back(); +} + +inline bool constructor_string_parser::is_group_open() const { + // If parser’s token list[parser’s token index]'s type is "open", then return + // true. + return token_list[token_index].type == token_type::OPEN; +} + +inline bool constructor_string_parser::is_group_close() const { + // If parser’s token list[parser’s token index]'s type is "close", then return + // true. + return token_list[token_index].type == token_type::CLOSE; +} + +inline bool constructor_string_parser::next_is_authority_slashes() { + // If the result of running is a non-special pattern char given parser, + // parser’s token index + 1, and "/" is false, then return false. + if (!is_non_special_pattern_char(token_index + 1, "/")) { + return false; + } + // If the result of running is a non-special pattern char given parser, + // parser’s token index + 2, and "/" is false, then return false. + if (!is_non_special_pattern_char(token_index + 2, "/")) { + return false; + } + return true; +} + +inline bool constructor_string_parser::is_protocol_suffix() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index, and ":". + return is_non_special_pattern_char(token_index, ":"); +} + +inline void constructor_string_parser::change_state(State new_state, + size_t skip) { + // If parser’s state is not "init", not "authority", and not "done", then set + // parser’s result[parser’s state] to the result of running make a component + // string given parser. + if (state != State::INIT && state != State::AUTHORITY && + state != State::DONE) { + auto value = make_component_string(); + // TODO: Simplify this. + switch (state) { + case State::PROTOCOL: { + result.protocol = value; + break; + } + case State::USERNAME: { + result.username = value; + break; + } + case State::PASSWORD: { + result.password = value; + break; + } + case State::HOSTNAME: { + result.hostname = value; + break; + } + case State::PORT: { + result.port = value; + break; + } + case State::PATHNAME: { + result.pathname = value; + break; + } + case State::SEARCH: { + result.search = value; + break; + } + case State::HASH: { + result.hash = value; + break; + } + default: + ada::unreachable(); + } + } + + // If parser’s state is not "init" and new state is not "done", then: + if (state != State::INIT && new_state != State::DONE) { + // If parser’s state is "protocol", "authority", "username", or "password"; + // new state is "port", "pathname", "search", or "hash"; and parser’s + // result["hostname"] does not exist, then set parser’s result["hostname"] + // to the empty string. + if ((state == State::PROTOCOL || state == State::AUTHORITY || + state == State::USERNAME || state == State::PASSWORD) && + (new_state == State::PORT || new_state == State::PATHNAME || + new_state == State::SEARCH || new_state == State::HASH) && + !result.hostname) + result.hostname = ""; + } + + // If parser’s state is "protocol", "authority", "username", "password", + // "hostname", or "port"; new state is "search" or "hash"; and parser’s + // result["pathname"] does not exist, then: + if ((state == State::PROTOCOL || state == State::AUTHORITY || + state == State::USERNAME || state == State::PASSWORD || + state == State::HOSTNAME || state == State::PORT) && + (new_state == State::SEARCH || new_state == State::HASH) && + !result.pathname) { + if (protocol_matches_a_special_scheme_flag) { + result.pathname = "/"; + } else { + // Otherwise, set parser’s result["pathname"] to the empty string. + result.pathname = ""; + } + } + + // If parser’s state is "protocol", "authority", "username", "password", + // "hostname", "port", or "pathname"; new state is "hash"; and parser’s + // result["search"] does not exist, then set parser’s result["search"] to + // the empty string. + if ((state == State::PROTOCOL || state == State::AUTHORITY || + state == State::USERNAME || state == State::PASSWORD || + state == State::HOSTNAME || state == State::PORT || + state == State::PATHNAME) && + new_state == State::HASH && !result.search) { + result.search = ""; + } + + // Set parser’s state to new state. + state = new_state; + // Increment parser’s token index by skip. + token_index += skip; + // Set parser’s component start to parser’s token index. + component_start = token_index; + // Set parser’s token increment to 0. + token_increment = 0; +} + +inline std::string constructor_string_parser::make_component_string() { + // Assert: parser’s token index is less than parser’s token list's size. + ADA_ASSERT_TRUE(token_index < token_list.size()); + + // Let token be parser’s token list[parser’s token index]. + // Let end index be token’s index. + const auto end_index = token_list[token_index].index; + // Let component start token be the result of running get a safe token given + // parser and parser’s component start. + const auto component_start_token = get_safe_token(component_start); + ADA_ASSERT_TRUE(component_start_token); + // Let component start input index be component start token’s index. + const auto component_start_input_index = component_start_token->index; + // Return the code point substring from component start input index to end + // index within parser’s input. + return input.substr(component_start_input_index, + end_index - component_start_input_index); +} + +inline bool constructor_string_parser::is_an_identity_terminator() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index, and "@". + return is_non_special_pattern_char(token_index, "@"); +} + +inline bool constructor_string_parser::is_pathname_start() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index, and "/". + return is_non_special_pattern_char(token_index, "/"); +} + +inline bool constructor_string_parser::is_password_prefix() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index, and ":". + return is_non_special_pattern_char(token_index, ":"); +} + +inline bool constructor_string_parser::is_an_ipv6_open() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index, and "[". + return is_non_special_pattern_char(token_index, "["); +} + +inline bool constructor_string_parser::is_an_ipv6_close() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index, and "]". + return is_non_special_pattern_char(token_index, "]"); +} + +inline bool constructor_string_parser::is_port_prefix() { + // Return the result of running is a non-special pattern char given parser, + // parser’s token index, and ":". + return is_non_special_pattern_char(token_index, ":"); +} + +inline void Tokenizer::get_next_code_point() { + ada_log("Tokenizer::get_next_code_point called with index=", next_index); + ADA_ASSERT_TRUE(next_index < input.size()); + // this assumes that we have a valid, non-truncated UTF-8 stream. + code_point = 0; + size_t number_bytes = 0; + unsigned char first_byte = input[next_index]; + + if ((first_byte & 0x80) == 0) { + // 1-byte character (ASCII) + next_index++; + code_point = first_byte; + ada_log("Tokenizer::get_next_code_point returning ASCII code point=", + uint32_t(code_point)); + ada_log("Tokenizer::get_next_code_point next_index =", next_index, + " input.size()=", input.size()); + return; + } + ada_log("Tokenizer::get_next_code_point read first byte=", + uint32_t(first_byte)); + if ((first_byte & 0xE0) == 0xC0) { + code_point = first_byte & 0x1F; + number_bytes = 2; + ada_log("Tokenizer::get_next_code_point two bytes"); + } else if ((first_byte & 0xF0) == 0xE0) { + code_point = first_byte & 0x0F; + number_bytes = 3; + ada_log("Tokenizer::get_next_code_point three bytes"); + } else if ((first_byte & 0xF8) == 0xF0) { + code_point = first_byte & 0x07; + number_bytes = 4; + ada_log("Tokenizer::get_next_code_point four bytes"); + } + ADA_ASSERT_TRUE(number_bytes + next_index <= input.size()); + + for (size_t i = 1 + next_index; i < number_bytes + next_index; ++i) { + unsigned char byte = input[i]; + ada_log("Tokenizer::get_next_code_point read byte=", uint32_t(byte)); + code_point = (code_point << 6) | (byte & 0x3F); + } + ada_log("Tokenizer::get_next_code_point returning non-ASCII code point=", + uint32_t(code_point)); + ada_log("Tokenizer::get_next_code_point next_index =", next_index, + " input.size()=", input.size()); + next_index += number_bytes; +} + +inline void Tokenizer::seek_and_get_next_code_point(size_t new_index) { + ada_log("Tokenizer::seek_and_get_next_code_point called with new_index=", + new_index); + // Set tokenizer’s next index to index. + next_index = new_index; + // Run get the next code point given tokenizer. + get_next_code_point(); +} + +inline void Tokenizer::add_token(token_type type, size_t next_position, + size_t value_position, size_t value_length) { + ada_log("Tokenizer::add_token called with type=", to_string(type), + " next_position=", next_position, " value_position=", value_position); + ADA_ASSERT_TRUE(next_position >= value_position); + + // Let token be a new token. + // Set token’s type to type. + // Set token’s index to tokenizer’s index. + // Set token’s value to the code point substring from value position with + // length value length within tokenizer’s input. + // Append token to the back of tokenizer’s token list. + token_list.emplace_back(type, index, + input.substr(value_position, value_length)); + // Set tokenizer’s index to next position. + index = next_position; +} + +inline void Tokenizer::add_token_with_default_length(token_type type, + size_t next_position, + size_t value_position) { + // Let computed length be next position − value position. + auto computed_length = next_position - value_position; + // Run add a token given tokenizer, type, next position, value position, and + // computed length. + add_token(type, next_position, value_position, computed_length); +} + +inline void Tokenizer::add_token_with_defaults(token_type type) { + ada_log("Tokenizer::add_token_with_defaults called with type=", + to_string(type)); + // Run add a token with default length given tokenizer, type, tokenizer’s next + // index, and tokenizer’s index. + add_token_with_default_length(type, next_index, index); +} + +inline ada_warn_unused std::optional +Tokenizer::process_tokenizing_error(size_t next_position, + size_t value_position) { + // If tokenizer’s policy is "strict", then throw a TypeError. + if (policy == token_policy::STRICT) { + ada_log("process_tokenizing_error failed with next_position=", + next_position, " value_position=", value_position); + return errors::type_error; + } + // Assert: tokenizer’s policy is "lenient". + ADA_ASSERT_TRUE(policy == token_policy::LENIENT); + // Run add a token with default length given tokenizer, "invalid-char", next + // position, and value position. + add_token_with_default_length(token_type::INVALID_CHAR, next_position, + value_position); + return std::nullopt; +} + +template +Token* url_pattern_parser::try_consume_modifier_token() { + // Let token be the result of running try to consume a token given parser and + // "other-modifier". + auto token = try_consume_token(token_type::OTHER_MODIFIER); + // If token is not null, then return token. + if (token) return token; + // Set token to the result of running try to consume a token given parser and + // "asterisk". + // Return token. + return try_consume_token(token_type::ASTERISK); +} + +template +Token* url_pattern_parser::try_consume_regexp_or_wildcard_token( + Token* name_token) { + // Let token be the result of running try to consume a token given parser and + // "regexp". + auto token = try_consume_token(token_type::REGEXP); + // If name token is null and token is null, then set token to the result of + // running try to consume a token given parser and "asterisk". + if (!name_token && !token) { + token = try_consume_token(token_type::ASTERISK); + } + // Return token. + return token; +} + +template +Token* url_pattern_parser::try_consume_token(token_type type) { + ada_log("url_pattern_parser::try_consume_token called with type=", + to_string(type)); + // Assert: parser’s index is less than parser’s token list size. + ADA_ASSERT_TRUE(index < tokens.size()); + // Let next token be parser’s token list[parser’s index]. + auto& next_token = tokens[index]; + // If next token’s type is not type return null. + if (next_token.type != type) return nullptr; + // Increase parser’s index by 1. + index++; + // Return next token. + return &next_token; +} + +template +std::string url_pattern_parser::consume_text() { + // Let result be the empty string. + std::string result{}; + // While true: + while (true) { + // Let token be the result of running try to consume a token given parser + // and "char". + auto token = try_consume_token(token_type::CHAR); + // If token is null, then set token to the result of running try to consume + // a token given parser and "escaped-char". + if (!token) token = try_consume_token(token_type::ESCAPED_CHAR); + // If token is null, then break. + if (!token) break; + // Append token’s value to the end of result. + result.append(token->value); + } + // Return result. + return result; +} + +template +bool url_pattern_parser::consume_required_token(token_type type) { + ada_log("url_pattern_parser::consume_required_token called with type=", + to_string(type)); + // Let result be the result of running try to consume a token given parser and + // type. + return try_consume_token(type) != nullptr; +} + +template +std::optional +url_pattern_parser::maybe_add_part_from_the_pending_fixed_value() { + // If parser’s pending fixed value is the empty string, then return. + if (pending_fixed_value.empty()) { + ada_log("pending_fixed_value is empty"); + return std::nullopt; + } + // Let encoded value be the result of running parser’s encoding callback given + // parser’s pending fixed value. + auto encoded_value = encoding_callback(pending_fixed_value); + if (!encoded_value) { + ada_log("failed to encode pending_fixed_value: ", pending_fixed_value); + return encoded_value.error(); + } + // Set parser’s pending fixed value to the empty string. + pending_fixed_value.clear(); + // Let part be a new part whose type is "fixed-text", value is encoded value, + // and modifier is "none". + // Append part to parser’s part list. + parts.emplace_back(url_pattern_part_type::FIXED_TEXT, + std::move(*encoded_value), + url_pattern_part_modifier::NONE); + return std::nullopt; +} + +template +std::optional url_pattern_parser::add_part( + std::string_view prefix, Token* name_token, Token* regexp_or_wildcard_token, + std::string_view suffix, Token* modifier_token) { + // Let modifier be "none". + auto modifier = url_pattern_part_modifier::NONE; + // If modifier token is not null: + if (modifier_token) { + // If modifier token’s value is "?" then set modifier to "optional". + if (modifier_token->value == "?") { + modifier = url_pattern_part_modifier::OPTIONAL; + } else if (modifier_token->value == "*") { + // Otherwise if modifier token’s value is "*" then set modifier to + // "zero-or-more". + modifier = url_pattern_part_modifier::ZERO_OR_MORE; + } else if (modifier_token->value == "+") { + // Otherwise if modifier token’s value is "+" then set modifier to + // "one-or-more". + modifier = url_pattern_part_modifier::ONE_OR_MORE; + } + } + // If name token is null and regexp or wildcard token is null and modifier + // is "none": + if (!name_token && !regexp_or_wildcard_token && + modifier == url_pattern_part_modifier::NONE) { + // Append prefix to the end of parser’s pending fixed value. + pending_fixed_value.append(prefix); + return std::nullopt; + } + // Run maybe add a part from the pending fixed value given parser. + if (auto error = maybe_add_part_from_the_pending_fixed_value()) { + return *error; + } + // If name token is null and regexp or wildcard token is null: + if (!name_token && !regexp_or_wildcard_token) { + // Assert: suffix is the empty string. + ADA_ASSERT_TRUE(suffix.empty()); + // If prefix is the empty string, then return. + if (prefix.empty()) return std::nullopt; + // Let encoded value be the result of running parser’s encoding callback + // given prefix. + auto encoded_value = encoding_callback(prefix); + if (!encoded_value) { + return encoded_value.error(); + } + // Let part be a new part whose type is "fixed-text", value is encoded + // value, and modifier is modifier. + // Append part to parser’s part list. + parts.emplace_back(url_pattern_part_type::FIXED_TEXT, + std::move(*encoded_value), modifier); + return std::nullopt; + } + // Let regexp value be the empty string. + std::string regexp_value{}; + // If regexp or wildcard token is null, then set regexp value to parser’s + // segment wildcard regexp. + if (!regexp_or_wildcard_token) { + regexp_value = segment_wildcard_regexp; + } else if (regexp_or_wildcard_token->type == token_type::ASTERISK) { + // Otherwise if regexp or wildcard token’s type is "asterisk", then set + // regexp value to the full wildcard regexp value. + regexp_value = ".*"; + } else { + // Otherwise set regexp value to regexp or wildcard token’s value. + regexp_value = regexp_or_wildcard_token->value; + } + // Let type be "regexp". + auto type = url_pattern_part_type::REGEXP; + // If regexp value is parser’s segment wildcard regexp: + if (regexp_value == segment_wildcard_regexp) { + // Set type to "segment-wildcard". + type = url_pattern_part_type::SEGMENT_WILDCARD; + // Set regexp value to the empty string. + regexp_value.clear(); + } else if (regexp_value == ".*") { + // Otherwise if regexp value is the full wildcard regexp value: + // Set type to "full-wildcard". + type = url_pattern_part_type::FULL_WILDCARD; + // Set regexp value to the empty string. + regexp_value.clear(); + } + // Let name be the empty string. + std::string name{}; + // If name token is not null, then set name to name token’s value. + if (name_token) { + name = name_token->value; + } else if (regexp_or_wildcard_token) { + // Otherwise if regexp or wildcard token is not null: + // Set name to parser’s next numeric name, serialized. + // TODO: Make sure this is correct. + name = std::to_string(next_numeric_name); + // Increment parser’s next numeric name by 1. + next_numeric_name++; + } + // If the result of running is a duplicate name given parser and name is + // true, then throw a TypeError. + if (std::ranges::any_of( + parts, [&name](const auto& part) { return part.name == name; })) { + return errors::type_error; + } + // Let encoded prefix be the result of running parser’s encoding callback + // given prefix. + auto encoded_prefix = encoding_callback(prefix); + if (!encoded_prefix) return encoded_prefix.error(); + // Let encoded suffix be the result of running parser’s encoding callback + // given suffix. + auto encoded_suffix = encoding_callback(suffix); + if (!encoded_suffix) return encoded_suffix.error(); + // Let part be a new part whose type is type, value is regexp value, + // modifier is modifier, name is name, prefix is encoded prefix, and suffix + // is encoded suffix. + // Append part to parser’s part list. + parts.emplace_back(type, std::move(regexp_value), modifier, std::move(name), + std::move(*encoded_prefix), std::move(*encoded_suffix)); + return std::nullopt; +} + +template +tl::expected, errors> parse_pattern_string( + std::string_view input, url_pattern_compile_component_options& options, + F& encoding_callback) { + ada_log("parse_pattern_string input=", input); + // Let parser be a new pattern parser whose encoding callback is encoding + // callback and segment wildcard regexp is the result of running generate a + // segment wildcard regexp given options. + auto parser = url_pattern_parser( + encoding_callback, generate_segment_wildcard_regexp(options)); + // Set parser’s token list to the result of running tokenize given input and + // "strict". + auto tokenize_result = tokenize(input, token_policy::STRICT); + if (!tokenize_result) { + ada_log("parse_pattern_string tokenize failed"); + return tl::unexpected(tokenize_result.error()); + } + parser.tokens = std::move(*tokenize_result); + + // While parser’s index is less than parser’s token list's size: + while (parser.can_continue()) { + // Let char token be the result of running try to consume a token given + // parser and "char". + auto char_token = parser.try_consume_token(token_type::CHAR); + // Let name token be the result of running try to consume a token given + // parser and "name". + auto name_token = parser.try_consume_token(token_type::NAME); + // Let regexp or wildcard token be the result of running try to consume a + // regexp or wildcard token given parser and name token. + auto regexp_or_wildcard_token = + parser.try_consume_regexp_or_wildcard_token(name_token); + // If name token is not null or regexp or wildcard token is not null: + if (name_token || regexp_or_wildcard_token) { + // Let prefix be the empty string. + std::string prefix{}; + // If char token is not null then set prefix to char token’s value. + if (char_token) prefix = char_token->value; + // If prefix is not the empty string and not options’s prefix code point: + if (!prefix.empty() && prefix != options.get_prefix()) { + // Append prefix to the end of parser’s pending fixed value. + parser.pending_fixed_value.append(prefix); + // Set prefix to the empty string. + prefix.clear(); + } + // Run maybe add a part from the pending fixed value given parser. + if (auto error = parser.maybe_add_part_from_the_pending_fixed_value()) { + ada_log("maybe_add_part_from_the_pending_fixed_value failed"); + return tl::unexpected(*error); + } + // Let modifier token be the result of running try to consume a modifier + // token given parser. + auto modifier_token = parser.try_consume_modifier_token(); + // Run add a part given parser, prefix, name token, regexp or wildcard + // token, the empty string, and modifier token. + if (auto error = + parser.add_part(prefix, name_token, regexp_or_wildcard_token, "", + modifier_token)) { + ada_log("parser.add_part failed"); + return tl::unexpected(*error); + } + // Continue. + continue; + } + + // Let fixed token be char token. + auto fixed_token = char_token; + // If fixed token is null, then set fixed token to the result of running try + // to consume a token given parser and "escaped-char". + if (!fixed_token) + fixed_token = parser.try_consume_token(token_type::ESCAPED_CHAR); + // If fixed token is not null: + if (fixed_token) { + // Append fixed token’s value to parser’s pending fixed value. + parser.pending_fixed_value.append(fixed_token->value); + // Continue. + continue; + } + // Let open token be the result of running try to consume a token given + // parser and "open". + auto open_token = parser.try_consume_token(token_type::OPEN); + // If open token is not null: + if (open_token) { + // Set prefix be the result of running consume text given parser. + auto prefix_ = parser.consume_text(); + // Set name token to the result of running try to consume a token given + // parser and "name". + name_token = parser.try_consume_token(token_type::NAME); + // Set regexp or wildcard token to the result of running try to consume a + // regexp or wildcard token given parser and name token. + regexp_or_wildcard_token = + parser.try_consume_regexp_or_wildcard_token(name_token); + // Let suffix be the result of running consume text given parser. + auto suffix_ = parser.consume_text(); + // Run consume a required token given parser and "close". + if (!parser.consume_required_token(token_type::CLOSE)) { + ada_log("parser.consume_required_token failed"); + return tl::unexpected(errors::type_error); + } + // Set modifier token to the result of running try to consume a modifier + // token given parser. + auto modifier_token = parser.try_consume_modifier_token(); + // Run add a part given parser, prefix, name token, regexp or wildcard + // token, suffix, and modifier token. + if (auto error = + parser.add_part(prefix_, name_token, regexp_or_wildcard_token, + suffix_, modifier_token)) { + return tl::unexpected(*error); + } + // Continue. + continue; + } + // Run maybe add a part from the pending fixed value given parser. + if (auto error = parser.maybe_add_part_from_the_pending_fixed_value()) { + ada_log("maybe_add_part_from_the_pending_fixed_value failed on line 992"); + return tl::unexpected(*error); + } + // Run consume a required token given parser and "end". + if (!parser.consume_required_token(token_type::END)) { + return tl::unexpected(errors::type_error); + } + } + ada_log("parser.parts size is: ", parser.parts.size()); + // Return parser’s part list. + return parser.parts; +} + +} // namespace ada::url_pattern_helpers + +#endif diff --git a/include/ada/url_pattern_helpers.h b/include/ada/url_pattern_helpers.h new file mode 100644 index 000000000..4d9c29f65 --- /dev/null +++ b/include/ada/url_pattern_helpers.h @@ -0,0 +1,336 @@ +/** + * @file url_pattern_helpers.h + * @brief Declaration for the URLPattern helpers. + */ +#ifndef ADA_URL_PATTERN_HELPERS_H +#define ADA_URL_PATTERN_HELPERS_H + +#include "ada/expected.h" +#include "ada/url_pattern.h" + +#include +#include +#include + +namespace ada::url_pattern_helpers { + +// @see https://urlpattern.spec.whatwg.org/#token +enum class token_type : uint8_t { + INVALID_CHAR, // 0 + OPEN, // 1 + CLOSE, // 2 + REGEXP, // 3 + NAME, // 4 + CHAR, // 5 + ESCAPED_CHAR, // 6 + OTHER_MODIFIER, // 7 + ASTERISK, // 8 + END, // 9 +}; + +std::string to_string(token_type type); + +// @see https://urlpattern.spec.whatwg.org/#tokenize-policy +enum class token_policy { + STRICT, + LENIENT, +}; + +// @see https://urlpattern.spec.whatwg.org/#tokens +class Token { + public: + Token(token_type _type, size_t _index, std::string&& _value) + : type(_type), index(_index), value(std::move(_value)) {} + + // A token has an associated type, a string, initially "invalid-char". + token_type type = token_type::INVALID_CHAR; + + // A token has an associated index, a number, initially 0. It is the position + // of the first code point in the pattern string represented by the token. + size_t index = 0; + + // A token has an associated value, a string, initially the empty string. It + // contains the code points from the pattern string represented by the token. + std::string value{}; +}; + +// @see https://urlpattern.spec.whatwg.org/#pattern-parser +template +class url_pattern_parser { + public: + url_pattern_parser(F& encoding_callback_, + std::string_view segment_wildcard_regexp_) + : encoding_callback(encoding_callback_), + segment_wildcard_regexp(segment_wildcard_regexp_) {} + + bool can_continue() const { return index < tokens.size(); } + + // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-token + Token* try_consume_token(token_type type); + // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-modifier-token + Token* try_consume_modifier_token(); + // @see + // https://urlpattern.spec.whatwg.org/#try-to-consume-a-regexp-or-wildcard-token + Token* try_consume_regexp_or_wildcard_token(Token* name_token); + // @see https://urlpattern.spec.whatwg.org/#consume-text + std::string consume_text(); + // @see https://urlpattern.spec.whatwg.org/#consume-a-required-token + bool consume_required_token(token_type type); + // @see + // https://urlpattern.spec.whatwg.org/#maybe-add-a-part-from-the-pending-fixed-value + std::optional maybe_add_part_from_the_pending_fixed_value() + ada_warn_unused; + // @see https://urlpattern.spec.whatwg.org/#add-a-part + std::optional add_part(std::string_view prefix, Token* name_token, + Token* regexp_or_wildcard_token, + std::string_view suyffix, + Token* modifier_token) ada_warn_unused; + + std::vector tokens{}; + F& encoding_callback; + std::string segment_wildcard_regexp; + std::vector parts{}; + std::string pending_fixed_value{}; + size_t index = 0; + size_t next_numeric_name = 0; +}; + +// @see https://urlpattern.spec.whatwg.org/#tokenizer +class Tokenizer { + public: + explicit Tokenizer(std::string_view new_input, token_policy new_policy) + : input(new_input), policy(new_policy) {} + + // @see https://urlpattern.spec.whatwg.org/#get-the-next-code-point + void get_next_code_point(); + + // @see https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point + void seek_and_get_next_code_point(size_t index); + + // @see https://urlpattern.spec.whatwg.org/#add-a-token + + void add_token(token_type type, size_t next_position, size_t value_position, + size_t value_length); + + // @see https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length + void add_token_with_default_length(token_type type, size_t next_position, + size_t value_position); + + // @see + // https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length + void add_token_with_defaults(token_type type); + + // @see https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error + std::optional process_tokenizing_error( + size_t next_position, size_t value_position) ada_warn_unused; + + // has an associated input, a pattern string, initially the empty string. + std::string input; + // has an associated policy, a tokenize policy, initially "strict". + token_policy policy; + // has an associated token list, a token list, initially an empty list. + std::vector token_list{}; + // has an associated index, a number, initially 0. + size_t index = 0; + // has an associated next index, a number, initially 0. + size_t next_index = 0; + // has an associated code point, a Unicode code point, initially null. + char32_t code_point{}; +}; + +// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser +struct constructor_string_parser { + explicit constructor_string_parser(std::string_view new_input, + std::vector&& new_token_list) + : input(new_input), token_list(std::move(new_token_list)) {} + + // @see https://urlpattern.spec.whatwg.org/#rewind + void rewind(); + + // @see https://urlpattern.spec.whatwg.org/#is-a-hash-prefix + bool is_hash_prefix(); + + // @see https://urlpattern.spec.whatwg.org/#is-a-search-prefix + bool is_search_prefix(); + + // @see https://urlpattern.spec.whatwg.org/#parse-a-constructor-string + static tl::expected parse(std::string_view input); + + // @see https://urlpattern.spec.whatwg.org/#constructor-string-parser-state + enum class State { + INIT, + PROTOCOL, + AUTHORITY, + USERNAME, + PASSWORD, + HOSTNAME, + PORT, + PATHNAME, + SEARCH, + HASH, + DONE, + }; + + // @see https://urlpattern.spec.whatwg.org/#change-state + void change_state(State state, size_t skip); + + // @see https://urlpattern.spec.whatwg.org/#is-a-group-open + bool is_group_open() const; + + // @see https://urlpattern.spec.whatwg.org/#is-a-group-close + bool is_group_close() const; + + // @see https://urlpattern.spec.whatwg.org/#is-a-protocol-suffix + bool is_protocol_suffix(); + + // @see + // https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag + std::optional compute_protocol_matches_special_scheme_flag(); + + // @see https://urlpattern.spec.whatwg.org/#next-is-authority-slashes + bool next_is_authority_slashes(); + + // @see https://urlpattern.spec.whatwg.org/#is-an-identity-terminator + bool is_an_identity_terminator(); + + // @see https://urlpattern.spec.whatwg.org/#is-a-pathname-start + bool is_pathname_start(); + + // @see https://urlpattern.spec.whatwg.org/#is-a-password-prefix + bool is_password_prefix(); + + // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-open + bool is_an_ipv6_open(); + + // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-close + bool is_an_ipv6_close(); + + // @see https://urlpattern.spec.whatwg.org/#is-a-port-prefix + bool is_port_prefix(); + + // has an associated input, a string, which must be set upon creation. + std::string input; + // has an associated token list, a token list, which must be set upon + // creation. + std::vector token_list; + // has an associated result, a URLPatternInit, initially set to a new + // URLPatternInit. + url_pattern_init result{}; + // has an associated component start, a number, initially set to 0. + size_t component_start = 0; + // has an associated token index, a number, initially set to 0. + size_t token_index = 0; + // has an associated token increment, a number, initially set to 1. + size_t token_increment = 1; + // has an associated group depth, a number, initially set to 0. + size_t group_depth = 0; + // has an associated hostname IPv6 bracket depth, a number, initially set to + // 0. + size_t hostname_ipv6_bracket_depth = 0; + // has an associated protocol matches a special scheme flag, a boolean, + // initially set to false. + bool protocol_matches_a_special_scheme_flag = false; + // has an associated state, a string, initially set to "init". + State state = State::INIT; + + private: + // @see https://urlpattern.spec.whatwg.org/#is-a-non-special-pattern-char + bool is_non_special_pattern_char(size_t index, std::string_view value); + + // @see https://urlpattern.spec.whatwg.org/#get-a-safe-token + const Token* get_safe_token(size_t index); + + // @see https://urlpattern.spec.whatwg.org/#make-a-component-string + std::string make_component_string(); +}; + +// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol +tl::expected canonicalize_protocol(std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-username +tl::expected canonicalize_username(std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-password +tl::expected canonicalize_password(std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-password +tl::expected canonicalize_hostname(std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-an-ipv6-hostname +tl::expected canonicalize_ipv6_hostname( + std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-port +tl::expected canonicalize_port(std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-port +tl::expected canonicalize_port_with_protocol( + std::string_view input, std::string_view protocol); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-pathname +tl::expected canonicalize_pathname(std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-an-opaque-pathname +tl::expected canonicalize_opaque_pathname( + std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-search +tl::expected canonicalize_search(std::string_view input); + +// @see https://wicg.github.io/urlpattern/#canonicalize-a-hash +tl::expected canonicalize_hash(std::string_view input); + +// @see https://urlpattern.spec.whatwg.org/#tokenize +tl::expected, errors> tokenize(std::string_view input, + token_policy policy); + +// @see https://urlpattern.spec.whatwg.org/#process-a-base-url-string +std::string process_base_url_string(std::string_view input, + std::string_view type); + +// @see https://urlpattern.spec.whatwg.org/#escape-a-pattern-string +std::string escape_pattern_string(std::string_view input); + +// @see https://urlpattern.spec.whatwg.org/#escape-a-regexp-string +std::string escape_regexp_string(std::string_view input); + +// @see https://urlpattern.spec.whatwg.org/#is-an-absolute-pathname +constexpr bool is_absolute_pathname(std::string_view input, + std::string_view type) noexcept; + +// @see https://urlpattern.spec.whatwg.org/#parse-a-pattern-string +template +tl::expected, errors> parse_pattern_string( + std::string_view input, url_pattern_compile_component_options& options, + F& encoding_callback); + +// @see https://urlpattern.spec.whatwg.org/#generate-a-pattern-string +std::string generate_pattern_string( + std::vector& part_list, + url_pattern_compile_component_options& options); + +// @see +// https://urlpattern.spec.whatwg.org/#generate-a-regular-expression-and-name-list +std::tuple> +generate_regular_expression_and_name_list( + std::vector& part_list, + url_pattern_compile_component_options options); + +// @see https://urlpattern.spec.whatwg.org/#hostname-pattern-is-an-ipv6-address +bool is_ipv6_address(std::string_view input) noexcept; + +// @see +// https://urlpattern.spec.whatwg.org/#protocol-component-matches-a-special-scheme +bool protocol_component_matches_special_scheme( + ada::url_pattern_component& input); + +// @see https://urlpattern.spec.whatwg.org/#convert-a-modifier-to-a-string +std::string convert_modifier_to_string(url_pattern_part_modifier modifier); + +// @see https://urlpattern.spec.whatwg.org/#generate-a-segment-wildcard-regexp +std::string generate_segment_wildcard_regexp( + url_pattern_compile_component_options options); + +} // namespace ada::url_pattern_helpers + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5a1a8d383..7c8f36e39 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,10 +41,6 @@ endif() if(ADA_DEVELOPMENT_CHECKS) target_compile_definitions(ada PUBLIC ADA_DEVELOPMENT_CHECKS=1) endif() -if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_BUILD_TYPE STREQUAL "Debug")) - message(STATUS "Enabling _GLIBCXX_DEBUG") - target_compile_definitions(ada PRIVATE _GLIBCXX_DEBUG=1) -endif() if(ADA_SANITIZE) target_compile_options(ada PUBLIC -fsanitize=address -fno-omit-frame-pointer -fno-sanitize-recover=all) @@ -58,4 +54,8 @@ endif() if(ADA_LOGGING) target_compile_definitions(ada PRIVATE ADA_LOGGING=1) -endif() \ No newline at end of file +endif() + +if(ADA_TESTING) + target_compile_definitions(ada PRIVATE ADA_TESTING=1) +endif() diff --git a/src/ada.cpp b/src/ada.cpp index 26090909f..3d35569dd 100644 --- a/src/ada.cpp +++ b/src/ada.cpp @@ -8,4 +8,6 @@ #include "parser.cpp" #include "url_components.cpp" #include "url_aggregator.cpp" +#include "url_pattern.cpp" +#include "url_pattern_helpers.cpp" #include "ada_c.cpp" diff --git a/src/ada_c.cpp b/src/ada_c.cpp index 033af1d7a..ffde38ec6 100644 --- a/src/ada_c.cpp +++ b/src/ada_c.cpp @@ -1,4 +1,5 @@ -#include "ada.h" +#include "ada/url_aggregator-inl.h" +#include "ada/url_search_params-inl.h" ada::result& get_instance(void* result) noexcept { return *(ada::result*)result; diff --git a/src/ada_idna.cpp b/src/ada_idna.cpp index f965844de..c1b8544f2 100644 --- a/src/ada_idna.cpp +++ b/src/ada_idna.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2024-12-05 20:44:13 -0500. Do not edit! */ +/* auto-generated on 2024-12-18 09:44:34 -0500. Do not edit! */ /* begin file src/idna.cpp */ /* begin file src/unicode_transcoding.cpp */ @@ -9630,4 +9630,642 @@ std::string to_unicode(std::string_view input) { } } // namespace ada::idna /* end file src/to_unicode.cpp */ +/* begin file src/identifier.cpp */ + +#include +#include +#include + +/* begin file src/id_tables.cpp */ +// IDNA 15.1.0 + +// clang-format off +#ifndef ADA_IDNA_IDENTIFIER_TABLES_H +#define ADA_IDNA_IDENTIFIER_TABLES_H +#include + +namespace ada::idna { + +const uint32_t id_continue[1344][2] = +{ + {48, 57}, {65, 90}, {95, 95}, {97, 122}, + {170, 170}, {181, 181}, {183, 183}, {186, 186}, + {192, 214}, {216, 246}, {248, 442}, {443, 443}, + {444, 447}, {448, 451}, {452, 659}, {660, 660}, + {661, 687}, {688, 705}, {710, 721}, {736, 740}, + {748, 748}, {750, 750}, {768, 879}, {880, 883}, + {884, 884}, {886, 887}, {890, 890}, {891, 893}, + {895, 895}, {902, 902}, {903, 903}, {904, 906}, + {908, 908}, {910, 929}, {931, 1013}, {1015, 1153}, + {1155, 1159}, {1162, 1327}, {1329, 1366}, {1369, 1369}, + {1376, 1416}, {1425, 1469}, {1471, 1471}, {1473, 1474}, + {1476, 1477}, {1479, 1479}, {1488, 1514}, {1519, 1522}, + {1552, 1562}, {1568, 1599}, {1600, 1600}, {1601, 1610}, + {1611, 1631}, {1632, 1641}, {1646, 1647}, {1648, 1648}, + {1649, 1747}, {1749, 1749}, {1750, 1756}, {1759, 1764}, + {1765, 1766}, {1767, 1768}, {1770, 1773}, {1774, 1775}, + {1776, 1785}, {1786, 1788}, {1791, 1791}, {1808, 1808}, + {1809, 1809}, {1810, 1839}, {1840, 1866}, {1869, 1957}, + {1958, 1968}, {1969, 1969}, {1984, 1993}, {1994, 2026}, + {2027, 2035}, {2036, 2037}, {2042, 2042}, {2045, 2045}, + {2048, 2069}, {2070, 2073}, {2074, 2074}, {2075, 2083}, + {2084, 2084}, {2085, 2087}, {2088, 2088}, {2089, 2093}, + {2112, 2136}, {2137, 2139}, {2144, 2154}, {2160, 2183}, + {2185, 2190}, {2200, 2207}, {2208, 2248}, {2249, 2249}, + {2250, 2273}, {2275, 2306}, {2307, 2307}, {2308, 2361}, + {2362, 2362}, {2363, 2363}, {2364, 2364}, {2365, 2365}, + {2366, 2368}, {2369, 2376}, {2377, 2380}, {2381, 2381}, + {2382, 2383}, {2384, 2384}, {2385, 2391}, {2392, 2401}, + {2402, 2403}, {2406, 2415}, {2417, 2417}, {2418, 2432}, + {2433, 2433}, {2434, 2435}, {2437, 2444}, {2447, 2448}, + {2451, 2472}, {2474, 2480}, {2482, 2482}, {2486, 2489}, + {2492, 2492}, {2493, 2493}, {2494, 2496}, {2497, 2500}, + {2503, 2504}, {2507, 2508}, {2509, 2509}, {2510, 2510}, + {2519, 2519}, {2524, 2525}, {2527, 2529}, {2530, 2531}, + {2534, 2543}, {2544, 2545}, {2556, 2556}, {2558, 2558}, + {2561, 2562}, {2563, 2563}, {2565, 2570}, {2575, 2576}, + {2579, 2600}, {2602, 2608}, {2610, 2611}, {2613, 2614}, + {2616, 2617}, {2620, 2620}, {2622, 2624}, {2625, 2626}, + {2631, 2632}, {2635, 2637}, {2641, 2641}, {2649, 2652}, + {2654, 2654}, {2662, 2671}, {2672, 2673}, {2674, 2676}, + {2677, 2677}, {2689, 2690}, {2691, 2691}, {2693, 2701}, + {2703, 2705}, {2707, 2728}, {2730, 2736}, {2738, 2739}, + {2741, 2745}, {2748, 2748}, {2749, 2749}, {2750, 2752}, + {2753, 2757}, {2759, 2760}, {2761, 2761}, {2763, 2764}, + {2765, 2765}, {2768, 2768}, {2784, 2785}, {2786, 2787}, + {2790, 2799}, {2809, 2809}, {2810, 2815}, {2817, 2817}, + {2818, 2819}, {2821, 2828}, {2831, 2832}, {2835, 2856}, + {2858, 2864}, {2866, 2867}, {2869, 2873}, {2876, 2876}, + {2877, 2877}, {2878, 2878}, {2879, 2879}, {2880, 2880}, + {2881, 2884}, {2887, 2888}, {2891, 2892}, {2893, 2893}, + {2901, 2902}, {2903, 2903}, {2908, 2909}, {2911, 2913}, + {2914, 2915}, {2918, 2927}, {2929, 2929}, {2946, 2946}, + {2947, 2947}, {2949, 2954}, {2958, 2960}, {2962, 2965}, + {2969, 2970}, {2972, 2972}, {2974, 2975}, {2979, 2980}, + {2984, 2986}, {2990, 3001}, {3006, 3007}, {3008, 3008}, + {3009, 3010}, {3014, 3016}, {3018, 3020}, {3021, 3021}, + {3024, 3024}, {3031, 3031}, {3046, 3055}, {3072, 3072}, + {3073, 3075}, {3076, 3076}, {3077, 3084}, {3086, 3088}, + {3090, 3112}, {3114, 3129}, {3132, 3132}, {3133, 3133}, + {3134, 3136}, {3137, 3140}, {3142, 3144}, {3146, 3149}, + {3157, 3158}, {3160, 3162}, {3165, 3165}, {3168, 3169}, + {3170, 3171}, {3174, 3183}, {3200, 3200}, {3201, 3201}, + {3202, 3203}, {3205, 3212}, {3214, 3216}, {3218, 3240}, + {3242, 3251}, {3253, 3257}, {3260, 3260}, {3261, 3261}, + {3262, 3262}, {3263, 3263}, {3264, 3268}, {3270, 3270}, + {3271, 3272}, {3274, 3275}, {3276, 3277}, {3285, 3286}, + {3293, 3294}, {3296, 3297}, {3298, 3299}, {3302, 3311}, + {3313, 3314}, {3315, 3315}, {3328, 3329}, {3330, 3331}, + {3332, 3340}, {3342, 3344}, {3346, 3386}, {3387, 3388}, + {3389, 3389}, {3390, 3392}, {3393, 3396}, {3398, 3400}, + {3402, 3404}, {3405, 3405}, {3406, 3406}, {3412, 3414}, + {3415, 3415}, {3423, 3425}, {3426, 3427}, {3430, 3439}, + {3450, 3455}, {3457, 3457}, {3458, 3459}, {3461, 3478}, + {3482, 3505}, {3507, 3515}, {3517, 3517}, {3520, 3526}, + {3530, 3530}, {3535, 3537}, {3538, 3540}, {3542, 3542}, + {3544, 3551}, {3558, 3567}, {3570, 3571}, {3585, 3632}, + {3633, 3633}, {3634, 3635}, {3636, 3642}, {3648, 3653}, + {3654, 3654}, {3655, 3662}, {3664, 3673}, {3713, 3714}, + {3716, 3716}, {3718, 3722}, {3724, 3747}, {3749, 3749}, + {3751, 3760}, {3761, 3761}, {3762, 3763}, {3764, 3772}, + {3773, 3773}, {3776, 3780}, {3782, 3782}, {3784, 3790}, + {3792, 3801}, {3804, 3807}, {3840, 3840}, {3864, 3865}, + {3872, 3881}, {3893, 3893}, {3895, 3895}, {3897, 3897}, + {3902, 3903}, {3904, 3911}, {3913, 3948}, {3953, 3966}, + {3967, 3967}, {3968, 3972}, {3974, 3975}, {3976, 3980}, + {3981, 3991}, {3993, 4028}, {4038, 4038}, {4096, 4138}, + {4139, 4140}, {4141, 4144}, {4145, 4145}, {4146, 4151}, + {4152, 4152}, {4153, 4154}, {4155, 4156}, {4157, 4158}, + {4159, 4159}, {4160, 4169}, {4176, 4181}, {4182, 4183}, + {4184, 4185}, {4186, 4189}, {4190, 4192}, {4193, 4193}, + {4194, 4196}, {4197, 4198}, {4199, 4205}, {4206, 4208}, + {4209, 4212}, {4213, 4225}, {4226, 4226}, {4227, 4228}, + {4229, 4230}, {4231, 4236}, {4237, 4237}, {4238, 4238}, + {4239, 4239}, {4240, 4249}, {4250, 4252}, {4253, 4253}, + {4256, 4293}, {4295, 4295}, {4301, 4301}, {4304, 4346}, + {4348, 4348}, {4349, 4351}, {4352, 4680}, {4682, 4685}, + {4688, 4694}, {4696, 4696}, {4698, 4701}, {4704, 4744}, + {4746, 4749}, {4752, 4784}, {4786, 4789}, {4792, 4798}, + {4800, 4800}, {4802, 4805}, {4808, 4822}, {4824, 4880}, + {4882, 4885}, {4888, 4954}, {4957, 4959}, {4969, 4977}, + {4992, 5007}, {5024, 5109}, {5112, 5117}, {5121, 5740}, + {5743, 5759}, {5761, 5786}, {5792, 5866}, {5870, 5872}, + {5873, 5880}, {5888, 5905}, {5906, 5908}, {5909, 5909}, + {5919, 5937}, {5938, 5939}, {5940, 5940}, {5952, 5969}, + {5970, 5971}, {5984, 5996}, {5998, 6000}, {6002, 6003}, + {6016, 6067}, {6068, 6069}, {6070, 6070}, {6071, 6077}, + {6078, 6085}, {6086, 6086}, {6087, 6088}, {6089, 6099}, + {6103, 6103}, {6108, 6108}, {6109, 6109}, {6112, 6121}, + {6155, 6157}, {6159, 6159}, {6160, 6169}, {6176, 6210}, + {6211, 6211}, {6212, 6264}, {6272, 6276}, {6277, 6278}, + {6279, 6312}, {6313, 6313}, {6314, 6314}, {6320, 6389}, + {6400, 6430}, {6432, 6434}, {6435, 6438}, {6439, 6440}, + {6441, 6443}, {6448, 6449}, {6450, 6450}, {6451, 6456}, + {6457, 6459}, {6470, 6479}, {6480, 6509}, {6512, 6516}, + {6528, 6571}, {6576, 6601}, {6608, 6617}, {6618, 6618}, + {6656, 6678}, {6679, 6680}, {6681, 6682}, {6683, 6683}, + {6688, 6740}, {6741, 6741}, {6742, 6742}, {6743, 6743}, + {6744, 6750}, {6752, 6752}, {6753, 6753}, {6754, 6754}, + {6755, 6756}, {6757, 6764}, {6765, 6770}, {6771, 6780}, + {6783, 6783}, {6784, 6793}, {6800, 6809}, {6823, 6823}, + {6832, 6845}, {6847, 6862}, {6912, 6915}, {6916, 6916}, + {6917, 6963}, {6964, 6964}, {6965, 6965}, {6966, 6970}, + {6971, 6971}, {6972, 6972}, {6973, 6977}, {6978, 6978}, + {6979, 6980}, {6981, 6988}, {6992, 7001}, {7019, 7027}, + {7040, 7041}, {7042, 7042}, {7043, 7072}, {7073, 7073}, + {7074, 7077}, {7078, 7079}, {7080, 7081}, {7082, 7082}, + {7083, 7085}, {7086, 7087}, {7088, 7097}, {7098, 7141}, + {7142, 7142}, {7143, 7143}, {7144, 7145}, {7146, 7148}, + {7149, 7149}, {7150, 7150}, {7151, 7153}, {7154, 7155}, + {7168, 7203}, {7204, 7211}, {7212, 7219}, {7220, 7221}, + {7222, 7223}, {7232, 7241}, {7245, 7247}, {7248, 7257}, + {7258, 7287}, {7288, 7293}, {7296, 7304}, {7312, 7354}, + {7357, 7359}, {7376, 7378}, {7380, 7392}, {7393, 7393}, + {7394, 7400}, {7401, 7404}, {7405, 7405}, {7406, 7411}, + {7412, 7412}, {7413, 7414}, {7415, 7415}, {7416, 7417}, + {7418, 7418}, {7424, 7467}, {7468, 7530}, {7531, 7543}, + {7544, 7544}, {7545, 7578}, {7579, 7615}, {7616, 7679}, + {7680, 7957}, {7960, 7965}, {7968, 8005}, {8008, 8013}, + {8016, 8023}, {8025, 8025}, {8027, 8027}, {8029, 8029}, + {8031, 8061}, {8064, 8116}, {8118, 8124}, {8126, 8126}, + {8130, 8132}, {8134, 8140}, {8144, 8147}, {8150, 8155}, + {8160, 8172}, {8178, 8180}, {8182, 8188}, {8204, 8205}, + {8255, 8256}, {8276, 8276}, {8305, 8305}, {8319, 8319}, + {8336, 8348}, {8400, 8412}, {8417, 8417}, {8421, 8432}, + {8450, 8450}, {8455, 8455}, {8458, 8467}, {8469, 8469}, + {8472, 8472}, {8473, 8477}, {8484, 8484}, {8486, 8486}, + {8488, 8488}, {8490, 8493}, {8494, 8494}, {8495, 8500}, + {8501, 8504}, {8505, 8505}, {8508, 8511}, {8517, 8521}, + {8526, 8526}, {8544, 8578}, {8579, 8580}, {8581, 8584}, + {11264, 11387}, {11388, 11389}, {11390, 11492}, {11499, 11502}, + {11503, 11505}, {11506, 11507}, {11520, 11557}, {11559, 11559}, + {11565, 11565}, {11568, 11623}, {11631, 11631}, {11647, 11647}, + {11648, 11670}, {11680, 11686}, {11688, 11694}, {11696, 11702}, + {11704, 11710}, {11712, 11718}, {11720, 11726}, {11728, 11734}, + {11736, 11742}, {11744, 11775}, {12293, 12293}, {12294, 12294}, + {12295, 12295}, {12321, 12329}, {12330, 12333}, {12334, 12335}, + {12337, 12341}, {12344, 12346}, {12347, 12347}, {12348, 12348}, + {12353, 12438}, {12441, 12442}, {12443, 12444}, {12445, 12446}, + {12447, 12447}, {12449, 12538}, {12539, 12539}, {12540, 12542}, + {12543, 12543}, {12549, 12591}, {12593, 12686}, {12704, 12735}, + {12784, 12799}, {13312, 19903}, {19968, 40980}, {40981, 40981}, + {40982, 42124}, {42192, 42231}, {42232, 42237}, {42240, 42507}, + {42508, 42508}, {42512, 42527}, {42528, 42537}, {42538, 42539}, + {42560, 42605}, {42606, 42606}, {42607, 42607}, {42612, 42621}, + {42623, 42623}, {42624, 42651}, {42652, 42653}, {42654, 42655}, + {42656, 42725}, {42726, 42735}, {42736, 42737}, {42775, 42783}, + {42786, 42863}, {42864, 42864}, {42865, 42887}, {42888, 42888}, + {42891, 42894}, {42895, 42895}, {42896, 42954}, {42960, 42961}, + {42963, 42963}, {42965, 42969}, {42994, 42996}, {42997, 42998}, + {42999, 42999}, {43000, 43001}, {43002, 43002}, {43003, 43009}, + {43010, 43010}, {43011, 43013}, {43014, 43014}, {43015, 43018}, + {43019, 43019}, {43020, 43042}, {43043, 43044}, {43045, 43046}, + {43047, 43047}, {43052, 43052}, {43072, 43123}, {43136, 43137}, + {43138, 43187}, {43188, 43203}, {43204, 43205}, {43216, 43225}, + {43232, 43249}, {43250, 43255}, {43259, 43259}, {43261, 43262}, + {43263, 43263}, {43264, 43273}, {43274, 43301}, {43302, 43309}, + {43312, 43334}, {43335, 43345}, {43346, 43347}, {43360, 43388}, + {43392, 43394}, {43395, 43395}, {43396, 43442}, {43443, 43443}, + {43444, 43445}, {43446, 43449}, {43450, 43451}, {43452, 43453}, + {43454, 43456}, {43471, 43471}, {43472, 43481}, {43488, 43492}, + {43493, 43493}, {43494, 43494}, {43495, 43503}, {43504, 43513}, + {43514, 43518}, {43520, 43560}, {43561, 43566}, {43567, 43568}, + {43569, 43570}, {43571, 43572}, {43573, 43574}, {43584, 43586}, + {43587, 43587}, {43588, 43595}, {43596, 43596}, {43597, 43597}, + {43600, 43609}, {43616, 43631}, {43632, 43632}, {43633, 43638}, + {43642, 43642}, {43643, 43643}, {43644, 43644}, {43645, 43645}, + {43646, 43695}, {43696, 43696}, {43697, 43697}, {43698, 43700}, + {43701, 43702}, {43703, 43704}, {43705, 43709}, {43710, 43711}, + {43712, 43712}, {43713, 43713}, {43714, 43714}, {43739, 43740}, + {43741, 43741}, {43744, 43754}, {43755, 43755}, {43756, 43757}, + {43758, 43759}, {43762, 43762}, {43763, 43764}, {43765, 43765}, + {43766, 43766}, {43777, 43782}, {43785, 43790}, {43793, 43798}, + {43808, 43814}, {43816, 43822}, {43824, 43866}, {43868, 43871}, + {43872, 43880}, {43881, 43881}, {43888, 43967}, {43968, 44002}, + {44003, 44004}, {44005, 44005}, {44006, 44007}, {44008, 44008}, + {44009, 44010}, {44012, 44012}, {44013, 44013}, {44016, 44025}, + {44032, 55203}, {55216, 55238}, {55243, 55291}, {63744, 64109}, + {64112, 64217}, {64256, 64262}, {64275, 64279}, {64285, 64285}, + {64286, 64286}, {64287, 64296}, {64298, 64310}, {64312, 64316}, + {64318, 64318}, {64320, 64321}, {64323, 64324}, {64326, 64433}, + {64467, 64829}, {64848, 64911}, {64914, 64967}, {65008, 65019}, + {65024, 65039}, {65056, 65071}, {65075, 65076}, {65101, 65103}, + {65136, 65140}, {65142, 65276}, {65296, 65305}, {65313, 65338}, + {65343, 65343}, {65345, 65370}, {65381, 65381}, {65382, 65391}, + {65392, 65392}, {65393, 65437}, {65438, 65439}, {65440, 65470}, + {65474, 65479}, {65482, 65487}, {65490, 65495}, {65498, 65500}, + {65536, 65547}, {65549, 65574}, {65576, 65594}, {65596, 65597}, + {65599, 65613}, {65616, 65629}, {65664, 65786}, {65856, 65908}, + {66045, 66045}, {66176, 66204}, {66208, 66256}, {66272, 66272}, + {66304, 66335}, {66349, 66368}, {66369, 66369}, {66370, 66377}, + {66378, 66378}, {66384, 66421}, {66422, 66426}, {66432, 66461}, + {66464, 66499}, {66504, 66511}, {66513, 66517}, {66560, 66639}, + {66640, 66717}, {66720, 66729}, {66736, 66771}, {66776, 66811}, + {66816, 66855}, {66864, 66915}, {66928, 66938}, {66940, 66954}, + {66956, 66962}, {66964, 66965}, {66967, 66977}, {66979, 66993}, + {66995, 67001}, {67003, 67004}, {67072, 67382}, {67392, 67413}, + {67424, 67431}, {67456, 67461}, {67463, 67504}, {67506, 67514}, + {67584, 67589}, {67592, 67592}, {67594, 67637}, {67639, 67640}, + {67644, 67644}, {67647, 67669}, {67680, 67702}, {67712, 67742}, + {67808, 67826}, {67828, 67829}, {67840, 67861}, {67872, 67897}, + {67968, 68023}, {68030, 68031}, {68096, 68096}, {68097, 68099}, + {68101, 68102}, {68108, 68111}, {68112, 68115}, {68117, 68119}, + {68121, 68149}, {68152, 68154}, {68159, 68159}, {68192, 68220}, + {68224, 68252}, {68288, 68295}, {68297, 68324}, {68325, 68326}, + {68352, 68405}, {68416, 68437}, {68448, 68466}, {68480, 68497}, + {68608, 68680}, {68736, 68786}, {68800, 68850}, {68864, 68899}, + {68900, 68903}, {68912, 68921}, {69248, 69289}, {69291, 69292}, + {69296, 69297}, {69373, 69375}, {69376, 69404}, {69415, 69415}, + {69424, 69445}, {69446, 69456}, {69488, 69505}, {69506, 69509}, + {69552, 69572}, {69600, 69622}, {69632, 69632}, {69633, 69633}, + {69634, 69634}, {69635, 69687}, {69688, 69702}, {69734, 69743}, + {69744, 69744}, {69745, 69746}, {69747, 69748}, {69749, 69749}, + {69759, 69761}, {69762, 69762}, {69763, 69807}, {69808, 69810}, + {69811, 69814}, {69815, 69816}, {69817, 69818}, {69826, 69826}, + {69840, 69864}, {69872, 69881}, {69888, 69890}, {69891, 69926}, + {69927, 69931}, {69932, 69932}, {69933, 69940}, {69942, 69951}, + {69956, 69956}, {69957, 69958}, {69959, 69959}, {69968, 70002}, + {70003, 70003}, {70006, 70006}, {70016, 70017}, {70018, 70018}, + {70019, 70066}, {70067, 70069}, {70070, 70078}, {70079, 70080}, + {70081, 70084}, {70089, 70092}, {70094, 70094}, {70095, 70095}, + {70096, 70105}, {70106, 70106}, {70108, 70108}, {70144, 70161}, + {70163, 70187}, {70188, 70190}, {70191, 70193}, {70194, 70195}, + {70196, 70196}, {70197, 70197}, {70198, 70199}, {70206, 70206}, + {70207, 70208}, {70209, 70209}, {70272, 70278}, {70280, 70280}, + {70282, 70285}, {70287, 70301}, {70303, 70312}, {70320, 70366}, + {70367, 70367}, {70368, 70370}, {70371, 70378}, {70384, 70393}, + {70400, 70401}, {70402, 70403}, {70405, 70412}, {70415, 70416}, + {70419, 70440}, {70442, 70448}, {70450, 70451}, {70453, 70457}, + {70459, 70460}, {70461, 70461}, {70462, 70463}, {70464, 70464}, + {70465, 70468}, {70471, 70472}, {70475, 70477}, {70480, 70480}, + {70487, 70487}, {70493, 70497}, {70498, 70499}, {70502, 70508}, + {70512, 70516}, {70656, 70708}, {70709, 70711}, {70712, 70719}, + {70720, 70721}, {70722, 70724}, {70725, 70725}, {70726, 70726}, + {70727, 70730}, {70736, 70745}, {70750, 70750}, {70751, 70753}, + {70784, 70831}, {70832, 70834}, {70835, 70840}, {70841, 70841}, + {70842, 70842}, {70843, 70846}, {70847, 70848}, {70849, 70849}, + {70850, 70851}, {70852, 70853}, {70855, 70855}, {70864, 70873}, + {71040, 71086}, {71087, 71089}, {71090, 71093}, {71096, 71099}, + {71100, 71101}, {71102, 71102}, {71103, 71104}, {71128, 71131}, + {71132, 71133}, {71168, 71215}, {71216, 71218}, {71219, 71226}, + {71227, 71228}, {71229, 71229}, {71230, 71230}, {71231, 71232}, + {71236, 71236}, {71248, 71257}, {71296, 71338}, {71339, 71339}, + {71340, 71340}, {71341, 71341}, {71342, 71343}, {71344, 71349}, + {71350, 71350}, {71351, 71351}, {71352, 71352}, {71360, 71369}, + {71424, 71450}, {71453, 71455}, {71456, 71457}, {71458, 71461}, + {71462, 71462}, {71463, 71467}, {71472, 71481}, {71488, 71494}, + {71680, 71723}, {71724, 71726}, {71727, 71735}, {71736, 71736}, + {71737, 71738}, {71840, 71903}, {71904, 71913}, {71935, 71942}, + {71945, 71945}, {71948, 71955}, {71957, 71958}, {71960, 71983}, + {71984, 71989}, {71991, 71992}, {71995, 71996}, {71997, 71997}, + {71998, 71998}, {71999, 71999}, {72000, 72000}, {72001, 72001}, + {72002, 72002}, {72003, 72003}, {72016, 72025}, {72096, 72103}, + {72106, 72144}, {72145, 72147}, {72148, 72151}, {72154, 72155}, + {72156, 72159}, {72160, 72160}, {72161, 72161}, {72163, 72163}, + {72164, 72164}, {72192, 72192}, {72193, 72202}, {72203, 72242}, + {72243, 72248}, {72249, 72249}, {72250, 72250}, {72251, 72254}, + {72263, 72263}, {72272, 72272}, {72273, 72278}, {72279, 72280}, + {72281, 72283}, {72284, 72329}, {72330, 72342}, {72343, 72343}, + {72344, 72345}, {72349, 72349}, {72368, 72440}, {72704, 72712}, + {72714, 72750}, {72751, 72751}, {72752, 72758}, {72760, 72765}, + {72766, 72766}, {72767, 72767}, {72768, 72768}, {72784, 72793}, + {72818, 72847}, {72850, 72871}, {72873, 72873}, {72874, 72880}, + {72881, 72881}, {72882, 72883}, {72884, 72884}, {72885, 72886}, + {72960, 72966}, {72968, 72969}, {72971, 73008}, {73009, 73014}, + {73018, 73018}, {73020, 73021}, {73023, 73029}, {73030, 73030}, + {73031, 73031}, {73040, 73049}, {73056, 73061}, {73063, 73064}, + {73066, 73097}, {73098, 73102}, {73104, 73105}, {73107, 73108}, + {73109, 73109}, {73110, 73110}, {73111, 73111}, {73112, 73112}, + {73120, 73129}, {73440, 73458}, {73459, 73460}, {73461, 73462}, + {73472, 73473}, {73474, 73474}, {73475, 73475}, {73476, 73488}, + {73490, 73523}, {73524, 73525}, {73526, 73530}, {73534, 73535}, + {73536, 73536}, {73537, 73537}, {73538, 73538}, {73552, 73561}, + {73648, 73648}, {73728, 74649}, {74752, 74862}, {74880, 75075}, + {77712, 77808}, {77824, 78895}, {78912, 78912}, {78913, 78918}, + {78919, 78933}, {82944, 83526}, {92160, 92728}, {92736, 92766}, + {92768, 92777}, {92784, 92862}, {92864, 92873}, {92880, 92909}, + {92912, 92916}, {92928, 92975}, {92976, 92982}, {92992, 92995}, + {93008, 93017}, {93027, 93047}, {93053, 93071}, {93760, 93823}, + {93952, 94026}, {94031, 94031}, {94032, 94032}, {94033, 94087}, + {94095, 94098}, {94099, 94111}, {94176, 94177}, {94179, 94179}, + {94180, 94180}, {94192, 94193}, {94208, 100343}, {100352, 101589}, + {101632, 101640}, {110576, 110579}, {110581, 110587}, {110589, 110590}, + {110592, 110882}, {110898, 110898}, {110928, 110930}, {110933, 110933}, + {110948, 110951}, {110960, 111355}, {113664, 113770}, {113776, 113788}, + {113792, 113800}, {113808, 113817}, {113821, 113822}, {118528, 118573}, + {118576, 118598}, {119141, 119142}, {119143, 119145}, {119149, 119154}, + {119163, 119170}, {119173, 119179}, {119210, 119213}, {119362, 119364}, + {119808, 119892}, {119894, 119964}, {119966, 119967}, {119970, 119970}, + {119973, 119974}, {119977, 119980}, {119982, 119993}, {119995, 119995}, + {119997, 120003}, {120005, 120069}, {120071, 120074}, {120077, 120084}, + {120086, 120092}, {120094, 120121}, {120123, 120126}, {120128, 120132}, + {120134, 120134}, {120138, 120144}, {120146, 120485}, {120488, 120512}, + {120514, 120538}, {120540, 120570}, {120572, 120596}, {120598, 120628}, + {120630, 120654}, {120656, 120686}, {120688, 120712}, {120714, 120744}, + {120746, 120770}, {120772, 120779}, {120782, 120831}, {121344, 121398}, + {121403, 121452}, {121461, 121461}, {121476, 121476}, {121499, 121503}, + {121505, 121519}, {122624, 122633}, {122634, 122634}, {122635, 122654}, + {122661, 122666}, {122880, 122886}, {122888, 122904}, {122907, 122913}, + {122915, 122916}, {122918, 122922}, {122928, 122989}, {123023, 123023}, + {123136, 123180}, {123184, 123190}, {123191, 123197}, {123200, 123209}, + {123214, 123214}, {123536, 123565}, {123566, 123566}, {123584, 123627}, + {123628, 123631}, {123632, 123641}, {124112, 124138}, {124139, 124139}, + {124140, 124143}, {124144, 124153}, {124896, 124902}, {124904, 124907}, + {124909, 124910}, {124912, 124926}, {124928, 125124}, {125136, 125142}, + {125184, 125251}, {125252, 125258}, {125259, 125259}, {125264, 125273}, + {126464, 126467}, {126469, 126495}, {126497, 126498}, {126500, 126500}, + {126503, 126503}, {126505, 126514}, {126516, 126519}, {126521, 126521}, + {126523, 126523}, {126530, 126530}, {126535, 126535}, {126537, 126537}, + {126539, 126539}, {126541, 126543}, {126545, 126546}, {126548, 126548}, + {126551, 126551}, {126553, 126553}, {126555, 126555}, {126557, 126557}, + {126559, 126559}, {126561, 126562}, {126564, 126564}, {126567, 126570}, + {126572, 126578}, {126580, 126583}, {126585, 126588}, {126590, 126590}, + {126592, 126601}, {126603, 126619}, {126625, 126627}, {126629, 126633}, + {126635, 126651}, {130032, 130041}, {131072, 173791}, {173824, 177977}, + {177984, 178205}, {178208, 183969}, {183984, 191456}, {191472, 192093}, + {194560, 195101}, {196608, 201546}, {201552, 205743}, {917760, 917999} +}; +const uint32_t id_start[740][2] = +{ + {65, 90}, {97, 122}, {170, 170}, {181, 181}, + {186, 186}, {192, 214}, {216, 246}, {248, 442}, + {443, 443}, {444, 447}, {448, 451}, {452, 659}, + {660, 660}, {661, 687}, {688, 705}, {710, 721}, + {736, 740}, {748, 748}, {750, 750}, {880, 883}, + {884, 884}, {886, 887}, {890, 890}, {891, 893}, + {895, 895}, {902, 902}, {904, 906}, {908, 908}, + {910, 929}, {931, 1013}, {1015, 1153}, {1162, 1327}, + {1329, 1366}, {1369, 1369}, {1376, 1416}, {1488, 1514}, + {1519, 1522}, {1568, 1599}, {1600, 1600}, {1601, 1610}, + {1646, 1647}, {1649, 1747}, {1749, 1749}, {1765, 1766}, + {1774, 1775}, {1786, 1788}, {1791, 1791}, {1808, 1808}, + {1810, 1839}, {1869, 1957}, {1969, 1969}, {1994, 2026}, + {2036, 2037}, {2042, 2042}, {2048, 2069}, {2074, 2074}, + {2084, 2084}, {2088, 2088}, {2112, 2136}, {2144, 2154}, + {2160, 2183}, {2185, 2190}, {2208, 2248}, {2249, 2249}, + {2308, 2361}, {2365, 2365}, {2384, 2384}, {2392, 2401}, + {2417, 2417}, {2418, 2432}, {2437, 2444}, {2447, 2448}, + {2451, 2472}, {2474, 2480}, {2482, 2482}, {2486, 2489}, + {2493, 2493}, {2510, 2510}, {2524, 2525}, {2527, 2529}, + {2544, 2545}, {2556, 2556}, {2565, 2570}, {2575, 2576}, + {2579, 2600}, {2602, 2608}, {2610, 2611}, {2613, 2614}, + {2616, 2617}, {2649, 2652}, {2654, 2654}, {2674, 2676}, + {2693, 2701}, {2703, 2705}, {2707, 2728}, {2730, 2736}, + {2738, 2739}, {2741, 2745}, {2749, 2749}, {2768, 2768}, + {2784, 2785}, {2809, 2809}, {2821, 2828}, {2831, 2832}, + {2835, 2856}, {2858, 2864}, {2866, 2867}, {2869, 2873}, + {2877, 2877}, {2908, 2909}, {2911, 2913}, {2929, 2929}, + {2947, 2947}, {2949, 2954}, {2958, 2960}, {2962, 2965}, + {2969, 2970}, {2972, 2972}, {2974, 2975}, {2979, 2980}, + {2984, 2986}, {2990, 3001}, {3024, 3024}, {3077, 3084}, + {3086, 3088}, {3090, 3112}, {3114, 3129}, {3133, 3133}, + {3160, 3162}, {3165, 3165}, {3168, 3169}, {3200, 3200}, + {3205, 3212}, {3214, 3216}, {3218, 3240}, {3242, 3251}, + {3253, 3257}, {3261, 3261}, {3293, 3294}, {3296, 3297}, + {3313, 3314}, {3332, 3340}, {3342, 3344}, {3346, 3386}, + {3389, 3389}, {3406, 3406}, {3412, 3414}, {3423, 3425}, + {3450, 3455}, {3461, 3478}, {3482, 3505}, {3507, 3515}, + {3517, 3517}, {3520, 3526}, {3585, 3632}, {3634, 3635}, + {3648, 3653}, {3654, 3654}, {3713, 3714}, {3716, 3716}, + {3718, 3722}, {3724, 3747}, {3749, 3749}, {3751, 3760}, + {3762, 3763}, {3773, 3773}, {3776, 3780}, {3782, 3782}, + {3804, 3807}, {3840, 3840}, {3904, 3911}, {3913, 3948}, + {3976, 3980}, {4096, 4138}, {4159, 4159}, {4176, 4181}, + {4186, 4189}, {4193, 4193}, {4197, 4198}, {4206, 4208}, + {4213, 4225}, {4238, 4238}, {4256, 4293}, {4295, 4295}, + {4301, 4301}, {4304, 4346}, {4348, 4348}, {4349, 4351}, + {4352, 4680}, {4682, 4685}, {4688, 4694}, {4696, 4696}, + {4698, 4701}, {4704, 4744}, {4746, 4749}, {4752, 4784}, + {4786, 4789}, {4792, 4798}, {4800, 4800}, {4802, 4805}, + {4808, 4822}, {4824, 4880}, {4882, 4885}, {4888, 4954}, + {4992, 5007}, {5024, 5109}, {5112, 5117}, {5121, 5740}, + {5743, 5759}, {5761, 5786}, {5792, 5866}, {5870, 5872}, + {5873, 5880}, {5888, 5905}, {5919, 5937}, {5952, 5969}, + {5984, 5996}, {5998, 6000}, {6016, 6067}, {6103, 6103}, + {6108, 6108}, {6176, 6210}, {6211, 6211}, {6212, 6264}, + {6272, 6276}, {6277, 6278}, {6279, 6312}, {6314, 6314}, + {6320, 6389}, {6400, 6430}, {6480, 6509}, {6512, 6516}, + {6528, 6571}, {6576, 6601}, {6656, 6678}, {6688, 6740}, + {6823, 6823}, {6917, 6963}, {6981, 6988}, {7043, 7072}, + {7086, 7087}, {7098, 7141}, {7168, 7203}, {7245, 7247}, + {7258, 7287}, {7288, 7293}, {7296, 7304}, {7312, 7354}, + {7357, 7359}, {7401, 7404}, {7406, 7411}, {7413, 7414}, + {7418, 7418}, {7424, 7467}, {7468, 7530}, {7531, 7543}, + {7544, 7544}, {7545, 7578}, {7579, 7615}, {7680, 7957}, + {7960, 7965}, {7968, 8005}, {8008, 8013}, {8016, 8023}, + {8025, 8025}, {8027, 8027}, {8029, 8029}, {8031, 8061}, + {8064, 8116}, {8118, 8124}, {8126, 8126}, {8130, 8132}, + {8134, 8140}, {8144, 8147}, {8150, 8155}, {8160, 8172}, + {8178, 8180}, {8182, 8188}, {8305, 8305}, {8319, 8319}, + {8336, 8348}, {8450, 8450}, {8455, 8455}, {8458, 8467}, + {8469, 8469}, {8472, 8472}, {8473, 8477}, {8484, 8484}, + {8486, 8486}, {8488, 8488}, {8490, 8493}, {8494, 8494}, + {8495, 8500}, {8501, 8504}, {8505, 8505}, {8508, 8511}, + {8517, 8521}, {8526, 8526}, {8544, 8578}, {8579, 8580}, + {8581, 8584}, {11264, 11387}, {11388, 11389}, {11390, 11492}, + {11499, 11502}, {11506, 11507}, {11520, 11557}, {11559, 11559}, + {11565, 11565}, {11568, 11623}, {11631, 11631}, {11648, 11670}, + {11680, 11686}, {11688, 11694}, {11696, 11702}, {11704, 11710}, + {11712, 11718}, {11720, 11726}, {11728, 11734}, {11736, 11742}, + {12293, 12293}, {12294, 12294}, {12295, 12295}, {12321, 12329}, + {12337, 12341}, {12344, 12346}, {12347, 12347}, {12348, 12348}, + {12353, 12438}, {12443, 12444}, {12445, 12446}, {12447, 12447}, + {12449, 12538}, {12540, 12542}, {12543, 12543}, {12549, 12591}, + {12593, 12686}, {12704, 12735}, {12784, 12799}, {13312, 19903}, + {19968, 40980}, {40981, 40981}, {40982, 42124}, {42192, 42231}, + {42232, 42237}, {42240, 42507}, {42508, 42508}, {42512, 42527}, + {42538, 42539}, {42560, 42605}, {42606, 42606}, {42623, 42623}, + {42624, 42651}, {42652, 42653}, {42656, 42725}, {42726, 42735}, + {42775, 42783}, {42786, 42863}, {42864, 42864}, {42865, 42887}, + {42888, 42888}, {42891, 42894}, {42895, 42895}, {42896, 42954}, + {42960, 42961}, {42963, 42963}, {42965, 42969}, {42994, 42996}, + {42997, 42998}, {42999, 42999}, {43000, 43001}, {43002, 43002}, + {43003, 43009}, {43011, 43013}, {43015, 43018}, {43020, 43042}, + {43072, 43123}, {43138, 43187}, {43250, 43255}, {43259, 43259}, + {43261, 43262}, {43274, 43301}, {43312, 43334}, {43360, 43388}, + {43396, 43442}, {43471, 43471}, {43488, 43492}, {43494, 43494}, + {43495, 43503}, {43514, 43518}, {43520, 43560}, {43584, 43586}, + {43588, 43595}, {43616, 43631}, {43632, 43632}, {43633, 43638}, + {43642, 43642}, {43646, 43695}, {43697, 43697}, {43701, 43702}, + {43705, 43709}, {43712, 43712}, {43714, 43714}, {43739, 43740}, + {43741, 43741}, {43744, 43754}, {43762, 43762}, {43763, 43764}, + {43777, 43782}, {43785, 43790}, {43793, 43798}, {43808, 43814}, + {43816, 43822}, {43824, 43866}, {43868, 43871}, {43872, 43880}, + {43881, 43881}, {43888, 43967}, {43968, 44002}, {44032, 55203}, + {55216, 55238}, {55243, 55291}, {63744, 64109}, {64112, 64217}, + {64256, 64262}, {64275, 64279}, {64285, 64285}, {64287, 64296}, + {64298, 64310}, {64312, 64316}, {64318, 64318}, {64320, 64321}, + {64323, 64324}, {64326, 64433}, {64467, 64829}, {64848, 64911}, + {64914, 64967}, {65008, 65019}, {65136, 65140}, {65142, 65276}, + {65313, 65338}, {65345, 65370}, {65382, 65391}, {65392, 65392}, + {65393, 65437}, {65438, 65439}, {65440, 65470}, {65474, 65479}, + {65482, 65487}, {65490, 65495}, {65498, 65500}, {65536, 65547}, + {65549, 65574}, {65576, 65594}, {65596, 65597}, {65599, 65613}, + {65616, 65629}, {65664, 65786}, {65856, 65908}, {66176, 66204}, + {66208, 66256}, {66304, 66335}, {66349, 66368}, {66369, 66369}, + {66370, 66377}, {66378, 66378}, {66384, 66421}, {66432, 66461}, + {66464, 66499}, {66504, 66511}, {66513, 66517}, {66560, 66639}, + {66640, 66717}, {66736, 66771}, {66776, 66811}, {66816, 66855}, + {66864, 66915}, {66928, 66938}, {66940, 66954}, {66956, 66962}, + {66964, 66965}, {66967, 66977}, {66979, 66993}, {66995, 67001}, + {67003, 67004}, {67072, 67382}, {67392, 67413}, {67424, 67431}, + {67456, 67461}, {67463, 67504}, {67506, 67514}, {67584, 67589}, + {67592, 67592}, {67594, 67637}, {67639, 67640}, {67644, 67644}, + {67647, 67669}, {67680, 67702}, {67712, 67742}, {67808, 67826}, + {67828, 67829}, {67840, 67861}, {67872, 67897}, {67968, 68023}, + {68030, 68031}, {68096, 68096}, {68112, 68115}, {68117, 68119}, + {68121, 68149}, {68192, 68220}, {68224, 68252}, {68288, 68295}, + {68297, 68324}, {68352, 68405}, {68416, 68437}, {68448, 68466}, + {68480, 68497}, {68608, 68680}, {68736, 68786}, {68800, 68850}, + {68864, 68899}, {69248, 69289}, {69296, 69297}, {69376, 69404}, + {69415, 69415}, {69424, 69445}, {69488, 69505}, {69552, 69572}, + {69600, 69622}, {69635, 69687}, {69745, 69746}, {69749, 69749}, + {69763, 69807}, {69840, 69864}, {69891, 69926}, {69956, 69956}, + {69959, 69959}, {69968, 70002}, {70006, 70006}, {70019, 70066}, + {70081, 70084}, {70106, 70106}, {70108, 70108}, {70144, 70161}, + {70163, 70187}, {70207, 70208}, {70272, 70278}, {70280, 70280}, + {70282, 70285}, {70287, 70301}, {70303, 70312}, {70320, 70366}, + {70405, 70412}, {70415, 70416}, {70419, 70440}, {70442, 70448}, + {70450, 70451}, {70453, 70457}, {70461, 70461}, {70480, 70480}, + {70493, 70497}, {70656, 70708}, {70727, 70730}, {70751, 70753}, + {70784, 70831}, {70852, 70853}, {70855, 70855}, {71040, 71086}, + {71128, 71131}, {71168, 71215}, {71236, 71236}, {71296, 71338}, + {71352, 71352}, {71424, 71450}, {71488, 71494}, {71680, 71723}, + {71840, 71903}, {71935, 71942}, {71945, 71945}, {71948, 71955}, + {71957, 71958}, {71960, 71983}, {71999, 71999}, {72001, 72001}, + {72096, 72103}, {72106, 72144}, {72161, 72161}, {72163, 72163}, + {72192, 72192}, {72203, 72242}, {72250, 72250}, {72272, 72272}, + {72284, 72329}, {72349, 72349}, {72368, 72440}, {72704, 72712}, + {72714, 72750}, {72768, 72768}, {72818, 72847}, {72960, 72966}, + {72968, 72969}, {72971, 73008}, {73030, 73030}, {73056, 73061}, + {73063, 73064}, {73066, 73097}, {73112, 73112}, {73440, 73458}, + {73474, 73474}, {73476, 73488}, {73490, 73523}, {73648, 73648}, + {73728, 74649}, {74752, 74862}, {74880, 75075}, {77712, 77808}, + {77824, 78895}, {78913, 78918}, {82944, 83526}, {92160, 92728}, + {92736, 92766}, {92784, 92862}, {92880, 92909}, {92928, 92975}, + {92992, 92995}, {93027, 93047}, {93053, 93071}, {93760, 93823}, + {93952, 94026}, {94032, 94032}, {94099, 94111}, {94176, 94177}, + {94179, 94179}, {94208, 100343}, {100352, 101589}, {101632, 101640}, + {110576, 110579}, {110581, 110587}, {110589, 110590}, {110592, 110882}, + {110898, 110898}, {110928, 110930}, {110933, 110933}, {110948, 110951}, + {110960, 111355}, {113664, 113770}, {113776, 113788}, {113792, 113800}, + {113808, 113817}, {119808, 119892}, {119894, 119964}, {119966, 119967}, + {119970, 119970}, {119973, 119974}, {119977, 119980}, {119982, 119993}, + {119995, 119995}, {119997, 120003}, {120005, 120069}, {120071, 120074}, + {120077, 120084}, {120086, 120092}, {120094, 120121}, {120123, 120126}, + {120128, 120132}, {120134, 120134}, {120138, 120144}, {120146, 120485}, + {120488, 120512}, {120514, 120538}, {120540, 120570}, {120572, 120596}, + {120598, 120628}, {120630, 120654}, {120656, 120686}, {120688, 120712}, + {120714, 120744}, {120746, 120770}, {120772, 120779}, {122624, 122633}, + {122634, 122634}, {122635, 122654}, {122661, 122666}, {122928, 122989}, + {123136, 123180}, {123191, 123197}, {123214, 123214}, {123536, 123565}, + {123584, 123627}, {124112, 124138}, {124139, 124139}, {124896, 124902}, + {124904, 124907}, {124909, 124910}, {124912, 124926}, {124928, 125124}, + {125184, 125251}, {125259, 125259}, {126464, 126467}, {126469, 126495}, + {126497, 126498}, {126500, 126500}, {126503, 126503}, {126505, 126514}, + {126516, 126519}, {126521, 126521}, {126523, 126523}, {126530, 126530}, + {126535, 126535}, {126537, 126537}, {126539, 126539}, {126541, 126543}, + {126545, 126546}, {126548, 126548}, {126551, 126551}, {126553, 126553}, + {126555, 126555}, {126557, 126557}, {126559, 126559}, {126561, 126562}, + {126564, 126564}, {126567, 126570}, {126572, 126578}, {126580, 126583}, + {126585, 126588}, {126590, 126590}, {126592, 126601}, {126603, 126619}, + {126625, 126627}, {126629, 126633}, {126635, 126651}, {131072, 173791}, + {173824, 177977}, {177984, 178205}, {178208, 183969}, {183984, 191456}, + {191472, 192093}, {194560, 195101}, {196608, 201546}, {201552, 205743} +}; + + +} // namespace ada::idna +#endif // ADA_IDNA_IDENTIFIER_TABLES_H + +/* end file src/id_tables.cpp */ + +namespace ada::idna { +// return 0xffffffff in case of error +// We do not fully validate the input +uint32_t get_first_code_point(std::string_view input) { + constexpr uint32_t error = 0xffffffff; + // Check if the input is empty + if (input.empty()) { + return error; + } + + uint32_t code_point = 0; + size_t number_bytes = 0; + unsigned char first_byte = input[0]; + + if ((first_byte & 0x80) == 0) { + // 1-byte character (ASCII) + return first_byte; + } else if ((first_byte & 0xE0) == 0xC0) { + // 2-byte character + code_point = first_byte & 0x1F; + number_bytes = 2; + } else if ((first_byte & 0xF0) == 0xE0) { + // 3-byte character + code_point = first_byte & 0x0F; + number_bytes = 3; + } else if ((first_byte & 0xF8) == 0xF0) { + // 4-byte character + code_point = first_byte & 0x07; + number_bytes = 4; + } else { + return error; + } + + // Decode the remaining bytes + for (size_t i = 1; i < number_bytes; ++i) { + if (i >= input.size()) { + return error; + } + unsigned char byte = input[i]; + if ((byte & 0xC0) != 0x80) { + return error; + } + code_point = (code_point << 6) | (byte & 0x3F); + } + return code_point; +} + +bool is_ascii_letter(char32_t c) { + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +} + +bool is_ascii_letter_or_digit(char32_t c) { + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9'); +} + +bool valid_name_code_point(char32_t code_point, bool first) { + // https://tc39.es/ecma262/#prod-IdentifierStart + // Fast paths: + if (first && + (code_point == '$' || code_point == '_' || is_ascii_letter(code_point))) { + return true; + } + if (!first && (code_point == '$' || is_ascii_letter_or_digit(code_point))) { + return true; + } + // Slow path... + if (code_point == 0xffffffff) { + return false; // minimal error handling + } + if (first) { + auto iter = std::lower_bound( + std::begin(ada::idna::id_start), std::end(ada::idna::id_start), + code_point, [](const uint32_t* range, uint32_t code_point) { + return range[1] < code_point; + }); + return iter != std::end(id_start) && code_point >= (*iter)[0]; + } else { + auto iter = std::lower_bound( + std::begin(id_continue), std::end(id_continue), code_point, + [](const uint32_t* range, uint32_t code_point) { + return range[1] < code_point; + }); + return iter != std::end(id_start) && code_point >= (*iter)[0]; + } +} +} // namespace ada::idna +/* end file src/identifier.cpp */ /* end file src/idna.cpp */ diff --git a/src/checkers.cpp b/src/checkers.cpp index 82e1fe32f..d486f5544 100644 --- a/src/checkers.cpp +++ b/src/checkers.cpp @@ -1,6 +1,11 @@ +#include "ada/checkers-inl.h" #include "ada/checkers.h" +#include "ada/unicode-inl.h" +#include "ada/common_defs.h" #include +#include +#include namespace ada::checkers { diff --git a/src/helpers.cpp b/src/helpers.cpp index b84b533ec..0a4216cab 100644 --- a/src/helpers.cpp +++ b/src/helpers.cpp @@ -1,4 +1,3 @@ -#include "ada.h" #include "ada/checkers-inl.h" #include "ada/common_defs.h" #include "ada/scheme.h" @@ -539,7 +538,7 @@ ada_really_inline std::pair get_host_delimiter_location( return {location, found_colon}; } -ada_really_inline void trim_c0_whitespace(std::string_view& input) noexcept { +void trim_c0_whitespace(std::string_view& input) noexcept { while (!input.empty() && ada::unicode::is_c0_control_or_space(input.front())) { input.remove_prefix(1); diff --git a/src/implementation.cpp b/src/implementation.cpp index fef682e51..cad5af5ff 100644 --- a/src/implementation.cpp +++ b/src/implementation.cpp @@ -1,20 +1,20 @@ #include -#include "ada.h" #include "ada/common_defs.h" #include "ada/parser.h" #include "ada/url.h" #include "ada/url_aggregator.h" +#include "ada/url_pattern.h" namespace ada { template -ada_warn_unused tl::expected parse( +ada_warn_unused tl::expected parse( std::string_view input, const result_type* base_url) { result_type u = ada::parser::parse_url_impl(input, base_url); if (!u.is_valid) { - return tl::unexpected(errors::generic_error); + return tl::unexpected(errors::type_error); } return u; } @@ -79,4 +79,10 @@ ada_warn_unused std::string to_string(ada::encoding_type type) { } } +ada_warn_unused tl::expected parse_url_pattern( + std::variant input, + const std::string_view* base_url, const url_pattern_options* options) { + return parser::parse_url_pattern_impl(std::move(input), base_url, options); +} + } // namespace ada diff --git a/src/parser.cpp b/src/parser.cpp index 6937bab4c..09c6ad283 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -2,12 +2,10 @@ #include -#include "ada.h" #include "ada/character_sets-inl.h" #include "ada/common_defs.h" #include "ada/log.h" #include "ada/unicode.h" -#include "ada/url-inl.h" namespace ada::parser { @@ -19,10 +17,9 @@ result_type parse_url_impl(std::string_view user_input, // means that doing if constexpr(result_type_is_ada_url) { something } else { // something else } is free (at runtime). This means that ada::url_aggregator // and ada::url **do not have to support the exact same API**. - constexpr bool result_type_is_ada_url = - std::is_same::value; + constexpr bool result_type_is_ada_url = std::is_same_v; constexpr bool result_type_is_ada_url_aggregator = - std::is_same::value; + std::is_same_v; static_assert(result_type_is_ada_url || result_type_is_ada_url_aggregator); // We don't support // anything else for now. @@ -31,7 +28,7 @@ result_type parse_url_impl(std::string_view user_input, " bytes],", (base_url != nullptr ? base_url->to_string() : "null"), ")"); - ada::state state = ada::state::SCHEME_START; + state state = state::SCHEME_START; result_type url{}; // We refuse to parse URL strings that exceed 4GB. Such strings are almost @@ -102,27 +99,27 @@ result_type parse_url_impl(std::string_view user_input, ada_log("In parsing at ", input_position, " out of ", input_size, " in state ", ada::to_string(state)); switch (state) { - case ada::state::SCHEME_START: { + case state::SCHEME_START: { ada_log("SCHEME_START ", helpers::substring(url_data, input_position)); // If c is an ASCII alpha, append c, lowercased, to buffer, and set // state to scheme state. if ((input_position != input_size) && checkers::is_alpha(url_data[input_position])) { - state = ada::state::SCHEME; + state = state::SCHEME; input_position++; } else { // Otherwise, if state override is not given, set state to no scheme // state and decrease pointer by 1. - state = ada::state::NO_SCHEME; + state = state::NO_SCHEME; } break; } - case ada::state::SCHEME: { + case state::SCHEME: { ada_log("SCHEME ", helpers::substring(url_data, input_position)); // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), // append c, lowercased, to buffer. while ((input_position != input_size) && - (ada::unicode::is_alnum_plus(url_data[input_position]))) { + (unicode::is_alnum_plus(url_data[input_position]))) { input_position++; } // Otherwise, if c is U+003A (:), then: @@ -144,9 +141,9 @@ result_type parse_url_impl(std::string_view user_input, ada_log("SCHEME the scheme is ", url.get_protocol()); // If url's scheme is "file", then: - if (url.type == ada::scheme::type::FILE) { + if (url.type == scheme::type::FILE) { // Set state to file state. - state = ada::state::FILE; + state = state::FILE; } // Otherwise, if url is special, base is non-null, and base's scheme // is url's scheme: Note: Doing base_url->scheme is unsafe if base_url @@ -154,38 +151,38 @@ result_type parse_url_impl(std::string_view user_input, else if (url.is_special() && base_url != nullptr && base_url->type == url.type) { // Set state to special relative or authority state. - state = ada::state::SPECIAL_RELATIVE_OR_AUTHORITY; + state = state::SPECIAL_RELATIVE_OR_AUTHORITY; } // Otherwise, if url is special, set state to special authority // slashes state. else if (url.is_special()) { - state = ada::state::SPECIAL_AUTHORITY_SLASHES; + state = state::SPECIAL_AUTHORITY_SLASHES; } // Otherwise, if remaining starts with an U+002F (/), set state to // path or authority state and increase pointer by 1. else if (input_position + 1 < input_size && url_data[input_position + 1] == '/') { - state = ada::state::PATH_OR_AUTHORITY; + state = state::PATH_OR_AUTHORITY; input_position++; } // Otherwise, set url's path to the empty string and set state to // opaque path state. else { - state = ada::state::OPAQUE_PATH; + state = state::OPAQUE_PATH; } } // Otherwise, if state override is not given, set buffer to the empty // string, state to no scheme state, and start over (from the first code // point in input). else { - state = ada::state::NO_SCHEME; + state = state::NO_SCHEME; input_position = 0; break; } input_position++; break; } - case ada::state::NO_SCHEME: { + case state::NO_SCHEME: { ada_log("NO_SCHEME ", helpers::substring(url_data, input_position)); // If base is null, or base has an opaque path and c is not U+0023 (#), // validation error, return failure. @@ -216,18 +213,18 @@ result_type parse_url_impl(std::string_view user_input, } // Otherwise, if base's scheme is not "file", set state to relative // state and decrease pointer by 1. - else if (base_url->type != ada::scheme::type::FILE) { + else if (base_url->type != scheme::type::FILE) { ada_log("NO_SCHEME non-file relative path"); - state = ada::state::RELATIVE_SCHEME; + state = state::RELATIVE_SCHEME; } // Otherwise, set state to file state and decrease pointer by 1. else { ada_log("NO_SCHEME file base type"); - state = ada::state::FILE; + state = state::FILE; } break; } - case ada::state::AUTHORITY: { + case state::AUTHORITY: { ada_log("AUTHORITY ", helpers::substring(url_data, input_position)); // most URLs have no @. Having no @ tells us that we don't have to worry // about AUTHORITY. Of course, we could have @ and still not have to @@ -240,7 +237,7 @@ result_type parse_url_impl(std::string_view user_input, // Check if url data contains an @. if (url_data.find('@', input_position) == std::string_view::npos) { - state = ada::state::HOST; + state = state::HOST; break; } bool at_sign_seen{false}; @@ -337,7 +334,7 @@ result_type parse_url_impl(std::string_view user_input, url.is_valid = false; return url; } - state = ada::state::HOST; + state = state::HOST; break; } if (end_of_authority == input_size) { @@ -353,7 +350,7 @@ result_type parse_url_impl(std::string_view user_input, break; } - case ada::state::SPECIAL_RELATIVE_OR_AUTHORITY: { + case state::SPECIAL_RELATIVE_OR_AUTHORITY: { ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ", helpers::substring(url_data, input_position)); @@ -361,33 +358,33 @@ result_type parse_url_impl(std::string_view user_input, // then set state to special authority ignore slashes state and increase // pointer by 1. if (url_data.substr(input_position, 2) == "//") { - state = ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES; + state = state::SPECIAL_AUTHORITY_IGNORE_SLASHES; input_position += 2; } else { // Otherwise, validation error, set state to relative state and // decrease pointer by 1. - state = ada::state::RELATIVE_SCHEME; + state = state::RELATIVE_SCHEME; } break; } - case ada::state::PATH_OR_AUTHORITY: { + case state::PATH_OR_AUTHORITY: { ada_log("PATH_OR_AUTHORITY ", helpers::substring(url_data, input_position)); // If c is U+002F (/), then set state to authority state. if ((input_position != input_size) && (url_data[input_position] == '/')) { - state = ada::state::AUTHORITY; + state = state::AUTHORITY; input_position++; } else { // Otherwise, set state to path state, and decrease pointer by 1. - state = ada::state::PATH; + state = state::PATH; } break; } - case ada::state::RELATIVE_SCHEME: { + case state::RELATIVE_SCHEME: { ada_log("RELATIVE_SCHEME ", helpers::substring(url_data, input_position)); @@ -400,7 +397,7 @@ result_type parse_url_impl(std::string_view user_input, ada_log( "RELATIVE_SCHEME if c is U+002F (/), then set state to relative " "slash state"); - state = ada::state::RELATIVE_SLASH; + state = state::RELATIVE_SLASH; } else if (url.is_special() && (input_position != input_size) && (url_data[input_position] == '\\')) { // Otherwise, if url is special and c is U+005C (\), validation error, @@ -408,7 +405,7 @@ result_type parse_url_impl(std::string_view user_input, ada_log( "RELATIVE_SCHEME if url is special and c is U+005C, validation " "error, set state to relative slash state"); - state = ada::state::RELATIVE_SLASH; + state = state::RELATIVE_SLASH; } else { ada_log("RELATIVE_SCHEME otherwise"); // Set url's username to base's username, url's password to base's @@ -441,7 +438,7 @@ result_type parse_url_impl(std::string_view user_input, // state to query state. if ((input_position != input_size) && (url_data[input_position] == '?')) { - state = ada::state::QUERY; + state = state::QUERY; } // Otherwise, if c is not the EOF code point: else if (input_position != input_size) { @@ -457,14 +454,14 @@ result_type parse_url_impl(std::string_view user_input, } } // Set state to path state and decrease pointer by 1. - state = ada::state::PATH; + state = state::PATH; break; } } input_position++; break; } - case ada::state::RELATIVE_SLASH: { + case state::RELATIVE_SLASH: { ada_log("RELATIVE_SLASH ", helpers::substring(url_data, input_position)); @@ -473,12 +470,12 @@ result_type parse_url_impl(std::string_view user_input, (url_data[input_position] == '/' || url_data[input_position] == '\\')) { // Set state to special authority ignore slashes state. - state = ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES; + state = state::SPECIAL_AUTHORITY_IGNORE_SLASHES; } // Otherwise, if c is U+002F (/), then set state to authority state. else if ((input_position != input_size) && (url_data[input_position] == '/')) { - state = ada::state::AUTHORITY; + state = state::AUTHORITY; } // Otherwise, set // - url's username to base's username, @@ -498,14 +495,14 @@ result_type parse_url_impl(std::string_view user_input, url.update_host_to_base_host(base_url->get_hostname()); url.update_base_port(base_url->retrieve_base_port()); } - state = ada::state::PATH; + state = state::PATH; break; } input_position++; break; } - case ada::state::SPECIAL_AUTHORITY_SLASHES: { + case state::SPECIAL_AUTHORITY_SLASHES: { ada_log("SPECIAL_AUTHORITY_SLASHES ", helpers::substring(url_data, input_position)); @@ -518,7 +515,7 @@ result_type parse_url_impl(std::string_view user_input, [[fallthrough]]; } - case ada::state::SPECIAL_AUTHORITY_IGNORE_SLASHES: { + case state::SPECIAL_AUTHORITY_IGNORE_SLASHES: { ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ", helpers::substring(url_data, input_position)); @@ -529,19 +526,18 @@ result_type parse_url_impl(std::string_view user_input, (url_data[input_position] == '\\'))) { input_position++; } - state = ada::state::AUTHORITY; + state = state::AUTHORITY; break; } - case ada::state::QUERY: { + case state::QUERY: { ada_log("QUERY ", helpers::substring(url_data, input_position)); if constexpr (store_values) { // Let queryPercentEncodeSet be the special-query percent-encode set // if url is special; otherwise the query percent-encode set. const uint8_t* query_percent_encode_set = - url.is_special() - ? ada::character_sets::SPECIAL_QUERY_PERCENT_ENCODE - : ada::character_sets::QUERY_PERCENT_ENCODE; + url.is_special() ? character_sets::SPECIAL_QUERY_PERCENT_ENCODE + : character_sets::QUERY_PERCENT_ENCODE; // Percent-encode after encoding, with encoding, buffer, and // queryPercentEncodeSet, and append the result to url's query. @@ -554,7 +550,7 @@ result_type parse_url_impl(std::string_view user_input, } return url; } - case ada::state::HOST: { + case state::HOST: { ada_log("HOST ", helpers::substring(url_data, input_position)); std::string_view host_view = url_data.substr(input_position); @@ -577,7 +573,7 @@ result_type parse_url_impl(std::string_view user_input, ada_log("HOST parsing results in ", url.get_hostname()); // Set url's host to host, buffer to the empty string, and state to // port state. - state = ada::state::PORT; + state = state::PORT; input_position++; } // Otherwise, if one of the following is true: @@ -604,12 +600,12 @@ result_type parse_url_impl(std::string_view user_input, " href=", url.get_href()); // Set url's host to host, and state to path start state. - state = ada::state::PATH_START; + state = state::PATH_START; } break; } - case ada::state::OPAQUE_PATH: { + case state::OPAQUE_PATH: { ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position)); std::string_view view = url_data.substr(input_position); // If c is U+003F (?), then set url's query to the empty string and @@ -617,7 +613,7 @@ result_type parse_url_impl(std::string_view user_input, size_t location = view.find('?'); if (location != std::string_view::npos) { view.remove_suffix(view.size() - location); - state = ada::state::QUERY; + state = state::QUERY; input_position += location + 1; } else { input_position = input_size + 1; @@ -629,7 +625,7 @@ result_type parse_url_impl(std::string_view user_input, view, character_sets::C0_CONTROL_PERCENT_ENCODE)); break; } - case ada::state::PORT: { + case state::PORT: { ada_log("PORT ", helpers::substring(url_data, input_position)); std::string_view port_view = url_data.substr(input_position); input_position += url.parse_port(port_view, true); @@ -639,13 +635,13 @@ result_type parse_url_impl(std::string_view user_input, state = state::PATH_START; [[fallthrough]]; } - case ada::state::PATH_START: { + case state::PATH_START: { ada_log("PATH_START ", helpers::substring(url_data, input_position)); // If url is special, then: if (url.is_special()) { // Set state to path state. - state = ada::state::PATH; + state = state::PATH; // Optimization: Avoiding going into PATH state improves the // performance of urls ending with /. @@ -670,12 +666,12 @@ result_type parse_url_impl(std::string_view user_input, // set url's query to the empty string and state to query state. else if ((input_position != input_size) && (url_data[input_position] == '?')) { - state = ada::state::QUERY; + state = state::QUERY; } // Otherwise, if c is not the EOF code point: else if (input_position != input_size) { // Set state to path state. - state = ada::state::PATH; + state = state::PATH; // If c is not U+002F (/), then decrease pointer by 1. if (url_data[input_position] != '/') { @@ -686,7 +682,7 @@ result_type parse_url_impl(std::string_view user_input, input_position++; break; } - case ada::state::PATH: { + case state::PATH: { ada_log("PATH ", helpers::substring(url_data, input_position)); std::string_view view = url_data.substr(input_position); @@ -694,7 +690,7 @@ result_type parse_url_impl(std::string_view user_input, // Furthermore, we can immediately locate the '?'. size_t locofquestionmark = view.find('?'); if (locofquestionmark != std::string_view::npos) { - state = ada::state::QUERY; + state = state::QUERY; view.remove_suffix(view.size() - locofquestionmark); input_position += locofquestionmark + 1; } else { @@ -710,7 +706,7 @@ result_type parse_url_impl(std::string_view user_input, } break; } - case ada::state::FILE_SLASH: { + case state::FILE_SLASH: { ada_log("FILE_SLASH ", helpers::substring(url_data, input_position)); // If c is U+002F (/) or U+005C (\), then: @@ -719,15 +715,14 @@ result_type parse_url_impl(std::string_view user_input, url_data[input_position] == '\\')) { ada_log("FILE_SLASH c is U+002F or U+005C"); // Set state to file host state. - state = ada::state::FILE_HOST; + state = state::FILE_HOST; input_position++; } else { ada_log("FILE_SLASH otherwise"); // If base is non-null and base's scheme is "file", then: // Note: it is unsafe to do base_url->scheme unless you know that // base_url_has_value() is true. - if (base_url != nullptr && - base_url->type == ada::scheme::type::FILE) { + if (base_url != nullptr && base_url->type == scheme::type::FILE) { // Set url's host to base's host. if constexpr (result_type_is_ada_url) { url.host = base_url->host; @@ -762,12 +757,12 @@ result_type parse_url_impl(std::string_view user_input, } // Set state to path state, and decrease pointer by 1. - state = ada::state::PATH; + state = state::PATH; } break; } - case ada::state::FILE_HOST: { + case state::FILE_HOST: { ada_log("FILE_HOST ", helpers::substring(url_data, input_position)); std::string_view view = url_data.substr(input_position); @@ -777,7 +772,7 @@ result_type parse_url_impl(std::string_view user_input, (location != std::string_view::npos) ? location : view.size()); if (checkers::is_windows_drive_letter(file_host_buffer)) { - state = ada::state::PATH; + state = state::PATH; } else if (file_host_buffer.empty()) { // Set url's host to the empty string. if constexpr (result_type_is_ada_url) { @@ -786,7 +781,7 @@ result_type parse_url_impl(std::string_view user_input, url.update_base_hostname(""); } // Set state to path start state. - state = ada::state::PATH_START; + state = state::PATH_START; } else { size_t consumed_bytes = file_host_buffer.size(); input_position += consumed_bytes; @@ -808,12 +803,12 @@ result_type parse_url_impl(std::string_view user_input, } // Set buffer to the empty string and state to path start state. - state = ada::state::PATH_START; + state = state::PATH_START; } break; } - case ada::state::FILE: { + case state::FILE: { ada_log("FILE ", helpers::substring(url_data, input_position)); std::string_view file_view = url_data.substr(input_position); @@ -830,11 +825,10 @@ result_type parse_url_impl(std::string_view user_input, url_data[input_position] == '\\')) { ada_log("FILE c is U+002F or U+005C"); // Set state to file slash state. - state = ada::state::FILE_SLASH; + state = state::FILE_SLASH; } // Otherwise, if base is non-null and base's scheme is "file": - else if (base_url != nullptr && - base_url->type == ada::scheme::type::FILE) { + else if (base_url != nullptr && base_url->type == scheme::type::FILE) { // Set url's host to base's host, url's path to a clone of base's // path, and url's query to base's query. ada_log("FILE base non-null"); @@ -852,7 +846,7 @@ result_type parse_url_impl(std::string_view user_input, // If c is U+003F (?), then set url's query to the empty string and // state to query state. if (input_position != input_size && url_data[input_position] == '?') { - state = ada::state::QUERY; + state = state::QUERY; } // Otherwise, if c is not the EOF code point: else if (input_position != input_size) { @@ -878,14 +872,14 @@ result_type parse_url_impl(std::string_view user_input, } // Set state to path state and decrease pointer by 1. - state = ada::state::PATH; + state = state::PATH; break; } } // Otherwise, set state to path state, and decrease pointer by 1. else { ada_log("FILE go to path"); - state = ada::state::PATH; + state = state::PATH; break; } @@ -893,7 +887,7 @@ result_type parse_url_impl(std::string_view user_input, break; } default: - ada::unreachable(); + unreachable(); } } if constexpr (store_values) { @@ -904,6 +898,258 @@ result_type parse_url_impl(std::string_view user_input, return url; } +tl::expected parse_url_pattern_impl( + std::variant input, + const std::string_view* base_url, const url_pattern_options* options) { + // Let init be null. + url_pattern_init init; + + // If input is a scalar value string then: + if (std::holds_alternative(input)) { + // Set init to the result of running parse a constructor string given input. + auto parse_result = url_pattern_helpers::constructor_string_parser::parse( + std::get(input)); + if (!parse_result) { + ada_log("constructor_string_parser::parse failed"); + return tl::unexpected(parse_result.error()); + } + init = std::move(*parse_result); + // If baseURL is null and init["protocol"] does not exist, then throw a + // TypeError. + if (!base_url && !init.protocol) { + ada_log("base url is null and protocol is not set"); + return tl::unexpected(errors::type_error); + } + + // If baseURL is not null, set init["baseURL"] to baseURL. + if (base_url) { + init.base_url = std::string(*base_url); + } + } else { + // Assert: input is a URLPatternInit. + ADA_ASSERT_TRUE(std::holds_alternative(input)); + // If baseURL is not null, then throw a TypeError. + if (base_url) { + ada_log("base url is not null"); + return tl::unexpected(errors::type_error); + } + // Optimization: Avoid copy by moving the input value. + // Set init to input. + init = std::move(std::get(input)); + } + + // Let processedInit be the result of process a URLPatternInit given init, + // "pattern", null, null, null, null, null, null, null, and null. + // TODO: Make "pattern" an enum to avoid creating a string everytime. + auto processed_init = url_pattern_init::process(init, "pattern"); + if (!processed_init) { + ada_log("url_pattern_init::process failed for init and 'pattern'"); + return tl::unexpected(processed_init.error()); + } + + // For each componentName of « "protocol", "username", "password", "hostname", + // "port", "pathname", "search", "hash" If processedInit[componentName] does + // not exist, then set processedInit[componentName] to "*". + ADA_ASSERT_TRUE(processed_init.has_value()); + if (!processed_init->protocol) processed_init->protocol = "*"; + if (!processed_init->username) processed_init->username = "*"; + if (!processed_init->password) processed_init->password = "*"; + if (!processed_init->hostname) processed_init->hostname = "*"; + if (!processed_init->port) processed_init->port = "*"; + if (!processed_init->pathname) processed_init->pathname = "*"; + if (!processed_init->search) processed_init->search = "*"; + if (!processed_init->hash) processed_init->hash = "*"; + + ada_log("-- processed_init->protocol: ", processed_init->protocol.value()); + ada_log("-- processed_init->username: ", processed_init->username.value()); + ada_log("-- processed_init->password: ", processed_init->password.value()); + ada_log("-- processed_init->hostname: ", processed_init->hostname.value()); + ada_log("-- processed_init->port: ", processed_init->port.value()); + ada_log("-- processed_init->pathname: ", processed_init->pathname.value()); + ada_log("-- processed_init->search: ", processed_init->search.value()); + ada_log("-- processed_init->hash: ", processed_init->hash.value()); + + // If processedInit["protocol"] is a special scheme and processedInit["port"] + // is a string which represents its corresponding default port in radix-10 + // using ASCII digits then set processedInit["port"] to the empty string. + // TODO: Optimization opportunity. + if (scheme::is_special(*processed_init->protocol)) { + std::string_view port = processed_init->port.value(); + helpers::trim_c0_whitespace(port); + if (std::to_string(scheme::get_special_port(*processed_init->protocol)) == + port) { + processed_init->port->clear(); + } + } + + // Let urlPattern be a new URL pattern. + auto url_pattern_ = url_pattern{}; + + // Set urlPattern’s protocol component to the result of compiling a component + // given processedInit["protocol"], canonicalize a protocol, and default + // options. + auto protocol_component = url_pattern_component::compile( + processed_init->protocol.value(), + url_pattern_helpers::canonicalize_protocol, + url_pattern_compile_component_options::DEFAULT); + if (!protocol_component) { + ada_log("url_pattern_component::compile failed for protocol ", + processed_init->protocol.value()); + return tl::unexpected(protocol_component.error()); + } + url_pattern_.protocol_component = std::move(*protocol_component); + + // Set urlPattern’s username component to the result of compiling a component + // given processedInit["username"], canonicalize a username, and default + // options. + auto username_component = url_pattern_component::compile( + processed_init->username.value(), + url_pattern_helpers::canonicalize_username, + url_pattern_compile_component_options::DEFAULT); + if (!username_component) { + ada_log("url_pattern_component::compile failed for username ", + processed_init->username.value()); + return tl::unexpected(username_component.error()); + } + url_pattern_.username_component = std::move(*username_component); + + // Set urlPattern’s password component to the result of compiling a component + // given processedInit["password"], canonicalize a password, and default + // options. + auto password_component = url_pattern_component::compile( + processed_init->password.value(), + url_pattern_helpers::canonicalize_password, + url_pattern_compile_component_options::DEFAULT); + if (!password_component) { + ada_log("url_pattern_component::compile failed for password ", + processed_init->password.value()); + return tl::unexpected(password_component.error()); + } + url_pattern_.password_component = std::move(*password_component); + + // TODO: Optimization opportunity. The following if statement can be + // simplified. + // If the result running hostname pattern is an IPv6 address given + // processedInit["hostname"] is true, then set urlPattern’s hostname component + // to the result of compiling a component given processedInit["hostname"], + // canonicalize an IPv6 hostname, and hostname options. + if (url_pattern_helpers::is_ipv6_address(processed_init->hostname.value())) { + ada_log("processed_init->hostname is ipv6 address"); + // then set urlPattern’s hostname component to the result of compiling a + // component given processedInit["hostname"], canonicalize an IPv6 hostname, + // and hostname options. + auto hostname_component = url_pattern_component::compile( + processed_init->hostname.value(), + url_pattern_helpers::canonicalize_ipv6_hostname, + url_pattern_compile_component_options::DEFAULT); + if (!hostname_component) { + ada_log("url_pattern_component::compile failed for ipv6 hostname ", + processed_init->hostname.value()); + return tl::unexpected(hostname_component.error()); + } + url_pattern_.hostname_component = std::move(*hostname_component); + } else { + // Otherwise, set urlPattern’s hostname component to the result of compiling + // a component given processedInit["hostname"], canonicalize a hostname, and + // hostname options. + auto hostname_component = url_pattern_component::compile( + processed_init->hostname.value(), + url_pattern_helpers::canonicalize_hostname, + url_pattern_compile_component_options::HOSTNAME); + if (!hostname_component) { + ada_log("url_pattern_component::compile failed for hostname ", + processed_init->hostname.value()); + return tl::unexpected(hostname_component.error()); + } + url_pattern_.hostname_component = std::move(*hostname_component); + } + + // Set urlPattern’s port component to the result of compiling a component + // given processedInit["port"], canonicalize a port, and default options. + auto port_component = url_pattern_component::compile( + processed_init->port.value(), url_pattern_helpers::canonicalize_port, + url_pattern_compile_component_options::DEFAULT); + if (!port_component) { + ada_log("url_pattern_component::compile failed for port ", + processed_init->port.value()); + return tl::unexpected(port_component.error()); + } + url_pattern_.port_component = std::move(*port_component); + + // Let compileOptions be a copy of the default options with the ignore case + // property set to options["ignoreCase"]. + auto compile_options = url_pattern_compile_component_options::DEFAULT; + if (options) { + compile_options.ignore_case = options->ignore_case; + } + + // TODO: Optimization opportunity: Simplify this if statement. + // If the result of running protocol component matches a special scheme given + // urlPattern’s protocol component is true, then: + if (url_pattern_helpers::protocol_component_matches_special_scheme( + url_pattern_.protocol_component)) { + // Let pathCompileOptions be copy of the pathname options with the ignore + // case property set to options["ignoreCase"]. + auto path_compile_options = url_pattern_compile_component_options::PATHNAME; + if (options) { + path_compile_options.ignore_case = options->ignore_case; + } + + // Set urlPattern’s pathname component to the result of compiling a + // component given processedInit["pathname"], canonicalize a pathname, and + // pathCompileOptions. + auto pathname_component = url_pattern_component::compile( + processed_init->pathname.value(), + url_pattern_helpers::canonicalize_pathname, path_compile_options); + if (!pathname_component) { + ada_log("url_pattern_component::compile failed for pathname ", + processed_init->pathname.value()); + return tl::unexpected(pathname_component.error()); + } + url_pattern_.pathname_component = std::move(*pathname_component); + } else { + // Otherwise set urlPattern’s pathname component to the result of compiling + // a component given processedInit["pathname"], canonicalize an opaque + // pathname, and compileOptions. + auto pathname_component = url_pattern_component::compile( + processed_init->pathname.value(), + url_pattern_helpers::canonicalize_opaque_pathname, compile_options); + if (!pathname_component) { + ada_log("url_pattern_component::compile failed for opaque pathname ", + processed_init->pathname.value()); + return tl::unexpected(pathname_component.error()); + } + url_pattern_.pathname_component = std::move(*pathname_component); + } + + // Set urlPattern’s search component to the result of compiling a component + // given processedInit["search"], canonicalize a search, and compileOptions. + auto search_component = url_pattern_component::compile( + processed_init->search.value(), url_pattern_helpers::canonicalize_search, + compile_options); + if (!search_component) { + ada_log("url_pattern_component::compile failed for search ", + processed_init->search.value()); + return tl::unexpected(search_component.error()); + } + url_pattern_.search_component = std::move(*search_component); + + // Set urlPattern’s hash component to the result of compiling a component + // given processedInit["hash"], canonicalize a hash, and compileOptions. + auto hash_component = url_pattern_component::compile( + processed_init->hash.value(), url_pattern_helpers::canonicalize_hash, + compile_options); + if (!hash_component) { + ada_log("url_pattern_component::compile failed for hash ", + processed_init->hash.value()); + return tl::unexpected(hash_component.error()); + } + url_pattern_.hash_component = std::move(*hash_component); + + // Return urlPattern. + return url_pattern_; +} + template url parse_url_impl(std::string_view user_input, const url* base_url = nullptr); template url_aggregator parse_url_impl( diff --git a/src/serializers.cpp b/src/serializers.cpp index 91be39ce1..3b4f967a6 100644 --- a/src/serializers.cpp +++ b/src/serializers.cpp @@ -1,6 +1,5 @@ -#include "ada.h" - #include +#include #include namespace ada::serializers { diff --git a/src/unicode.cpp b/src/unicode.cpp index 78b4e57f4..00ae20e5a 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -1,7 +1,8 @@ -#include "ada.h" -#include "ada/character_sets-inl.h" #include "ada/common_defs.h" +#include "ada/character_sets-inl.h" +#include "ada/character_sets.h" #include "ada/unicode.h" +#include "ada/log.h" ADA_PUSH_DISABLE_ALL_WARNINGS #include "ada_idna.cpp" @@ -272,6 +273,17 @@ ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept { (c >= 'a' && c <= 'f'); } +ada_really_inline constexpr bool is_ascii_digit(const char c) noexcept { + // An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), + // inclusive. + return (c >= '0' && c <= '9'); +} + +ada_really_inline constexpr bool is_ascii(const char32_t c) noexcept { + // If code point is between U+0000 and U+007F inclusive, then return true. + return c <= 0x7F; +} + ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept { return (unsigned char)c <= ' '; } diff --git a/src/url.cpp b/src/url.cpp index e14b37b75..f35625bed 100644 --- a/src/url.cpp +++ b/src/url.cpp @@ -1,6 +1,6 @@ -#include "ada.h" -#include "ada/scheme.h" +#include "ada/scheme-inl.h" #include "ada/log.h" +#include "ada/unicode-inl.h" #include #include diff --git a/src/url_aggregator.cpp b/src/url_aggregator.cpp index 2c431cef1..08211436d 100644 --- a/src/url_aggregator.cpp +++ b/src/url_aggregator.cpp @@ -1,4 +1,3 @@ -#include "ada.h" #include "ada/checkers-inl.h" #include "ada/helpers.h" #include "ada/implementation.h" diff --git a/src/url_components.cpp b/src/url_components.cpp index 40508f4e0..da981c09c 100644 --- a/src/url_components.cpp +++ b/src/url_components.cpp @@ -1,4 +1,3 @@ -#include "ada.h" #include "ada/helpers.h" #include "ada/url_components.h" diff --git a/src/url_pattern.cpp b/src/url_pattern.cpp new file mode 100644 index 000000000..95ec41ded --- /dev/null +++ b/src/url_pattern.cpp @@ -0,0 +1,795 @@ +#include "ada/url_pattern-inl.h" + +#include +#include +#include +#include + +namespace ada { +// The default options is an options struct with delimiter code point set to +// the empty string and prefix code point set to the empty string. +url_pattern_compile_component_options + url_pattern_compile_component_options::DEFAULT(std::nullopt, std::nullopt); + +// The hostname options is an options struct with delimiter code point set +// "." and prefix code point set to the empty string. +url_pattern_compile_component_options + url_pattern_compile_component_options::HOSTNAME('.', std::nullopt); + +// The pathname options is an options struct with delimiter code point set +// "/" and prefix code point set to "/". +url_pattern_compile_component_options + url_pattern_compile_component_options::PATHNAME('/', '/'); + +tl::expected url_pattern_init::process( + url_pattern_init init, std::string_view type, + std::optional protocol, + std::optional username, + std::optional password, + std::optional hostname, + std::optional port, + std::optional pathname, + std::optional search, + std::optional hash) { + // Let result be the result of creating a new URLPatternInit. + auto result = url_pattern_init{}; + + // If protocol is not null, set result["protocol"] to protocol. + if (protocol.has_value()) result.protocol = *protocol; + + // If username is not null, set result["username"] to username. + if (username.has_value()) result.username = *username; + + // If password is not null, set result["password"] to password. + if (password.has_value()) result.password = *password; + + // If hostname is not null, set result["hostname"] to hostname. + if (hostname.has_value()) result.hostname = *hostname; + + // If port is not null, set result["port"] to port. + if (port.has_value()) result.port = *port; + + // If pathname is not null, set result["pathname"] to pathname. + if (pathname.has_value()) result.pathname = *pathname; + + // If search is not null, set result["search"] to search. + if (search.has_value()) result.search = *search; + + // If hash is not null, set result["hash"] to hash. + if (hash.has_value()) result.hash = *hash; + + // Let baseURL be null. + std::optional base_url{}; + + // If init["baseURL"] exists: + if (init.base_url.has_value()) { + // Set baseURL to the result of parsing init["baseURL"]. + auto parsing_result = ada::parse(*init.base_url); + // If baseURL is failure, then throw a TypeError. + if (!parsing_result) { + return tl::unexpected(errors::type_error); + } + base_url = std::move(*parsing_result); + + // If init["protocol"] does not exist, then set result["protocol"] to the + // result of processing a base URL string given baseURL’s scheme and type. + if (!init.protocol.has_value()) { + ADA_ASSERT_TRUE(base_url.has_value()); + std::string_view base_url_protocol = base_url->get_protocol(); + if (base_url_protocol.ends_with(":")) base_url_protocol.remove_suffix(1); + result.protocol = + url_pattern_helpers::process_base_url_string(base_url_protocol, type); + } + + // If type is not "pattern" and init contains none of "protocol", + // "hostname", "port" and "username", then set result["username"] to the + // result of processing a base URL string given baseURL’s username and type. + if (type != "pattern" && !init.protocol && !init.hostname && !init.port && + !init.username) { + result.username = url_pattern_helpers::process_base_url_string( + base_url->get_username(), type); + } + + // TODO: Optimization opportunity: Merge this with the previous check. + // If type is not "pattern" and init contains none of "protocol", + // "hostname", "port", "username" and "password", then set + // result["password"] to the result of processing a base URL string given + // baseURL’s password and type. + if (type != "pattern" && !init.protocol && !init.hostname && !init.port && + !init.username && !init.password) { + result.password = url_pattern_helpers::process_base_url_string( + base_url->get_password(), type); + } + + // If init contains neither "protocol" nor "hostname", then: + if (!init.protocol && !init.hostname) { + // Let baseHost be baseURL’s host. + // If baseHost is null, then set baseHost to the empty string. + auto base_host = base_url->get_hostname(); + // Set result["hostname"] to the result of processing a base URL string + // given baseHost and type. + result.hostname = + url_pattern_helpers::process_base_url_string(base_host, type); + } + + // If init contains none of "protocol", "hostname", and "port", then: + if (!init.protocol && !init.hostname && !init.port) { + // If baseURL’s port is null, then set result["port"] to the empty string. + // Otherwise, set result["port"] to baseURL’s port, serialized. + result.port = base_url->get_port(); + } + + // If init contains none of "protocol", "hostname", "port", and "pathname", + // then set result["pathname"] to the result of processing a base URL string + // given the result of URL path serializing baseURL and type. + if (!init.protocol && !init.hostname && !init.port && !init.pathname) { + result.pathname = url_pattern_helpers::process_base_url_string( + base_url->get_pathname(), type); + } + + // If init contains none of "protocol", "hostname", "port", "pathname", and + // "search", then: + if (!init.protocol && !init.hostname && !init.port && !init.pathname && + !init.search) { + // Let baseQuery be baseURL’s query. + // Set result["search"] to the result of processing a base URL string + // given baseQuery and type. + result.search = url_pattern_helpers::process_base_url_string( + base_url->get_search(), type); + } + + // If init contains none of "protocol", "hostname", "port", "pathname", + // "search", and "hash", then: + if (!init.protocol && !init.hostname && !init.port && !init.pathname && + !init.search && !init.hash) { + // Let baseFragment be baseURL’s fragment. + // Set result["hash"] to the result of processing a base URL string given + // baseFragment and type. + result.hash = url_pattern_helpers::process_base_url_string( + base_url->get_hash(), type); + } + } + + // If init["protocol"] exists, then set result["protocol"] to the result of + // process protocol for init given init["protocol"] and type. + if (init.protocol) { + auto process_result = process_protocol(*init.protocol, type); + if (!process_result) { + return tl::unexpected(process_result.error()); + } + result.protocol = std::move(*process_result); + } + + // If init["username"] exists, then set result["username"] to the result of + // process username for init given init["username"] and type. + if (init.username.has_value()) { + auto process_result = process_username(*init.username, type); + if (!process_result) { + return tl::unexpected(process_result.error()); + } + result.username = std::move(*process_result); + } + + // If init["password"] exists, then set result["password"] to the result of + // process password for init given init["password"] and type. + if (init.password.has_value()) { + auto process_result = process_password(*init.password, type); + if (!process_result) { + return tl::unexpected(process_result.error()); + } + result.password = std::move(*process_result); + } + + // If init["hostname"] exists, then set result["hostname"] to the result of + // process hostname for init given init["hostname"] and type. + if (init.hostname.has_value()) { + auto process_result = process_hostname(*init.hostname, type); + if (!process_result) { + return tl::unexpected(process_result.error()); + } + result.hostname = std::move(*process_result); + } + + // If init["port"] exists, then set result["port"] to the result of process + // port for init given init["port"], result["protocol"], and type. + if (init.port) { + auto process_result = + process_port(*init.port, result.protocol.value_or("fake"), type); + if (!process_result) { + return tl::unexpected(process_result.error()); + } + result.port = std::move(*process_result); + } + + // If init["pathname"] exists: + if (init.pathname.has_value()) { + // Set result["pathname"] to init["pathname"]. + result.pathname = init.pathname; + + // If the following are all true: + // - baseURL is not null; + // - baseURL has an opaque path; and + // - the result of running is an absolute pathname given result["pathname"] + // and type is false, + if (base_url && !base_url->has_opaque_path && + !url_pattern_helpers::is_absolute_pathname(*result.pathname, type)) { + // Let baseURLPath be the result of running process a base URL string + // given the result of URL path serializing baseURL and type. + std::string base_url_path = url_pattern_helpers::process_base_url_string( + base_url->get_pathname(), type); + + // Let slash index be the index of the last U+002F (/) code point found in + // baseURLPath, interpreted as a sequence of code points, or null if there + // are no instances of the code point. + auto slash_index = base_url_path.find_last_of('/'); + + // If slash index is not null: + if (slash_index != std::string::npos) { + // Let new pathname be the code point substring from 0 to slash index + + // 1 within baseURLPath. + std::string new_pathname = base_url_path.substr(0, slash_index + 1); + // Append result["pathname"] to the end of new pathname. + ADA_ASSERT_TRUE(result.pathname.has_value()); + new_pathname.append(result.pathname.value()); + // Set result["pathname"] to new pathname. + result.pathname = std::move(new_pathname); + } + } + + // Set result["pathname"] to the result of process pathname for init given + // result["pathname"], result["protocol"], and type. + auto pathname_processing_result = + process_pathname(*result.pathname, result.protocol.value_or(""), type); + if (!pathname_processing_result) { + return tl::unexpected(pathname_processing_result.error()); + } + result.pathname = std::move(*pathname_processing_result); + } + + // If init["search"] exists then set result["search"] to the result of process + // search for init given init["search"] and type. + if (init.search) { + auto process_result = process_search(*init.search, type); + if (!process_result) { + return tl::unexpected(process_result.error()); + } + result.search = std::move(*process_result); + } + + // If init["hash"] exists then set result["hash"] to the result of process + // hash for init given init["hash"] and type. + if (init.hash) { + auto process_result = process_hash(*init.hash, type); + if (!process_result) { + return tl::unexpected(process_result.error()); + } + result.hash = std::move(*process_result); + } + // Return result. + return result; +} + +tl::expected url_pattern_init::process_protocol( + std::string_view value, std::string_view type) { + ada_log("process_protocol=", value, " [", type, "]"); + // Let strippedValue be the given value with a single trailing U+003A (:) + // removed, if any. + if (value.ends_with(":")) { + value.remove_suffix(1); + } + // If type is "pattern" then return strippedValue. + if (type == "pattern") { + return std::string(value); + } + // Return the result of running canonicalize a protocol given strippedValue. + return url_pattern_helpers::canonicalize_protocol(value); +} + +tl::expected url_pattern_init::process_username( + std::string_view value, std::string_view type) { + // If type is "pattern" then return value. + if (type == "pattern") { + return std::string(value); + } + // Return the result of running canonicalize a username given value. + return url_pattern_helpers::canonicalize_username(value); +} + +tl::expected url_pattern_init::process_password( + std::string_view value, std::string_view type) { + // If type is "pattern" then return value. + if (type == "pattern") { + return std::string(value); + } + // Return the result of running canonicalize a password given value. + return url_pattern_helpers::canonicalize_password(value); +} + +tl::expected url_pattern_init::process_hostname( + std::string_view value, std::string_view type) { + ada_log("process_hostname value=", value, " type=", type); + // If type is "pattern" then return value. + if (type == "pattern") { + return std::string(value); + } + // Return the result of running canonicalize a hostname given value. + return url_pattern_helpers::canonicalize_hostname(value); +} + +tl::expected url_pattern_init::process_port( + std::string_view port, std::string_view protocol, std::string_view type) { + // If type is "pattern" then return portValue. + if (type == "pattern") { + return std::string(port); + } + // Return the result of running canonicalize a port given portValue and + // protocolValue. + return url_pattern_helpers::canonicalize_port_with_protocol(port, protocol); +} + +tl::expected url_pattern_init::process_pathname( + std::string_view value, std::string_view protocol, std::string_view type) { + // If type is "pattern" then return pathnameValue. + if (type == "pattern") { + return std::string(value); + } + + // If protocolValue is a special scheme or the empty string, then return the + // result of running canonicalize a pathname given pathnameValue. + if (protocol.empty() || scheme::is_special(protocol)) { + return url_pattern_helpers::canonicalize_pathname(value); + } + + // Return the result of running canonicalize an opaque pathname given + // pathnameValue. + return url_pattern_helpers::canonicalize_opaque_pathname(value); +} + +tl::expected url_pattern_init::process_search( + std::string_view value, std::string_view type) { + // Let strippedValue be the given value with a single leading U+003F (?) + // removed, if any. + if (value.starts_with("?")) { + value.remove_prefix(1); + } + ADA_ASSERT_TRUE(!value.starts_with("?")); + // If type is "pattern" then return strippedValue. + if (type == "pattern") { + return std::string(value); + } + // Return the result of running canonicalize a search given strippedValue. + return url_pattern_helpers::canonicalize_search(value); +} + +tl::expected url_pattern_init::process_hash( + std::string_view value, std::string_view type) { + // Let strippedValue be the given value with a single leading U+0023 (#) + // removed, if any. + if (value.starts_with("#")) { + value.remove_prefix(1); + } + ADA_ASSERT_TRUE(!value.starts_with("#")); + // If type is "pattern" then return strippedValue. + if (type == "pattern") { + return std::string(value); + } + // Return the result of running canonicalize a hash given strippedValue. + return url_pattern_helpers::canonicalize_hash(value); +} + +std::string url_pattern_options::to_string() const { + std::string answer; + answer.append("{\n"); + answer.append("\t\"ignore_case\":\""); + answer.append(ignore_case ? "true" : "false"); + answer.append("\",\n"); + answer.append("}"); + return answer; +} + +std::string url_pattern_init::to_string() const { + std::string answer; + auto back = std::back_insert_iterator(answer); + answer.append("{\n"); + + if (protocol.has_value()) { + answer.append("\t\"protocol\":\""); + helpers::encode_json(protocol.value(), back); + answer.append("\",\n"); + } + + if (username.has_value()) { + answer.append("\t\"username\":\""); + helpers::encode_json(username.value(), back); + answer.append("\",\n"); + } + + if (password.has_value()) { + answer.append("\t\"password\":\""); + helpers::encode_json(password.value(), back); + answer.append("\",\n"); + } + + if (hostname.has_value()) { + answer.append("\t\"hostname\":\""); + helpers::encode_json(hostname.value(), back); + answer.append("\",\n"); + } + + if (port.has_value()) { + answer.append("\t\"port\":\""); + helpers::encode_json(port.value(), back); + answer.append("\",\n"); + } + + if (pathname.has_value()) { + answer.append("\t\"pathname\":\""); + helpers::encode_json(pathname.value(), back); + answer.append("\",\n"); + } + + if (search.has_value()) { + answer.append("\t\"search\":\""); + helpers::encode_json(search.value(), back); + answer.append("\",\n"); + } + + if (hash.has_value()) { + answer.append("\t\"hash\":\""); + helpers::encode_json(hash.value(), back); + answer.append("\",\n"); + } + + if (base_url.has_value()) { + answer.append("\t\"base_url\":\""); + helpers::encode_json(base_url.value(), back); + answer.append("\",\n"); + } + + answer.append("}"); + return answer; +} + +template +tl::expected url_pattern_component::compile( + std::string_view input, F& encoding_callback, + url_pattern_compile_component_options& options) { + ada_log("url_pattern_component::compile input: ", input); + // Let part list be the result of running parse a pattern string given input, + // options, and encoding callback. + auto part_list = url_pattern_helpers::parse_pattern_string(input, options, + encoding_callback); + + if (!part_list) { + ada_log("parse_pattern_string failed"); + return tl::unexpected(part_list.error()); + } + + // Let (regular expression string, name list) be the result of running + // generate a regular expression and name list given part list and options. + auto [regular_expression_string, name_list] = + url_pattern_helpers::generate_regular_expression_and_name_list(*part_list, + options); + + ada_log("regular expression string: ", regular_expression_string); + + // Let flags be an empty string. + // If options’s ignore case is true then set flags to "vi". + // Otherwise set flags to "v" + auto flags = options.ignore_case + ? std::regex::icase | std::regex_constants::ECMAScript + : std::regex_constants::ECMAScript; + + // Let pattern string be the result of running generate a pattern + // string given part list and options. + auto pattern_string = + url_pattern_helpers::generate_pattern_string(*part_list, options); + + // Let regular expression be RegExpCreate(regular expression string, + // flags). If this throws an exception, catch it, and throw a + // TypeError. + std::regex regular_expression; + try { + regular_expression = std::regex(regular_expression_string, flags); + } catch (std::regex_error& error) { + (void)error; + ada_log("std::regex_error: ", error.what()); + return tl::unexpected(errors::type_error); + } + + // For each part of part list: + // - If part’s type is "regexp", then set has regexp groups to true. + const auto has_regexp = [](const auto& part) { return part.is_regexp(); }; + const bool has_regexp_groups = std::ranges::any_of(*part_list, has_regexp); + + ada_log("has regexp groups: ", has_regexp_groups); + + // Return a new component whose pattern string is pattern string, regular + // expression is regular expression, group name list is name list, and has + // regexp groups is has regexp groups. + return url_pattern_component(std::move(pattern_string), + std::move(regular_expression), flags, + std::move(name_list), has_regexp_groups); +} + +result> url_pattern::exec( + const url_pattern_input& input, std::string_view* base_url = nullptr) { + // Return the result of match given this's associated URL pattern, input, and + // baseURL if given. + return match(input, base_url); +} + +result url_pattern::test(const url_pattern_input& input, + std::string_view* base_url = nullptr) { + // TODO: Optimization opportunity. Rather than returning `url_pattern_result` + // Implement a fast path just like `can_parse()` in ada_url. + // Let result be the result of match given this's associated URL pattern, + // input, and baseURL if given. + // If result is null, return false. + if (auto result = match(input, base_url); result.has_value()) { + return result->has_value(); + } + return tl::unexpected(errors::type_error); +} + +result> url_pattern::match( + const url_pattern_input& input, std::string_view* base_url_string) { + std::string protocol{}; + std::string username{}; + std::string password{}; + std::string hostname{}; + std::string port{}; + std::string pathname{}; + std::string search{}; + std::string hash{}; + + // Let inputs be an empty list. + // Append input to inputs. + std::vector inputs{input}; + + // If input is a URLPatternInit then: + if (std::holds_alternative(input)) { + ada_log( + "url_pattern::match called with url_pattern_init and base_url_string=", + base_url_string); + // If baseURLString was given, throw a TypeError. + if (base_url_string) { + ada_log("failed to match because base_url_string was given"); + return tl::unexpected(errors::type_error); + } + + // Let applyResult be the result of process a URLPatternInit given input, + // "url", protocol, username, password, hostname, port, pathname, search, + // and hash. + auto apply_result = url_pattern_init::process( + std::get(input), "url", protocol, username, password, + hostname, port, pathname, search, hash); + + // If this throws an exception, catch it, and return null. + if (!apply_result.has_value()) { + ada_log("match returned std::nullopt because process threw"); + return std::nullopt; + } + + // Set protocol to applyResult["protocol"]. + ADA_ASSERT_TRUE(apply_result->protocol.has_value()); + protocol = apply_result->protocol.value(); + + // Set username to applyResult["username"]. + ADA_ASSERT_TRUE(apply_result->username.has_value()); + username = apply_result->username.value(); + + // Set password to applyResult["password"]. + ADA_ASSERT_TRUE(apply_result->password.has_value()); + password = apply_result->password.value(); + + // Set hostname to applyResult["hostname"]. + ADA_ASSERT_TRUE(apply_result->hostname.has_value()); + hostname = apply_result->hostname.value(); + + // Set port to applyResult["port"]. + ADA_ASSERT_TRUE(apply_result->port.has_value()); + port = apply_result->port.value(); + + // Set pathname to applyResult["pathname"]. + ADA_ASSERT_TRUE(apply_result->pathname.has_value()); + pathname = apply_result->pathname.value(); + + // Set search to applyResult["search"]. + ADA_ASSERT_TRUE(apply_result->search.has_value()); + if (apply_result->search->starts_with("?")) { + search = apply_result->search->substr(1); + } else { + search = apply_result->search.value(); + } + + // Set hash to applyResult["hash"]. + ADA_ASSERT_TRUE(apply_result->hash.has_value()); + ADA_ASSERT_TRUE(!apply_result->hash->starts_with("#")); + hash = apply_result->hash.value(); + } else { + ADA_ASSERT_TRUE(std::holds_alternative(input)); + + // Let baseURL be null. + result base_url; + + // If baseURLString was given, then: + if (base_url_string) { + // Let baseURL be the result of parsing baseURLString. + base_url = ada::parse(*base_url_string, nullptr); + + // If baseURL is failure, return null. + if (!base_url) { + ada_log("match returned std::nullopt because failed to parse base_url=", + *base_url_string); + return std::nullopt; + } + + // Append baseURLString to inputs. + inputs.emplace_back(*base_url_string); + } + + url_aggregator* base_url_value = + base_url.has_value() ? &base_url.value() : nullptr; + + // Set url to the result of parsing input given baseURL. + auto url = ada::parse(std::get(input), + base_url_value); + + // If url is failure, return null. + if (!url) { + ada_log("match returned std::nullopt because url failed"); + return std::nullopt; + } + + // Set protocol to url’s scheme. + // IMPORTANT: Not documented on the URLPattern spec, but protocol suffix ':' + // is removed. Similar work was done on workerd: + // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2038 + protocol = url->get_protocol().substr(0, url->get_protocol().size() - 1); + // Set username to url’s username. + username = url->get_username(); + // Set password to url’s password. + password = url->get_password(); + // Set hostname to url’s host, serialized, or the empty string if the value + // is null. + hostname = url->get_hostname(); + // Set port to url’s port, serialized, or the empty string if the value is + // null. + port = url->get_port(); + // Set pathname to the result of URL path serializing url. + pathname = url->get_pathname(); + // Set search to url’s query or the empty string if the value is null. + // IMPORTANT: Not documented on the URLPattern spec, but search prefix '?' + // is removed. Similar work was done on workerd: + // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2232 + if (url->has_search()) { + ADA_ASSERT_TRUE(url->get_search().starts_with("?")); + search = url->get_search().substr(1); + } else { + search = ""; + } + // Set hash to url’s fragment or the empty string if the value is null. + // IMPORTANT: Not documented on the URLPattern spec, but hash prefix '#' is + // removed. Similar work was done on workerd: + // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2242 + if (url->has_hash()) { + ADA_ASSERT_TRUE(url->get_hash().starts_with("#")); + hash = url->get_hash().substr(1); + } else { + hash = ""; + } + } + + auto regex_flags = std::regex_constants::match_any; + + // Let protocolExecResult be RegExpBuiltinExec(urlPattern’s protocol + // component's regular expression, protocol). + std::smatch protocol_exec_result_value; + auto protocol_exec_result = + std::regex_search(protocol, protocol_exec_result_value, + protocol_component.regexp, regex_flags); + + // Let usernameExecResult be RegExpBuiltinExec(urlPattern’s username + // component's regular expression, username). + std::smatch username_exec_result_value; + auto username_exec_result = + std::regex_search(username, username_exec_result_value, + username_component.regexp, regex_flags); + + // Let passwordExecResult be RegExpBuiltinExec(urlPattern’s password + // component's regular expression, password). + std::smatch password_exec_result_value; + auto password_exec_result = + std::regex_search(password, password_exec_result_value, + password_component.regexp, regex_flags); + + // Let hostnameExecResult be RegExpBuiltinExec(urlPattern’s hostname + // component's regular expression, hostname). + std::smatch hostname_exec_result_value; + auto hostname_exec_result = + std::regex_search(hostname, hostname_exec_result_value, + hostname_component.regexp, regex_flags); + + // Let portExecResult be RegExpBuiltinExec(urlPattern’s port component's + // regular expression, port). + std::smatch port_exec_result_value; + auto port_exec_result = std::regex_search(port, port_exec_result_value, + port_component.regexp, regex_flags); + + // Let pathnameExecResult be RegExpBuiltinExec(urlPattern’s pathname + // component's regular expression, pathname). + std::smatch pathname_exec_result_value; + auto pathname_exec_result = + std::regex_search(pathname, pathname_exec_result_value, + pathname_component.regexp, regex_flags); + + // Let searchExecResult be RegExpBuiltinExec(urlPattern’s search component's + // regular expression, search). + std::smatch search_exec_result_value; + auto search_exec_result = std::regex_search( + search, search_exec_result_value, search_component.regexp, regex_flags); + + // Let hashExecResult be RegExpBuiltinExec(urlPattern’s hash component's + // regular expression, hash). + std::smatch hash_exec_result_value; + auto hash_exec_result = std::regex_search(hash, hash_exec_result_value, + hash_component.regexp, regex_flags); + + // If protocolExecResult, usernameExecResult, passwordExecResult, + // hostnameExecResult, portExecResult, pathnameExecResult, searchExecResult, + // or hashExecResult are null then return null. + if (!protocol_exec_result || !username_exec_result || !password_exec_result || + !hostname_exec_result || !port_exec_result || !pathname_exec_result || + !search_exec_result || !hash_exec_result) { + return std::nullopt; + } + + // Let result be a new URLPatternResult. + auto result = url_pattern_result{}; + // Set result["inputs"] to inputs. + result.inputs = std::move(inputs); + // Set result["protocol"] to the result of creating a component match result + // given urlPattern’s protocol component, protocol, and protocolExecResult. + result.protocol = protocol_component.create_component_match_result( + protocol, protocol_exec_result_value); + + // Set result["username"] to the result of creating a component match result + // given urlPattern’s username component, username, and usernameExecResult. + result.username = username_component.create_component_match_result( + username, username_exec_result_value); + + // Set result["password"] to the result of creating a component match result + // given urlPattern’s password component, password, and passwordExecResult. + result.password = password_component.create_component_match_result( + password, password_exec_result_value); + + // Set result["hostname"] to the result of creating a component match result + // given urlPattern’s hostname component, hostname, and hostnameExecResult. + result.hostname = hostname_component.create_component_match_result( + hostname, hostname_exec_result_value); + + // Set result["port"] to the result of creating a component match result given + // urlPattern’s port component, port, and portExecResult. + result.port = port_component.create_component_match_result( + port, port_exec_result_value); + + // Set result["pathname"] to the result of creating a component match result + // given urlPattern’s pathname component, pathname, and pathnameExecResult. + result.pathname = pathname_component.create_component_match_result( + pathname, pathname_exec_result_value); + + // Set result["search"] to the result of creating a component match result + // given urlPattern’s search component, search, and searchExecResult. + result.search = search_component.create_component_match_result( + search, search_exec_result_value); + + // Set result["hash"] to the result of creating a component match result given + // urlPattern’s hash component, hash, and hashExecResult. + result.hash = hash_component.create_component_match_result( + hash, hash_exec_result_value); + + return result; +} + +} // namespace ada diff --git a/src/url_pattern_helpers.cpp b/src/url_pattern_helpers.cpp new file mode 100644 index 000000000..56927635b --- /dev/null +++ b/src/url_pattern_helpers.cpp @@ -0,0 +1,1302 @@ +#include "ada/url_pattern_helpers.h" + +#include +#include +#include + +namespace ada::url_pattern_helpers { + +std::tuple> +generate_regular_expression_and_name_list( + std::vector& part_list, + url_pattern_compile_component_options options) { + // Let result be "^" + std::string result = "^"; + + // Let name list be a new list + std::vector name_list{}; + const std::string full_wildcard_regexp_value = ".*"; + + // For each part of part list: + for (const url_pattern_part& part : part_list) { + // If part's type is "fixed-text": + if (part.type == url_pattern_part_type::FIXED_TEXT) { + // If part's modifier is "none" + if (part.modifier == url_pattern_part_modifier::NONE) { + // Append the result of running escape a regexp string given part's + // value + result += escape_regexp_string(part.value); + } else { + // A "fixed-text" part with a modifier uses a non capturing group + // (?:) + // Append "(?:" to the end of result. + result.append("(?:"); + // Append the result of running escape a regexp string given part’s + // value to the end of result. + result.append(escape_regexp_string(part.value)); + // Append ")" to the end of result. + result.append(")"); + // Append the result of running convert a modifier to a string given + // part’s modifier to the end of result. + result.append(convert_modifier_to_string(part.modifier)); + } + continue; + } + + // Assert: part's name is not the empty string + ADA_ASSERT_TRUE(!part.name.empty()); + + // Append part's name to name list + name_list.push_back(part.name); + + // Let regexp value be part's value + std::string regexp_value = part.value; + + // If part's type is "segment-wildcard" + if (part.type == url_pattern_part_type::SEGMENT_WILDCARD) { + // then set regexp value to the result of running generate a segment + // wildcard regexp given options. + regexp_value = generate_segment_wildcard_regexp(options); + } + // Otherwise if part's type is "full-wildcard" + else if (part.type == url_pattern_part_type::FULL_WILDCARD) { + // then set regexp value to full wildcard regexp value. + regexp_value = full_wildcard_regexp_value; + } + + // If part's prefix is the empty string and part's suffix is the empty + // string + if (part.prefix.empty() && part.suffix.empty()) { + // If part's modifier is "none" or "optional" + if (part.modifier == url_pattern_part_modifier::NONE || + part.modifier == url_pattern_part_modifier::OPTIONAL) { + // () + result += "(" + regexp_value + ")" + + convert_modifier_to_string(part.modifier); + } else { + // ((?:)) + result += "((?:" + regexp_value + ")" + + convert_modifier_to_string(part.modifier) + ")"; + } + continue; + } + + // If part's modifier is "none" or "optional" + if (part.modifier == url_pattern_part_modifier::NONE || + part.modifier == url_pattern_part_modifier::OPTIONAL) { + // (?:()) + result += "(?:" + escape_regexp_string(part.prefix) + "(" + regexp_value + + ")" + escape_regexp_string(part.suffix) + ")" + + convert_modifier_to_string(part.modifier); + continue; + } + + // Assert: part's modifier is "zero-or-more" or "one-or-more" + ADA_ASSERT_TRUE(part.modifier == url_pattern_part_modifier::ZERO_OR_MORE || + part.modifier == url_pattern_part_modifier::ONE_OR_MORE); + + // Assert: part's prefix is not the empty string or part's suffix is not the + // empty string + ADA_ASSERT_TRUE(!part.prefix.empty() || !part.suffix.empty()); + + // (?:((?:)(?:(?:))*))? + // Append "(?:" to the end of result. + result.append("(?:"); + // Append the result of running escape a regexp string given part’s prefix + // to the end of result. + result.append(escape_regexp_string(part.prefix)); + // Append "((?:" to the end of result. + result.append("((?:"); + // Append regexp value to the end of result. + result.append(regexp_value); + // Append ")(?:" to the end of result. + result.append(")(?:"); + // Append the result of running escape a regexp string given part’s suffix + // to the end of result. + result.append(escape_regexp_string(part.suffix)); + // Append the result of running escape a regexp string given part’s prefix + // to the end of result. + result.append(escape_regexp_string(part.prefix)); + // Append "(?:" to the end of result. + result.append("(?:"); + // Append regexp value to the end of result. + result.append(regexp_value); + // Append "))*)" to the end of result. + result.append("))*)"); + // Append the result of running escape a regexp string given part’s suffix + // to the end of result. + result.append(escape_regexp_string(part.suffix)); + // Append ")" to the end of result. + result.append(")"); + + // If part's modifier is "zero-or-more" then append "?" to the end of result + if (part.modifier == url_pattern_part_modifier::ZERO_OR_MORE) { + result += "?"; + } + } + + // Append "$" to the end of result + result += "$"; + + // Return (result, name list) + return {result, name_list}; +} + +bool is_ipv6_address(std::string_view input) noexcept { + // If input’s code point length is less than 2, then return false. + if (input.size() < 2) return false; + + // Let input code points be input interpreted as a list of code points. + // If input code points[0] is U+005B ([), then return true. + if (input.front() == '[') return true; + // If input code points[0] is U+007B ({) and input code points[1] is U+005B + // ([), then return true. + if (input.starts_with("{[")) return true; + // If input code points[0] is U+005C (\) and input code points[1] is U+005B + // ([), then return true. + return input.starts_with("\\["); +} + +std::string convert_modifier_to_string(url_pattern_part_modifier modifier) { + // TODO: Optimize this. + switch (modifier) { + // If modifier is "zero-or-more", then return "*". + case url_pattern_part_modifier::ZERO_OR_MORE: + return "*"; + // If modifier is "optional", then return "?". + case url_pattern_part_modifier::OPTIONAL: + return "?"; + // If modifier is "one-or-more", then return "+". + case url_pattern_part_modifier::ONE_OR_MORE: + return "+"; + // Return the empty string. + default: + return ""; + } +} + +std::string generate_segment_wildcard_regexp( + url_pattern_compile_component_options options) { + // Let result be "[^". + std::string result = "[^"; + // Append the result of running escape a regexp string given options’s + // delimiter code point to the end of result. + result.append(escape_regexp_string(options.get_delimiter())); + // Append "]+?" to the end of result. + result.append("]+?"); + // Return result. + ada_log("generate_segment_wildcard_regexp result: ", result); + return result; +} + +bool protocol_component_matches_special_scheme( + url_pattern_component& component) { + auto regex = component.regexp; + return std::regex_match("http", regex) || std::regex_match("https", regex) || + std::regex_match("ws", regex) || std::regex_match("wss", regex) || + std::regex_match("ftp", regex); +} + +inline std::optional +constructor_string_parser::compute_protocol_matches_special_scheme_flag() { + ada_log( + "constructor_string_parser::compute_protocol_matches_special_scheme_" + "flag"); + // Let protocol string be the result of running make a component string given + // parser. + auto protocol_string = make_component_string(); + // Let protocol component be the result of compiling a component given + // protocol string, canonicalize a protocol, and default options. + auto protocol_component = url_pattern_component::compile( + protocol_string, canonicalize_protocol, + url_pattern_compile_component_options::DEFAULT); + if (!protocol_component) { + ada_log("url_pattern_component::compile failed for protocol_string ", + protocol_string); + return protocol_component.error(); + } + // If the result of running protocol component matches a special scheme given + // protocol component is true, then set parser’s protocol matches a special + // scheme flag to true. + if (protocol_component_matches_special_scheme(*protocol_component)) { + protocol_matches_a_special_scheme_flag = true; + } + return std::nullopt; +} + +tl::expected canonicalize_protocol( + std::string_view input) { + ada_log("canonicalize_protocol called with input=", input); + // If value is the empty string, return value. + if (input.empty()) [[unlikely]] { + return ""; + } + + // IMPORTANT: Deviation from the spec. We remove the trailing ':' here. + if (input.ends_with(":")) { + input.remove_suffix(1); + } + + // Let dummyURL be a new URL record. + // Let parseResult be the result of running the basic URL parser given value + // followed by "://dummy.test", with dummyURL as url. + if (auto dummy_url = ada::parse( + std::string(input) + "://dummy.test", nullptr)) { + // IMPORTANT: Deviation from the spec. We remove the trailing ':' here. + // Since URL parser always return protocols ending with `:` + auto protocol = dummy_url->get_protocol(); + protocol.remove_suffix(1); + return std::string(protocol); + } + // If parseResult is failure, then throw a TypeError. + return tl::unexpected(errors::type_error); +} + +tl::expected canonicalize_username( + std::string_view input) { + // If value is the empty string, return value. + if (input.empty()) [[unlikely]] { + return ""; + } + // Let dummyURL be a new URL record. + auto url = ada::parse("fake://dummy.test", nullptr); + ADA_ASSERT_TRUE(url.has_value()); + // Set the username given dummyURL and value. + if (!url->set_username(input)) { + return tl::unexpected(errors::type_error); + } + // Return dummyURL’s username. + return std::string(url->get_username()); +} + +tl::expected canonicalize_password( + std::string_view input) { + // If value is the empty string, return value. + if (input.empty()) [[unlikely]] { + return ""; + } + // Let dummyURL be a new URL record. + // Set the password given dummyURL and value. + auto url = ada::parse("fake://dummy.test", nullptr); + + ADA_ASSERT_TRUE(url.has_value()); + if (!url->set_password(input)) { + return tl::unexpected(errors::type_error); + } + // Return dummyURL’s password. + return std::string(url->get_password()); +} + +tl::expected canonicalize_hostname( + std::string_view input) { + ada_log("canonicalize_hostname input=", input); + // If value is the empty string, return value. + if (input.empty()) [[unlikely]] { + return ""; + } + // Let dummyURL be a new URL record. + // Let parseResult be the result of running the basic URL parser given value + // with dummyURL as url and hostname state as state override. + + // IMPORTANT: The protocol needs to be a special protocol, otherwise the + // hostname will not be converted using IDNA. + auto url = ada::parse("https://dummy.test", nullptr); + ADA_ASSERT_TRUE(url); + // if (!isValidHostnameInput(hostname)) return kj::none; + if (!url->set_hostname(input)) { + // If parseResult is failure, then throw a TypeError. + return tl::unexpected(errors::type_error); + } + // Return dummyURL’s host, serialized, or empty string if it is null. + return std::string(url->get_hostname()); +} + +tl::expected canonicalize_ipv6_hostname( + std::string_view input) { + ada_log("canonicalize_ipv6_hostname input=", input); + // TODO: Optimization opportunity: Use lookup table to speed up checking + if (std::ranges::any_of(input, [](char c) { + return c != '[' && c != ']' && c != ':' && + !unicode::is_ascii_hex_digit(c); + })) { + return tl::unexpected(errors::type_error); + } + // Append the result of running ASCII lowercase given code point to the end of + // result. + auto hostname = std::string(input); + unicode::to_lower_ascii(hostname.data(), hostname.size()); + return hostname; +} + +tl::expected canonicalize_port( + std::string_view port_value) { + // If portValue is the empty string, return portValue. + if (port_value.empty()) [[unlikely]] { + return ""; + } + // Let dummyURL be a new URL record. + // If protocolValue was given, then set dummyURL’s scheme to protocolValue. + // Let parseResult be the result of running basic URL parser given portValue + // with dummyURL as url and port state as state override. + auto url = ada::parse("fake://dummy.test", nullptr); + ADA_ASSERT_TRUE(url); + if (url->set_port(port_value)) { + // Return dummyURL’s port, serialized, or empty string if it is null. + return std::string(url->get_port()); + } + // If parseResult is failure, then throw a TypeError. + return tl::unexpected(errors::type_error); +} + +tl::expected canonicalize_port_with_protocol( + std::string_view port_value, std::string_view protocol) { + // If portValue is the empty string, return portValue. + if (port_value.empty()) [[unlikely]] { + return ""; + } + + // TODO: Remove this + // We have an empty protocol because get_protocol() returns an empty string + // We should handle this in the caller rather than here. + if (protocol.empty()) { + protocol = "fake"; + } else if (protocol.ends_with(":")) { + protocol.remove_suffix(1); + } + // Let dummyURL be a new URL record. + // If protocolValue was given, then set dummyURL’s scheme to protocolValue. + // Let parseResult be the result of running basic URL parser given portValue + // with dummyURL as url and port state as state override. + auto url = ada::parse(std::string(protocol) + "://dummy.test", + nullptr); + // TODO: Remove has_port() check. + // This is actually a bug with url parser where set_port() returns true for + // "invalid80" port value. + if (url && url->set_port(port_value) && url->has_port()) { + // Return dummyURL’s port, serialized, or empty string if it is null. + return std::string(url->get_port()); + } + // TODO: Remove this once the previous has_port() check is removed. + if (url) { + if (scheme::is_special(protocol) && url->get_port().empty()) { + return ""; + } + } + // If parseResult is failure, then throw a TypeError. + return tl::unexpected(errors::type_error); +} + +tl::expected canonicalize_pathname( + std::string_view input) { + // If value is the empty string, then return value. + if (input.empty()) [[unlikely]] { + return ""; + } + // Let leading slash be true if the first code point in value is U+002F (/) + // and otherwise false. + const bool leading_slash = input.starts_with("/"); + // Let modified value be "/-" if leading slash is false and otherwise the + // empty string. + const auto modified_value = leading_slash ? "" : "/-"; + const auto full_url = + std::string("fake://fake-url") + modified_value + std::string(input); + if (auto url = ada::parse(full_url, nullptr)) { + const auto pathname = url->get_pathname(); + // If leading slash is false, then set result to the code point substring + // from 2 to the end of the string within result. + return leading_slash ? std::string(pathname) + : std::string(pathname.substr(2)); + } + // If parseResult is failure, then throw a TypeError. + return tl::unexpected(errors::type_error); +} + +tl::expected canonicalize_opaque_pathname( + std::string_view input) { + // If value is the empty string, return value. + if (input.empty()) [[unlikely]] { + return ""; + } + // Let dummyURL be a new URL record. + // Set dummyURL’s path to the empty string. + // Let parseResult be the result of running URL parsing given value with + // dummyURL as url and opaque path state as state override. + if (auto url = + ada::parse("fake:" + std::string(input), nullptr)) { + // Return the result of URL path serializing dummyURL. + return std::string(url->get_pathname()); + } + // If parseResult is failure, then throw a TypeError. + return tl::unexpected(errors::type_error); +} + +tl::expected canonicalize_search(std::string_view input) { + // If value is the empty string, return value. + if (input.empty()) [[unlikely]] { + return ""; + } + // Let dummyURL be a new URL record. + // Set dummyURL’s query to the empty string. + // Let parseResult be the result of running basic URL parser given value with + // dummyURL as url and query state as state override. + auto url = ada::parse("fake://dummy.test", nullptr); + ADA_ASSERT_TRUE(url.has_value()); + url->set_search(input); + if (url->has_search()) { + const auto search = url->get_search(); + return std::string(search.substr(1)); + } + return tl::unexpected(errors::type_error); +} + +tl::expected canonicalize_hash(std::string_view input) { + // If value is the empty string, return value. + if (input.empty()) [[unlikely]] { + return ""; + } + // Let dummyURL be a new URL record. + // Set dummyURL’s fragment to the empty string. + // Let parseResult be the result of running basic URL parser given value with + // dummyURL as url and fragment state as state override. + auto url = ada::parse("fake://dummy.test", nullptr); + ADA_ASSERT_TRUE(url.has_value()); + url->set_hash(input); + // Return dummyURL’s fragment. + if (url->has_hash()) { + const auto hash = url->get_hash(); + return std::string(hash.substr(1)); + } + return tl::unexpected(errors::type_error); +} + +tl::expected constructor_string_parser::parse( + std::string_view input) { + ada_log("constructor_string_parser::parse input=", input); + // Let parser be a new constructor string parser whose input is input and + // token list is the result of running tokenize given input and "lenient". + auto token_list = tokenize(input, token_policy::LENIENT); + if (!token_list) { + return tl::unexpected(token_list.error()); + } + auto parser = constructor_string_parser(input, std::move(*token_list)); + + // While parser’s token index is less than parser’s token list size: + while (parser.token_index < parser.token_list.size()) { + // Set parser’s token increment to 1. + parser.token_increment = 1; + + // If parser’s token list[parser’s token index]'s type is "end" then: + if (parser.token_list[parser.token_index].type == token_type::END) { + // If parser’s state is "init": + if (parser.state == State::INIT) { + // Run rewind given parser. + parser.rewind(); + // If the result of running is a hash prefix given parser is true, then + // run change state given parser, "hash" and 1. + if (parser.is_hash_prefix()) { + parser.change_state(State::HASH, 1); + } else if (parser.is_search_prefix()) { + // Otherwise if the result of running is a search prefix given parser + // is true: Run change state given parser, "search" and 1. + parser.change_state(State::SEARCH, 1); + } else { + // Run change state given parser, "pathname" and 0. + parser.change_state(State::PATHNAME, 0); + } + // Increment parser’s token index by parser’s token increment. + parser.token_index += parser.token_increment; + // Continue. + continue; + } + + if (parser.state == State::AUTHORITY) { + // If parser’s state is "authority": + // Run rewind and set state given parser, and "hostname". + parser.rewind(); + parser.change_state(State::HOSTNAME, 0); + // Increment parser’s token index by parser’s token increment. + parser.token_index += parser.token_increment; + // Continue. + continue; + } + + // Run change state given parser, "done" and 0. + parser.change_state(State::DONE, 0); + // Break. + break; + } + + // If the result of running is a group open given parser is true: + if (parser.is_group_open()) { + // Increment parser’s group depth by 1. + parser.group_depth += 1; + // Increment parser’s token index by parser’s token increment. + parser.token_index += parser.token_increment; + } + + // If parser’s group depth is greater than 0: + if (parser.group_depth > 0) { + // If the result of running is a group close given parser is true, then + // decrement parser’s group depth by 1. + if (parser.is_group_close()) { + parser.group_depth -= 1; + } else { + // Increment parser’s token index by parser’s token increment. + parser.token_index += parser.token_increment; + continue; + } + } + + // Switch on parser’s state and run the associated steps: + switch (parser.state) { + case State::INIT: { + // If the result of running is a protocol suffix given parser is true: + if (parser.is_protocol_suffix()) { + // Run rewind and set state given parser and "protocol". + parser.rewind(); + parser.change_state(State::PROTOCOL, 0); + } + break; + } + case State::PROTOCOL: { + // If the result of running is a protocol suffix given parser is true: + if (parser.is_protocol_suffix()) { + // Run compute protocol matches a special scheme flag given parser. + if (const auto error = + parser.compute_protocol_matches_special_scheme_flag()) { + ada_log("compute_protocol_matches_special_scheme_flag failed"); + return tl::unexpected(*error); + } + // Let next state be "pathname". + auto next_state = State::PATHNAME; + // Let skip be 1. + auto skip = 1; + // If the result of running next is authority slashes given parser is + // true: + if (parser.next_is_authority_slashes()) { + // Set next state to "authority". + next_state = State::AUTHORITY; + // Set skip to 3. + skip = 3; + } else if (parser.protocol_matches_a_special_scheme_flag) { + // Otherwise if parser’s protocol matches a special scheme flag is + // true, then set next state to "authority". + next_state = State::AUTHORITY; + } + + // Run change state given parser, next state, and skip. + parser.change_state(next_state, skip); + } + break; + } + case State::AUTHORITY: { + // If the result of running is an identity terminator given parser is + // true, then run rewind and set state given parser and "username". + if (parser.is_an_identity_terminator()) { + parser.rewind(); + parser.change_state(State::USERNAME, 0); + } else if (parser.is_pathname_start() || parser.is_search_prefix() || + parser.is_hash_prefix()) { + // Otherwise if any of the following are true: + // - the result of running is a pathname start given parser; + // - the result of running is a search prefix given parser; or + // - the result of running is a hash prefix given parser, + // then run rewind and set state given parser and "hostname". + parser.rewind(); + parser.change_state(State::HOSTNAME, 0); + } + break; + } + case State::USERNAME: { + // If the result of running is a password prefix given parser is true, + // then run change state given parser, "password", and 1. + if (parser.is_password_prefix()) { + parser.change_state(State::PASSWORD, 1); + } else if (parser.is_an_identity_terminator()) { + // Otherwise if the result of running is an identity terminator given + // parser is true, then run change state given parser, "hostname", + // and 1. + parser.change_state(State::HOSTNAME, 1); + } + break; + } + case State::PASSWORD: { + // If the result of running is an identity terminator given parser is + // true, then run change state given parser, "hostname", and 1. + if (parser.is_an_identity_terminator()) { + parser.change_state(State::HOSTNAME, 1); + } + break; + } + case State::HOSTNAME: { + // If the result of running is an IPv6 open given parser is true, then + // increment parser’s hostname IPv6 bracket depth by 1. + if (parser.is_an_ipv6_open()) { + parser.hostname_ipv6_bracket_depth += 1; + } else if (parser.is_an_ipv6_close()) { + // Otherwise if the result of running is an IPv6 close given parser is + // true, then decrement parser’s hostname IPv6 bracket depth by 1. + parser.hostname_ipv6_bracket_depth -= 1; + } else if (parser.is_port_prefix() && + parser.hostname_ipv6_bracket_depth == 0) { + // Otherwise if the result of running is a port prefix given parser is + // true and parser’s hostname IPv6 bracket depth is zero, then run + // change state given parser, "port", and 1. + parser.change_state(State::PORT, 1); + } else if (parser.is_pathname_start()) { + // Otherwise if the result of running is a pathname start given parser + // is true, then run change state given parser, "pathname", and 0. + parser.change_state(State::PATHNAME, 0); + } else if (parser.is_search_prefix()) { + // Otherwise if the result of running is a search prefix given parser + // is true, then run change state given parser, "search", and 1. + parser.change_state(State::SEARCH, 1); + } else if (parser.is_hash_prefix()) { + // Otherwise if the result of running is a hash prefix given parser is + // true, then run change state given parser, "hash", and 1. + parser.change_state(State::HASH, 1); + } + + break; + } + case State::PORT: { + // If the result of running is a pathname start given parser is true, + // then run change state given parser, "pathname", and 0. + if (parser.is_pathname_start()) { + parser.change_state(State::PATHNAME, 0); + } else if (parser.is_search_prefix()) { + // Otherwise if the result of running is a search prefix given parser + // is true, then run change state given parser, "search", and 1. + parser.change_state(State::SEARCH, 1); + } else if (parser.is_hash_prefix()) { + // Otherwise if the result of running is a hash prefix given parser is + // true, then run change state given parser, "hash", and 1. + parser.change_state(State::HASH, 1); + } + break; + } + case State::PATHNAME: { + // If the result of running is a search prefix given parser is true, + // then run change state given parser, "search", and 1. + if (parser.is_search_prefix()) { + parser.change_state(State::SEARCH, 1); + } else if (parser.is_hash_prefix()) { + // Otherwise if the result of running is a hash prefix given parser is + // true, then run change state given parser, "hash", and 1. + parser.change_state(State::HASH, 1); + } + break; + } + case State::SEARCH: { + // If the result of running is a hash prefix given parser is true, then + // run change state given parser, "hash", and 1. + if (parser.is_hash_prefix()) { + parser.change_state(State::HASH, 1); + } + } + case State::HASH: { + // Do nothing + break; + } + default: { + // Assert: This step is never reached. + unreachable(); + } + } + + // Increment parser’s token index by parser’s token increment. + parser.token_index += parser.token_increment; + } + + // If parser’s result contains "hostname" and not "port", then set parser’s + // result["port"] to the empty string. + if (parser.result.hostname && !parser.result.port) { + parser.result.port = ""; + } + + // Return parser’s result. + return parser.result; +} + +tl::expected, errors> tokenize(std::string_view input, + token_policy policy) { + ada_log("tokenize input: ", input); + // Let tokenizer be a new tokenizer. + // Set tokenizer’s input to input. + // Set tokenizer’s policy to policy. + auto tokenizer = Tokenizer(input, policy); + // While tokenizer’s index is less than tokenizer’s input's code point length: + while (tokenizer.index < tokenizer.input.size()) { + // Run seek and get the next code point given tokenizer and tokenizer’s + // index. + tokenizer.seek_and_get_next_code_point(tokenizer.index); + + // If tokenizer’s code point is U+002A (*): + if (tokenizer.code_point == '*') { + // Run add a token with default position and length given tokenizer and + // "asterisk". + tokenizer.add_token_with_defaults(token_type::ASTERISK); + ada_log("add ASTERISK token"); + // Continue. + continue; + } + + // If tokenizer’s code point is U+002B (+) or U+003F (?): + if (tokenizer.code_point == '+' || tokenizer.code_point == '?') { + // Run add a token with default position and length given tokenizer and + // "other-modifier". + tokenizer.add_token_with_defaults(token_type::OTHER_MODIFIER); + // Continue. + continue; + } + + // If tokenizer’s code point is U+005C (\): + if (tokenizer.code_point == '\\') { + // If tokenizer’s index is equal to tokenizer’s input's code point length + // − 1: + if (tokenizer.index == tokenizer.input.size() - 1) { + // Run process a tokenizing error given tokenizer, tokenizer’s next + // index, and tokenizer’s index. + if (auto error = tokenizer.process_tokenizing_error( + tokenizer.next_index, tokenizer.index)) { + ada_log("process_tokenizing_error failed"); + return tl::unexpected(*error); + } + continue; + } + + // Let escaped index be tokenizer’s next index. + auto escaped_index = tokenizer.next_index; + // Run get the next code point given tokenizer. + tokenizer.get_next_code_point(); + // Run add a token with default length given tokenizer, "escaped-char", + // tokenizer’s next index, and escaped index. + tokenizer.add_token_with_default_length( + token_type::ESCAPED_CHAR, tokenizer.next_index, escaped_index); + ada_log("add ESCAPED_CHAR token on next_index ", tokenizer.next_index, + " with escaped index ", escaped_index); + // Continue. + continue; + } + + // If tokenizer’s code point is U+007B ({): + if (tokenizer.code_point == '{') { + // Run add a token with default position and length given tokenizer and + // "open". + tokenizer.add_token_with_defaults(token_type::OPEN); + ada_log("add OPEN token"); + continue; + } + + // If tokenizer’s code point is U+007D (}): + if (tokenizer.code_point == '}') { + // Run add a token with default position and length given tokenizer and + // "close". + tokenizer.add_token_with_defaults(token_type::CLOSE); + ada_log("add CLOSE token"); + continue; + } + + // If tokenizer’s code point is U+003A (:): + if (tokenizer.code_point == ':') { + // Let name position be tokenizer’s next index. + auto name_position = tokenizer.next_index; + // Let name start be name position. + auto name_start = name_position; + // While name position is less than tokenizer’s input's code point length: + while (name_position < tokenizer.input.size()) { + // Run seek and get the next code point given tokenizer and name + // position. + tokenizer.seek_and_get_next_code_point(name_position); + // Let first code point be true if name position equals name start and + // false otherwise. + bool first_code_point = name_position == name_start; + // Let valid code point be the result of running is a valid name code + // point given tokenizer’s code point and first code point. + auto valid_code_point = + idna::valid_name_code_point(tokenizer.code_point, first_code_point); + ada_log("tokenizer.code_point=", uint32_t(tokenizer.code_point), + " first_code_point=", first_code_point, + " valid_code_point=", valid_code_point); + // If valid code point is false break. + if (!valid_code_point) break; + // Set name position to tokenizer’s next index. + name_position = tokenizer.next_index; + } + + // If name position is less than or equal to name start: + if (name_position <= name_start) { + // Run process a tokenizing error given tokenizer, name start, and + // tokenizer’s index. + if (auto error = tokenizer.process_tokenizing_error(name_start, + tokenizer.index)) { + ada_log("process_tokenizing_error failed"); + return tl::unexpected(*error); + } + // Continue + continue; + } + + // Run add a token with default length given tokenizer, "name", name + // position, and name start. + tokenizer.add_token_with_default_length(token_type::NAME, name_position, + name_start); + continue; + } + + // If tokenizer’s code point is U+0028 ((): + if (tokenizer.code_point == '(') { + // Let depth be 1. + size_t depth = 1; + // Let regexp position be tokenizer’s next index. + auto regexp_position = tokenizer.next_index; + // Let regexp start be regexp position. + auto regexp_start = regexp_position; + // Let error be false. + bool error = false; + + // While regexp position is less than tokenizer’s input's code point + // length: + while (regexp_position < tokenizer.input.size()) { + // Run seek and get the next code point given tokenizer and regexp + // position. + tokenizer.seek_and_get_next_code_point(regexp_position); + + // TODO: Optimization opportunity: The next 2 if statements can be + // merged. If the result of running is ASCII given tokenizer’s code + // point is false: + if (!unicode::is_ascii(tokenizer.code_point)) { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index)) { + return tl::unexpected(*process_error); + } + // Set error to true. + error = true; + break; + } + + // If regexp position equals regexp start and tokenizer’s code point is + // U+003F (?): + if (regexp_position == regexp_start && tokenizer.code_point == '?') { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index)) { + return tl::unexpected(*process_error); + } + // Set error to true; + error = true; + break; + } + + // If tokenizer’s code point is U+005C (\): + if (tokenizer.code_point == '\\') { + // If regexp position equals tokenizer’s input's code point length − 1 + if (regexp_position == tokenizer.input.size() - 1) { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index)) { + return tl::unexpected(*process_error); + } + // Set error to true. + error = true; + break; + } + // Run get the next code point given tokenizer. + tokenizer.get_next_code_point(); + // If the result of running is ASCII given tokenizer’s code point is + // false: + if (!unicode::is_ascii(tokenizer.code_point)) { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index); + process_error.has_value()) { + return tl::unexpected(*process_error); + } + // Set error to true. + error = true; + break; + } + // Set regexp position to tokenizer’s next index. + regexp_position = tokenizer.next_index; + continue; + } + + // If tokenizer’s code point is U+0029 ()): + if (tokenizer.code_point == ')') { + // Decrement depth by 1. + depth--; + // If depth is 0: + if (depth == 0) { + // Set regexp position to tokenizer’s next index. + regexp_position = tokenizer.next_index; + // Break. + break; + } + } else if (tokenizer.code_point == '(') { + // Otherwise if tokenizer’s code point is U+0028 ((): + // Increment depth by 1. + depth++; + // If regexp position equals tokenizer’s input's code point length − + // 1: + if (regexp_position == tokenizer.input.size() - 1) { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index)) { + return tl::unexpected(*process_error); + } + // Set error to true. + error = true; + break; + } + // Let temporary position be tokenizer’s next index. + auto temporary_position = tokenizer.next_index; + // Run get the next code point given tokenizer. + tokenizer.get_next_code_point(); + // If tokenizer’s code point is not U+003F (?): + if (tokenizer.code_point != '?') { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index)) { + return tl::unexpected(*process_error); + } + // Set error to true. + error = true; + break; + } + // Set tokenizer’s next index to temporary position. + tokenizer.next_index = temporary_position; + } + // Set regexp position to tokenizer’s next index. + regexp_position = tokenizer.next_index; + } + + // If error is true continue. + if (error) continue; + // If depth is not zero: + if (depth != 0) { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index)) { + return tl::unexpected(*process_error); + } + continue; + } + // Let regexp length be regexp position − regexp start − 1. + auto regexp_length = regexp_position - regexp_start - 1; + // If regexp length is zero: + if (regexp_length == 0) { + // Run process a tokenizing error given tokenizer, regexp start, and + // tokenizer’s index. + if (auto process_error = tokenizer.process_tokenizing_error( + regexp_start, tokenizer.index)) { + ada_log("process_tokenizing_error failed"); + return tl::unexpected(*process_error); + } + continue; + } + // Run add a token given tokenizer, "regexp", regexp position, regexp + // start, and regexp length. + tokenizer.add_token(token_type::REGEXP, regexp_position, regexp_start, + regexp_length); + continue; + } + // Run add a token with default position and length given tokenizer and + // "char". + tokenizer.add_token_with_defaults(token_type::CHAR); + } + // Run add a token with default length given tokenizer, "end", tokenizer’s + // index, and tokenizer’s index. + tokenizer.add_token_with_default_length(token_type::END, tokenizer.index, + tokenizer.index); + + ada_log("tokenizer.token_list size is: ", tokenizer.token_list.size()); + // Return tokenizer’s token list. + return tokenizer.token_list; +} + +std::string escape_pattern_string(std::string_view input) { + ada_log("escape_pattern_string called with input=", input); + if (input.empty()) [[unlikely]] { + return ""; + } + // Assert: input is an ASCII string. + ADA_ASSERT_TRUE(ada::idna::is_ascii(input)); + // Let result be the empty string. + std::string result{}; + result.reserve(input.size()); + + // TODO: Optimization opportunity: Use a lookup table + constexpr auto should_escape = [](const char c) { + return c == '+' || c == '*' || c == '?' || c == ':' || c == '{' || + c == '}' || c == '(' || c == ')' || c == '\\'; + }; + + // While index is less than input’s length: + for (const auto& c : input) { + if (should_escape(c)) { + // then append U+005C (\) to the end of result. + result.append("\\"); + } + + // Append c to the end of result. + result += c; + } + // Return result. + return result; +} + +namespace { +constexpr std::array escape_regexp_table = []() consteval { + std::array out{}; + for (auto& c : {'.', '+', '*', '?', '^', '$', '{', '}', '(', ')', '[', ']', + '|', '/', '\\'}) { + out[c] = 1; + } + return out; +}(); + +constexpr bool should_escape_regexp_char(char c) { + return escape_regexp_table[(uint8_t)c]; +} +} // namespace + +std::string escape_regexp_string(std::string_view input) { + // Assert: input is an ASCII string. + ADA_ASSERT_TRUE(idna::is_ascii(input)); + // Let result be the empty string. + std::string result{}; + result.reserve(input.size()); + for (const auto& c : input) { + // TODO: Optimize this even further + if (should_escape_regexp_char(c)) { + result.append(std::string("\\") + c); + } else { + result.push_back(c); + } + } + return result; +} + +std::string process_base_url_string(std::string_view input, + std::string_view type) { + // If type is not "pattern" return input. + if (type != "pattern") { + return std::string(input); + } + // Return the result of escaping a pattern string given input. + return escape_pattern_string(input); +} + +constexpr bool is_absolute_pathname(std::string_view input, + std::string_view type) noexcept { + // If input is the empty string, then return false. + if (input.empty()) [[unlikely]] { + return false; + } + // If input[0] is U+002F (/), then return true. + if (input.starts_with("/")) return true; + // If type is "url", then return false. + if (type == "url") return false; + // If input’s code point length is less than 2, then return false. + if (input.size() < 2) return false; + // If input[0] is U+005C (\) and input[1] is U+002F (/), then return true. + if (input.starts_with("\\/")) return true; + // If input[0] is U+007B ({) and input[1] is U+002F (/), then return true. + if (input.starts_with("{/")) return true; + // Return false. + return false; +} + +std::string generate_pattern_string( + std::vector& part_list, + url_pattern_compile_component_options& options) { + // Let result be the empty string. + std::string result{}; + // Let index list be the result of getting the indices for part list. + // For each index of index list: + for (size_t index = 0; index < part_list.size(); index++) { + // Let part be part list[index]. + auto part = part_list[index]; + // Let previous part be part list[index - 1] if index is greater than 0, + // otherwise let it be null. + // TODO: Optimization opportunity. Find a way to avoid making a copy here. + std::optional previous_part = + index == 0 ? std::nullopt : std::optional(part_list[index - 1]); + // Let next part be part list[index + 1] if index is less than index list’s + // size - 1, otherwise let it be null. + std::optional next_part = + index < part_list.size() - 1 ? std::optional(part_list[index + 1]) + : std::nullopt; + // If part’s type is "fixed-text" then: + if (part.type == url_pattern_part_type::FIXED_TEXT) { + // If part’s modifier is "none" then: + if (part.modifier == url_pattern_part_modifier::NONE) { + // Append the result of running escape a pattern string given part’s + // value to the end of result. + result.append(escape_pattern_string(part.value)); + continue; + } + // Append "{" to the end of result. + result += "{"; + // Append the result of running escape a pattern string given part’s value + // to the end of result. + result.append(escape_pattern_string(part.value)); + // Append "}" to the end of result. + result += "}"; + // Append the result of running convert a modifier to a string given + // part’s modifier to the end of result. + result.append(convert_modifier_to_string(part.modifier)); + continue; + } + // Let custom name be true if part’s name[0] is not an ASCII digit; + // otherwise false. + bool custom_name = !unicode::is_ascii_digit(part.name[0]); + // Let needs grouping be true if at least one of the following are true, + // otherwise let it be false: + // - part’s suffix is not the empty string. + // - part’s prefix is not the empty string and is not options’s prefix code + // point. + bool needs_grouping = + !part.suffix.empty() || + (!part.prefix.empty() && part.prefix[0] != options.get_prefix()[0]); + + // If all of the following are true: + // - needs grouping is false; and + // - custom name is true; and + // - part’s type is "segment-wildcard"; and + // - part’s modifier is "none"; and + // - next part is not null; and + // - next part’s prefix is the empty string; and + // - next part’s suffix is the empty string + if (!needs_grouping && custom_name && + part.type == url_pattern_part_type::SEGMENT_WILDCARD && + part.modifier == url_pattern_part_modifier::NONE && + next_part.has_value() && next_part->prefix.empty() && + next_part->suffix.empty()) { + // If next part’s type is "fixed-text": + if (next_part->type == url_pattern_part_type::FIXED_TEXT) { + // Set needs grouping to true if the result of running is a valid name + // code point given next part’s value's first code point and the boolean + // false is true. + if (idna::valid_name_code_point(next_part->value[0], false)) { + needs_grouping = true; + } + } else { + // Set needs grouping to true if next part’s name[0] is an ASCII digit. + needs_grouping = !next_part->name.empty() && + unicode::is_ascii_digit(next_part->name[0]); + } + } + + // If all of the following are true: + // - needs grouping is false; and + // - part’s prefix is the empty string; and + // - previous part is not null; and + // - previous part’s type is "fixed-text"; and + // - previous part’s value's last code point is options’s prefix code point. + // then set needs grouping to true. + if (!needs_grouping && part.prefix.empty() && previous_part.has_value() && + previous_part->type == url_pattern_part_type::FIXED_TEXT && + !options.get_prefix().empty() && + previous_part->value.at(previous_part->value.size() - 1) == + options.get_prefix()[0]) { + needs_grouping = true; + } + + // Assert: part’s name is not the empty string or null. + ADA_ASSERT_TRUE(!part.name.empty()); + + // If needs grouping is true, then append "{" to the end of result. + if (needs_grouping) { + result.append("{"); + } + + // Append the result of running escape a pattern string given part’s prefix + // to the end of result. + result.append(escape_pattern_string(part.prefix)); + + // If custom name is true: + if (custom_name) { + // Append ":" to the end of result. + result.append(":"); + // Append part’s name to the end of result. + result.append(part.name); + } + + // If part’s type is "regexp" then: + if (part.type == url_pattern_part_type::REGEXP) { + // Append "(" to the end of result. + result.append("("); + // Append part’s value to the end of result. + result.append(part.value); + // Append ")" to the end of result. + result.append(")"); + } else if (part.type == url_pattern_part_type::SEGMENT_WILDCARD && + !custom_name) { + // Otherwise if part’s type is "segment-wildcard" and custom name is + // false: Append "(" to the end of result. + result.append("("); + // Append the result of running generate a segment wildcard regexp given + // options to the end of result. + result.append(generate_segment_wildcard_regexp(options)); + // Append ")" to the end of result. + result.append(")"); + } else if (part.type == url_pattern_part_type::FULL_WILDCARD) { + // Otherwise if part’s type is "full-wildcard": + // If custom name is false and one of the following is true: + // - previous part is null; or + // - previous part’s type is "fixed-text"; or + // - previous part’s modifier is not "none"; or + // - needs grouping is true; or + // - part’s prefix is not the empty string + // - then append "*" to the end of result. + if (!custom_name && + (!previous_part.has_value() || + previous_part->type == url_pattern_part_type::FIXED_TEXT || + previous_part->modifier != url_pattern_part_modifier::NONE || + needs_grouping || !part.prefix.empty())) { + result.append("*"); + } else { + // Append "(" to the end of result. + // Append full wildcard regexp value to the end of result. + // Append ")" to the end of result. + result.append("(.*)"); + } + } + + // If all of the following are true: + // - part’s type is "segment-wildcard"; and + // - custom name is true; and + // - part’s suffix is not the empty string; and + // - The result of running is a valid name code point given part’s suffix's + // first code point and the boolean false is true then append U+005C (\) to + // the end of result. + if (part.type == url_pattern_part_type::SEGMENT_WILDCARD && custom_name && + !part.suffix.empty() && + idna::valid_name_code_point(part.suffix[0], false)) { + result.append("\\"); + } + + // Append the result of running escape a pattern string given part’s suffix + // to the end of result. + result.append(escape_pattern_string(part.suffix)); + // If needs grouping is true, then append "}" to the end of result. + if (needs_grouping) result.append("}"); + // Append the result of running convert a modifier to a string given part’s + // modifier to the end of result. + result.append(convert_modifier_to_string(part.modifier)); + } + // Return result. + return result; +} + +} // namespace ada::url_pattern_helpers diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f9a242b03..5efbe8a6a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,6 +7,14 @@ if(MSVC) add_compile_options("$<$:/utf-8>") endif() +if(ADA_LOGGING) + add_compile_definitions(ADA_LOGGING=1) +endif() + +if(ADA_TESTING) + add_compile_definitions(ADA_TESTING=1) +endif() + include(${PROJECT_SOURCE_DIR}/cmake/add-cpp-test.cmake) link_libraries(ada) @@ -25,21 +33,24 @@ if(MSVC AND BUILD_SHARED_LIBS) message(STATUS "Thus the tests are disabled. Sorry.") else() include(GoogleTest) - add_executable(wpt_tests wpt_tests.cpp) + add_executable(wpt_url_tests wpt_url_tests.cpp) + add_executable(wpt_urlpattern_tests wpt_urlpattern_tests.cpp) add_executable(url_components url_components.cpp) add_executable(basic_tests basic_tests.cpp) add_executable(from_file_tests from_file_tests.cpp) add_executable(ada_c ada_c.cpp) add_executable(url_search_params url_search_params.cpp) - target_link_libraries(wpt_tests PRIVATE simdjson GTest::gtest_main) + target_link_libraries(wpt_url_tests PRIVATE simdjson GTest::gtest_main) + target_link_libraries(wpt_urlpattern_tests PRIVATE simdjson GTest::gtest_main) target_link_libraries(url_components PRIVATE simdjson GTest::gtest_main) target_link_libraries(basic_tests PRIVATE simdjson GTest::gtest_main) target_link_libraries(from_file_tests PRIVATE simdjson GTest::gtest_main) target_link_libraries(ada_c PRIVATE simdjson GTest::gtest_main) target_link_libraries(url_search_params PRIVATE simdjson GTest::gtest_main) - gtest_discover_tests(wpt_tests PROPERTIES TEST_DISCOVERY_TIMEOUT 600) + gtest_discover_tests(wpt_url_tests PROPERTIES TEST_DISCOVERY_TIMEOUT 600) + gtest_discover_tests(wpt_urlpattern_tests PROPERTIES TEST_DISCOVERY_TIMEOUT 600) gtest_discover_tests(url_components PROPERTIES TEST_DISCOVERY_TIMEOUT 600) gtest_discover_tests(basic_tests PROPERTIES TEST_DISCOVERY_TIMEOUT 600) gtest_discover_tests(from_file_tests PROPERTIES TEST_DISCOVERY_TIMEOUT 600) @@ -48,14 +59,16 @@ else() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9) - target_link_libraries(wpt_tests PUBLIC stdc++fs) + target_link_libraries(wpt_url_tests PUBLIC stdc++fs) + target_link_libraries(wpt_urlpattern_tests PUBLIC stdc++fs) target_link_libraries(url_components PUBLIC stdc++fs) target_link_libraries(url_search_params PUBLIC stdc++fs) endif() endif() if(MSVC OR MINGW) - target_compile_definitions(wpt_tests PRIVATE _CRT_SECURE_NO_WARNINGS) + target_compile_definitions(wpt_url_tests PRIVATE _CRT_SECURE_NO_WARNINGS) + target_compile_definitions(wpt_urlpattern_tests PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(url_components PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(basic_fuzzer PRIVATE _CRT_SECURE_NO_WARNINGS) target_compile_definitions(from_file_tests PRIVATE _CRT_SECURE_NO_WARNINGS) diff --git a/tests/wpt/urlpatterntestdata.json b/tests/wpt/urlpatterntestdata.json index 058079bb6..18a696d7f 100644 --- a/tests/wpt/urlpatterntestdata.json +++ b/tests/wpt/urlpatterntestdata.json @@ -379,9 +379,8 @@ { "pattern": [{ "pathname": "/foo/:bar?" }], "inputs": [{ "pathname": "/foo" }], - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { - "pathname": { "input": "/foo", "groups": { "bar": null } } + "pathname": { "input": "/foo", "groups": {} } } }, { @@ -445,9 +444,8 @@ { "pattern": [{ "pathname": "/foo/:bar*" }], "inputs": [{ "pathname": "/foo" }], - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { - "pathname": { "input": "/foo", "groups": { "bar": null } } + "pathname": { "input": "/foo", "groups": {} } } }, { @@ -500,17 +498,15 @@ "expected_obj": { "pathname": "/foo/*?" }, - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { - "pathname": { "input": "/foo", "groups": { "0": null } } + "pathname": { "input": "/foo", "groups": {} } } }, { "pattern": [{ "pathname": "/foo/*?" }], "inputs": [{ "pathname": "/foo" }], - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { - "pathname": { "input": "/foo", "groups": { "0": null } } + "pathname": { "input": "/foo", "groups": {} } } }, { @@ -686,17 +682,15 @@ "expected_obj": { "pathname": "/foo/**" }, - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { - "pathname": { "input": "/foo", "groups": { "0": null } } + "pathname": { "input": "/foo", "groups": {} } } }, { "pattern": [{ "pathname": "/foo/**" }], "inputs": [{ "pathname": "/foo" }], - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { - "pathname": { "input": "/foo", "groups": { "0": null } } + "pathname": { "input": "/foo", "groups": {} } } }, { @@ -1145,6 +1139,13 @@ { "pattern": [{ "protocol": "http", "port": "80 " }], "inputs": [{ "protocol": "http", "port": "80" }], + "exactly_empty_components": ["port"], + "expected_match": { + "protocol": { "input": "http", "groups": {} } + } + }, + { + "pattern": [{ "protocol": "http", "port": "100000" }], "expected_obj": "error" }, { @@ -1380,10 +1381,6 @@ "pathname": { "input": "8675309", "groups": { "number": "8675309" }} } }, - { - "pattern": [{ "pathname": "/(\\m)" }], - "expected_obj": "error" - }, { "pattern": [{ "pathname": "/foo!" }], "inputs": [{ "pathname": "/foo!" }], @@ -1806,11 +1803,6 @@ "pathname": { "input": "/foo", "groups": {} } } }, - { - "pattern": [ "https://{sub.}?example{.com/}foo" ], - "inputs": [ "https://example.com/foo" ], - "expected_obj": "error" - }, { "pattern": [ "{https://}example.com/foo" ], "inputs": [ "https://example.com/foo" ], @@ -1825,10 +1817,9 @@ "hostname": "(sub.)?example.com", "pathname": "/foo" }, - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { "protocol": { "input": "https", "groups": {} }, - "hostname": { "input": "example.com", "groups": { "0": null } }, + "hostname": { "input": "example.com", "groups": {} }, "pathname": { "input": "/foo", "groups": {} } } }, @@ -1862,10 +1853,9 @@ "hostname": "(sub(?:.))?example.com", "pathname": "/foo" }, - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { "protocol": { "input": "https", "groups": {} }, - "hostname": { "input": "example.com", "groups": { "0": null } }, + "hostname": { "input": "example.com", "groups": {} }, "pathname": { "input": "/foo", "groups": {} } } }, @@ -2297,10 +2287,9 @@ "protocol": "data", "pathname": "text/javascript,let x = 100/:tens?5;" }, - "//": "The `null` below is translated to undefined in the test harness.", "expected_match": { "protocol": { "input": "data", "groups": {} }, - "pathname": { "input": "text/javascript,let x = 100/5;", "groups": { "tens": null } } + "pathname": { "input": "text/javascript,let x = 100/5;", "groups": {} } } }, { @@ -2367,7 +2356,10 @@ }, { "pattern": [{ "hostname": "bad#hostname" }], - "expected_obj": "error" + "inputs": [{ "hostname": "bad" }], + "expected_match": { + "hostname": { "input": "bad", "groups": {} } + } }, { "pattern": [{ "hostname": "bad%hostname" }], @@ -2375,7 +2367,10 @@ }, { "pattern": [{ "hostname": "bad/hostname" }], - "expected_obj": "error" + "inputs": [{ "hostname": "bad" }], + "expected_match": { + "hostname": { "input": "bad", "groups": {} } + } }, { "pattern": [{ "hostname": "bad\\:hostname" }], @@ -2405,10 +2400,6 @@ "pattern": [{ "hostname": "bad]hostname" }], "expected_obj": "error" }, - { - "pattern": [{ "hostname": "bad\\\\hostname" }], - "expected_obj": "error" - }, { "pattern": [{ "hostname": "bad^hostname" }], "expected_obj": "error" @@ -2417,18 +2408,6 @@ "pattern": [{ "hostname": "bad|hostname" }], "expected_obj": "error" }, - { - "pattern": [{ "hostname": "bad\nhostname" }], - "expected_obj": "error" - }, - { - "pattern": [{ "hostname": "bad\rhostname" }], - "expected_obj": "error" - }, - { - "pattern": [{ "hostname": "bad\thostname" }], - "expected_obj": "error" - }, { "pattern": [{}], "inputs": ["https://example.com/"], @@ -2614,17 +2593,6 @@ "pathname": { "input": "foobar", "groups": { "foo": "foo" }} } }, - { - "pattern": [{ "pathname": "*{}**?" }], - "inputs": [{ "pathname": "foobar" }], - "expected_obj": { - "pathname": "*(.*)?" - }, - "//": "The `null` below is translated to undefined in the test harness.", - "expected_match": { - "pathname": { "input": "foobar", "groups": { "0": "foobar", "1": null }} - } - }, { "pattern": [{ "pathname": ":foo(baz)(.*)" }], "inputs": [{ "pathname": "bazbar" }], @@ -2829,29 +2797,5 @@ "search": { "input": "q=*&v=?&hmm={}&umm=()", "groups": {} }, "hash": { "input": "foo", "groups": {} } } - }, - { - "pattern": [{ "pathname": "/([[a-z]--a])" }], - "inputs": [{ "pathname": "/a" }], - "expected_match": null - }, - { - "pattern": [{ "pathname": "/([[a-z]--a])" }], - "inputs": [{ "pathname": "/z" }], - "expected_match": { - "pathname": { "input": "/z", "groups": { "0": "z" } } - } - }, - { - "pattern": [{ "pathname": "/([\\d&&[0-1]])" }], - "inputs": [{ "pathname": "/0" }], - "expected_match": { - "pathname": { "input": "/0", "groups": { "0": "0" } } - } - }, - { - "pattern": [{ "pathname": "/([\\d&&[0-1]])" }], - "inputs": [{ "pathname": "/3" }], - "expected_match": null } ] diff --git a/tests/wpt_tests.cpp b/tests/wpt_url_tests.cpp similarity index 98% rename from tests/wpt_tests.cpp rename to tests/wpt_url_tests.cpp index 1e5f7c567..7d916780d 100644 --- a/tests/wpt_tests.cpp +++ b/tests/wpt_url_tests.cpp @@ -52,8 +52,8 @@ const char *VERIFYDNSLENGTH_TESTS_JSON = using Types = testing::Types; template -struct wpt_tests_typed : testing::Test {}; -TYPED_TEST_SUITE(wpt_tests_typed, Types); +struct wpt_url_tests_typed : testing::Test {}; +TYPED_TEST_SUITE(wpt_url_tests_typed, Types); std::stringstream error_buffer; @@ -70,7 +70,7 @@ bool file_exists(const char *filename) { } } -TEST(wpt_tests, idna_test_v2_to_ascii) { +TEST(wpt_url_tests, idna_test_v2_to_ascii) { ondemand::parser parser; ASSERT_TRUE(file_exists(IDNA_TEST_V2)); padded_string json = padded_string::load(IDNA_TEST_V2); @@ -106,7 +106,7 @@ TEST(wpt_tests, idna_test_v2_to_ascii) { SUCCEED(); } -TEST(wpt_tests, percent_encoding) { +TEST(wpt_url_tests, percent_encoding) { ondemand::parser parser; size_t counter{0}; @@ -149,7 +149,7 @@ TEST(wpt_tests, percent_encoding) { SUCCEED(); } -TYPED_TEST(wpt_tests_typed, setters_tests_encoding) { +TYPED_TEST(wpt_url_tests_typed, setters_tests_encoding) { for (auto source : {SETTERS_TESTS_JSON, ADA_SETTERS_TESTS_JSON}) { ondemand::parser parser; ASSERT_TRUE(file_exists(source)); @@ -255,7 +255,7 @@ TYPED_TEST(wpt_tests_typed, setters_tests_encoding) { SUCCEED(); } -TYPED_TEST(wpt_tests_typed, toascii_encoding) { +TYPED_TEST(wpt_url_tests_typed, toascii_encoding) { ondemand::parser parser; ASSERT_TRUE(file_exists(TOASCII_JSON)); padded_string json = padded_string::load(TOASCII_JSON); @@ -335,7 +335,7 @@ TYPED_TEST(wpt_tests_typed, toascii_encoding) { SUCCEED(); } -TYPED_TEST(wpt_tests_typed, urltestdata_encoding) { +TYPED_TEST(wpt_url_tests_typed, urltestdata_encoding) { for (auto source : {URLTESTDATA_JSON, ADA_URLTESTDATA_JSON}) { ondemand::parser parser; size_t counter{}; @@ -460,7 +460,7 @@ TYPED_TEST(wpt_tests_typed, urltestdata_encoding) { SUCCEED(); } -TEST(wpt_tests, verify_dns_length) { +TEST(wpt_url_tests, verify_dns_length) { const char *source = VERIFYDNSLENGTH_TESTS_JSON; size_t counter{}; ondemand::parser parser; diff --git a/tests/wpt_urlpattern_tests.cpp b/tests/wpt_urlpattern_tests.cpp new file mode 100644 index 000000000..89bea3a13 --- /dev/null +++ b/tests/wpt_urlpattern_tests.cpp @@ -0,0 +1,562 @@ +#include +#include + +#include "ada/log.h" +#include "gtest/gtest.h" +#include "simdjson.h" + +#include "ada.h" +#include "ada/url_pattern.h" +#include "ada/parser.h" + +using namespace simdjson; + +constexpr std::string_view URL_PATTERN_TEST_DATA = + "wpt/urlpatterntestdata.json"; + +TEST(wpt_urlpattern_tests, parser_tokenize_basic_tests) { + auto tokenize_result = + tokenize("*", ada::url_pattern_helpers::token_policy::STRICT); + ASSERT_TRUE(tokenize_result); +} + +TEST(wpt_urlpattern_tests, parse_pattern_string_basic_tests) { + auto part_list = ada::url_pattern_helpers::parse_pattern_string( + "*", ada::url_pattern_compile_component_options::DEFAULT, + ada::url_pattern_helpers::canonicalize_protocol); + + ASSERT_TRUE(part_list); +} + +TEST(wpt_urlpattern_tests, compile_basic_tests) { + auto protocol_component = ada::url_pattern_component::compile( + "*", ada::url_pattern_helpers::canonicalize_protocol, + ada::url_pattern_compile_component_options::DEFAULT); + ASSERT_TRUE(protocol_component); +} + +TEST(wpt_urlpattern_tests, basic_tests) { + auto init = ada::url_pattern_init{}; + init.pathname = "/books"; + auto url = ada::parse_url_pattern(init); + ASSERT_TRUE(url); + ASSERT_EQ(url->get_protocol(), "*"); + ASSERT_EQ(url->get_hostname(), "*"); + ASSERT_EQ(url->get_username(), "*"); + ASSERT_EQ(url->get_password(), "*"); + ASSERT_EQ(url->get_port(), "*"); + ASSERT_EQ(url->get_pathname(), "/books"); + ASSERT_EQ(url->get_search(), "*"); + ASSERT_EQ(url->get_hash(), "*"); + ASSERT_FALSE(url->has_regexp_groups()); + SUCCEED(); +} + +// Tests are taken from WPT +// https://github.com/web-platform-tests/wpt/blob/0c1d19546fd4873bb9f4147f0bbf868e7b4f91b7/urlpattern/resources/urlpattern-hasregexpgroups-tests.js +TEST(wpt_urlpattern_tests, has_regexp_groups) { + auto create_init = [](std::string_view component, + std::string value) -> ada::url_pattern_init { + if (component == "protocol") return {.protocol = value}; + if (component == "username") return {.username = value}; + if (component == "password") return {.password = value}; + if (component == "hostname") return {.hostname = value}; + if (component == "port") return {.port = value}; + if (component == "pathname") return {.pathname = value}; + if (component == "search") return {.search = value}; + if (component == "hash") return {.hash = value}; + ada::unreachable(); + }; + constexpr std::string_view fields[] = {"protocol", "username", "password", + "hostname", "port", "pathname", + "search", "hash"}; + + for (const auto& field : fields) { + std::cout << "field " << field << std::endl; + + ASSERT_FALSE( + ada::parse_url_pattern(create_init(field, "*"))->has_regexp_groups()); + ASSERT_FALSE(ada::parse_url_pattern(create_init(field, ":foo")) + ->has_regexp_groups()); + ASSERT_FALSE(ada::parse_url_pattern(create_init(field, ":foo?")) + ->has_regexp_groups()); + ASSERT_TRUE(ada::parse_url_pattern(create_init(field, ":foo(hi)")) + ->has_regexp_groups()); + ASSERT_TRUE(ada::parse_url_pattern(create_init(field, "(hi)")) + ->has_regexp_groups()); + + if (field != "protocol" && field != "port") { + ASSERT_FALSE( + ada::parse_url_pattern(create_init(field, "a-{:hello}-z-*-a")) + ->has_regexp_groups()); + ASSERT_TRUE(ada::parse_url_pattern(create_init(field, "a-(hi)-z-(lo)-a")) + ->has_regexp_groups()); + } + } + + ASSERT_FALSE(ada::parse_url_pattern( + ada::url_pattern_init{.pathname = "/a/:foo/:baz?/b/*"}) + ->has_regexp_groups()); + ASSERT_TRUE( + ada::parse_url_pattern( + ada::url_pattern_init{.pathname = "/a/:foo/:baz([a-z]+)?/b/*"}) + ->has_regexp_groups()); + + SUCCEED(); +} + +std::variant parse_init( + ondemand::object& object) { + ada::url_pattern_init init{}; + for (auto field : object) { + auto key = field.key().value(); + std::string_view value; + if (field.value().get_string(value)) { + bool value_true; + EXPECT_FALSE(field.value().get_bool().get(value_true)); + return ada::url_pattern_options{.ignore_case = value_true}; + } + if (key == "protocol") { + init.protocol = std::string(value); + } else if (key == "username") { + init.username = std::string(value); + } else if (key == "password") { + init.password = std::string(value); + } else if (key == "hostname") { + init.hostname = std::string(value); + } else if (key == "port") { + init.port = std::string(value); + } else if (key == "pathname") { + init.pathname = std::string(value); + } else if (key == "search") { + init.search = std::string(value); + } else if (key == "hash") { + init.hash = std::string(value); + } else if (key == "baseURL") { + init.base_url = std::string(value); + } + } + return init; +} + +ada::url_pattern_options parse_options(ondemand::object& object) { + ada::url_pattern_options options{}; + if (object["ignoreCase"]) { + options.ignore_case = object["ignoreCase"].get_bool().value(); + } + return options; +} + +// URLPattern can accept the following use cases: +// new URLPattern(input) +// new URLPattern(input, baseURL) +// new URLPattern(input, options) +// new URLPattern(input, baseURL, options) +std::tuple, + std::optional, std::optional> +parse_pattern_field(ondemand::array& patterns) { + // If no arguments have been passed let's assume it's an empty init. + if (patterns.count_elements().value() == 0) { + return {ada::url_pattern_init{}, {}, {}}; + } + + std::optional init_obj{}; + std::optional init_str{}; + std::optional base_url{}; + std::optional options{}; + + // In simdjson's On-Demand, we disallow the pattern array size, access element + // 0, access element 1... as it leads to inefficient code. Instead, we iterate + // over the array. + // The following can be used for debugging: + // std::cout << "parse_pattern_field" << patterns.raw_json().value()<< + // std::endl; patterns.reset(); // <==== Do not forget because raw_json() + // consumes the object!!! + size_t pattern_size = 0; // how many elements we have consumed. + patterns.reset(); + for (auto pattern : patterns) { + if (pattern_size == 0) { + // Init can be a string or an object. + if (pattern.type() == ondemand::json_type::string) { + EXPECT_FALSE(pattern.get_string(init_str)); + } else { + EXPECT_TRUE(pattern.type() == ondemand::json_type::object); + ondemand::object object = pattern.get_object(); + auto init_result = parse_init(object); + if (std::holds_alternative(init_result)) { + init_obj = std::get(init_result); + } else { + init_obj = ada::url_pattern_init{}; + options = std::get(init_result); + return std::tuple(*init_obj, base_url, options); + } + } + } else if (pattern_size == 1) { + // The second value can be a base url or an option. + if (pattern.type() == ondemand::json_type::string) { + EXPECT_FALSE(pattern.get_string(base_url)); + } else { + EXPECT_TRUE(pattern.type() == ondemand::json_type::object); + ondemand::object object = pattern.get_object(); + options = parse_options(object); + } + } else if (pattern_size == 2) { + // This can only be options now. + if (pattern.type() == ondemand::json_type::object) { + EXPECT_FALSE(options.has_value()); + ondemand::object object = pattern.get_object(); + options = parse_options(object); + } else if (pattern.type() == ondemand::json_type::string) { + // E.g., [ "/foo?bar#baz", { "ignoreCase": true }, + // "https://example.com:8080" ] + // This is an invalid pattern. We should not test it. + // We return false to indicate that should skip the test. + return std::tuple(false, std::nullopt, std::nullopt); + } + } + pattern_size++; + } + EXPECT_TRUE(pattern_size > 0); + if (init_obj) { + return std::tuple(*init_obj, base_url, options); + } + EXPECT_TRUE(init_str.has_value()); + return std::tuple(*init_str, base_url, options); +} + +tl::expected parse_pattern( + std::variant& init_variant, + std::optional& base_url, + std::optional& options) { + std::string_view base_url_view{}; + + if (base_url) { + base_url_view = {base_url->data(), base_url->size()}; + } + + if (std::holds_alternative(init_variant)) { + auto str_init = std::get(init_variant); + std::cout << "init: " << str_init << std::endl; + return ada::parse_url_pattern( + std::string_view(str_init), + base_url.has_value() ? &base_url_view : nullptr, + options.has_value() ? &options.value() : nullptr); + } + + auto obj_init = std::get(init_variant); + std::cout << "init: " << obj_init.to_string() << std::endl; + return ada::parse_url_pattern( + obj_init, base_url.has_value() ? &base_url_view : nullptr, + options.has_value() ? &options.value() : nullptr); +} + +std::tuple, + std::optional> +parse_inputs_array(ondemand::array& inputs) { + std::cout << "inputs: " << inputs.raw_json().value() << std::endl; + inputs.reset(); + + std::variant first_param = + ada::url_pattern_init{}; + std::optional base_url{}; + + size_t index = 0; + for (auto input : inputs) { + if (index == 0) { + if (input.type() == ondemand::json_type::string) { + std::string_view value; + EXPECT_FALSE(input.get_string().get(value)); + first_param = std::string(value); + index++; + continue; + } + + ondemand::object attribute; + EXPECT_FALSE(input.get_object().get(attribute)); + // We always know that this function is called with url pattern init. + first_param = std::get(parse_init(attribute)); + index++; + continue; + } + + std::string_view value; + EXPECT_FALSE(input.get_string().get(value)); + base_url = std::string(value); + index++; + } + + return {first_param, base_url}; +} + +ada::url_pattern_component_result parse_component_result( + ondemand::object& component) { + auto result = ada::url_pattern_component_result{}; + + for (auto element : component) { + auto key = element.key().value(); + + if (key == "input") { + // The value will always be string + std::string_view value; + EXPECT_FALSE(element.value().get_string().get(value)); + result.input = std::string(value); + } else if (key == "groups") { + ondemand::object groups; + EXPECT_FALSE(element.value().get_object().get(groups)); + for (auto group : groups) { + auto group_key = group.unescaped_key().value(); + std::string_view group_value; + EXPECT_FALSE(group.value().get_string(group_value)); + result.groups.insert_or_assign(std::string(group_key), + std::string(group_value)); + } + } + } + + return result; +} + +std::tuple parse_exec_result( + ondemand::object& exec_result) { + auto result = ada::url_pattern_result{}; + bool has_inputs = false; + + for (auto field : exec_result) { + auto key = field.key().value(); + + if (key == "inputs") { + has_inputs = true; + // All values will be string or init object. + ondemand::array inputs; + EXPECT_FALSE(field.value().get_array().get(inputs)); + for (auto input_field : inputs) { + if (input_field.type() == ondemand::json_type::string) { + std::string_view input_field_str; + EXPECT_FALSE(input_field.get_string().get(input_field_str)); + result.inputs.emplace_back(std::string(input_field_str)); + } else if (input_field.type() == ondemand::json_type::object) { + ondemand::object input_field_object; + EXPECT_FALSE(input_field.get_object().get(input_field_object)); + auto parse_value = parse_init(input_field_object); + EXPECT_TRUE( + std::holds_alternative(parse_value)); + result.inputs.emplace_back( + std::get(parse_value)); + } else { + ADD_FAILURE() << "Unexpected input field type"; + } + } + } else { + ondemand::object component; + EXPECT_FALSE(field.value().get_object().get(component)); + auto component_result = parse_component_result(component); + + if (key == "protocol") { + result.protocol = component_result; + } else if (key == "username") { + result.username = component_result; + } else if (key == "password") { + result.password = component_result; + } else if (key == "hostname") { + result.hostname = component_result; + } else if (key == "port") { + result.port = component_result; + } else if (key == "pathname") { + result.pathname = component_result; + } else if (key == "search") { + result.search = component_result; + } else if (key == "hash") { + result.hash = component_result; + } else { + ADD_FAILURE() << "Unexpected key in url_pattern_component_result"; + } + } + } + + return {result, has_inputs}; +} + +TEST(wpt_urlpattern_tests, urlpattern_test_data) { + ondemand::parser parser; + ASSERT_TRUE(std::filesystem::exists(URL_PATTERN_TEST_DATA)); + padded_string json = padded_string::load(URL_PATTERN_TEST_DATA); + ondemand::document doc = parser.iterate(json); + try { + for (auto element : doc.get_array()) { + if (element.type() == ondemand::json_type::string) { + std::cout << " comment: " << element.get_string() << std::endl; + continue; + } + + std::cout << "--------------------" << std::endl; + + ondemand::object main_object = element.get_object(); + // If we have a key with 'expected_obj' and the value is 'error', then + // we expect the pattern to be invalid. There should be a key with + // 'pattern' and the value should be an array. + std::string_view expected_obj_str; + ondemand::array patterns; + ASSERT_FALSE(main_object["pattern"].get_array().get(patterns)); + auto [init_variant, base_url, options] = parse_pattern_field(patterns); + // This is an invalid test case. We should not test it. + if (std::holds_alternative(init_variant)) { + // Skip invalid test cases. + continue; + } + auto parse_result = parse_pattern(init_variant, base_url, options); + + if (!main_object["expected_obj"].get_string().get(expected_obj_str) && + expected_obj_str == "error") { + if (parse_result) { + main_object.reset(); + FAIL() << "Test should have failed but it didn't" << std::endl + << main_object.raw_json().value() << std::endl; + } + continue; + } + + // Test for valid cases. + if (!parse_result) { + main_object.reset(); + if (base_url) { + std::cerr << "base_url: " << base_url.value_or("") << std::endl; + } + if (options) { + std::cerr << "options: " << options->to_string() << std::endl; + } + FAIL() << "Test should have succeeded but failed" << std::endl + << main_object.raw_json().value() << std::endl; + } + ada_log("parse_result: ", parse_result->to_string()); + ondemand::array exactly_empty_components; + if (!main_object["exactly_empty_components"].get_array().get( + exactly_empty_components)) { + for (auto component : exactly_empty_components) { + std::string_view key; + ASSERT_FALSE(component.get_string().get(key)); + if (key == "hash") { + ASSERT_TRUE(parse_result->get_hash().empty()); + } else if (key == "hostname") { + ASSERT_TRUE(parse_result->get_hostname().empty()); + } else if (key == "pathname") { + ASSERT_TRUE(parse_result->get_pathname().empty()); + } else if (key == "search") { + ASSERT_TRUE(parse_result->get_search().empty()); + } else if (key == "port") { + ASSERT_TRUE(parse_result->get_port().empty()); + } else if (key == "protocol") { + ASSERT_TRUE(parse_result->get_protocol().empty()); + } else if (key == "username") { + ASSERT_TRUE(parse_result->get_username().empty()); + } else if (key == "password") { + ASSERT_TRUE(parse_result->get_password().empty()); + } else { + FAIL() << "Unknown key in exactly_empty_components: " << key + << std::endl; + } + } + } + + ondemand::object expected_obj; + if (!main_object["expected_obj"].get_object().get(expected_obj)) { + for (auto obj_element : expected_obj) { + auto key = obj_element.key().value(); + std::string_view value; + ASSERT_FALSE(obj_element.value().get_string().get(value)); + if (key == "hash") { + ASSERT_EQ(parse_result->get_hash(), value); + } else if (key == "hostname") { + ASSERT_EQ(parse_result->get_hostname(), value); + } else if (key == "password") { + ASSERT_EQ(parse_result->get_password(), value); + } else if (key == "pathname") { + ASSERT_EQ(parse_result->get_pathname(), value); + } else if (key == "port") { + ASSERT_EQ(parse_result->get_port(), value); + } else if (key == "protocol") { + ASSERT_EQ(parse_result->get_protocol(), value); + } else if (key == "search") { + ASSERT_EQ(parse_result->get_search(), value); + } else if (key == "username") { + ASSERT_EQ(parse_result->get_username(), value); + } else { + FAIL() << "Unknown key in expected object: " << key << std::endl; + } + } + } + + ondemand::array inputs; + if (!main_object["inputs"].get_array().get(inputs)) { + // Expected match can be: + // - "error" + // - null + // - {} // response here. + auto [input_value, base_url] = parse_inputs_array(inputs); + tl::expected, ada::errors> + exec_result; + std::string_view base_url_view; + std::string_view* opt_base_url = nullptr; + if (base_url) { + base_url_view = std::string_view(base_url.value()); + opt_base_url = &base_url_view; + } + if (std::holds_alternative(input_value)) { + auto str = std::get(input_value); + ada_log("input_value is str=", str); + exec_result = parse_result->exec(str, opt_base_url); + } else { + ada_log("input_value is url_pattern_init"); + auto obj = std::get(input_value); + exec_result = parse_result->exec(obj, opt_base_url); + } + + ondemand::value expected_match = main_object["expected_match"].value(); + if (expected_match.type() == ondemand::json_type::string) { + // If it is a string, it will always be "error" + ASSERT_EQ(expected_match.get_string().value(), "error"); + ASSERT_EQ(exec_result.has_value(), false) + << "Expected error but exec() has_value= " + << exec_result->has_value(); + } else if (expected_match.type() == ondemand::json_type::null) { + ASSERT_EQ(exec_result.has_value(), true) + << "Expected non failure but it throws an error"; + ASSERT_EQ(exec_result->has_value(), false) + << "Expected null value but exec() returned a value "; + } else { + ASSERT_EQ(exec_result.has_value(), true) + << "Expect match to succeed but it throw an error"; + ASSERT_EQ(exec_result->has_value(), true) + << "Expect match to succeed but it returned a null value"; + auto exec_result_obj = expected_match.get_object().value(); + auto [expected_exec_result, has_inputs] = + parse_exec_result(exec_result_obj); + + // Some match_result data in JSON does not have any inputs output + if (has_inputs) { + ASSERT_EQ(exec_result->value().inputs, expected_exec_result.inputs); + } + + ASSERT_EQ(exec_result->value().protocol, + expected_exec_result.protocol); + ASSERT_EQ(exec_result->value().username, + expected_exec_result.username); + ASSERT_EQ(exec_result->value().password, + expected_exec_result.password); + ASSERT_EQ(exec_result->value().hostname, + expected_exec_result.hostname); + ASSERT_EQ(exec_result->value().port, expected_exec_result.port); + ASSERT_EQ(exec_result->value().pathname, + expected_exec_result.pathname); + ASSERT_EQ(exec_result->value().search, expected_exec_result.search); + ASSERT_EQ(exec_result->value().hash, expected_exec_result.hash); + } + } + } + } catch (simdjson_error& error) { + std::cerr << "JSON error: " << error.what() << " near " + << doc.current_location() << " in " << URL_PATTERN_TEST_DATA + << std::endl; + FAIL(); + } + SUCCEED(); +}