From db3c933423ad4817cd9b7074b74c4dd9eaebd587 Mon Sep 17 00:00:00 2001 From: Esteban Blanc Date: Tue, 5 Jan 2021 09:57:12 +0100 Subject: [PATCH] Support other charset (#105) * scraper: Initial work for other charset support Not working * scraper: Get charset from html instead of http headers Not working * scraper: Charset working * scraper: Fix regex ('"' after equals) * scraper: Use lazy_static for charset regex * misc: Refactor pr * tests: Refactor fixtures * tests: Add html charset support tests * tests: Charset, split tests * downloader: Get charset from http headers * misc: Clarify iterator first * Apply suggestions from CohenArthur Co-authored-by: CohenArthur --- Cargo.lock | 500 +++++++++++------- Cargo.toml | 3 + src/downloader.rs | 47 +- src/response.rs | 11 +- src/scraper.rs | 101 +++- tests/auth.rs | 17 +- tests/charset_html_found.rs | 41 ++ tests/charset_html_not_found.rs | 41 ++ tests/charset_http_found.rs | 48 ++ tests/charset_http_not_found.rs | 43 ++ tests/filters.rs | 26 +- tests/fixtures/charset_test_html.html | 13 + tests/fixtures/charset_test_html_no_meta.html | 12 + tests/fixtures/mod.rs | 40 +- 14 files changed, 708 insertions(+), 235 deletions(-) create mode 100644 tests/charset_html_found.rs create mode 100644 tests/charset_html_not_found.rs create mode 100644 tests/charset_http_found.rs create mode 100644 tests/charset_http_not_found.rs create mode 100644 tests/fixtures/charset_test_html.html create mode 100644 tests/fixtures/charset_test_html_no_meta.html diff --git a/Cargo.lock b/Cargo.lock index 61df680..ffca268 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,9 +2,9 @@ # It is not intended for manual editing. [[package]] name = "aho-corasick" -version = "0.7.13" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" +checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" dependencies = [ "memchr", ] @@ -43,15 +43,15 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] name = "base-x" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b20b618342cf9891c292c4f5ac2cde7287cc5c87e87e9c769d617793607dec1" +checksum = "a4521f3e3d031370679b3b140beb36dfe4801b09ac77e30c61941f97df3ef28b" [[package]] name = "base64" -version = "0.12.3" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" [[package]] name = "bitflags" @@ -85,9 +85,9 @@ checksum = "e0dcbc35f504eb6fc275a6d20e4ebcda18cf50d40ba6fabff8c711fa16cb3b16" [[package]] name = "cc" -version = "1.0.60" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef611cc68ff783f18535d77ddd080185275713d852c4f5cbb6122c462a7a825c" +checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" [[package]] name = "cfg-if" @@ -103,20 +103,22 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.15" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942f72db697d8767c22d46a598e01f2d3b475501ea43d0db4f16d90259182d0b" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" dependencies = [ + "libc", "num-integer", "num-traits", "time 0.1.44", + "winapi 0.3.9", ] [[package]] name = "chunked_transfer" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d29eb15132782371f71da8f947dba48b3717bdb6fa771b9b434d645e40a7193" +checksum = "7477065d45a8fe57167bf3cf8bcd3729b54cfcb81cca49bda2d038ea89ae82ca" [[package]] name = "clap" @@ -146,18 +148,18 @@ dependencies = [ [[package]] name = "const_fn" -version = "0.4.2" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce90df4c658c62f12d78f7508cf92f9173e5184a539c10bfe54a3107b3ffd0f2" +checksum = "cd51eab21ab4fd6a3bf889e2d0958c0a6e3a61ad04260325e919e652a2a62826" [[package]] name = "cookie" -version = "0.14.2" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1373a16a4937bc34efec7b391f9c1500c30b8478a701a4f44c9165cc0475a6e0" +checksum = "784ad0fbab4f3e9cef09f20e0aea6000ae08d2cb98ac4c0abc53df18803d702f" dependencies = [ "percent-encoding", - "time 0.2.21", + "time 0.2.23", "version_check", ] @@ -173,15 +175,15 @@ dependencies = [ "publicsuffix", "serde", "serde_json", - "time 0.2.21", + "time 0.2.23", "url", ] [[package]] name = "core-foundation" -version = "0.7.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d24c7a13c43e870e37c1556b74555437870a04514f7685f5b354e090567171" +checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" dependencies = [ "core-foundation-sys", "libc", @@ -189,9 +191,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.7.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3a71ab494c0b5b860bdc8407ae08978052417070c2ced38573a9157ad75b8ac" +checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" [[package]] name = "crossbeam" @@ -292,9 +294,9 @@ dependencies = [ [[package]] name = "derive_more" -version = "0.99.10" +version = "0.99.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dcfabdab475c16a93d669dddfc393027803e347d09663f524447f642fbb84ba" +checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c" dependencies = [ "proc-macro2", "quote", @@ -309,26 +311,26 @@ checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0" [[package]] name = "dtoa" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134951f4028bdadb9b84baf4232681efbf277da25144b9b0ad65df75946c422b" +checksum = "88d7ed2934d741c6b37e33e3832298e8850b53fd2d2bea03873375596c7cea4e" [[package]] name = "dtoa-short" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2" +checksum = "bde03329ae10e79ede66c9ce4dc930aa8599043b0743008548680f25b91502d6" dependencies = [ "dtoa", ] [[package]] name = "encoding_rs" -version = "0.8.24" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a51b8cf747471cb9499b6d59e59b0444f4c90eba8968c4e44874e92b5b64ace2" +checksum = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", ] [[package]] @@ -399,51 +401,51 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f366ad74c28cca6ba456d95e6422883cfb4b252a83bed929c83abfdbbf2967d5" +checksum = "4b7109687aa4e177ef6fe84553af6280ef2778bdb7783ba44c9dc3399110fe64" dependencies = [ "futures-core", ] [[package]] name = "futures-core" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59f5fff90fd5d971f936ad674802482ba441b6f09ba5e15fd8b39145582ca399" +checksum = "847ce131b72ffb13b6109a221da9ad97a64cbe48feb1028356b836b47b8f1748" [[package]] name = "futures-io" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de27142b013a8e869c14957e6d2edeef89e97c289e69d042ee3a49acd8b51789" +checksum = "611834ce18aaa1bd13c4b374f5d653e1027cf99b6b502584ff8c9a64413b30bb" [[package]] name = "futures-sink" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f2032893cb734c7a05d85ce0cc8b8c4075278e93b24b66f9de99d6eb0fa8acc" +checksum = "f878195a49cee50e006b02b93cf7e0a95a38ac7b776b4c4d9cc1207cd20fcb3d" [[package]] name = "futures-task" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb66b5f09e22019b1ab0830f7785bcea8e7a42148683f99214f73f8ec21a626" +checksum = "7c554eb5bf48b2426c4771ab68c6b14468b6e76cc90996f528c3338d761a4d0d" dependencies = [ "once_cell", ] [[package]] name = "futures-util" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8764574ff08b701a084482c3c7031349104b07ac897393010494beaa18ce32c6" +checksum = "d304cff4a7b99cfb7986f7d43fbe93d175e72e704a8860787cc95e9ffd85cbd2" dependencies = [ "futures-core", "futures-io", "futures-task", "memchr", - "pin-project", + "pin-project 1.0.2", "pin-utils", "slab", ] @@ -459,31 +461,31 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc587bc0ec293155d5bfa6b9891ec18a1e330c234f896ea47fbada4cadbe47e6" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", "libc", "wasi 0.9.0+wasi-snapshot-preview1", ] [[package]] name = "getrandom" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee8025cf36f917e6a52cce185b7c7177689b838b7ec138364e50cc2277a56cf4" +checksum = "4060f4657be78b8e766215b02b18a2e862d83745545de804638e2b545e81aee6" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", "libc", - "wasi 0.9.0+wasi-snapshot-preview1", + "wasi 0.10.0+wasi-snapshot-preview1", ] [[package]] name = "h2" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "993f9e0baeed60001cf565546b0d3dbe6a6ad23f2bd31644a133c641eccf6d53" +checksum = "5e4728fd124914ad25e99e3d15a9361a879f6620f63cb56bbb08f95abb97a535" dependencies = [ "bytes 0.5.6", "fnv", @@ -496,28 +498,29 @@ dependencies = [ "tokio", "tokio-util", "tracing", + "tracing-futures", ] [[package]] name = "hashbrown" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00d63df3d41950fb462ed38308eea019113ad1508da725bbedcd0fa5a85ef5f7" +checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "heck" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" dependencies = [ "unicode-segmentation", ] [[package]] name = "hermit-abi" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c30f6d0bc6b00693347368a67d41b58f2fb851215ff1da49e90fe2c5c667151" +checksum = "5aca5565f760fb5b220e499d72710ed156fdb74e631659e99377d9ebfbd13ae8" dependencies = [ "libc", ] @@ -538,9 +541,9 @@ dependencies = [ [[package]] name = "http" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d569972648b2c512421b5f2a405ad6ac9666547189d0c5477a3f200f3e02f9" +checksum = "84129d298a6d57d246960ff8eb831ca4af3f96d29e2e28848dae275408658e26" dependencies = [ "bytes 0.5.6", "fnv", @@ -571,9 +574,9 @@ checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" [[package]] name = "hyper" -version = "0.13.8" +version = "0.13.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f3afcfae8af5ad0576a31e768415edb627824129e8e5a29b8bfccb2f234e835" +checksum = "f6ad767baac13b44d4529fcf58ba2cd0995e36e7b435bc5b039de6f47e880dbf" dependencies = [ "bytes 0.5.6", "futures-channel", @@ -585,7 +588,7 @@ dependencies = [ "httparse", "httpdate", "itoa", - "pin-project", + "pin-project 1.0.2", "socket2", "tokio", "tower-service", @@ -619,14 +622,23 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" +checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" dependencies = [ "autocfg", "hashbrown", ] +[[package]] +name = "instant" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61124eeebbd69b8190558df225adf7e4caafce0d743919e5d6b19652314ec5ec" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "iovec" version = "0.1.4" @@ -644,15 +656,15 @@ checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" [[package]] name = "itoa" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6" +checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "js-sys" -version = "0.3.45" +version = "0.3.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca059e81d9486668f12d455a4ea6daa600bd408134cd17e3d3fb5a32d1f016f8" +checksum = "cf3d7383929f7c9c7c2d0fa596f325832df98c3704f2c60553080f7127a58175" dependencies = [ "wasm-bindgen", ] @@ -687,9 +699,18 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.77" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f96b10ec2560088a8e76961b00d47107b3a625fecb76dedb29ee7ccbf98235" +checksum = "1482821306169ec4d07f6aca392a4681f66c75c9918aa49641a2595db64053cb" + +[[package]] +name = "lock_api" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd96ffd135b2fd7b973ac026d28085defbe8983df057ced3eb4f2130b0831312" +dependencies = [ + "scopeguard", +] [[package]] name = "log" @@ -731,9 +752,9 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "memchr" -version = "2.3.3" +version = "2.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" [[package]] name = "memoffset" @@ -762,9 +783,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.6.22" +version = "0.6.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fce347092656428bc8eaf6201042cb551b8d67855af7374542a92a0fbfcac430" +checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" dependencies = [ "cfg-if 0.1.10", "fuchsia-zircon", @@ -781,9 +802,9 @@ dependencies = [ [[package]] name = "miow" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1f2f3b1cf331de6896aabf6e9d55dca90356cc9960cca7eaaf408a355ae919" +checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" dependencies = [ "kernel32-sys", "net2", @@ -793,9 +814,9 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.4" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b0d88c06fe90d5ee94048ba40409ef1d9315d86f6f38c2efdaad4fb50c58b2d" +checksum = "b8d96b2e1c8da3957d58100b09f102c6d9cfdfced01b7ec5a8974044bb09dbd4" dependencies = [ "lazy_static", "libc", @@ -811,9 +832,9 @@ dependencies = [ [[package]] name = "net2" -version = "0.2.35" +version = "0.2.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ebc3ec692ed7c9a255596c67808dee269f64655d8baf7b4f0638e51ba1d6853" +checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" dependencies = [ "cfg-if 0.1.10", "libc", @@ -834,9 +855,9 @@ checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" [[package]] name = "num-integer" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d59457e662d541ba17869cf51cf177c0b5f0cbf476c66bdc90bf1edac4f875b" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" dependencies = [ "autocfg", "num-traits", @@ -844,9 +865,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" dependencies = [ "autocfg", ] @@ -863,18 +884,18 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.4.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "260e51e7efe62b592207e9e13a68e43692a7a279171d6ba57abd208bf23645ad" +checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" [[package]] name = "openssl" -version = "0.10.30" +version = "0.10.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d575eff3665419f9b83678ff2815858ad9d11567e082f5ac1814baba4e2bcb4" +checksum = "038d43985d1ddca7a9900630d8cd031b56e4794eecc2e9ea39dd17aa04399a70" dependencies = [ "bitflags", - "cfg-if 0.1.10", + "cfg-if 1.0.0", "foreign-types", "lazy_static", "libc", @@ -889,9 +910,9 @@ checksum = "77af24da69f9d9341038eba93a073b1fdaaa1b788221b00a69bce9e762cb32de" [[package]] name = "openssl-sys" -version = "0.9.58" +version = "0.9.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a842db4709b604f0fe5d1170ae3565899be2ad3d9cbc72dedc789ac0511f78de" +checksum = "921fc71883267538946025deffb622905ecad223c28efbfdef9bb59a0175f3e6" dependencies = [ "autocfg", "cc", @@ -900,6 +921,31 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "parking_lot" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ccb628cad4f84851442432c60ad8e1f607e29752d0bf072cbd0baf28aa34272" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi 0.3.9", +] + [[package]] name = "percent-encoding" version = "2.1.0" @@ -962,18 +1008,38 @@ dependencies = [ [[package]] name = "pin-project" -version = "0.4.23" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15" +dependencies = [ + "pin-project-internal 0.4.27", +] + +[[package]] +name = "pin-project" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7" +dependencies = [ + "pin-project-internal 1.0.2", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca4433fff2ae79342e497d9f8ee990d174071408f28f726d6d83af93e58e48aa" +checksum = "65ad2ae56b6abe3a1ee25f15ee605bacadb9a764edaba9c2bf4103800d4a1895" dependencies = [ - "pin-project-internal", + "proc-macro2", + "quote", + "syn", ] [[package]] name = "pin-project-internal" -version = "0.4.23" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c0e815c3ee9a031fdf5af21c10aa17c573c9c6a566328d99e3936c34e36461f" +checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f" dependencies = [ "proc-macro2", "quote", @@ -982,9 +1048,15 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.1.7" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b" + +[[package]] +name = "pin-project-lite" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282adbf10f2698a7a77f8e983a74b2d18176c19a7fd32a45446139ae7b02b715" +checksum = "6b063f57ec186e6140e2b8b6921e5f1bd89c7356dda5b33acc5401203ca6131c" [[package]] name = "pin-utils" @@ -994,15 +1066,15 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d36492546b6af1463394d46f0c834346f31548646f6ba10849802c9c9a27ac33" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" [[package]] name = "ppv-lite86" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c36fa947111f5c62a733b652544dd0016a43ce89619538a8ef92724a6f501a20" +checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" [[package]] name = "precomputed-hash" @@ -1036,15 +1108,15 @@ dependencies = [ [[package]] name = "proc-macro-hack" -version = "0.5.18" +version = "0.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99c605b9a0adc77b7211c6b1f722dcb613d68d66859a44f3d485a6da332b0598" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.21" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36e28516df94f3dd551a587da5357459d9b36d945a7c37c3557928c1c2ff2a2c" +checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" dependencies = [ "unicode-xid", ] @@ -1064,9 +1136,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" +checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" dependencies = [ "proc-macro2", ] @@ -1077,7 +1149,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ - "getrandom 0.1.15", + "getrandom 0.1.16", "libc", "rand_chacha 0.2.2", "rand_core 0.5.1", @@ -1087,13 +1159,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a76330fb486679b4ace3670f117bbc9e16204005c4bde9c4bd372f45bed34f12" +checksum = "c24fcd450d3fa2b592732565aa4f17a27a61c65ece4726353e000939b0edee34" dependencies = [ "libc", "rand_chacha 0.3.0", - "rand_core 0.6.0", + "rand_core 0.6.1", "rand_hc 0.3.0", ] @@ -1114,7 +1186,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" dependencies = [ "ppv-lite86", - "rand_core 0.6.0", + "rand_core 0.6.1", ] [[package]] @@ -1123,16 +1195,16 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" dependencies = [ - "getrandom 0.1.15", + "getrandom 0.1.16", ] [[package]] name = "rand_core" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8b34ba8cfb21243bd8df91854c830ff0d785fff2e82ebd4434c2644cb9ada18" +checksum = "c026d7df8b298d90ccbbc5190bd04d85e159eaf5576caeacf8741da93ccbd2e5" dependencies = [ - "getrandom 0.2.0", + "getrandom 0.2.1", ] [[package]] @@ -1150,7 +1222,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" dependencies = [ - "rand_core 0.6.0", + "rand_core 0.6.1", ] [[package]] @@ -1197,9 +1269,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.10.8" +version = "0.10.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9eaa17ac5d7b838b7503d118fa16ad88f440498bf9ffe5424e621f93190d61e" +checksum = "0718f81a8e14c4dbb3b34cf23dc6aaf9ab8a0dfec160c534b3dbca1aaa21f47c" dependencies = [ "base64", "bytes 0.5.6", @@ -1220,10 +1292,10 @@ dependencies = [ "mime_guess", "native-tls", "percent-encoding", - "pin-project-lite", + "pin-project-lite 0.2.0", "serde", "serde_urlencoded", - "time 0.2.21", + "time 0.2.23", "tokio", "tokio-tls", "url", @@ -1266,9 +1338,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "security-framework" -version = "0.4.4" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64808902d7d99f78eaddd2b4e2509713babc3dc3c85ad6f4c447680f3c01e535" +checksum = "c1759c2e3c8580017a484a7ac56d3abc5a6c1feadf88db2f3633f12ae4268c69" dependencies = [ "bitflags", "core-foundation", @@ -1279,9 +1351,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "0.4.3" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17bf11d99252f512695eb468de5516e5cf75455521e69dfe343f3b74e4748405" +checksum = "f99b9d5e26d2a71633cc4f2ebae7cc9f874044e0c351a27e17892d76dce5678b" dependencies = [ "core-foundation-sys", "libc", @@ -1324,18 +1396,18 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.116" +version = "1.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96fe57af81d28386a513cbc6858332abc6117cfdb5999647c6444b8f43a370a5" +checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.116" +version = "1.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f630a6370fd8e457873b4bd2ffdae75408bc291ba72be773772a4c2a065d9ae8" +checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" dependencies = [ "proc-macro2", "quote", @@ -1344,9 +1416,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.57" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "164eacbdb13512ec2745fb09d51fd5b22b0d65ed294a1dcf7285a360c80a675c" +checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" dependencies = [ "itoa", "ryu", @@ -1355,14 +1427,36 @@ dependencies = [ [[package]] name = "serde_urlencoded" -version = "0.6.1" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" dependencies = [ - "dtoa", + "form_urlencoded", "itoa", + "ryu", "serde", - "url", +] + +[[package]] +name = "serial_test" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0bccbcf40c8938196944a3da0e133e031a33f4d6b72db3bda3cc556e361905d" +dependencies = [ + "lazy_static", + "parking_lot", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2acd6defeddb41eb60bb468f8825d0cfd0c2a76bc03bfd235b6a1dc4f6a1ad5" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -1395,19 +1489,18 @@ checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" [[package]] name = "smallvec" -version = "1.4.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbee7696b84bbf3d89a1c2eccff0850e3047ed46bfcd2e92c29a2d074d57e252" +checksum = "1a55ca5f3b68e41c979bf8c46a6f1da892ca4db8f94023ce0bd32407573b1ac0" [[package]] name = "socket2" -version = "0.3.15" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1fa70dc5c8104ec096f4fe7ede7a221d35ae13dcd19ba1ad9a81d2cab9a1c44" +checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", "libc", - "redox_syscall", "winapi 0.3.9", ] @@ -1419,9 +1512,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "standback" -version = "0.2.10" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33a71ea1ea5f8747d1af1979bfb7e65c3a025a70609f04ceb78425bc5adad8e6" +checksum = "c66a8cff4fa24853fdf6b51f75c6d7f8206d7c75cab4e467bcd7f25c2b1febe0" dependencies = [ "version_check", ] @@ -1477,9 +1570,9 @@ checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" [[package]] name = "string_cache" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2940c75beb4e3bf3a494cef919a747a2cb81e52571e212bfbd185074add7208a" +checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a" dependencies = [ "lazy_static", "new_debug_unreachable", @@ -1508,9 +1601,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.18" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33f6461027d7f08a13715659b2948e1602c31a3756aeae9378bfe7518c72e82" +checksum = "5277acd7ee46e63e5168a80734c9f6ee81b1367a7d8772a2d765df2a3705d28c" dependencies = [ "clap", "lazy_static", @@ -1519,9 +1612,9 @@ dependencies = [ [[package]] name = "structopt-derive" -version = "0.4.11" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c92e775028122a4b3dd55d58f14fc5120289c69bee99df1d117ae30f84b225c9" +checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90" dependencies = [ "heck", "proc-macro-error", @@ -1538,11 +1631,14 @@ dependencies = [ "chrono", "colored", "crossbeam", + "encoding_rs", "kuchiki", + "lazy_static", "percent-encoding", - "rand 0.8.0", + "rand 0.8.1", "regex", "reqwest", + "serial_test", "structopt", "tiny_http", "url", @@ -1550,9 +1646,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.41" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6690e3e9f692504b941dc6c3b188fd28df054f7fb8469ab40680df52fdcc842b" +checksum = "4211ce9909eb971f111059df92c45640aad50a619cf55cd76476be803c4c68e6" dependencies = [ "proc-macro2", "quote", @@ -1621,9 +1717,9 @@ dependencies = [ [[package]] name = "time" -version = "0.2.21" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c2e31fb28e2a9f01f5ed6901b066c1ba2333c04b64dc61254142bafcb3feb2c" +checksum = "bcdaeea317915d59b2b4cd3b5efcd156c309108664277793f5351700c02ce98b" dependencies = [ "const_fn", "libc", @@ -1636,9 +1732,9 @@ dependencies = [ [[package]] name = "time-macros" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ae9b6e9f095bc105e183e3cd493d72579be3181ad4004fceb01adbe9eecab2d" +checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1" dependencies = [ "proc-macro-hack", "time-macros-impl", @@ -1672,15 +1768,24 @@ dependencies = [ [[package]] name = "tinyvec" -version = "0.3.4" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf8dbc19eb42fba10e8feaaec282fb50e2c14b2726d6301dbfeed0f73306a6f" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238ce071d267c5710f9d31451efec16c5ee22de34df17cc05e56cbc92e967117" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "0.2.22" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d34ca54d84bf2b5b4d7d31e901a8464f7b60ac145a284fba25ceb801f2ddccd" +checksum = "099837d3464c16a808060bb3f02263b412f6fafcb5d01c533d309985fbeebe48" dependencies = [ "bytes 0.5.6", "fnv", @@ -1690,7 +1795,7 @@ dependencies = [ "memchr", "mio", "num_cpus", - "pin-project-lite", + "pin-project-lite 0.1.11", "slab", ] @@ -1714,7 +1819,7 @@ dependencies = [ "futures-core", "futures-sink", "log", - "pin-project-lite", + "pin-project-lite 0.1.11", "tokio", ] @@ -1726,24 +1831,35 @@ checksum = "e987b6bf443f4b5b3b6f38704195592cca41c5bb7aedd3c3693c7081f8289860" [[package]] name = "tracing" -version = "0.1.19" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d79ca061b032d6ce30c660fded31189ca0b9922bf483cd70759f13a2d86786c" +checksum = "9f47026cdc4080c07e49b37087de021820269d996f581aac150ef9e5583eefe3" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", "log", + "pin-project-lite 0.2.0", "tracing-core", ] [[package]] name = "tracing-core" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bcf46c1f1f06aeea2d6b81f3c863d0930a596c86ad1920d4e5bad6dd1d7119a" +checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" dependencies = [ "lazy_static", ] +[[package]] +name = "tracing-futures" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c" +dependencies = [ + "pin-project 0.4.27", + "tracing", +] + [[package]] name = "try-lock" version = "0.2.3" @@ -1770,18 +1886,18 @@ dependencies = [ [[package]] name = "unicode-normalization" -version = "0.1.13" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb19cf769fa8c6a80a162df694621ebeb4dafb606470b2b2fce0be40a98a977" +checksum = "a13e63ab62dbe32aeee58d1c5408d35c36c392bba5d9d3142287219721afe606" dependencies = [ "tinyvec", ] [[package]] name = "unicode-segmentation" -version = "1.6.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" +checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796" [[package]] name = "unicode-width" @@ -1815,9 +1931,9 @@ checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" [[package]] name = "vcpkg" -version = "0.2.10" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6454029bf181f092ad1b853286f23e2c507d8e8194d01d92da4a55c274a5508c" +checksum = "b00bca6106a5e23f3eee943593759b7fcddb00554332e856d990c893966879fb" [[package]] name = "vec_map" @@ -1855,11 +1971,11 @@ checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" [[package]] name = "wasm-bindgen" -version = "0.2.68" +version = "0.2.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac64ead5ea5f05873d7c12b545865ca2b8d28adfc50a49b84770a3a97265d42" +checksum = "3cd364751395ca0f68cafb17666eee36b63077fb5ecd972bbcd74c90c4bf736e" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", "serde", "serde_json", "wasm-bindgen-macro", @@ -1867,9 +1983,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.68" +version = "0.2.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f22b422e2a757c35a73774860af8e112bff612ce6cb604224e8e47641a9e4f68" +checksum = "1114f89ab1f4106e5b55e688b828c0ab0ea593a1ea7c094b141b14cbaaec2d62" dependencies = [ "bumpalo", "lazy_static", @@ -1882,11 +1998,11 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.18" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7866cab0aa01de1edf8b5d7936938a7e397ee50ce24119aef3e1eaa3b6171da" +checksum = "1fe9756085a84584ee9457a002b7cdfe0bfff169f45d2591d8be1345a6780e35" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", "js-sys", "wasm-bindgen", "web-sys", @@ -1894,9 +2010,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.68" +version = "0.2.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13312a745c08c469f0b292dd2fcd6411dba5f7160f593da6ef69b64e407038" +checksum = "7a6ac8995ead1f084a8dea1e65f194d0973800c7f571f6edd70adf06ecf77084" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1904,9 +2020,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.68" +version = "0.2.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f249f06ef7ee334cc3b8ff031bfc11ec99d00f34d86da7498396dc1e3b1498fe" +checksum = "b5a48c72f299d80557c7c62e37e7225369ecc0c963964059509fbafe917c7549" dependencies = [ "proc-macro2", "quote", @@ -1917,15 +2033,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.68" +version = "0.2.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d649a3145108d7d3fbcde896a468d1bd636791823c9921135218ad89be08307" +checksum = "7e7811dd7f9398f14cc76efd356f98f03aa30419dea46aa810d71e819fc97158" [[package]] name = "web-sys" -version = "0.3.45" +version = "0.3.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bf6ef87ad7ae8008e15a355ce696bed26012b7caa21605188cfd8214ab51e2d" +checksum = "222b1ef9334f92a21d3fb53dc3fd80f30836959a90f9274a626d7e06315ba3c3" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 2cf3131..6f16b32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,9 @@ percent-encoding = "^2.1" url = "^2.2" rand = "^0.8" regex = "^1.4" +encoding_rs = "^0.8" +lazy_static = "1.4.0" [dev-dependencies] tiny_http = "^0.7" +serial_test = "^0.5" diff --git a/src/downloader.rs b/src/downloader.rs index 16fb0ac..09c9f66 100644 --- a/src/downloader.rs +++ b/src/downloader.rs @@ -1,5 +1,8 @@ use super::response::{Response, ResponseData}; use std::collections::HashMap; + +use lazy_static::lazy_static; +use regex::Regex; use url::Url; use crate::warn; @@ -104,26 +107,48 @@ impl Downloader { }; match req.send() { Ok(mut data) => { - let data_type = match data.headers().get("content-type") { - Some(data_type) => data_type.to_str().unwrap(), - None => "text/html", - }; - - let filename = if !Downloader::is_html(data_type) { + lazy_static! { + static ref DATA_TYPE_REGEX: Regex = + Regex::new("^.*(\\b[a-z]+/[a-z-+\\.]+).*$").unwrap(); + static ref CHARSET_REGEX: Regex = + Regex::new("^.*charset\\s*=\\s*\"?([^\"\\s;]+).*$").unwrap(); + } + + let (data_type, charset): (String, Option) = + match data.headers().get("content-type") { + Some(content_type_header) => { + let content_type = content_type_header.to_str().unwrap(); + let data_type_captures = + DATA_TYPE_REGEX.captures_iter(&content_type).nth(0); + let data_type = data_type_captures + .map_or(String::from("text/html"), |first| { + String::from(first.get(1).unwrap().as_str().to_lowercase()) + }); + let charset_captures = + CHARSET_REGEX.captures_iter(&content_type).nth(0); + let charset = charset_captures.map(|first| { + String::from(first.get(1).unwrap().as_str().to_lowercase()) + }); + (data_type, charset) + } + None => (String::from("text/html"), None), + }; + + let filename = if !Downloader::is_html(&data_type) { Downloader::get_filename(data.headers()) } else { None }; - let data = if Downloader::is_html(data_type) { - ResponseData::Html(data.text().unwrap()) + let mut raw_data: Vec = Vec::new(); + data.copy_to(&mut raw_data).unwrap(); + let response_data = if Downloader::is_html(&data_type) { + ResponseData::Html(raw_data) } else { - let mut raw_data: Vec = Vec::new(); - data.copy_to(&mut raw_data).unwrap(); ResponseData::Other(raw_data) }; - Ok(Response::new(data, filename)) + Ok(Response::new(response_data, filename, charset)) } Err(e) => { diff --git a/src/response.rs b/src/response.rs index 23a20fa..3f622b8 100644 --- a/src/response.rs +++ b/src/response.rs @@ -1,6 +1,6 @@ /// Separates HTML responses and other content (PDFs, images...) pub enum ResponseData { - Html(String), + Html(Vec), Other(Vec), } @@ -8,11 +8,16 @@ pub enum ResponseData { pub struct Response { pub data: ResponseData, pub filename: Option, + pub charset: Option, } impl Response { ///Create a new Response - pub fn new(data: ResponseData, filename: Option) -> Response { - Response { data, filename } + pub fn new(data: ResponseData, filename: Option, charset: Option) -> Response { + Response { + data, + filename, + charset, + } } } diff --git a/src/scraper.rs b/src/scraper.rs index 0cf090f..541f9f6 100644 --- a/src/scraper.rs +++ b/src/scraper.rs @@ -1,24 +1,26 @@ use crossbeam::channel::{Receiver, Sender, TryRecvError}; use crossbeam::thread; +use encoding_rs::Encoding; +use lazy_static::lazy_static; +use rand::Rng; +use regex::Regex; use url::Url; +use std::borrow::Borrow; use std::collections::HashMap; use std::collections::HashSet; use std::process; use std::sync::Mutex; use std::time; -use rand::Rng; - -use super::downloader; - use super::args; use super::disk; use super::dom; +use super::downloader; use super::response; use super::url_helper; -use crate::{error, info}; +use crate::{error, info, warn}; /// Maximum number of empty recv() from the channel static MAX_EMPTY_RECEIVES: usize = 10; @@ -91,15 +93,79 @@ impl Scraper { old_url_str.push_str(&new_url_str); } - ///Proces an html file: add new url to the chanel and prepare for offline navigation + /// Find the charset of the webpage. ``data`` is not a String as this might not be utf8. + /// Returned String is lower cased + /// This is a hack and should be check in case of a bug + fn find_charset(data: &[u8], http_charset: Option) -> Option { + lazy_static! { + static ref CHARSET_REGEX: Regex = + Regex::new("").unwrap(); + } + + // We don't know the real charset yet. We hope that the charset is ASCII + // compatible, because Rust String are in UTF-8 (also ASCII compatible). + let data_utf8 = unsafe { String::from_utf8_unchecked(Vec::from(data)) }; + let captures = CHARSET_REGEX.captures_iter(&data_utf8).next(); + + // We use the first one, hopping we are in the of the page... or if nothing is found + // we used the http charset (if any). + captures + .map(|first| String::from(first.get(1).unwrap().as_str().to_lowercase())) + .or(http_charset) + } + + /// Proceed to convert the data in utf8. + fn charset_convert( + data: &[u8], + charset_source: &'static Encoding, + charset_dest: &'static Encoding, + ) -> Vec { + let decode_result = charset_source.decode(data); + let decode_bytes = decode_result.0.borrow(); + + let encode_result = charset_dest.encode(decode_bytes); + let encode_bytes = encode_result.0.into_owned(); + + encode_bytes + } + + /// Check if the charset require conversion + fn needs_charset_conversion(charset: &str) -> bool { + match charset { + "utf-8" => false, + _ => true, + } + } + + /// Proces an html file: add new url to the chanel and prepare for offline navigation fn handle_html( scraper: &Scraper, transmitter: &Sender<(Url, i32)>, url: &Url, depth: i32, - data: &str, + data: &[u8], + http_charset: Option, ) -> Vec { - let dom = dom::Dom::new(data); + let charset_source_str = match Self::find_charset(data, http_charset) { + Some(s) => s, + None => { + warn!("Charset not found for {}, defaulting to UTF-8", url); + String::from("utf-8") + } + }; + + let need_charset_conversion = Self::needs_charset_conversion(&charset_source_str); + + let charset_source = + encoding_rs::Encoding::for_label(&charset_source_str.as_bytes()).unwrap(); + let charset_utf8 = encoding_rs::UTF_8; + let utf8_data = if need_charset_conversion { + Self::charset_convert(data, charset_source, charset_utf8) + } else { + Vec::from(data) + }; + + let dom = dom::Dom::new(&String::from_utf8_lossy(&utf8_data).into_owned()); dom.find_urls_as_strings() .into_iter() @@ -119,7 +185,13 @@ impl Scraper { scraper.fix_domtree(next_url, &next_full_url); }); - dom.serialize().into_bytes() + let utf8_data = dom.serialize().into_bytes(); + + if need_charset_conversion { + Self::charset_convert(&utf8_data, charset_utf8, charset_source) + } else { + utf8_data + } } /// Process a single URL @@ -127,9 +199,14 @@ impl Scraper { match scraper.downloader.get(&url) { Ok(response) => { let data = match response.data { - response::ResponseData::Html(data) => { - Scraper::handle_html(scraper, transmitter, &url, depth, &data) - } + response::ResponseData::Html(data) => Scraper::handle_html( + scraper, + transmitter, + &url, + depth, + &data, + response.charset, + ), response::ResponseData::Other(data) => data, }; diff --git a/tests/auth.rs b/tests/auth.rs index f46c7c7..9296bd3 100644 --- a/tests/auth.rs +++ b/tests/auth.rs @@ -2,20 +2,19 @@ mod fixtures; -use fixtures::get_file_count_with_pattern; use std::fs::read_dir; use std::process::Command; use std::process::Stdio; use std::sync::Once; -const ADDR: &'static str = "http://0.0.0.0:8000"; +const PAGE: &'static str = "tests/fixtures/index.html"; static START: Once = Once::new(); #[test] fn test_auth() { // Spawn a single instance of a local http server usable by all tests in this module. START.call_once(|| { - fixtures::spawn_local_http_server(true); + fixtures::spawn_local_http_server(PAGE, true, None); }); // Tests below are grouped together as they depend on the local_http_server above. @@ -28,7 +27,7 @@ fn auth_different_host() { let output_dir = "w4"; let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) .args(&[ - ADDR, + fixtures::HTTP_ADDR, "-o", "w4", "-a", @@ -54,7 +53,15 @@ fn auth_different_host() { fn auth_valid() { let output_dir = "w5"; let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) - .args(&[ADDR, "-o", "w5", "-a", "username password", "-j", "16"]) + .args(&[ + fixtures::HTTP_ADDR, + "-o", + "w5", + "-a", + "username password", + "-j", + "16", + ]) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) .spawn() diff --git a/tests/charset_html_found.rs b/tests/charset_html_found.rs new file mode 100644 index 0000000..084a4f8 --- /dev/null +++ b/tests/charset_html_found.rs @@ -0,0 +1,41 @@ +//! Test for charset detection/conversion + +mod fixtures; + +use std::fs; +use std::process::{Command, Stdio}; +use std::sync::Once; + +const PAGE_META: &'static str = "tests/fixtures/charset_test_html.html"; +static START: Once = Once::new(); + +#[test] +fn test_html_charset_found() { + // Spawn a single instance of a local http server usable by all tests in this module. + START.call_once(|| { + fixtures::spawn_local_http_server(PAGE_META, false, None); + }); + + let output_dir = "charset_html_found"; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) + .args(&[fixtures::HTTP_ADDR, "-o", output_dir]) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .unwrap(); + let status = cmd.wait().unwrap(); + assert!(status.success()); + let file_path = fs::read_dir(output_dir) + .unwrap() + .next() + .unwrap() + .unwrap() + .path(); // There is only one file in the directory + + let data_source = fs::read(PAGE_META).unwrap(); + let data_downloaded = fs::read(file_path).unwrap(); + + assert!(fixtures::do_vecs_match(&data_source, &data_downloaded)); + + fs::remove_dir_all(output_dir).unwrap(); +} diff --git a/tests/charset_html_not_found.rs b/tests/charset_html_not_found.rs new file mode 100644 index 0000000..4ed7b86 --- /dev/null +++ b/tests/charset_html_not_found.rs @@ -0,0 +1,41 @@ +//! Test for charset detection/conversion + +mod fixtures; + +use std::fs; +use std::process::{Command, Stdio}; +use std::sync::Once; + +const PAGE_NO_META: &'static str = "tests/fixtures/charset_test_html_no_meta.html"; +static START: Once = Once::new(); + +#[test] +fn test_html_charset_not_found() { + // Spawn a single instance of a local http server usable by all tests in this module. + START.call_once(|| { + fixtures::spawn_local_http_server(PAGE_NO_META, false, None); + }); + + let output_dir = "charset_html_not_found"; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) + .args(&[fixtures::HTTP_ADDR, "-o", output_dir]) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .unwrap(); + let status = cmd.wait().unwrap(); + assert!(status.success()); + let file_path = fs::read_dir(output_dir) + .unwrap() + .next() + .unwrap() + .unwrap() + .path(); // There is only one file in the directory + + let data_source = fs::read(PAGE_NO_META).unwrap(); + let data_downloaded = fs::read(file_path).unwrap(); + + assert!(!fixtures::do_vecs_match(&data_source, &data_downloaded)); + + fs::remove_dir_all(output_dir).unwrap(); +} diff --git a/tests/charset_http_found.rs b/tests/charset_http_found.rs new file mode 100644 index 0000000..4ee0786 --- /dev/null +++ b/tests/charset_http_found.rs @@ -0,0 +1,48 @@ +//! Test for charset detection/conversion + +mod fixtures; + +use std::fs; +use std::process::{Command, Stdio}; +use std::sync::Once; + +use lazy_static::lazy_static; + +const PAGE_NO_META: &'static str = "tests/fixtures/charset_test_html_no_meta.html"; +static START: Once = Once::new(); + +lazy_static! { + static ref CHARSET_HEADER: Vec<(&'static str, &'static str)> = + vec![("Content-Type", "charset=windows-1252")]; +} + +#[test] +fn test_http_charset_found() { + // Spawn a single instance of a local http server usable by all tests in this module. + START.call_once(|| { + fixtures::spawn_local_http_server(PAGE_NO_META, false, Some(&CHARSET_HEADER)); + }); + + let output_dir = "charset_html_found"; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) + .args(&[fixtures::HTTP_ADDR, "-o", output_dir]) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .unwrap(); + let status = cmd.wait().unwrap(); + assert!(status.success()); + let file_path = fs::read_dir(output_dir) + .unwrap() + .next() + .unwrap() + .unwrap() + .path(); // There is only one file in the directory + + let data_source = fs::read(PAGE_NO_META).unwrap(); + let data_downloaded = fs::read(file_path).unwrap(); + + assert!(fixtures::do_vecs_match(&data_source, &data_downloaded)); + + fs::remove_dir_all(output_dir).unwrap(); +} diff --git a/tests/charset_http_not_found.rs b/tests/charset_http_not_found.rs new file mode 100644 index 0000000..a60aa28 --- /dev/null +++ b/tests/charset_http_not_found.rs @@ -0,0 +1,43 @@ +//! Test for charset detection/conversion + +mod fixtures; + +use std::fs; +use std::process::{Command, Stdio}; +use std::sync::Once; + +use lazy_static::lazy_static; + +const PAGE_NO_META: &'static str = "tests/fixtures/charset_test_html_no_meta.html"; +static START: Once = Once::new(); + +#[test] +fn test_http_charset_found() { + // Spawn a single instance of a local http server usable by all tests in this module. + START.call_once(|| { + fixtures::spawn_local_http_server(PAGE_NO_META, false, None); + }); + + let output_dir = "charset_html_found"; + let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) + .args(&[fixtures::HTTP_ADDR, "-o", output_dir]) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .unwrap(); + let status = cmd.wait().unwrap(); + assert!(status.success()); + let file_path = fs::read_dir(output_dir) + .unwrap() + .next() + .unwrap() + .unwrap() + .path(); // There is only one file in the directory + + let data_source = fs::read(PAGE_NO_META).unwrap(); + let data_downloaded = fs::read(file_path).unwrap(); + + assert!(!fixtures::do_vecs_match(&data_source, &data_downloaded)); + + fs::remove_dir_all(output_dir).unwrap(); +} diff --git a/tests/filters.rs b/tests/filters.rs index 9bfb608..c402564 100644 --- a/tests/filters.rs +++ b/tests/filters.rs @@ -8,14 +8,14 @@ use std::process::Command; use std::process::Stdio; use std::sync::Once; -const ADDR: &'static str = "http://0.0.0.0:8000"; +const PAGE: &'static str = "tests/fixtures/index.html"; static START: Once = Once::new(); #[test] fn test_include_exclude() { // Spawn a single instance of a local http server usable by all tests in this module. START.call_once(|| { - fixtures::spawn_local_http_server(false); + fixtures::spawn_local_http_server(PAGE, false, None); }); // Tests below are grouped together as they depend on the local_http_server above. @@ -28,7 +28,7 @@ fn test_include_exclude() { fn include_filter() { let output_dir = "w2"; let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) - .args(&[ADDR, "-o", "w2", "-i", "mp[3-4]", "-j", "16"]) + .args(&[fixtures::HTTP_ADDR, "-o", "w2", "-i", "mp[3-4]", "-j", "16"]) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) .spawn() @@ -49,7 +49,15 @@ fn include_filter() { fn include_multiple_filters() { let output_dir = "w1"; let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) - .args(&[ADDR, "-o", output_dir, "-i", "(mp[3-4])|(txt)", "-j", "16"]) + .args(&[ + fixtures::HTTP_ADDR, + "-o", + output_dir, + "-i", + "(mp[3-4])|(txt)", + "-j", + "16", + ]) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) .spawn() @@ -68,7 +76,15 @@ fn include_multiple_filters() { fn exclude_filter() { let output_dir = "w3"; let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit")) - .args(&[ADDR, "-o", output_dir, "-e", "jpe?g", "-j", "16"]) + .args(&[ + fixtures::HTTP_ADDR, + "-o", + output_dir, + "-e", + "jpe?g", + "-j", + "16", + ]) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) .spawn() diff --git a/tests/fixtures/charset_test_html.html b/tests/fixtures/charset_test_html.html new file mode 100644 index 0000000..ccf29b6 --- /dev/null +++ b/tests/fixtures/charset_test_html.html @@ -0,0 +1,13 @@ + + +Gamle Gjerpen - Ei Bygdebok for nett. + + + +

Gamle Valebø Redigering av sidene pågår.

+ \ No newline at end of file diff --git a/tests/fixtures/charset_test_html_no_meta.html b/tests/fixtures/charset_test_html_no_meta.html new file mode 100644 index 0000000..c1aa85d --- /dev/null +++ b/tests/fixtures/charset_test_html_no_meta.html @@ -0,0 +1,12 @@ + +Gamle Gjerpen - Ei Bygdebok for nett. + + + +

Gamle Valebø Redigering av sidene pågår.

+ \ No newline at end of file diff --git a/tests/fixtures/mod.rs b/tests/fixtures/mod.rs index c445853..f26d834 100644 --- a/tests/fixtures/mod.rs +++ b/tests/fixtures/mod.rs @@ -4,12 +4,17 @@ use std::process::Stdio; use std::thread; use tiny_http::{Header, Response, Server}; -const PAGE: &'static str = "tests/fixtures/index.html"; +pub const HTTP_ADDR: &'static str = "http://0.0.0.0:8000"; +const ADDR: &'static str = "0.0.0.0:8000"; const AUTH_HEADER: &str = "Authorization"; const AUTH_CREDENTIALS: &str = "Basic dXNlcm5hbWU6cGFzc3dvcmQ="; // base64-encoded "username:password" -pub fn spawn_local_http_server(requires_auth: bool) { - let server = Server::http("0.0.0.0:8000").unwrap(); +pub fn spawn_local_http_server( + page: &'static str, + requires_auth: bool, + headers: Option<&'static Vec<(&'static str, &'static str)>>, +) { + let server = Server::http(ADDR).unwrap(); println!("Spawning http server"); thread::spawn(move || { for request in server.incoming_requests() { @@ -20,15 +25,31 @@ pub fn spawn_local_http_server(requires_auth: bool) { .find(|h| h.field.equiv(AUTH_HEADER)); let valid_auth = check_auth_credentials(auth_header); - if requires_auth && !valid_auth { + let mut response = if requires_auth && !valid_auth { let mut response = Response::from_string("Invalid auth").with_status_code(401); let h = Header::from_bytes("WWW-Authenticate", r#"Basic realm="Test""#).unwrap(); response.add_header(h); - request.respond(response).unwrap(); + response.boxed() } else { - let response = Response::from_file(File::open(PAGE).unwrap()); - request.respond(response).unwrap(); + Response::from_file(File::open(page).unwrap()).boxed() }; + + match headers { + Some(vec) => { + let mut key_vec: Vec = vec![]; + let mut value_vec: Vec = vec![]; + for (key, value) in vec { + key_vec.extend_from_slice(key.as_bytes()); + value_vec.extend_from_slice(value.as_bytes()); + } + + let h = Header::from_bytes(key_vec, value_vec).unwrap(); + response.add_header(h); + } + _ => (), + } + + request.respond(response).unwrap(); } }); } @@ -77,3 +98,8 @@ pub fn get_file_count_with_pattern(pattern: &str, dir: &str) -> Result(a: &Vec, b: &Vec) -> bool { + let matching = a.iter().zip(b.iter()).filter(|&(a, b)| a == b).count(); + matching == a.len() && matching == b.len() +}