From de0b7574975b5cc113142d1522951bfcd94ad2e2 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 17:17:19 +0200 Subject: [PATCH 01/10] feat: Initialize web scraper project Signed-off-by: margarita.surina --- topics/web-scraper/Cargo.lock | 7 +++++++ topics/web-scraper/Cargo.toml | 6 ++++++ topics/web-scraper/src/main.rs | 3 +++ 3 files changed, 16 insertions(+) create mode 100644 topics/web-scraper/Cargo.lock create mode 100644 topics/web-scraper/Cargo.toml create mode 100644 topics/web-scraper/src/main.rs diff --git a/topics/web-scraper/Cargo.lock b/topics/web-scraper/Cargo.lock new file mode 100644 index 0000000..4a0282a --- /dev/null +++ b/topics/web-scraper/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "webcrawl" +version = "0.1.0" diff --git a/topics/web-scraper/Cargo.toml b/topics/web-scraper/Cargo.toml new file mode 100644 index 0000000..8687e1c --- /dev/null +++ b/topics/web-scraper/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "webcrawl" +version = "0.1.0" +edition = "2024" + +[dependencies] diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs new file mode 100644 index 0000000..e7a11a9 --- /dev/null +++ b/topics/web-scraper/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} From 0391107e1707a8940b4298c8f200e051eef28a79 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 17:44:52 +0200 Subject: [PATCH 02/10] feat: add basic CLI Signed-off-by: margarita.surina --- topics/web-scraper/Cargo.lock | 558 +++++++++++++++++++++++++++++++++ topics/web-scraper/Cargo.toml | 2 + topics/web-scraper/src/cli.rs | 16 + topics/web-scraper/src/main.rs | 11 +- 4 files changed, 586 insertions(+), 1 deletion(-) create mode 100644 topics/web-scraper/src/cli.rs diff --git a/topics/web-scraper/Cargo.lock b/topics/web-scraper/Cargo.lock index 4a0282a..bf56d92 100644 --- 
a/topics/web-scraper/Cargo.lock +++ b/topics/web-scraper/Cargo.lock @@ -2,6 +2,564 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + +[[package]] +name = "clap" +version = "4.5.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = 
"clap_derive" +version = "4.5.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = 
"is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "potential_utf" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +dependencies = [ + "zerovec", +] + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + 
"serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "unicode-ident" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" + +[[package]] +name = "url" +version = "2.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "webcrawl" version = "0.1.0" +dependencies = [ + "clap", + "url", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = 
"windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + 
+[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/topics/web-scraper/Cargo.toml b/topics/web-scraper/Cargo.toml index 8687e1c..c94b4da 100644 --- a/topics/web-scraper/Cargo.toml +++ b/topics/web-scraper/Cargo.toml @@ -4,3 +4,5 @@ version = "0.1.0" edition = "2024" [dependencies] +clap = { version = "4.0", features = ["derive"] } +url = "2.4" diff --git a/topics/web-scraper/src/cli.rs b/topics/web-scraper/src/cli.rs new file mode 100644 index 0000000..f84d687 --- /dev/null +++ b/topics/web-scraper/src/cli.rs @@ -0,0 +1,16 @@ +use clap::Parser; +use url::Url; + +/// A web crawler that downloads pages and follows links +#[derive(Parser, Debug)] +#[command(name = "webcrawl")] +#[command(about = "A concurrent web scraper that 
downloads pages and follows links")] +pub struct Args { + /// The starting URL to crawl + #[arg(value_parser = parse_url)] + pub url: Url, +} + +fn parse_url(s: &str) -> Result { + Url::parse(s) +} \ No newline at end of file diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs index e7a11a9..63b2975 100644 --- a/topics/web-scraper/src/main.rs +++ b/topics/web-scraper/src/main.rs @@ -1,3 +1,12 @@ +use clap::Parser; + +mod cli; +use cli::Args; + fn main() { - println!("Hello, world!"); + let args = Args::parse(); + + println!("Starting web crawler..."); + println!("Target URL: {}", args.url); + println!("Web crawler initialized successfully!"); } From 5e313375123f9b7a60b2411be524d241fbe977f7 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 17:56:22 +0200 Subject: [PATCH 03/10] feat: expand CLI with output, depth, and workers options Signed-off-by: margarita.surina --- topics/web-scraper/src/cli.rs | 13 +++++++++++++ topics/web-scraper/src/main.rs | 5 ++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/topics/web-scraper/src/cli.rs b/topics/web-scraper/src/cli.rs index f84d687..206b0d7 100644 --- a/topics/web-scraper/src/cli.rs +++ b/topics/web-scraper/src/cli.rs @@ -1,4 +1,5 @@ use clap::Parser; +use std::path::PathBuf; use url::Url; /// A web crawler that downloads pages and follows links @@ -9,6 +10,18 @@ pub struct Args { /// The starting URL to crawl #[arg(value_parser = parse_url)] pub url: Url, + + /// Output directory for downloaded pages + #[arg(short, long, default_value = "./crawled")] + pub output: PathBuf, + + /// Maximum crawling depth + #[arg(short, long, default_value = "2")] + pub depth: usize, + + /// Number of concurrent workers + #[arg(short, long, default_value = "4")] + pub workers: usize, } fn parse_url(s: &str) -> Result { diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs index 63b2975..990e224 100644 --- a/topics/web-scraper/src/main.rs +++ 
b/topics/web-scraper/src/main.rs @@ -8,5 +8,8 @@ fn main() { println!("Starting web crawler..."); println!("Target URL: {}", args.url); - println!("Web crawler initialized successfully!"); + println!("Output directory: {}", args.output.display()); + println!("Max depth: {}", args.depth); + println!("Workers: {}", args.workers); + println!("Web crawler configured successfully!"); } From 2f34158591448b97589f88c8db42b9fd62fc71e6 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 18:09:24 +0200 Subject: [PATCH 04/10] feat: add HTTP downloader with async support Signed-off-by: margarita.surina --- topics/web-scraper/Cargo.lock | 1319 ++++++++++++++++++++++++-- topics/web-scraper/Cargo.toml | 3 + topics/web-scraper/src/downloader.rs | 47 + topics/web-scraper/src/main.rs | 36 +- 4 files changed, 1318 insertions(+), 87 deletions(-) create mode 100644 topics/web-scraper/src/downloader.rs diff --git a/topics/web-scraper/Cargo.lock b/topics/web-scraper/Cargo.lock index bf56d92..7c1fe3a 100644 --- a/topics/web-scraper/Cargo.lock +++ b/topics/web-scraper/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "anstream" version = "0.6.21" @@ -38,7 +53,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -49,9 +64,76 @@ checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.60.2", +] + +[[package]] +name = "anyhow" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cc" +version = "1.2.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" +dependencies = [ + "find-msvc-tools", + "shlex", ] +[[package]] +name = "cfg-if" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" + [[package]] name = "clap" version = "4.5.48" @@ -98,6 +180,22 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "displaydoc" version = "0.2.5" @@ -109,6 +207,64 @@ dependencies = [ "syn", ] +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "find-msvc-tools" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -118,12 +274,165 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "pin-utils", +] + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.7+wasi-0.2.4", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 
+[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + [[package]] name = "icu_collections" version = "2.0.0" @@ -232,120 +541,177 @@ dependencies = [ ] [[package]] -name = "is_terminal_polyfill" -version = "1.70.1" +name = "indexmap" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 
+checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" +dependencies = [ + "equivalent", + "hashbrown", +] [[package]] -name = "litemap" -version = "0.8.0" +name = "io-uring" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" +dependencies = [ + "bitflags 2.9.4", + "cfg-if", + "libc", +] [[package]] -name = "once_cell_polyfill" +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] -name = "percent-encoding" -version = "2.3.2" +name = "itoa" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] -name = "potential_utf" -version = "0.1.3" +name = "js-sys" +version = "0.3.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" dependencies = [ - "zerovec", + "once_cell", + "wasm-bindgen", ] [[package]] -name = "proc-macro2" -version = "1.0.101" +name = "libc" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" -dependencies = [ - "unicode-ident", -] +checksum = 
"2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] -name = "quote" -version = "1.0.41" +name = "linux-raw-sys" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "proc-macro2", + "scopeguard", ] [[package]] -name = "serde" -version = "1.0.228" +name = "log" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ - "serde_core", - "serde_derive", + "adler2", ] [[package]] -name = "serde_core" -version = "1.0.228" +name = "mio" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" dependencies = [ - "serde_derive", + "libc", + "wasi 0.11.1+wasi-snapshot-preview1", + "windows-sys 0.59.0", ] [[package]] -name = "serde_derive" -version = "1.0.228" +name = "native-tls" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" dependencies = [ - "proc-macro2", - "quote", - "syn", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", ] [[package]] -name = "smallvec" -version = "1.15.1" +name = "object" +version = "0.37.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] [[package]] -name = "stable_deref_trait" -version = "1.2.1" +name = "once_cell" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] -name = "strsim" -version = "0.11.1" +name = "once_cell_polyfill" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" [[package]] -name = "syn" -version = "2.0.106" +name = "openssl" +version = "0.10.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" 
+checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", + "bitflags 2.9.4", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", ] [[package]] -name = "synstructure" -version = "0.13.2" +name = "openssl-macros" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", @@ -353,35 +719,521 @@ dependencies = [ ] [[package]] -name = "tinystr" -version = "0.8.1" +name = "openssl-probe" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-sys" +version = "0.9.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571" dependencies = [ - "displaydoc", - "zerovec", + "cc", + "libc", + "pkg-config", + "vcpkg", ] [[package]] -name = "unicode-ident" -version = "1.0.19" +name = "parking_lot" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] [[package]] -name = "url" -version = "2.5.7" +name = "parking_lot_core" +version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ - 
"form_urlencoded", - "idna", - "percent-encoding", - "serde", + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", ] [[package]] -name = "utf8_iter" +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "potential_utf" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +dependencies = [ + "zerovec", +] + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.9.4", +] + +[[package]] +name = "reqwest" +version = "0.11.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-tls", + "ipnet", + "js-sys", + "log", + "mime", + "native-tls", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-native-tls", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" + +[[package]] +name = "rustix" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +dependencies = [ + "bitflags 2.9.4", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.60.2", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.9.4", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", + "serde_core", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name 
= "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys 0.60.2", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + 
"displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.47.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +dependencies = [ + "backtrace", + "bytes", + "io-uring", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "slab", + "socket2 0.6.0", + "tokio-macros", + "windows-sys 0.59.0", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" 
+version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "unicode-ident" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" + +[[package]] +name = "url" +version = "2.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" @@ -392,11 +1244,135 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasi" +version = "0.14.7+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +dependencies = [ + "wasip2", +] + +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" 
+version = "0.3.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "webcrawl" version = "0.1.0" dependencies = [ + "anyhow", "clap", + "reqwest", + "tokio", "url", ] @@ -406,13 +1382,80 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets", + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = 
[ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -422,64 +1465,170 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_aarch64_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + [[package]] name = "windows_i686_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_i686_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "windows_x86_64_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + [[package]] name = "writeable" version = "0.6.1" diff --git a/topics/web-scraper/Cargo.toml b/topics/web-scraper/Cargo.toml index c94b4da..d2e6df6 100644 --- a/topics/web-scraper/Cargo.toml +++ b/topics/web-scraper/Cargo.toml @@ -6,3 +6,6 @@ edition = "2024" [dependencies] clap = { version = "4.0", features = ["derive"] } url = "2.4" +tokio = { version = "1.0", features = ["full"] } +reqwest = { version = "0.11", features = ["json"] } +anyhow = "1.0" diff --git a/topics/web-scraper/src/downloader.rs b/topics/web-scraper/src/downloader.rs new file mode 100644 index 0000000..47595f1 --- /dev/null +++ b/topics/web-scraper/src/downloader.rs @@ -0,0 +1,47 @@ +use anyhow::Result; +use reqwest::Client; +use std::time::Duration; +use url::Url; + +/// HTTP client for downloading web pages +pub struct Downloader { + client: Client, +} + +/// Downloaded page data +pub struct Page { + pub url: Url, + pub content: String, +} + +impl Downloader { + /// Create a new downloader with reasonable defaults + pub fn new() -> Result { + let client = Client::builder() + .timeout(Duration::from_secs(30)) + .user_agent("webcrawl/0.1.0") + .build()?; + + Ok(Downloader { client }) + } + + /// Download a page from the given URL + pub async fn download(&self, url: Url) -> Result 
{ + println!("Downloading: {}", url); + + let response = self.client.get(url.clone()).send().await?; + + if !response.status().is_success() { + return Err(anyhow::anyhow!("HTTP error {}: {}", response.status(), url)); + } + + let content = response.text().await?; + + println!("Downloaded {} bytes from {}", content.len(), url); + + Ok(Page { + url, + content, + }) + } +} \ No newline at end of file diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs index 990e224..42107ed 100644 --- a/topics/web-scraper/src/main.rs +++ b/topics/web-scraper/src/main.rs @@ -1,9 +1,14 @@ use clap::Parser; +use anyhow::Result; mod cli; +mod downloader; + use cli::Args; +use downloader::Downloader; -fn main() { +#[tokio::main] +async fn main() -> Result<()> { let args = Args::parse(); println!("Starting web crawler..."); @@ -11,5 +16,32 @@ fn main() { println!("Output directory: {}", args.output.display()); println!("Max depth: {}", args.depth); println!("Workers: {}", args.workers); - println!("Web crawler configured successfully!"); + println!(); + + // Test the downloader + println!("Testing downloader..."); + let downloader = Downloader::new()?; + + match downloader.download(args.url).await { + Ok(page) => { + println!("Successfully downloaded page!"); + println!("Final URL: {}", page.url); + println!("Content preview (first 200 chars):"); + let preview = if page.content.len() > 200 { + &page.content[..200] + } else { + &page.content + }; + println!("{}", preview); + if page.content.len() > 200 { + println!("... 
(truncated)"); + } + } + Err(e) => { + println!("Failed to download: {}", e); + } + } + + println!("\nDownloader test completed!"); + Ok(()) } From 1aebaa635da611ee57202d69457f6d2958e317a4 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 18:24:10 +0200 Subject: [PATCH 05/10] feat: add HTML parser for link extraction Signed-off-by: margarita.surina --- topics/web-scraper/Cargo.lock | 451 ++++++++++++++++++++++++++++++- topics/web-scraper/Cargo.toml | 1 + topics/web-scraper/src/main.rs | 28 +- topics/web-scraper/src/parser.rs | 59 ++++ 4 files changed, 520 insertions(+), 19 deletions(-) create mode 100644 topics/web-scraper/src/parser.rs diff --git a/topics/web-scraper/Cargo.lock b/topics/web-scraper/Cargo.lock index 7c1fe3a..4644eac 100644 --- a/topics/web-scraper/Cargo.lock +++ b/topics/web-scraper/Cargo.lock @@ -17,6 +17,19 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.3", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "anstream" version = "0.6.21" @@ -112,6 +125,12 @@ version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.10.1" @@ -165,7 +184,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.106", ] [[package]] @@ -196,6 +215,40 @@ version = "0.8.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cssparser" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.3", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.106", +] + +[[package]] +name = "derive_more" +version = "0.99.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -204,9 +257,30 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", ] +[[package]] +name = "dtoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -229,7 +303,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -274,6 +348,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures-channel" version = "0.3.31" @@ -313,6 +397,35 @@ dependencies = [ "pin-utils", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.1+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.3.3" @@ -362,6 +475,20 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "http" version = "0.2.12" @@ -622,6 +749,26 @@ version = "0.4.28" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "memchr" version = "2.7.6" @@ -671,6 +818,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "object" version = "0.37.3" @@ -715,7 +868,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", ] [[package]] @@ -765,6 +918,86 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher 1.0.1", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -792,6 +1025,21 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro2" version = "1.0.101" @@ -816,6 +1064,36 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -881,7 +1159,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -920,6 +1198,23 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c95a930e03325234c18c7071fd2b60118307e025d6fff3e12745ffbf63a3d29c" +dependencies = [ + "ahash", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "once_cell", + "selectors", + "smallvec", + "tendril", +] + [[package]] name = "security-framework" version = "2.11.1" @@ -943,6 +1238,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +dependencies = [ + "bitflags 2.9.4", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "serde" version = "1.0.228" @@ -970,7 +1284,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", ] [[package]] @@ -998,6 +1312,15 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1013,6 +1336,18 @@ dependencies = [ "libc", ] +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.11" @@ -1051,12 +1386,48 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.106" @@ -1082,7 +1453,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", ] [[package]] @@ -1113,10 +1484,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", - "getrandom", + "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys 0.60.2", + "windows-sys 0.61.2", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", ] [[package]] @@ -1157,7 +1539,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", ] [[package]] @@ -1220,6 +1602,12 @@ version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = 
"url" version = "2.5.7" @@ -1232,6 +1620,12 @@ dependencies = [ "serde", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -1250,6 +1644,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "want" version = "0.3.1" @@ -1306,7 +1706,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn", + "syn 2.0.106", "wasm-bindgen-shared", ] @@ -1341,7 +1741,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1372,6 +1772,7 @@ dependencies = [ "anyhow", "clap", "reqwest", + "scraper", "tokio", "url", ] @@ -1655,10 +2056,30 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "zerofrom" version = "0.1.6" @@ -1676,7 +2097,7 @@ checksum = 
"d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", "synstructure", ] @@ -1710,5 +2131,5 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.106", ] diff --git a/topics/web-scraper/Cargo.toml b/topics/web-scraper/Cargo.toml index d2e6df6..e55894d 100644 --- a/topics/web-scraper/Cargo.toml +++ b/topics/web-scraper/Cargo.toml @@ -9,3 +9,4 @@ url = "2.4" tokio = { version = "1.0", features = ["full"] } reqwest = { version = "0.11", features = ["json"] } anyhow = "1.0" +scraper = "0.17" diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs index 42107ed..e7cd5d5 100644 --- a/topics/web-scraper/src/main.rs +++ b/topics/web-scraper/src/main.rs @@ -3,9 +3,11 @@ use anyhow::Result; mod cli; mod downloader; +mod parser; use cli::Args; use downloader::Downloader; +use parser::Parser as HtmlParser; #[tokio::main] async fn main() -> Result<()> { @@ -18,11 +20,12 @@ async fn main() -> Result<()> { println!("Workers: {}", args.workers); println!(); - // Test the downloader - println!("Testing downloader..."); + // Test the downloader and parser + println!("Testing downloader and parser..."); let downloader = Downloader::new()?; + let parser = HtmlParser::new()?; - match downloader.download(args.url).await { + match downloader.download(args.url.clone()).await { Ok(page) => { println!("Successfully downloaded page!"); println!("Final URL: {}", page.url); @@ -36,12 +39,29 @@ async fn main() -> Result<()> { if page.content.len() > 200 { println!("... (truncated)"); } + + // Test the parser + println!("\nExtracting links..."); + match parser.extract_links(&page.content, &page.url) { + Ok(links) => { + println!("Found {} links:", links.len()); + for (i, link) in links.iter().take(10).enumerate() { + println!(" {}. {}", i + 1, link); + } + if links.len() > 10 { + println!(" ... 
and {} more links", links.len() - 10); + } + } + Err(e) => { + println!("Failed to extract links: {}", e); + } + } } Err(e) => { println!("Failed to download: {}", e); } } - println!("\nDownloader test completed!"); + println!("\nDownloader and parser test completed!"); Ok(()) } diff --git a/topics/web-scraper/src/parser.rs b/topics/web-scraper/src/parser.rs new file mode 100644 index 0000000..c732851 --- /dev/null +++ b/topics/web-scraper/src/parser.rs @@ -0,0 +1,59 @@ +use anyhow::Result; +use scraper::{Html, Selector}; +use url::Url; + +/// HTML parser for extracting links from pages +pub struct Parser { + link_selector: Selector, +} + +impl Parser { + /// Create a new parser + pub fn new() -> Result { + let link_selector = Selector::parse("a[href]") + .map_err(|e| anyhow::anyhow!("Failed to create CSS selector: {:?}", e))?; + + Ok(Parser { link_selector }) + } + + /// Extract all links from an HTML page + pub fn extract_links(&self, html: &str, base_url: &Url) -> Result> { + let document = Html::parse_document(html); + let mut links = Vec::new(); + + for element in document.select(&self.link_selector) { + if let Some(href) = element.value().attr("href") { + // Skip empty hrefs and fragments + if href.is_empty() || href.starts_with('#') { + continue; + } + + // Try to resolve the URL relative to the base + match base_url.join(href) { + Ok(url) => { + // Only include HTTP/HTTPS URLs from the same domain + if url.scheme() == "http" || url.scheme() == "https" { + if let Some(base_host) = base_url.host_str() { + if let Some(url_host) = url.host_str() { + if base_host == url_host { + links.push(url); + } + } + } + } + } + Err(_) => { + // Skip invalid URLs + continue; + } + } + } + } + + // Remove duplicates + links.sort(); + links.dedup(); + + Ok(links) + } +} \ No newline at end of file From d5a196bda3b64cf055d6a878e35ed1caab2081c3 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 20:17:24 +0200 Subject: [PATCH 06/10] feat: add hierarchical 
file storage system Signed-off-by: margarita.surina --- topics/web-scraper/src/main.rs | 46 +++++++++++++++-- topics/web-scraper/src/storage.rs | 83 +++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 topics/web-scraper/src/storage.rs diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs index e7cd5d5..ad46f06 100644 --- a/topics/web-scraper/src/main.rs +++ b/topics/web-scraper/src/main.rs @@ -4,10 +4,12 @@ use anyhow::Result; mod cli; mod downloader; mod parser; +mod storage; use cli::Args; use downloader::Downloader; use parser::Parser as HtmlParser; +use storage::Storage; #[tokio::main] async fn main() -> Result<()> { @@ -20,10 +22,14 @@ async fn main() -> Result<()> { println!("Workers: {}", args.workers); println!(); - // Test the downloader and parser - println!("Testing downloader and parser..."); + // Test the downloader, parser, and storage + println!("Testing downloader, parser, and storage..."); let downloader = Downloader::new()?; let parser = HtmlParser::new()?; + let storage = Storage::new(args.output.clone()); + + // Initialize storage + storage.init().await?; match downloader.download(args.url.clone()).await { Ok(page) => { @@ -40,6 +46,18 @@ async fn main() -> Result<()> { println!("... (truncated)"); } + // Test the storage + println!("\nSaving page to storage..."); + match storage.save_page(&page.url, &page.content, 0).await { + Ok(file_path) => { + println!("Page saved successfully!"); + println!("File path: {}", file_path.display()); + } + Err(e) => { + println!("Failed to save page: {}", e); + } + } + // Test the parser println!("\nExtracting links..."); match parser.extract_links(&page.content, &page.url) { @@ -51,6 +69,28 @@ async fn main() -> Result<()> { if links.len() > 10 { println!(" ... 
and {} more links", links.len() - 10); } + + // Test saving a few linked pages (simulate depth 1) + if !links.is_empty() { + println!("\nTesting hierarchical storage with linked pages..."); + for (i, link) in links.iter().take(2).enumerate() { + match downloader.download(link.clone()).await { + Ok(linked_page) => { + match storage.save_page(&linked_page.url, &linked_page.content, 1).await { + Ok(file_path) => { + println!("Saved linked page {}: {}", i + 1, file_path.display()); + } + Err(e) => { + println!("Failed to save linked page {}: {}", i + 1, e); + } + } + } + Err(e) => { + println!("Failed to download linked page {}: {}", i + 1, e); + } + } + } + } } Err(e) => { println!("Failed to extract links: {}", e); @@ -62,6 +102,6 @@ async fn main() -> Result<()> { } } - println!("\nDownloader and parser test completed!"); + println!("\nDownloader, parser, and storage test completed!"); Ok(()) } diff --git a/topics/web-scraper/src/storage.rs b/topics/web-scraper/src/storage.rs new file mode 100644 index 0000000..639321c --- /dev/null +++ b/topics/web-scraper/src/storage.rs @@ -0,0 +1,83 @@ +use anyhow::Result; +use std::path::PathBuf; +use tokio::fs; +use url::Url; + +/// File system storage manager +pub struct Storage { + pub base_path: PathBuf, +} + +impl Storage { + /// Create a new storage manager + pub fn new(base_path: PathBuf) -> Self { + Storage { base_path } + } + + /// Initialize the storage directory + pub async fn init(&self) -> Result<()> { + fs::create_dir_all(&self.base_path).await?; + println!("Initialized storage directory: {}", self.base_path.display()); + Ok(()) + } + + /// Save a page to the appropriate location + pub async fn save_page(&self, url: &Url, content: &str, depth: usize) -> Result { + let file_path = self.url_to_path(url, depth)?; + + // Create parent directories + if let Some(parent) = file_path.parent() { + fs::create_dir_all(parent).await?; + } + + // Write the content + fs::write(&file_path, content).await?; + + println!("Saved: {} 
-> {}", url, file_path.display()); + Ok(file_path) + } + + /// Convert a URL to a file path following the hierarchical structure + fn url_to_path(&self, url: &Url, depth: usize) -> Result { + let mut path = self.base_path.clone(); + + // Add depth folder for organization + path.push(format!("depth_{}", depth)); + + // Add host + if let Some(host) = url.host_str() { + path.push(sanitize_filename(host)); + } + + // Add path components + let url_path = url.path(); + if url_path != "/" && !url_path.is_empty() { + for segment in url_path.split('/').filter(|s| !s.is_empty()) { + path.push(sanitize_filename(segment)); + } + } + + // Ensure we have a filename with .html extension + if path.is_dir() || path.to_string_lossy().ends_with('/') || url_path == "/" { + path.push("index.html"); + } else if !path.to_string_lossy().ends_with(".html") { + path.set_extension("html"); + } + + Ok(path) + } +} + +/// Sanitize a string to be safe for use as a filename +fn sanitize_filename(input: &str) -> String { + input + .chars() + .map(|c| match c { + '/' | '\\' | ':' | '*' | '?' 
| '"' | '<' | '>' | '|' => '_', + c if c.is_control() => '_', + c => c, + }) + .collect::() + .trim_matches('.') + .to_string() +} \ No newline at end of file From b554f3e13f4d56d069bfd4e46ca998a8c06ec145 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 20:37:20 +0200 Subject: [PATCH 07/10] feat: implement concurrent workers system Signed-off-by: margarita.surina --- topics/web-scraper/src/crawler.rs | 136 ++++++++++++++++++++++++++++++ topics/web-scraper/src/main.rs | 94 +++------------------ topics/web-scraper/src/worker.rs | 109 ++++++++++++++++++++++++ 3 files changed, 258 insertions(+), 81 deletions(-) create mode 100644 topics/web-scraper/src/crawler.rs create mode 100644 topics/web-scraper/src/worker.rs diff --git a/topics/web-scraper/src/crawler.rs b/topics/web-scraper/src/crawler.rs new file mode 100644 index 0000000..9296a30 --- /dev/null +++ b/topics/web-scraper/src/crawler.rs @@ -0,0 +1,136 @@ +use crate::storage::Storage; +use crate::worker::{Worker, WorkItem, WorkResult}; +use anyhow::Result; +use std::collections::{HashSet, VecDeque}; +use tokio::sync::mpsc; +use url::Url; + +/// Simple crawler coordinator that manages workers +pub struct SimpleCrawler { + storage: Storage, + max_depth: usize, + num_workers: usize, + visited: HashSet, + queue: VecDeque, +} + +impl SimpleCrawler { + /// Create a new simple crawler + pub fn new( + storage: Storage, + max_depth: usize, + num_workers: usize, + start_url: Url, + ) -> Self { + let mut queue = VecDeque::new(); + queue.push_back(WorkItem { + url: start_url, + depth: 0, + }); + + SimpleCrawler { + storage, + max_depth, + num_workers, + visited: HashSet::new(), + queue, + } + } + + /// Start the crawling process + pub async fn crawl(&mut self) -> Result<()> { + println!("Starting crawler with {} workers, max depth {}", + self.num_workers, self.max_depth); + + // Initialize storage + self.storage.init().await?; + + // Create channels for worker communication + let (result_sender, mut 
result_receiver) = mpsc::channel::(100); + + // Create individual work channels and start workers + let mut work_senders = Vec::new(); + let mut worker_handles = Vec::new(); + + for i in 0..self.num_workers { + let (work_sender, work_receiver) = mpsc::channel::(10); + work_senders.push(work_sender); + + let worker_storage = Storage::new(self.storage.base_path.clone()); + let worker = Worker::new(i, worker_storage)?; + let result_tx = result_sender.clone(); + + let handle = tokio::spawn(async move { + worker.run(work_receiver, result_tx).await; + }); + worker_handles.push(handle); + } + + // Drop the original result sender so we can detect when all workers finish + drop(result_sender); + + let mut active_work = 0; + let mut total_processed = 0; + let mut current_worker = 0; + + // Main crawling loop + loop { + // Send work items to workers (round-robin) + while let Some(work_item) = self.queue.pop_front() { + if self.visited.contains(&work_item.url) { + continue; + } + + self.visited.insert(work_item.url.clone()); + + // Send to next worker (round-robin) + if work_senders[current_worker].send(work_item).await.is_err() { + // Worker channel is closed, break the loop + break; + } + + current_worker = (current_worker + 1) % self.num_workers; + active_work += 1; + } + + // If no active work and queue is empty, we're done + if active_work == 0 { + break; + } + + // Process results from workers + if let Some(result) = result_receiver.recv().await { + active_work -= 1; + total_processed += 1; + + if result.success { + // Add new links to queue if we haven't reached max depth + if result.depth < self.max_depth { + for link in result.links { + if !self.visited.contains(&link) { + self.queue.push_back(WorkItem { + url: link, + depth: result.depth + 1, + }); + } + } + } + } + } else { + // All workers have finished + break; + } + } + + // Close all work channels to signal workers to stop + drop(work_senders); + + // Wait for all workers to finish + for handle in worker_handles 
{ + let _ = handle.await; + } + + println!("Crawling complete! Processed {} pages.", total_processed); + Ok(()) + } +} \ No newline at end of file diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs index ad46f06..377cb8b 100644 --- a/topics/web-scraper/src/main.rs +++ b/topics/web-scraper/src/main.rs @@ -5,11 +5,12 @@ mod cli; mod downloader; mod parser; mod storage; +mod worker; +mod crawler; use cli::Args; -use downloader::Downloader; -use parser::Parser as HtmlParser; use storage::Storage; +use crawler::SimpleCrawler; #[tokio::main] async fn main() -> Result<()> { @@ -22,86 +23,17 @@ async fn main() -> Result<()> { println!("Workers: {}", args.workers); println!(); - // Test the downloader, parser, and storage - println!("Testing downloader, parser, and storage..."); - let downloader = Downloader::new()?; - let parser = HtmlParser::new()?; - let storage = Storage::new(args.output.clone()); + // Create and start the concurrent crawler + let storage = Storage::new(args.output); + let mut crawler = SimpleCrawler::new( + storage, + args.depth, + args.workers, + args.url, + ); - // Initialize storage - storage.init().await?; + crawler.crawl().await?; - match downloader.download(args.url.clone()).await { - Ok(page) => { - println!("Successfully downloaded page!"); - println!("Final URL: {}", page.url); - println!("Content preview (first 200 chars):"); - let preview = if page.content.len() > 200 { - &page.content[..200] - } else { - &page.content - }; - println!("{}", preview); - if page.content.len() > 200 { - println!("... 
(truncated)"); - } - - // Test the storage - println!("\nSaving page to storage..."); - match storage.save_page(&page.url, &page.content, 0).await { - Ok(file_path) => { - println!("Page saved successfully!"); - println!("File path: {}", file_path.display()); - } - Err(e) => { - println!("Failed to save page: {}", e); - } - } - - // Test the parser - println!("\nExtracting links..."); - match parser.extract_links(&page.content, &page.url) { - Ok(links) => { - println!("Found {} links:", links.len()); - for (i, link) in links.iter().take(10).enumerate() { - println!(" {}. {}", i + 1, link); - } - if links.len() > 10 { - println!(" ... and {} more links", links.len() - 10); - } - - // Test saving a few linked pages (simulate depth 1) - if !links.is_empty() { - println!("\nTesting hierarchical storage with linked pages..."); - for (i, link) in links.iter().take(2).enumerate() { - match downloader.download(link.clone()).await { - Ok(linked_page) => { - match storage.save_page(&linked_page.url, &linked_page.content, 1).await { - Ok(file_path) => { - println!("Saved linked page {}: {}", i + 1, file_path.display()); - } - Err(e) => { - println!("Failed to save linked page {}: {}", i + 1, e); - } - } - } - Err(e) => { - println!("Failed to download linked page {}: {}", i + 1, e); - } - } - } - } - } - Err(e) => { - println!("Failed to extract links: {}", e); - } - } - } - Err(e) => { - println!("Failed to download: {}", e); - } - } - - println!("\nDownloader, parser, and storage test completed!"); + println!("Web crawler completed successfully!"); Ok(()) } diff --git a/topics/web-scraper/src/worker.rs b/topics/web-scraper/src/worker.rs new file mode 100644 index 0000000..6860e61 --- /dev/null +++ b/topics/web-scraper/src/worker.rs @@ -0,0 +1,109 @@ +use crate::downloader::Downloader; +use crate::parser::Parser as HtmlParser; +use crate::storage::Storage; +use anyhow::Result; +use tokio::sync::mpsc; +use url::Url; + +/// Message types for worker communication 
+#[derive(Debug, Clone)] +pub struct WorkItem { + pub url: Url, + pub depth: usize, +} + +#[derive(Debug)] +pub struct WorkResult { + pub url: Url, + pub depth: usize, + pub links: Vec, + pub success: bool, + pub error: Option, +} + +/// Worker that downloads and processes web pages +pub struct Worker { + id: usize, + downloader: Downloader, + parser: HtmlParser, + storage: Storage, +} + +impl Worker { + /// Create a new worker + pub fn new(id: usize, storage: Storage) -> Result { + let downloader = Downloader::new()?; + let parser = HtmlParser::new()?; + + Ok(Worker { + id, + downloader, + parser, + storage, + }) + } + + /// Start the worker loop + pub async fn run( + mut self, + mut receiver: mpsc::Receiver, + sender: mpsc::Sender, + ) { + println!("Worker {} started", self.id); + + while let Some(work_item) = receiver.recv().await { + let result = self.process_work_item(work_item).await; + + if let Err(e) = sender.send(result).await { + eprintln!("Worker {} failed to send result: {}", self.id, e); + break; + } + } + + println!("Worker {} shutting down", self.id); + } + + /// Process a single work item + async fn process_work_item(&mut self, work_item: WorkItem) -> WorkResult { + let url = work_item.url.clone(); + let depth = work_item.depth; + + match self.download_and_process(&work_item).await { + Ok(links) => WorkResult { + url, + depth, + links, + success: true, + error: None, + }, + Err(e) => { + let error_msg = format!("{}", e); + eprintln!("Worker {} failed to process {}: {}", self.id, url, error_msg); + WorkResult { + url, + depth, + links: Vec::new(), + success: false, + error: Some(error_msg), + } + } + } + } + + /// Download and process a single page + async fn download_and_process(&mut self, work_item: &WorkItem) -> Result> { + // Download the page + let page = self.downloader.download(work_item.url.clone()).await?; + + // Save the page + self.storage.save_page(&page.url, &page.content, work_item.depth).await?; + + // Extract links + let links = 
self.parser.extract_links(&page.content, &page.url)?; + + println!("Worker {} processed {} (depth {}) - found {} links", + self.id, page.url, work_item.depth, links.len()); + + Ok(links) + } +} \ No newline at end of file From 1c40a01b18ec8bfc559ede087368f829c37a9c57 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 20:49:27 +0200 Subject: [PATCH 08/10] chore: code formatting Signed-off-by: margarita.surina --- topics/web-scraper/src/cli.rs | 8 ++-- topics/web-scraper/src/crawler.rs | 71 ++++++++++++++++------------ topics/web-scraper/src/downloader.rs | 21 ++++---- topics/web-scraper/src/main.rs | 21 ++++---- topics/web-scraper/src/parser.rs | 14 +++--- topics/web-scraper/src/storage.rs | 29 +++++++----- topics/web-scraper/src/worker.rs | 44 ++++++++++------- 7 files changed, 112 insertions(+), 96 deletions(-) diff --git a/topics/web-scraper/src/cli.rs b/topics/web-scraper/src/cli.rs index 206b0d7..d291ceb 100644 --- a/topics/web-scraper/src/cli.rs +++ b/topics/web-scraper/src/cli.rs @@ -10,15 +10,15 @@ pub struct Args { /// The starting URL to crawl #[arg(value_parser = parse_url)] pub url: Url, - + /// Output directory for downloaded pages #[arg(short, long, default_value = "./crawled")] pub output: PathBuf, - + /// Maximum crawling depth #[arg(short, long, default_value = "2")] pub depth: usize, - + /// Number of concurrent workers #[arg(short, long, default_value = "4")] pub workers: usize, @@ -26,4 +26,4 @@ pub struct Args { fn parse_url(s: &str) -> Result { Url::parse(s) -} \ No newline at end of file +} diff --git a/topics/web-scraper/src/crawler.rs b/topics/web-scraper/src/crawler.rs index 9296a30..7cdf898 100644 --- a/topics/web-scraper/src/crawler.rs +++ b/topics/web-scraper/src/crawler.rs @@ -1,5 +1,5 @@ use crate::storage::Storage; -use crate::worker::{Worker, WorkItem, WorkResult}; +use crate::worker::{WorkItem, WorkResult, Worker}; use anyhow::Result; use std::collections::{HashSet, VecDeque}; use tokio::sync::mpsc; @@ -16,18 
+16,13 @@ pub struct SimpleCrawler { impl SimpleCrawler { /// Create a new simple crawler - pub fn new( - storage: Storage, - max_depth: usize, - num_workers: usize, - start_url: Url, - ) -> Self { + pub fn new(storage: Storage, max_depth: usize, num_workers: usize, start_url: Url) -> Self { let mut queue = VecDeque::new(); queue.push_back(WorkItem { url: start_url, depth: 0, }); - + SimpleCrawler { storage, max_depth, @@ -36,43 +31,45 @@ impl SimpleCrawler { queue, } } - + /// Start the crawling process pub async fn crawl(&mut self) -> Result<()> { - println!("Starting crawler with {} workers, max depth {}", - self.num_workers, self.max_depth); - + println!( + "Starting crawler with {} workers, max depth {}", + self.num_workers, self.max_depth + ); + // Initialize storage self.storage.init().await?; - + // Create channels for worker communication let (result_sender, mut result_receiver) = mpsc::channel::(100); - + // Create individual work channels and start workers let mut work_senders = Vec::new(); let mut worker_handles = Vec::new(); - + for i in 0..self.num_workers { let (work_sender, work_receiver) = mpsc::channel::(10); work_senders.push(work_sender); - + let worker_storage = Storage::new(self.storage.base_path.clone()); let worker = Worker::new(i, worker_storage)?; let result_tx = result_sender.clone(); - + let handle = tokio::spawn(async move { worker.run(work_receiver, result_tx).await; }); worker_handles.push(handle); } - + // Drop the original result sender so we can detect when all workers finish drop(result_sender); - + let mut active_work = 0; let mut total_processed = 0; let mut current_worker = 0; - + // Main crawling loop loop { // Send work items to workers (round-robin) @@ -80,30 +77,37 @@ impl SimpleCrawler { if self.visited.contains(&work_item.url) { continue; } - + self.visited.insert(work_item.url.clone()); - + // Send to next worker (round-robin) if work_senders[current_worker].send(work_item).await.is_err() { // Worker channel is closed, 
break the loop break; } - + current_worker = (current_worker + 1) % self.num_workers; active_work += 1; } - + // If no active work and queue is empty, we're done if active_work == 0 { break; } - + // Process results from workers if let Some(result) = result_receiver.recv().await { active_work -= 1; total_processed += 1; - + if result.success { + println!( + "Completed: {} (depth {}) - {} links found", + result.url, + result.depth, + result.links.len() + ); + // Add new links to queue if we haven't reached max depth if result.depth < self.max_depth { for link in result.links { @@ -115,22 +119,29 @@ impl SimpleCrawler { } } } + } else { + println!( + "Failed: {} (depth {}) - {}", + result.url, + result.depth, + result.error.unwrap_or_else(|| "Unknown error".to_string()) + ); } } else { // All workers have finished break; } } - + // Close all work channels to signal workers to stop drop(work_senders); - + // Wait for all workers to finish for handle in worker_handles { let _ = handle.await; } - + println!("Crawling complete! 
Processed {} pages.", total_processed); Ok(()) } -} \ No newline at end of file +} diff --git a/topics/web-scraper/src/downloader.rs b/topics/web-scraper/src/downloader.rs index 47595f1..f44f76f 100644 --- a/topics/web-scraper/src/downloader.rs +++ b/topics/web-scraper/src/downloader.rs @@ -21,27 +21,24 @@ impl Downloader { .timeout(Duration::from_secs(30)) .user_agent("webcrawl/0.1.0") .build()?; - + Ok(Downloader { client }) } - + /// Download a page from the given URL pub async fn download(&self, url: Url) -> Result { println!("Downloading: {}", url); - + let response = self.client.get(url.clone()).send().await?; - + if !response.status().is_success() { return Err(anyhow::anyhow!("HTTP error {}: {}", response.status(), url)); } - + let content = response.text().await?; - + println!("Downloaded {} bytes from {}", content.len(), url); - - Ok(Page { - url, - content, - }) + + Ok(Page { url, content }) } -} \ No newline at end of file +} diff --git a/topics/web-scraper/src/main.rs b/topics/web-scraper/src/main.rs index 377cb8b..983f492 100644 --- a/topics/web-scraper/src/main.rs +++ b/topics/web-scraper/src/main.rs @@ -1,39 +1,34 @@ -use clap::Parser; use anyhow::Result; +use clap::Parser; mod cli; +mod crawler; mod downloader; mod parser; mod storage; mod worker; -mod crawler; use cli::Args; -use storage::Storage; use crawler::SimpleCrawler; +use storage::Storage; #[tokio::main] async fn main() -> Result<()> { let args = Args::parse(); - + println!("Starting web crawler..."); println!("Target URL: {}", args.url); println!("Output directory: {}", args.output.display()); println!("Max depth: {}", args.depth); println!("Workers: {}", args.workers); println!(); - + // Create and start the concurrent crawler let storage = Storage::new(args.output); - let mut crawler = SimpleCrawler::new( - storage, - args.depth, - args.workers, - args.url, - ); - + let mut crawler = SimpleCrawler::new(storage, args.depth, args.workers, args.url); + crawler.crawl().await?; - + 
println!("Web crawler completed successfully!"); Ok(()) } diff --git a/topics/web-scraper/src/parser.rs b/topics/web-scraper/src/parser.rs index c732851..6c18485 100644 --- a/topics/web-scraper/src/parser.rs +++ b/topics/web-scraper/src/parser.rs @@ -12,22 +12,22 @@ impl Parser { pub fn new() -> Result { let link_selector = Selector::parse("a[href]") .map_err(|e| anyhow::anyhow!("Failed to create CSS selector: {:?}", e))?; - + Ok(Parser { link_selector }) } - + /// Extract all links from an HTML page pub fn extract_links(&self, html: &str, base_url: &Url) -> Result> { let document = Html::parse_document(html); let mut links = Vec::new(); - + for element in document.select(&self.link_selector) { if let Some(href) = element.value().attr("href") { // Skip empty hrefs and fragments if href.is_empty() || href.starts_with('#') { continue; } - + // Try to resolve the URL relative to the base match base_url.join(href) { Ok(url) => { @@ -49,11 +49,11 @@ impl Parser { } } } - + // Remove duplicates links.sort(); links.dedup(); - + Ok(links) } -} \ No newline at end of file +} diff --git a/topics/web-scraper/src/storage.rs b/topics/web-scraper/src/storage.rs index 639321c..3a5f94a 100644 --- a/topics/web-scraper/src/storage.rs +++ b/topics/web-scraper/src/storage.rs @@ -13,42 +13,45 @@ impl Storage { pub fn new(base_path: PathBuf) -> Self { Storage { base_path } } - + /// Initialize the storage directory pub async fn init(&self) -> Result<()> { fs::create_dir_all(&self.base_path).await?; - println!("Initialized storage directory: {}", self.base_path.display()); + println!( + "Initialized storage directory: {}", + self.base_path.display() + ); Ok(()) } - + /// Save a page to the appropriate location pub async fn save_page(&self, url: &Url, content: &str, depth: usize) -> Result { let file_path = self.url_to_path(url, depth)?; - + // Create parent directories if let Some(parent) = file_path.parent() { fs::create_dir_all(parent).await?; } - + // Write the content 
fs::write(&file_path, content).await?; - + println!("Saved: {} -> {}", url, file_path.display()); Ok(file_path) } - + /// Convert a URL to a file path following the hierarchical structure fn url_to_path(&self, url: &Url, depth: usize) -> Result { let mut path = self.base_path.clone(); - + // Add depth folder for organization path.push(format!("depth_{}", depth)); - + // Add host if let Some(host) = url.host_str() { path.push(sanitize_filename(host)); } - + // Add path components let url_path = url.path(); if url_path != "/" && !url_path.is_empty() { @@ -56,14 +59,14 @@ impl Storage { path.push(sanitize_filename(segment)); } } - + // Ensure we have a filename with .html extension if path.is_dir() || path.to_string_lossy().ends_with('/') || url_path == "/" { path.push("index.html"); } else if !path.to_string_lossy().ends_with(".html") { path.set_extension("html"); } - + Ok(path) } } @@ -80,4 +83,4 @@ fn sanitize_filename(input: &str) -> String { .collect::() .trim_matches('.') .to_string() -} \ No newline at end of file +} diff --git a/topics/web-scraper/src/worker.rs b/topics/web-scraper/src/worker.rs index 6860e61..90e7cd1 100644 --- a/topics/web-scraper/src/worker.rs +++ b/topics/web-scraper/src/worker.rs @@ -34,7 +34,7 @@ impl Worker { pub fn new(id: usize, storage: Storage) -> Result { let downloader = Downloader::new()?; let parser = HtmlParser::new()?; - + Ok(Worker { id, downloader, @@ -42,7 +42,7 @@ impl Worker { storage, }) } - + /// Start the worker loop pub async fn run( mut self, @@ -50,24 +50,24 @@ impl Worker { sender: mpsc::Sender, ) { println!("Worker {} started", self.id); - + while let Some(work_item) = receiver.recv().await { let result = self.process_work_item(work_item).await; - + if let Err(e) = sender.send(result).await { eprintln!("Worker {} failed to send result: {}", self.id, e); break; } } - + println!("Worker {} shutting down", self.id); } - + /// Process a single work item async fn process_work_item(&mut self, work_item: WorkItem) -> 
WorkResult { let url = work_item.url.clone(); let depth = work_item.depth; - + match self.download_and_process(&work_item).await { Ok(links) => WorkResult { url, @@ -78,7 +78,10 @@ impl Worker { }, Err(e) => { let error_msg = format!("{}", e); - eprintln!("Worker {} failed to process {}: {}", self.id, url, error_msg); + eprintln!( + "Worker {} failed to process {}: {}", + self.id, url, error_msg + ); WorkResult { url, depth, @@ -89,21 +92,28 @@ impl Worker { } } } - + /// Download and process a single page async fn download_and_process(&mut self, work_item: &WorkItem) -> Result> { // Download the page let page = self.downloader.download(work_item.url.clone()).await?; - + // Save the page - self.storage.save_page(&page.url, &page.content, work_item.depth).await?; - + self.storage + .save_page(&page.url, &page.content, work_item.depth) + .await?; + // Extract links let links = self.parser.extract_links(&page.content, &page.url)?; - - println!("Worker {} processed {} (depth {}) - found {} links", - self.id, page.url, work_item.depth, links.len()); - + + println!( + "Worker {} processed {} (depth {}) - found {} links", + self.id, + page.url, + work_item.depth, + links.len() + ); + Ok(links) } -} \ No newline at end of file +} From 6ffe5b220d737bf9efca79061005863dbd74adf8 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 21:42:52 +0200 Subject: [PATCH 09/10] chore: add unit tests Signed-off-by: margarita.surina --- topics/web-scraper/Cargo.lock | 48 ++++++++++++ topics/web-scraper/Cargo.toml | 4 + topics/web-scraper/src/cli.rs | 99 +++++++++++++++++++++++++ topics/web-scraper/src/parser.rs | 107 +++++++++++++++++++++++++++ topics/web-scraper/src/storage.rs | 117 ++++++++++++++++++++++++++++++ topics/web-scraper/src/worker.rs | 97 +++++++++++++++++++++++++ 6 files changed, 472 insertions(+) diff --git a/topics/web-scraper/Cargo.lock b/topics/web-scraper/Cargo.lock index 4644eac..bdbf716 100644 --- a/topics/web-scraper/Cargo.lock +++ 
b/topics/web-scraper/Cargo.lock @@ -86,6 +86,28 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "backtrace" version = "0.3.76" @@ -1552,6 +1574,30 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-test" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468baabc3311435b55dd935f702f42cd1b8abb7e754fb7dfb16bd36aa88f9f7" +dependencies = [ + "async-stream", + "bytes", + "futures-core", + "tokio", + "tokio-stream", +] + [[package]] name = "tokio-util" version = "0.7.16" @@ -1773,7 +1819,9 @@ dependencies = [ "clap", "reqwest", "scraper", + "tempfile", "tokio", + "tokio-test", "url", ] diff --git a/topics/web-scraper/Cargo.toml b/topics/web-scraper/Cargo.toml index e55894d..d85ed72 100644 --- a/topics/web-scraper/Cargo.toml +++ b/topics/web-scraper/Cargo.toml @@ -10,3 +10,7 @@ tokio = { version = "1.0", features = ["full"] } reqwest = { version = "0.11", features = ["json"] } anyhow = "1.0" scraper = "0.17" + +[dev-dependencies] +tempfile = "3.0" +tokio-test = 
"0.4" diff --git a/topics/web-scraper/src/cli.rs b/topics/web-scraper/src/cli.rs index d291ceb..8447d90 100644 --- a/topics/web-scraper/src/cli.rs +++ b/topics/web-scraper/src/cli.rs @@ -27,3 +27,102 @@ pub struct Args { fn parse_url(s: &str) -> Result { Url::parse(s) } + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + + #[test] + fn test_parse_url_valid() { + let result = parse_url("https://example.com"); + assert!(result.is_ok()); + assert_eq!(result.unwrap().as_str(), "https://example.com/"); + } + + #[test] + fn test_parse_url_invalid() { + let result = parse_url("not-a-url"); + assert!(result.is_err()); + } + + #[test] + fn test_args_with_defaults() { + let args = Args::try_parse_from(&["webcrawl", "https://example.com"]).unwrap(); + + assert_eq!(args.url.as_str(), "https://example.com/"); + assert_eq!(args.output, PathBuf::from("./crawled")); + assert_eq!(args.depth, 2); + assert_eq!(args.workers, 4); + } + + #[test] + fn test_args_with_custom_values() { + let args = Args::try_parse_from(&[ + "webcrawl", + "--output", + "./custom_output", + "--depth", + "5", + "--workers", + "8", + "https://test.com", + ]) + .unwrap(); + + assert_eq!(args.url.as_str(), "https://test.com/"); + assert_eq!(args.output, PathBuf::from("./custom_output")); + assert_eq!(args.depth, 5); + assert_eq!(args.workers, 8); + } + + #[test] + fn test_args_short_flags() { + let args = Args::try_parse_from(&[ + "webcrawl", + "-o", + "./short_output", + "-d", + "3", + "-w", + "6", + "https://short.com", + ]) + .unwrap(); + + assert_eq!(args.url.as_str(), "https://short.com/"); + assert_eq!(args.output, PathBuf::from("./short_output")); + assert_eq!(args.depth, 3); + assert_eq!(args.workers, 6); + } + + #[test] + fn test_args_invalid_url() { + let result = Args::try_parse_from(&["webcrawl", "invalid-url"]); + assert!(result.is_err()); + } + + #[test] + fn test_args_missing_url() { + let result = Args::try_parse_from(&["webcrawl"]); + assert!(result.is_err()); + } + + #[test] + 
fn test_args_invalid_depth() { + let result = + Args::try_parse_from(&["webcrawl", "--depth", "not-a-number", "https://example.com"]); + assert!(result.is_err()); + } + + #[test] + fn test_args_invalid_workers() { + let result = Args::try_parse_from(&[ + "webcrawl", + "--workers", + "not-a-number", + "https://example.com", + ]); + assert!(result.is_err()); + } +} diff --git a/topics/web-scraper/src/parser.rs b/topics/web-scraper/src/parser.rs index 6c18485..3aebe92 100644 --- a/topics/web-scraper/src/parser.rs +++ b/topics/web-scraper/src/parser.rs @@ -57,3 +57,110 @@ impl Parser { Ok(links) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parser_creation() { + let parser = Parser::new(); + assert!(parser.is_ok()); + } + + #[test] + fn test_extract_links_basic() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Link 1 + Link 2 + + + "##; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 2); + assert!(links.contains(&Url::parse("https://example.com/page1").unwrap())); + assert!(links.contains(&Url::parse("https://example.com/page2").unwrap())); + } + + #[test] + fn test_extract_links_filters_external() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Internal Link + External Link + Same Domain + + + "##; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 2); + assert!(links.contains(&Url::parse("https://example.com/internal").unwrap())); + assert!(links.contains(&Url::parse("https://example.com/same-domain").unwrap())); + assert!( + !links + .iter() + .any(|url| url.host_str() == Some("external.com")) + ); + } + + #[test] + fn test_extract_links_skips_fragments() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Valid Link + Fragment + Empty + + + "##; + let base_url = 
Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 1); + assert_eq!(links[0], Url::parse("https://example.com/page1").unwrap()); + } + + #[test] + fn test_extract_links_removes_duplicates() { + let parser = Parser::new().unwrap(); + let html = r##" + + + Link 1 + Link 1 Again + Link 2 + + + "##; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 2); + } + + #[test] + fn test_extract_links_empty_html() { + let parser = Parser::new().unwrap(); + let html = ""; + let base_url = Url::parse("https://example.com").unwrap(); + + let links = parser.extract_links(html, &base_url).unwrap(); + + assert_eq!(links.len(), 0); + } +} diff --git a/topics/web-scraper/src/storage.rs b/topics/web-scraper/src/storage.rs index 3a5f94a..4886c38 100644 --- a/topics/web-scraper/src/storage.rs +++ b/topics/web-scraper/src/storage.rs @@ -84,3 +84,120 @@ fn sanitize_filename(input: &str) -> String { .trim_matches('.') .to_string() } + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_storage_creation() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + assert_eq!(storage.base_path, temp_dir.path()); + } + + #[tokio::test] + async fn test_storage_init() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().join("test_output")); + + let result = storage.init().await; + assert!(result.is_ok()); + assert!(storage.base_path.exists()); + } + + #[tokio::test] + async fn test_save_page() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + storage.init().await.unwrap(); + + let url = Url::parse("https://example.com/page1").unwrap(); + let content = "Test content"; + + let file_path = storage.save_page(&url, content, 0).await.unwrap(); 
+ + assert!(file_path.exists()); + let saved_content = tokio::fs::read_to_string(&file_path).await.unwrap(); + assert_eq!(saved_content, content); + } + + #[test] + fn test_url_to_path_root() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let url = Url::parse("https://example.com/").unwrap(); + let path = storage.url_to_path(&url, 0).unwrap(); + + let expected = temp_dir + .path() + .join("depth_0") + .join("example.com") + .join("index.html"); + assert_eq!(path, expected); + } + + #[test] + fn test_url_to_path_with_path() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let url = Url::parse("https://example.com/blog/post1").unwrap(); + let path = storage.url_to_path(&url, 1).unwrap(); + + let expected = temp_dir + .path() + .join("depth_1") + .join("example.com") + .join("blog") + .join("post1.html"); + assert_eq!(path, expected); + } + + #[test] + fn test_url_to_path_depth_organization() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let url = Url::parse("https://example.com/test").unwrap(); + + let path_depth_0 = storage.url_to_path(&url, 0).unwrap(); + let path_depth_2 = storage.url_to_path(&url, 2).unwrap(); + + assert!(path_depth_0.to_string_lossy().contains("depth_0")); + assert!(path_depth_2.to_string_lossy().contains("depth_2")); + } + + #[test] + fn test_sanitize_filename() { + assert_eq!(sanitize_filename("normal"), "normal"); + assert_eq!(sanitize_filename("with/slash"), "with_slash"); + assert_eq!(sanitize_filename("with:colon"), "with_colon"); + assert_eq!(sanitize_filename("with*star"), "with_star"); + assert_eq!(sanitize_filename("with?question"), "with_question"); + assert_eq!(sanitize_filename("with\"quote"), "with_quote"); + assert_eq!(sanitize_filename("with"), "with_greater_"); + assert_eq!(sanitize_filename("with|pipe"), "with_pipe"); + 
assert_eq!(sanitize_filename(".hidden."), "hidden"); + } + + #[tokio::test] + async fn test_save_page_creates_directories() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + storage.init().await.unwrap(); + + let url = Url::parse("https://example.com/deep/nested/path").unwrap(); + let content = "test content"; + + let file_path = storage.save_page(&url, content, 1).await.unwrap(); + + assert!(file_path.exists()); + assert!(file_path.parent().unwrap().exists()); + + let saved_content = tokio::fs::read_to_string(&file_path).await.unwrap(); + assert_eq!(saved_content, content); + } +} diff --git a/topics/web-scraper/src/worker.rs b/topics/web-scraper/src/worker.rs index 90e7cd1..b13c495 100644 --- a/topics/web-scraper/src/worker.rs +++ b/topics/web-scraper/src/worker.rs @@ -117,3 +117,100 @@ impl Worker { Ok(links) } } + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_work_item_creation() { + let url = Url::parse("https://example.com").unwrap(); + let work_item = WorkItem { + url: url.clone(), + depth: 1, + }; + + assert_eq!(work_item.url, url); + assert_eq!(work_item.depth, 1); + } + + #[test] + fn test_work_result_success() { + let url = Url::parse("https://example.com").unwrap(); + let links = vec![Url::parse("https://example.com/page1").unwrap()]; + + let result = WorkResult { + url: url.clone(), + depth: 0, + links: links.clone(), + success: true, + error: None, + }; + + assert_eq!(result.url, url); + assert_eq!(result.depth, 0); + assert_eq!(result.links, links); + assert!(result.success); + assert!(result.error.is_none()); + } + + #[test] + fn test_work_result_failure() { + let url = Url::parse("https://example.com").unwrap(); + let error_msg = "Network error".to_string(); + + let result = WorkResult { + url: url.clone(), + depth: 1, + links: Vec::new(), + success: false, + error: Some(error_msg.clone()), + }; + + assert_eq!(result.url, url); + 
assert_eq!(result.depth, 1); + assert!(result.links.is_empty()); + assert!(!result.success); + assert_eq!(result.error, Some(error_msg)); + } + + #[tokio::test] + async fn test_worker_creation() { + let temp_dir = TempDir::new().unwrap(); + let storage = Storage::new(temp_dir.path().to_path_buf()); + + let worker = Worker::new(0, storage); + assert!(worker.is_ok()); + assert_eq!(worker.unwrap().id, 0); + } + + #[test] + fn test_work_item_clone() { + let url = Url::parse("https://example.com").unwrap(); + let work_item = WorkItem { + url: url.clone(), + depth: 2, + }; + let cloned = work_item.clone(); + + assert_eq!(work_item.url, cloned.url); + assert_eq!(work_item.depth, cloned.depth); + } + + #[test] + fn test_work_result_debug() { + let url = Url::parse("https://example.com").unwrap(); + let result = WorkResult { + url, + depth: 1, + links: Vec::new(), + success: true, + error: None, + }; + + let debug_str = format!("{:?}", result); + assert!(debug_str.contains("WorkResult")); + assert!(debug_str.contains("example.com")); + } +} From f9a4c63c3c1a880d4a0dcb2a341b52338939da56 Mon Sep 17 00:00:00 2001 From: "margarita.surina" Date: Sat, 11 Oct 2025 22:17:44 +0200 Subject: [PATCH 10/10] chore: add documentation Signed-off-by: margarita.surina --- topics/web-scraper/docs/architecture.md | 158 ++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 topics/web-scraper/docs/architecture.md diff --git a/topics/web-scraper/docs/architecture.md b/topics/web-scraper/docs/architecture.md new file mode 100644 index 0000000..b644eb4 --- /dev/null +++ b/topics/web-scraper/docs/architecture.md @@ -0,0 +1,158 @@ +# Web Scraper Architecture + +## Project Definition + +### What is it? +The web scraper is a command-line application written in Rust that recursively downloads and processes web pages starting from given URLs. 
It crawls websites by following links, downloads pages concurrently using multiple async workers, and stores them locally in a depth-organized directory structure that maintains domain hierarchy while tracking the crawling depth of each page. + +### Goals +- **Concurrent web crawling**: Download multiple pages simultaneously using async/await and tokio +- **Recursive link following**: Discover and follow links up to a specified depth with same-domain filtering +- **Depth-organized storage**: Organize downloaded content in folders that track crawling depth (depth_0, depth_1, etc.) +- **Command-line interface**: Provide an intuitive CLI with configurable output directory, depth, and worker count +- **Robust error handling**: Gracefully handle network errors, invalid URLs, and file system issues with detailed logging + +## Components and Modules + +### 1. CLI Module (`cli.rs`) +**Purpose**: Handle command-line argument parsing and validation. +- Parse command-line arguments (URL, output directory, depth, concurrency) +- Validate input parameters +- Display help information + +### 2. Crawler Engine (`crawler.rs`) +**Purpose**: Core crawling logic and coordination using SimpleCrawler. +- Manage the crawling queue and visited URLs HashSet for deduplication +- Coordinate multiple async worker tasks via mpsc channels +- Implement depth-limited crawling with round-robin work distribution +- Handle graceful worker shutdown and result processing + +### 3. Downloader Module (`downloader.rs`) +**Purpose**: Handle HTTP requests and page downloading with async support. +- Make asynchronous HTTP requests using reqwest with 30-second timeout +- Custom user-agent and proper error handling +- Return page content and metadata for processing + +### 4. Parser Module (`parser.rs`) +**Purpose**: Extract links from downloaded HTML pages with filtering. 
+- Parse HTML content using scraper crate with CSS selectors +- Extract and normalize URLs from anchor tags (`<a href="...">`) +- Filter to same-domain links only (excludes external sites) +- Remove URL fragments and handle duplicates + +### 5. Storage Module (`storage.rs`) +**Purpose**: Manage file system operations with depth-based organization. +- Create hierarchical directory structures organized by crawling depth +- Save downloaded pages to depth-specific folders (depth_0, depth_1, etc.) +- Handle file naming conflicts and path sanitization +- Convert URLs to appropriate file paths maintaining domain structure + +### 6. Worker Module (`worker.rs`) +**Purpose**: Handle concurrent downloading tasks with message-passing coordination. +- Define WorkItem and WorkResult message types for communication +- Implement async workers that process URLs from a shared channel +- Coordinate downloader, parser, and storage operations +- Handle round-robin work distribution through mpsc channels + +## Module Interactions + +``` + CLI + | + v + Crawler ←→ Worker Pool + | | + v v + Parser ←→ Downloader + | | + v v + Storage +``` + +1. **CLI** parses arguments and initializes the **Crawler** +2. **Crawler** creates a pool of **Workers** and manages the crawling queue +3. **Workers** use the **Downloader** to fetch pages +4. Downloaded content is processed by the **Parser** to extract links +5. **Storage** saves pages and creates directory structure +6. 
New links are fed back to the **Crawler** queue + +### Architecture Justification + +This modular design provides: +- **Separation of concerns**: Each module has a single responsibility +- **Testability**: Modules can be unit tested independently (29 comprehensive unit tests included) +- **Concurrency**: Async worker-based design enables efficient parallel processing +- **Extensibility**: Easy to add features like robots.txt support or different output formats +- **Error isolation**: Failures in one component don't crash the entire application + +### Key Technologies +- **Rust 2024 Edition**: Memory-safe systems programming with excellent async support +- **Tokio**: Async runtime for concurrent operations and channels +- **Reqwest**: HTTP client for reliable web requests with timeout handling +- **Scraper**: HTML parsing with CSS selector support +- **Clap**: Command-line argument parsing with derive macros +- **Anyhow**: Unified error handling across all modules + +## Usage + +### Installation +To use the `webcrawl` command directly from anywhere in your system: + +```bash +# Install to ~/.cargo/bin (make sure it's in your PATH) +cargo install --path . 
+ +# Then you can use webcrawl directly +webcrawl --output ./crawled_url --depth 10 https://example.com +``` + +### Basic Usage +```bash +# Crawl a website with default settings +webcrawl https://example.com + +# Specify output directory and depth +webcrawl --output ./crawled_data --depth 3 https://example.com + +# Control concurrency +webcrawl --output ./output --depth 2 --workers 5 https://example.com +``` + +### Command-line Options +- `<URL>`: Starting URL to crawl (required) +- `--output, -o`: Output directory for downloaded pages (default: "./crawled") +- `--depth, -d`: Maximum crawling depth (default: 2) +- `--workers, -w`: Number of concurrent workers (default: 4) +- `--help, -h`: Display help information + +### Output Structure +The downloaded pages are organized in a hierarchical structure based on crawling depth and URL structure: + +``` +output/ +├── depth_0/ +│ └── example.com/ +│ └── index.html # Root page (depth 0) +├── depth_1/ +│ └── example.com/ +│ ├── about/ +│ │ └── index.html # /about page (depth 1) +│ └── products/ +│ └── index.html # /products page (depth 1) +└── depth_2/ + └── example.com/ + ├── about/ + │ └── team/ + │ └── index.html # /about/team page (depth 2) + └── products/ + └── software/ + └── index.html # /products/software page (depth 2) +``` + +This depth-based organization allows easy tracking of how deep each page was discovered in the crawling process and provides clear separation between different crawling levels. + +### Example Usage Scenarios + +1. **Website backup**: `webcrawl --depth 5 --output ./backup https://mysite.com` +2. **Content analysis**: `webcrawl --depth 2 --workers 8 https://news.site.com` +3. **Link validation**: `webcrawl --depth 1 https://example.com` (shallow crawl)