From 285ec710c70683417e384074ad9be17ba0d8899c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matou=C5=A1=20Dzivjak?= Date: Mon, 4 Mar 2024 23:44:30 +0100 Subject: [PATCH] =?UTF-8?q?let's=20fly=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + Cargo.lock | 71 ++++++++++++++++++-------------- Cargo.toml | 40 +++++++----------- Dockerfile | 8 ++-- fly.toml | 22 ++++++++++ mwp-scraper/Cargo.toml | 14 ++++++- mwp-scraper/src/lib.rs | 16 ------- {src => mwp-scraper/src}/main.rs | 20 +++++++-- mwp-web/Cargo.toml | 3 +- mwp-web/src/main.rs | 26 ++++++++++-- rust-toolchain.toml | 2 +- 11 files changed, 140 insertions(+), 83 deletions(-) create mode 100644 fly.toml delete mode 100644 mwp-scraper/src/lib.rs rename {src => mwp-scraper/src}/main.rs (88%) diff --git a/.gitignore b/.gitignore index 42f0800..5bbe5e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ target/ db/ index/ +wiki/ diff --git a/Cargo.lock b/Cargo.lock index 8f322e2..04cd636 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -264,9 +264,9 @@ checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" [[package]] name = "anstream" -version = "0.6.5" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" dependencies = [ "anstyle", "anstyle-parse", @@ -456,18 +456,19 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.4.13" +version = "4.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52bdc885e4cacc7f7c9eedc1ef6da641603180c783c41a15c264944deeaab642" +checksum = "c918d541ef2913577a0f9566e9ce27cb35b6df072075769e0b26cb5a554520da" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] name = "clap_builder" -version = "4.4.12" +version = "4.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb7fb5e4e979aec3be7791562fcba452f94ad85e954da024396433e0e25a79e9" +checksum = "9f3e7391dad68afb0c2ede1bf619f579a3dc9c2ec67f089baa397123a2f3d1eb" dependencies = [ "anstream", "anstyle", @@ -475,11 +476,23 @@ dependencies = [ "strsim", ] +[[package]] +name = "clap_derive" +version = "4.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "clap_lex" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "codemap" @@ -989,6 +1002,12 @@ dependencies = [ "hashbrown 0.14.3", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -1432,26 +1451,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df" -[[package]] -name = "mwp" -version = "0.1.0" -dependencies = [ - "lazy_static", - "mwp-content", - "mwp-scraper", - "mwp-search", - "pulldown-cmark", - "rusqlite", - "serde", - "serde_json", - "sled", - "tantivy", - "time", - "tokio", - "url", - "walkdir", -] - [[package]] name = "mwp-content" version = "0.1.0" @@ -1468,9 +1467,20 @@ dependencies = [ "html-escape", "lazy_static", "lol_html", + "mwp-content", + "mwp-search", + "pulldown-cmark", "regex", "reqwest", + "rusqlite", + "serde", + "serde_json", + "sled", + "tantivy", + "time", + "tokio", "url", + "walkdir", ] [[package]] @@ -1489,6 +1499,7 @@ version = "0.1.0" dependencies = [ "actix-files", "actix-web", + "clap", "env_logger", "grass", "maud", @@ -2409,9 +2420,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" diff --git a/Cargo.toml b/Cargo.toml index 3d60162..c5db42f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,3 @@ -[package] -name = "mwp" -authors.workspace = true -version.workspace = true -edition.workspace = true -categories.workspace = true -repository.workspace = true -rust-version.workspace = true - [workspace] resolver = "2" members = [ @@ -16,6 +7,21 @@ members = [ "mwp-search", ] +default-members = [ + "mwp-web" +] + +[profile.release] +lto = "thin" +# debug = true + +[profile.opt] +inherits = "release" +lto = "fat" +codegen-units = 1 +# strip = "debuginfo" # TODO: or strip = true +opt-level = 3 + [workspace.package] name = "mwp" version = "0.1.0" @@ -25,19 +31,3 @@ categories = ["wiki", "knowledge-bage"] repository = "https://github.com/matoous/mwp" homepage = "https://github.com/matoous/mwp" rust-version = "1.70" - -[dependencies] -pulldown-cmark = "0.9.3" -tantivy = { version = "0.21.1", features = ["mmap"] } -tokio = { version = "1.36.0", features= ["full"]} -walkdir = "2.4.0" -lazy_static = "1.4.0" -time = "0.3.31" -url = { version = "2.5.0", features = ["serde"] } -sled = "0.34.7" -serde = "1.0.196" -serde_json = "1.0.113" -mwp-scraper = { path="./mwp-scraper" } -mwp-content = { path="./mwp-content" } -mwp-search = { path="./mwp-search" } -rusqlite = { version = "0.30.0", features = ["time", "url", "bundled"]} diff --git a/Dockerfile b/Dockerfile index c6ee3b5..0d70fd5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.70-slim-bookworm as builder +FROM rust:1.74-slim-bookworm as builder RUN apt update \ && apt install -y libssl-dev pkg-config @@ -26,8 +26,10 @@ RUN apt update \ ENV TZ=Etc/UTC COPY --from=builder /app/target/release/mwp mwp -COPY db.db3 ./ +COPY ./db.db3 ./ +COPY ./wiki ./wiki +COPY ./mwp-web/static ./ EXPOSE 4444 -CMD ["/app/mwp"] +CMD ["/app/mwp", "--adr", "0.0.0.0:4444"] diff --git a/fly.toml b/fly.toml new file mode 100644 index 0000000..6230a0b --- /dev/null +++ b/fly.toml @@ -0,0 +1,22 @@ +# fly.toml app configuration file generated for mwp on 2024-03-04T11:18:35+01:00 +# +# See https://fly.io/docs/reference/configuration/ for information about how to use this file. +# + +app = 'mwp' +primary_region = 'ams' + +[build] + +[http_service] + internal_port = 4444 + force_https = true + auto_stop_machines = true + auto_start_machines = true + min_machines_running = 0 + processes = ['app'] + +[[vm]] + memory = '1gb' + cpu_kind = 'shared' + cpus = 1 diff --git a/mwp-scraper/Cargo.toml b/mwp-scraper/Cargo.toml index 0ec7024..1d3bdfe 100644 --- a/mwp-scraper/Cargo.toml +++ b/mwp-scraper/Cargo.toml @@ -10,6 +10,18 @@ repository.workspace = true html-escape = "0.2.13" lazy_static = "1.4.0" lol_html = "1.2.0" +pulldown-cmark = "0.9.3" regex = "1.10.3" reqwest = "0.11.24" -url = "2.5.0" +serde = "1.0.196" +serde_json = "1.0.113" +sled = "0.34.7" +tantivy = { version = "0.21.1", features = ["mmap"] } +time = "0.3.31" +tokio = { version = "1.36.0", features= ["full"]} +url = { version = "2.5.0", features = ["serde"] } +walkdir = "2.4.0" +rusqlite = { version = "0.30.0", features = ["time", "url", "bundled"]} + +mwp-content = { path="../mwp-content" } +mwp-search = { path="../mwp-search" } diff --git a/mwp-scraper/src/lib.rs b/mwp-scraper/src/lib.rs deleted file mode 100644 index 660bae1..0000000 --- a/mwp-scraper/src/lib.rs +++ /dev/null @@ -1,16 +0,0 @@ -use url::Url; - -use crate::parser::{DomParser, DomParserResult}; - -mod parser; - -pub async fn scrape(link: &Url) -> Result> { - let response = reqwest::get(link.clone()).await?; - - let html_text = response.text().await?; - - let mut rewriter = DomParser::new(); - rewriter.write(html_text.as_bytes())?; - - Ok(rewriter.wrap()) -} diff --git a/src/main.rs b/mwp-scraper/src/main.rs similarity index 88% rename from src/main.rs rename to mwp-scraper/src/main.rs index d85de13..0b7ae25 100644 --- a/src/main.rs +++ b/mwp-scraper/src/main.rs @@ -2,6 +2,22 @@ use mwp_content::Link; use mwp_search::Doc; use rusqlite::Connection; use time::OffsetDateTime; +use url::Url; + +mod parser; + +use crate::parser::{DomParser, DomParserResult}; + +pub async fn scrape(link: &Url) -> Result> { + let response = reqwest::get(link.clone()).await?; + + let html_text = response.text().await?; + + let mut rewriter = DomParser::new(); + rewriter.write(html_text.as_bytes())?; + + Ok(rewriter.wrap()) +} #[tokio::main] async fn main() -> Result<(), Box> { @@ -92,9 +108,7 @@ async fn main() -> Result<(), Box> { }, }; - println!("scraping {}", link.url); - - let data = mwp_scraper::scrape(&link.url).await; + let data = scrape(&link.url).await; let data = match data { Ok(data) => data, Err(err) => { diff --git a/mwp-web/Cargo.toml b/mwp-web/Cargo.toml index 28246de..5882534 100644 --- a/mwp-web/Cargo.toml +++ b/mwp-web/Cargo.toml @@ -18,7 +18,8 @@ maud = { version = "0.26.0", features = ["actix-web"] } serde = "1.0.196" serde_json = "1.0.113" tantivy = "0.21.1" -rusqlite = { version = "0.30.0", features = ["time", "url"]} +rusqlite = { version = "0.30.0", features = ["time", "url", "bundled"]} +clap = { version = "4.5.1", features = ["derive"]} mwp-content = { path="../mwp-content" } mwp-search = { path="../mwp-search" } diff --git a/mwp-web/src/main.rs b/mwp-web/src/main.rs index d894cd3..c463bce 100644 --- a/mwp-web/src/main.rs +++ b/mwp-web/src/main.rs @@ -4,6 +4,7 @@ use actix_web::{ guard::{Guard, GuardContext}, web, App, HttpServer, Result as AwResult, }; +use clap::{command, Parser}; use maud::{html, Markup, PreEscaped}; use mwp_content::Content; use mwp_search::{Doc, SearchIndex}; @@ -180,13 +181,31 @@ impl Guard for ContentGuard { } } +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Source of the wikipedia + #[arg(short, long, default_value = "./wiki")] + src: String, + + /// The database file + #[arg(short, long, default_value = "./db.db3")] + db: String, + + /// Address to serve on + #[arg(long, default_value = "127.0.0.1:4444")] + adr: String, +} + #[actix_web::main] async fn main() -> std::io::Result<()> { env_logger::init_from_env(env_logger::Env::new().default_filter_or("info")); + let args = Args::parse(); + let index = SearchIndex::new().unwrap(); - let conn = Connection::open("./db.db3").unwrap(); + let conn = Connection::open(args.db).unwrap(); let mut stmt = conn .prepare("SELECT title, url, domain, body, tags, created_at, scraped_at FROM links") .unwrap(); @@ -211,7 +230,8 @@ async fn main() -> std::io::Result<()> { builder.add(doc.unwrap()).unwrap(); } builder.commit(); - let content = Content::from_dir("../wiki").await; + + let content = Content::from_dir(&args.src).await; HttpServer::new(move || { App::new() @@ -229,7 +249,7 @@ async fn main() -> std::io::Result<()> { ) .service(Files::new("/", "./mwp-web/static/")) }) - .bind(("127.0.0.1", 4444))? + .bind(&args.adr)? .run() .await } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index e661a11..7c7cb7f 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.70.0" +channel = "1.74.0" components = ["rustfmt", "rust-src", "clippy"]