diff --git a/Cargo.lock b/Cargo.lock
index ada7428..458438d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1441,6 +1441,7 @@ name = "mwp"
 version = "0.1.0"
 dependencies = [
  "lazy_static",
+ "mwp-content",
  "mwp-scraper",
  "pulldown-cmark",
  "serde",
@@ -1453,6 +1454,14 @@ dependencies = [
  "walkdir",
 ]
 
+[[package]]
+name = "mwp-content"
+version = "0.1.0"
+dependencies = [
+ "pulldown-cmark",
+ "walkdir",
+]
+
 [[package]]
 name = "mwp-core"
 version = "0.1.0"
@@ -1478,6 +1487,7 @@ dependencies = [
  "env_logger",
  "grass",
  "maud",
+ "mwp-content",
  "serde",
  "serde_json",
  "tantivy",
diff --git a/Cargo.toml b/Cargo.toml
index cb15e9d..fea1c20 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,7 @@ members = [
     "mwp-core",
     "mwp-web",
     "mwp-scraper",
+    "mwp-content",
 ]
 
@@ -37,3 +38,4 @@ sled = "0.34.7"
 serde = "1.0.195"
 serde_json = "1.0.111"
 mwp-scraper = { path="./mwp-scraper" }
+mwp-content = { path="./mwp-content" }
diff --git a/mwp-content/Cargo.toml b/mwp-content/Cargo.toml
new file mode 100644
index 0000000..6949a5b
--- /dev/null
+++ b/mwp-content/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "mwp-content"
+authors.workspace = true
+version.workspace = true
+edition.workspace = true
+categories.workspace = true
+repository.workspace = true
+
+[dependencies]
+pulldown-cmark = "0.9.3"
+walkdir = "2.4.0"
diff --git a/mwp-content/src/lib.rs b/mwp-content/src/lib.rs
new file mode 100644
index 0000000..0e21abe
--- /dev/null
+++ b/mwp-content/src/lib.rs
@@ -0,0 +1,117 @@
+use std::{
+    collections::HashMap,
+    fmt::Write,
+    fs,
+    path::{Path, PathBuf},
+};
+
+use pulldown_cmark::{html, Options, Parser};
+use walkdir::WalkDir;
+
+#[derive(Clone)]
+pub struct Page {
+    pub title: String,
+    pub path: String,
+    pub html: String,
+    pub text: String,
+    pub tags: Vec<String>,
+    pub hiearchy: String,
+}
+
+fn remove_extension(path: &Path) -> PathBuf {
+    let mut new_path = PathBuf::new();
+    let parent_dir = path.parent().unwrap();
+    if let Some(file_name) = path.file_stem() {
+        new_path.push(parent_dir);
+        new_path.push(file_name);
+    }
+    new_path
+}
+
+pub async fn read_dir(src: &str) -> HashMap<String, Page> {
+    let mut entries = HashMap::new();
+
+    let mut options = Options::empty();
+    options.insert(Options::ENABLE_STRIKETHROUGH);
+
+    for entry in WalkDir::new(src)
+        .into_iter()
+        .filter_map(Result::ok)
+        .filter(|e| !e.file_type().is_dir())
+        .filter(|e| {
+            e.path()
+                .extension()
+                .is_some_and(|ext| ext.to_str() == Some("md"))
+        })
+    {
+        let contents =
+            fs::read_to_string(entry.path()).expect("Something went wrong reading the file");
+
+        let mut text_output = String::new();
+
+        let mut in_heading = false;
+        let mut title = String::new();
+        #[allow(clippy::unnecessary_filter_map)]
+        let parser = Parser::new_ext(&contents, options).filter_map(|event| {
+            match event.clone() {
+                pulldown_cmark::Event::Text(text) => {
+                    write!(&mut text_output, "{}", text).expect("write text output");
+                    if in_heading && title.is_empty() {
+                        title = text.to_string();
+                    }
+                }
+                pulldown_cmark::Event::Start(pulldown_cmark::Tag::Heading(
+                    pulldown_cmark::HeadingLevel::H1,
+                    _,
+                    _,
+                )) => {
+                    in_heading = true;
+                }
+                pulldown_cmark::Event::End(pulldown_cmark::Tag::Heading(
+                    pulldown_cmark::HeadingLevel::H1,
+                    _,
+                    _,
+                )) => {
+                    in_heading = false;
+                }
+                _ => (),
+            }
+            Some(event)
+        });
+
+        let mut html_output = String::new();
+        html::push_html(&mut html_output, parser);
+
+        let clean_path =
+            Path::new("/").join(remove_extension(entry.path().strip_prefix(src).unwrap()));
+
+        let parent = clean_path.parent().unwrap();
+
+        let tags = parent
+            .iter()
+            .filter_map(|component| {
+                if *component == Path::new("/") {
+                    None
+                } else {
+                    Some(component.to_str().unwrap().into())
+                }
+            })
+            .collect::<Vec<String>>();
+
+        let clean_path = clean_path.display().to_string();
+
+        entries.insert(
+            clean_path.to_owned(),
+            Page {
+                title,
+                path: clean_path,
+                html: html_output,
+                text: text_output,
+                tags,
+                hiearchy: parent.display().to_string(),
+            },
+        );
+    }
+
+    entries
+}
diff --git a/mwp-web/Cargo.toml b/mwp-web/Cargo.toml
index ecdea88..3563b8b 100644
--- a/mwp-web/Cargo.toml
+++ b/mwp-web/Cargo.toml
@@ -15,5 +15,7 @@ serde = "1.0.195"
 serde_json = "1.0.111"
 tantivy = "0.21.1"
 
+mwp-content = { path="../mwp-content" }
+
 [build-dependencies]
 grass = "0.13.1"
diff --git a/mwp-web/src/main.rs b/mwp-web/src/main.rs
index 45e0c98..301e5bf 100644
--- a/mwp-web/src/main.rs
+++ b/mwp-web/src/main.rs
@@ -1,5 +1,11 @@
+use std::collections::HashMap;
+
 use actix_files::Files;
-use actix_web::{get, web, App, HttpServer, Result as AwResult};
+use actix_web::{
+    get,
+    guard::{Guard, GuardContext},
+    web, App, HttpServer, Result as AwResult,
+};
 use maud::{html, Markup, PreEscaped};
 use serde::Deserialize;
 use tantivy::{
@@ -149,19 +155,78 @@ async fn tag_page(tag: web::Path<String>, index: web::Data<Index>) -> AwResult<Markup> {
     })
 }
 
+async fn content_page(
+    path: web::Path<Vec<String>>,
+    content: web::Data<Content>,
+) -> AwResult<Markup> {
+    match content
+        .docs
+        .get(format!("/{}", path.join("/").as_str()).as_str())
+    {
+        Some(mwp_content::Page { html: content, .. }) => Ok(html! {
+            html {
+                (render::header("Content | MWP"))
+                body {
+                    h1 { (path.join(",")) };
+                    main {
+                        article {
+                            (PreEscaped(content))
+                        }
+                    }
+                }
+            }
+        }),
+        None => Ok(html! {
+            html {
+                (render::header("Not found | MWP"))
+                body {
+                    h1 { "Not found" };
+                }
+            }
+        }),
+    }
+}
+
+#[derive(Clone)]
+struct Content {
+    pub docs: HashMap<String, mwp_content::Page>,
+}
+
+struct ContentGuard {
+    pub contents: Vec<String>,
+}
+
+impl Guard for ContentGuard {
+    fn check(&self, req: &GuardContext<'_>) -> bool {
+        self.contents.contains(&req.head().uri.path().to_string())
+    }
+}
+
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
     env_logger::init_from_env(env_logger::Env::new().default_filter_or("info"));
 
     let index_path = "../index";
     let index = Index::open_in_dir(index_path).unwrap();
+    let content = Content {
+        docs: mwp_content::read_dir("../../wiki").await,
+    };
 
     HttpServer::new(move || {
         App::new()
             .app_data(web::Data::new(index.clone()))
+            .app_data(web::Data::new(content.clone()))
             .service(index_page)
             .service(tag_page)
             .service(search_page)
+            .route(
+                "/{path:.*}",
+                web::get()
+                    .guard(ContentGuard {
+                        contents: content.docs.keys().cloned().collect(),
+                    })
+                    .to(content_page),
+            )
             .service(Files::new("/", "./static/"))
     })
     .bind(("127.0.0.1", 4444))?
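Two pieces above are worth connecting: `read_dir` keys every page by its extension-less path, and `ContentGuard` matches incoming request paths against exactly those keys, so the catch-all `/{path:.*}` route only fires for pages that actually exist. A minimal sketch of consuming the new API; the types and field meanings are taken from the diff, while the `../wiki` layout, the example file name, and the binary itself are hypothetical:

```rust
use std::collections::HashMap;

#[tokio::main]
async fn main() {
    // Every Markdown file under ../wiki becomes a Page keyed by its
    // extension-less path, e.g. ../wiki/rust/parsers.md -> "/rust/parsers".
    let docs: HashMap<String, mwp_content::Page> = mwp_content::read_dir("../wiki").await;

    for (path, page) in &docs {
        // `tags` holds the parent directory names; `hiearchy` is the parent path.
        println!("{path}: {} {:?} {}", page.title, page.tags, page.hiearchy);
    }
}
```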
diff --git a/src/main.rs b/src/main.rs
index 8728e67..85541f7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,5 @@
 use serde::{Deserialize, Serialize};
-use tantivy::{collector::TopDocs, query::QueryParser, schema::*, Index};
+use tantivy::{schema::*, Index};
 use time::OffsetDateTime;
 use url::Url;
 
@@ -42,44 +42,118 @@ impl Doc {
     }
 }
 
+pub struct SearchIndex {
+    pub index: Index,
+}
+
+impl SearchIndex {
+    pub fn new(dir: &str) -> Result<SearchIndex, Box<dyn std::error::Error>> {
+        let index = if let Ok(index) = Index::open_in_dir(dir) {
+            index
+        } else {
+            let mut schema_builder = Schema::builder();
+            schema_builder.add_text_field("title", TEXT | STORED);
+            schema_builder.add_text_field("body", TEXT | STORED);
+            schema_builder.add_text_field("url", STRING | STORED);
+            schema_builder.add_text_field("domain", STRING | STORED);
+            schema_builder.add_text_field("tags", STRING | STORED | FAST);
+            schema_builder.add_text_field("kind", STRING | STORED);
+            schema_builder.add_facet_field("hiearchy", FacetOptions::default());
+            let schema = schema_builder.build();
+            Index::builder()
+                .schema(schema)
+                .settings(tantivy::IndexSettings {
+                    docstore_blocksize: 32_000_000,
+                    ..tantivy::IndexSettings::default()
+                })
+                .create_in_dir(dir)?
+        };
+
+        Ok(SearchIndex { index })
+    }
+
+    fn add(&self, docs: Vec<Doc>) -> Result<(), Box<dyn std::error::Error>> {
+        let schema = self.index.schema();
+
+        let mut index_writer = self.index.writer(32_000_000).unwrap();
+
+        let title = schema.get_field("title").unwrap();
+        let body = schema.get_field("body").unwrap();
+        let url = schema.get_field("url").unwrap();
+        let domain = schema.get_field("domain").unwrap();
+        let tags = schema.get_field("tags").unwrap();
+        let kind = schema.get_field("kind").unwrap();
+
+        for doc in docs {
+            let mut document = Document::default();
+            document.add_text(title, &doc.title);
+            document.add_text(body, &doc.body);
+            document.add_text(url, &doc.url);
+            document.add_text(domain, &doc.domain);
+            document.add_text(tags, "test");
+            document.add_text(tags, "example");
+            document.add_text(tags, "something");
+            document.add_text(kind, "link");
+
+            println!("Adding: {} ({})\n{}\n", doc.title, doc.url, doc.body);
+
+            index_writer.add_document(document)?;
+        }
+
+        index_writer.commit().expect("commit index");
+
+        Ok(())
+    }
+
+    fn add_content(
+        &self,
+        content: Vec<mwp_content::Page>,
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        let schema = self.index.schema();
+
+        let mut index_writer = self.index.writer(32_000_000).unwrap();
+
+        let title = schema.get_field("title").unwrap();
+        let body = schema.get_field("body").unwrap();
+        let kind = schema.get_field("kind").unwrap();
+        let url = schema.get_field("url").unwrap();
+        let tags = schema.get_field("tags").unwrap();
+        let hiearchy = schema.get_field("hiearchy").unwrap();
+
+        for page in content {
+            let mut document = Document::default();
+            document.add_text(title, &page.title);
+            document.add_text(body, &page.text);
+            document.add_text(url, &page.path);
+            document.add_text(kind, "link");
+            for tag in page.tags {
+                document.add_text(tags, tag);
+            }
+            if !page.hiearchy.is_empty() {
+                document.add_facet(hiearchy, &page.hiearchy);
+            }
+
+            index_writer.add_document(document)?;
+        }
+
+        index_writer.commit().expect("commit index");
+
+        Ok(())
+    }
+}
+
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let db = sled::open("./db").unwrap();
+    let index = SearchIndex::new("./index")?;
+
+    let content = mwp_content::read_dir("../wiki").await;
+    index.add_content(content.into_values().collect())?;
 
     let links = extract::collect_links();
     println!("{:?}", links);
 
-    let index_path = "./index";
-    let index = if let Ok(index) = Index::open_in_dir(index_path) {
-        index
-    } else {
-        let mut schema_builder = Schema::builder();
-        schema_builder.add_text_field("title", TEXT | STORED);
-        schema_builder.add_text_field("body", TEXT | STORED);
-        schema_builder.add_text_field("url", STRING | STORED);
-        schema_builder.add_text_field("domain", STRING | STORED);
-        schema_builder.add_text_field("tags", STRING | STORED | FAST);
-        let schema = schema_builder.build();
-        Index::builder()
-            .schema(schema)
-            .settings(tantivy::IndexSettings {
-                docstore_blocksize: 32_000_000,
-                ..tantivy::IndexSettings::default()
-            })
-            .create_in_dir(index_path)?
-    };
-
-    let schema = index.schema();
-
-    let mut index_writer = index.writer(32_000_000).unwrap();
-
-    let title = schema.get_field("title").unwrap();
-    let body = schema.get_field("body").unwrap();
-    let url = schema.get_field("url").unwrap();
-    let domain = schema.get_field("domain").unwrap();
-    let tags = schema.get_field("tags").unwrap();
-
     for link in links.iter().take(50) {
         if db.get(link.url.as_str())?.is_some() {
             continue;
@@ -110,41 +184,18 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
         doc.scraped_at = Some(OffsetDateTime::now_utc());
         db.insert(doc.url.as_str(), serde_json::to_vec(&doc).unwrap())?;
-
-        let mut document = Document::default();
-        document.add_text(title, &doc.title);
-        document.add_text(body, &doc.body);
-        document.add_text(url, &doc.url);
-        document.add_text(domain, &doc.domain);
-        document.add_text(tags, "test");
-        document.add_text(tags, "example");
-        document.add_text(tags, "something");
-
-        println!("Adding: {} ({})\n{}\n", doc.title, doc.url, doc.body);
-
-        index_writer.add_document(document)?;
     }
 
     db.flush()?;
 
-    index_writer.commit().expect("commit index");
-
-    let reader = index.reader()?;
-    let searcher = reader.searcher();
+    let all_docs = db
+        .iter()
+        .values()
+        .filter_map(|value| value.ok())
+        .map(|v| serde_json::from_slice::<Doc>(v.as_ref()).unwrap())
+        .collect::<Vec<Doc>>();
 
-    let query_parser = QueryParser::for_index(&index, vec![title, body]);
-
-    let query = query_parser.parse_query("Rust AND parser")?;
-    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
-
-    for (score, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        let title = retrieved_doc.get_first(title).unwrap().as_text().unwrap();
-        let url = retrieved_doc.get_first(url).unwrap().as_text().unwrap();
-
-        println!("{} ({}) - Score: {}", title, url, score);
-        println!("{:?}", retrieved_doc);
-    }
+    index.add(all_docs)?;
 
     Ok(())
 }
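The interactive query demo deleted from `main()` above gets no replacement in this diff; if it were kept, the natural home would be a method on the new `SearchIndex`. A minimal sketch under that assumption, reusing the same tantivy calls the deleted code used (the `search` method name is illustrative, not part of the diff):

```rust
use tantivy::{collector::TopDocs, query::QueryParser};

impl SearchIndex {
    fn search(&self, query: &str) -> Result<(), Box<dyn std::error::Error>> {
        let schema = self.index.schema();
        let title = schema.get_field("title").unwrap();
        let body = schema.get_field("body").unwrap();
        let url = schema.get_field("url").unwrap();

        let reader = self.index.reader()?;
        let searcher = reader.searcher();

        // Parse the query against title and body, then print the ten best hits.
        let query_parser = QueryParser::for_index(&self.index, vec![title, body]);
        let query = query_parser.parse_query(query)?;
        let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

        for (score, doc_address) in top_docs {
            let retrieved = searcher.doc(doc_address)?;
            let title = retrieved.get_first(title).unwrap().as_text().unwrap();
            let url = retrieved.get_first(url).unwrap().as_text().unwrap();
            println!("{} ({}) - Score: {}", title, url, score);
        }

        Ok(())
    }
}
```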