From 1a7b22c06384cfbe8a54385e478325ee9ac9ab40 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 12 Apr 2023 12:59:47 -0700 Subject: [PATCH 1/3] Adds the option to run on a whole document at a time --- src/main.rs | 75 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/src/main.rs b/src/main.rs index bd1afb8..be1953b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -65,6 +65,14 @@ struct Args { #[arg(long, default_value_t = false)] annotate_only: bool, + /// If you want ngrams to span across paragraph breaks, set this to true. + /// This also means that bff will only remove a complete document at a time. When this happens + /// the resulting document will be empty. This also means that deduplication within a document + /// no longer works. All in all, it might be best to only use this when you're also using + /// --annotate-only. + #[arg(long, default_value_t = false)] + whole_document: bool, + /// The number of threads to use for processing. /// If this is 0, the number of threads is automatically determined. #[arg(long, short = 't', default_value_t = 0)] @@ -287,6 +295,7 @@ fn process_file( update_bloom_filter: bool, filtering_threshold: f64, annotate_only: bool, + whole_document: bool, ) -> Result <(), io::Error> { let input_file = OpenOptions::new(). read(true). @@ -311,13 +320,20 @@ fn process_file( let line = line.unwrap(); let mut data: Value = serde_json::from_str(&line).unwrap(); let text = data["text"].as_str().unwrap(); - let mut newlines = Vec::new(); - newlines.push(0); - for i in text.match_indices("\n") { - newlines.push(i.0); - } - newlines.push(text.len()); + + let newlines = if whole_document { + vec![0, text.len()] + } else { + let mut newlines = Vec::new(); + newlines.push(0); + for i in text.match_indices("\n") { + newlines.push(i.0); + } + newlines.push(text.len()); + newlines + }; let mut windows_to_remove = Vec::new(); + let mut total_contained_ngrams = 0; for paragraph_window in newlines.windows(2) { let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; @@ -338,36 +354,29 @@ fn process_file( hashes.push(bloom_filter.hashes(&ngram)); } - if filtering_threshold <= 0.0 { - // If we're just priming the filter, just do it right here without checking whether - // the ngrams are in the filter. - if update_bloom_filter { - for ngram in hashes { - bloom_filter.insert_hashes(&ngram); - } - } - } else { - // calculate how many ngrams are in the bloom filter - let contained_ngrams = hashes.iter().filter(|ngram| { - bloom_filter.contains_hashes(ngram) - }).count(); - let number_of_ngrams = hashes.len(); - - // produce output - let too_many_duplicate_ngrams = - contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold; - if too_many_duplicate_ngrams { - windows_to_remove.push(paragraph_window); - } else if update_bloom_filter { - for ngram in hashes { - bloom_filter.insert_hashes(&ngram); - } + let contained_ngrams = hashes.iter().filter(|ngram| { + bloom_filter.contains_hashes(ngram) + }).count(); + total_contained_ngrams += contained_ngrams; + + // calculate how many ngrams are in the bloom filter + let number_of_ngrams = hashes.len(); + + // produce output + let too_many_duplicate_ngrams = + contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold; + if too_many_duplicate_ngrams { + windows_to_remove.push(paragraph_window); + } else if update_bloom_filter { + for ngram in hashes { + bloom_filter.insert_hashes(&ngram); } } } if annotate_only { - data["duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); + data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); + data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap(); } else { let mut output_paragraphs = String::new(); let mut last_end = 0; @@ -377,6 +386,7 @@ fn process_file( } output_paragraphs.push_str(&text[last_end..]); data["text"] = Value::String(output_paragraphs); + data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap(); } serde_json::to_writer(&mut writer, &data)?; @@ -447,7 +457,8 @@ fn main() { args.min_ngram_size, args.update_bloom_filter, args.filtering_threshold, - args.annotate_only + args.annotate_only, + args.whole_document ).unwrap(); }); } From 6f57042b99161eb99e74ea035923158532894239 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 12 Apr 2023 13:03:16 -0700 Subject: [PATCH 2/3] Clarify name of annotation --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index be1953b..28d39ba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -386,7 +386,7 @@ fn process_file( } output_paragraphs.push_str(&text[last_end..]); data["text"] = Value::String(output_paragraphs); - data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap(); + data["bff_contained_ngram_count_before_dedupe"] = serde_json::to_value(total_contained_ngrams).unwrap(); } serde_json::to_writer(&mut writer, &data)?; From bee5eb44356e3f09a6af2a89c2cd3f717b17e379 Mon Sep 17 00:00:00 2001 From: Ian Date: Wed, 19 Apr 2023 16:40:30 +0000 Subject: [PATCH 3/3] attribute only output option --- src/main.rs | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index eab1823..33c13e4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -65,6 +65,11 @@ struct Args { #[arg(long, default_value_t = false)] annotate_only: bool, + /// If this is true, we only write out document id and source, and annotate which spans would + /// have been deleted. This produces an attribute file per the llm-data specification. + #[arg(long, default_value_t = false)] + annotate_attribute_only: bool, + /// If you want ngrams to span across paragraph breaks, set this to true. /// This also means that bff will only remove a complete document at a time. When this happens /// the resulting document will be empty. This also means that deduplication within a document @@ -295,6 +300,7 @@ fn process_file( update_bloom_filter: bool, filtering_threshold: f64, annotate_only: bool, + annotate_attribute_only: bool, whole_document: bool, ) -> Result <(), io::Error> { let input_file = OpenOptions::new(). @@ -374,7 +380,8 @@ fn process_file( } } - if annotate_only { + // if annotate_attribute_only or annotate_only, add the annotation to the json + if annotate_attribute_only || annotate_only { data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap(); } else { @@ -389,6 +396,29 @@ fn process_file( data["bff_contained_ngram_count_before_dedupe"] = serde_json::to_value(total_contained_ngrams).unwrap(); } + if annotate_attribute_only { + // Allowed fields + let allowed_fields = [ + "bff_duplicate_spans", + "bff_contained_ngram_count", + "id", + "source", + ]; + + // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list + if let Value::Object(ref mut map) = data { + let keys_to_remove: Vec = map + .keys() + .filter(|key| !allowed_fields.contains(&key.as_str())) + .map(|key| key.to_owned()) + .collect(); + for key in keys_to_remove { + map.remove(&key); + } + } + + } + serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } @@ -458,6 +488,7 @@ fn main() { !args.no_update_bloom_filter, args.filtering_threshold, args.annotate_only, + args.annotate_attribute_only, args.whole_document ).unwrap(); });