allenai · dirkgr · Apr 25, 2023 · Apr 12, 2023 · Apr 12, 2023 · Apr 19, 2023
diff --git a/src/main.rs b/src/main.rs
@@ -65,6 +65,19 @@ struct Args {
     #[arg(long, default_value_t = false)]
     annotate_only: bool,
 
+    /// If this is true, we only write out document id and source, and annotate which spans would
+    /// have been deleted. This produces an attribute file per the llm-data specification.
+    #[arg(long, default_value_t = false)]
+    annotate_attribute_only: bool,
+
+    /// If you want ngrams to span across paragraph breaks, set this to true.
+    /// This also means that bff will only remove a complete document at a time. When this happens
+    /// the resulting document will be empty. This also means that deduplication within a document
+    /// no longer works. All in all, it might be best to only use this when you're also using
+    /// --annotate-only.
+    #[arg(long, default_value_t = false)]
+    whole_document: bool,
+
     /// The number of threads to use for processing.
     /// If this is 0, the number of threads is automatically determined.
     #[arg(long, short = 't', default_value_t = 0)]
@@ -287,6 +300,8 @@ fn process_file(
     update_bloom_filter: bool,
     filtering_threshold: f64,
     annotate_only: bool,
+    annotate_attribute_only: bool,
+    whole_document: bool,
 ) -> Result <(), io::Error> {
     let input_file = OpenOptions::new().
         read(true).
@@ -311,13 +326,20 @@ fn process_file(
         let line = line.unwrap();
         let mut data: Value = serde_json::from_str(&line).unwrap();
         let text = data["text"].as_str().unwrap();
-        let mut newlines = Vec::new();
-        newlines.push(0);
-        for i in text.match_indices("\n") {
-            newlines.push(i.0);
-        }
-        newlines.push(text.len());
+
+        let newlines = if whole_document {
+            vec![0, text.len()]
+        } else {
+            let mut newlines = Vec::new();
+            newlines.push(0);
+            for i in text.match_indices("\n") {
+                newlines.push(i.0);
+            }
+            newlines.push(text.len());
+            newlines
+        };
         let mut windows_to_remove = Vec::new();
+        let mut total_contained_ngrams = 0;
 
         for paragraph_window in newlines.windows(2) {
             let paragraph = &text[paragraph_window[0]..paragraph_window[1]];
@@ -338,36 +360,30 @@ fn process_file(
                 hashes.push(bloom_filter.hashes(&ngram));
             }
 
-            if filtering_threshold <= 0.0 {
-                // If we're just priming the filter, just do it right here without checking whether
-                // the ngrams are in the filter.
-                if update_bloom_filter {
-                    for ngram in hashes {
-                        bloom_filter.insert_hashes(&ngram);
-                    }
-                }
-            } else {
-                // calculate how many ngrams are in the bloom filter
-                let contained_ngrams = hashes.iter().filter(|ngram| {
-                    bloom_filter.contains_hashes(ngram)
-                }).count();
-                let number_of_ngrams = hashes.len();
-
-                // produce output
-                let too_many_duplicate_ngrams =
-                    contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold;
-                if too_many_duplicate_ngrams {
-                    windows_to_remove.push(paragraph_window);
-                } else if update_bloom_filter {
-                    for ngram in hashes {
-                        bloom_filter.insert_hashes(&ngram);
-                    }
+            let contained_ngrams = hashes.iter().filter(|ngram| {
+                bloom_filter.contains_hashes(ngram)
+            }).count();
+            total_contained_ngrams += contained_ngrams;
+
+            // calculate how many ngrams are in the bloom filter
+            let number_of_ngrams = hashes.len();
+
+            // produce output
+            let too_many_duplicate_ngrams =
+                contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold;
+            if too_many_duplicate_ngrams {
+                windows_to_remove.push(paragraph_window);
+            } else if update_bloom_filter {
+                for ngram in hashes {
+                    bloom_filter.insert_hashes(&ngram);
                 }
             }
         }
 
-        if annotate_only {
-            data["duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap();
+        // if annotate_attribute_only or annotate_only, add the annotation to the json
+        if annotate_attribute_only || annotate_only {
+            data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap();
+            data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap();
         } else {
             let mut output_paragraphs = String::new();
             let mut last_end = 0;
@@ -377,6 +393,30 @@ fn process_file(
             }
             output_paragraphs.push_str(&text[last_end..]);
             data["text"] = Value::String(output_paragraphs);
+            data["bff_contained_ngram_count_before_dedupe"] = serde_json::to_value(total_contained_ngrams).unwrap();
+        }
+
+        if annotate_attribute_only {
+            // Allowed fields
+            let allowed_fields = [
+                "bff_duplicate_spans",
+                "bff_contained_ngram_count",
+                "id",
+                "source",
+            ];
+
+            // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list
+            if let Value::Object(ref mut map) = data {
+                let keys_to_remove: Vec<String> = map
+                    .keys()
+                    .filter(|key| !allowed_fields.contains(&key.as_str()))
+                    .map(|key| key.to_owned())
+                    .collect();
+                for key in keys_to_remove {
+                    map.remove(&key);
+                }
+            }
+
         }
 
         serde_json::to_writer(&mut writer, &data)?;
@@ -447,7 +487,9 @@ fn main() {
                 args.min_ngram_size,
                 !args.no_update_bloom_filter,
                 args.filtering_threshold,
-                args.annotate_only
+                args.annotate_only,
+                args.annotate_attribute_only,
+                args.whole_document
             ).unwrap();
         });
     }