From 1a7b22c06384cfbe8a54385e478325ee9ac9ab40 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 12 Apr 2023 12:59:47 -0700
Subject: [PATCH 1/3] Adds the option to run on a whole document at a time

---
 src/main.rs | 75 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 32 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index bd1afb8..be1953b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -65,6 +65,14 @@ struct Args {
     #[arg(long, default_value_t = false)]
     annotate_only: bool,
 
+    /// If you want ngrams to span across paragraph breaks, set this to true.
+    /// This also means that bff will only remove a complete document at a time. When this happens
+    /// the resulting document will be empty. This also means that deduplication within a document
+    /// no longer works. All in all, it might be best to only use this when you're also using
+    /// --annotate-only.
+    #[arg(long, default_value_t = false)]
+    whole_document: bool,
+
     /// The number of threads to use for processing.
     /// If this is 0, the number of threads is automatically determined.
     #[arg(long, short = 't', default_value_t = 0)]
@@ -287,6 +295,7 @@ fn process_file(
     update_bloom_filter: bool,
     filtering_threshold: f64,
     annotate_only: bool,
+    whole_document: bool,
 ) -> Result <(), io::Error> {
     let input_file = OpenOptions::new().
         read(true).
@@ -311,13 +320,20 @@ fn process_file(
         let line = line.unwrap();
         let mut data: Value = serde_json::from_str(&line).unwrap();
         let text = data["text"].as_str().unwrap();
-        let mut newlines = Vec::new();
-        newlines.push(0);
-        for i in text.match_indices("\n") {
-            newlines.push(i.0);
-        }
-        newlines.push(text.len());
+
+        let newlines = if whole_document {
+            vec![0, text.len()]
+        } else {
+            let mut newlines = Vec::new();
+            newlines.push(0);
+            for i in text.match_indices("\n") {
+                newlines.push(i.0);
+            }
+            newlines.push(text.len());
+            newlines
+        };
         let mut windows_to_remove = Vec::new();
+        let mut total_contained_ngrams = 0;
 
         for paragraph_window in newlines.windows(2) {
             let paragraph = &text[paragraph_window[0]..paragraph_window[1]];
@@ -338,36 +354,29 @@ fn process_file(
                 hashes.push(bloom_filter.hashes(&ngram));
             }
 
-            if filtering_threshold <= 0.0 {
-                // If we're just priming the filter, just do it right here without checking whether
-                // the ngrams are in the filter.
-                if update_bloom_filter {
-                    for ngram in hashes {
-                        bloom_filter.insert_hashes(&ngram);
-                    }
-                }
-            } else {
-                // calculate how many ngrams are in the bloom filter
-                let contained_ngrams = hashes.iter().filter(|ngram| {
-                    bloom_filter.contains_hashes(ngram)
-                }).count();
-                let number_of_ngrams = hashes.len();
-
-                // produce output
-                let too_many_duplicate_ngrams =
-                    contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold;
-                if too_many_duplicate_ngrams {
-                    windows_to_remove.push(paragraph_window);
-                } else if update_bloom_filter {
-                    for ngram in hashes {
-                        bloom_filter.insert_hashes(&ngram);
-                    }
+            let contained_ngrams = hashes.iter().filter(|ngram| {
+                bloom_filter.contains_hashes(ngram)
+            }).count();
+            total_contained_ngrams += contained_ngrams;
+
+            // number of ngrams hashed for this paragraph; denominator of the duplicate ratio below
+            let number_of_ngrams = hashes.len();
+
+            // produce output
+            let too_many_duplicate_ngrams =
+                contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold;
+            if too_many_duplicate_ngrams {
+                windows_to_remove.push(paragraph_window);
+            } else if update_bloom_filter {
+                for ngram in hashes {
+                    bloom_filter.insert_hashes(&ngram);
                 }
             }
         }
 
         if annotate_only {
-            data["duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap();
+            data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap();
+            data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap();
         } else {
             let mut output_paragraphs = String::new();
             let mut last_end = 0;
@@ -377,6 +386,7 @@ fn process_file(
             }
             output_paragraphs.push_str(&text[last_end..]);
             data["text"] = Value::String(output_paragraphs);
+            data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap();
         }
 
         serde_json::to_writer(&mut writer, &data)?;
@@ -447,7 +457,8 @@ fn main() {
                 args.min_ngram_size,
                 args.update_bloom_filter,
                 args.filtering_threshold,
-                args.annotate_only
+                args.annotate_only,
+                args.whole_document
             ).unwrap();
         });
     }

From 6f57042b99161eb99e74ea035923158532894239 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 12 Apr 2023 13:03:16 -0700
Subject: [PATCH 2/3] Clarify name of annotation

---
 src/main.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index be1953b..28d39ba 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -386,7 +386,7 @@ fn process_file(
             }
             output_paragraphs.push_str(&text[last_end..]);
             data["text"] = Value::String(output_paragraphs);
-            data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap();
+            data["bff_contained_ngram_count_before_dedupe"] = serde_json::to_value(total_contained_ngrams).unwrap();
         }
 
         serde_json::to_writer(&mut writer, &data)?;

From bee5eb44356e3f09a6af2a89c2cd3f717b17e379 Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Wed, 19 Apr 2023 16:40:30 +0000
Subject: [PATCH 3/3] attribute only output option

---
 src/main.rs | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index eab1823..33c13e4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -65,6 +65,11 @@ struct Args {
     #[arg(long, default_value_t = false)]
     annotate_only: bool,
 
+    /// If this is true, we only write out id, source, the spans that would have been deleted, and
+    /// the contained ngram count. This produces an attribute file per the llm-data specification.
+    #[arg(long, default_value_t = false)]
+    annotate_attribute_only: bool,
+
     /// If you want ngrams to span across paragraph breaks, set this to true.
     /// This also means that bff will only remove a complete document at a time. When this happens
     /// the resulting document will be empty. This also means that deduplication within a document
@@ -295,6 +300,7 @@ fn process_file(
     update_bloom_filter: bool,
     filtering_threshold: f64,
     annotate_only: bool,
+    annotate_attribute_only: bool,
     whole_document: bool,
 ) -> Result <(), io::Error> {
     let input_file = OpenOptions::new().
@@ -374,7 +380,8 @@ fn process_file(
             }
         }
 
-        if annotate_only {
+        // if annotate_attribute_only or annotate_only, add the annotation to the json
+        if annotate_attribute_only || annotate_only {
             data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap();
             data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap();
         } else {
@@ -389,6 +396,29 @@ fn process_file(
             data["bff_contained_ngram_count_before_dedupe"] = serde_json::to_value(total_contained_ngrams).unwrap();
         }
 
+        if annotate_attribute_only {
+            // Allowed fields
+            let allowed_fields = [
+                "bff_duplicate_spans",
+                "bff_contained_ngram_count",
+                "id",
+                "source",
+            ];
+
+            // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list
+            if let Value::Object(ref mut map) = data {
+                let keys_to_remove: Vec<String> = map
+                    .keys()
+                    .filter(|key| !allowed_fields.contains(&key.as_str()))
+                    .map(|key| key.to_owned())
+                    .collect();
+                for key in keys_to_remove {
+                    map.remove(&key);
+                }
+            }
+
+        }
+
         serde_json::to_writer(&mut writer, &data)?;
         writer.write_all(b"\n")?;
     }
@@ -458,6 +488,7 @@ fn main() {
                 !args.no_update_bloom_filter,
                 args.filtering_threshold,
                 args.annotate_only,
+                args.annotate_attribute_only,
                 args.whole_document
             ).unwrap();
         });