From 382c296f7daccb6f565c1ff0ed65e75a2097810a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 19 Dec 2023 08:56:18 +0100
Subject: [PATCH 1/2] Remove debug data normalization for span analysis

As a result of this normalization (lowercasing and quote replacement),
`debug data` could show users tokens that do not exist in their data.
---
 spacy/cli/debug_data.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 714969be145..e2839ed0354 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1073,9 +1073,7 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            # Normalize the text
-            t = token.text.lower().replace("``", '"').replace("''", '"')
-            word_counts[t] += 1
+            word_counts[token.text] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)
         word_counts = Counter({k: v / total for k, v in word_counts.items()})

From fd00de42aaad5d98d7cd868ef0001f70cdf8345b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 20 Dec 2023 17:39:45 +0100
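Subject: [PATCH 2/2] Keep lowercasing tokens in debug data span analysis

Lowercase token text again when counting words, but do not bring back
the quote replacement that the previous patch removed.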
---
 spacy/cli/debug_data.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index e2839ed0354..7a98e6d563c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1073,7 +1073,8 @@ def _get_distribution(docs, normalize: bool = True) -> Counter:
     word_counts: Counter = Counter()
     for doc in docs:
         for token in doc:
-            word_counts[token.text] += 1
+            t = token.text.lower()
+            word_counts[t] += 1
     if normalize:
         total = sum(word_counts.values(), 0.0)
         word_counts = Counter({k: v / total for k, v in word_counts.items()})