From 9bf365d1c9312a178386f48b1b7b202d1f381a62 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Fri, 3 Nov 2023 17:44:39 +0100
Subject: [PATCH] Start work on doc-to-shard property copying.

---
 spacy_llm/tasks/rel/parser.py    |  6 +++---
 spacy_llm/tasks/rel/task.py      |  5 ++---
 spacy_llm/tasks/util/sharding.py | 13 +++++++++++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/spacy_llm/tasks/rel/parser.py b/spacy_llm/tasks/rel/parser.py
index 3f79b31e..27ede457 100644
--- a/spacy_llm/tasks/rel/parser.py
+++ b/spacy_llm/tasks/rel/parser.py
@@ -19,13 +19,13 @@ def parse_responses_v1(
     """
     for responses_for_doc, shards_for_doc in zip(responses, shards):
         results_for_doc: List[List[RelationItem]] = []
-        for response, doc in zip(responses_for_doc, shards_for_doc):
+        for response, shard in zip(responses_for_doc, shards_for_doc):
             relations: List[RelationItem] = []
             for line in response.strip().split("\n"):
                 try:
                     rel_item = RelationItem.parse_raw(line)
-                    if 0 <= rel_item.dep < len(doc.ents) and 0 <= rel_item.dest < len(
-                        doc.ents
+                    if 0 <= rel_item.dep < len(shard.ents) and 0 <= rel_item.dest < len(
+                        shard.ents
                     ):
                         relations.append(rel_item)
                 except ValidationError:
diff --git a/spacy_llm/tasks/rel/task.py b/spacy_llm/tasks/rel/task.py
index 8bf38f3c..dd42b246 100644
--- a/spacy_llm/tasks/rel/task.py
+++ b/spacy_llm/tasks/rel/task.py
@@ -101,12 +101,11 @@ def parse_responses(
         for shards_for_doc, rel_items_for_doc in zip(
             shards_teed[0], self._parse_responses(self, shards_teed[1], responses)
         ):
-            updated_shards_for_doc: List[Doc] = []
+            shards_for_doc = list(shards_for_doc)
             for shard, rel_items in zip(shards_for_doc, rel_items_for_doc):
                 shard._.rel = rel_items
-                updated_shards_for_doc.append(shard)
 
-            yield self._shard_reducer(updated_shards_for_doc)
+            yield self._shard_reducer(shards_for_doc)
 
     def initialize(
         self,
diff --git a/spacy_llm/tasks/util/sharding.py b/spacy_llm/tasks/util/sharding.py
index c153f170..c6919a47 100644
--- a/spacy_llm/tasks/util/sharding.py
+++ b/spacy_llm/tasks/util/sharding.py
@@ -28,8 +28,8 @@ def make_shard_mapper(
     n_token_estimator (NTokenEstimator): Estimates number of tokens in a string.
     buffer_frac (float): Buffer to consider in assessment of whether prompt fits into context. E. g. if value is 1.1,
         prompt length * 1.1 will be compared with the context length.
-    # todo sharding would be better with sentences instead of tokens, but this requires some form of sentence
-    #  splitting we can't rely one...maybe checking for sentences and/or as optional arg?
+    todo sharding would be better with sentences instead of tokens, but this requires some form of sentence
+     splitting we can't rely on...maybe checking for sentences and/or as optional arg?
     RETURNS (ShardMapper): Callable mapping doc to doc shards fitting within context length.
     """
     n_tok_est: NTokenEstimator = n_token_estimator or make_n_token_estimator()
@@ -54,6 +54,9 @@ def map_doc_to_shards(
             fraction = 0.5
             start_idx = 0
 
+            if n_tok_est(render_template(doc)) * buffer_frac <= context_length:
+                return [doc]
+
             while remaining_doc is not None:
                 fits_in_context = False
                 shard: Optional[Doc] = None
@@ -68,6 +71,12 @@ def map_doc_to_shards(
                     )
                     fraction /= 2
 
+                # todo doc properties, such as .ents, have to be included for some tasks (e. g. REL, EL) to work. how
+                #  should this be done in cases where the properties transcend shard limits?
+                #   - should sharding never cut across entities/other properties?
+                #   - should entities or all other properties be dropped if they transcend shard limits? this seems
+                #     like the most pragmatic solution for now.
+                #   - which properties should be copied to shards other than .ents?
                 assert shard is not None
                 shards.append(shard)
                 fraction = 1