Start work on doc-to-shard property copying.

explosion · Nov 3, 2023 · 9bf365d · 9bf365d
1 parent b54a3d9
commit 9bf365d
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 8 deletions.
diff --git a/spacy_llm/tasks/rel/parser.py b/spacy_llm/tasks/rel/parser.py
@@ -19,13 +19,13 @@ def parse_responses_v1(
     """
     for responses_for_doc, shards_for_doc in zip(responses, shards):
         results_for_doc: List[List[RelationItem]] = []
-        for response, doc in zip(responses_for_doc, shards_for_doc):
+        for response, shard in zip(responses_for_doc, shards_for_doc):
             relations: List[RelationItem] = []
             for line in response.strip().split("\n"):
                 try:
                     rel_item = RelationItem.parse_raw(line)
-                    if 0 <= rel_item.dep < len(doc.ents) and 0 <= rel_item.dest < len(
-                        doc.ents
+                    if 0 <= rel_item.dep < len(shard.ents) and 0 <= rel_item.dest < len(
+                        shard.ents
                     ):
                         relations.append(rel_item)
                 except ValidationError:

diff --git a/spacy_llm/tasks/rel/task.py b/spacy_llm/tasks/rel/task.py
@@ -101,12 +101,11 @@ def parse_responses(
         for shards_for_doc, rel_items_for_doc in zip(
             shards_teed[0], self._parse_responses(self, shards_teed[1], responses)
         ):
-            updated_shards_for_doc: List[Doc] = []
+            shards_for_doc = list(shards_for_doc)
             for shard, rel_items in zip(shards_for_doc, rel_items_for_doc):
                 shard._.rel = rel_items
-                updated_shards_for_doc.append(shard)
 
-            yield self._shard_reducer(updated_shards_for_doc)
+            yield self._shard_reducer(shards_for_doc)
 
     def initialize(
         self,

diff --git a/spacy_llm/tasks/util/sharding.py b/spacy_llm/tasks/util/sharding.py
@@ -28,8 +28,8 @@ def make_shard_mapper(
     n_token_estimator (NTokenEstimator): Estimates number of tokens in a string.
     buffer_frac (float): Buffer to consider in assessment of whether prompt fits into context. E. g. if value is 1.1,
         prompt length * 1.1 will be compared with the context length.
-    # todo sharding would be better with sentences instead of tokens, but this requires some form of sentence
-    #  splitting we can't rely one...maybe checking for sentences and/or as optional arg?
+    todo sharding would be better with sentences instead of tokens, but this requires some form of sentence
+     splitting we can't rely on... maybe check for sentences and/or offer this as an optional arg?
     RETURNS (ShardMapper): Callable mapping doc to doc shards fitting within context length.
     """
     n_tok_est: NTokenEstimator = n_token_estimator or make_n_token_estimator()
@@ -54,6 +54,9 @@ def map_doc_to_shards(
             fraction = 0.5
             start_idx = 0
 
+            if n_tok_est(render_template(doc)) * buffer_frac <= context_length:
+                return [doc]
+
             while remaining_doc is not None:
                 fits_in_context = False
                 shard: Optional[Doc] = None
@@ -68,6 +71,12 @@ def map_doc_to_shards(
                     )
                     fraction /= 2
 
+                # todo doc properties, such as .ents, have to be included for some tasks (e.g. REL, EL) to work. how
+                #  should this be done in cases where the properties transcend shard limits?
+                #   - should sharding never cut across entities/other properties?
+                #   - should entities or all other properties be dropped if they transcend shard limits? this seems
+                #     like the most pragmatic solution for now.
+                #   - which properties should be copied to shards other than .ents?
                 assert shard is not None
                 shards.append(shard)
                 fraction = 1