From 9bf365d1c9312a178386f48b1b7b202d1f381a62 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 3 Nov 2023 17:44:39 +0100 Subject: [PATCH] Start work on doc-to-shard property copying. --- spacy_llm/tasks/rel/parser.py | 6 +++--- spacy_llm/tasks/rel/task.py | 5 ++--- spacy_llm/tasks/util/sharding.py | 13 +++++++++++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/spacy_llm/tasks/rel/parser.py b/spacy_llm/tasks/rel/parser.py index 3f79b31e..27ede457 100644 --- a/spacy_llm/tasks/rel/parser.py +++ b/spacy_llm/tasks/rel/parser.py @@ -19,13 +19,13 @@ def parse_responses_v1( """ for responses_for_doc, shards_for_doc in zip(responses, shards): results_for_doc: List[List[RelationItem]] = [] - for response, doc in zip(responses_for_doc, shards_for_doc): + for response, shard in zip(responses_for_doc, shards_for_doc): relations: List[RelationItem] = [] for line in response.strip().split("\n"): try: rel_item = RelationItem.parse_raw(line) - if 0 <= rel_item.dep < len(doc.ents) and 0 <= rel_item.dest < len( - doc.ents + if 0 <= rel_item.dep < len(shard.ents) and 0 <= rel_item.dest < len( + shard.ents ): relations.append(rel_item) except ValidationError: diff --git a/spacy_llm/tasks/rel/task.py b/spacy_llm/tasks/rel/task.py index 8bf38f3c..dd42b246 100644 --- a/spacy_llm/tasks/rel/task.py +++ b/spacy_llm/tasks/rel/task.py @@ -101,12 +101,11 @@ def parse_responses( for shards_for_doc, rel_items_for_doc in zip( shards_teed[0], self._parse_responses(self, shards_teed[1], responses) ): - updated_shards_for_doc: List[Doc] = [] + shards_for_doc = list(shards_for_doc) for shard, rel_items in zip(shards_for_doc, rel_items_for_doc): shard._.rel = rel_items - updated_shards_for_doc.append(shard) - yield self._shard_reducer(updated_shards_for_doc) + yield self._shard_reducer(shards_for_doc) def initialize( self, diff --git a/spacy_llm/tasks/util/sharding.py b/spacy_llm/tasks/util/sharding.py index c153f170..c6919a47 100644 --- a/spacy_llm/tasks/util/sharding.py +++ 
b/spacy_llm/tasks/util/sharding.py @@ -28,8 +28,8 @@ def make_shard_mapper( n_token_estimator (NTokenEstimator): Estimates number of tokens in a string. buffer_frac (float): Buffer to consider in assessment of whether prompt fits into context. E. g. if value is 1.1, prompt length * 1.1 will be compared with the context length. - # todo sharding would be better with sentences instead of tokens, but this requires some form of sentence - # splitting we can't rely one...maybe checking for sentences and/or as optional arg? + todo sharding would be better with sentences instead of tokens, but this requires some form of sentence + splitting we can't rely on...maybe checking for sentences and/or as optional arg? RETURNS (ShardMapper): Callable mapping doc to doc shards fitting within context length. """ n_tok_est: NTokenEstimator = n_token_estimator or make_n_token_estimator() @@ -54,6 +54,9 @@ def map_doc_to_shards( fraction = 0.5 start_idx = 0 + if n_tok_est(render_template(doc)) * buffer_frac <= context_length: + return [doc] + while remaining_doc is not None: fits_in_context = False shard: Optional[Doc] = None @@ -68,6 +71,12 @@ def map_doc_to_shards( ) fraction /= 2 + # todo doc properties, such as .ents, have to be included for some tasks (e. g. REL, EL) to work. how + # should this be done in cases where the properties transcend shard limits? + # - should sharding never cut across entities/other properties? + # - should entities or all other properties be dropped if they transcend shard limits? this seems + # like the most pragmatic solution for now. + # - which properties should be copied to shards other than .ents? assert shard is not None shards.append(shard) fraction = 1