Skip to content

Commit

Permalink
Start work on doc-to-shard property copying.
Browse files Browse the repository at this point in the history
  • Loading branch information
rmitsch committed Nov 3, 2023
1 parent b54a3d9 commit 9bf365d
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 8 deletions.
6 changes: 3 additions & 3 deletions spacy_llm/tasks/rel/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ def parse_responses_v1(
"""
for responses_for_doc, shards_for_doc in zip(responses, shards):
results_for_doc: List[List[RelationItem]] = []
for response, doc in zip(responses_for_doc, shards_for_doc):
for response, shard in zip(responses_for_doc, shards_for_doc):
relations: List[RelationItem] = []
for line in response.strip().split("\n"):
try:
rel_item = RelationItem.parse_raw(line)
if 0 <= rel_item.dep < len(doc.ents) and 0 <= rel_item.dest < len(
doc.ents
if 0 <= rel_item.dep < len(shard.ents) and 0 <= rel_item.dest < len(
shard.ents
):
relations.append(rel_item)
except ValidationError:
Expand Down
5 changes: 2 additions & 3 deletions spacy_llm/tasks/rel/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,11 @@ def parse_responses(
for shards_for_doc, rel_items_for_doc in zip(
shards_teed[0], self._parse_responses(self, shards_teed[1], responses)
):
updated_shards_for_doc: List[Doc] = []
shards_for_doc = list(shards_for_doc)
for shard, rel_items in zip(shards_for_doc, rel_items_for_doc):
shard._.rel = rel_items
updated_shards_for_doc.append(shard)

yield self._shard_reducer(updated_shards_for_doc)
yield self._shard_reducer(shards_for_doc)

def initialize(
self,
Expand Down
13 changes: 11 additions & 2 deletions spacy_llm/tasks/util/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def make_shard_mapper(
n_token_estimator (NTokenEstimator): Estimates number of tokens in a string.
buffer_frac (float): Buffer to consider in assessment of whether prompt fits into context. E. g. if value is 1.1,
prompt length * 1.1 will be compared with the context length.
# todo sharding would be better with sentences instead of tokens, but this requires some form of sentence
# splitting we can't rely one...maybe checking for sentences and/or as optional arg?
todo sharding would be better with sentences instead of tokens, but this requires some form of sentence
splitting we can't rely on...maybe checking for sentences and/or as optional arg?
RETURNS (ShardMapper): Callable mapping doc to doc shards fitting within context length.
"""
n_tok_est: NTokenEstimator = n_token_estimator or make_n_token_estimator()
Expand All @@ -54,6 +54,9 @@ def map_doc_to_shards(
fraction = 0.5
start_idx = 0

if n_tok_est(render_template(doc)) * buffer_frac <= context_length:
return [doc]

while remaining_doc is not None:
fits_in_context = False
shard: Optional[Doc] = None
Expand All @@ -68,6 +71,12 @@ def map_doc_to_shards(
)
fraction /= 2

# todo doc properties, such as .ents, have to be included for some tasks (e. g. REL, EL) to work. how
# should this be done in cases where the properties transcend shard limits?
# - should sharding never cut across entities/other properties?
# - should entities or all other properties be dropped if they transcend shard limits? this seems
# like the most pragmatic solution for now.
# - which properties should be copied to shards other than .ents?
assert shard is not None
shards.append(shard)
fraction = 1
Expand Down

0 comments on commit 9bf365d

Please sign in to comment.