Skip to content

Commit

Permalink
index file read fix (#229)
Browse files Browse the repository at this point in the history
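Fix reading of the `.ds.index` file in the context shuffler code: the index file is now opened via the input folder before being passed to `load_doc_ends`. Also fix the log message, which tried to access a `path` attribute on a value that is a string.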
  • Loading branch information
sippycoder authored Jul 8, 2024
1 parent 061d4db commit af63762
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/datatrove/pipeline/tokens/context_shuffler.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1
datafiles = self.input_folder.get_shard(rank, world_size, glob_pattern="*.ds")
datafiles_index = self.input_folder.get_shard(rank, world_size, glob_pattern="*.ds.index")
for datafile, index in zip(datafiles, datafiles_index):
- logger.info(f"Context shuffling {datafile.path} with a {self.window_size} token window")
- total_len = load_doc_ends(index)[-1]
+ logger.info(f"Context shuffling {datafile} with a {self.window_size} token window")
+ total_len = load_doc_ends(self.input_folder.open(index, "rb"))[-1]
nr_windows = total_len // self.window_size
ordering = self.rand.permutation(np.arange(0, nr_windows, dtype=int))
with self.output_folder.open(datafile, "wb") as fout:
Expand Down

0 comments on commit af63762

Please sign in to comment.