From a0d491e2a927184fee67404563eb1a0837a08f94 Mon Sep 17 00:00:00 2001
From: Scott Stevenson
Date: Mon, 9 Dec 2024 21:43:15 +0000
Subject: [PATCH] Fix a few typos (#843)

Co-authored-by: Saaketh Narayan
---
 CONTRIBUTING.md | 2 +-
 Makefile | 2 +-
 docs/source/_templates/base.html | 2 +-
 docs/source/dataset_configuration/shuffling.md | 2 +-
 docs/source/distributed_training/performance_tuning.md | 2 +-
 scripts/samples/bench_and_plot.py | 4 ++--
 simulation/core/utils.py | 2 +-
 streaming/base/batching/stratified.py | 2 +-
 streaming/text/convert/enwiki/mds/merge_shard_groups.py | 2 +-
 streaming/text/convert/enwiki/tfrecord/pick_eval_samples.py | 2 +-
 tests/test_streaming.py | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 07c247ff3..3ed168953 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -75,7 +75,7 @@
 pytest -vv -s . # run all the unittests
 cd docs && make clean && make doctest # run doctests
 ```

-6\. [Optional] Compile and visualize the documentation locally. If you have a documentation changes, running the below commands is mandatory.
+6\. [Optional] Compile and visualize the documentation locally. If you have documentation changes, running the below commands is mandatory.

 ```bash
diff --git a/Makefile b/Makefile
index 1015ec416..670fffa8a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # several pytest settings
 PYTHON ?= python # Python command
 PYTEST ?= pytest # Pytest command
-PYRIGHT ?= pyright # Pyright command. Pyright must be installed seperately -- e.g. `node install -g pyright`
+PYRIGHT ?= pyright # Pyright command. Pyright must be installed separately -- e.g. `node install -g pyright`
 EXTRA_ARGS ?= # extra arguments for pytest

 dirs := streaming tests docs
diff --git a/docs/source/_templates/base.html b/docs/source/_templates/base.html
index c8fe55516..6a0188b0c 100644
--- a/docs/source/_templates/base.html
+++ b/docs/source/_templates/base.html
@@ -99,7 +99,7 @@
       version = fragments[1].split("/")[0]

       // NOTE: The version string will resolve to the PR number for RTD sites.
-      // Checking whether first charater is a number.
+      // Checking whether first character is a number.
       if (version[0] >= '0' && version[0] <= '9') {
         version = undefined
       }
diff --git a/docs/source/dataset_configuration/shuffling.md b/docs/source/dataset_configuration/shuffling.md
index 4efa81930..c3d6fb840 100644
--- a/docs/source/dataset_configuration/shuffling.md
+++ b/docs/source/dataset_configuration/shuffling.md
@@ -70,4 +70,4 @@ Samples within each shard are shuffled both before and after shards are split am

 Globally shuffles all samples. This is useful for single-node training on small data, where you want the most random shuffle possible, but is the least download-efficient of all shuffle algorithms. Training throughput is often much lower when using the `naive` shuffling algorithm.

-If you are having trouble with throughput, network downloads, or shuffle quality, please refer to the [perfomance tuning page](../distributed_training/performance_tuning.md).
+If you are having trouble with throughput, network downloads, or shuffle quality, please refer to the [performance tuning page](../distributed_training/performance_tuning.md).
diff --git a/docs/source/distributed_training/performance_tuning.md b/docs/source/distributed_training/performance_tuning.md
index 2eb79cd0c..fa54a3025 100644
--- a/docs/source/distributed_training/performance_tuning.md
+++ b/docs/source/distributed_training/performance_tuning.md
@@ -23,7 +23,7 @@
 $$L = 2 \cdot S \cdot \lceil\frac{C}{P}\rceil $$

 Where $L$ is the required minimum cache limit per node, in MB, $S$ is the average shard size, in MB, $C$ is the number of canonical nodes (see [here](../dataset_configuration/shuffling.md#how-shuffling-works) and [here](../distributed_training/elastic_determinism.md#requirements)), and $P$ is the number of physical nodes. This is because only a single shard, plus a potentially predownloaded subsequent shard, needs to be resident per canonical node to make progress during training.

-If using a shuffle-block-based algorithm such as [`'py1e'`](../dataset_configuration/shuffling.md#py1e-default) or [`'py1br'`](../dataset_configuration/shuffling.md#py1br), the required minumum cache limit per node will be approximately:
+If using a shuffle-block-based algorithm such as [`'py1e'`](../dataset_configuration/shuffling.md#py1e-default) or [`'py1br'`](../dataset_configuration/shuffling.md#py1br), the required minimum cache limit per node will be approximately:

 $$L = k \cdot S \lceil \frac{B}{Q} \rceil \cdot \lceil\frac{C}{P}\rceil $$
diff --git a/scripts/samples/bench_and_plot.py b/scripts/samples/bench_and_plot.py
index 22d4b834c..afa1f9fb0 100644
--- a/scripts/samples/bench_and_plot.py
+++ b/scripts/samples/bench_and_plot.py
@@ -237,7 +237,7 @@ def bench(args: Namespace, bench_name: str, desc: str, generate: Callable,
         args (Namespace): Command-line arguments.
         bench_name (str): What to call this benchmark.
         desc (str): Brief description of the data.
-        generate (Callable): Method to genereate the dataset.
+        generate (Callable): Method to generate the dataset.
         formats (List[str]): List of shard formats to benchmark this data in.
     """
     print(f'Bench: {bench_name}')
@@ -373,7 +373,7 @@ def bench(args: Namespace, bench_name: str, desc: str, generate: Callable,
     y *= args.plot_bins
     y = y.astype(np.int64)

-    # Truncate the higest ``args.truncate_highest_frac`` timings because they get further
+    # Truncate the highest ``args.truncate_highest_frac`` timings because they get further
     # and further spaced as you ascend, which would ruin the plot.
     y = y[np.nonzero(y < args.plot_bins)[0]]

diff --git a/simulation/core/utils.py b/simulation/core/utils.py
index 6284cc38b..20af6533a 100644
--- a/simulation/core/utils.py
+++ b/simulation/core/utils.py
@@ -20,7 +20,7 @@ def get_batches_epochs(dataset: SimulationDataset, max_duration: Time) -> tuple[
     Returns:
         Tuple[int, int, int]: batches per epoch, epochs, and the total batches.
     """
-    # get epochs, batches_per_epoch, and total_batches from a Time obect
+    # get epochs, batches_per_epoch, and total_batches from a Time object
     dataset_batches = dataset.get_num_batches()
     batches_per_epoch = dataset_batches
     epochs = 1
diff --git a/streaming/base/batching/stratified.py b/streaming/base/batching/stratified.py
index 827afff87..6c52f6272 100644
--- a/streaming/base/batching/stratified.py
+++ b/streaming/base/batching/stratified.py
@@ -115,7 +115,7 @@ def generate_work_stratified_batching(dataset: StreamingDataset, world: World, e
                 f'Number of samples for stream {stream_id} is {batch_portion} because the portion ' +
                 f'of this stream in the global batch, which is of size {global_batch_size}, is ' +
-                f'too low. Please increase the global batch size or increase the porportion of ' +
+                f'too low. Please increase the global batch size or increase the proportion of ' +
                 f'total samples that come from stream {stream_id}.')

     # We now merge the partitions from each stream to get our final partition over all
diff --git a/streaming/text/convert/enwiki/mds/merge_shard_groups.py b/streaming/text/convert/enwiki/mds/merge_shard_groups.py
index e5e8c47ae..9c6baeb59 100644
--- a/streaming/text/convert/enwiki/mds/merge_shard_groups.py
+++ b/streaming/text/convert/enwiki/mds/merge_shard_groups.py
@@ -11,7 +11,7 @@


 def parse_args() -> Namespace:
-    """Parse commmand-line arguments.
+    """Parse command-line arguments.

     Returns:
         Namespace: Command-line arguments.
diff --git a/streaming/text/convert/enwiki/tfrecord/pick_eval_samples.py b/streaming/text/convert/enwiki/tfrecord/pick_eval_samples.py
index d16361170..d6fccc899 100644
--- a/streaming/text/convert/enwiki/tfrecord/pick_eval_samples.py
+++ b/streaming/text/convert/enwiki/tfrecord/pick_eval_samples.py
@@ -1,4 +1,4 @@
-"""Script for picking certain number of sampels.
+"""Script for picking certain number of samples.
 """

 import argparse
diff --git a/tests/test_streaming.py b/tests/test_streaming.py
index 77e26a3e0..cd113c6e8 100644
--- a/tests/test_streaming.py
+++ b/tests/test_streaming.py
@@ -512,7 +512,7 @@ def test_stratified_batching_Exception(local_remote_dir: tuple[str, str], stream

     with pytest.raises(ValueError, match=f'Number of samples for stream*'):
         # When we iterate through the dataloader, the samples will be partitioned.
-        # This should thow ValueError since stream 2 is too small to be included in each batch.
+        # This should throw ValueError since stream 2 is too small to be included in each batch.
         for _ in dataloader:
             continue