diff --git a/.github/workflows/push-docker-image.yml b/.github/workflows/push-docker-image.yml index bb1614d..47296eb 100644 --- a/.github/workflows/push-docker-image.yml +++ b/.github/workflows/push-docker-image.yml @@ -20,10 +20,7 @@ jobs: # Link to discussion: https://github.com/orgs/community/discussions/25678 - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - + uses: actions/checkout@v3 - name: Docker meta id: meta uses: crazy-max/ghaction-docker-meta@v2 diff --git a/Dockerfile b/Dockerfile index f556805..0a2b942 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,8 +35,7 @@ RUN echo "export PATH=\"/opt/conda/bin:/root/.cargo/bin:\$PATH\"" >> /root/.bash # Install Python dependencies (The gradual copies help with caching) WORKDIR open_diloco RUN pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly/cpu -COPY hivemind_source hivemind_source -RUN pip install --no-cache-dir ./hivemind_source +RUN pip install "flash-attn>=2.5.8" COPY requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt COPY requirements-dev.txt requirements-dev.txt diff --git a/README.md b/README.md index 31ee3d1..9a5fd93 100644 --- a/README.md +++ b/README.md @@ -30,26 +30,16 @@ source .venv/bin/activate Install python dependencies: ```bash -# Hivemind -cd hivemind_source pip install . -cp build/lib/hivemind/proto/* hivemind/proto/. -pip install -e ".[all]" -cd .. -# Requirements -pip install -r requirements.txt -# Others pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly/cpu -pip install -e ./pydantic_config -# OpenDiLoCo -pip install . ``` Optionally, you can install flash-attn to use Flash Attention 2. This requires your system to have cuda compiler set up. 
-``` + +```bash # (Optional) flash-attn -pip install flash-attn==2.5.8 +pip install "flash-attn>=2.5.8" ``` ## Docker container @@ -305,20 +295,10 @@ We recommend using `bf16` to avoid scaling and desynchronization issues with hiv # Debugging Issues -1. `hivemind` or `pydantic_config` - If you are having issues with `hivemind` or `pydantic_config`, the issue could be related to submodules. - You can clean and reinitialize the submodules from the root of the repository with the following commands: - - ``` - git submodule deinit -f . - git clean -xdf - git submodule update --init --recursive - ``` - -2. `RuntimeError: CUDA error: invalid device ordinal` +1. `RuntimeError: CUDA error: invalid device ordinal` A possible culprit is that your `--nproc-per-node` argument for the torchrun launcher is set incorrectly. Please set it to an integer less than equal to the number of gpus you have on your machine. -3. `torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate...` +2. `torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate...` A possible culprit is that your `--per-device-train-batch-size` is too high. Try a smaller value. diff --git a/hivemind_source b/hivemind_source deleted file mode 160000 index ad080ed..0000000 --- a/hivemind_source +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ad080ed0461e8e68fbed4d28b735ccfbdd84113e diff --git a/pydantic_config b/pydantic_config deleted file mode 160000 index 8e19e05..0000000 --- a/pydantic_config +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8e19e05d20c0acc7efc27622c0f5c41f3d7c78b1 diff --git a/requirements.txt b/requirements.txt index 52e9232..e918dce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,9 @@ transformers~=4.40 datasets>=2.19.1 -wandb==0.16.4 +wandb>=0.16.4 cyclopts>=2.6.1 fsspec[gcs]>=2024.3.1 -torch==2.3.1 +torch>=2.3.1 +hivemind @ git+https://github.com/learning-at-home/hivemind.git@213bff9 +pydantic_config @ git+https://github.com/samsja/pydantic_config.git@8e19e05 +