feat: add multiple images (or in-context learning) conversation examples (#47)

Co-authored-by: Bo Liu <[email protected]>
deepseek-ai · Apr 16, 2024 · 9bb02cc · 9bb02cc
1 parent 3c02b24
commit 9bb02cc
Show file tree

Hide file tree

Showing 8 changed files with 121 additions and 5 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,68 @@
+# Lint workflow: runs the repository's lint and formatting Makefile targets
+# on pushes to main, on pull requests, and on manual dispatch.
+name: Lint
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  # Allow triggering the workflow manually
+  workflow_dispatch:
+
+# Restrict the workflow token to read-only access to repository contents.
+permissions:
+  contents: read
+
+# One concurrency group per workflow + ref. An in-progress run is cancelled
+# only when the triggering event is a pull request.
+concurrency:
+  group: "${{ github.workflow }}-${{ github.ref }}"
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+# NOTE(review): no step in this workflow reads CUDA_VERSION directly;
+# presumably consumed by the build or the Makefile - confirm, or remove.
+env:
+  CUDA_VERSION: "11.7"
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+          fetch-depth: 1
+
+      # Quoted so the version stays a string rather than a float.
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+          update-environment: true
+
+      - name: Upgrade pip
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+
+      # Installs this repository in editable mode with its "lint" extras.
+      # --no-build-isolation builds against the current environment, so
+      # torch, numpy and pybind11 are installed first (presumably as build
+      # requirements - confirm).
+      # NOTE(review): USE_FP16 and TORCH_CUDA_ARCH_LIST are not referenced
+      # elsewhere in this workflow; verify the build actually reads them.
+      - name: Install package with lint extras
+        env:
+          USE_FP16: "OFF"
+          TORCH_CUDA_ARCH_LIST: "Auto"
+        run: |
+          python -m pip install torch numpy pybind11
+          python -m pip install -vvv --no-build-isolation --editable '.[lint]'
+
+      # Each remaining step delegates to a Makefile target of the same purpose.
+      - name: pre-commit
+        run: |
+          make pre-commit
+
+      - name: ruff
+        run: |
+          make ruff
+
+      - name: flake8
+        run: |
+          make flake8
+
+      - name: isort and black
+        run: |
+          make py-format
+
+      - name: addlicense
+        run: |
+          make addlicense
diff --git a/README.md b/README.md
@@ -132,18 +132,34 @@ tokenizer = vl_chat_processor.tokenizer
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+## single image conversation example
 conversation = [
     {
         "role": "User",
         "content": "<image_placeholder>Describe each stage of this image.",
-        "images": ["./images/training_pipelines.jpg"]
+        "images": ["./images/training_pipelines.jpg"],
     },
-    {
-        "role": "Assistant",
-        "content": ""
-    }
+    {"role": "Assistant", "content": ""},
 ]
 
+## multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""}
+# ]
+
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)
 prepare_inputs = vl_chat_processor(

diff --git a/images/dog_a.png b/images/dog_a.png
diff --git a/images/dog_b.png b/images/dog_b.png
diff --git a/images/dog_c.png b/images/dog_c.png
diff --git a/images/dog_d.png b/images/dog_d.png
diff --git a/inference.py b/inference.py
@@ -33,6 +33,7 @@
 )
 vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
 
+# single image conversation example
 conversation = [
     {
         "role": "User",
@@ -42,6 +43,23 @@
     {"role": "Assistant", "content": ""},
 ]
 
+# multiple images (or in-context learning) conversation example
+# conversation = [
+#     {
+#         "role": "User",
+#         "content": "<image_placeholder>A dog wearing nothing in the foreground, "
+#                    "<image_placeholder>a dog wearing a santa hat, "
+#                    "<image_placeholder>a dog wearing a wizard outfit, and "
+#                    "<image_placeholder>what's the dog wearing?",
+#         "images": [
+#             "images/dog_a.png",
+#             "images/dog_b.png",
+#             "images/dog_c.png",
+#             "images/dog_d.png",
+#         ],
+#     },
+#     {"role": "Assistant", "content": ""},
+# ]
 
 # load images and prepare for inputs
 pil_images = load_pil_images(conversation)

diff --git a/pyproject.toml b/pyproject.toml
@@ -34,6 +34,20 @@ gradio = [
     "markdown==3.4.1",
     "SentencePiece==0.1.96"
 ]
+# Lint and formatting tooling; the Lint workflow installs this group with
+# `pip install --editable '.[lint]'`.
+lint = [
+    "isort",
+    "black[jupyter] >= 22.6.0",
+    "pylint[spelling] >= 2.15.0",
+    "flake8",
+    "flake8-bugbear",
+    "flake8-comprehensions",
+    "flake8-docstrings",
+    "flake8-pyi",
+    "flake8-simplify",
+    "ruff",
+    "pyenchant",
+    "pre-commit",
+]
 
 [tool.setuptools]
 packages = {find = {exclude = ["images"]}}