Uv #166

Merged: 12 commits on Dec 24, 2024

12 changes: 6 additions & 6 deletions .github/workflows/docs.yaml
@@ -9,10 +9,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.8
- run: python -m pip install poetry
- name: Install uv
run: curl -LsSf https://astral.sh/uv/install.sh | sh
- name: "Set up Python"
run: uv python install
- run: |
python -m poetry install --only docs
poetry run mkdocs gh-deploy --force
uv sync --only-group docs
uv run mkdocs gh-deploy --force
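Side note: `uv sync --only-group docs` (and `--only-group release` further down) presumes the project's `pyproject.toml` declares PEP 735 dependency groups. A minimal sketch of the assumed layout, with illustrative package lists rather than the project's actual pins:

```toml
# Hypothetical pyproject.toml excerpt: the group names match the workflow flags,
# but the package lists here are placeholders only.
[dependency-groups]
docs = ["mkdocs", "mkdocs-material"]
release = ["python-semantic-release"]
dev = ["pytest", "mypy"]
```

With groups like these in place, `uv sync --only-group docs` installs just the docs dependencies, the uv counterpart of the removed `poetry install --only docs`.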
29 changes: 22 additions & 7 deletions .github/workflows/pr.yaml
@@ -7,16 +7,14 @@ on:
- "mkdocs.yml"
branches:
- "main"
workflow_dispatch:
workflow_call:

jobs:
PRCheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: 3.8
- run: python -m pip install poetry
- run: |
# Download the binary
curl -sLO https://github.com/argoproj/argo-workflows/releases/download/v3.5.4/argo-linux-amd64.gz
@@ -33,6 +31,23 @@ jobs:
# Test installation
argo version

- run: |
python -m poetry install --without docs,binary,perf,tutorial,compare
poetry run tox
- name: Install uv
run: curl -LsSf https://astral.sh/uv/install.sh | sh

- name: "Set up Python"
run: uv python install

- name: Install the project
run: uv sync --all-extras --dev

- name: Run lint
# Format check with ruff (replaces the previous black/flake8 setup)
run: uvx ruff format

- name: Run tests
run: |
export PYTHONDONTWRITEBYTECODE=1
uv run pytest -m "not container"

- name: mypy
run: uv run mypy runnable extensions
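The `pytest -m "not container"` selection above implies a registered `container` marker; if it is declared in `pyproject.toml`, the registration would look roughly like the sketch below (the marker description is a guess):

```toml
# Hypothetical excerpt; only the marker name is taken from the workflow above.
[tool.pytest.ini_options]
markers = [
    "container: tests that need a container runtime (excluded in this CI job)",
]
```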
44 changes: 8 additions & 36 deletions .github/workflows/release.yaml
@@ -6,35 +6,10 @@ on:
- "examples/**"
branches:
- "main"
- "rc"

jobs:
PRCheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: 3.9
- run: |
# Download the binary
curl -sLO https://github.com/argoproj/argo-workflows/releases/download/v3.5.4/argo-linux-amd64.gz

# Unzip
gunzip argo-linux-amd64.gz

# Make binary executable
chmod +x argo-linux-amd64

# Move binary to path
mv ./argo-linux-amd64 /usr/local/bin/argo

# Test installation
argo version
- run: python -m pip install poetry
- run: |
python -m poetry install --without docs,binary,perf,tutorial,compare
poetry run tox
uses: "./.github/workflows/pr.yaml"

Release:
runs-on: ubuntu-latest
@@ -43,27 +18,24 @@ jobs:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- uses: actions/setup-python@v4
with:
python-version: 3.9
- run: python -m pip install poetry
- name: Install uv
run: curl -LsSf https://astral.sh/uv/install.sh | sh
- name: "Set up Python"
run: uv python install
- run: |
python -m poetry install --only release

uv sync --only-group release
- name: Figure version
continue-on-error: true
env:
GH_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
id: last_tag
run: |
CURRENT=$(python -m poetry run semantic-release -v --noop version --print-last-released)
CURRENT=$(uv run semantic-release -v --noop version --print-last-released)
echo "Current: $CURRENT"

VERSION=$(python -m poetry run semantic-release -v --noop version --print)
VERSION=$(uv run semantic-release -v --noop version --print)
echo "new: $VERSION"

# python -m poetry run semantic-release version --tag --push

if [ "$CURRENT" == "$VERSION" ]; then
echo "version=$VERSION" >> $GITHUB_OUTPUT
exit 1
14 changes: 3 additions & 11 deletions .pre-commit-config.yaml
@@ -13,21 +13,13 @@ repos:
exclude: assets
- repo: https://github.com/charliermarsh/ruff-pre-commit
# Ruff version.
rev: "v0.0.259"
rev: "v0.8.3"
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
exclude: ^tests
- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
- id: black
# It is recommended to specify the latest version of Python
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.8

- id: ruff-format
exclude: ^tests
- repo: https://github.com/jorisroovers/gitlint
rev: v0.19.1
hooks:
44 changes: 44 additions & 0 deletions docs/iterative.md
@@ -0,0 +1,44 @@
## The problems we are trying to solve

- Feature:
- Users should be able to record custom data to the logs.

- There is some data that need not be replicated but should be captured in the logs.
- DECISION: We do not play a role in sourcing the data; the user is expected to do it.
- There should be a provision to identify the source data as a soft "get".
- This is true for large datasets. How true is that, though?
- This data could be sourced from a different location than the catalog.
- This is a soft requirement; it can be waived if not satisfied.
- For simplicity, let's assume that this is part of the catalog location.
- This could be achieved by using a type of catalog that does not copy but only records.


!!! note
    Can this be simplified by using a "cached catalog"?

- Cached catalog (see the sketch below):
- The run log will capture the catalog metadata.
- The catalog will not be run-id specific.
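A minimal sketch of what a "recording" catalog could look like. The class and field names are hypothetical, and this is not runnable's actual catalog interface; it only illustrates capturing metadata in the run log instead of copying data:

```python
import hashlib
from dataclasses import dataclass
from pathlib import Path


@dataclass
class CatalogRecord:
    """Metadata that would go into the run log instead of a copied artifact."""

    path: str
    checksum: str
    size_bytes: int


class RecordingCatalog:
    """Hypothetical catalog flavor: records what a file looked like, never copies it."""

    def __init__(self) -> None:
        self.records: dict[str, CatalogRecord] = {}

    def put(self, path: str) -> CatalogRecord:
        data = Path(path).read_bytes()
        record = CatalogRecord(
            path=path,
            checksum=hashlib.sha256(data).hexdigest(),
            size_bytes=len(data),
        )
        self.records[path] = record  # only metadata is retained
        return record
```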

- Cached behavior: given a previous run.
- Users can refer to data generated by a previous run.
- If a step with the same name is part of the pipeline and executed successfully in that run, we want to skip executing it again.
- The logs should make it clear that this is a continuation of the previous run.
- It's OK to assume that the run log is maintained in the same way.
- It's OK to assume that the catalog is maintained in the same way.

- Question about recursive behavior.
- What if the referenced run is itself a continuation of an earlier run?
- The desired behavior should be (see the sketch below):
- original run_id
- continuation run_id
- ...
- The step will be skipped based on the status of the penultimate run only.
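A sketch of the intended skip rule under stated assumptions: the previous run log is a plain dict, step status is the string "SUCCESS", and only the immediately referenced (penultimate) run is consulted, never the full chain of continuations. None of this is runnable's actual schema:

```python
def should_skip(step_name: str, previous_run_log: dict) -> bool:
    """Skip a step if the penultimate run executed a step of the same name successfully.

    Deliberately does not follow the referenced run's own original run_id pointer:
    the decision is based on the penultimate run only.
    """
    step = previous_run_log.get("steps", {}).get(step_name, {})
    return step.get("status") == "SUCCESS"
```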

- What about many runs trying to write the same file all at once?
- Do we keep versions of it or error out?
- The desired behavior could be (sketched below):
- Do the processing.
- While saving, check for the existence of the file:
- If it exists, write a versioned file and error out.
- If it does not exist, continue successfully.
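A sketch of that conflict-handling behavior, assuming a plain filesystem catalog; the helper name and versioning scheme are made up for illustration:

```python
from pathlib import Path


def save_or_version(path: str, data: bytes) -> None:
    """Write data to path; if the file already exists, keep a versioned copy and error out."""
    target = Path(path)
    if not target.exists():
        target.write_bytes(data)  # no conflict: continue successfully
        return

    # Conflict: persist the new result under the next free version suffix,
    # then fail loudly so colliding runs are surfaced rather than silently merged.
    version = 1
    while Path(f"{path}.v{version}").exists():
        version += 1
    Path(f"{path}.v{version}").write_bytes(data)
    raise FileExistsError(f"{path} already exists; wrote {path}.v{version} and aborted")
```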
4 changes: 3 additions & 1 deletion examples/02-sequential/default_fail.py
@@ -21,7 +21,9 @@ def main():

step2 = PythonTask(name="step 2", function=raise_ex) # This will fail

step3 = Stub(name="step 3", terminate_with_success=True) # This step will not be executed
step3 = Stub(
name="step 3", terminate_with_success=True
) # This step will not be executed

pipeline = Pipeline(steps=[step1, step2, step3])

6 changes: 5 additions & 1 deletion examples/03-parameters/passing_parameters_notebook.py
@@ -43,7 +43,11 @@ def main():
)

pipeline = Pipeline(
steps=[write_parameters_from_notebook, read_parameters, read_parameters_in_notebook],
steps=[
write_parameters_from_notebook,
read_parameters,
read_parameters_in_notebook,
],
)

_ = pipeline.execute()
10 changes: 8 additions & 2 deletions examples/06-parallel/nesting.py
@@ -54,7 +54,10 @@ def parallel_pipeline(execute: bool = True):
parallel_step = Parallel(
name="parallel step",
terminate_with_success=True,
branches={"branch1": traversal(execute=False), "branch2": traversal(execute=False)},
branches={
"branch1": traversal(execute=False),
"branch2": traversal(execute=False),
},
)

pipeline = Pipeline(steps=[parallel_step])
@@ -69,7 +72,10 @@ def main():
parallel_step = Parallel(
name="nested_parallel",
terminate_with_success=True,
branches={"branch1": parallel_pipeline(execute=False), "branch2": parallel_pipeline(execute=False)},
branches={
"branch1": parallel_pipeline(execute=False),
"branch2": parallel_pipeline(execute=False),
},
)

pipeline = Pipeline(steps=[parallel_step])
7 changes: 6 additions & 1 deletion examples/07-map/custom_reducer.py
@@ -84,7 +84,12 @@ def iterable_branch(execute: bool = True):
)

pipeline = Pipeline(
steps=[process_chunk_task_python, process_chunk_task_notebook, process_chunk_task_shell, read_chunk],
steps=[
process_chunk_task_python,
process_chunk_task_notebook,
process_chunk_task_shell,
read_chunk,
],
)

if execute:
7 changes: 6 additions & 1 deletion examples/07-map/map.py
@@ -87,7 +87,12 @@ def iterable_branch(execute: bool = True):
)

pipeline = Pipeline(
steps=[process_chunk_task_python, process_chunk_task_notebook, process_chunk_task_shell, read_chunk],
steps=[
process_chunk_task_python,
process_chunk_task_notebook,
process_chunk_task_shell,
read_chunk,
],
)

if execute:
16 changes: 13 additions & 3 deletions examples/common/functions.py
@@ -126,7 +126,9 @@ def process_chunk(chunk: int):
return chunk * 10


def read_processed_chunk(chunk: int, processed_python: int, processed_notebook: int, processed_shell: int):
def read_processed_chunk(
chunk: int, processed_python: int, processed_notebook: int, processed_shell: int
):
"""
A downstream step of process_chunk of map state which reads the processed chunk.
Since the process_chunk returns the chunk multiplied by 10, we assert that.
@@ -137,7 +139,10 @@ def read_processed_chunk(chunk: int, processed_python: int, processed_notebook:


def assert_default_reducer(
processed_python: List[int], processed_notebook: List[int], processed_shell: List[int], chunks: List[int]
processed_python: List[int],
processed_notebook: List[int],
processed_shell: List[int],
chunks: List[int],
):
"""
Demonstrates the default reducer which just returns the list of processed chunks.
@@ -147,7 +152,12 @@ def assert_default_reducer(
assert processed_shell == [chunk * 1000 for chunk in chunks]


def assert_custom_reducer(processed_python: int, processed_notebook: int, processed_shell: int, chunks: List[int]):
def assert_custom_reducer(
processed_python: int,
processed_notebook: int,
processed_shell: int,
chunks: List[int],
):
"""
Asserts the custom reducer returns the max of all the processed chunks.
"""
4 changes: 2 additions & 2 deletions examples/common/process_chunk.ipynb
@@ -32,7 +32,7 @@
"metadata": {},
"outputs": [],
"source": [
"assert chunk*10 == processed_python"
"assert chunk * 10 == processed_python"
]
},
{
@@ -42,7 +42,7 @@
"metadata": {},
"outputs": [],
"source": [
"processed_notebook = processed_python*10"
"processed_notebook = processed_python * 10"
]
}
],
2 changes: 1 addition & 1 deletion examples/common/write_parameters.ipynb
@@ -15,7 +15,7 @@
"\n",
"from examples.common.functions import ComplexParams\n",
"\n",
"pydantic_param = ComplexParams(x=10, foo=\"bar\")\n"
"pydantic_param = ComplexParams(x=10, foo=\"bar\")"
]
},
{
14 changes: 11 additions & 3 deletions examples/comparisons/metaflow/main.py
@@ -46,7 +46,11 @@ def train(model, train_loader, optimizer, epoch):
optimizer.step()
if idx * len(data) % 10000 == 0:
out = "Train Epoch: " + "{} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
epoch, idx * len(data), len(train_loader.dataset), 100.0 * idx / len(train_loader), loss.item()
epoch,
idx * len(data),
len(train_loader.dataset),
100.0 * idx / len(train_loader),
loss.item(),
)
print(out)

@@ -65,9 +69,13 @@ def test(model, test_loader):


def get_data_loaders(name="train"):
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
if name == "train":
dataset = datasets.MNIST("../data", train=True, download=True, transform=transform)
dataset = datasets.MNIST(
"../data", train=True, download=True, transform=transform
)
train_args = {"batch_size": 32}
return dataset, train_args
elif name == "test":