Start a serving sub-project. (#397)

This is just a stub that puts the project structure in place, along with a basic API server and test (inspired by vllm). Unlike some of the other implementations, this one is going to be pretty thin, as most of the work will be done in a companion project focused on compilation.
nod-ai · Feb 6, 2024 · e7f0f94 · e7f0f94
1 parent 9d929b0
commit e7f0f94
Show file tree

Hide file tree

Showing 19 changed files with 291 additions and 21 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -9,11 +9,8 @@ on:
 
 jobs:
   black:
-    strategy:
-      matrix:
-        version: [3.11]
-        os: [ubuntu-latest]
-    runs-on: ${{matrix.os}}
+    name: Python Formatting With Black
+    runs-on: ubuntu-latest
     steps:
       - name: Checking out repository
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -1,4 +1,4 @@
-name: Turbine Core Test
+name: Turbine Unit Tests
 
 on:
   workflow_dispatch:
@@ -17,6 +17,7 @@ concurrency:
 
 jobs:
   test:
+    name: "Test"
     strategy:
       matrix:
         version: [3.11]
@@ -40,10 +41,23 @@ jobs:
           # wheels saves multiple minutes and a lot of bandwidth on runner setup.
           pip install --index-url https://download.pytorch.org/whl/cpu \
             -r core/pytorch-cpu-requirements.txt \
-            -r core/torchvision-requirements.txt          
-          pip install --upgrade -r core/requirements.txt
-          pip install -e  core[testing]
+            -r core/torchvision-requirements.txt
+          pip install --upgrade \
+            -r core/requirements.txt \
+            -r mypy-requirements.txt
+          pip install -e core[testing] -e serving[testing]
 
-      - name: Run tests
+      - name: Run core tests
+        if: ${{ !cancelled() }}
         run: |
-          pytest -n 4 core/tests/
+          pytest -n 4 core/
+
+      - name: Run serving tests
+        if: ${{ !cancelled() }}
+        run: |
+          pytest -n 4 serving/
+
+      - name: MyPy Type Checking
+        if: ${{ !cancelled() }}
+        run: |
+          mypy serving/
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ can specify pytorch-cpu and install via:
 ```
 pip install --index-url https://download.pytorch.org/whl/cpu \
     -r core/pytorch-cpu-requirements.txt \
-    -r core torchvision-requirements.txt
+    -r core/torchvision-requirements.txt
 pip install shark-turbine
 ```
 

diff --git a/core/iree-requirements.txt b/core/iree-requirements.txt
@@ -0,0 +1,2 @@
+iree-compiler==20240129.785
+iree-runtime==20240129.785
diff --git a/core/misc-requirements.txt b/core/misc-requirements.txt
@@ -0,0 +1,4 @@
+numpy>=1.26.3
+onnx>=1.15.0
+pytest>=8.0.0
+pytest-xdist>=3.5.0
diff --git a/core/requirements.txt b/core/requirements.txt
@@ -6,6 +6,4 @@
 
 -r pytorch-cpu-requirements.txt
 -r torchvision-requirements.txt
-
-iree-compiler==20240129.785
-iree-runtime==20240129.785
+-r iree-requirements.txt
diff --git a/core/setup.py b/core/setup.py
@@ -54,7 +54,8 @@ def load_requirement_pins(requirements_file: str):
     requirement_pins.update(dict(pin_pairs))
 
 
-load_requirement_pins("requirements.txt")
+load_requirement_pins("iree-requirements.txt")
+load_requirement_pins("misc-requirements.txt")
 load_requirement_pins("pytorch-cpu-requirements.txt")
 
 
@@ -97,7 +98,7 @@ def initialize_options(self):
         ],
     },
     install_requires=[
-        "numpy",
+        f"numpy{get_version_spec('numpy')}",
         f"iree-compiler{get_version_spec('iree-compiler')}",
         f"iree-runtime{get_version_spec('iree-runtime')}",
         # Use the [torch-cpu-nightly] spec to get a more recent/specific version.
@@ -106,12 +107,12 @@ def initialize_options(self):
     extras_require={
         "torch-cpu-nightly": [f"torch{get_version_spec('torch')}"],
         "onnx": [
-            "onnx>=1.15.0",
+            f"onnx{get_version_spec('onnx')}",
         ],
         "testing": [
-            "onnx==1.15.0",
-            "pytest",
-            "pytest-xdist",
+            f"onnx{get_version_spec('onnx')}",
+            f"pytest{get_version_spec('pytest')}",
+            f"pytest-xdist{get_version_spec('pytest-xdist')}",
         ],
     },
     cmdclass={"build": BuildCommand},

diff --git a/mypy-requirements.txt b/mypy-requirements.txt
@@ -0,0 +1,3 @@
+# Typing packages needed for full mypy execution at the project level.
+mypy==1.8.0
+types-requests
diff --git a/serving/README.md b/serving/README.md
@@ -0,0 +1,12 @@
+# Turbine Serving Infrastructure
+
+This sub-project contains components and infrastructure for serving various
+forms of Turbine compiled models. Instead of coming with models, it defines
+ABIs that compiled models should adhere to in order to be served. It then
+allows them to be delivered as web endpoints via popular APIs.
+
+As emulation can be the sincerest form of flattery, this project derives
+substantial inspiration from vllm and the OpenAI APIs, emulating and
+interoperating with them where possible. It is intended to be the
+lightest-weight possible reference implementation for serving models with
+an opinionated compiled form, built elsewhere in the project.
diff --git a/serving/mypy.ini b/serving/mypy.ini
@@ -0,0 +1,5 @@
+[mypy]
+
+explicit_package_bases = True
+mypy_path = $MYPY_CONFIG_FILE_DIR
+packages = turbine_serving.llm
diff --git a/serving/pyproject.toml b/serving/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
diff --git a/serving/requirements.txt b/serving/requirements.txt
@@ -0,0 +1,2 @@
+fastapi>=0.109.2
+uvicorn>=0.27.0
diff --git a/serving/setup.cfg b/serving/setup.cfg
@@ -0,0 +1,6 @@
+[tool:pytest]
+testpaths =
+  ./tests
+filterwarnings =
+  # TODO: Remove once flatbuffer 'imp' usage resolved.
+  ignore::DeprecationWarning
diff --git a/serving/setup.py b/serving/setup.py
@@ -0,0 +1,109 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import json
+import os
+import distutils.command.build
+from pathlib import Path
+
+from setuptools import find_namespace_packages, setup  # type: ignore
+
+THIS_DIR = Path(__file__).resolve().parent
+REPO_DIR = THIS_DIR.parent
+VERSION_INFO_FILE = REPO_DIR / "version_info.json"
+
+
+with open(
+    os.path.join(
+        REPO_DIR,
+        "README.md",
+    ),
+    "rt",
+) as f:
+    README = f.read()
+
+
+def load_version_info():
+    with open(VERSION_INFO_FILE, "rt") as f:
+        return json.load(f)
+
+
+version_info = load_version_info()
+PACKAGE_VERSION = version_info["package-version"]
+
+packages = find_namespace_packages(
+    include=[
+        "turbine_serving",
+        "turbine_serving.*",
+    ],
+)
+
+print("Found packages:", packages)
+
+# Lookup version pins from requirements files.
+requirement_pins = {}
+
+
+def load_requirement_pins(requirements_file: Path):
+    with open(requirements_file, "rt") as f:
+        lines = f.readlines()
+    pin_pairs = [line.strip().split("==") for line in lines if "==" in line]
+    requirement_pins.update(dict(pin_pairs))
+
+
+load_requirement_pins(THIS_DIR / "requirements.txt")
+load_requirement_pins(REPO_DIR / "core" / "iree-requirements.txt")
+load_requirement_pins(REPO_DIR / "core" / "misc-requirements.txt")
+
+
+def get_version_spec(dep: str):
+    if dep in requirement_pins:
+        return f">={requirement_pins[dep]}"
+    else:
+        return ""
+
+
+# Override build command so that we can build into _python_build
+# instead of the default "build". This avoids collisions with
+# typical CMake incantations, which can produce all kinds of
+# hilarity (like including the contents of the build/lib directory).
+class BuildCommand(distutils.command.build.build):
+    def initialize_options(self):
+        distutils.command.build.build.initialize_options(self)
+        self.build_base = "_python_build"
+
+
+setup(
+    name=f"turbine-serving",
+    version=f"{PACKAGE_VERSION}",
+    author="SHARK Authors",
+    author_email="[email protected]",
+    description="SHARK Turbine Machine Learning Deployment Tools",
+    long_description=README,
+    long_description_content_type="text/markdown",
+    url="https://github.com/nod-ai/SHARK-Turbine",
+    license="Apache-2.0",
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3",
+    ],
+    packages=packages,
+    package_data={"turbine_serving": ["py.typed"]},
+    install_requires=[
+        f"fastapi{get_version_spec('fastapi')}",
+        f"iree-compiler{get_version_spec('iree-compiler')}",
+        f"iree-runtime{get_version_spec('iree-runtime')}",
+        f"uvicorn{get_version_spec('uvicorn')}",
+    ],
+    extras_require={
+        "testing": [
+            f"pytest{get_version_spec('pytest')}",
+            f"pytest-xdist{get_version_spec('pytest-xdist')}",
+        ],
+    },
+    cmdclass={"build": BuildCommand},
+)
diff --git a/serving/tests/api_server_test.py b/serving/tests/api_server_test.py
@@ -0,0 +1,63 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+import pytest
+import requests
+import subprocess
+import sys
+import time
+
+
+class ServerRunner:
+    def __init__(self, args):
+        self.url = "http://localhost:8000"
+        env = os.environ.copy()
+        env["PYTHONUNBUFFERED"] = "1"
+        self.process = subprocess.Popen(
+            [
+                sys.executable,
+                "-m",
+                "turbine_serving.llm.entrypoints.api_server",
+            ]
+            + args,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        self._wait_for_ready()
+
+    def _wait_for_ready(self):
+        start = time.time()
+        while True:
+            try:
+                if requests.get(f"{self.url}/health").status_code == 200:
+                    return
+            except Exception as e:
+                if self.process.poll() is not None:
+                    raise RuntimeError("API server processs terminated") from e
+            time.sleep(0.25)
+            if time.time() - start > 30:
+                raise RuntimeError("Timeout waiting for server start") from e
+
+    def __del__(self):
+        try:
+            process = self.process
+        except AttributeError:
+            pass
+        else:
+            process.terminate()
+            process.wait()
+
+
+@pytest.fixture(scope="session")
+def server():
+    runner = ServerRunner([])
+    yield runner
+
+
+def test_basic(server: ServerRunner):
+    # Requesting the `server` fixture is the whole test: constructing a
+    # ServerRunner only succeeds once the server's /health endpoint has
+    # answered with HTTP 200.
+    ...
diff --git a/serving/turbine_serving/__init__.py b/serving/turbine_serving/__init__.py
diff --git a/serving/turbine_serving/llm/__init__.py b/serving/turbine_serving/llm/__init__.py
diff --git a/serving/turbine_serving/llm/entrypoints/api_server.py b/serving/turbine_serving/llm/entrypoints/api_server.py
@@ -0,0 +1,50 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from typing import Sequence
+
+import argparse
+
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, Response
+import sys
+import uvicorn
+
+app = FastAPI()
+
+
+@app.get("/health")
+async def health() -> Response:
+    return Response(status_code=200)
+
+
+def main(clargs: Sequence[str]):
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default=None)
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--root-path",
+        type=str,
+        default=None,
+        help="Root path to use for installing behind path based proxy.",
+    )
+    parser.add_argument(
+        "--timeout-keep-alive", type=int, default=5, help="Keep alive timeout"
+    )
+    args = parser.parse_args(clargs)
+
+    app.root_path = args.root_path
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="debug",
+        timeout_keep_alive=args.timeout_keep_alive,
+    )
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/serving/turbine_serving/py.typed b/serving/turbine_serving/py.typed
@@ -0,0 +1 @@
+# Marker file for PEP 561 inline type checking.