Add initial scorecard infra

octoml · Mar 28, 2023 · 4d91a0a · 4d91a0a
1 parent 7b00026
commit 4d91a0a
Show file tree

Hide file tree

Showing 33 changed files with 2,428 additions and 1 deletion.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -1,5 +1,15 @@
+# Environment variables made available to every job in the pipeline.
+variables:
+  # Constant flag, always the string "1".
+  IS_IN_CI: '1'
+  # Values below are taken from variables of the same (or similar) name that
+  # are presumably defined in the project's CI/CD settings - TODO confirm.
+  AWS_ACCESS_KEY_ID: '$AWS_ACCESS_KEY_ID'
+  AWS_SECRET_ACCESS_KEY: '$AWS_SECRET_ACCESS_KEY'
+  AWS_DEFAULT_REGION: '$AWS_DEFAULT_REGION'
+  GCP_AUTH_JSON: '$gcpAuthJson'
+
 stages:
   - prepare-auth
+  - build-docker  # stage of the docker_build job
+  - test  # stage of the benchmark jobs
+
 
 update_token:
   image:
@@ -9,5 +19,76 @@ update_token:
   before_script:
     - aws --version
   script:
-    - echo hello
+    # Build the base64-encoded "user:password" pair for ECR; AWS is the user
+    # name used for ECR logins and the password is a short-lived login token.
+    - USER=AWS
+    - TOKEN=$(aws ecr get-login-password)
+    - AUTH=$(echo -n "$USER:$TOKEN" | base64 | tr -d "\n")
+    # Store the encoded credentials in the project CI/CD variable AWS_ECR_AUTH.
+    # xtrace (-x) is deliberately NOT enabled here: it would print the expanded
+    # PRIVATE-TOKEN header and the ECR credentials into the job log.
+    - |
+      set -eu
+      curl --request PUT --header "PRIVATE-TOKEN: $GITLAB_PERSONAL_ACCESS_TOKEN" \
+      --silent --output /dev/null --show-error --fail \
+      "https://gitlab.com/api/v4/projects/$CI_PROJECT_ID/variables/AWS_ECR_AUTH" --form "value=$AUTH"
+  only:
+    variables:
+      # NOTE(review): separate entries under only:variables are OR-ed, so this
+      # job runs for web-triggered pipelines on any ref AND for every pipeline
+      # on main. Join the two with && in one entry if both must hold - confirm
+      # the intent.
+      - $CI_PIPELINE_SOURCE == "web"
+      - $CI_COMMIT_REF_NAME == "main"
+
+# Builds the scorecard image via scorecard/docker/build.sh (with pushing to
+# ECR switched on) and hands the values written to output.env to later jobs.
+docker_build:
+  stage: build-docker
+  image: docker:20
+  tags:
+    - cpu-sole-tenant
+  dependencies:
+    - update_token
+  variables:
+    GIT_COMMIT_SHA: $CI_COMMIT_SHA
+    PUSH_TO_ECR: "1"
+    DOCKER_HOST: dind-service.kube-system.svc.cluster.local:2375
+  script:
+    # build.sh calls the aws CLI, which has to be installed first.
+    - apk add --update py-pip
+    - pip install awscli
+    - NO_CACHE=1 sh ./scorecard/docker/build.sh
+    # Random 5-letter suite ID, shared by the later concurrent jobs so that
+    # their results can be grouped together.
+    - sh -c 'TEST_SUITE_ID=$(tr -dc a-z </dev/urandom | head -c 5 ; echo ''); echo "TEST_SUITE_ID=$TEST_SUITE_ID" >> output.env'
+  artifacts:
+    reports:
+      # Every KEY=VALUE line in output.env becomes a variable in later jobs.
+      dotenv: output.env
+
+# Shared definition of the GPU benchmark jobs. Each concrete job merges this
+# template and sets PYTEST_FILTER to choose the tests it runs.
+.benchmark_template: &benchmark_template
+  tags:
+    - gpu-triton
+  stage: test
+  dependencies:
+    - docker_build
+  image:
+    # name: 186900524924.dkr.ecr.us-west-2.amazonaws.com/scorecard:2023-03-10-b4fb5b6
+    # $TAG is written to output.env by scorecard/docker/build.sh.
+    name: 186900524924.dkr.ecr.us-west-2.amazonaws.com/scorecard:$TAG
+  script: |
+    set -eux
+    ls
+    ./scorecard/scripts/show_node_info.sh
+    # -p: do not fail when the directory already exists.
+    mkdir -p model-data
+    # Tracing is suspended while the credentials file is written; otherwise
+    # xtrace would print the whole service-account JSON into the job log.
+    set +x
+    printf '%s\n' "$GCP_AUTH_JSON" > gcp_auth.json
+    set -x
+    export UPLOAD_GCP=1
+    export TEST_RUNS=10
+    export WARMUP_RUNS=3
+    pytest --tb=native -rA -v -s -q scorecard/relax-coverage/ -k "$PYTEST_FILTER"
+
+# Concrete benchmark jobs. Each one merges the shared template and differs
+# only in the pytest -k expression stored in PYTEST_FILTER.
+
+# Tests matching "onnx-trt".
+benchmarks-baseline:
+  <<: *benchmark_template
+  variables:
+    PYTEST_FILTER: "onnx-trt"
+
+# relax-cuda tests, excluding the stable-diffusion ones.
+benchmarks-relax:
+  <<: *benchmark_template
+  variables:
+    PYTEST_FILTER: "relax-cuda and not stable-diffusion"
+
+# relax-cuda stable-diffusion tests matching "unet".
+benchmarks-relax-sd-unet:
+  <<: *benchmark_template
+  variables:
+    PYTEST_FILTER: "relax-cuda and stable-diffusion and unet"
 
+# relax-cuda stable-diffusion tests matching "vae".
+benchmarks-relax-sd-vae:
+  <<: *benchmark_template
+  variables:
+    PYTEST_FILTER: "relax-cuda and stable-diffusion and vae"
diff --git a/scorecard/Makefile b/scorecard/Makefile
@@ -0,0 +1,17 @@
+# Output file for the concatenated test data; override with `make TEST_DATA=...`.
+TEST_DATA ?= build/testdata.jsonl
+
+# Neither target creates a file named after itself, so mark both as phony.
+# Otherwise a stray file called "clean" or "prepare_testdata" would make the
+# target look up to date and silently skip its recipe.
+.PHONY: clean prepare_testdata
+
+# Remove everything that was generated under build/.
+clean:
+	rm -rf build
+
+# Stamp file: re-run `poetry install` only when the lock file or the project
+# definition changed.
+build/venv-created.touch: poetry.lock pyproject.toml
+	poetry install
+	mkdir -p build
+	touch build/venv-created.touch
+
+# Validate the testdata/*.jsonc files against the schema and concatenate them
+# into $(TEST_DATA).
+prepare_testdata: build/venv-created.touch testdata/*.jsonc schema/schema.jsonschema relax_scorecard/*.py
+	mkdir -p build
+	poetry run python3 -m relax_scorecard.concat_testdata --schema schema/schema.jsonschema testdata/*.jsonc >"${TEST_DATA}"
+	@echo "Prepared testdata in ${TEST_DATA}"
+
+
+.DEFAULT_GOAL = prepare_testdata
diff --git a/scorecard/bashrc.sh b/scorecard/bashrc.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Print a short banner: how to set the S3 credentials and how to run the
+# tests. One argument per output line; the final empty argument produces the
+# trailing blank line.
+printf '%s\n' \
+    'scorecard Docker image' \
+    '' \
+    'ensure S3 credentials are set up (ask the team to get a new set):' \
+    'export AWS_ACCESS_KEY_ID=...' \
+    'export AWS_SECRET_ACCESS_KEY=...' \
+    '' \
+    'run tests with:' \
+    'pytest --tb=native -v -s -q relax-coverage' \
+    ''
diff --git a/scorecard/docker/Dockerfile.auth-test b/scorecard/docker/Dockerfile.auth-test
@@ -0,0 +1,5 @@
+# Minimal image: starts from the empty "scratch" base and contains nothing but
+# a copy of this Dockerfile.
+# NOTE(review): judging by the file name this is presumably a tiny image for
+# exercising registry authentication - confirm.
+FROM scratch
+
+COPY docker/Dockerfile.auth-test /
+
+# Declared but not referenced by any instruction in this file; docker/build.sh
+# passes --build-arg TVM_BUILT_AT for whichever Dockerfile it builds.
+ARG TVM_BUILT_AT
diff --git a/scorecard/docker/Dockerfile.scorecard b/scorecard/docker/Dockerfile.scorecard
@@ -0,0 +1,93 @@
+# TensorRT image
+# uses CUDA 11.7
+FROM nvcr.io/nvidia/tensorrt:22.12-py3
+# FROM nvcr.io/nvidia/tensorrt:23.02-py3  # uses CUDA 12.0, not supported on the gpu-triton runners' CUDA driver
+
+# CUDA images (requires us to manually install tensorrt)
+# FROM nvidia/cuda:11.7.1-devel-ubuntu22.04
+# FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+
+WORKDIR /opt/scorecard
+
+# System packages. apt-get is used instead of apt because apt's command-line
+# interface is not meant for scripts; the package lists are removed in the same
+# layer so they do not add to the image size.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    build-essential \
+    curl \
+    fish \
+    git \
+    python3 \
+    libpq-dev \
+    postgresql \
+    postgresql-contrib \
+    python3-dev \
+    python3-pip \
+    sudo \
+    vim \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# llvm
+# Register the apt.llvm.org repository (focal, LLVM 15) and its signing key,
+# then install the LLVM 15 packages.
+RUN echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main" >> /etc/apt/sources.list
+RUN echo "deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main" >> /etc/apt/sources.list
+# "-" makes apt-key read the key from stdin explicitly.
+RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
+RUN apt-get update && apt-get install -y libllvm-15-ocaml-dev \
+    libllvm15 \
+    llvm-15 \
+    llvm-15-dev \
+    llvm-15-runtime \
+    && rm -rf /var/lib/apt/lists/*
+
+# python dependencies
+# Only some of the packages are pinned to a version; the others (for example
+# onnx, torch and xgboost) resolve to whatever is current when the image is
+# built.
+RUN python3 -m pip install --no-cache-dir \
+    cmake \
+    commentjson==0.9.0 \
+    google-cloud-bigquery==3.5.0 \
+    jinja2 \
+    jsonschema==4.17.3 \
+    ninja \
+    nvidia-tensorrt \
+    onnx \
+    onnxruntime-gpu \
+    psycopg2==2.9.5 \
+    pytest \
+    pytest-xdist \
+    pyyaml \
+    tabulate==0.9.0 \
+    torch \
+    typing_extensions \
+    xgboost \
+    ;
+
+# onnx_graphsurgeon is fetched from NVIDIA's package index; --no-deps installs
+# the package itself without pulling in its dependencies.
+RUN python3 -m pip --no-cache-dir install onnx_graphsurgeon==0.3.26 --index-url https://pypi.ngc.nvidia.com --no-deps
+
+# onnx nightly
+# The nightly ONNX Runtime GPU build is installed under its own user base
+# (/opt/onnx_nightly) instead of the regular site-packages, presumably so it
+# can coexist with the onnxruntime-gpu release installed above.
+# (CPU-only alternative to the package below: ort-nightly)
+RUN mkdir /opt/onnx_nightly
+RUN PYTHONUSERBASE=/opt/onnx_nightly pip install --user \
+    --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ \
+    ort-nightly-gpu
+# NOTE(review): the path assumes pip belongs to a Python 3.10 interpreter;
+# confirm this matches the Python version shipped in the base image.
+ENV ONNX_NIGHTLY_PATH=/opt/onnx_nightly/lib/python3.10/site-packages
+
+# Build TVM
+# TVM_BUILT_AT is supplied by docker/build.sh. A changed value invalidates the
+# build cache for the RUN steps below, so it presumably serves to force a
+# fresh clone and rebuild.
+ARG TVM_BUILT_AT
+RUN git clone https://github.com/octoml/relax --recursive
+# A git identity is required for the "git commit" calls below; the address is
+# only a placeholder.
+RUN cd relax && git fetch origin && git config user.name test && git config user.email test@example.com
+
+# Add this line to build in an un-merged PR
+# RUN PR_NUMBER=NN bash -c 'cd relax && curl -L "https://github.com/octoml/relax/pull/$PR_NUMBER.diff" | patch -p1 -N -d . && git add . && git commit -m"PR #$PR_NUMBER"'
+# Apply the diff of the TUZ-145 branch on top of the clone and commit it.
+RUN bash -c 'cd relax && curl -L "https://github.com/octoml/relax/compare/TUZ-145.diff" | patch -p1 -N -d . && git add . && git commit -m"Add TUZ-145"'
+
+# Start from a clean build directory, compile, then install the Python
+# package in editable mode.
+RUN rm -rf relax/build
+COPY docker/build_relax.sh docker/build_relax.sh
+RUN bash docker/build_relax.sh
+RUN cd relax/python && python3 -m pip install --no-cache-dir -e .
+
+# aws CLI
+# --no-cache-dir keeps pip's download cache out of the layer, as in the other
+# pip installs of this file.
+RUN pip install --no-cache-dir awscli
+
+# testbench code
+COPY relax-coverage relax-coverage
+COPY schema schema
+COPY models.yaml models.yaml
+COPY hub_models.yaml hub_models.yaml
+
+# Environment defaults baked into the image (key=value form; the
+# space-separated ENV form is a legacy syntax).
+ENV ORT_TENSORRT_FP16_ENABLE=1
+ENV AWS_DEFAULT_REGION=us-west-2
diff --git a/scorecard/docker/build.sh b/scorecard/docker/build.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -eux
+
+set +x
+source docker/retry.sh
+set -x
+
+PUSH_TO_ECR="${PUSH_TO_ECR:=0}"
+NO_CACHE="${NO_CACHE:=0}"
+TVM_BUILT_AT="${TVM_BUILT_AT:=0}"
+RETRIES="${RETRIES:=5}"
+IMAGE_NAME="${IMAGE_NAME:=scorecard}"
+
+CACHE_ARG=""
+if [ "$NO_CACHE" == "1" ]; then
+    CACHE_ARG="--no-cache"
+fi
+
+retry $RETRIES docker build . --build-arg TVM_BUILT_AT=$TVM_BUILT_AT -f docker/Dockerfile.${IMAGE_NAME} $CACHE_ARG --tag ${IMAGE_NAME}:latest
+
+# # testing code to skip the docker build but still have an image to work with
+# docker pull hello-world
+# docker tag hello-world scorecard:latest
+
+if [ "$PUSH_TO_ECR" == "1" ]; then
+    DATE=$(date '+%Y-%m-%d')
+    HASH=${GIT_COMMIT_SHA:0:7}
+    TAG="$DATE-$HASH"
+
+    REGION="us-west-2"
+    ACCOUNT_ID="186900524924"
+
+    # Make 'docker push' authenticated with ECR
+    aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com
+
+    # Push to ECR registry (latest)
+    retry 5 docker tag ${IMAGE_NAME}:latest $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/${IMAGE_NAME}:latest
+    retry 5 docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/${IMAGE_NAME}:latest
+
+    # Push to ECR registry (fixed tag)
+    retry 5 docker tag ${IMAGE_NAME}:latest $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/${IMAGE_NAME}:$TAG
+    retry 5 docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/${IMAGE_NAME}:$TAG
+
+    # Save the tag so it can be used later
+    echo "TAG=$TAG" >> output.env
+    echo "ECR_IMAGE=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/${IMAGE_NAME}:$TAG" >> output.env
+fi
diff --git a/scorecard/docker/build_relax.sh b/scorecard/docker/build_relax.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -euxo pipefail
+cd relax
+mkdir -p build
+cd build
+cmake -GNinja \
+    -DCMAKE_LINKER=/usr/bin/lld-15 \
+    -DCMAKE_CUDA_ARCHITECTURES=75 \
+    -DUSE_LLVM=llvm-config-15 \
+    -DSUMMARIZE=1 \
+    -DUSE_CUDA=1 \
+    -DUSE_MICRO=1 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DUSE_CUTLASS=1 \
+    -DUSE_THRUST=1 \
+    ..
+cmake --build . --
diff --git a/scorecard/docker/dev.sh b/scorecard/docker/dev.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+# NB: Also source MODEL_DATA_DIR and GCP_AUTH_JSON from a .env or whatever
+# is relevant for the running platform
+set +x
+UPLOAD_GCP="${UPLOAD_GCP:=0}"
+UPLOAD_PG="${UPLOAD_PG:=0}"
+TEST_RUNS="${TEST_RUNS:=1}"
+WARMUP_RUNS="${WARMUP_RUNS:=0}"
+IMAGE="${IMAGE:=scorecard}"
+MODEL_DATA_DIR="${MODEL_DATA_DIR:=model-data}"
+GCP_AUTH_JSON="${GCP_AUTH_JSON:=none.json}"
+PWD=$(pwd)
+
+touch .fish_history
+sudo rm -rf doc-relax
+mkdir -p doc-relax
+mkdir -p onnx-hub-cache
+mkdir -p model-data
+
+set -x
+
+docker run \
+    --gpus all \
+    --env TEST_RUNS=$TEST_RUNS \
+    --env WARMUP_RUNS=$WARMUP_RUNS \
+    --env UPLOAD_GCP=$UPLOAD_GCP \
+    --env UPLOAD_PG=$UPLOAD_PG \
+    -v $PWD/$MODEL_DATA_DIR:/opt/scorecard/model-data \
+    -v $GCP_AUTH_JSON:/opt/scorecard/gcp_auth.json:ro \
+    -v $PWD/.coverage_results:/opt/scorecard/.coverage_results \
+    -v $PWD/.tuning_records:/opt/scorecard/.tuning_records \
+    -v $PWD/.fish_history:/root/.local/share/fish/fish_history \
+    -v $PWD/relax-coverage:/opt/scorecard/relax-coverage \
+    -v $PWD/schema:/opt/scorecard/schema \
+    -v $PWD/scripts:/opt/scorecard/scripts \
+    -v $PWD/models.yaml:/opt/scorecard/models.yaml \
+    -v $PWD/hub_models.yaml:/opt/scorecard/hub_models.yaml \
+    --mount type=volume,dst=/opt/scorecard/relax,volume-driver=local,volume-opt=type=none,volume-opt=o=bind,volume-opt=device=$PWD/doc-relax \
+    --mount type=volume,dst=/root/.cache/onnx/hub,volume-driver=local,volume-opt=type=none,volume-opt=o=bind,volume-opt=device=$PWD/onnx-hub-cache \
+    -it $IMAGE \
+    fish
diff --git a/scorecard/docker/output_login.sh b/scorecard/docker/output_login.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+set -euxo pipefail
+
+REGION="us-west-2"
+ACCOUNT_ID="186900524924"
+aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com
+AUTH_JSON=$(cat ~/.docker/config.json | tr '\n' ' ')
+echo "$AUTH_JSON" >> output.env
diff --git a/scorecard/docker/retry.sh b/scorecard/docker/retry.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -eux
+
+retry() {
+  local max_retries=$1
+  shift
+  local n=0
+  until [ "$n" -ge "$max_retries" ]
+  do
+      "$@" && break
+      n=$((n+1))
+      if [ "$n" -eq "$max_retries" ]; then
+          echo "failed to update after attempt $n / $max_retries, giving up"
+          exit 1
+      fi
+
+      WAIT=$(( ( RANDOM % 200 )  + 30 ))
+      echo "failed to update $n / $max_retries, waiting $WAIT to try again"
+      sleep "$WAIT"
+  done
+}