Skip to content
This repository has been archived by the owner on Oct 25, 2023. It is now read-only.

Commit

Permalink
Add initial scorecard infra
Browse files Browse the repository at this point in the history
  • Loading branch information
driazati committed Mar 28, 2023
1 parent 7b00026 commit 4d91a0a
Show file tree
Hide file tree
Showing 33 changed files with 2,428 additions and 1 deletion.
83 changes: 82 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Pipeline-wide variables, visible to every job below.
variables:
  # Populated from the `gcpAuthJson` CI/CD variable; the benchmark jobs write
  # it to gcp_auth.json.
  GCP_AUTH_JSON: $gcpAuthJson
  # AWS settings are passed through from identically named CI/CD variables.
  AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
  AWS_DEFAULT_REGION: $AWS_DEFAULT_REGION
  AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
  # Quoted so the value stays the string "1" rather than an integer.
  IS_IN_CI: "1"

# Stages execute in the order listed here.
stages:
  - prepare-auth
  - build-docker
  - test


update_token:
image:
Expand All @@ -9,5 +19,76 @@ update_token:
before_script:
  - aws --version
script:
  - USER=AWS
  - TOKEN=$(aws ecr get-login-password)
  # AUTH is base64("AWS:<ECR password>") with the line breaks removed.
  - AUTH=$(echo -n "$USER:$TOKEN" | base64 | tr -d "\n")
  # Store AUTH in the project's AWS_ECR_AUTH CI/CD variable.
  # Deliberately no `set -x` here: xtrace prints commands after expansion,
  # which would write the access token and the ECR credential to the job log.
  - |
    set -eu
    curl --request PUT --header "PRIVATE-TOKEN: $GITLAB_PERSONAL_ACCESS_TOKEN" \
      --silent --output /dev/null --show-error --fail \
      "https://gitlab.com/api/v4/projects/$CI_PROJECT_ID/variables/AWS_ECR_AUTH" --form "value=$AUTH"
# NOTE(review): entries under `only:variables` are alternatives (the job runs
# if any one matches) - confirm that "web OR main" is what is wanted, rather
# than "web AND main".
only:
  variables:
    - $CI_PIPELINE_SOURCE == "web"
    - $CI_COMMIT_REF_NAME == "main"

# Builds the scorecard image, pushes it to ECR and publishes the resulting
# TAG / ECR_IMAGE / TEST_SUITE_ID values to later jobs via a dotenv report.
docker_build:
  stage: build-docker
  image: docker:20
  dependencies:
    - update_token
  tags:
    - cpu-sole-tenant
  script:
    - apk add --update py-pip
    - pip install awscli
    - NO_CACHE=1 sh ./scorecard/docker/build.sh
    # Generate a test suite ID to be used in later runs so all the concurrent
    # results can be grouped together: five random lowercase letters.
    - echo "TEST_SUITE_ID=$(tr -dc a-z </dev/urandom | head -c 5)" >> output.env
  variables:
    DOCKER_HOST: 'dind-service.kube-system.svc.cluster.local:2375'
    # Quoted so the value is the string "1", matching IS_IN_CI above and the
    # `[ "$PUSH_TO_ECR" == "1" ]` comparison in build.sh.
    PUSH_TO_ECR: "1"
    GIT_COMMIT_SHA: $CI_COMMIT_SHA
  artifacts:
    reports:
      # KEY=VALUE lines written to output.env during the script.
      dotenv: output.env

# Shared definition for the benchmark jobs; each job merges this in and only
# supplies its own PYTEST_FILTER.
.benchmark_template: &benchmark_template
  tags:
    - gpu-triton
  stage: test
  dependencies:
    - docker_build
  image:
    # name: 186900524924.dkr.ecr.us-west-2.amazonaws.com/scorecard:2023-03-10-b4fb5b6
    # $TAG is written to output.env by scorecard/docker/build.sh.
    name: 186900524924.dkr.ecr.us-west-2.amazonaws.com/scorecard:$TAG
  script: |
    set -eux
    ls
    ./scorecard/scripts/show_node_info.sh
    mkdir model-data
    # Tracing is switched off while the credentials are written: with xtrace on,
    # the shell would print the fully expanded JSON into the job log.
    set +x
    printf '%s\n' "$GCP_AUTH_JSON" > gcp_auth.json
    set -x
    export UPLOAD_GCP=1
    export TEST_RUNS=10
    export WARMUP_RUNS=3
    pytest --tb=native -rA -v -s -q scorecard/relax-coverage/ -k "$PYTEST_FILTER"
# Each job below reuses the benchmark template and differs only in the
# expression handed to `pytest -k` through PYTEST_FILTER.

benchmarks-baseline:
  <<: *benchmark_template
  variables:
    PYTEST_FILTER: 'onnx-trt'

benchmarks-relax:
  <<: *benchmark_template
  variables:
    PYTEST_FILTER: 'relax-cuda and not stable-diffusion'

benchmarks-relax-sd-unet:
  <<: *benchmark_template
  variables:
    PYTEST_FILTER: 'relax-cuda and stable-diffusion and unet'

benchmarks-relax-sd-vae:
  <<: *benchmark_template
  variables:
    PYTEST_FILTER: 'relax-cuda and stable-diffusion and vae'
17 changes: 17 additions & 0 deletions scorecard/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Where the concatenated test data is written; override with
# `make TEST_DATA=path`.
TEST_DATA ?= build/testdata.jsonl

# These targets name actions, not files. Declaring them phony keeps them
# runnable even if a file called `clean` or `prepare_testdata` ever appears.
.PHONY: clean prepare_testdata

# Remove all generated output.
clean:
	rm -rf build

# Stamp file: re-run `poetry install` only when the lock file or project
# definition is newer than the last successful install.
build/venv-created.touch: poetry.lock pyproject.toml
	poetry install
	mkdir -p build
	touch build/venv-created.touch

# Validate the testdata/*.jsonc files against the schema and concatenate them
# into $(TEST_DATA).
prepare_testdata: build/venv-created.touch testdata/*.jsonc schema/schema.jsonschema relax_scorecard/*.py
	mkdir -p build
	poetry run python3 -m relax_scorecard.concat_testdata --schema schema/schema.jsonschema testdata/*.jsonc >"${TEST_DATA}"
	@echo "Prepared testdata in ${TEST_DATA}"


.DEFAULT_GOAL = prepare_testdata
11 changes: 11 additions & 0 deletions scorecard/bashrc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

# Print a short usage banner: one argument per output line, with a trailing
# empty argument so the banner ends in a blank line.
printf '%s\n' \
  'scorecard Docker image' \
  'ensure S3 credentials are set up (ask the team to get a new set):' \
  'export AWS_ACCESS_KEY_ID=...' \
  'export AWS_SECRET_ACCESS_KEY=...' \
  'run tests with:' \
  'pytest --tb=native -v -s -q relax-coverage' \
  ''
5 changes: 5 additions & 0 deletions scorecard/docker/Dockerfile.auth-test
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Minimal image built from an empty base.
# NOTE(review): presumably used to exercise the registry login/push path
# without paying for the full scorecard build - confirm.
FROM scratch

# Copy this Dockerfile into the image root so the image has some content.
COPY docker/Dockerfile.auth-test /

# Declared but not referenced in this file. build.sh passes
# `--build-arg TVM_BUILT_AT=...` to whichever Dockerfile it builds.
ARG TVM_BUILT_AT
93 changes: 93 additions & 0 deletions scorecard/docker/Dockerfile.scorecard
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Image used to run the scorecard benchmarks: TensorRT base image plus LLVM 15,
# the Python test dependencies, an ONNX Runtime nightly and a from-source
# build of octoml/relax.

# TensorRT image
# uses CUDA 11.7
FROM nvcr.io/nvidia/tensorrt:22.12-py3
# FROM nvcr.io/nvidia/tensorrt:23.02-py3 # uses CUDA 12.0, not supported on the gpu-triton runners' CUDA driver

# CUDA images (requires us to manually install tensorrt)
# FROM nvidia/cuda:11.7.1-devel-ubuntu22.04
# FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04

WORKDIR /opt/scorecard

# apt-get rather than apt: apt warns that its command-line interface is not
# stable enough for use in scripts.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    build-essential \
    curl \
    fish \
    git \
    python3 \
    libpq-dev \
    postgresql \
    postgresql-contrib \
    python3-dev \
    python3-pip \
    sudo \
    vim \
    wget

# llvm
RUN echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main" >> /etc/apt/sources.list
RUN echo "deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main" >> /etc/apt/sources.list
# The trailing `-` tells apt-key explicitly to read the key from stdin.
RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
RUN apt-get update && apt-get install -y libllvm-15-ocaml-dev \
    libllvm15 \
    llvm-15 \
    llvm-15-dev \
    llvm-15-runtime

# python dependencies
RUN python3 -m pip install --no-cache-dir \
    cmake \
    commentjson==0.9.0 \
    google-cloud-bigquery==3.5.0 \
    jinja2 \
    jsonschema==4.17.3 \
    ninja \
    nvidia-tensorrt \
    onnx \
    onnxruntime-gpu \
    psycopg2==2.9.5 \
    pytest \
    pytest-xdist \
    pyyaml \
    tabulate==0.9.0 \
    torch \
    typing_extensions \
    xgboost

RUN python3 -m pip --no-cache-dir install onnx_graphsurgeon==0.3.26 --index-url https://pypi.ngc.nvidia.com --no-deps

# onnx nightly, installed under its own prefix (PYTHONUSERBASE) so it stays
# separate from the onnxruntime-gpu release installed above.
# (CPU-only alternative package: ort-nightly)
RUN mkdir /opt/onnx_nightly
RUN PYTHONUSERBASE=/opt/onnx_nightly pip install --user \
    --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ \
    ort-nightly-gpu
# NOTE(review): the python3.10 path segment must match the Python minor
# version shipped in the base image - confirm.
ENV ONNX_NIGHTLY_PATH=/opt/onnx_nightly/lib/python3.10/site-packages

# Build TVM
# NOTE(review): TVM_BUILT_AT is not referenced by any instruction; it presumably
# exists so that changing its value invalidates the build cache from here on.
ARG TVM_BUILT_AT
RUN git clone https://github.com/octoml/relax --recursive
# A git identity is required for the `git commit` a few steps below; the values
# are placeholders.
RUN cd relax && git fetch origin && git config user.name test && git config user.email test@example.com

# Add this line to build in an un-merged PR
# RUN PR_NUMBER=NN bash -c 'cd relax && curl -L "https://github.com/octoml/relax/pull/$PR_NUMBER.diff" | patch -p1 -N -d . && git add . && git commit -m"PR #$PR_NUMBER"'
RUN bash -c 'cd relax && curl -L "https://github.com/octoml/relax/compare/TUZ-145.diff" | patch -p1 -N -d . && git add . && git commit -m"Add TUZ-145"'

RUN rm -rf relax/build
COPY docker/build_relax.sh docker/build_relax.sh
RUN bash docker/build_relax.sh
RUN cd relax/python && python3 -m pip install --no-cache-dir -e .

# aws CLI
RUN python3 -m pip install --no-cache-dir awscli

# testbench code
COPY relax-coverage relax-coverage
COPY schema schema
COPY models.yaml models.yaml
COPY hub_models.yaml hub_models.yaml

# `ENV key=value` is the current syntax; the space-separated form is legacy.
ENV ORT_TENSORRT_FP16_ENABLE=1
ENV AWS_DEFAULT_REGION=us-west-2
47 changes: 47 additions & 0 deletions scorecard/docker/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
# Build a scorecard Docker image and, optionally, push it to ECR.
#
# Environment (all optional unless noted):
#   PUSH_TO_ECR     "1" to tag and push the image to ECR (default 0)
#   NO_CACHE        "1" to pass --no-cache to docker build (default 0)
#   TVM_BUILT_AT    forwarded as a build arg (default 0)
#   RETRIES         attempts for each retried docker command (default 5)
#   IMAGE_NAME      selects docker/Dockerfile.<IMAGE_NAME> and names the image
#                   (default scorecard)
#   GIT_COMMIT_SHA  required when PUSH_TO_ECR=1; its first 7 characters become
#                   part of the pushed tag
#
# The CI job runs this file as `sh ./scorecard/docker/build.sh` from the
# repository root, so it sticks to constructs that a POSIX-style sh accepts and
# does not depend on the caller's working directory.
set -eux

# output.env is written to the directory the script was started from, which is
# where the CI job's dotenv artifact path points.
OUTPUT_ENV="$(pwd)/output.env"

# Every path below (retry.sh, the Dockerfiles, the build context) is relative
# to the scorecard directory, i.e. the parent of this script's directory.
cd "$(dirname "$0")/.."

# retry.sh turns on xtrace itself; silence the trace while it is loaded.
set +x
. docker/retry.sh
set -x

PUSH_TO_ECR="${PUSH_TO_ECR:=0}"
NO_CACHE="${NO_CACHE:=0}"
TVM_BUILT_AT="${TVM_BUILT_AT:=0}"
RETRIES="${RETRIES:=5}"
IMAGE_NAME="${IMAGE_NAME:=scorecard}"

CACHE_ARG=""
if [ "$NO_CACHE" = "1" ]; then
  CACHE_ARG="--no-cache"
fi

# $CACHE_ARG is intentionally unquoted: when empty it must vanish instead of
# being passed to docker as an empty argument.
retry "$RETRIES" docker build . --build-arg "TVM_BUILT_AT=$TVM_BUILT_AT" -f "docker/Dockerfile.${IMAGE_NAME}" $CACHE_ARG --tag "${IMAGE_NAME}:latest"

# # testing code to skip the docker build but still have an image to work with
# docker pull hello-world
# docker tag hello-world scorecard:latest

if [ "$PUSH_TO_ECR" = "1" ]; then
  DATE=$(date '+%Y-%m-%d')
  # First seven characters of the commit SHA. `cut` is used instead of the
  # ${var:0:7} substring expansion, which not every sh implements.
  HASH=$(printf '%s' "$GIT_COMMIT_SHA" | cut -c1-7)
  TAG="$DATE-$HASH"

  REGION="us-west-2"
  ACCOUNT_ID="186900524924"
  REGISTRY="$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com"

  # Make 'docker push' authenticated with ECR
  aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin "$REGISTRY"

  # Push to ECR registry (latest)
  retry "$RETRIES" docker tag "${IMAGE_NAME}:latest" "$REGISTRY/${IMAGE_NAME}:latest"
  retry "$RETRIES" docker push "$REGISTRY/${IMAGE_NAME}:latest"

  # Push to ECR registry (fixed tag)
  retry "$RETRIES" docker tag "${IMAGE_NAME}:latest" "$REGISTRY/${IMAGE_NAME}:$TAG"
  retry "$RETRIES" docker push "$REGISTRY/${IMAGE_NAME}:$TAG"

  # Save the tag so it can be used later
  echo "TAG=$TAG" >> "$OUTPUT_ENV"
  echo "ECR_IMAGE=$REGISTRY/${IMAGE_NAME}:$TAG" >> "$OUTPUT_ENV"
fi
17 changes: 17 additions & 0 deletions scorecard/docker/build_relax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Configure and build the relax checkout in ./relax using Ninja.
set -euxo pipefail

cd relax
mkdir -p build
cd build

# Configure-time options, one per element so each can be read (or removed)
# on its own.
cmake_flags=(
  -GNinja
  -DCMAKE_LINKER=/usr/bin/lld-15
  -DCMAKE_CUDA_ARCHITECTURES=75
  -DUSE_LLVM=llvm-config-15
  -DSUMMARIZE=1
  -DUSE_CUDA=1
  -DUSE_MICRO=1
  -DCMAKE_BUILD_TYPE=Release
  -DUSE_CUTLASS=1
  -DUSE_THRUST=1
)

# Configure against the source tree one level up, then build in place.
cmake "${cmake_flags[@]}" ..
cmake --build . --
44 changes: 44 additions & 0 deletions scorecard/docker/dev.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
# Start an interactive fish shell in the scorecard image with the local
# sources, caches and result directories mounted into /opt/scorecard.
#
# Environment (all optional):
#   UPLOAD_GCP, UPLOAD_PG    forwarded into the container (default 0)
#   TEST_RUNS, WARMUP_RUNS   forwarded into the container (default 1 / 0)
#   IMAGE                    image to run (default scorecard)
#   MODEL_DATA_DIR           host directory for model data, relative to the
#                            current directory or absolute (default model-data)
#   GCP_AUTH_JSON            host path mounted read-only as gcp_auth.json
#                            (default none.json)

set -euxo pipefail

# NB: Also source MODEL_DATA_DIR and GCP_AUTH_JSON from a .env or whatever
# is relevant for the running platform
set +x
UPLOAD_GCP="${UPLOAD_GCP:=0}"
UPLOAD_PG="${UPLOAD_PG:=0}"
TEST_RUNS="${TEST_RUNS:=1}"
WARMUP_RUNS="${WARMUP_RUNS:=0}"
IMAGE="${IMAGE:=scorecard}"
MODEL_DATA_DIR="${MODEL_DATA_DIR:=model-data}"
GCP_AUTH_JSON="${GCP_AUTH_JSON:=none.json}"
HOST_DIR=$(pwd)

# Accept an absolute MODEL_DATA_DIR as-is; only relative values are resolved
# against the current directory.
case "$MODEL_DATA_DIR" in
  /*) MODEL_DATA_PATH="$MODEL_DATA_DIR" ;;
  *)  MODEL_DATA_PATH="$HOST_DIR/$MODEL_DATA_DIR" ;;
esac

touch .fish_history
sudo rm -rf doc-relax
mkdir -p doc-relax
mkdir -p onnx-hub-cache
# Create the directory that is actually mounted below, not a fixed name.
mkdir -p "$MODEL_DATA_PATH"

set -x

# Expansions are quoted so paths containing spaces stay single arguments.
docker run \
  --gpus all \
  --env "TEST_RUNS=$TEST_RUNS" \
  --env "WARMUP_RUNS=$WARMUP_RUNS" \
  --env "UPLOAD_GCP=$UPLOAD_GCP" \
  --env "UPLOAD_PG=$UPLOAD_PG" \
  -v "$MODEL_DATA_PATH:/opt/scorecard/model-data" \
  -v "$GCP_AUTH_JSON:/opt/scorecard/gcp_auth.json:ro" \
  -v "$HOST_DIR/.coverage_results:/opt/scorecard/.coverage_results" \
  -v "$HOST_DIR/.tuning_records:/opt/scorecard/.tuning_records" \
  -v "$HOST_DIR/.fish_history:/root/.local/share/fish/fish_history" \
  -v "$HOST_DIR/relax-coverage:/opt/scorecard/relax-coverage" \
  -v "$HOST_DIR/schema:/opt/scorecard/schema" \
  -v "$HOST_DIR/scripts:/opt/scorecard/scripts" \
  -v "$HOST_DIR/models.yaml:/opt/scorecard/models.yaml" \
  -v "$HOST_DIR/hub_models.yaml:/opt/scorecard/hub_models.yaml" \
  --mount "type=volume,dst=/opt/scorecard/relax,volume-driver=local,volume-opt=type=none,volume-opt=o=bind,volume-opt=device=$HOST_DIR/doc-relax" \
  --mount "type=volume,dst=/root/.cache/onnx/hub,volume-driver=local,volume-opt=type=none,volume-opt=o=bind,volume-opt=device=$HOST_DIR/onnx-hub-cache" \
  -it "$IMAGE" \
  fish
8 changes: 8 additions & 0 deletions scorecard/docker/output_login.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh
# Log Docker in to the ECR registry, then append the resulting Docker client
# config (collapsed onto a single line) to output.env.
set -eu

# `pipefail` is not implemented by every /bin/sh. Probe for it in a subshell so
# an unsupported option does not abort the script, and enable it when present.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

REGION="us-west-2"
ACCOUNT_ID="186900524924"

# Trace only the login command; the password travels over the pipe and is not
# part of the traced command line.
set -x
aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin "$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com"
set +x

# Tracing stays off from here on: the config file holds the registry
# credentials, and xtrace would print the assignment with its value.
AUTH_JSON=$(tr '\n' ' ' < ~/.docker/config.json)
# NOTE(review): this appends the bare JSON with no `KEY=` prefix, unlike the
# KEY=VALUE lines build.sh writes to output.env - confirm the consumer expects that.
printf '%s\n' "$AUTH_JSON" >> output.env
39 changes: 39 additions & 0 deletions scorecard/docker/retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

set -eux

# retry MAX_RETRIES COMMAND [ARG...]
#
# Run COMMAND until it succeeds, making at most MAX_RETRIES attempts and
# sleeping a random 30-229 seconds between attempts.
#
# Returns as soon as COMMAND succeeds. If every attempt fails, or if
# MAX_RETRIES is less than 1, it prints a message and terminates the calling
# shell with status 1 (this file is meant to be sourced, so `exit` ends the
# caller's script).
retry() {
  local max_retries=$1
  shift

  # With a limit below 1 the loop would never run COMMAND and the function
  # would report success without doing anything; treat that as an error.
  if [ "$max_retries" -lt 1 ]; then
    echo "retry: max_retries must be at least 1, got $max_retries"
    exit 1
  fi

  local n=0
  # Kept local so sourcing this file does not leave a stray variable behind.
  local wait_secs
  until [ "$n" -ge "$max_retries" ]
  do
    # `cmd && break` keeps a failing command from tripping `set -e`.
    "$@" && break
    n=$((n+1))
    if [ "$n" -eq "$max_retries" ]; then
      echo "failed to update after attempt $n / $max_retries, giving up"
      exit 1
    fi

    wait_secs=$(( ( RANDOM % 200 ) + 30 ))
    echo "failed to update $n / $max_retries, waiting $wait_secs to try again"
    sleep "$wait_secs"
  done
}
Loading

0 comments on commit 4d91a0a

Please sign in to comment.