Skip to content

Commit

Permalink
Merge branch 'correct_ci_status' into 'v22.10-integration'
Browse files Browse the repository at this point in the history
Correct the CI status of hdfs_backend_test and change it to use the computelab runner

See merge request dl/hugectr/hugectr!935
  • Loading branch information
minseokl committed Oct 17, 2022
2 parents 4b5e0bb + cf59c0b commit 4f0cef9
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 18 deletions.
14 changes: 6 additions & 8 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -777,35 +777,33 @@ inference_benchmark:
TEST_CMD: ./ci/benchmark/inference_benchmark/run.sub

# Inference parameter-server integration test job: needs build_inference and
# runs ci/integration_test/inference/ps_test.sh with CONT set to
# ${INFER_IMAGE_VERSIONED}.
# NOTE(review): this is a rendered diff without +/- markers, so two `extends:`
# lines and two script commands are listed. The .dlcluster_test_job_daily /
# srun pair appears to be the removed version and the
# .computelab_test_job_daily / plain bash pair the added one; confirm against
# the repository.
inference_ps_test:
extends: .dlcluster_test_job_daily
extends: .computelab_test_job_daily
allow_failure: false
stage: test
needs:
- build_inference
script:
- export CONT=${INFER_IMAGE_VERSIONED}
- srun -N 1 -p dgx1v,dgx1v16g,dgx1v32g bash ./ci/integration_test/inference/ps_test.sh
- bash ./ci/integration_test/inference/ps_test.sh

# Embedding-cache update integration test job: needs build_inference and runs
# ci/integration_test/inference/embedding_cache_update_test.sh with CONT set
# to ${INFER_IMAGE_VERSIONED}.
# NOTE(review): this is a rendered diff without +/- markers, so two `extends:`
# lines and two script commands are listed. The .dlcluster_test_job_daily /
# srun pair appears to be the removed version and the
# .computelab_test_job_daily / plain bash pair the added one; confirm against
# the repository.
inference_embedding_cache_update_test:
extends: .dlcluster_test_job_daily
extends: .computelab_test_job_daily
allow_failure: false
stage: test
needs:
- build_inference
script:
- export CONT=${INFER_IMAGE_VERSIONED}
- srun -N 1 -p dgx1v,dgx1v16g,dgx1v32g bash ./ci/integration_test/inference/embedding_cache_update_test.sh
- bash ./ci/integration_test/inference/embedding_cache_update_test.sh

# HDFS backend test job: needs build_train_single_node_with_hdfs and runs
# ci/integration_test/hdfs/hdfs_backend_test.sh with CONT set to
# ${TRAIN_IMAGE_VERSIONED_WITH_HDFS}.
# NOTE(review): this is a rendered diff without +/- markers. The
# .dlcluster_test_job_daily `extends:`, the job-level `allow_failure` / `stage`
# keys and the srun command appear to be the removed lines; the
# .computelab_test_job_daily `extends:` and the plain bash command appear to be
# the added ones. Confirm against the repository.
hdfs_backend_test:
extends: .dlcluster_test_job_daily
allow_failure: false
stage: test
extends: .computelab_test_job_daily
needs:
- build_train_single_node_with_hdfs
script:
- export CONT=${TRAIN_IMAGE_VERSIONED_WITH_HDFS}
- srun -N 1 -p dgx1v32g bash ./ci/integration_test/hdfs/hdfs_backend_test.sh
- bash ./ci/integration_test/hdfs/hdfs_backend_test.sh

wdl_check:
# Push logs to gitlab
Expand Down
20 changes: 14 additions & 6 deletions ci/integration_test/hdfs/hdfs_backend_test.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
#!/bin/bash
# Runs the HDFS backend tests (hdfs_backend_test and file_loader_test) inside a
# Docker container started from ${CONT} and exits with that container's exit
# code.
# NOTE(review): this listing is a rendered diff without +/- markers, so two
# generations of the script are interleaved (two `set` lines, two ways of
# starting HDFS, a repeated `docker logs -f`). Confirm which lines are live
# against the repository before running it.

# Print the current working directory into the job log.
echo $(pwd)
# -x traces every command; the second form adds -e to abort on the first
# failing command.
set -x
set -ex

# Authenticate against the CI registry.
# NOTE(review): with xtrace enabled the expanded -p argument is written to the
# job log unless the CI system masks it; consider `--password-stdin`.
docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
# Single-container variant: start ${CONT} detached as root with all GPUs. The
# inner command creates an SSH key and authorizes it, starts sshd, formats the
# HDFS namenode, starts DFS, then runs both test binaries. The steps are
# chained with && so the first failure stops the chain.
ID=$(docker run --gpus all -d -u root ${CONT} bash -cx "\
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
/etc/init.d/ssh start && \
hdfs namenode -format && \
bash /opt/hadoop/sbin/start-dfs.sh && \
cd /workdir/build/bin && \
./hdfs_backend_test && \
./file_loader_test")

# Two-container variant: a helper container named hadoop_namenode starts sshd
# and DFS from the devel_hps_thirdparties image; the trailing `top` presumably
# keeps the container alive (TODO confirm).
ID=$(docker run --rm --name=hadoop_namenode -u root -dt gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "/etc/init.d/ssh start && /usr/local/hadoop/sbin/start-dfs.sh && top")
docker logs $ID

# The test container joins hadoop_namenode's network namespace, exports the
# Hadoop CLASSPATH and runs both test binaries.
ID=$(docker run --net=container:hadoop_namenode -u root -d ${CONT} bash -cx 'export CLASSPATH=$(hadoop classpath --glob) && cd /workdir/build/bin && ./hdfs_backend_test && ./file_loader_test')
# Follow the container's output.
docker logs -f $ID
docker logs -f $ID
# Capture the container's exit status, remove the container, and propagate the
# status as this script's exit code.
exitCode=$(docker wait $ID)
docker rm $ID
exit $exitCode
8 changes: 6 additions & 2 deletions ci/integration_test/inference/embedding_cache_update_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,9 @@ docker logs $ID
ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "export JAVA_HOME=/usr/local/jdk-16.0.2 && /usr/local/zookeeper/bin/zkServer.sh start && /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties ")
docker logs $ID

ID=$(docker run --net=host -v /mnt/nvdl/usr/aleliu/inference_ci/model_repository:/models -u root -d ${CONT} bash -cx "cd /workdir/build/bin && mkdir -p /hugectr/Test_Data/rockdb && ./embedding_cache_update_test || exit 1")
docker logs -f $ID
ID=$(docker run --net=host --gpus=all -v /home/scratch.svc_compute_arch/hugectr-ci/inference_ci/model_repository:/models -u root -d ${CONT} bash -cx "cd /workdir/build/bin && mkdir -p /hugectr/Test_Data/rockdb && ./embedding_cache_update_test || exit 1")
docker logs -f $ID
exitCode=$(docker wait $ID)
docker rm $ID
exit $exitCode

7 changes: 5 additions & 2 deletions ci/integration_test/inference/ps_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ docker logs $ID
ID=$(docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "export JAVA_HOME=/usr/local/jdk-16.0.2 && /usr/local/zookeeper/bin/zkServer.sh start && /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties ")
docker logs $ID

ID=$(docker run --net=host -v /mnt/nvdl/usr/aleliu/inference_ci/model_repository:/models -u root -d ${CONT} bash -cx "cd /workdir/build/bin && mkdir -p /hugectr/Test_Data/rockdb && ./parameter_server_test || exit 1 ")
ID=$(docker run --gpus=all --net=host -v /home/scratch.svc_compute_arch/hugectr-ci/inference_ci/model_repository:/models -u root -d ${CONT} bash -cx "cd /workdir/build/bin && mkdir -p /hugectr/Test_Data/rockdb && ./parameter_server_test || exit 1 ")
docker logs -f $ID
exitCode=$(docker wait $ID)
docker rm $ID
exit $exitCode


#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/7000 && ../src/redis-server redis.conf "
Expand All @@ -30,4 +33,4 @@ docker logs -f $ID
#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "cd /usr/local/redis/src && echo yes | ./redis-cli --cluster create 127.0.0.1:7000 127.0.0.1:7001 127.0.0.1:7002 --cluster-replicas 0 ";
#docker run --rm --net=host -u root -d gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_hps_thirdparties sh -c "export JAVA_HOME=/usr/local/jdk-16.0.2 && /usr/local/zookeeper/bin/zkServer.sh start && /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties "
#sleep 5
#docker run --gpus=all --rm -v /gpfs/fs1/yingcanw:/hugectr/ -v /gpfs/fs1/yingcanw/wdl_infer:/wdl_infer --net=host gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_train sh -cx "cd /hugectr/hugectr/infer_build && ./bin/parameter_server_test";
#docker run --gpus=all --rm -v /gpfs/fs1/yingcanw:/hugectr/ -v /gpfs/fs1/yingcanw/wdl_infer:/wdl_infer --net=host gitlab-master.nvidia.com:5005/dl/hugectr/hugectr:devel_train sh -cx "cd /hugectr/hugectr/infer_build && ./bin/parameter_server_test";
13 changes: 13 additions & 0 deletions ci/template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ stages:
echo "RUN pip install ninja" >> ${JOB_DOCKERFILE};
echo "RUN cd /workdir/hierarchical_parameter_server/ && python setup.py install" >> ${JOB_DOCKERFILE};
fi
- echo "RUN rm /usr/local/cuda/lib64/stubs/libcuda.so.1" >> ${JOB_DOCKERFILE};
- cat ${JOB_DOCKERFILE}
- docker login -u ${CI_PRIVATE_USER} -p "${CI_PRIVATE_KEY}" "${CI_REGISTRY}"
- if [[ "$TEST_NEW_IMAGE" == "1" ]]; then
Expand Down Expand Up @@ -209,6 +210,18 @@ stages:
- .default:rules:daily-test
allow_failure: false

# Template job (leading dot) for daily tests on the computelab runner. It
# combines .dlcluster_job with the .default:rules:daily-test rules, sets the
# Slurm partition list and account plus a WALLTIME value (presumably a
# two-hour limit; confirm with the runner documentation), and selects runners
# tagged computelab_generic. A failure of a job using this template is not
# tolerated (allow_failure: false).
# NOTE(review): nesting indentation was lost in this rendering; the keys under
# `extends`, `variables` and `tags` are children of those keys in the real file.
.computelab_test_job_daily:
extends:
- .dlcluster_job
- .default:rules:daily-test
variables:
CI_SLURM_PARTITION: "a100-pcie-40gb-product,a100-pcie-80gb-product"
CI_SLURM_ACCOUNT: "cag"
WALLTIME: "02:00:00"
tags:
- computelab_generic
allow_failure: false

.sok_test_job:
extends:
- .selene_luna_job
Expand Down

0 comments on commit 4f0cef9

Please sign in to comment.