diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml
index e9e10f5dc..523fee346 100644
--- a/.github/workflows/auto-merge.yml
+++ b/.github/workflows/auto-merge.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
 on:
   pull_request_target:
     branches:
-      - branch-23.12
+      - branch-24.02
    types: [closed]

 jobs:
@@ -29,14 +29,14 @@ jobs:
    steps:
      - uses: actions/checkout@v3
        with:
-          ref: branch-23.12 # force to fetch from latest upstream instead of PR ref
+          ref: branch-24.02 # force to fetch from latest upstream instead of PR ref

      - name: auto-merge job
        uses: ./.github/workflows/auto-merge
        env:
          OWNER: NVIDIA
          REPO_NAME: spark-rapids-examples
-          HEAD: branch-23.12
-          BASE: branch-24.02
+          HEAD: branch-24.02
+          BASE: branch-24.04
          AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR

diff --git a/.github/workflows/markdown-links-check.yml b/.github/workflows/markdown-links-check.yml
index 8bba68ac4..0dce03914 100644
--- a/.github/workflows/markdown-links-check.yml
+++ b/.github/workflows/markdown-links-check.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,6 +30,5 @@ jobs:
        with:
          max-depth: -1
          use-verbose-mode: 'yes'
-          check-modified-files-only: 'yes'
          config-file: '.github/workflows/markdown-links-check/markdown-links-check-config.json'
          base-branch: 'main'
\ No newline at end of file
diff --git a/.github/workflows/markdown-links-check/markdown-links-check-config.json b/.github/workflows/markdown-links-check/markdown-links-check-config.json
index 32fba2b72..de3af9914 100644
--- a/.github/workflows/markdown-links-check/markdown-links-check-config.json
+++ b/.github/workflows/markdown-links-check/markdown-links-check-config.json
@@ -1,4 +1,18 @@
 {
+    "ignorePatterns": [
+        {
+            "pattern": "/docs"
+        },
+        {
+            "pattern": "/datasets"
+        },
+        {
+            "pattern": "/dockerfile"
+        },
+        {
+            "pattern": "/examples"
+        }
+    ],
     "timeout": "15s",
     "retryOn429": true,
     "retryCount":30,
diff --git a/README.md b/README.md
index 6c4df4ca5..a75e6dacc 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ can be built for running on GPU with RAPIDS Accelerator in this repo:
 | 3 | XGBoost | Taxi (Scala) | End-to-end ETL + XGBoost example to predict taxi trip fare amount with [NYC taxi trips data set](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
 | 4 | ML/DL | PCA End-to-End | Spark MLlib based PCA example to train and transform with a synthetic dataset
 | 5 | UDF | cuSpatial - Point in Polygon | Spark cuSpatial example for Point in Polygon function using NYC Taxi pickup location dataset
-| 6 | UDF | URL Decode | Decodes URL-encoded strings using the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable/)
-| 7 | UDF | URL Encode | URL-encodes strings using the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable/)
+| 6 | UDF | URL Decode | Decodes URL-encoded strings using the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy/)
+| 7 | UDF | URL Encode | URL-encodes strings using the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy/)
 | 8 | UDF | [CosineSimilarity](./examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java) | Computes the cosine similarity between two float vectors using [native code](./examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/src)
 | 9 | UDF | [StringWordCount](./examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java) | Implements a Hive simple UDF using [native code](./examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/src) to count words in strings
diff --git a/docs/get-started/xgboost-examples/csp/databricks/databricks.md b/docs/get-started/xgboost-examples/csp/databricks/databricks.md
index c211c020b..574b3f94a 100644
--- a/docs/get-started/xgboost-examples/csp/databricks/databricks.md
+++ b/docs/get-started/xgboost-examples/csp/databricks/databricks.md
@@ -21,7 +21,7 @@ Navigate to your home directory in the UI and select **Create** > **File** from
    create an `init.sh` script with the following contents:
    ```bash
    #!/bin/bash
-   sudo wget -O /databricks/jars/rapids-4-spark_2.12-23.12.1.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar
+   sudo wget -O /databricks/jars/rapids-4-spark_2.12-24.02.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar
    ```
 1. Select the Databricks Runtime Version from one of the supported runtimes specified in the
    Prerequisites section.
@@ -68,7 +68,7 @@ create an `init.sh` script with the following contents:
    ```bash
    spark.rapids.sql.python.gpu.enabled true
    spark.python.daemon.module rapids.daemon_databricks
-    spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.12.1.jar:/databricks/spark/python
+    spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.02.0.jar:/databricks/spark/python
    ```
 Note that the Python memory pool requires the cudf library, so you need to install cudf on each worker node (`pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com`) or disable the Python memory pool
diff --git a/docs/get-started/xgboost-examples/csp/databricks/init.sh b/docs/get-started/xgboost-examples/csp/databricks/init.sh
index 29d658f2e..8be5d6b0d 100644
--- a/docs/get-started/xgboost-examples/csp/databricks/init.sh
+++ b/docs/get-started/xgboost-examples/csp/databricks/init.sh
@@ -1,7 +1,7 @@
 sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-gpu_2.12--ml.dmlc__xgboost4j-gpu_2.12__1.5.2.jar
 sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.5.2.jar

-sudo wget -O /databricks/jars/rapids-4-spark_2.12-23.12.1.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar
+sudo wget -O /databricks/jars/rapids-4-spark_2.12-24.02.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar
 sudo wget -O /databricks/jars/xgboost4j-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.7.1/xgboost4j-gpu_2.12-1.7.1.jar
 sudo wget -O /databricks/jars/xgboost4j-spark-gpu_2.12-1.7.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.7.1/xgboost4j-spark-gpu_2.12-1.7.1.jar
 ls -ltr
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
index 9e0c81187..096b53803 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
@@ -40,7 +40,7 @@ export SPARK_DOCKER_IMAGE=
 export SPARK_DOCKER_TAG=

 pushd ${SPARK_HOME}
-wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-23.12/dockerfile/Dockerfile
+wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-24.02/dockerfile/Dockerfile

 # Optionally install additional jars into ${SPARK_HOME}/jars/

diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
index 4cf02a102..89695717e 100644
--- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
+++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
@@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag
 ### Download the jars

 Download the RAPIDS Accelerator for Apache Spark plugin jar
-  * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar)
+  * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar)

 ### Build XGBoost Python Examples

diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md
index 9c90123b3..4da71efe1 100644
--- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md
+++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md
@@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag
 ### Download the jars

 1. Download the RAPIDS Accelerator for Apache Spark plugin jar
-   * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar)
+   * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar)

 ### Build XGBoost Scala Examples

diff --git a/examples/ML+DL-Examples/Spark-DL/criteo_train/README.md b/examples/ML+DL-Examples/Spark-DL/criteo_train/README.md
index a743e45cf..083894055 100644
--- a/examples/ML+DL-Examples/Spark-DL/criteo_train/README.md
+++ b/examples/ML+DL-Examples/Spark-DL/criteo_train/README.md
@@ -7,7 +7,7 @@ _Please note: The following demo is dedicated to the DGX-2 machine (with V100 GPUs)._

 ## Dataset
 The dataset used here is from the Criteo clicklog dataset.
-It's preprocessed by [DLRM](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/DLRM/preproc)
+It's preprocessed by [DLRM](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/DLRM_and_DCNv2/preproc)
 ETL job on Spark. We also provide a small sample dataset in the sample_data folder.
 All 40 columns (1 label + 39 features) are already numeric.
diff --git a/examples/ML+DL-Examples/Spark-DL/criteo_train/notebooks/Criteo-Training.ipynb b/examples/ML+DL-Examples/Spark-DL/criteo_train/notebooks/Criteo-Training.ipynb
index 93dcd16f6..d760106b4 100644
--- a/examples/ML+DL-Examples/Spark-DL/criteo_train/notebooks/Criteo-Training.ipynb
+++ b/examples/ML+DL-Examples/Spark-DL/criteo_train/notebooks/Criteo-Training.ipynb
@@ -9,7 +9,7 @@
    "\n",
    "This notebook contains the same content as \"criteo_keras.py\" but in a notebook (interactive) form.\n",
    "\n",
-    "The dataset used here is from Criteo clicklog dataset. It's preprocessed by DLRM(https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/DLRM/preproc) ETL job on Spark.\n",
+    "The dataset used here is from the Criteo clicklog dataset. It's preprocessed by the DLRM (https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/DLRM_and_DCNv2/preproc) ETL job on Spark.\n",
    "\n",
    "We provide a small sample dataset in the `sample_data` folder.\n",
    "\n",
diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile
index dafa28056..5b6048e3d 100644
--- a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile
+++ b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile
@@ -18,7 +18,7 @@ ARG CUDA_VER=11.8.0
 FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
 # Please do not update the BRANCH_VER version
-ARG BRANCH_VER=23.12
+ARG BRANCH_VER=24.02

 RUN apt-get update
 RUN apt-get install -y wget ninja-build git
diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/README.md b/examples/ML+DL-Examples/Spark-cuML/pca/README.md
index 35ea557a2..ed27ba2aa 100644
--- a/examples/ML+DL-Examples/Spark-cuML/pca/README.md
+++ b/examples/ML+DL-Examples/Spark-cuML/pca/README.md
@@ -12,9 +12,9 @@ Users can also download the release jar from Maven Central:

 [rapids-4-spark-ml_2.12-22.02.0-cuda11.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0-cuda11.jar)

-[rapids-4-spark_2.12-23.12.1.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar)
+[rapids-4-spark_2.12-24.02.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar)

-Note: This demo could only work with v22.02.0 spark-ml version, and only compatible with spark-rapids versions prior to 23.12.1 . Please do not update the version in release.
+Note: This demo only works with the v22.02.0 spark-ml version and is only compatible with spark-rapids versions prior to 24.02.0. Please do not update the version in the release.
 ## Sample code
@@ -49,7 +49,7 @@ It is assumed that a Standalone Spark cluster has been set up, the `SPARK_MASTER
    ``` bash
    RAPIDS_ML_JAR=PATH_TO_rapids-4-spark-ml_2.12-22.02.0-cuda11.jar
-   PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-23.12.1.jar
+   PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-24.02.0.jar

    jupyter toree install \
    --spark_home=${SPARK_HOME} \
diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml
index a63c9cc6a..c4c715d74 100644
--- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml
+++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml
@@ -21,7 +21,7 @@
     com.nvidia
     PCAExample
     jar
-    23.12.0-SNAPSHOT
+    24.02.0-SNAPSHOT

     8
diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh
index 8c781a46a..1f6ca8e77 100755
--- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh
+++ b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh
@@ -17,7 +17,7 @@
 # Note that the last rapids-4-spark-ml release version is 22.02.0, snapshot version is 23.04.0-SNAPSHOT, please do not update the version in release
 ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0.jar
-PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar
+PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar
 # Note: The last rapids-4-spark-ml release version is 22.02.0, snapshot version is 23.04.0-SNAPSHOT.

 $SPARK_HOME/bin/spark-submit \
@@ -40,4 +40,4 @@ $SPARK_HOME/bin/spark-submit \
    --conf spark.network.timeout=1000s \
    --jars $ML_JAR,$PLUGIN_JAR \
    --class com.nvidia.spark.examples.pca.Main \
-/workspace/target/PCAExample-23.12.1-SNAPSHOT.jar
+/workspace/target/PCAExample-24.02.0-SNAPSHOT.jar
diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb
index e58cba00a..6b33dc841 100644
--- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb
+++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb
@@ -22,7 +22,7 @@
    "import os\n",
    "# Change to your cluster ip:port and directories\n",
    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"spark:your-ip:port\")\n",
-    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-23.12.1.jar\")\n"
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-24.02.0.jar\")\n"
   ]
  },
 {
diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile
index cf0ed8635..b5ef1cc0c 100644
--- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile
+++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -70,3 +70,18 @@ RUN cd /tmp \
    && make install -j${PARALLEL_LEVEL} \
    && cd /tmp && rm -rf /tmp/cmake-$CMAKE_VERSION*

+# Install ccache
+ARG CCACHE_VERSION=4.6
+RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \
+    tar zxf ccache-${CCACHE_VERSION}.tar.gz && \
+    rm ccache-${CCACHE_VERSION}.tar.gz && \
+    cd ccache-${CCACHE_VERSION} && \
+    mkdir build && \
+    cd build && \
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DZSTD_FROM_INTERNET=ON \
+        -DREDIS_STORAGE_BACKEND=OFF && \
+    cmake --build . --parallel ${PARALLEL_LEVEL} --target install && \
+    cd ../.. && \
+    rm -rf ccache-${CCACHE_VERSION}
diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md
index 29caa2d9d..a102fb1c0 100644
--- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md
+++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md
@@ -18,7 +18,7 @@ which provides a single method we need to override called evaluateColumnar
 returns a cudf ColumnVector, because the GPU gets its speed by performing operations on many rows
 at a time. In the `evaluateColumnar` function, there is a cudf implementation of URL decode that
 we're leveraging, so we don't need to write any native C++ code. This is all done
-through the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable). The benefit to
+through the [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy). The benefit of
 implementing via the Java API is ease of development, but the memory model is not friendly for doing
 GPU operations because the JVM makes the assumption that everything we're trying to do is in heap
 memory. We need to free the GPU resources in a timely manner with try-finally blocks. Note that we
@@ -27,10 +27,10 @@ involving the RAPIDS accelerated UDF falls back to the CPU.

 - [URLDecode](src/main/scala/com/nvidia/spark/rapids/udf/scala/URLDecode.scala)
   decodes URL-encoded strings using the
-  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
+  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy)
 - [URLEncode](src/main/scala/com/nvidia/spark/rapids/udf/scala/URLEncode.scala)
   URL-encodes strings using the
-  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
+  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy)

 ## Spark Java UDF Examples
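To make the `evaluateColumnar` pattern described in the README text above concrete, here is a minimal editorial sketch of a columnar URL-decode step written against the cudf Java API. It is illustrative only: the class and method names are hypothetical, and the real examples linked above implement the actual `RapidsUDF` interface; the `+`-to-space replacement mirrors what the linked URLDecode sources do before calling cudf's decode primitive.

```java
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.Scalar;

// Illustrative sketch only: one cudf call per batch rather than one JVM call
// per row, with try-with-resources so device memory is freed promptly instead
// of waiting for the JVM garbage collector.
public class UrlDecodeSketch {
  public static ColumnVector decodeBatch(ColumnVector urlEncoded) {
    // URL form encoding represents spaces as '+', so replace them first
    // (mirroring the linked URLDecode examples) before decoding.
    try (Scalar plus = Scalar.fromString("+");
         Scalar space = Scalar.fromString(" ");
         ColumnVector spaced = urlEncoded.stringReplace(plus, space)) {
      return spaced.urlDecode(); // caller owns (and must close) the result
    }
  }
}
```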
@@ -53,10 +53,10 @@ significant effort.

 - [URLDecode](src/main/java/com/nvidia/spark/rapids/udf/java/URLDecode.java)
   decodes URL-encoded strings using the
-  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
+  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy)
 - [URLEncode](src/main/java/com/nvidia/spark/rapids/udf/java/URLEncode.java)
   URL-encodes strings using the
-  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
+  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy)
 - [CosineSimilarity](src/main/java/com/nvidia/spark/rapids/udf/java/CosineSimilarity.java)
   computes the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
   between two float vectors using [native code](src/main/cpp/src)

@@ -67,11 +67,11 @@ Below are some examples for implementing RAPIDS accelerated Hive UDF via JNI and
 - [URLDecode](src/main/java/com/nvidia/spark/rapids/udf/hive/URLDecode.java)
   implements a Hive simple UDF using the
-  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
+  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy)
   to decode URL-encoded strings
 - [URLEncode](src/main/java/com/nvidia/spark/rapids/udf/hive/URLEncode.java)
   implements a Hive generic UDF using the
-  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/stable)
+  [Java APIs of RAPIDS cudf](https://docs.rapids.ai/api/cudf-java/legacy)
   to URL-encode strings
 - [StringWordCount](src/main/java/com/nvidia/spark/rapids/udf/hive/StringWordCount.java)
   implements a Hive simple UDF using
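Since the hunk above distinguishes Hive simple UDFs from generic UDFs, a small illustrative sketch of the CPU side of a simple word-count UDF may help. This is an assumption-laden example, not the repo's implementation: the repo's real StringWordCount pairs a method like this with a GPU path in native code. Hive locates `evaluate` by reflection and calls it once per row, which is exactly the per-row overhead the columnar GPU path avoids.

```java
import org.apache.hadoop.hive.ql.exec.UDF;

// Hedged sketch of a Hive "simple" UDF on the CPU; class name and exact
// counting rules are hypothetical. Hive resolves evaluate(...) by
// reflection, invoking it once per input row.
public class WordCountSketch extends UDF {
  public Integer evaluate(String s) {
    if (s == null) {
      return null;                                  // preserve SQL NULL semantics
    }
    String trimmed = s.trim();
    return trimmed.isEmpty() ? 0 : trimmed.split("\\s+").length; // whitespace-delimited words
  }
}
```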
@@ -118,8 +118,6 @@ and other settings. See the top of the `Dockerfile` for details.

 First install docker and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)

-Run the following commands to build and start a docker
-
 ```bash
 cd spark-rapids-examples/examples/UDF-Examples/RAPIDS-accelerated-UDFs
 docker build -t my-local:my-udf-example-ubuntu .
@@ -133,11 +131,34 @@ In the Docker container, clone the code and compile.
 ```bash
 git clone https://github.com/NVIDIA/spark-rapids-examples.git
 cd spark-rapids-examples/examples/UDF-Examples/RAPIDS-accelerated-UDFs
+export LOCAL_CCACHE_DIR="$HOME/.ccache"
+mkdir -p $LOCAL_CCACHE_DIR
+export CCACHE_DIR="$LOCAL_CCACHE_DIR"
+export CMAKE_C_COMPILER_LAUNCHER="ccache"
+export CMAKE_CXX_COMPILER_LAUNCHER="ccache"
+export CMAKE_CUDA_COMPILER_LAUNCHER="ccache"
+export CMAKE_CXX_LINKER_LAUNCHER="ccache"
 mvn clean package -Pudf-native-examples
 ```

+The Docker container installs ccache 4.6 to accelerate incremental builds.
+You can point LOCAL_CCACHE_DIR at a mounted folder so that the cache persists across containers.
+If you don't want to use ccache, remove or unset the ccache environment variables:
+
+```bash
+unset CCACHE_DIR
+unset CMAKE_C_COMPILER_LAUNCHER
+unset CMAKE_CXX_COMPILER_LAUNCHER
+unset CMAKE_CUDA_COMPILER_LAUNCHER
+unset CMAKE_CXX_LINKER_LAUNCHER
+```
+
-The build could take a long time (e.g.: 1.5 hours). Then the rapids-4-spark-udf-examples*.jar is
+The first build could take a long time (e.g. 1.5 hours). Then the rapids-4-spark-udf-examples*.jar is
 generated under the RAPIDS-accelerated-UDFs/target directory.
+Subsequent builds can benefit from ccache if it is enabled.
+
+To enable ccache builds on your own system, refer to the commands in the Dockerfile that build ccache from source.

 ### Run all the examples including native examples in the docker
@@ -163,7 +184,7 @@ then do the following inside the Docker container.

 ### Get jars from Maven Central

-[rapids-4-spark_2.12-23.12.1.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar)
+[rapids-4-spark_2.12-24.02.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar)

 ### Launch a local mode Spark

@@ -192,11 +213,10 @@ schema = StructType([
    StructField("c2", IntegerType()),
 ])
 data = [
-    ("s1",1),
-    ("s2",2),
-    ("s1",3),
-    ("s2",3),
-    ("s1",3),
+    ("a b c d",1),
+    ("",2),
+    (None,3),
+    ("the quick brown fox jumped over the lazy dog",3),
 ]
 df = spark.createDataFrame(
    SparkContext.getOrCreate().parallelize(data, numSlices=2),
@@ -204,6 +224,6 @@ df.createOrReplaceTempView("tab")
 spark.sql("CREATE TEMPORARY FUNCTION {} AS '{}'".format("wordcount", "com.nvidia.spark.rapids.udf.hive.StringWordCount"))
-spark.sql("select wordcount(c1) from tab group by c1").show()
-spark.sql("select wordcount(c1) from tab group by c1").explain()
+spark.sql("select c1, wordcount(c1) from tab").show()
+spark.sql("select c1, wordcount(c1) from tab").explain()
 ```
diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml
index b3463afa6..414c5ed18 100644
--- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml
+++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml
@@ -25,7 +25,7 @@
     user defined functions for use with the RAPIDS Accelerator for Apache Spark
-    23.12.0-SNAPSHOT
+    24.02.0-SNAPSHOT

     1.8
@@ -37,7 +37,7 @@
     cuda11
     2.12
-    23.12.1
+    24.02.0
     3.1.1
     2.12.15
     ${project.build.directory}/cpp-build
diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt
index c5a03390b..6412d7d63 100755
--- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt
+++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@

 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)

-file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake
+file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake
    ${CMAKE_BINARY_DIR}/RAPIDS.cmake)
 include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)

@@ -32,7 +32,7 @@ if(DEFINED GPU_ARCHS)
 endif()
 rapids_cuda_init_architectures(UDFEXAMPLESJNI)

-project(UDFEXAMPLESJNI VERSION 23.12.0 LANGUAGES C CXX CUDA)
+project(UDFEXAMPLESJNI VERSION 24.02.0 LANGUAGES C CXX CUDA)

 option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF)
 option(BUILD_UDF_BENCHMARKS "Build the benchmarks" OFF)
@@ -81,20 +81,20 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w --expt-extended-lambda --expt-relax

 # - cudf -------------------------------------------------------------------------------------------

-# Ensure CUDA runtime is dynamic despite statically linking Arrow in libcudf
-set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
+# Use a statically linked CUDA runtime
+set(CUDA_USE_STATIC_CUDA_RUNTIME ON)

 rapids_cpm_init()

-rapids_cpm_find(cudf 23.12.00
+rapids_cpm_find(cudf 24.02.00
    CPM_ARGS
        GIT_REPOSITORY https://github.com/rapidsai/cudf.git
-        GIT_TAG branch-23.12
+        GIT_TAG branch-24.02
        GIT_SHALLOW TRUE
        SOURCE_SUBDIR cpp
        OPTIONS "BUILD_TESTS OFF"
                "BUILD_BENCHMARKS OFF"
                "CUDF_USE_ARROW_STATIC ON"
                "JITIFY_USE_CACHE ON"
-                "CUDA_STATIC_RUNTIME OFF"
+                "CUDA_STATIC_RUNTIME ${CUDA_USE_STATIC_CUDA_RUNTIME}"
                "DISABLE_DEPRECATION_WARNING ON"
                "AUTO_DETECT_CUDA_ARCHITECTURES OFF"
 )
diff --git a/examples/UDF-Examples/Spark-cuSpatial/README.md b/examples/UDF-Examples/Spark-cuSpatial/README.md
index 0daf9fe0f..6ba27ae2e 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/README.md
+++ b/examples/UDF-Examples/Spark-cuSpatial/README.md
@@ -82,7 +82,8 @@ Note: The docker env is just for building the jar, not for running the applicati

 ## Run
 ### GPU Demo on Spark Standalone on-premises cluster
-1. Set up [a standalone cluster](/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark. Make sure the conda/lib is included in LD_LIBRARY_PATH, so that spark executors can load libcuspatial.so.
+1. Set up [a standalone cluster](../../../docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark.
+   Make sure the conda/lib is included in LD_LIBRARY_PATH, so that Spark executors can load libcuspatial.so.
 2. Download Spark RAPIDS JAR
   * [Spark RAPIDS JAR v23.02.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar) or above
@@ -105,7 +106,7 @@ Note: The docker env is just for building the jar, not for running the applicati
    docker push :
    ```

-2. Follow the [Spark-rapids get-started document](https://nvidia.github.io/spark-rapids/docs/get-started/getting-started-databricks.html#start-a-databricks-cluster) to create a GPU cluster on AWS Databricks.
+2. Follow the [Spark-rapids get-started document](https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/databricks.html) to create a GPU cluster on AWS Databricks.
   Below are some different steps since a custom docker image is used with Databricks:
   * Databricks Runtime Version
     Choose a non-ML Databricks Runtime such as `Runtime: 9.1 LTS(Scala 2.12, Spark 3.1.2)` and
diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml
index 6b355fe0f..88d7af3e8 100644
--- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml
+++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml
@@ -24,7 +24,7 @@
     UDF of the cuSpatial case for the RAPIDS Accelerator
     The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark
-    23.12.0-SNAPSHOT
+    24.02.0-SNAPSHOT

     1.8
diff --git a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
index 4ec73c923..b34565dda 100644
--- a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
+++ b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
@@ -73,7 +73,7 @@
    "Setting default log level to \"WARN\".\n",
    "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
    "2022-11-30 06:57:40,550 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
-    "2022-11-30 06:57:54,195 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 23.12.1 using cudf 23.12.0.\n",
+    "2022-11-30 06:57:54,195 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 24.02.0 using cudf 24.02.0.\n",
    "2022-11-30 06:57:54,210 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
    "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
    "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n",
diff --git a/examples/XGBoost-Examples/agaricus/pom.xml b/examples/XGBoost-Examples/agaricus/pom.xml
index 6b4bab9a8..0c560dac7 100644
--- a/examples/XGBoost-Examples/agaricus/pom.xml
+++ b/examples/XGBoost-Examples/agaricus/pom.xml
@@ -1,6 +1,6 @@
-    jar-with-dependencies
+    jar-with-dependencies_${scala.binary.version}
     jar
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb
index 2af22cff3..4eb875ec3 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb
@@ -9,7 +9,7 @@
    "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-23.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n",
    "\n",
    "### 2. Download needed jars\n",
-    "* [rapids-4-spark_2.12-23.12.1.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar)\n",
+    "* [rapids-4-spark_2.12-24.02.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar)\n",
    "\n",
    "\n",
    "### 3. Start Spark Standalone\n",
@@ -17,7 +17,7 @@
    "\n",
    "### 4. Add ENV\n",
    "```\n",
-    "$ export SPARK_JARS=rapids-4-spark_2.12-23.12.1.jar\n",
+    "$ export SPARK_JARS=rapids-4-spark_2.12-24.02.0.jar\n",
    "$ export PYSPARK_DRIVER_PYTHON=jupyter \n",
    "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n",
    "```\n",
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb
index 1f0e68484..0383ceb7b 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb
@@ -63,7 +63,7 @@
    "Setting default log level to \"WARN\".\n",
    "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
    "2022-11-25 09:34:43,952 WARN resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
-    "2022-11-25 09:34:58,155 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 23.12.1 using cudf 23.12.0.\n",
+    "2022-11-25 09:34:58,155 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 24.02.0 using cudf 24.02.0.\n",
    "2022-11-25 09:34:58,171 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
    "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
    "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n"
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb
index 0ad1593bb..67914b1bc 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb
@@ -84,7 +84,7 @@
    "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster\n",
    "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat\n",
    "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator\n",
-    "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator 23.12.1 using cudf 23.12.0.\n",
+    "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator 24.02.0 using cudf 24.02.0.\n",
    "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
    "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
    "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n"
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb
index f0a9a3630..447d88d41 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb
@@ -20,14 +20,14 @@
    "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-23.12/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n",
    "\n",
    "### 2. Download needed jars\n",
-    "* [rapids-4-spark_2.12-23.12.1.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/23.12.1/rapids-4-spark_2.12-23.12.1.jar)\n",
+    "* [rapids-4-spark_2.12-24.02.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar)\n",
    "\n",
    "### 3. Start Spark Standalone\n",
    "Before running the script, please set up Spark standalone mode\n",
    "\n",
    "### 4. Add ENV\n",
    "```\n",
-    "$ export SPARK_JARS=rapids-4-spark_2.12-23.12.1.jar\n",
+    "$ export SPARK_JARS=rapids-4-spark_2.12-24.02.0.jar\n",
    "\n",
    "```\n",
    "\n",
diff --git a/examples/XGBoost-Examples/mortgage/pom.xml b/examples/XGBoost-Examples/mortgage/pom.xml
index ffef6cfd5..fd24935de 100644
--- a/examples/XGBoost-Examples/mortgage/pom.xml
+++ b/examples/XGBoost-Examples/mortgage/pom.xml
@@ -1,6 +1,6 @@