From ce90ec6efb8a2aa87d0fbb071765bf03a2b86a6f Mon Sep 17 00:00:00 2001 From: "Ricardo M. Oliveira" Date: Tue, 17 Sep 2024 16:20:30 -0300 Subject: [PATCH] Revert files deletion Signed-off-by: Ricardo M. Oliveira --- .github/workflows/backend.yml | 1 + .github/workflows/e2e-test.yml | 1 + .../pipelines/xgboost_sample_pipeline.py | 95 ++ .../pipelines/xgboost_sample_pipeline.yaml | 926 ++++++++++++++++++ 4 files changed, 1023 insertions(+) create mode 100644 sdk/python/test_data/pipelines/xgboost_sample_pipeline.py create mode 100644 sdk/python/test_data/pipelines/xgboost_sample_pipeline.yaml diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index faef43a0042f..5f4a65735b14 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -12,6 +12,7 @@ on: - 'backend/**' - 'scripts/deploy/github/**' - 'manifests/kustomize/**' + - 'sdk/python/**' env: GITHUB_ACTION: "true" diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index bc7783bafdac..d8d90f55cb68 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -15,6 +15,7 @@ on: - 'proxy/**' - 'manifests/kustomize/**' - 'test/**' + - 'sdk/python/**' jobs: initialization-tests-v1: diff --git a/sdk/python/test_data/pipelines/xgboost_sample_pipeline.py b/sdk/python/test_data/pipelines/xgboost_sample_pipeline.py new file mode 100644 index 000000000000..cb40d4905d3a --- /dev/null +++ b/sdk/python/test_data/pipelines/xgboost_sample_pipeline.py @@ -0,0 +1,95 @@ +# Copyright 2021 The Kubeflow Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from kfp import compiler +from kfp import components +from kfp import dsl + +chicago_taxi_dataset_op = components.load_component_from_url( + 'https://raw.githubusercontent.com/kubeflow/pipelines/60a2612541ec08c6a85c237d2ec7525b12543a43/components/datasets/Chicago_Taxi_Trips/component.yaml' +) +convert_csv_to_apache_parquet_op = components.load_component_from_url( + 'https://raw.githubusercontent.com/kubeflow/pipelines/0d7d6f41c92bdc05c2825232afe2b47e5cb6c4b3/components/_converters/ApacheParquet/from_CSV/component.yaml' +) +xgboost_train_on_csv_op = components.load_component_from_url( + 'https://raw.githubusercontent.com/kubeflow/pipelines/567c04c51ff00a1ee525b3458425b17adbe3df61/components/XGBoost/Train/component.yaml' +) +xgboost_predict_on_csv_op = components.load_component_from_url( + 'https://raw.githubusercontent.com/kubeflow/pipelines/31939086d66d633732f75300ce69eb60e9fb0269/components/XGBoost/Predict/component.yaml' +) +xgboost_train_on_parquet_op = components.load_component_from_url( + 'https://raw.githubusercontent.com/kubeflow/pipelines/0ae2f30ff24beeef1c64cc7c434f1f652c065192/components/XGBoost/Train/from_ApacheParquet/component.yaml' +) +xgboost_predict_on_parquet_op = components.load_component_from_url( + 'https://raw.githubusercontent.com/kubeflow/pipelines/31939086d66d633732f75300ce69eb60e9fb0269/components/XGBoost/Predict/from_ApacheParquet/component.yaml' +) + + +@dsl.pipeline(name='xgboost-sample-pipeline') +def xgboost_pipeline(): + training_data_csv = chicago_taxi_dataset_op( + where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01"', + select='tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total', + limit=10000, + ).output + + # Training and prediction on dataset in CSV format + model_trained_on_csv = xgboost_train_on_csv_op( + training_data=training_data_csv, + label_column=0, + objective='reg:squarederror', + num_iterations=200, + ).outputs['model'] + + xgboost_predict_on_csv_op( + data=training_data_csv, + model=model_trained_on_csv, + label_column=0, + ) + + # Training and prediction on dataset in Apache Parquet format + training_data_parquet = convert_csv_to_apache_parquet_op( + data=training_data_csv).output + + model_trained_on_parquet = xgboost_train_on_parquet_op( + training_data=training_data_parquet, + label_column_name='tips', + objective='reg:squarederror', + num_iterations=200, + ).outputs['model'] + + xgboost_predict_on_parquet_op( + data=training_data_parquet, + model=model_trained_on_parquet, + label_column_name='tips', + ) + + # Checking cross-format predictions + xgboost_predict_on_parquet_op( + data=training_data_parquet, + model=model_trained_on_csv, + label_column_name='tips', + ) + + xgboost_predict_on_csv_op( + data=training_data_csv, + model=model_trained_on_parquet, + label_column=0, + ) + + +if __name__ == '__main__': + compiler.Compiler().compile( + pipeline_func=xgboost_pipeline, + package_path=__file__.replace('.py', '.yaml')) diff --git a/sdk/python/test_data/pipelines/xgboost_sample_pipeline.yaml b/sdk/python/test_data/pipelines/xgboost_sample_pipeline.yaml new file mode 100644 index 000000000000..2e03d06edff1 --- /dev/null +++ b/sdk/python/test_data/pipelines/xgboost_sample_pipeline.yaml @@ -0,0 +1,926 @@ +# PIPELINE DEFINITION +# Name: xgboost-sample-pipeline +components: + comp-chicago-taxi-trips-dataset: + executorLabel: exec-chicago-taxi-trips-dataset + inputDefinitions: + parameters: + format: + defaultValue: csv + isOptional: true + parameterType: STRING + limit: + defaultValue: 1000.0 + isOptional: true + parameterType: NUMBER_INTEGER + select: + defaultValue: trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location + isOptional: true + parameterType: STRING + where: + defaultValue: trip_start_timestamp>="1900-01-01" AND trip_start_timestamp<"2100-01-01" + isOptional: true + parameterType: STRING + outputDefinitions: + artifacts: + table: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + comp-convert-csv-to-apache-parquet: + executorLabel: exec-convert-csv-to-apache-parquet + inputDefinitions: + artifacts: + data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + outputDefinitions: + artifacts: + output_data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + comp-xgboost-predict: + executorLabel: exec-xgboost-predict + inputDefinitions: + artifacts: + data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + parameters: + label_column: + isOptional: true + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + predictions: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + comp-xgboost-predict-2: + executorLabel: exec-xgboost-predict-2 + inputDefinitions: + artifacts: + data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + parameters: + label_column_name: + isOptional: true + parameterType: STRING + outputDefinitions: + artifacts: + predictions: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + comp-xgboost-predict-3: + executorLabel: exec-xgboost-predict-3 + inputDefinitions: + artifacts: + data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + parameters: + label_column_name: + isOptional: true + parameterType: STRING + outputDefinitions: + artifacts: + predictions: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + comp-xgboost-predict-4: + executorLabel: exec-xgboost-predict-4 + inputDefinitions: + artifacts: + data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + parameters: + label_column: + isOptional: true + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + predictions: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + comp-xgboost-train: + executorLabel: exec-xgboost-train + inputDefinitions: + artifacts: + starting_model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + isOptional: true + training_data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + parameters: + booster: + defaultValue: gbtree + isOptional: true + parameterType: STRING + booster_params: + isOptional: true + parameterType: STRUCT + label_column: + defaultValue: 0.0 + isOptional: true + parameterType: NUMBER_INTEGER + learning_rate: + defaultValue: 0.3 + isOptional: true + parameterType: NUMBER_DOUBLE + max_depth: + defaultValue: 6.0 + isOptional: true + parameterType: NUMBER_INTEGER + min_split_loss: + defaultValue: 0.0 + isOptional: true + parameterType: NUMBER_DOUBLE + num_iterations: + defaultValue: 10.0 + isOptional: true + parameterType: NUMBER_INTEGER + objective: + defaultValue: reg:squarederror + isOptional: true + parameterType: STRING + outputDefinitions: + artifacts: + model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + model_config: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + comp-xgboost-train-2: + executorLabel: exec-xgboost-train-2 + inputDefinitions: + artifacts: + starting_model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + isOptional: true + training_data: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + parameters: + booster: + defaultValue: gbtree + isOptional: true + parameterType: STRING + booster_params: + isOptional: true + parameterType: STRUCT + label_column_name: + parameterType: STRING + learning_rate: + defaultValue: 0.3 + isOptional: true + parameterType: NUMBER_DOUBLE + max_depth: + defaultValue: 6.0 + isOptional: true + parameterType: NUMBER_INTEGER + min_split_loss: + defaultValue: 0.0 + isOptional: true + parameterType: NUMBER_DOUBLE + num_iterations: + defaultValue: 10.0 + isOptional: true + parameterType: NUMBER_INTEGER + objective: + defaultValue: reg:squarederror + isOptional: true + parameterType: STRING + outputDefinitions: + artifacts: + model: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 + model_config: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 +deploymentSpec: + executors: + exec-chicago-taxi-trips-dataset: + container: + command: + - sh + - -c + - "set -e -x -o pipefail\noutput_path=\"$0\"\nselect=\"$1\"\nwhere=\"$2\"\n\ + limit=\"$3\"\nformat=\"$4\"\nmkdir -p \"$(dirname \"$output_path\")\"\n\ + curl --get 'https://data.cityofchicago.org/resource/wrvz-psew.'\"${format}\"\ + \ \\\n --data-urlencode '$limit='\"${limit}\" \\\n --data-urlencode\ + \ '$where='\"${where}\" \\\n --data-urlencode '$select='\"${select}\"\ + \ \\\n | tr -d '\"' > \"$output_path\" # Removing unneeded quotes around\ + \ all numbers\n" + - '{{$.outputs.artifacts[''table''].path}}' + - '{{$.inputs.parameters[''select'']}}' + - '{{$.inputs.parameters[''where'']}}' + - '{{$.inputs.parameters[''limit'']}}' + - '{{$.inputs.parameters[''format'']}}' + image: byrnedo/alpine-curl@sha256:548379d0a4a0c08b9e55d9d87a592b7d35d9ab3037f4936f5ccd09d0b625a342 + exec-convert-csv-to-apache-parquet: + container: + args: + - --data + - '{{$.inputs.artifacts[''data''].path}}' + - --output-data + - '{{$.outputs.artifacts[''output_data''].path}}' + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install + --quiet --no-warn-script-location 'pyarrow==0.17.1' --user) && "$0" "$@" + - python3 + - -u + - -c + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\ + \ file_path\n\ndef convert_csv_to_apache_parquet(\n data_path,\n output_data_path,\n\ + ):\n '''Converts CSV table to Apache Parquet.\n\n [Apache Parquet](https://parquet.apache.org/)\n\ + \n Annotations:\n author: Alexey Volkov \n\ + \ '''\n from pyarrow import csv, parquet\n\n table = csv.read_csv(data_path)\n\ + \ parquet.write_table(table, output_data_path)\n\nimport argparse\n_parser\ + \ = argparse.ArgumentParser(prog='Convert csv to apache parquet', description='Converts\ + \ CSV table to Apache Parquet.\\n\\n [Apache Parquet](https://parquet.apache.org/)\\\ + n\\n Annotations:\\n author: Alexey Volkov ')\n\ + _parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--output-data\", dest=\"\ + output_data_path\", type=_make_parent_dirs_and_return_path, required=True,\ + \ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + _output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = convert_csv_to_apache_parquet(**_parsed_args)\n\ + \n_output_serializers = [\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n\ + \ try:\n os.makedirs(os.path.dirname(output_file))\n except\ + \ OSError:\n pass\n with open(output_file, 'w') as f:\n \ + \ f.write(_output_serializers[idx](_outputs[idx]))\n" + image: python:3.7 + exec-xgboost-predict: + container: + args: + - --data + - '{{$.inputs.artifacts[''data''].path}}' + - --model + - '{{$.inputs.artifacts[''model''].path}}' + - '{"IfPresent": {"InputName": "label_column", "Then": ["--label-column", + "{{$.inputs.parameters[''label_column'']}}"]}}' + - --predictions + - '{{$.outputs.artifacts[''predictions''].path}}' + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 + -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' + --user) && "$0" "$@" + - python3 + - -u + - -c + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\ + \ file_path\n\ndef xgboost_predict(\n data_path, # Also supports LibSVM\n\ + \ model_path,\n predictions_path,\n label_column = None,\n):\n\ + \ '''Make predictions using a trained XGBoost model.\n\n Args:\n \ + \ data_path: Path for the feature data in CSV format.\n model_path:\ + \ Path for the trained model in binary XGBoost format.\n predictions_path:\ + \ Output path for the predictions.\n label_column: Column containing\ + \ the label data.\n\n Annotations:\n author: Alexey Volkov \n\ + \ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\ + \ import xgboost\n\n df = pandas.read_csv(\n data_path,\n \ + \ )\n\n if label_column is not None:\n df = df.drop(columns=[df.columns[label_column]])\n\ + \n testing_data = xgboost.DMatrix(\n data=df,\n )\n\n model\ + \ = xgboost.Booster(model_file=model_path)\n\n predictions = model.predict(testing_data)\n\ + \n Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)\n\ + \ numpy.savetxt(predictions_path, predictions)\n\nimport argparse\n_parser\ + \ = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions\ + \ using a trained XGBoost model.\\n\\n Args:\\n data_path: Path\ + \ for the feature data in CSV format.\\n model_path: Path for the\ + \ trained model in binary XGBoost format.\\n predictions_path: Output\ + \ path for the predictions.\\n label_column: Column containing the\ + \ label data.\\n\\n Annotations:\\n author: Alexey Volkov ')\n\ + _parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\ + model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --label-column\", dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--predictions\", dest=\"predictions_path\", type=_make_parent_dirs_and_return_path,\ + \ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = xgboost_predict(**_parsed_args)\n" + image: python:3.7 + exec-xgboost-predict-2: + container: + args: + - --data + - '{{$.inputs.artifacts[''data''].path}}' + - --model + - '{{$.inputs.artifacts[''model''].path}}' + - '{"IfPresent": {"InputName": "label_column_name", "Then": ["--label-column-name", + "{{$.inputs.parameters[''label_column_name'']}}"]}}' + - --predictions + - '{{$.outputs.artifacts[''predictions''].path}}' + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 + python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' + 'pandas==1.0.5' 'pyarrow==0.17.1' --user) && "$0" "$@" + - python3 + - -u + - -c + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\ + \ file_path\n\ndef xgboost_predict(\n data_path,\n model_path,\n \ + \ predictions_path,\n label_column_name = None,\n):\n '''Make predictions\ + \ using a trained XGBoost model.\n\n Args:\n data_path: Path for\ + \ the feature data in Apache Parquet format.\n model_path: Path for\ + \ the trained model in binary XGBoost format.\n predictions_path:\ + \ Output path for the predictions.\n label_column_name: Optional.\ + \ Name of the column containing the label data that is excluded during the\ + \ prediction.\n\n Annotations:\n author: Alexey Volkov \n\ + \ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\ + \ import xgboost\n\n # Loading data\n df = pandas.read_parquet(data_path)\n\ + \ if label_column_name:\n df = df.drop(columns=[label_column_name])\n\ + \n evaluation_data = xgboost.DMatrix(\n data=df,\n )\n\n \ + \ # Training\n model = xgboost.Booster(model_file=model_path)\n\n \ + \ predictions = model.predict(evaluation_data)\n\n Path(predictions_path).parent.mkdir(parents=True,\ + \ exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport\ + \ argparse\n_parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make\ + \ predictions using a trained XGBoost model.\\n\\n Args:\\n data_path:\ + \ Path for the feature data in Apache Parquet format.\\n model_path:\ + \ Path for the trained model in binary XGBoost format.\\n predictions_path:\ + \ Output path for the predictions.\\n label_column_name: Optional.\ + \ Name of the column containing the label data that is excluded during the\ + \ prediction.\\n\\n Annotations:\\n author: Alexey Volkov ')\n\ + _parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\ + model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --label-column-name\", dest=\"label_column_name\", type=str, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", dest=\"\ + predictions_path\", type=_make_parent_dirs_and_return_path, required=True,\ + \ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = xgboost_predict(**_parsed_args)\n" + image: python:3.7 + exec-xgboost-predict-3: + container: + args: + - --data + - '{{$.inputs.artifacts[''data''].path}}' + - --model + - '{{$.inputs.artifacts[''model''].path}}' + - '{"IfPresent": {"InputName": "label_column_name", "Then": ["--label-column-name", + "{{$.inputs.parameters[''label_column_name'']}}"]}}' + - --predictions + - '{{$.outputs.artifacts[''predictions''].path}}' + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 + python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' + 'pandas==1.0.5' 'pyarrow==0.17.1' --user) && "$0" "$@" + - python3 + - -u + - -c + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\ + \ file_path\n\ndef xgboost_predict(\n data_path,\n model_path,\n \ + \ predictions_path,\n label_column_name = None,\n):\n '''Make predictions\ + \ using a trained XGBoost model.\n\n Args:\n data_path: Path for\ + \ the feature data in Apache Parquet format.\n model_path: Path for\ + \ the trained model in binary XGBoost format.\n predictions_path:\ + \ Output path for the predictions.\n label_column_name: Optional.\ + \ Name of the column containing the label data that is excluded during the\ + \ prediction.\n\n Annotations:\n author: Alexey Volkov \n\ + \ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\ + \ import xgboost\n\n # Loading data\n df = pandas.read_parquet(data_path)\n\ + \ if label_column_name:\n df = df.drop(columns=[label_column_name])\n\ + \n evaluation_data = xgboost.DMatrix(\n data=df,\n )\n\n \ + \ # Training\n model = xgboost.Booster(model_file=model_path)\n\n \ + \ predictions = model.predict(evaluation_data)\n\n Path(predictions_path).parent.mkdir(parents=True,\ + \ exist_ok=True)\n numpy.savetxt(predictions_path, predictions)\n\nimport\ + \ argparse\n_parser = argparse.ArgumentParser(prog='Xgboost predict', description='Make\ + \ predictions using a trained XGBoost model.\\n\\n Args:\\n data_path:\ + \ Path for the feature data in Apache Parquet format.\\n model_path:\ + \ Path for the trained model in binary XGBoost format.\\n predictions_path:\ + \ Output path for the predictions.\\n label_column_name: Optional.\ + \ Name of the column containing the label data that is excluded during the\ + \ prediction.\\n\\n Annotations:\\n author: Alexey Volkov ')\n\ + _parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\ + model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --label-column-name\", dest=\"label_column_name\", type=str, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--predictions\", dest=\"\ + predictions_path\", type=_make_parent_dirs_and_return_path, required=True,\ + \ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = xgboost_predict(**_parsed_args)\n" + image: python:3.7 + exec-xgboost-predict-4: + container: + args: + - --data + - '{{$.inputs.artifacts[''data''].path}}' + - --model + - '{{$.inputs.artifacts[''model''].path}}' + - '{"IfPresent": {"InputName": "label_column", "Then": ["--label-column", + "{{$.inputs.parameters[''label_column'']}}"]}}' + - --predictions + - '{{$.outputs.artifacts[''predictions''].path}}' + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 + -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' + --user) && "$0" "$@" + - python3 + - -u + - -c + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\ + \ file_path\n\ndef xgboost_predict(\n data_path, # Also supports LibSVM\n\ + \ model_path,\n predictions_path,\n label_column = None,\n):\n\ + \ '''Make predictions using a trained XGBoost model.\n\n Args:\n \ + \ data_path: Path for the feature data in CSV format.\n model_path:\ + \ Path for the trained model in binary XGBoost format.\n predictions_path:\ + \ Output path for the predictions.\n label_column: Column containing\ + \ the label data.\n\n Annotations:\n author: Alexey Volkov \n\ + \ '''\n from pathlib import Path\n\n import numpy\n import pandas\n\ + \ import xgboost\n\n df = pandas.read_csv(\n data_path,\n \ + \ )\n\n if label_column is not None:\n df = df.drop(columns=[df.columns[label_column]])\n\ + \n testing_data = xgboost.DMatrix(\n data=df,\n )\n\n model\ + \ = xgboost.Booster(model_file=model_path)\n\n predictions = model.predict(testing_data)\n\ + \n Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)\n\ + \ numpy.savetxt(predictions_path, predictions)\n\nimport argparse\n_parser\ + \ = argparse.ArgumentParser(prog='Xgboost predict', description='Make predictions\ + \ using a trained XGBoost model.\\n\\n Args:\\n data_path: Path\ + \ for the feature data in CSV format.\\n model_path: Path for the\ + \ trained model in binary XGBoost format.\\n predictions_path: Output\ + \ path for the predictions.\\n label_column: Column containing the\ + \ label data.\\n\\n Annotations:\\n author: Alexey Volkov ')\n\ + _parser.add_argument(\"--data\", dest=\"data_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"\ + model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --label-column\", dest=\"label_column\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--predictions\", dest=\"predictions_path\", type=_make_parent_dirs_and_return_path,\ + \ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = xgboost_predict(**_parsed_args)\n" + image: python:3.7 + exec-xgboost-train: + container: + args: + - --training-data + - '{{$.inputs.artifacts[''training_data''].path}}' + - '{"IfPresent": {"InputName": "starting_model", "Then": ["--starting-model", + "{{$.inputs.artifacts[''starting_model''].path}}"]}}' + - '{"IfPresent": {"InputName": "label_column", "Then": ["--label-column", + "{{$.inputs.parameters[''label_column'']}}"]}}' + - '{"IfPresent": {"InputName": "num_iterations", "Then": ["--num-iterations", + "{{$.inputs.parameters[''num_iterations'']}}"]}}' + - '{"IfPresent": {"InputName": "booster_params", "Then": ["--booster-params", + "{{$.inputs.parameters[''booster_params'']}}"]}}' + - '{"IfPresent": {"InputName": "objective", "Then": ["--objective", "{{$.inputs.parameters[''objective'']}}"]}}' + - '{"IfPresent": {"InputName": "booster", "Then": ["--booster", "{{$.inputs.parameters[''booster'']}}"]}}' + - '{"IfPresent": {"InputName": "learning_rate", "Then": ["--learning-rate", + "{{$.inputs.parameters[''learning_rate'']}}"]}}' + - '{"IfPresent": {"InputName": "min_split_loss", "Then": ["--min-split-loss", + "{{$.inputs.parameters[''min_split_loss'']}}"]}}' + - '{"IfPresent": {"InputName": "max_depth", "Then": ["--max-depth", "{{$.inputs.parameters[''max_depth'']}}"]}}' + - --model + - '{{$.outputs.artifacts[''model''].path}}' + - --model-config + - '{{$.outputs.artifacts[''model_config''].path}}' + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'xgboost==1.1.1' 'pandas==1.0.5' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 + -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' 'pandas==1.0.5' + --user) && "$0" "$@" + - python3 + - -u + - -c + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\ + \ file_path\n\ndef xgboost_train(\n training_data_path, # Also supports\ + \ LibSVM\n model_path,\n model_config_path,\n starting_model_path\ + \ = None,\n\n label_column = 0,\n num_iterations = 10,\n booster_params\ + \ = None,\n\n # Booster parameters\n objective = 'reg:squarederror',\n\ + \ booster = 'gbtree',\n learning_rate = 0.3,\n min_split_loss =\ + \ 0,\n max_depth = 6,\n):\n '''Train an XGBoost model.\n\n Args:\n\ + \ training_data_path: Path for the training data in CSV format.\n\ + \ model_path: Output path for the trained model in binary XGBoost\ + \ format.\n model_config_path: Output path for the internal parameter\ + \ configuration of Booster as a JSON string.\n starting_model_path:\ + \ Path for the existing trained model to start from.\n label_column:\ + \ Column containing the label data.\n num_boost_rounds: Number of\ + \ boosting iterations.\n booster_params: Parameters for the booster.\ + \ See https://xgboost.readthedocs.io/en/latest/parameter.html\n objective:\ + \ The learning task and the corresponding learning objective.\n \ + \ See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n\ + \ The most common values are:\n \"reg:squarederror\"\ + \ - Regression with squared loss (default).\n \"reg:logistic\"\ + \ - Logistic regression.\n \"binary:logistic\" - Logistic regression\ + \ for binary classification, output probability.\n \"binary:logitraw\"\ + \ - Logistic regression for binary classification, output score before logistic\ + \ transformation\n \"rank:pairwise\" - Use LambdaMART to perform\ + \ pairwise ranking where the pairwise loss is minimized\n \"\ + rank:ndcg\" - Use LambdaMART to perform list-wise ranking where Normalized\ + \ Discounted Cumulative Gain (NDCG) is maximized\n\n Annotations:\n \ + \ author: Alexey Volkov \n '''\n \ + \ import pandas\n import xgboost\n\n df = pandas.read_csv(\n \ + \ training_data_path,\n )\n\n training_data = xgboost.DMatrix(\n\ + \ data=df.drop(columns=[df.columns[label_column]]),\n label=df[df.columns[label_column]],\n\ + \ )\n\n booster_params = booster_params or {}\n booster_params.setdefault('objective',\ + \ objective)\n booster_params.setdefault('booster', booster)\n booster_params.setdefault('learning_rate',\ + \ learning_rate)\n booster_params.setdefault('min_split_loss', min_split_loss)\n\ + \ booster_params.setdefault('max_depth', max_depth)\n\n starting_model\ + \ = None\n if starting_model_path:\n starting_model = xgboost.Booster(model_file=starting_model_path)\n\ + \n model = xgboost.train(\n params=booster_params,\n dtrain=training_data,\n\ + \ num_boost_round=num_iterations,\n xgb_model=starting_model\n\ + \ )\n\n # Saving the model in binary format\n model.save_model(model_path)\n\ + \n model_config_str = model.save_config()\n with open(model_config_path,\ + \ 'w') as model_config_file:\n model_config_file.write(model_config_str)\n\ + \nimport json\nimport argparse\n_parser = argparse.ArgumentParser(prog='Xgboost\ + \ train', description='Train an XGBoost model.\\n\\n Args:\\n \ + \ training_data_path: Path for the training data in CSV format.\\n \ + \ model_path: Output path for the trained model in binary XGBoost format.\\\ + n model_config_path: Output path for the internal parameter configuration\ + \ of Booster as a JSON string.\\n starting_model_path: Path for the\ + \ existing trained model to start from.\\n label_column: Column containing\ + \ the label data.\\n num_boost_rounds: Number of boosting iterations.\\\ + n booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\\\ + n objective: The learning task and the corresponding learning objective.\\\ + n See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\\\ + n The most common values are:\\n \"reg:squarederror\"\ + \ - Regression with squared loss (default).\\n \"reg:logistic\"\ + \ - Logistic regression.\\n \"binary:logistic\" - Logistic regression\ + \ for binary classification, output probability.\\n \"binary:logitraw\"\ + \ - Logistic regression for binary classification, output score before logistic\ + \ transformation\\n \"rank:pairwise\" - Use LambdaMART to perform\ + \ pairwise ranking where the pairwise loss is minimized\\n \"\ + rank:ndcg\" - Use LambdaMART to perform list-wise ranking where Normalized\ + \ Discounted Cumulative Gain (NDCG) is maximized\\n\\n Annotations:\\\ + n author: Alexey Volkov ')\n_parser.add_argument(\"\ + --training-data\", dest=\"training_data_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--starting-model\"\ + , dest=\"starting_model_path\", type=str, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--label-column\", dest=\"label_column\", type=int,\ + \ required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--num-iterations\"\ + , dest=\"num_iterations\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--booster-params\", dest=\"booster_params\", type=json.loads,\ + \ required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--objective\"\ + , dest=\"objective\", type=str, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--booster\", dest=\"booster\", type=str, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--learning-rate\",\ + \ dest=\"learning_rate\", type=float, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--min-split-loss\", dest=\"min_split_loss\", type=float,\ + \ required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--max-depth\"\ + , dest=\"max_depth\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--model\", dest=\"model_path\", type=_make_parent_dirs_and_return_path,\ + \ required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-config\"\ + , dest=\"model_config_path\", type=_make_parent_dirs_and_return_path, required=True,\ + \ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = xgboost_train(**_parsed_args)\n" + image: python:3.7 + exec-xgboost-train-2: + container: + args: + - --training-data + - '{{$.inputs.artifacts[''training_data''].path}}' + - --label-column-name + - '{{$.inputs.parameters[''label_column_name'']}}' + - '{"IfPresent": {"InputName": "starting_model", "Then": ["--starting-model", + "{{$.inputs.artifacts[''starting_model''].path}}"]}}' + - '{"IfPresent": {"InputName": "num_iterations", "Then": ["--num-iterations", + "{{$.inputs.parameters[''num_iterations'']}}"]}}' + - '{"IfPresent": {"InputName": "booster_params", "Then": ["--booster-params", + "{{$.inputs.parameters[''booster_params'']}}"]}}' + - '{"IfPresent": {"InputName": "objective", "Then": ["--objective", "{{$.inputs.parameters[''objective'']}}"]}}' + - '{"IfPresent": {"InputName": "booster", "Then": ["--booster", "{{$.inputs.parameters[''booster'']}}"]}}' + - '{"IfPresent": {"InputName": "learning_rate", "Then": ["--learning-rate", + "{{$.inputs.parameters[''learning_rate'']}}"]}}' + - '{"IfPresent": {"InputName": "min_split_loss", "Then": ["--min-split-loss", + "{{$.inputs.parameters[''min_split_loss'']}}"]}}' + - '{"IfPresent": {"InputName": "max_depth", "Then": ["--max-depth", "{{$.inputs.parameters[''max_depth'']}}"]}}' + - --model + - '{{$.outputs.artifacts[''model''].path}}' + - --model-config + - '{{$.outputs.artifacts[''model_config''].path}}' + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'xgboost==1.1.1' 'pandas==1.0.5' 'pyarrow==0.17.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 + python3 -m pip install --quiet --no-warn-script-location 'xgboost==1.1.1' + 'pandas==1.0.5' 'pyarrow==0.17.1' --user) && "$0" "$@" + - python3 + - -u + - -c + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n\ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return\ + \ file_path\n\ndef xgboost_train(\n training_data_path,\n model_path,\n\ + \ model_config_path,\n label_column_name,\n\n starting_model_path\ + \ = None,\n\n num_iterations = 10,\n booster_params = None,\n\n \ + \ # Booster parameters\n objective = 'reg:squarederror',\n booster\ + \ = 'gbtree',\n learning_rate = 0.3,\n min_split_loss = 0,\n max_depth\ + \ = 6,\n):\n '''Train an XGBoost model.\n\n Args:\n training_data_path:\ + \ Path for the training data in Apache Parquet format.\n model_path:\ + \ Output path for the trained model in binary XGBoost format.\n model_config_path:\ + \ Output path for the internal parameter configuration of Booster as a JSON\ + \ string.\n starting_model_path: Path for the existing trained model\ + \ to start from.\n label_column_name: Name of the column containing\ + \ the label data.\n num_boost_rounds: Number of boosting iterations.\n\ + \ booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\n\ + \ objective: The learning task and the corresponding learning objective.\n\ + \ See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\n\ + \ The most common values are:\n \"reg:squarederror\"\ + \ - Regression with squared loss (default).\n \"reg:logistic\"\ + \ - Logistic regression.\n \"binary:logistic\" - Logistic regression\ + \ for binary classification, output probability.\n \"binary:logitraw\"\ + \ - Logistic regression for binary classification, output score before logistic\ + \ transformation\n \"rank:pairwise\" - Use LambdaMART to perform\ + \ pairwise ranking where the pairwise loss is minimized\n \"\ + rank:ndcg\" - Use LambdaMART to perform list-wise ranking where Normalized\ + \ Discounted Cumulative Gain (NDCG) is maximized\n\n Annotations:\n \ + \ author: Alexey Volkov \n '''\n \ + \ import pandas\n import xgboost\n\n # Loading data\n df = pandas.read_parquet(training_data_path)\n\ + \ training_data = xgboost.DMatrix(\n data=df.drop(columns=[label_column_name]),\n\ + \ label=df[[label_column_name]],\n )\n # Training\n booster_params\ + \ = booster_params or {}\n booster_params.setdefault('objective', objective)\n\ + \ booster_params.setdefault('booster', booster)\n booster_params.setdefault('learning_rate',\ + \ learning_rate)\n booster_params.setdefault('min_split_loss', min_split_loss)\n\ + \ booster_params.setdefault('max_depth', max_depth)\n\n starting_model\ + \ = None\n if starting_model_path:\n starting_model = xgboost.Booster(model_file=starting_model_path)\n\ + \n model = xgboost.train(\n params=booster_params,\n dtrain=training_data,\n\ + \ num_boost_round=num_iterations,\n xgb_model=starting_model\n\ + \ )\n\n # Saving the model in binary format\n model.save_model(model_path)\n\ + \n model_config_str = model.save_config()\n with open(model_config_path,\ + \ 'w') as model_config_file:\n model_config_file.write(model_config_str)\n\ + \nimport json\nimport argparse\n_parser = argparse.ArgumentParser(prog='Xgboost\ + \ train', description='Train an XGBoost model.\\n\\n Args:\\n \ + \ training_data_path: Path for the training data in Apache Parquet format.\\\ + n model_path: Output path for the trained model in binary XGBoost\ + \ format.\\n model_config_path: Output path for the internal parameter\ + \ configuration of Booster as a JSON string.\\n starting_model_path:\ + \ Path for the existing trained model to start from.\\n label_column_name:\ + \ Name of the column containing the label data.\\n num_boost_rounds:\ + \ Number of boosting iterations.\\n booster_params: Parameters for\ + \ the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html\\\ + n objective: The learning task and the corresponding learning objective.\\\ + n See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters\\\ + n The most common values are:\\n \"reg:squarederror\"\ + \ - Regression with squared loss (default).\\n \"reg:logistic\"\ + \ - Logistic regression.\\n \"binary:logistic\" - Logistic regression\ + \ for binary classification, output probability.\\n \"binary:logitraw\"\ + \ - Logistic regression for binary classification, output score before logistic\ + \ transformation\\n \"rank:pairwise\" - Use LambdaMART to perform\ + \ pairwise ranking where the pairwise loss is minimized\\n \"\ + rank:ndcg\" - Use LambdaMART to perform list-wise ranking where Normalized\ + \ Discounted Cumulative Gain (NDCG) is maximized\\n\\n Annotations:\\\ + n author: Alexey Volkov ')\n_parser.add_argument(\"\ + --training-data\", dest=\"training_data_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--label-column-name\"\ + , dest=\"label_column_name\", type=str, required=True, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--starting-model\", dest=\"starting_model_path\"\ + , type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --num-iterations\", dest=\"num_iterations\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--booster-params\", dest=\"booster_params\", type=json.loads,\ + \ required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--objective\"\ + , dest=\"objective\", type=str, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--booster\", dest=\"booster\", type=str, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--learning-rate\",\ + \ dest=\"learning_rate\", type=float, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--min-split-loss\", dest=\"min_split_loss\", type=float,\ + \ required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--max-depth\"\ + , dest=\"max_depth\", type=int, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--model\", dest=\"model_path\", type=_make_parent_dirs_and_return_path,\ + \ required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-config\"\ + , dest=\"model_config_path\", type=_make_parent_dirs_and_return_path, required=True,\ + \ default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + \n_outputs = xgboost_train(**_parsed_args)\n" + image: python:3.7 +pipelineInfo: + name: xgboost-sample-pipeline +root: + dag: + tasks: + chicago-taxi-trips-dataset: + cachingOptions: + enableCache: true + componentRef: + name: comp-chicago-taxi-trips-dataset + inputs: + parameters: + limit: + runtimeValue: + constant: 10000.0 + select: + runtimeValue: + constant: tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total + where: + runtimeValue: + constant: trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp + < "2019-02-01" + taskInfo: + name: chicago-taxi-trips-dataset + convert-csv-to-apache-parquet: + cachingOptions: + enableCache: true + componentRef: + name: comp-convert-csv-to-apache-parquet + dependentTasks: + - chicago-taxi-trips-dataset + inputs: + artifacts: + data: + taskOutputArtifact: + outputArtifactKey: table + producerTask: chicago-taxi-trips-dataset + taskInfo: + name: convert-csv-to-apache-parquet + xgboost-predict: + cachingOptions: + enableCache: true + componentRef: + name: comp-xgboost-predict + dependentTasks: + - chicago-taxi-trips-dataset + - xgboost-train + inputs: + artifacts: + data: + taskOutputArtifact: + outputArtifactKey: table + producerTask: chicago-taxi-trips-dataset + model: + taskOutputArtifact: + outputArtifactKey: model + producerTask: xgboost-train + parameters: + label_column: + runtimeValue: + constant: 0.0 + taskInfo: + name: xgboost-predict + xgboost-predict-2: + cachingOptions: + enableCache: true + componentRef: + name: comp-xgboost-predict-2 + dependentTasks: + - convert-csv-to-apache-parquet + - xgboost-train-2 + inputs: + artifacts: + data: + taskOutputArtifact: + outputArtifactKey: output_data + producerTask: convert-csv-to-apache-parquet + model: + taskOutputArtifact: + outputArtifactKey: model + producerTask: xgboost-train-2 + parameters: + label_column_name: + runtimeValue: + constant: tips + taskInfo: + name: xgboost-predict-2 + xgboost-predict-3: + cachingOptions: + enableCache: true + componentRef: + name: comp-xgboost-predict-3 + dependentTasks: + - convert-csv-to-apache-parquet + - xgboost-train + inputs: + artifacts: + data: + taskOutputArtifact: + outputArtifactKey: output_data + producerTask: convert-csv-to-apache-parquet + model: + taskOutputArtifact: + outputArtifactKey: model + producerTask: xgboost-train + parameters: + label_column_name: + runtimeValue: + constant: tips + taskInfo: + name: xgboost-predict-3 + xgboost-predict-4: + cachingOptions: + enableCache: true + componentRef: + name: comp-xgboost-predict-4 + dependentTasks: + - chicago-taxi-trips-dataset + - xgboost-train-2 + inputs: + artifacts: + data: + taskOutputArtifact: + outputArtifactKey: table + producerTask: chicago-taxi-trips-dataset + model: + taskOutputArtifact: + outputArtifactKey: model + producerTask: xgboost-train-2 + parameters: + label_column: + runtimeValue: + constant: 0.0 + taskInfo: + name: xgboost-predict-4 + xgboost-train: + cachingOptions: + enableCache: true + componentRef: + name: comp-xgboost-train + dependentTasks: + - chicago-taxi-trips-dataset + inputs: + artifacts: + training_data: + taskOutputArtifact: + outputArtifactKey: table + producerTask: chicago-taxi-trips-dataset + parameters: + label_column: + runtimeValue: + constant: 0.0 + num_iterations: + runtimeValue: + constant: 200.0 + objective: + runtimeValue: + constant: reg:squarederror + taskInfo: + name: xgboost-train + xgboost-train-2: + cachingOptions: + enableCache: true + componentRef: + name: comp-xgboost-train-2 + dependentTasks: + - convert-csv-to-apache-parquet + inputs: + artifacts: + training_data: + taskOutputArtifact: + outputArtifactKey: output_data + producerTask: convert-csv-to-apache-parquet + parameters: + label_column_name: + runtimeValue: + constant: tips + num_iterations: + runtimeValue: + constant: 200.0 + objective: + runtimeValue: + constant: reg:squarederror + taskInfo: + name: xgboost-train-2 +schemaVersion: 2.1.0 +sdkVersion: kfp-2.7.0