From 890537b5fddb1d806388a832b420f49ed20d1d0c Mon Sep 17 00:00:00 2001
From: Brandon Mayer
Date: Wed, 13 Dec 2023 09:04:07 -0800
Subject: [PATCH] Cleanup old sampler examples.

PiperOrigin-RevId: 590617039
---
 examples/arxiv/Makefile             |  40 --------
 examples/arxiv/sample_dataflow.sh   |  98 ------------------
 examples/arxiv/sampling_spec.pbtxt  |  18 ----
 examples/mag/download_and_format.sh |  39 -------
 examples/mag/sample_dataflow.sh     | 104 -------------------
 examples/mag/sampling_spec.pbtxt    |  46 ---------
 examples/mag/schema.pbtxt           | 152 ----------------------------
 7 files changed, 497 deletions(-)
 delete mode 100755 examples/arxiv/Makefile
 delete mode 100755 examples/arxiv/sample_dataflow.sh
 delete mode 100644 examples/arxiv/sampling_spec.pbtxt
 delete mode 100755 examples/mag/download_and_format.sh
 delete mode 100755 examples/mag/sample_dataflow.sh
 delete mode 100644 examples/mag/sampling_spec.pbtxt
 delete mode 100644 examples/mag/schema.pbtxt

diff --git a/examples/arxiv/Makefile b/examples/arxiv/Makefile
deleted file mode 100755
index ea7875d9..00000000
--- a/examples/arxiv/Makefile
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env make
-#
-# Makefile to run the OGB conversion to Unigraph tool on some of its datasets.
-#
-
-# Python interpreter selection.
-PYVERSION = 3
-PYTHON = python$(PYVERSION)
-
-# Selected dataset.
-DATASET = ogbn-arxiv
-ROOT=/tmp/data/$(DATASET)
-
-# Convert the selected dataset to Unigraph format.
-graph:
-	tfgnn_convert_ogb_dataset \
-	--dataset=$(DATASET) \
-	--ogb_datasets_dir=/tmp/ogb-preprocessed \
-	--output=$(ROOT)/graph
-
-sample:
-	tfgnn_graph_sampler \
-	--alsologtostderr \
-	--graph_schema=$(ROOT)/graph/schema.pbtxt \
-	--sampling_spec=$(PWD)/sampling_spec.pbtxt \
-	--output_samples=$(ROOT)/training/data@20
-
-stats:
-	tfgnn_sampled_stats \
-	--alsologtostderr \
-	--graph_schema=$(ROOT)/graph/schema.pbtxt \
-	--input_pattern=$(ROOT)/training/data-?????-of-00020 \
-	--input_format=tfrecord \
-	--output_filename=$(ROOT)/training/stats.pbtxt
-
-print:
-	tfgnn_print_training_data \
-	--graph_schema=$(ROOT)/graph/schema.pbtxt \
-	--examples=$(ROOT)/training/data-?????-of-00020 \
-	--file_format=tfrecord
diff --git a/examples/arxiv/sample_dataflow.sh b/examples/arxiv/sample_dataflow.sh
deleted file mode 100755
index 2aedc119..00000000
--- a/examples/arxiv/sample_dataflow.sh
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/bin/bash
-# Copyright 2021 The TensorFlow GNN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Dataflow workers must have access to the tfgnn Docker container.
-# Clients must have a local copy of the tensorflow_gnn image to launch the
-# Dataflow job from within the container.
-#
-# Clients must also have access to the target GCP project's application
-# default credentials in their home directory.
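-# (For reference, application default credentials can typically be created
-# with `gcloud auth application-default login`, which writes
-# ~/.config/gcloud/application_default_credentials.json, the file mounted
-# into the container by the `docker run` invocation below.)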
-#
-# To run sampling as a Dataflow job, clients must have run the data generation
-# tool in tensorflow_gnn/examples/Makefile and copied the output folder to GCS.
-# An end-to-end example of building the Docker image, pushing it to GCR,
-# copying the data to GCS, and running sampling:
-#
-#   pushd /[path-to]/tensorflow_gnn
-#   docker build . -t tfgnn:latest -t gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#   docker push gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#
-#   docker run -v /tmp:/tmp \
-#     -it --entrypoint make tfgnn:latest -C /app/examples/arxiv graph
-#
-#   gsutil cp -r /tmp/data/ogbn-arxiv/graph \
-#     gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/examples/arxiv
-#
-#   ./[path-to]/tensorflow_gnn/examples/arxiv/sample_dataflow.sh
-#
-MACHINE_TYPE="n1-standard-1"
-MAX_NUM_WORKERS=1000
-
-# The sampling spec is included in the container, read by the controller, and
-# sent to the remote Dataflow service.
-SAMPLING_SPEC="/app/examples/arxiv/sampling_spec.pbtxt"
-
-# The values below are just suggestions; feel free to change them.
-GOOGLE_CLOUD_PROJECT="[FILL-ME-IN]"
-
-# Make sure you have already created the GCS bucket, with something like:
-# `gsutil mb gs://${GOOGLE_CLOUD_PROJECT}`.
-EXAMPLE_ARTIFACT_DIRECTORY="gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/examples/arxiv"
-
-GRAPH_SCHEMA="${EXAMPLE_ARTIFACT_DIRECTORY}/schema.pbtxt"
-
-# Required by Dataflow.
-TEMP_LOCATION="${EXAMPLE_ARTIFACT_DIRECTORY}/tmp"
-
-# (Sharded) output sample TFRecord filespec.
-OUTPUT_SAMPLES="${EXAMPLE_ARTIFACT_DIRECTORY}/samples@20"
-
-# This should be the path to a Docker image with TFGNN installed, pinned to
-# the image version the user is running this script with. A valid example
-# using Google Container Registry:
-# `gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest`.
-REMOTE_WORKER_CONTAINER="[FILL-ME-IN]"
-
-# It is useful to define a private GCP VPC that does not allocate external IP
-# addresses, so that worker machines do not impact quota limits.
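-# (Hypothetical example: such a network can be created with something like
-# `gcloud compute networks create [VPC-NAME] --subnet-mode=auto`, and the
-# resulting name goes into GCP_VPC_NAME below.)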
-GCP_VPC_NAME="[FILL-ME-IN]"
-
-JOB_NAME="tensorflow-gnn-arxiv-sampling"
-
-# Placeholder for Google-internal script config
-
-docker run -v ~/.config/gcloud:/root/.config/gcloud \
-  -e "GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT}" \
-  -e "GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json" \
-  --entrypoint tfgnn_graph_sampler \
-  tfgnn:latest \
-  --graph_schema="${GRAPH_SCHEMA}" \
-  --sampling_spec="${SAMPLING_SPEC}" \
-  --output_samples="${OUTPUT_SAMPLES}" \
-  --runner=DataflowRunner \
-  --project="${GOOGLE_CLOUD_PROJECT}" \
-  --region=us-east1 \
-  --max_num_workers="${MAX_NUM_WORKERS}" \
-  --temp_location="${TEMP_LOCATION}" \
-  --job_name="${JOB_NAME}" \
-  --no_use_public_ips \
-  --network="${GCP_VPC_NAME}" \
-  --worker_machine_type="${MACHINE_TYPE}" \
-  --experiments=use_monitoring_state_manager \
-  --experiments=enable_execution_details_collection \
-  --experiments=use_runner_v2 \
-  --worker_harness_container_image="${REMOTE_WORKER_CONTAINER}" \
-  --alsologtostderr
diff --git a/examples/arxiv/sampling_spec.pbtxt b/examples/arxiv/sampling_spec.pbtxt
deleted file mode 100644
index 8fd7558e..00000000
--- a/examples/arxiv/sampling_spec.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-seed_op: <
-  op_name: 'seed'
-  node_set_name: 'nodes'
->
-sampling_ops: <
-  op_name: 'hop-1'
-  input_op_names: [ 'seed' ]
-  strategy: TOP_K
-  sample_size: 8
-  edge_set_name: 'edges'
->
-sampling_ops: <
-  op_name: 'hop-2'
-  input_op_names: [ 'hop-1' ]
-  strategy: TOP_K
-  sample_size: 3
-  edge_set_name: 'edges'
->
diff --git a/examples/mag/download_and_format.sh b/examples/mag/download_and_format.sh
deleted file mode 100755
index 11c67182..00000000
--- a/examples/mag/download_and_format.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Copyright 2021 The TensorFlow GNN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# Simple script that uses the TFGNN Docker image to download and format
-# OGBN-MAG. The dataset download is approximately 0.4 GB and expands to more
-# on disk. To run the batch sampler on this dataset using Dataflow, the graph
-# artifacts must be pushed to GCS. Assuming the user is authenticated to work
-# with a GCP project with a bucket named ${BUCKET}, this can be done with a
-# command akin to the following (this copies approximately 2 GB of data to
-# GCS):
-#
-#   gsutil -m cp -r ${OUTPUT_PATH} gs://${BUCKET}/tfgnn/examples/ogbn-mag
-#
-DATASET="ogbn-mag"
-DOWNLOAD_PATH="/tmp/ogb-preprocessed"
-OUTPUT_PATH="/tmp/data/${DATASET}/graph"
-
-docker run -it --entrypoint tfgnn_convert_ogb_dataset \
-  -v /tmp:/tmp \
-  tfgnn:latest \
-  --dataset="${DATASET}" \
-  --ogb_datasets_dir="${DOWNLOAD_PATH}" \
-  --output="${OUTPUT_PATH}"
-
-sudo chown -R ${USER} ${OUTPUT_PATH}
-
-# Copy over the extended schema with the "written" relationship.
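-# (The "written" edge set is the reverse of "writes": per the schema.pbtxt at
-# the end of this patch, it reuses the same edge table,
-# edges-writes.tfrecords@172, with "paper" as source and "author" as target.)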
-cp $(dirname $0)/schema.pbtxt ${OUTPUT_PATH}
diff --git a/examples/mag/sample_dataflow.sh b/examples/mag/sample_dataflow.sh
deleted file mode 100755
index 3da2606a..00000000
--- a/examples/mag/sample_dataflow.sh
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-# Copyright 2021 The TensorFlow GNN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Dataflow workers must have access to the tfgnn Docker container.
-# Clients must have a local copy of the tensorflow_gnn image to launch the
-# Dataflow job from within the container.
-#
-# Clients must also have access to the target GCP project's application
-# default credentials in their home directory.
-#
-# To run sampling as a Dataflow job, clients must have run the data generation
-# script in examples/mag/download_and_format.sh and copied the output folder
-# to GCS. An end-to-end example of building the Docker image, pushing it to
-# GCR, copying the data to GCS, and running sampling:
-#
-#   docker build [path-to]/tensorflow_gnn -t tfgnn:latest \
-#     -t gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#   docker push gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#
-#   ./[path-to]/tensorflow_gnn/examples/mag/download_and_format.sh
-#
-#   gsutil -m cp -r /tmp/data/ogbn-mag/graph/* \
-#     gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/ogbn-mag/${TIMESTAMP}
-#
-#   ./[path-to]/tensorflow_gnn/examples/mag/sample_dataflow.sh
-#
-MAX_NUM_WORKERS=1000
-
-# The sampling spec is included in the container, read by the controller, and
-# sent to the remote Dataflow service.
-SAMPLING_SPEC="/app/examples/mag/sampling_spec.pbtxt"
-
-# The values below are just suggestions; feel free to change them.
-GOOGLE_CLOUD_PROJECT="[FILL-ME-IN]"
-GOOGLE_CLOUD_COMPUTE_REGION="[FILL-ME-IN]"
-# Make sure you have already created the GCS bucket, with something like:
-# `gsutil mb gs://${GOOGLE_CLOUD_PROJECT}`.
-TIMESTAMP="$(date +"%Y-%m-%d-%H-%M-%S")"
-EXAMPLE_ARTIFACT_DIRECTORY="gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/ogbn-mag/${TIMESTAMP}"
-
-# The sampler expects the graph artifacts to be in the same directory as the
-# schema.
-GRAPH_SCHEMA="${EXAMPLE_ARTIFACT_DIRECTORY}/schema.pbtxt"
-
-# Required by Dataflow.
-TEMP_LOCATION="${EXAMPLE_ARTIFACT_DIRECTORY}/tmp"
-
-# (Sharded) output sample TFRecord filespec.
-OUTPUT_SAMPLES="${EXAMPLE_ARTIFACT_DIRECTORY}/samples@100"
-
-# This should be the path to a Docker image with TFGNN installed, pinned to
-# the image version the user is running this script with. A valid example
-# using Google Container Registry:
-# `gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest`.
-REMOTE_WORKER_CONTAINER="[FILL-ME-IN]"
-
-# It is useful to define a private GCP VPC that does not allocate external IP
-# addresses, so that worker machines do not impact quota limits.
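-# (Assumed operational note: with --no_use_public_ips below, the subnet used
-# by the workers typically needs Private Google Access enabled so they can
-# still reach Google APIs; check your project's network configuration.)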
-GCP_VPC_NAME="[FILL-ME-IN]"
-
-# Threads per Dataflow worker harness, passed to the sampler invocation below.
-# (The value 4 is only an assumed example; tune it to the worker machine type
-# and available memory.)
-NUMBER_OF_WORKER_HARNESS_THREADS=4
-
-EDGE_AGGREGATION_METHOD="edge"
-
-JOB_NAME="tfgnn-mag-sampling-${EDGE_AGGREGATION_METHOD}-${TIMESTAMP}"
-
-# Placeholder for Google-internal script config
-
-docker run -v ~/.config/gcloud:/root/.config/gcloud \
-  -e "GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT}" \
-  -e "GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json" \
-  --entrypoint tfgnn_graph_sampler \
-  tfgnn:latest \
-  --graph_schema="${GRAPH_SCHEMA}" \
-  --sampling_spec="${SAMPLING_SPEC}" \
-  --output_samples="${OUTPUT_SAMPLES}" \
-  --edge_aggregation_method="${EDGE_AGGREGATION_METHOD}" \
-  --runner=DataflowRunner \
-  --project="${GOOGLE_CLOUD_PROJECT}" \
-  --region="${GOOGLE_CLOUD_COMPUTE_REGION}" \
-  --max_num_workers="${MAX_NUM_WORKERS}" \
-  --temp_location="${TEMP_LOCATION}" \
-  --job_name="${JOB_NAME}" \
-  --no_use_public_ips \
-  --network="${GCP_VPC_NAME}" \
-  --dataflow_service_options=enable_prime \
-  --experiments=use_monitoring_state_manager \
-  --experiments=enable_execution_details_collection \
-  --experiments=use_runner_v2 \
-  --worker_harness_container_image="${REMOTE_WORKER_CONTAINER}" \
-  --number_of_worker_harness_threads="${NUMBER_OF_WORKER_HARNESS_THREADS}" \
-  --alsologtostderr
diff --git a/examples/mag/sampling_spec.pbtxt b/examples/mag/sampling_spec.pbtxt
deleted file mode 100644
index 2b67578a..00000000
--- a/examples/mag/sampling_spec.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-seed_op <
-  op_name: "seed"
-  node_set_name: "paper"
->
-sampling_ops <
-  op_name: "seed->paper"
-  input_op_names: "seed"
-  edge_set_name: "cites"
-  sample_size: 32
-  # Sample edges uniformly at random, because that works without any further
-  # information. We could use TOP_K or RANDOM_WEIGHTED if we had put a
-  # "#weight" column into the edge set's input table.
- strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "paper->author" - input_op_names: "seed" - input_op_names: "seed->paper" - edge_set_name: "written" - sample_size: 8 - strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "author->paper" - input_op_names: "paper->author" - edge_set_name: "writes" - sample_size: 16 - strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "author->institution" - input_op_names: "paper->author" - edge_set_name: "affiliated_with" - sample_size: 16 - strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "paper->field_of_study" - input_op_names: "seed" - input_op_names: "seed->paper" - input_op_names: "author->paper" - edge_set_name: "has_topic" - sample_size: 16 - strategy: RANDOM_UNIFORM -> - diff --git a/examples/mag/schema.pbtxt b/examples/mag/schema.pbtxt deleted file mode 100644 index 049ce373..00000000 --- a/examples/mag/schema.pbtxt +++ /dev/null @@ -1,152 +0,0 @@ -node_sets { - key: "author" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - metadata { - filename: "nodes-author.tfrecords@15" - cardinality: 1134649 - } - } -} -node_sets { - key: "field_of_study" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - metadata { - filename: "nodes-field_of_study.tfrecords@2" - cardinality: 59965 - } - } -} -node_sets { - key: "institution" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - metadata { - filename: "nodes-institution.tfrecords" - cardinality: 8740 - } - } -} -node_sets { - key: "paper" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - features { - key: "feat" - value { - dtype: DT_FLOAT - shape { - dim { - size: 128 - } - } - } - } - features { - key: "labels" - value { - dtype: DT_INT64 - shape { - dim { - size: 1 - } - } - } - } - features { - key: "year" - value { - dtype: DT_INT64 - shape { - dim { - size: 1 - } - } - } - } - metadata { - filename: "nodes-paper.tfrecords@397" - cardinality: 736389 - } - } -} -edge_sets { - key: "affiliated_with" - value { - source: "author" - target: "institution" - metadata { - filename: "edges-affiliated_with.tfrecords@30" - cardinality: 1043998 - } - } -} -edge_sets { - key: "cites" - value { - source: "paper" - target: "paper" - metadata { - filename: "edges-cites.tfrecords@120" - cardinality: 5416271 - } - } -} -edge_sets { - key: "has_topic" - value { - source: "paper" - target: "field_of_study" - metadata { - filename: "edges-has_topic.tfrecords@226" - cardinality: 7505078 - } - } -} -edge_sets { - key: "writes" - value { - source: "author" - target: "paper" - metadata { - filename: "edges-writes.tfrecords@172" - cardinality: 7145660 - } - } -} -edge_sets { - key: "written" - value { - source: "paper" - target: "author" - metadata { - filename: "edges-writes.tfrecords@172" - cardinality: 7145660 - extra { - key: "edge_type" - value: "reversed" - } - } - } -}