From 890537b5fddb1d806388a832b420f49ed20d1d0c Mon Sep 17 00:00:00 2001
From: Brandon Mayer
Date: Wed, 13 Dec 2023 09:04:07 -0800
Subject: [PATCH] Cleanup old sampler examples.

PiperOrigin-RevId: 590617039
---
 examples/arxiv/Makefile             |  40 --------
 examples/arxiv/sample_dataflow.sh   |  98 ------------------
 examples/arxiv/sampling_spec.pbtxt  |  18 ----
 examples/mag/download_and_format.sh |  39 -------
 examples/mag/sample_dataflow.sh     | 104 -------------------
 examples/mag/sampling_spec.pbtxt    |  46 ---------
 examples/mag/schema.pbtxt           | 152 ----------------------------
 7 files changed, 497 deletions(-)
 delete mode 100755 examples/arxiv/Makefile
 delete mode 100755 examples/arxiv/sample_dataflow.sh
 delete mode 100644 examples/arxiv/sampling_spec.pbtxt
 delete mode 100755 examples/mag/download_and_format.sh
 delete mode 100755 examples/mag/sample_dataflow.sh
 delete mode 100644 examples/mag/sampling_spec.pbtxt
 delete mode 100644 examples/mag/schema.pbtxt

diff --git a/examples/arxiv/Makefile b/examples/arxiv/Makefile
deleted file mode 100755
index ea7875d9..00000000
--- a/examples/arxiv/Makefile
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env make
-#
-# Makefile to run the OGB conversion to Unigraph tool on some of its datasets.
-#
-
-# Python interpreter selection.
-PYVERSION = 3
-PYTHON = python$(PYVERSION)
-
-# Selected dataset.
-DATASET = ogbn-arxiv
-ROOT=/tmp/data/$(DATASET)
-
-# Convert the selected dataset to Unigraph format.
-graph:
-	tfgnn_convert_ogb_dataset \
-	--dataset=$(DATASET) \
-	--ogb_datasets_dir=/tmp/ogb-preprocessed \
-	--output=$(ROOT)/graph
-
-sample:
-	tfgnn_graph_sampler \
-	--alsologtostderr \
-	--graph_schema=$(ROOT)/graph/schema.pbtxt \
-	--sampling_spec=$(PWD)/sampling_spec.pbtxt \
-	--output_samples=$(ROOT)/training/data@20
-
-stats:
-	tfgnn_sampled_stats \
-	--alsologtostderr \
-	--graph_schema=$(ROOT)/graph/schema.pbtxt \
-	--input_pattern=$(ROOT)/training/data-?????-of-00020 \
-	--input_format=tfrecord \
-	--output_filename=$(ROOT)/training/stats.pbtxt
-
-print:
-	tfgnn_print_training_data \
-	--graph_schema=$(ROOT)/graph/schema.pbtxt \
-	--examples=$(ROOT)/training/data-?????-of-00020 \
-	--file_format=tfrecord
diff --git a/examples/arxiv/sample_dataflow.sh b/examples/arxiv/sample_dataflow.sh
deleted file mode 100755
index 2aedc119..00000000
--- a/examples/arxiv/sample_dataflow.sh
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/bin/bash
-# Copyright 2021 The TensorFlow GNN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Dataflow workers must have access to the tfgnn Docker container.
-# Clients must have a local copy of the tensorflow_gnn image to launch the
-# Dataflow job from within the container.
-#
-# Clients must also have access to the target GCP project's application
-# default credentials in their home directory.
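-# (For reference, application default credentials can typically be created
-# with `gcloud auth application-default login`, which writes
-# ~/.config/gcloud/application_default_credentials.json, the file mounted
-# into the container by the `docker run` invocation below.)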
-#
-# To run sampling as a Dataflow job, clients must have run the data generation
-# tool in tensorflow_gnn/examples/Makefile and copied the output folder to GCS.
-# An end-to-end example of building the Docker image, pushing it to GCR,
-# copying the data to GCS, and running sampling:
-#
-#   pushd /[path-to]/tensorflow_gnn
-#   docker build . -t tfgnn:latest -t gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#   docker push gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#
-#   docker run -v /tmp:/tmp \
-#     -it --entrypoint make tfgnn:latest -C /app/examples/arxiv graph
-#
-#   gsutil cp -r /tmp/data/ogbn-arxiv/graph \
-#     gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/examples/arxiv
-#
-#   ./[path-to]/tensorflow_gnn/examples/arxiv/sample_dataflow.sh
-#
-MACHINE_TYPE="n1-standard-1"
-MAX_NUM_WORKERS=1000
-
-# The sampling spec is included in the container, read by the controller, and
-# sent to the remote Dataflow service.
-SAMPLING_SPEC="/app/examples/arxiv/sampling_spec.pbtxt"
-
-# The values below are just suggestions; feel free to change them.
-GOOGLE_CLOUD_PROJECT="[FILL-ME-IN]"
-
-# Make sure you have already created the GCS bucket, with something like:
-# `gsutil mb gs://${GOOGLE_CLOUD_PROJECT}`.
-EXAMPLE_ARTIFACT_DIRECTORY="gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/examples/arxiv"
-
-GRAPH_SCHEMA="${EXAMPLE_ARTIFACT_DIRECTORY}/schema.pbtxt"
-
-# Required by Dataflow.
-TEMP_LOCATION="${EXAMPLE_ARTIFACT_DIRECTORY}/tmp"
-
-# (Sharded) output sample TFRecord filespec.
-OUTPUT_SAMPLES="${EXAMPLE_ARTIFACT_DIRECTORY}/samples@20"
-
-# This should be the path to a Docker image with TFGNN installed, pinned to
-# the image version the user is running this script with. A valid example
-# using Google Container Registry:
-# `gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest`.
-REMOTE_WORKER_CONTAINER="[FILL-ME-IN]"
-
-# It is useful to define a private GCP VPC that does not allocate external IP
-# addresses, so that worker machines do not impact quota limits.
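-# (Hypothetical example: such a network can be created with something like
-# `gcloud compute networks create [VPC-NAME] --subnet-mode=auto`, and the
-# resulting name goes into GCP_VPC_NAME below.)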
-GCP_VPC_NAME="[FILL-ME-IN]"
-
-JOB_NAME="tensorflow-gnn-arxiv-sampling"
-
-# Placeholder for Google-internal script config
-
-docker run -v ~/.config/gcloud:/root/.config/gcloud \
-  -e "GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT}" \
-  -e "GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json" \
-  --entrypoint tfgnn_graph_sampler \
-  tfgnn:latest \
-  --graph_schema="${GRAPH_SCHEMA}" \
-  --sampling_spec="${SAMPLING_SPEC}" \
-  --output_samples="${OUTPUT_SAMPLES}" \
-  --runner=DataflowRunner \
-  --project="${GOOGLE_CLOUD_PROJECT}" \
-  --region=us-east1 \
-  --max_num_workers="${MAX_NUM_WORKERS}" \
-  --temp_location="${TEMP_LOCATION}" \
-  --job_name="${JOB_NAME}" \
-  --no_use_public_ips \
-  --network="${GCP_VPC_NAME}" \
-  --worker_machine_type="${MACHINE_TYPE}" \
-  --experiments=use_monitoring_state_manager \
-  --experiments=enable_execution_details_collection \
-  --experiments=use_runner_v2 \
-  --worker_harness_container_image="${REMOTE_WORKER_CONTAINER}" \
-  --alsologtostderr
diff --git a/examples/arxiv/sampling_spec.pbtxt b/examples/arxiv/sampling_spec.pbtxt
deleted file mode 100644
index 8fd7558e..00000000
--- a/examples/arxiv/sampling_spec.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-seed_op: <
-  op_name: 'seed'
-  node_set_name: 'nodes'
->
-sampling_ops: <
-  op_name: 'hop-1'
-  input_op_names: [ 'seed' ]
-  strategy: TOP_K
-  sample_size: 8
-  edge_set_name: 'edges'
->
-sampling_ops: <
-  op_name: 'hop-2'
-  input_op_names: [ 'hop-1' ]
-  strategy: TOP_K
-  sample_size: 3
-  edge_set_name: 'edges'
->
diff --git a/examples/mag/download_and_format.sh b/examples/mag/download_and_format.sh
deleted file mode 100755
index 11c67182..00000000
--- a/examples/mag/download_and_format.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Copyright 2021 The TensorFlow GNN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# Simple script that uses the TFGNN Docker image to download and format
-# OGBN-MAG. The dataset download is approximately 0.4 GB and expands to more
-# on disk. To run the batch sampler on this dataset using Dataflow, the graph
-# artifacts must be pushed to GCS. Assuming the user is authenticated to work
-# with a GCP project with a bucket named ${BUCKET}, this can be done with a
-# command akin to the following (this copies approximately 2 GB of data to
-# GCS):
-#
-#   gsutil -m cp -r ${OUTPUT_PATH} gs://${BUCKET}/tfgnn/examples/ogbn-mag
-#
-DATASET="ogbn-mag"
-DOWNLOAD_PATH="/tmp/ogb-preprocessed"
-OUTPUT_PATH="/tmp/data/${DATASET}/graph"
-
-docker run -it --entrypoint tfgnn_convert_ogb_dataset \
-  -v /tmp:/tmp \
-  tfgnn:latest \
-  --dataset="${DATASET}" \
-  --ogb_datasets_dir="${DOWNLOAD_PATH}" \
-  --output="${OUTPUT_PATH}"
-
-sudo chown -R ${USER} ${OUTPUT_PATH}
-
-# Copy over the extended schema with the "written" relationship.
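-# (The "written" edge set is the reverse of "writes": per the schema.pbtxt at
-# the end of this patch, it reuses the same edge table,
-# edges-writes.tfrecords@172, with "paper" as source and "author" as target.)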
-cp $(dirname $0)/schema.pbtxt ${OUTPUT_PATH}
diff --git a/examples/mag/sample_dataflow.sh b/examples/mag/sample_dataflow.sh
deleted file mode 100755
index 3da2606a..00000000
--- a/examples/mag/sample_dataflow.sh
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-# Copyright 2021 The TensorFlow GNN Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Dataflow workers must have access to the tfgnn Docker container.
-# Clients must have a local copy of the tensorflow_gnn image to launch the
-# Dataflow job from within the container.
-#
-# Clients must also have access to the target GCP project's application
-# default credentials in their home directory.
-#
-# To run sampling as a Dataflow job, clients must have run the data generation
-# script in examples/mag/download_and_format.sh and copied the output folder
-# to GCS. An end-to-end example of building the Docker image, pushing it to
-# GCR, copying the data to GCS, and running sampling:
-#
-#   docker build [path-to]/tensorflow_gnn -t tfgnn:latest \
-#     -t gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#   docker push gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest
-#
-#   ./[path-to]/tensorflow_gnn/examples/mag/download_and_format.sh
-#
-#   gsutil -m cp -r /tmp/data/ogbn-mag/graph/* \
-#     gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/ogbn-mag/${TIMESTAMP}
-#
-#   ./[path-to]/tensorflow_gnn/examples/mag/sample_dataflow.sh
-#
-MAX_NUM_WORKERS=1000
-
-# The sampling spec is included in the container, read by the controller, and
-# sent to the remote Dataflow service.
-SAMPLING_SPEC="/app/examples/mag/sampling_spec.pbtxt"
-
-# The values below are just suggestions; feel free to change them.
-GOOGLE_CLOUD_PROJECT="[FILL-ME-IN]"
-GOOGLE_CLOUD_COMPUTE_REGION="[FILL-ME-IN]"
-# Make sure you have already created the GCS bucket, with something like:
-# `gsutil mb gs://${GOOGLE_CLOUD_PROJECT}`.
-TIMESTAMP="$(date +"%Y-%m-%d-%H-%M-%S")"
-EXAMPLE_ARTIFACT_DIRECTORY="gs://${GOOGLE_CLOUD_PROJECT}/tfgnn/ogbn-mag/${TIMESTAMP}"
-
-# The sampler expects the graph artifacts to be in the same directory as the
-# schema.
-GRAPH_SCHEMA="${EXAMPLE_ARTIFACT_DIRECTORY}/schema.pbtxt"
-
-# Required by Dataflow.
-TEMP_LOCATION="${EXAMPLE_ARTIFACT_DIRECTORY}/tmp"
-
-# (Sharded) output sample TFRecord filespec.
-OUTPUT_SAMPLES="${EXAMPLE_ARTIFACT_DIRECTORY}/samples@100"
-
-# This should be the path to a Docker image with TFGNN installed, pinned to
-# the image version the user is running this script with. A valid example
-# using Google Container Registry:
-# `gcr.io/${GOOGLE_CLOUD_PROJECT}/tfgnn:latest`.
-REMOTE_WORKER_CONTAINER="[FILL-ME-IN]"
-
-# It is useful to define a private GCP VPC that does not allocate external IP
-# addresses, so that worker machines do not impact quota limits.
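-# (Assumed operational note: with --no_use_public_ips below, the subnet used
-# by the workers typically needs Private Google Access enabled so they can
-# still reach Google APIs; check your project's network configuration.)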
-GCP_VPC_NAME="[FILL-ME-IN]"
-
-# Threads per Dataflow worker harness, passed to the sampler invocation below.
-# (The value 4 is only an assumed example; tune it to the worker machine type
-# and available memory.)
-NUMBER_OF_WORKER_HARNESS_THREADS=4
-
-EDGE_AGGREGATION_METHOD="edge"
-
-JOB_NAME="tfgnn-mag-sampling-${EDGE_AGGREGATION_METHOD}-${TIMESTAMP}"
-
-# Placeholder for Google-internal script config
-
-docker run -v ~/.config/gcloud:/root/.config/gcloud \
-  -e "GOOGLE_CLOUD_PROJECT=${GOOGLE_CLOUD_PROJECT}" \
-  -e "GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json" \
-  --entrypoint tfgnn_graph_sampler \
-  tfgnn:latest \
-  --graph_schema="${GRAPH_SCHEMA}" \
-  --sampling_spec="${SAMPLING_SPEC}" \
-  --output_samples="${OUTPUT_SAMPLES}" \
-  --edge_aggregation_method="${EDGE_AGGREGATION_METHOD}" \
-  --runner=DataflowRunner \
-  --project="${GOOGLE_CLOUD_PROJECT}" \
-  --region="${GOOGLE_CLOUD_COMPUTE_REGION}" \
-  --max_num_workers="${MAX_NUM_WORKERS}" \
-  --temp_location="${TEMP_LOCATION}" \
-  --job_name="${JOB_NAME}" \
-  --no_use_public_ips \
-  --network="${GCP_VPC_NAME}" \
-  --dataflow_service_options=enable_prime \
-  --experiments=use_monitoring_state_manager \
-  --experiments=enable_execution_details_collection \
-  --experiments=use_runner_v2 \
-  --worker_harness_container_image="${REMOTE_WORKER_CONTAINER}" \
-  --number_of_worker_harness_threads="${NUMBER_OF_WORKER_HARNESS_THREADS}" \
-  --alsologtostderr
diff --git a/examples/mag/sampling_spec.pbtxt b/examples/mag/sampling_spec.pbtxt
deleted file mode 100644
index 2b67578a..00000000
--- a/examples/mag/sampling_spec.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-seed_op <
-  op_name: "seed"
-  node_set_name: "paper"
->
-sampling_ops <
-  op_name: "seed->paper"
-  input_op_names: "seed"
-  edge_set_name: "cites"
-  sample_size: 32
-  # Sample edges uniformly at random, because that works without any further
-  # information. We could use TOP_K or RANDOM_WEIGHTED if we had put a
-  # "#weight" column into the edge set's input table.
- strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "paper->author" - input_op_names: "seed" - input_op_names: "seed->paper" - edge_set_name: "written" - sample_size: 8 - strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "author->paper" - input_op_names: "paper->author" - edge_set_name: "writes" - sample_size: 16 - strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "author->institution" - input_op_names: "paper->author" - edge_set_name: "affiliated_with" - sample_size: 16 - strategy: RANDOM_UNIFORM -> -sampling_ops < - op_name: "paper->field_of_study" - input_op_names: "seed" - input_op_names: "seed->paper" - input_op_names: "author->paper" - edge_set_name: "has_topic" - sample_size: 16 - strategy: RANDOM_UNIFORM -> - diff --git a/examples/mag/schema.pbtxt b/examples/mag/schema.pbtxt deleted file mode 100644 index 049ce373..00000000 --- a/examples/mag/schema.pbtxt +++ /dev/null @@ -1,152 +0,0 @@ -node_sets { - key: "author" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - metadata { - filename: "nodes-author.tfrecords@15" - cardinality: 1134649 - } - } -} -node_sets { - key: "field_of_study" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - metadata { - filename: "nodes-field_of_study.tfrecords@2" - cardinality: 59965 - } - } -} -node_sets { - key: "institution" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - metadata { - filename: "nodes-institution.tfrecords" - cardinality: 8740 - } - } -} -node_sets { - key: "paper" - value { - features { - key: "#id" - value { - dtype: DT_STRING - } - } - features { - key: "feat" - value { - dtype: DT_FLOAT - shape { - dim { - size: 128 - } - } - } - } - features { - key: "labels" - value { - dtype: DT_INT64 - shape { - dim { - size: 1 - } - } - } - } - features { - key: "year" - value { - dtype: DT_INT64 - shape { - dim { - size: 1 - } - } - } - } - metadata { - filename: "nodes-paper.tfrecords@397" - cardinality: 736389 - } - } -} -edge_sets { - key: "affiliated_with" - value { - source: "author" - target: "institution" - metadata { - filename: "edges-affiliated_with.tfrecords@30" - cardinality: 1043998 - } - } -} -edge_sets { - key: "cites" - value { - source: "paper" - target: "paper" - metadata { - filename: "edges-cites.tfrecords@120" - cardinality: 5416271 - } - } -} -edge_sets { - key: "has_topic" - value { - source: "paper" - target: "field_of_study" - metadata { - filename: "edges-has_topic.tfrecords@226" - cardinality: 7505078 - } - } -} -edge_sets { - key: "writes" - value { - source: "author" - target: "paper" - metadata { - filename: "edges-writes.tfrecords@172" - cardinality: 7145660 - } - } -} -edge_sets { - key: "written" - value { - source: "paper" - target: "author" - metadata { - filename: "edges-writes.tfrecords@172" - cardinality: 7145660 - extra { - key: "edge_type" - value: "reversed" - } - } - } -}