
Commit

Version adjustments to setup.py with deps
PiperOrigin-RevId: 579985175
ArrayRecord Team authored and copybara-github committed Nov 7, 2023
1 parent b1403b5 commit 214e5e1
Showing 13 changed files with 604 additions and 1 deletion.
20 changes: 20 additions & 0 deletions beam/__init__.py
@@ -0,0 +1,20 @@
"""Apache Beam module for array_record.
This module provides both core components and
helper functions to enable users to convert different file formats to AR.
To keep dependencies light, we'll import Beam on module usage so any errors
occur early.
"""

import apache_beam as beam

# I'd really like a PEP8 compatible conditional import here with a more
# explicit error message. Example below:

# try:
# import apache_beam as beam
# except Exception as e:
# raise ImportError(
# ('Beam functionality requires extra dependencies. '
# 'Install apache-beam or run "pip install array_record[beam]".')) from e
67 changes: 67 additions & 0 deletions beam/arrayrecordio.py
@@ -0,0 +1,67 @@
"""An IO module for ArrayRecord.
CURRENTLY ONLY SINK IS IMPLEMENTED, AND IT DOESN'T WORK WITH NON-DISK WRITES
"""

from apache_beam import io
from apache_beam import transforms
from apache_beam.coders import coders
from apache_beam.io import filebasedsink
from apache_beam.io.filesystem import CompressionTypes
from array_record.python.array_record_module import ArrayRecordWriter


class _ArrayRecordSink(filebasedsink.FileBasedSink):
"""Sink Class for use in Arrayrecord PTransform."""

def __init__(
self,
file_path_prefix,
file_name_suffix=None,
num_shards=0,
shard_name_template=None,
coder=coders.ToBytesCoder(),
      compression_type=CompressionTypes.AUTO):

super().__init__(
file_path_prefix,
file_name_suffix=file_name_suffix,
num_shards=num_shards,
shard_name_template=shard_name_template,
coder=coder,
mime_type='application/octet-stream',
compression_type=compression_type)

def open(self, temp_path):
array_writer = ArrayRecordWriter(temp_path, 'group_size:1')
return array_writer

def close(self, file_handle):
file_handle.close()

def write_encoded_record(self, file_handle, value):
file_handle.write(value)


class WriteToArrayRecord(transforms.PTransform):
"""PTransform for a disk-based write to ArrayRecord."""

def __init__(
self,
file_path_prefix,
file_name_suffix='',
num_shards=0,
shard_name_template=None,
coder=coders.ToBytesCoder(),
      compression_type=CompressionTypes.AUTO):

self._sink = _ArrayRecordSink(
file_path_prefix,
file_name_suffix,
num_shards,
shard_name_template,
coder,
compression_type)

def expand(self, pcoll):
return pcoll | io.iobase.Write(self._sink)
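
For reference, a minimal local-disk usage sketch of the sink above (the output
prefix and the element values are placeholders, not part of the commit):

import apache_beam as beam
from array_record.beam.arrayrecordio import WriteToArrayRecord

with beam.Pipeline() as p:
  _ = (
      p
      | 'Create records' >> beam.Create([b'record-1', b'record-2'])
      | 'Write to ArrayRecord' >> WriteToArrayRecord(
          '/tmp/movies',  # placeholder output prefix on local disk
          file_name_suffix='.arrayrecord',
          num_shards=1))
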
43 changes: 43 additions & 0 deletions beam/demo.py
@@ -0,0 +1,43 @@
"""Demo Pipeline.
This file creates a TFrecord dataset and converts it to ArrayRecord on GCS
"""

import apache_beam as beam
from apache_beam.coders import coders
from . import dofns
from . import example
from . import options


## Grab CLI arguments.
## Override by passing args/pipeline_options to the function manually.
args, pipeline_options = options.get_arguments()


def main():
p1 = beam.Pipeline(options=pipeline_options)
initial = (p1
| 'Create a set of TFExamples' >> beam.Create(
example.generate_movie_examples()
)
| 'Write TFRecords' >> beam.io.WriteToTFRecord(
args['input'],
coder=coders.ToBytesCoder(),
num_shards=4,
file_name_suffix='.tfrecord'
)
| 'Read shards from GCS' >> beam.io.ReadAllFromTFRecord(
with_filename=True)
| 'Group with Filename' >> beam.GroupByKey()
| 'Write to ArrayRecord in GCS' >> beam.ParDo(
dofns.ConvertToArrayRecordGCS(),
args['output'],
overwrite_extension=True))

return p1, initial


if __name__ == '__main__':
demo_pipeline = main()
demo_pipeline.run()
57 changes: 57 additions & 0 deletions beam/dofns.py
@@ -0,0 +1,57 @@
"""DoFn's for parallel processing."""

import os
import urllib
import apache_beam as beam
from array_record.python.array_record_module import ArrayRecordWriter
from google.cloud import storage


class ConvertToArrayRecordGCS(beam.DoFn):
"""Write a tuple consisting of a filename and records to GCS ArrayRecords."""

_WRITE_DIR = '/tmp/'

def process(
self,
element,
path,
write_dir=_WRITE_DIR,
file_path_suffix='.arrayrecord',
overwrite_extension=False,
):

## Upload to GCS
def upload_to_gcs(bucket_name, filename, prefix='', source_dir=self._WRITE_DIR):
source_filename = os.path.join(source_dir, filename)
blob_name = os.path.join(prefix, filename)
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(blob_name)
blob.upload_from_filename(source_filename)

## Simple logic for stripping a file extension and replacing it
def fix_filename(filename):
base_name = os.path.splitext(filename)[0]
new_filename = base_name + file_path_suffix
return new_filename

parsed_gcs_path = urllib.parse.urlparse(path)
bucket_name = parsed_gcs_path.hostname
gcs_prefix = parsed_gcs_path.path.lstrip('/')

if overwrite_extension:
filename = fix_filename(os.path.basename(element[0]))
else:
filename = '{}{}'.format(os.path.basename(element[0]), file_path_suffix)

write_path = os.path.join(write_dir, filename)
writer = ArrayRecordWriter(write_path, 'group_size:1')

for item in element[1]:
writer.write(item)

writer.close()

    upload_to_gcs(
        bucket_name, filename, prefix=gcs_prefix, source_dir=write_dir)
os.remove(os.path.join(write_dir, filename))
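
For context, this DoFn expects each element to be a (filename, iterable of
records) tuple, which is what ReadAllFromTFRecord(with_filename=True) followed
by GroupByKey produces. A hedged sketch mirroring demo.py (bucket names are
placeholders):

import apache_beam as beam
from array_record.beam.dofns import ConvertToArrayRecordGCS

with beam.Pipeline() as p:
  _ = (
      p
      | 'File patterns' >> beam.Create(
          ['gs://<YOUR_INPUT_BUCKET>/records/*.tfrecord'])
      | 'Read TFRecords' >> beam.io.ReadAllFromTFRecord(with_filename=True)
      | 'Group by filename' >> beam.GroupByKey()
      | 'Convert and upload' >> beam.ParDo(
          ConvertToArrayRecordGCS(),
          'gs://<YOUR_OUTPUT_BUCKET>/records/',
          overwrite_extension=True))
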
106 changes: 106 additions & 0 deletions beam/example.py
@@ -0,0 +1,106 @@
"""Helper file for generating TF/ArrayRecords and writing them to disk."""

import os
from array_record.beam.testdata import data
from array_record.python.array_record_module import ArrayRecordWriter
import tensorflow as tf


def generate_movie_examples():
"""Create a list of TF examples from the dummy data above and return it.
Returns:
TFExample object
"""

examples = []
for example in data:
examples.append(
tf.train.Example(
features=tf.train.Features(
feature={
'Age': tf.train.Feature(
int64_list=tf.train.Int64List(value=[example['Age']])),
'Movie': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[
m.encode('utf-8') for m in example['Movie']])),
'Movie Ratings': tf.train.Feature(
float_list=tf.train.FloatList(
value=example['Movie Ratings'])),
'Suggestion': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[example['Suggestion'].encode('utf-8')])),
'Suggestion Purchased': tf.train.Feature(
float_list=tf.train.FloatList(
value=[example['Suggestion Purchased']])),
'Purchase Price': tf.train.Feature(
float_list=tf.train.FloatList(
value=[example['Purchase Price']]))
}
)
)
)

  return examples


def generate_serialized_movie_examples():
"""Return a serialized version of the above data for byte insertion."""

return [example.SerializeToString() for example in generate_movie_examples()]


def write_example_to_tfrecord(example, file_path):
"""Write example(s) to a single TFrecord file."""

with tf.io.TFRecordWriter(file_path) as writer:
writer.write(example.SerializeToString())


def write_example_to_arrayrecord(example, file_path):
  """Write example(s) to a single ArrayRecord file."""
writer = ArrayRecordWriter(file_path, 'group_size:1')
writer.write(example.SerializeToString())
writer.close()


def kitty_tfrecord(prefix=''):
"""Create a TFRecord from a cat pic on the Internet.
This is mainly for testing; probably don't use it.
Args:
prefix: A file directory in string format.
"""

cat_in_snow = tf.keras.utils.get_file(
'320px-Felis_catus-cat_on_snow.jpg',
'https://storage.googleapis.com/download.tensorflow.org/example_images/320px-Felis_catus-cat_on_snow.jpg')

image_labels = {
cat_in_snow: 0
}

  with open(cat_in_snow, 'rb') as f:
    image_string = f.read()
label = image_labels[cat_in_snow]
image_shape = tf.io.decode_jpeg(image_string).shape

feature = {
'height': tf.train.Feature(int64_list=tf.train.Int64List(
value=[image_shape[0]])),
'width': tf.train.Feature(int64_list=tf.train.Int64List(
value=[image_shape[1]])),
'depth': tf.train.Feature(int64_list=tf.train.Int64List(
value=[image_shape[2]])),
'label': tf.train.Feature(int64_list=tf.train.Int64List(
value=[label])),
'image_raw': tf.train.Feature(bytes_list=tf.train.BytesList(
value=[image_string]))
}

example = tf.train.Example(features=tf.train.Features(feature=feature))

record_file = os.path.join(prefix, 'kittymeow.tfrecord')
with tf.io.TFRecordWriter(record_file) as writer:
writer.write(example.SerializeToString())
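
A small usage sketch for the helpers above (output paths are placeholders):

from array_record.beam import example

movies = example.generate_movie_examples()
example.write_example_to_tfrecord(movies[0], '/tmp/movie0.tfrecord')
example.write_example_to_arrayrecord(movies[0], '/tmp/movie0.arrayrecord')
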
13 changes: 13 additions & 0 deletions beam/examples/example_full_demo_cli.sh
@@ -0,0 +1,13 @@
#!/bin/bash
# Execute this via bash to run a full demo that creates TFRecords and converts them.


# Set bucket info below. Uncomment lower lines and set values to use Dataflow.
python -m array_record.beam.demo \
  --input="gs://<YOUR_INPUT_BUCKET>/records/movies" \
  --output="gs://<YOUR_OUTPUT_BUCKET>/records/" \
  # --region="<YOUR_REGION>" \
  # --runner="DataflowRunner" \
  # --project="<YOUR_PROJECT>" \
  # --requirements_file="requirements.txt"
29 changes: 29 additions & 0 deletions beam/examples/example_gcs_conversion.py
@@ -0,0 +1,29 @@
"""Execute this to convert an existing set of TFRecords to ArrayRecords."""


from apache_beam.options import pipeline_options
from array_record.beam.pipelines import convert_tf_to_arrayrecord_gcs

## Set input and output patterns as specified
input_pattern = 'gs://<YOUR_INPUT_BUCKET>/records/*.tfrecord'
output_path = 'gs://<YOUR_OUTPUT_BUCKET>/records/'

args = {'input': input_pattern, 'output': output_path}

## Set pipeline options and uncomment in main() to run in Dataflow
pipeline_options = pipeline_options.PipelineOptions(
runner='DataflowRunner',
project='<YOUR_PROJECT>',
region='<YOUR_REGION>',
requirements_file='requirements.txt'
)


def main():
convert_tf_to_arrayrecord_gcs(
args=args,
# pipeline_options=pipeline_options
).run()

if __name__ == '__main__':
main()
29 changes: 29 additions & 0 deletions beam/examples/example_sink_conversion.py
@@ -0,0 +1,29 @@
"""Execute this to convert TFRecords to ArrayRecords using the disk Sink."""


from apache_beam.options import pipeline_options
from array_record.beam.pipelines import convert_tf_to_arrayrecord_disk_match_shards

## Set input and output patterns as specified
input_pattern = 'gs://<YOUR_INPUT_BUCKET>/records/*.tfrecord'
output_path = 'records/movies'

args = {'input': input_pattern, 'output': output_path}

## Set pipeline options and uncomment in main() to run in Dataflow
pipeline_options = pipeline_options.PipelineOptions(
runner='DataflowRunner',
project='<YOUR_PROJECT>',
region='<YOUR_REGION>',
requirements_file='requirements.txt'
)


def main():
convert_tf_to_arrayrecord_disk_match_shards(
args=args,
# pipeline_options=pipeline_options
).run()

if __name__ == '__main__':
main()
2 changes: 2 additions & 0 deletions beam/examples/requirements.txt
@@ -0,0 +1,2 @@
google-cloud-storage==2.11.0
tensorflow==2.14.0