
Commit

Version adjustments to setup.py with deps
PiperOrigin-RevId: 579985175
ArrayRecord Team authored and copybara-github committed Nov 7, 2023
1 parent b1403b5 commit 214e5e1
Showing 13 changed files with 604 additions and 1 deletion.
20 changes: 20 additions & 0 deletions beam/__init__.py
@@ -0,0 +1,20 @@
"""Apache Beam module for array_record.
This module provides both core components and
helper functions to enable users to convert different file formats to AR.
To keep dependencies light, we'll import Beam on module usage so any errors
occur early.
"""

import apache_beam as beam

# I'd really like a PEP8 compatible conditional import here with a more
# explicit error message. Example below:

# try:
# import apache_beam as beam
# except Exception as e:
# raise ImportError(
# ('Beam functionality requires extra dependencies. '
# 'Install apache-beam or run "pip install array_record[beam]".')) from e
67 changes: 67 additions & 0 deletions beam/arrayrecordio.py
@@ -0,0 +1,67 @@
"""An IO module for ArrayRecord.
CURRENTLY ONLY SINK IS IMPLEMENTED, AND IT DOESN'T WORK WITH NON-DISK WRITES
"""

from apache_beam import io
from apache_beam import transforms
from apache_beam.coders import coders
from apache_beam.io import filebasedsink
from apache_beam.io.filesystem import CompressionTypes
from array_record.python.array_record_module import ArrayRecordWriter


class _ArrayRecordSink(filebasedsink.FileBasedSink):
"""Sink Class for use in Arrayrecord PTransform."""

def __init__(
self,
file_path_prefix,
file_name_suffix=None,
num_shards=0,
shard_name_template=None,
coder=coders.ToBytesCoder(),
      compression_type=CompressionTypes.AUTO):

super().__init__(
file_path_prefix,
file_name_suffix=file_name_suffix,
num_shards=num_shards,
shard_name_template=shard_name_template,
coder=coder,
mime_type='application/octet-stream',
compression_type=compression_type)

def open(self, temp_path):
array_writer = ArrayRecordWriter(temp_path, 'group_size:1')
return array_writer

def close(self, file_handle):
file_handle.close()

def write_encoded_record(self, file_handle, value):
file_handle.write(value)


class WriteToArrayRecord(transforms.PTransform):
"""PTransform for a disk-based write to ArrayRecord."""

def __init__(
self,
file_path_prefix,
file_name_suffix='',
num_shards=0,
shard_name_template=None,
coder=coders.ToBytesCoder(),
      compression_type=CompressionTypes.AUTO):

self._sink = _ArrayRecordSink(
file_path_prefix,
file_name_suffix,
num_shards,
shard_name_template,
coder,
compression_type)

def expand(self, pcoll):
return pcoll | io.iobase.Write(self._sink)
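
For reference, a minimal local-disk usage sketch of the sink above (the output
prefix and the element values are placeholders, not part of the commit):

import apache_beam as beam
from array_record.beam.arrayrecordio import WriteToArrayRecord

with beam.Pipeline() as p:
  _ = (
      p
      | 'Create records' >> beam.Create([b'record-1', b'record-2'])
      | 'Write to ArrayRecord' >> WriteToArrayRecord(
          '/tmp/movies',  # placeholder output prefix on local disk
          file_name_suffix='.arrayrecord',
          num_shards=1))
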
43 changes: 43 additions & 0 deletions beam/demo.py
@@ -0,0 +1,43 @@
"""Demo Pipeline.
This file creates a TFrecord dataset and converts it to ArrayRecord on GCS
"""

import apache_beam as beam
from apache_beam.coders import coders
from . import dofns
from . import example
from . import options


## Grab CLI arguments.
## Override by passing args/pipeline_options to the function manually.
args, pipeline_options = options.get_arguments()


def main():
p1 = beam.Pipeline(options=pipeline_options)
initial = (p1
| 'Create a set of TFExamples' >> beam.Create(
example.generate_movie_examples()
)
| 'Write TFRecords' >> beam.io.WriteToTFRecord(
args['input'],
coder=coders.ToBytesCoder(),
num_shards=4,
file_name_suffix='.tfrecord'
)
| 'Read shards from GCS' >> beam.io.ReadAllFromTFRecord(
with_filename=True)
| 'Group with Filename' >> beam.GroupByKey()
| 'Write to ArrayRecord in GCS' >> beam.ParDo(
dofns.ConvertToArrayRecordGCS(),
args['output'],
overwrite_extension=True))

return p1, initial


if __name__ == '__main__':
demo_pipeline = main()
demo_pipeline.run()
57 changes: 57 additions & 0 deletions beam/dofns.py
@@ -0,0 +1,57 @@
"""DoFn's for parallel processing."""

import os
import urllib
import apache_beam as beam
from array_record.python.array_record_module import ArrayRecordWriter
from google.cloud import storage


class ConvertToArrayRecordGCS(beam.DoFn):
"""Write a tuple consisting of a filename and records to GCS ArrayRecords."""

_WRITE_DIR = '/tmp/'

def process(
self,
element,
path,
write_dir=_WRITE_DIR,
file_path_suffix='.arrayrecord',
overwrite_extension=False,
):

## Upload to GCS
def upload_to_gcs(bucket_name, filename, prefix='', source_dir=self._WRITE_DIR):
source_filename = os.path.join(source_dir, filename)
blob_name = os.path.join(prefix, filename)
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(blob_name)
blob.upload_from_filename(source_filename)

## Simple logic for stripping a file extension and replacing it
def fix_filename(filename):
base_name = os.path.splitext(filename)[0]
new_filename = base_name + file_path_suffix
return new_filename

parsed_gcs_path = urllib.parse.urlparse(path)
bucket_name = parsed_gcs_path.hostname
gcs_prefix = parsed_gcs_path.path.lstrip('/')

if overwrite_extension:
filename = fix_filename(os.path.basename(element[0]))
else:
filename = '{}{}'.format(os.path.basename(element[0]), file_path_suffix)

write_path = os.path.join(write_dir, filename)
writer = ArrayRecordWriter(write_path, 'group_size:1')

for item in element[1]:
writer.write(item)

writer.close()

    upload_to_gcs(
        bucket_name, filename, prefix=gcs_prefix, source_dir=write_dir)
os.remove(os.path.join(write_dir, filename))
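
For context, this DoFn expects each element to be a (filename, iterable of
records) tuple, which is what ReadAllFromTFRecord(with_filename=True) followed
by GroupByKey produces. A hedged sketch mirroring demo.py (bucket names are
placeholders):

import apache_beam as beam
from array_record.beam.dofns import ConvertToArrayRecordGCS

with beam.Pipeline() as p:
  _ = (
      p
      | 'File patterns' >> beam.Create(
          ['gs://<YOUR_INPUT_BUCKET>/records/*.tfrecord'])
      | 'Read TFRecords' >> beam.io.ReadAllFromTFRecord(with_filename=True)
      | 'Group by filename' >> beam.GroupByKey()
      | 'Convert and upload' >> beam.ParDo(
          ConvertToArrayRecordGCS(),
          'gs://<YOUR_OUTPUT_BUCKET>/records/',
          overwrite_extension=True))
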
106 changes: 106 additions & 0 deletions beam/example.py
@@ -0,0 +1,106 @@
"""Helper file for generating TF/ArrayRecords and writing them to disk."""

import os
from array_record.beam.testdata import data
from array_record.python.array_record_module import ArrayRecordWriter
import tensorflow as tf


def generate_movie_examples():
"""Create a list of TF examples from the dummy data above and return it.
Returns:
TFExample object
"""

examples = []
for example in data:
examples.append(
tf.train.Example(
features=tf.train.Features(
feature={
'Age': tf.train.Feature(
int64_list=tf.train.Int64List(value=[example['Age']])),
'Movie': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[
m.encode('utf-8') for m in example['Movie']])),
'Movie Ratings': tf.train.Feature(
float_list=tf.train.FloatList(
value=example['Movie Ratings'])),
'Suggestion': tf.train.Feature(
bytes_list=tf.train.BytesList(
value=[example['Suggestion'].encode('utf-8')])),
'Suggestion Purchased': tf.train.Feature(
float_list=tf.train.FloatList(
value=[example['Suggestion Purchased']])),
'Purchase Price': tf.train.Feature(
float_list=tf.train.FloatList(
value=[example['Purchase Price']]))
}
)
)
)

  return examples


def generate_serialized_movie_examples():
"""Return a serialized version of the above data for byte insertion."""

return [example.SerializeToString() for example in generate_movie_examples()]


def write_example_to_tfrecord(example, file_path):
"""Write example(s) to a single TFrecord file."""

with tf.io.TFRecordWriter(file_path) as writer:
writer.write(example.SerializeToString())


def write_example_to_arrayrecord(example, file_path):
  """Write example(s) to a single ArrayRecord file."""
writer = ArrayRecordWriter(file_path, 'group_size:1')
writer.write(example.SerializeToString())
writer.close()


def kitty_tfrecord(prefix=''):
"""Create a TFRecord from a cat pic on the Internet.
This is mainly for testing; probably don't use it.
Args:
prefix: A file directory in string format.
"""

cat_in_snow = tf.keras.utils.get_file(
'320px-Felis_catus-cat_on_snow.jpg',
'https://storage.googleapis.com/download.tensorflow.org/example_images/320px-Felis_catus-cat_on_snow.jpg')

image_labels = {
cat_in_snow: 0
}

  with open(cat_in_snow, 'rb') as f:
    image_string = f.read()
label = image_labels[cat_in_snow]
image_shape = tf.io.decode_jpeg(image_string).shape

feature = {
'height': tf.train.Feature(int64_list=tf.train.Int64List(
value=[image_shape[0]])),
'width': tf.train.Feature(int64_list=tf.train.Int64List(
value=[image_shape[1]])),
'depth': tf.train.Feature(int64_list=tf.train.Int64List(
value=[image_shape[2]])),
'label': tf.train.Feature(int64_list=tf.train.Int64List(
value=[label])),
'image_raw': tf.train.Feature(bytes_list=tf.train.BytesList(
value=[image_string]))
}

example = tf.train.Example(features=tf.train.Features(feature=feature))

record_file = os.path.join(prefix, 'kittymeow.tfrecord')
with tf.io.TFRecordWriter(record_file) as writer:
writer.write(example.SerializeToString())
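
A small usage sketch for the helpers above (output paths are placeholders):

from array_record.beam import example

movies = example.generate_movie_examples()
example.write_example_to_tfrecord(movies[0], '/tmp/movie0.tfrecord')
example.write_example_to_arrayrecord(movies[0], '/tmp/movie0.arrayrecord')
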
13 changes: 13 additions & 0 deletions beam/examples/example_full_demo_cli.sh
@@ -0,0 +1,13 @@
#!/bin/bash
# Execute this via bash to run a full demo that creates TFRecords and converts them.


# Set bucket info below. Uncomment lower lines and set values to use Dataflow.
python -m array_record.beam.demo \
  --input="gs://<YOUR_INPUT_BUCKET>/records/movies" \
  --output="gs://<YOUR_OUTPUT_BUCKET>/records/" \
  # --region="<YOUR_REGION>" \
  # --runner="DataflowRunner" \
  # --project="<YOUR_PROJECT>" \
  # --requirements_file="requirements.txt"
29 changes: 29 additions & 0 deletions beam/examples/example_gcs_conversion.py
@@ -0,0 +1,29 @@
"""Execute this to convert an existing set of TFRecords to ArrayRecords."""


from apache_beam.options import pipeline_options
from array_record.beam.pipelines import convert_tf_to_arrayrecord_gcs

## Set input and output patterns as specified
input_pattern = 'gs://<YOUR_INPUT_BUCKET>/records/*.tfrecord'
output_path = 'gs://<YOUR_OUTPUT_BUCKET>/records/'

args = {'input': input_pattern, 'output': output_path}

## Set pipeline options and uncomment in main() to run in Dataflow
pipeline_options = pipeline_options.PipelineOptions(
runner='DataflowRunner',
project='<YOUR_PROJECT>',
region='<YOUR_REGION>',
requirements_file='requirements.txt'
)


def main():
convert_tf_to_arrayrecord_gcs(
args=args,
# pipeline_options=pipeline_options
).run()

if __name__ == '__main__':
main()
29 changes: 29 additions & 0 deletions beam/examples/example_sink_conversion.py
@@ -0,0 +1,29 @@
"""Execute this to convert TFRecords to ArrayRecords using the disk Sink."""


from apache_beam.options import pipeline_options
from array_record.beam.pipelines import convert_tf_to_arrayrecord_disk_match_shards

## Set input and output patterns as specified
input_pattern = 'gs://<YOUR_INPUT_BUCKET>/records/*.tfrecord'
output_path = 'records/movies'

args = {'input': input_pattern, 'output': output_path}

## Set pipeline options and uncomment in main() to run in Dataflow
pipeline_options = pipeline_options.PipelineOptions(
runner='DataflowRunner',
project='<YOUR_PROJECT>',
region='<YOUR_REGION>',
requirements_file='requirements.txt'
)


def main():
convert_tf_to_arrayrecord_disk_match_shards(
args=args,
# pipeline_options=pipeline_options
).run()

if __name__ == '__main__':
main()
2 changes: 2 additions & 0 deletions beam/examples/requirements.txt
@@ -0,0 +1,2 @@
google-cloud-storage==2.11.0
tensorflow==2.14.0