-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathCreateManifest.py
77 lines (59 loc) · 2.24 KB
/
CreateManifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from __future__ import absolute_import
'''
Create documents for parallel processing of videos using cloud dataflow
'''
import os
from google.cloud import storage
from oauth2client.client import GoogleCredentials
import random
import csv
import tempfile
import argparse
from urlparse import urlparse
# Service account credentials.
# On Google Cloud the application-default credentials can be fetched directly;
# if that lookup fails, point GOOGLE_APPLICATION_CREDENTIALS at a local
# service-account key file instead.
try:
    credentials = GoogleCredentials.get_application_default()
except Exception:
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt and SystemExit still propagate.
    # NOTE: ``credentials`` stays undefined on this path; only the
    # environment variable is set.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "/Users/Ben/Dropbox/Google/MeerkatReader-9fbf10d1e30c.json"
def process_args():
    """Parse the command-line options for the manifest job.

    Options the parser does not recognise are ignored rather than rejected.

    Returns:
        argparse.Namespace with ``input_dir`` (a storage path string) and
        ``limit`` (an int, or None when the option is not given).
    """
    cli = argparse.ArgumentParser(
        description='Create document for dataflow job.')
    cli.add_argument(
        '-input_dir',
        default="gs://api-project-773889352370-testing/testing/",
        help='Google cloud storage path for input videos samples.',
    )
    cli.add_argument(
        '-limit',
        type=int,
        default=None,
        help='Total number of videos',
    )
    # parse_known_args returns (namespace, leftovers); only the namespace
    # is handed back to the caller.
    known = cli.parse_known_args()[0]
    return known
class Organizer(object):
    """Build a manifest of the video paths found under a Cloud Storage prefix.

    Attributes set by ``__init__``:
        parsed: the ``urlparse`` result for ``args.input_dir``.
        bucket: the bucket object returned by ``storage.Client().get_bucket``.
        video_list: list of ``gs://<bucket>/<blob name>`` strings.
    """

    def __init__(self, args):
        """List the blobs under ``args.input_dir`` and record their gs:// paths.

        Args:
            args: object with an ``input_dir`` attribute (a
                ``gs://bucket/prefix`` string) and a ``limit`` attribute
                (int or None).
        """
        storage_client = storage.Client()

        # Split gs://bucket/prefix into the bucket name (hostname) and the
        # prefix (path).
        self.parsed = urlparse(args.input_dir)
        self.bucket = storage_client.get_bucket(self.parsed.hostname)

        # urlparse keeps the leading "/" on the path; strip it before using
        # the path as a listing prefix.
        vids = self.bucket.list_blobs(prefix=self.parsed.path[1:])

        # The limit comes from ``args``; there is no module-level ``limit``.
        self.video_list = self._build_video_list(
            self.bucket.name, vids, args.limit)
        print(self.video_list)

    @staticmethod
    def _build_video_list(bucket_name, blobs, limit=None):
        """Turn listed blobs into gs:// paths, skipping folder entries.

        Args:
            bucket_name: name of the bucket the blobs belong to.
            blobs: iterable of objects exposing a ``name`` attribute.
            limit: keep at most this many paths; None or 0 keeps them all.

        Returns:
            List of ``gs://<bucket_name>/<blob name>`` strings, in the order
            the blobs were supplied.
        """
        video_list = []
        for blob in blobs:
            name = str(blob.name)
            # The folder entry is recognised by its trailing "/" rather than
            # by being first in the listing, so a listing that has no folder
            # entry does not lose its first video.
            if name.endswith("/"):
                continue
            video_list.append("gs://" + bucket_name + "/" + name)

        # Limit total number of videos if testing.
        if limit:
            video_list = video_list[:limit]
        return video_list

    def WriteCsv(self):
        """Write one video path per row to a CSV and upload it to the bucket.

        The CSV is staged in a temporary file, uploaded to the blob
        ``DataFlow/manifest.csv`` and the temporary file is then removed,
        whether or not the upload succeeded.
        """
        handle, fn = tempfile.mkstemp(suffix='.csv')
        try:
            # fdopen wraps the descriptor mkstemp already opened, so leaving
            # the ``with`` block closes it instead of leaking it.
            # NOTE: binary mode matches the Python 2 csv module this script
            # is written against (it imports ``urlparse``); Python 3 would
            # need text mode with newline=''.
            with os.fdopen(handle, "wb") as f:
                writer = csv.writer(f)
                for row in self.video_list:
                    writer.writerow([row])

            # Write to google cloud.
            blob = self.bucket.blob("DataFlow/manifest.csv")
            blob.upload_from_filename(fn)
        finally:
            os.remove(fn)
if __name__ == "__main__":
    # Script entry point: parse the options, list the videos, then write
    # and upload the manifest.
    manifest_builder = Organizer(process_args())
    manifest_builder.WriteCsv()