Skip to content

Commit

Permalink
[DEVX-128] upload URLs from CSV (#168)
Browse files Browse the repository at this point in the history
* upload_URLs_from_CSV
  • Loading branch information
sanjaychelliah authored Sep 8, 2023
1 parent e030add commit 547f723
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 25 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ dataset = app.create_dataset(dataset_id="demo_dataset")
dataset.upload_dataset(task='visual_segmentation', split="train", dataset_loader='coco_segmentation')

#upload text from csv
dataset.upload_from_csv(csv_path='csv_path', labels=True)
dataset.upload_from_csv(csv_path='csv_path', input_type='text', csv_type='raw', labels=True)

#upload data from folder
dataset.upload_from_folder(folder_path='folder_path', input_type='text', labels=True)
Expand Down
22 changes: 16 additions & 6 deletions clarifai/client/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,29 +224,39 @@ def upload_dataset(self,
def upload_from_csv(self,
csv_path: str,
input_type: str = 'text',
csv_type: str = None,
labels: bool = True,
chunk_size: int = 128) -> None:
"""Uploads dataset from a csv file.
Args:
csv_path (str): path to the csv file
input_type (str): type of the dataset (text, image, video, audio)
csv_type (str): type of the csv file(raw, url, file_path)
labels (bool): True if csv file has labels column
chunk_size (int): chunk size for concurrent upload of inputs and annotations
Example:
>>> from clarifai.client.dataset import Dataset
>>> dataset = Dataset(user_id = 'user_id', app_id = 'demo_app', dataset_id = 'demo_dataset')
>>> dataset.upload_from_csv(csv_path='csv_path', labels=True)
>>> dataset.upload_from_csv(csv_path='csv_path', input_type='text', csv_type='raw', labels=True)
Note: csv file should have either one column (input) or two columns (input, labels).
"""
if input_type not in ['image', 'text']: #TODO: add image
raise UserError('Invalid input type it should be image or text')
if input_type not in ['image', 'text', 'video', 'audio']:
raise UserError('Invalid input type, it should be image,text,audio or video')
if csv_type not in ['raw', 'url', 'file_path']:
raise UserError('Invalid csv type, it should be raw, url or file_path')
assert csv_path.endswith('.csv'), 'csv_path should be a csv file'
if csv_type == 'raw' and input_type != 'text':
raise UserError('Only text input type is supported for raw csv type')
chunk_size = min(128, chunk_size)
if input_type == 'text':
input_protos = self.input_object.get_text_input_from_csv(
csv_path=csv_path, dataset_id=self.id, labels=labels)
input_protos = self.input_object.get_inputs_from_csv(
csv_path=csv_path,
input_type=input_type,
csv_type=csv_type,
dataset_id=self.id,
labels=labels)
self.input_object._bulk_upload(inputs=input_protos, chunk_size=chunk_size)

def upload_from_folder(self,
Expand Down
82 changes: 64 additions & 18 deletions clarifai/client/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,15 +146,17 @@ def get_input_from_file(self,
image_file: str = None,
video_file: str = None,
audio_file: str = None,
text_file: str = None,
dataset_id: str = None,
**kwargs) -> Input:
"""Create input proto from files.
Args:
input_id (str): The input ID for the input to create.
image_file (str): The url for the image.
video_file (str): The url for the video.
audio_file (str): The url for the audio.
image_file (str): The file_path for the image.
video_file (str): The file_path for the video.
audio_file (str): The file_path for the audio.
text_file (str): The file_path for the text.
dataset_id (str): The dataset ID for the dataset to add the input to.
Returns:
Expand All @@ -165,24 +167,28 @@ def get_input_from_file(self,
>>> input_obj = Input()
>>> input_proto = input_obj.get_input_from_file(input_id = 'demo', video_file='file_path')
"""
if not any((image_file, video_file, audio_file)):
raise ValueError("At least one of image_file, video_file, audio_file, must be provided.")
if not any((image_file, video_file, audio_file, text_file)):
raise ValueError(
"At least one of image_file, video_file, audio_file, text_file must be provided.")
image_pb = resources_pb2.Image(base64=open(image_file, 'rb').read()) if image_file else None
video_pb = resources_pb2.Video(base64=open(video_file, 'rb').read()) if video_file else None
audio_pb = resources_pb2.Audio(base64=open(audio_file, 'rb').read()) if audio_file else None
text_pb = resources_pb2.Text(raw=open(text_file, 'rb').read()) if text_file else None
return self._get_proto(
input_id=input_id,
dataset_id=dataset_id,
imagepb=image_pb,
video_pb=video_pb,
audio_pb=audio_pb,
text_pb=text_pb,
**kwargs)

def get_input_from_bytes(self,
input_id: str,
image_bytes: bytes = None,
video_bytes: bytes = None,
audio_bytes: bytes = None,
text_bytes: bytes = None,
dataset_id: str = None,
**kwargs) -> Input:
"""Create input proto from bytes.
Expand All @@ -192,6 +198,7 @@ def get_input_from_bytes(self,
image_bytes (bytes): The bytes for the image.
video_bytes (bytes): The bytes for the video.
audio_bytes (bytes): The bytes for the audio.
text_bytes (bytes): The bytes for the text.
dataset_id (str): The dataset ID for the dataset to add the input to.
Returns:
Expand All @@ -204,17 +211,20 @@ def get_input_from_bytes(self,
>>> video = open('demo.mp4', 'rb').read()
>>> input_proto = input_obj.get_input_from_bytes(input_id = 'demo',image_bytes =image, video_bytes=video)
"""
if not any((image_bytes, video_bytes, audio_bytes)):
raise ValueError("At least one of image_bytes, video_bytes, audio_bytes, must be provided.")
if not any((image_bytes, video_bytes, audio_bytes, text_bytes)):
raise ValueError(
"At least one of image_bytes, video_bytes, audio_bytes, text_bytes must be provided.")
image_pb = resources_pb2.Image(base64=image_bytes) if image_bytes else None
video_pb = resources_pb2.Video(base64=video_bytes) if video_bytes else None
audio_pb = resources_pb2.Audio(base64=audio_bytes) if audio_bytes else None
text_pb = resources_pb2.Text(raw=text_bytes) if text_bytes else None
return self._get_proto(
input_id=input_id,
dataset_id=dataset_id,
imagepb=image_pb,
video_pb=video_pb,
audio_pb=audio_pb,
text_pb=text_pb,
**kwargs)

def get_image_inputs_from_folder(self,
Expand Down Expand Up @@ -267,12 +277,18 @@ def get_text_input(self, input_id: str, raw_text: str, dataset_id: str = None,
text_pb = resources_pb2.Text(raw=raw_text)
return self._get_proto(input_id=input_id, dataset_id=dataset_id, text_pb=text_pb, **kwargs)

def get_text_input_from_csv(self, csv_path: str, dataset_id: str = None,
labels: str = True) -> List[Text]: #text specific
"""Create input proto for text data type from cscv.
def get_inputs_from_csv(self,
csv_path: str,
input_type: str = 'text',
csv_type: str = 'raw',
dataset_id: str = None,
labels: str = True) -> List[Text]:
"""Create input protos from csv.
Args:
csv_path (str): Path to the csv file.
input_type (str): Type of input. Options: 'text', 'image', 'video', 'audio'.
csv_type (str): Type of csv file. Options: 'raw', 'url', 'file_path'.
dataset_id (str): The dataset ID for the dataset to add the input to.
labels (bool): True if csv file has labels column.
Expand All @@ -282,23 +298,49 @@ def get_text_input_from_csv(self, csv_path: str, dataset_id: str = None,
Example:
>>> from clarifai.client.input import Input
>>> input_obj = Input()
>>> input_protos = input_obj.get_text_input_from_csv(csv_path = 'filepath')
>>> input_protos = input_obj.get_inputs_from_csv(csv_path='filepath', input_type='text', csv_type='raw')
"""
input_protos = []
with open(csv_path) as _file:
reader = csv.reader(_file)
next(reader, None) # skip header
for id, input in enumerate(reader):
text = input[0]
if labels:
assert len(input) == 2, "csv file should have two columns(input, labels)"
labels = input[1] if isinstance(input[1], list) else [input[1]]
else:
labels = None

input_id = f"{dataset_id}-{id}"
input_protos.append(
self.get_text_input(
input_id=input_id, raw_text=text, dataset_id=dataset_id, labels=labels))
text = input[0] if input_type == 'text' else None
image = input[0] if input_type == 'image' else None
video = input[0] if input_type == 'video' else None
audio = input[0] if input_type == 'audio' else None

if csv_type == 'raw':
input_protos.append(
self.get_text_input(
input_id=input_id, raw_text=text, dataset_id=dataset_id, labels=labels))
elif csv_type == 'url':
input_protos.append(
self.get_input_from_url(
input_id=input_id,
image_url=image,
text_url=text,
audio_url=audio,
video_url=video,
dataset_id=dataset_id,
labels=labels))
else:
input_protos.append(
self.get_input_from_file(
input_id=input_id,
image_file=image,
text_file=text,
audio_file=audio,
video_file=video,
dataset_id=dataset_id,
labels=labels))

return input_protos

Expand Down Expand Up @@ -442,6 +484,7 @@ def upload_from_file(self,
image_file: str = None,
video_file: str = None,
audio_file: str = None,
text_file: str = None,
dataset_id: str = None,
**kwargs) -> str:
"""Upload input from file.
Expand All @@ -451,6 +494,7 @@ def upload_from_file(self,
image_file (str): The file for the image.
video_file (str): The file for the video.
audio_file (str): The file for the audio.
text_file (str): The file for the text.
dataset_id (str): The dataset ID for the dataset to add the input to.
Returns:
Expand All @@ -461,15 +505,16 @@ def upload_from_file(self,
>>> input_obj = Input(user_id = 'user_id', app_id = 'demo_app')
>>> input_obj.upload_from_file(input_id='demo', audio_file='demo.mp3')
"""
input_pb = self.get_input_from_file(input_id, image_file, video_file, audio_file, dataset_id,
**kwargs)
input_pb = self.get_input_from_file(input_id, image_file, video_file, audio_file, text_file,
dataset_id, **kwargs)
return self.upload_inputs([input_pb])

def upload_from_bytes(self,
input_id: str,
image_bytes: bytes = None,
video_bytes: bytes = None,
audio_bytes: bytes = None,
text_bytes: bytes = None,
dataset_id: str = None,
**kwargs) -> str:
"""Upload input from bytes.
Expand All @@ -479,6 +524,7 @@ def upload_from_bytes(self,
image_bytes (bytes): The bytes for the image.
video_bytes (bytes): The bytes for the video.
audio_bytes (bytes): The bytes for the audio.
text_bytes (bytes): The bytes for the text.
dataset_id (str): The dataset ID for the dataset to add the input to.
Returns:
Expand All @@ -491,7 +537,7 @@ def upload_from_bytes(self,
>>> input_obj.upload_from_bytes(input_id='demo', image_bytes=image)
"""
input_pb = self.get_input_from_bytes(input_id, image_bytes, video_bytes, audio_bytes,
dataset_id, **kwargs)
text_bytes, dataset_id, **kwargs)
return self.upload_inputs([input_pb])

def upload_text(self, input_id: str, raw_text: str, dataset_id: str = None,
Expand Down

0 comments on commit 547f723

Please sign in to comment.