Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gia-dataset-parquet -> gia-dataset #136

Merged
merged 1 commit into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/GUIDELINES.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ _BASE_URL = "https://huggingface.co/datasets/gia-project/gia-dataset/resolve/my_
```python
from datasets import load_dataset

load_dataset("gia-project/gia-dataset-parquet", "my_task", revision="my_branch")
load_dataset("gia-project/gia-dataset", "my_task", revision="my_branch")
```


Expand Down
2 changes: 1 addition & 1 deletion data/conceptual_captions/generate_conceptual_caption.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def process(example: Dict[str, List[str]]) -> Dict[str, List[Union[str, PIL.Imag

for i in range(retry):
try:
dataset.push_to_hub("gia-project/gia-dataset-parquet", "conceptual-captions", split=split)
dataset.push_to_hub("gia-project/gia-dataset", "conceptual-captions", split=split)
break
except Exception:
print(f"Retry {i+1}/{retry}")
4 changes: 2 additions & 2 deletions data/envs/atari/create_atari_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,9 @@ def create_atari_dataset(cfg: Config):
]
dataset = concatenate_datasets(ds)
dataset = dataset.train_test_split(test_size=0.1, writer_batch_size=1)
HfApi().create_branch("gia-project/gia-dataset-parquet", branch="new_breakout", exist_ok=True, repo_type="dataset")
HfApi().create_branch("gia-project/gia-dataset", branch="new_breakout", exist_ok=True, repo_type="dataset")
dataset.push_to_hub(
"gia-project/gia-dataset-parquet",
"gia-project/gia-dataset",
config_name=f"atari-{task}",
branch="new_breakout",
)
Expand Down
2 changes: 1 addition & 1 deletion data/envs/babyai/create_babyai_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def create_babyai_dataset(task_name, max_num_episodes):

print("Pushing dataset to hub...")
dataset = dataset.train_test_split(test_size=0.02)
dataset.push_to_hub("gia-project/gia-dataset-parquet", task_name, branch="additional_babyai_tasks")
dataset.push_to_hub("gia-project/gia-dataset", task_name, branch="additional_babyai_tasks")
print("Pushed dataset to hub!")


Expand Down
2 changes: 1 addition & 1 deletion data/envs/download_expert_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@
for env_name in tqdm(ENV_NAMES):
tqdm.write(f"Downloading expert scores for {env_name}")

dataset = load_dataset("gia-project/gia-dataset-parquet", env_name)
dataset = load_dataset("gia-project/gia-dataset", env_name)
# Initialize the variables
rewards = dataset["train"]["rewards"] + dataset["test"]["rewards"]
episode_sum_rewards = [np.sum(r) for r in rewards]
Expand Down
4 changes: 2 additions & 2 deletions data/to_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,11 @@ def add_dataset_to_hub(
path_in_repo = f"data/{domain}/{task}/"
commit_message = f"adds {domain} {task} {n_episodes=}"
HfApi().create_repo(
repo_id="gia-project/gia-dataset-parquet", private=False, exist_ok=True, repo_type="dataset"
repo_id="gia-project/gia-dataset", private=False, exist_ok=True, repo_type="dataset"
)

upload_folder(
repo_id="gia-project/gia-dataset-parquet",
repo_id="gia-project/gia-dataset",
commit_message=commit_message,
folder_path=tmpdirname,
path_in_repo=path_in_repo,
Expand Down
2 changes: 1 addition & 1 deletion examples/load_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


# Load the dataset
dataset = load_dataset("gia-project/gia-dataset-parquet", "mujoco-ant", split="train")
dataset = load_dataset("gia-project/gia-dataset", "mujoco-ant", split="train")

print(
f"""
Expand Down
2 changes: 1 addition & 1 deletion gia/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def generate_model_card(model_name: str, scores_dict: Optional[Dict[str, List[fl
tags=tags,
eval_results=generate_rl_eval_results(scores_dict) if scores_dict is not None else None,
model_name=model_name,
datasets="gia-project/gia-dataset-parquet",
datasets="gia-project/gia-dataset",
pipeline_tag="reinforcement-learning",
)
card = ModelCard.from_template(
Expand Down
6 changes: 3 additions & 3 deletions scripts/download_all_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@

tasks = parser.parse_args().tasks
if tasks == ["all"]:
tasks = get_dataset_config_names("gia-project/gia-dataset-parquet") # get all task names from gia dataset
tasks = get_dataset_config_names("gia-project/gia-dataset") # get all task names from gia dataset

for task in tasks:
print(f"Loading {task}...")
cache_path = f"{HF_DATASETS_CACHE}/gia-project/gia-dataset-parquet/{task}"
cache_path = f"{HF_DATASETS_CACHE}/gia-project/gia-dataset/{task}"
if not os.path.exists(cache_path):
if task == "oscar":
dataset = load_dataset("ClementRomac/cleaned_deduplicated_oscar")
else:
dataset = load_dataset("gia-project/gia-dataset-parquet", task)
dataset = load_dataset("gia-project/gia-dataset", task)
dataset.save_to_disk(cache_path)
12 changes: 6 additions & 6 deletions scripts/train_gia.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,24 +122,24 @@ def main():
dataset_dict = {}
if HF_DATASETS_OFFLINE:
for task in tasks:
if not os.path.exists(f"{HF_DATASETS_CACHE}/gia-project/gia-dataset-parquet/{task}"):
if not os.path.exists(f"{HF_DATASETS_CACHE}/gia-project/gia-dataset/{task}"):
raise ValueError(
f"""Dataset {task} not found in {HF_DATASETS_CACHE}/gia-project/gia-dataset-parquet/
f"""Dataset {task} not found in {HF_DATASETS_CACHE}/gia-project/gia-dataset/
Make sure to download and save it first with
```
from datasets import load_dataset
dataset = load_dataset('gia-project/gia-dataset-parquet', '{task}')
dataset.save_to_disk('{HF_DATASETS_CACHE}/gia-project/gia-dataset-parquet/{task}')
dataset = load_dataset('gia-project/gia-dataset', '{task}')
dataset.save_to_disk('{HF_DATASETS_CACHE}/gia-project/gia-dataset/{task}')
```"""
)
dataset = load_from_disk(f"{HF_DATASETS_CACHE}/gia-project/gia-dataset-parquet/{task}")
dataset = load_from_disk(f"{HF_DATASETS_CACHE}/gia-project/gia-dataset/{task}")
dataset_dict[task] = {s: d.to_iterable_dataset() for s, d in dataset.items()}
else:
for task in tasks:
if task == "oscar":
dataset_dict[task] = load_dataset("ClementRomac/cleaned_deduplicated_oscar", streaming=True)
else:
dataset_dict[task] = load_dataset("gia-project/gia-dataset-parquet", task, streaming=True)
dataset_dict[task] = load_dataset("gia-project/gia-dataset", task, streaming=True)

# Preprocess the dataset
for task in dataset_dict.keys():
Expand Down
Loading