-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_documents.py
139 lines (109 loc) · 4.4 KB
/
load_documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import json
from typing import Iterator
def get_document_iterator(file_path: str, file_format: str) -> Iterator[str]:
if file_format in ["the_pile", "oig", "c4"]:
return get_the_pile_document_iterator(file_path)
elif file_format == "flan":
return get_flan_document_iterator(file_path)
elif file_format in ["xp3", "p3"]:
return get_xp3_document_iterator(file_path)
elif file_format == "root":
return get_root_document_iterator(file_path)
elif file_format == "natural":
return get_natural_document_iterator(file_path)
elif file_format == "hh":
return get_root_document_iterator(file_path)
elif file_format == "raw":
return get_raw_document_iterator(file_path)
elif file_format == "custom":
return get_custom_document_iterator(file_path)
else:
raise NotImplementedError()
def get_hh_document_iterator(file_path: str) -> Iterator[str]:
"""
This method reads input files with similar file formats with Anthropic/hh-rlhh jsonl format.
example:
{'chosen': '\n\nHuman: What kind of noises did ...'
'rejected': '\n\nHuman: What kind of noises...'}
"""
with open(file_path, "r") as f:
for line in f:
yield json.loads(line)["chosen"]
yield json.loads(line)["rejected"]
def get_natural_document_iterator(file_path: str) -> Iterator[str]:
"""
This method reads input files with similar file formats with natural instructions json format.
"""
with open(file_path) as f:
json_dict = json.load(f)
for example in json_dict["Positive Examples"]:
yield example["input"]
yield example["output"]
yield example["explanation"]
for example in json_dict["Negative Examples"]:
yield example["input"]
yield example["output"]
yield example["explanation"]
def get_root_document_iterator(file_path: str) -> Iterator[str]:
"""
This method reads input files with similar file formats with root's parquet format.
"""
import pyarrow.parquet as pq
parquet_file = pq.ParquetFile(file_path)
for batch in parquet_file.iter_batches():
df = batch.to_pandas()
for row in df.iterrows():
yield row[1].tolist()[0]
def get_xp3_document_iterator(file_path: str) -> Iterator[str]:
"""
This method reads input files with similar file formats with xp3's jsonl format.
Example:
{"inputs":"( 1 ) Benzoyl peroxide and ..."}
"""
with open(file_path, "r") as f:
for line in f:
json_dict = json.loads(line)
yield json_dict["inputs"]
yield json_dict["targets"]
def get_flan_document_iterator(file_path: str) -> Iterator[str]:
"""
This method reads input files with similar file formats with FLAN's jsonl format.
Example:
{'id': 'task587-712049dced9641209d4f3ba815616a8f',
'input': "I was looking for ..."
'output': 'False'}
"""
with open(file_path, "r") as f:
for line in f:
json_dict = json.loads(line)
for sample in json_dict["sample"]:
yield sample["input"]
yield sample["output"]
def get_the_pile_document_iterator(file_path: str) -> Iterator[str]:
"""
This method reads input files with similar file formats with The Pile's jsonl format.
Each line of the input file should be a json string, where the document is stored in a field named "text".
There are no empty lines between json lines.
Example:
{"text": "Hello World!", "meta": {"pile_set_name": "Pile-CC"}}
{"text": "Foo bar", "meta": {"pile_set_name": "Pile-CC"}}
"""
with open(file_path, "r") as f:
for line in f:
yield json.loads(line)["text"]
def get_raw_document_iterator(file_path: str) -> Iterator[str]:
"""
This method reads input files where each line is a document. The file should not be organized
in any specific file structures such as json, jsonl, or tsv, as this may affect ngram computation.
Any characters other than the actual text content should be removed.
Example:
Hello World!
Foo bar
This is the 3rd document.
"""
with open(file_path, "r") as f:
for line in f:
yield line.rstrip("\n")
def get_custom_document_iterator(file_path: str) -> Iterator[str]:
"""Define your own document reading method"""
raise NotImplementedError()