class DatasetCatalog:
    """Catalog of dataset configurations: each entry pairs a dotted class
    path ("target") with the constructor kwargs ("params") used to build it."""

    def __init__(self):
        # The following datasets are used for encoding-side alignment learning.
        self.audiocap_enc = {
            "target": "dataset.audiocap_dataset.AudioCapDataset",
"params": dict(
data_path="../data/T-X_pair_data/audiocap/audiocap.json",
mm_root_path="../data/T-X_pair_data/audiocap/audios",
embed_path="../data/embed/",
dataset_type="AudioToText",
),
}
self.webvid_enc = {
"target": "dataset.webvid_dataset.WebvidDataset",
"params": dict(
data_path="../data/T-X_pair_data/webvid/webvid.json",
mm_root_path="../data/T-X_pair_data/webvid/videos",
embed_path="../data/embed/",
dataset_type="VideoToText",
),
}
self.cc3m_enc = {
"target": "dataset.cc3m_dataset.CC3MDataset",
"params": dict(
data_path="../data/T-X_pair_data/cc3m/cc3m.json",
mm_root_path="../data/T-X_pair_data/cc3m/images",
embed_path="../data/embed/",
dataset_type="ImageToText",
),
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        # The following datasets are used for decoding-side alignment learning.
self.audiocap_dec = {
"target": "dataset.audiocap_dataset.AudioCapDataset",
"params": dict(
data_path="../data/T-X_pair_data/audiocap/audiocap.json",
mm_root_path="../data/T-X_pair_data/audiocap/audios",
embed_path="../data/embed/",
dataset_type="TextToAudio",
),
}
self.webvid_dec = {
"target": "dataset.webvid_dataset.WebvidDataset",
"params": dict(
data_path="../data/T-X_pair_data/webvid/webvid.json",
mm_root_path="../data/T-X_pair_data/webvid/videos",
embed_path="../data/embed/",
dataset_type="TextToVideo",
),
}
self.cc3m_dec = {
"target": "dataset.cc3m_dataset.CC3MDataset",
"params": dict(
data_path="../data/T-X_pair_data/cc3m/cc3m.json",
mm_root_path="../data/T-X_pair_data/cc3m/images",
embed_path="../data/embed/",
dataset_type="TextToImage",
),
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        # The following are instruction datasets, used for instruction tuning.
self.audio_instruction = {
"target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
"params": dict(
data_path="../data/IT_data/T-T+X_data/audio_t2x.json",
embed_path="./embed/",
dataset_type="TextToAudio",
),
}
self.video_instruction = {
"target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
"params": dict(
data_path="../data/IT_data/T-T+X_data/video_t2x.json",
embed_path="./embed/",
dataset_type="TextToVideo",
),
}
self.image_instruction = {
"target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset",
"params": dict(
data_path="../data/IT_data/T-T+X_data/image_t2x.json",
embed_path="./embed/",
dataset_type="TextToImage",
),
}
self.llava_instruction = {
"target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
"params": dict(
data_path="../data/IT_data/T+X-T_data/llava/llava.json",
mm_root_path="../data/IT_data/T+X-T_data/llava/images",
dataset_type="ImageToText",
),
}
self.alpaca_instruction = {
"target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
"params": dict(
data_path="../data/IT_data/T+X-T_data/alpaca/alpaca.json",
dataset_type="TextToText",
),
}
self.videochat_instruction = {
"target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset",
"params": dict(
data_path="../data/IT_data/T+X-T_data/videochat/videochat.json",
dataset_type="VideoToText",
),
}
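

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Illustrative usage sketch (an addition, not part of the upstream file): each catalog
# entry pairs a dotted class path ("target") with constructor kwargs ("params"), which
# suggests entries are meant to be instantiated dynamically. The `build_dataset` helper
# below is a hypothetical example of such a loader; it assumes the referenced dataset
# modules are importable and that each dataset class accepts the stored kwargs.
if __name__ == "__main__":
    import importlib

    def build_dataset(entry):
        # Split e.g. "dataset.cc3m_dataset.CC3MDataset" into a module path and a class
        # name, import the module, and instantiate the class with the stored kwargs.
        module_name, class_name = entry["target"].rsplit(".", 1)
        dataset_cls = getattr(importlib.import_module(module_name), class_name)
        return dataset_cls(**entry["params"])

    catalog = DatasetCatalog()
    # e.g. build the CC3M image-text dataset used for encoding-side alignment
    cc3m_dataset = build_dataset(catalog.cc3m_enc)
    print(type(cc3m_dataset).__name__)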