-
Notifications
You must be signed in to change notification settings - Fork 609
/
Copy pathyoutubevos.py
195 lines (162 loc) · 8.26 KB
/
youtubevos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
from pathlib import Path
import os
from ltr.dataset.vos_base import VOSDatasetBase, VOSMeta
from pytracking.evaluation import Sequence
import json
from ltr.admin.environment import env_settings
from ltr.data.image_loader import jpeg4py_loader
class YouTubeVOSMeta:
""" Thin wrapper for YouTubeVOS meta data
meta.json
{
"videos": {
"<video_id>": {
"objects": {
"<object_id>": {
"category": "<category>",
"frames": [
"<frame_id>",
"<frame_id>",
]
}
}
}
}
}
# <object_id> is the same as the pixel values of object in annotated segmentation PNG files.
# <frame_id> is the 5-digit index of frame in video, and not necessary to start from 0.
"""
def __init__(self, dset_split_path):
self._data = json.load(open(dset_split_path / 'meta.json'))['videos']
def sequences(self):
return list(self._data.keys())
def seq_frames(self, seq_name):
""" All filename stems of the frames in the sequence """
frames = set()
for obj_id in self.object_ids(seq_name):
for f in self.object_frames(seq_name, obj_id):
frames.add(f)
return list(sorted(frames))
def object_ids(self, seq_name):
""" All objects in the sequence """
return list(self._data[seq_name]['objects'].keys())
def object_category(self, seq_name, obj_id):
return self._data[seq_name]['objects'][str(obj_id)]['category']
def object_frames(self, seq_name, obj_id):
return self._data[seq_name]['objects'][str(obj_id)]['frames']
def object_first_frame(self, seq_name, obj_id):
return self.object_frames(seq_name, obj_id)[0]
class YouTubeVOS(VOSDatasetBase):
"""
YoutubeVOS video object segmentation dataset.
Publication:
YouTube-VOS: A Large-Scale Video Object Segmentation Benchmark
Ning Xu, Linjie Yang, Yuchen Fan, Dingcheng Yue, Yuchen Liang, Jianchao Yang, and Thomas Huang
ECCV, 2018
https://arxiv.org/pdf/1809.03327.pdf
Download dataset from: https://youtube-vos.org/dataset/
"""
def __init__(self, root=None, version='2019', split='train', cleanup=None, all_frames=False, sequences=None,
multiobj=True, vis_threshold=10, image_loader=jpeg4py_loader):
"""
args:
root - Dataset root path. If unset, it uses the path in your local.py config.
version - '2018' or '2019'
split - 'test', 'train', 'valid', or 'jjtrain', 'jjvalid'. 'jjvalid' corresponds to a custom validation
dataset consisting of 300 videos randomly sampled from the train set. 'jjtrain' contains the
remaining videos used for training.
cleanup - List of actions to take to to clean up known problems in the dataset.
'aspects': remove frames with weird aspect ratios,
'starts': fix up start frames from original meta data
all_frames - Whether to use an "all_frames" split.
sequences - List of sequence names. Limit to a subset of sequences if not None.
multiobj - Whether the dataset will return all objects in a sequence or multiple sequences with one
object in each.
vis_threshold - Minimum number of pixels required to consider a target object "visible".
image_loader - Image loader.
"""
root = env_settings().youtubevos_dir if root is None else root
super().__init__(name="YouTubeVOS", root=Path(root), version=version, split=split, multiobj=multiobj,
vis_threshold=vis_threshold, image_loader=image_loader)
split_folder = self.split
if self.split.startswith("jj"):
split_folder = "train"
dset_path = self.root / self.version / split_folder
self._anno_path = dset_path / 'Annotations'
if all_frames:
self._jpeg_path = self.root / self.version / (split_folder + "_all_frames") / 'JPEGImages'
else:
self._jpeg_path = dset_path / 'JPEGImages'
self.meta = YouTubeVOSMeta(dset_path)
meta_path = dset_path / "generated_meta.json"
if meta_path.exists():
self.gmeta = VOSMeta(filename=meta_path)
else:
self.gmeta = VOSMeta.generate('YouTubeVOS', self._jpeg_path, self._anno_path)
self.gmeta.save(meta_path)
if all_frames:
self.gmeta.enable_all_frames(self._jpeg_path)
if self.split not in ['train', 'valid', 'test']:
self.gmeta.select_split('youtubevos', self.split)
if sequences is None:
sequences = self.gmeta.get_sequence_names()
to_remove = set()
cleanup = {} if cleanup is None else set(cleanup)
if 'aspect' in cleanup:
# Remove sequences with unusual aspect ratios
for seq_name in sequences:
a = self.gmeta.get_aspect_ratio(seq_name)
if a < 1.45 or a > 1.9:
to_remove.add(seq_name)
if 'starts' in cleanup:
# Fix incorrect start frames for some objects found with ytvos_start_frames_test()
bad_start_frames = [("0e27472bea", '2', ['00055', '00060'], '00065'),
("5937b08d69", '4', ['00000'], '00005'),
("5e1ce354fd", '5', ['00010', '00015'], '00020'),
("7053e4f41e", '2', ['00000', '00005', '00010', '00015'], '00020'),
("720e3fa04c", '2', ['00050'], '00055'),
("c73c8e747f", '2', ['00035'], '00040')]
for seq_name, obj_id, bad_frames, good_frame in bad_start_frames:
# bad_frames is from meta.json included with the dataset
# good_frame is from the generated meta - and the first actual frame where the object was seen.
if seq_name in self.meta._data:
frames = self.meta.object_frames(seq_name, obj_id)
for f in bad_frames:
frames.remove(f)
assert frames[0] == good_frame
sequences = [seq for seq in sequences if seq not in to_remove]
self.sequence_names = sequences
self._samples = []
for seq in sequences:
obj_ids = self.meta.object_ids(seq)
if self.multiobj: # Multiple objects per sample
self._samples.append((seq, obj_ids))
else: # One object per sample
self._samples.extend([(seq, [obj_id]) for obj_id in obj_ids])
print("%s loaded." % self.get_name())
if len(to_remove) > 0:
print(" %d sequences were removed, (%d remaining)." % (len(to_remove), len(sequences)))
def is_mot_dataset(self):
return self.multiobj
def _construct_sequence(self, sequence_info):
seq_name = sequence_info['sequence']
frame_names = sequence_info['frame_names']
fname_to_fid = {f: i for i, f in enumerate(frame_names)}
images, gt_segs, gt_bboxes = self.get_paths_and_bboxes(sequence_info)
init_data = dict()
for obj_id in sequence_info['object_ids']:
if obj_id == '0':
print("!")
f_name = self.meta.object_first_frame(seq_name, obj_id)
f_id = fname_to_fid[f_name]
if f_id not in init_data:
init_data[f_id] = {'object_ids': [obj_id],
'bbox': {obj_id: gt_bboxes[obj_id][f_id,:]},
'mask': os.path.join(os.path.dirname(gt_segs[f_id]), (f_name + ".png"))}
assert init_data[f_id]['mask'] in gt_segs # If this fails, some file is missing
else:
init_data[f_id]['object_ids'].append(obj_id)
init_data[f_id]['bbox'][obj_id] = gt_bboxes[obj_id][f_id,:]
return Sequence(name=seq_name, frames=images, dataset='YouTubeVOS', ground_truth_rect=gt_bboxes,
init_data=init_data, ground_truth_seg=gt_segs, object_ids=sequence_info['object_ids'],
multiobj_mode=self.multiobj)