-
Notifications
You must be signed in to change notification settings - Fork 115
/
tracker_ensemble.m
216 lines (171 loc) · 8.14 KB
/
tracker_ensemble.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
% tracker_ensemble: Correlation filter tracking with convolutional features
%
% Input:
% - video_path: path to the image sequence
% - img_files: list of image names
% - pos: initialized center position of the target in (row, col)
% - target_sz: initialized target size in (Height, Width)
% - padding: padding parameter for the search area
% - lambda: regularization term for ridge regression
% - output_sigma_factor: spatial bandwidth for the Gaussian label
% - interp_factor: learning rate for model update
% - cell_size: spatial quantization level
% - show_visualization: set to True for showing intermediate results
% Output:
% - positions: predicted target position at each frame
% - time: time spent for tracking
%
% It is provided for educational/research purposes only.
% If you find the software useful, please consider citing our paper.
%
% Hierarchical Convolutional Features for Visual Tracking
% Chao Ma, Jia-Bin Huang, Xiaokang Yang, and Ming-Hsuan Yang
% IEEE International Conference on Computer Vision, ICCV 2015
%
% Contact:
% Chao Ma ([email protected]), or
% Jia-Bin Huang ([email protected]).
function [positions, time, rects] = tracker_ensemble(video_path, img_files, pos, target_sz, ...
    padding, lambda, output_sigma_factor, interp_factor, cell_size, show_visualization)
% TRACKER_ENSEMBLE  Track one target through an image sequence with
% correlation filters learned on hierarchical convolutional features.
%
% Inputs:
%   video_path          - prefix concatenated with each image name before imread
%   img_files           - cell array of image file names, one per frame
%   pos                 - initial target centre, (row, col)
%   target_sz           - initial target size, (height, width)
%   padding             - padding parameter forwarded to get_search_window
%   lambda              - ridge-regression regularisation term
%   output_sigma_factor - spatial bandwidth factor of the Gaussian label
%   interp_factor       - learning rate of the online model update
%   cell_size           - spatial quantisation between feature cells and pixels
%   show_visualization  - true to display each frame with the tracked box
%
% Outputs:
%   positions - numel(img_files)-by-2 matrix, target centre (row, col) per frame
%   time      - seconds accumulated between tic/toc in every frame; image
%               loading and visualisation are outside the timed region
%   rects     - (optional) numel(img_files)-by-4 matrix of boxes
%               [x, y, width, height] with (x, y) the top-left corner, sized by
%               target_sz times the current scale factor
%
% If the user stops the visualisation early, the rows of positions and rects
% belonging to unprocessed frames stay zero.

    % ================================================================================
    % Environment setting
    % ================================================================================
    indLayers = [37, 28, 19];   % The CNN layers Conv5-4, Conv4-4, and Conv3-4 in VGG Net
    % Weights for combining the per-layer responses, laid out along the third
    % dimension so bsxfun can apply one weight per response map.
    nweights  = reshape([1, 0.5, 0.25], 1, 1, []);
    numLayers = length(indLayers);

    % Get image size and search window size
    im_sz     = size(imread([video_path img_files{1}]));
    window_sz = get_search_window(target_sz, im_sz, padding);

    % Compute the sigma for the Gaussian function label
    output_sigma = sqrt(prod(target_sz)) * output_sigma_factor / cell_size;

    % Size of the regression label / response map in feature cells
    l1_patch_num = floor(window_sz / cell_size);

    % Pre-compute the Fourier Transform of the Gaussian function label
    yf = fft2(gaussian_shaped_labels(output_sigma, l1_patch_num));

    % Pre-compute and cache the cosine window (for avoiding boundary discontinuity)
    cos_window = hann(size(yf, 1)) * hann(size(yf, 2))';

    % Create video interface for visualization
    if show_visualization
        update_visualization = show_video(img_files, video_path);
    end

    % Per-frame results. positions must be assigned here: it is a declared
    % output, and leaving it unset makes every call that requests outputs fail.
    num_frames = numel(img_files);
    time       = 0;
    positions  = zeros(num_frames, 2);
    rects      = zeros(num_frames, 4);

    % Note: variables ending with 'f' are in the Fourier domain.
    model_xf     = cell(1, numLayers);
    model_alphaf = cell(1, numLayers);

    current_scale_factor = 1;

    % ================================================================================
    % Start tracking
    % ================================================================================
    for frame = 1:num_frames
        im = imread([video_path img_files{frame}]); % Load the image at the current frame
        if ismatrix(im)
            % Single-channel image: replicate it into three channels.
            im = cat(3, im, im, im);
        end

        tic();
        % ================================================================================
        % Predicting the object position from the learned object model
        % ================================================================================
        if frame > 1
            % Extracting hierarchical convolutional features
            feat = extractFeature(im, pos, window_sz, cos_window, indLayers);
            % Predict position
            pos = predictPosition(feat, pos, indLayers, nweights, cell_size, l1_patch_num, ...
                model_xf, model_alphaf);
            % Scale estimation
            current_scale_factor = estimate_scale(rgb2gray(im), pos, current_scale_factor);
        else
            % NOTE(review): init_scale_para returns nothing here, so it presumably
            % keeps the scale model in its own state - confirm against its source.
            init_scale_para(rgb2gray(im), target_sz, pos);
        end

        % ================================================================================
        % Learning correlation filters over hierarchical convolutional features
        % ================================================================================
        % Extracting hierarchical convolutional features at the (updated) position
        feat = extractFeature(im, pos, window_sz, cos_window, indLayers);
        % Model update
        [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
            model_xf, model_alphaf);

        % ================================================================================
        % Save predicted position and timing
        % ================================================================================
        positions(frame, :) = pos;
        target_sz_t = target_sz * current_scale_factor;
        % Convert (row, col) centre and (height, width) size to [x, y, width, height].
        box = [pos([2, 1]) - target_sz_t([2, 1]) / 2, target_sz_t([2, 1])];
        rects(frame, :) = box;

        time = time + toc();

        % Visualization
        if show_visualization
            stop = update_visualization(frame, box);
            if stop, break, end % user pressed Esc, stop early
            drawnow
            % pause(0.05) % uncomment to run slower
        end
    end
end
function pos = predictPosition(feat, pos, indLayers, nweights, cell_size, l1_patch_num, ...
    model_xf, model_alphaf)
% PREDICTPOSITION  Move the target centre to the peak of the combined
% correlation response.
%
%   feat         - cell array of feature maps, one entry per layer
%   pos          - current centre; the returned pos is this value plus the
%                  detected displacement scaled by cell_size
%   indLayers    - layer indices; only their count is used here
%   nweights     - per-layer weights, multiplied along the third dimension
%   cell_size    - scale factor from response-map cells to pos units
%   l1_patch_num - size of one response map
%   model_xf, model_alphaf - learned per-layer model in the Fourier domain

    layer_count = length(indLayers);
    res_layer   = zeros([l1_patch_num, layer_count]);

    % One response map per layer, each normalised by its own maximum.
    for k = 1:layer_count
        zf  = fft2(feat{k});
        kzf = sum(zf .* conj(model_xf{k}), 3) / numel(zf);
        % Fast detection; fftshift puts the zero-displacement response in the
        % middle of the map.
        shifted = real(fftshift(ifft2(model_alphaf{k} .* kzf)));
        res_layer(:, :, k) = shifted / max(shifted(:));
    end

    % Weighted sum of the layer responses (see Eqn. 5)
    response = sum(bsxfun(@times, res_layer, nweights), 3);

    % First location (column-major order) that attains the maximum response.
    peak_value = max(response(:));
    [peak_row, peak_col] = find(response == peak_value, 1);

    % Displacement of the peak from the shifted origin, in cells. The size
    % comes from the last layer's transform, as all maps share one size.
    row_shift = peak_row - floor(size(zf, 1) / 2) - 1;
    col_shift = peak_col - floor(size(zf, 2) / 2) - 1;

    % Map the displacement to the image space
    pos = pos + cell_size * [row_shift, col_shift];
end
function [model_xf, model_alphaf] = updateModel(feat, yf, interp_factor, lambda, frame, ...
    model_xf, model_alphaf)
% UPDATEMODEL  Train a correlation filter on the current features and merge
% it into the running model, layer by layer.
%
%   feat          - cell array of feature maps, one entry per layer
%   yf            - Fourier transform of the regression label
%   interp_factor - learning rate; weight given to the newly trained filter
%   lambda        - regularisation added to the kernel auto-correlation
%   frame         - frame number; frame 1 initialises the model
%   model_xf, model_alphaf - running model, returned updated
%
% On frame 1 the model is replaced by the freshly trained filter. On later
% frames it becomes (1 - interp_factor) * old + interp_factor * new.

    is_first_frame = (frame == 1);
    keep_factor    = 1 - interp_factor;

    for layer = 1:length(feat)
        % Train on the current frame only (fast training in the Fourier domain).
        new_xf     = fft2(feat{layer});
        kf         = sum(new_xf .* conj(new_xf), 3) / numel(new_xf);
        new_alphaf = yf ./ (kf + lambda);

        if is_first_frame
            % First frame: the model is exactly this single training result.
            model_alphaf{layer} = new_alphaf;
            model_xf{layer}     = new_xf;
        else
            % Later frames: blend the new result into the running model.
            model_alphaf{layer} = keep_factor * model_alphaf{layer} + interp_factor * new_alphaf;
            model_xf{layer}     = keep_factor * model_xf{layer} + interp_factor * new_xf;
        end
    end
end
function feat = extractFeature(im, pos, window_sz, cos_window, indLayers)
% EXTRACTFEATURE  Crop the search window around pos and compute its
% hierarchical convolutional features.
%
%   im         - current frame
%   pos        - window centre passed to get_subwindow
%   window_sz  - size of the search window
%   cos_window - cosine window forwarded to get_features
%   indLayers  - layer indices forwarded to get_features
%
% Returns whatever get_features produces for the cropped window.

    feat = get_features(get_subwindow(im, pos, window_sz), cos_window, indLayers);
end