-
Notifications
You must be signed in to change notification settings - Fork 4
/
clusterWormTrackCompiler.m
executable file
·321 lines (264 loc) · 11.4 KB
/
clusterWormTrackCompiler.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
function clusterWormTrackCompiler(filePath,fileOutput)
% Run after all images have straightened and segmented, and all the track
% matrices have been created. This program is built to run on della. It
% takes all of the positional fingerprints (track matrices) and clusters
% them. The input is the file path to the main folder, which has a folder
% with all of the track matrices (from clusterWormTracker). It also
% requires a pointsstats.mat file in the folder. The program has no output,
% but ssaves a pointStats2.mat folder, which adds a trackIdx field to the
% original pointStats.
% get folder
if nargin==1
fileOutput=filePath;
end
display(filePath);
display(fileOutput);
%%
dataFolder=fileparts(filePath);
if isempty(dataFolder)
dataFolder=pwd;
end
display(['Parent Folder is ' dataFolder]);
display([ 'PS file is ' filePath]);
%load all pointStats with, which has all trackMatrix data
[~,pointStats]=compileTrackMatrix(dataFolder);
PS_ref=load([dataFolder filesep 'pointStatsRef']);
PS_ref=PS_ref.PS_ref;
% fill in indexing holes with error fits,
for iPS=1:length(pointStats)
if isempty(pointStats(iPS).stackIdx);
pointStats(iPS).stackIdx=iPS;
end
end
%find which pointStats had tracking completed
track_present=cellfun(@(x) ~isempty(x),{pointStats.TrackMatrixi},'uniform',0);
N=find(cell2mat(track_present),1,'last');
display([ 'Number of frames is ' num2str(N)]);
%only go up to last trackmatrix present
pointStats=pointStats(1:N);
presentIdx=1:length(pointStats);
%% do matching of tracks by clustering similarities in matching matrices
%build cell array of all track matrices
TrackMatrix={pointStats(presentIdx).TrackMatrixi};
n_ref_neurons=cellfun(@(x) size(x,1),{PS_ref.straightPoints});
%% loop through all track matrices to build binary feature vectors
%each track matrix has an n neurons by t time points vector with each
%neuron having a number (match index) for each time. To build the binary
%feature vector
%build binary sparce matrix, referred to as a transition matrix
transitionMatrixCell=...
cellfun(@(x) oneHotNeuron(x,n_ref_neurons),TrackMatrix,'Uniform',0);
%% build training set on first 800 time points
%if this fails, continuously decrease the number of training time points
%until it works.
for iTry=0:5
%% try loop to try to work around out of memory issues
NTrainingRange=900-100*iTry;
NTrainingRange=min(NTrainingRange,N-1);
nTraining=min(NTrainingRange,N-1);
display(['Attempting nTraining of ' num2str(nTraining)]);
nSelect=round(1:NTrainingRange/nTraining:NTrainingRange);
try
%% correlation and cluster, can take up to 10 minutes
%make subset of transition matrix
subTranstionMatrix=cell2mat(transitionMatrixCell(nSelect)');
%calculate distance matrix as correlation distance using custom
%code for sparce correlations.
tic;
tcorr2=sparseTransitionCorr(subTranstionMatrix',[],1);
display(['Distance calculated in ' num2str(toc) 's']);
%get lower triangular section of the matrix for using clustering
tcorr2_lin=tcorr2(tril(true(length(tcorr2)),-1));
%% cluster
%find clusters with a cutoff distance .9
Z=linkage(1-tcorr2_lin','complete');
%clusters_initial is a linear list for each neuron in the subtransition matrix
%with a number indicating which cluster that neuron belongs to
clusters_initial=cluster(Z,'cutoff',.9,'criterion','distance');
%if clusters successfully calculated, exit loop.
display(['Success with nTraining of ' num2str(nTraining)]);
break
catch me
display(['Failed with nTraining of ' num2str(nTraining)]);
me
end
end
%% remove nans and zeros
cluster_items=unique(clusters_initial(~isnan(clusters_initial)));
cluster_items(cluster_items==0)=[];
n_clusters=max(cluster_items);
%% correction factor, add groups that are close but were not clustered
%build a new distance matrix with distance between formed clusters.
subTcorr=zeros(length(tcorr2),n_clusters);
subTcorr2=zeros(n_clusters);
% average correlations matrices from each subgoup in each of the two
% directions
for i=1:n_clusters
i_cluster=cluster_items(i);
subTcorr(:,i_cluster)=mean(tcorr2(:,clusters_initial==i_cluster),2);
end
for i=1:n_clusters
i_cluster=cluster_items(i);
subTcorr2(i_cluster,:)=mean(subTcorr(clusters_initial==i_cluster,:),1);
end
%get value dy the diagnol, use it as distribution to determine whether off
%diagnol terms are significant.
inGroup=subTcorr2(eye(n_clusters)>0);
%make a cutoff to determine if diagnol terms should be significant.
regroupCutoff=max(mean(inGroup)-std(inGroup)*2,.25);
%find clusters matched above cutoff
[x,y,v]=find(subTcorr2.*((triu(subTcorr2,1)>regroupCutoff)));
[~,ia]=sort(v,'descend');
x=x(ia);
y=y(ia);
%% recombine clusters that are determined to have been matched together.
for i=1:length(x)
% get the neurontimes that belong into the clusters to be merged
newSumx=sum(clusters_initial==x(i));
newSumy=sum(clusters_initial==y(i));
%the new number in that cluster is the sum of the two old clusters
newSum=newSumx+newSumy;
%if the sum of the two clusters is not more than the number of training
%times x 1.2, just straight merge them
if newSum<nTraining*1.2
clusters_initial(clusters_initial==x(i))=y(i);
else
%if they are larger combine the two clusters and recluster them so
%that the number of objects is less then 1.2xnTraining.
checkx=find(clusters_initial==x(i));
checky=find(clusters_initial==y(i));
check_set=[checkx;checky];
%make distance matrix for subset
checkCorr=tcorr2(check_set,check_set);
checkCorr=checkCorr(tril(true(newSum),-1));
%make clusters
Ztest=linkage(1-full(checkCorr)','complete');
recluster=cluster(Ztest,'maxclust',2);
if max(accumarray(recluster,ones(1,newSum)))<nTraining*1.2
clusters_initial(check_set(recluster==1))=x(i);
clusters_initial(check_set(recluster==2))=y(i);
end
end
end
%% order clusters, remove bad ones.
%add one to clusters, anything equal to one will be removed
clusters_initial=clusters_initial+1;
%how many in each cluster
caccum=hist(clusters_initial,1:max(clusters_initial));
% remove clusteres smaller than 40% of training, or larger
% than 120%
caccumN=find(caccum<nTraining*.4|caccum>nTraining*1.2); %bad clusters
% set bad clusters and nans to one for later removal.
clusters_initial(ismember(clusters_initial,caccumN))=1;
clusters_initial(isnan(clusters_initial))=1;
%% raname based on size of cluster, so cluster 1 will be largest
caccum=accumarray(clusters_initial,ones(size(clusters_initial)));
[~,iaAccum]=sort(caccum,'descend'); %the rank of each cluster
[~,iaAccum]=sort(iaAccum);
% ib is posotion of unique terms)
%cluster_items are the unique labels for the clusters
[cluster_items,~,ib]=unique(clusters_initial);
cluster_items=iaAccum(cluster_items);
cluster_assign=cluster_items(ib);
cluster_assign(cluster_assign==1)=nan;
cluster_assign=cluster_assign-1;
%% make new distance matrix between clusters for saving,
%not currently used elsewhere
cluster_items=unique(cluster_assign(~isnan(cluster_assign)));
cluster_items(cluster_items==0)=[];
n_clusters=max(cluster_items);
subTcorr=zeros(length(tcorr2),n_clusters);
subTcorr2=zeros(n_clusters);
% average correlations matrices from each subgoup
for i=1:n_clusters
i_cluster=cluster_items(i);
subTcorr(:,i_cluster)=mean(tcorr2(:,cluster_assign==i_cluster),2);
end
for i=1:n_clusters
i_cluster=cluster_items(i);
subTcorr2(i_cluster,:)=mean(subTcorr(cluster_assign==i_cluster,:),1);
end
%% find "basis" by averaging over a cluster.
masterVec=zeros(n_clusters,size(subTranstionMatrix,2));
totalVec=zeros(n_clusters,size(subTranstionMatrix,2));
masterVecVar=zeros(n_clusters,size(subTranstionMatrix,2));
%
for i=1:n_clusters
%loop over clusters get average, sum and std of the trainingset
%transition matrix for the items in a cluster
i_cluster=cluster_items(i);
masterVec(i_cluster,:)=mean(...
subTranstionMatrix(cluster_assign==i_cluster,:),1);
totalVec(i_cluster,:)=sum(...
subTranstionMatrix(cluster_assign==i_cluster,:),1);
masterVecVar(i_cluster,:)=std(....
subTranstionMatrix(cluster_assign==i_cluster,:),[],1);
end
%subtract off mean to get mean subtracted means of the trainingset
%transition matrix
masterVec=bsxfun(@minus,masterVec,mean(masterVec,1));
%make a minimum value for the std
masterVecVar(masterVecVar<.1)=.1;
%use 1/std as the weighting for each vector
masterWeights=1./masterVecVar;
%apply the weightings to the mastervectors
masterVec=normr(masterVec.*masterWeights);
%% project training set data onto master vectors for each cluster
input=subTranstionMatrix;
input=bsxfun(@rdivide,input,sqrt(sum(input.^2,2)));
input(isnan(input))=0;
%project training data onto masters, score is a n training neurons by m
%clusters, with the match score of each cluster in the rows.
score=masterVec*input';
%%
%compare the distribution of values in the projection that are expected to
%be hits and the ones expected not to be. Find the cutoff which makes us
%80% sure that a hit should be a hit.
assignedNodes=find(~isnan(cluster_assign));
cluster_assign_present=cluster_assign(~isnan(cluster_assign));
[~,ia]=sort(cluster_assign_present);
assignedNodes=assignedNodes(ia);
%make list of indices that should be theoretically
hitIdx=sub2ind(size(score),cluster_assign_present(ia),assignedNodes);
%turn that list onto a logical matrix, where each training neuron has 1
%for the cluster it belongs to and 0 everywhere else.
hitMap=false(size(score));
hitMap(hitIdx)=true;
% get the actual values of the projections at the locations we expect 1
hitValues=score(hitMap);
% and get the other values to compare
notHitValues=score(~hitMap);
%compare distributions to find cutoff
histX=0:.02:1;
%histogram two distributions
nHit=hist(hitValues,histX);
nMiss=hist(notHitValues,histX);
%find probability that a value is a hit
gamma=nHit./(nHit+nMiss);
%cutoff is where p(hit) is greater than 20%.
hitCutoff=histX(find(gamma>.2,1,'first'));
%% project data onto clustered components and detect values larger than threshold
output=cellfun(@(x) distanceClassify(masterVec,x,hitCutoff),...
transitionMatrixCell','Uniform',0);
output=cat(1,output{:});
track_cell=output(:,1);
weight_cell=output(:,2);
%% add new identities into track matrix, kill off unidentified cell
pointStats2=pointStats;
for i=1:length(pointStats2)
pointStats2(presentIdx(i)).trackIdx=track_cell{i};
pointStats2(presentIdx(i)).trackWeights=weight_cell{i};
end
%% SAVE HERE %%
fileOutput_stats=strrep(fileOutput,'.mat','_info.mat');
save(fileOutput,'pointStats2');
save(fileOutput_stats,'masterVec','hitCutoff'...
,'cluster_assign','caccum','subTcorr2')
%% write time end stamp
hostname = char( getHostName( java.net.InetAddress.getLocalHost ) );
if contains(hostname,'della')
Fid=fopen([dataFolder filesep 'status.txt'],'a');
status=[datestr(datetime('now')) ': Finished NERVE tracking \n'];
fprintf(Fid,status);
fclose(Fid);
end