diff --git a/core/checkModelStruct.m b/core/checkModelStruct.m index 7526d590..1ba21cb0 100755 --- a/core/checkModelStruct.m +++ b/core/checkModelStruct.m @@ -120,6 +120,21 @@ function checkModelStruct(model,throwErrors,trimWarnings) EM='The "grRules" field must be a cell array of strings'; dispEM(EM,throwErrors); end + if ~isfield(model,'genes') + EM='If "grRules" field exists, the model should also contain a "genes" field'; + dispEM(EM,throwErrors); + else + geneList = strjoin(model.grRules); + geneList = regexp(geneList,' |)|(|and|or','split'); % Remove all grRule punctuation + geneList = geneList(~cellfun(@isempty,geneList)); % Remove spaces and empty genes + geneList = setdiff(unique(geneList),model.genes); + if ~isempty(geneList) + problemGrRules = model.rxns(contains(model.grRules,geneList)); + problemGrRules = strjoin(problemGrRules(:),'; '); + EM=['The reaction(s) "' problemGrRules '" contain the following genes in its "grRules" field, but these are not in the "genes" field:']; + dispEM(EM,throwErrors,geneList); + end + end end if isfield(model,'rxnComps') if ~isnumeric(model.rxnComps) @@ -262,7 +277,10 @@ function checkModelStruct(model,throwErrors,trimWarnings) EM='The following reactions have contradicting bounds:'; dispEM(EM,throwErrors,model.rxns(model.lb>model.ub),trimWarnings); EM='The following reactions have bounds contradicting their reversibility:'; -dispEM(EM,throwErrors,model.rxns(model.lb<0 & model.rev==0),trimWarnings); +contradictBound = (model.lb < 0 & model.ub > 0 & model.rev==0) | ... % Reversible bounds, irreversible label + (model.lb < 0 & model.ub <= 0 & model.rev==1) | ... % Negative bounds, reversible label + (model.lb >= 0 & model.ub > 0 & model.rev==1); % Positive bounds, reversible label +dispEM(EM,throwErrors,model.rxns(contradictBound),trimWarnings); %Multiple or no objective functions not allowed in SBML L3V1 FBCv2 if numel(find(model.c))>1 diff --git a/core/constructS.m b/core/constructS.m index 87ac8c89..5412cf24 100755 --- a/core/constructS.m +++ b/core/constructS.m @@ -152,7 +152,8 @@ strjoin(unique(metsToS(~metsPresent)),', ')],'') else missingMet = find(~metsPresent); - missingMet = char(strcat(metsToS(missingMet),' (reaction:',rxns(rxnsToS(missingMet)),')\n')); + missingMet = strcat(metsToS(missingMet),' (reaction:',rxns(rxnsToS(missingMet)),')\n'); + missingMet = strjoin(missingMet,''); error(['Could not find the following metabolites (reaction indicated) in the metabolite list: \n' ... missingMet '%s'],''); end diff --git a/core/getExchangeRxns.m b/core/getExchangeRxns.m index 459bc095..6433a4e6 100755 --- a/core/getExchangeRxns.m +++ b/core/getExchangeRxns.m @@ -1,47 +1,82 @@ function [exchangeRxns, exchangeRxnsIndexes]=getExchangeRxns(model,reactionType) % getExchangeRxns -% Retrieves the exchange reactions from a model +% Retrieves the exchange reactions from a model. Exchange reactions are +% identified by having either no substrates or products. % +% Input: % model a model structure -% reactionType retrieve all reactions ('both'), only production -% ('out'), or only consumption ('in') (optional, default -% 'both') +% reactionType which exchange reactions should be returned +% 'all' all reactions, irrespective of reaction +% bounds +% 'uptake' reactions with bounds that imply that +% only uptake are allowed. Reaction +% direction, upper and lower bounds are +% all considered +% 'excrete' reactions with bounds that imply that +% only excretion are allowed. Reaction +% direction, upper and lower bounds are +% all considered +% 'reverse' reactions with non-zero upper and lower +% bounds that imply that both uptake and +% excretion are allowed +% 'blocked' reactions that have zero upper and lower +% bounds, not allowing any flux +% 'in' reactions where the boundary metabolite +% is the substrate of the reaction, a +% positive flux value would imply uptake, +% but reaction bounds are not considered +% 'out' reactions where the boundary metabolite +% is the substrate of the reaction, a +% positive flux value would imply uptake, +% but reaction bounds are not considered. % +% Output: % exchangeRxns cell array with the IDs of the exchange reactions % exchangeRxnsIndexes vector with the indexes of the exchange reactions % -% Exchange reactions are defined as reactions which involve only products -% or only reactants. If the unconstrained field is present, then that is -% used instead. +% Note: +% The union of 'in' and 'out' equals 'all'. Also, the union of 'uptake', +% 'excrete', 'reverse' and 'blocked' equals all. % % Usage: [exchangeRxns,exchangeRxnsIndexes]=getExchangeRxns(model,reactionType) if nargin<2 - reactionType='both'; + reactionType='all'; else reactionType=char(reactionType); end -hasNoProducts=sparse(numel(model.rxns),1); -hasNoReactants=sparse(numel(model.rxns),1); - -if isfield(model,'unconstrained') - if strcmpi(reactionType,'both') || strcmpi(reactionType,'out') - [~, I]=find(model.S(model.unconstrained~=0,:)>0); - hasNoProducts(I)=true; - end - if strcmpi(reactionType,'both') || strcmpi(reactionType,'in') - [~, I]=find(model.S(model.unconstrained~=0,:)<0); - hasNoReactants(I)=true; - end +% Find exchange reactions +if isfield(model, 'unconstrained') + [~, I]=find(model.S(model.unconstrained~=0,:)>0); + hasNoProd(I)=true; + [~, I]=find(model.S(model.unconstrained~=0,:)<0); + hasNoSubs(I)=true; else - if strcmpi(reactionType,'both') || strcmpi(reactionType,'out') - hasNoProducts=sum((model.S>0))==0; - end - if strcmpi(reactionType,'both') || strcmpi(reactionType,'in') - hasNoReactants=sum((model.S<0))==0; - end + hasNoProd = transpose(find(sum(model.S>0)==0)); + hasNoSubs = transpose(find(sum(model.S<0)==0)); +end +allExch = [hasNoProd; hasNoSubs]; + +switch reactionType + case {'both','all'} % For legacy reasons, 'both' is also allowed + exchangeRxnsIndexes = allExch; + case 'in' + exchangeRxnsIndexes = hasNoSubs; + case 'out' + exchangeRxnsIndexes = hasNoProd; + case 'blocked' + exchangeRxnsIndexes = allExch(model.lb(allExch) == 0 & model.ub(allExch) == 0); + case 'reverse' + exchangeRxnsIndexes = allExch(model.lb(allExch) < 0 & model.ub(allExch) > 0); + case 'uptake' + exchangeRxnsIndexes = allExch([(model.lb(hasNoSubs) >= 0 & model.ub(hasNoSubs) > 0); ... + (model.lb(hasNoProd) < 0 & model.ub(hasNoProd) <= 0)]); + case 'excrete' + exchangeRxnsIndexes = allExch([(model.lb(hasNoSubs) < 0 & model.ub(hasNoSubs) <= 0); ... + (model.lb(hasNoProd) >= 0 & model.ub(hasNoProd) > 0)]); + otherwise + error('Invalid reactionType specified') end -exchangeRxnsIndexes=find(hasNoProducts(:) | hasNoReactants(:)); -exchangeRxns=model.rxns(exchangeRxnsIndexes); +exchangeRxns = model.rxns(exchangeRxnsIndexes); end diff --git a/doc/core/checkModelStruct.html b/doc/core/checkModelStruct.html index 2b49a88b..7cc327fe 100644 --- a/doc/core/checkModelStruct.html +++ b/doc/core/checkModelStruct.html @@ -179,305 +179,323 @@

SOURCE CODE ^'The "grRules" field must be a cell array of strings'; 0121 dispEM(EM,throwErrors); 0122 end -0123 end -0124 if isfield(model,'rxnComps') -0125 if ~isnumeric(model.rxnComps) -0126 EM='The "rxnComps" field must be of type "double"'; -0127 dispEM(EM,throwErrors); -0128 end -0129 end -0130 if isfield(model,'inchis') -0131 if ~iscellstr(model.inchis) -0132 EM='The "inchis" field must be a cell array of strings'; -0133 dispEM(EM,throwErrors); -0134 end -0135 end -0136 if isfield(model,'metSmiles') -0137 if ~iscellstr(model.metSmiles) -0138 EM='The "metSmiles" field must be a cell array of strings'; -0139 dispEM(EM,throwErrors); -0140 end -0141 end -0142 if isfield(model,'metFormulas') -0143 if ~iscellstr(model.metFormulas) -0144 EM='The "metFormulas" field must be a cell array of strings'; -0145 dispEM(EM,throwErrors); -0146 end -0147 end -0148 if isfield(model,'metCharges') -0149 if ~isnumeric(model.metCharges) -0150 EM='The "metCharges" field must be a double'; -0151 dispEM(EM,throwErrors); -0152 end -0153 end -0154 if isfield(model,'metDeltaG') -0155 if ~isnumeric(model.metDeltaG) -0156 EM='The "metDeltaG" field must be a double'; -0157 dispEM(EM,throwErrors); -0158 end -0159 end -0160 if isfield(model,'subSystems') -0161 for i=1:numel(model.subSystems) -0162 if ~iscell(model.subSystems{i,1}) -0163 EM='The "subSystems" field must be a cell array'; -0164 dispEM(EM,throwErrors); -0165 end -0166 end -0167 end -0168 if isfield(model,'eccodes') -0169 if ~iscellstr(model.eccodes) -0170 EM='The "eccodes" field must be a cell array of strings'; -0171 dispEM(EM,throwErrors); -0172 end -0173 end -0174 if isfield(model,'unconstrained') -0175 if ~isnumeric(model.unconstrained) -0176 EM='The "unconstrained" field must be of type "double"'; -0177 dispEM(EM,throwErrors); -0178 end -0179 end -0180 if isfield(model,'rxnNotes') -0181 if ~iscellstr(model.rxnNotes) -0182 EM='The "rxnNotes" field must be a cell array of strings'; -0183 dispEM(EM,throwErrors); -0184 end -0185 end -0186 if isfield(model,'rxnReferences') -0187 if ~iscellstr(model.rxnReferences) -0188 EM='The "rxnReferences" field must be a cell array of strings'; -0189 dispEM(EM,throwErrors); -0190 end -0191 end -0192 if isfield(model,'rxnConfidenceScores') -0193 if ~isnumeric(model.rxnConfidenceScores) -0194 EM='The "rxnConfidenceScores" field must be a double'; -0195 dispEM(EM,throwErrors); -0196 end -0197 end -0198 if isfield(model,'rxnDeltaG') -0199 if ~isnumeric(model.rxnDeltaG) -0200 EM='The "rxnDeltaG" field must be a double'; -0201 dispEM(EM,throwErrors); -0202 end -0203 end -0204 -0205 %Empty strings -0206 if isempty(model.id) -0207 EM='The "id" field cannot be empty'; -0208 dispEM(EM,throwErrors); -0209 end -0210 if any(cellfun(@isempty,model.rxns)) -0211 EM='The model contains empty reaction IDs'; -0212 dispEM(EM,throwErrors); -0213 end -0214 if any(cellfun(@isempty,model.mets)) -0215 EM='The model contains empty metabolite IDs'; -0216 dispEM(EM,throwErrors); -0217 end -0218 if any(cellfun(@isempty,model.comps)) -0219 EM='The model contains empty compartment IDs'; -0220 dispEM(EM,throwErrors); -0221 end -0222 EM='The following metabolites have empty names:'; -0223 dispEM(EM,throwErrors,model.mets(cellfun(@isempty,model.metNames)),trimWarnings); -0224 -0225 if isfield(model,'genes') -0226 if any(cellfun(@isempty,model.genes)) -0227 EM='The model contains empty gene IDs'; -0228 dispEM(EM,throwErrors); -0229 end -0230 end -0231 -0232 %Duplicates -0233 EM='The following reaction IDs are duplicates:'; -0234 dispEM(EM,throwErrors,model.rxns(duplicates(model.rxns)),trimWarnings); -0235 EM='The following metabolite IDs are duplicates:'; -0236 dispEM(EM,throwErrors,model.mets(duplicates(model.mets)),trimWarnings); -0237 EM='The following compartment IDs are duplicates:'; -0238 dispEM(EM,throwErrors,model.comps(duplicates(model.comps)),trimWarnings); -0239 if isfield(model,'genes') -0240 EM='The following genes are duplicates:'; -0241 dispEM(EM,throwErrors,model.genes(duplicates(model.genes)),trimWarnings); -0242 end -0243 metInComp=strcat(model.metNames,'[',model.comps(model.metComps),']'); -0244 EM='The following metabolites already exist in the same compartment:'; -0245 dispEM(EM,throwErrors,metInComp(duplicates(metInComp)),trimWarnings); +0123 if ~isfield(model,'genes') +0124 EM='If "grRules" field exists, the model should also contain a "genes" field'; +0125 dispEM(EM,throwErrors); +0126 else +0127 geneList = strjoin(model.grRules); +0128 geneList = regexp(geneList,' |)|(|and|or','split'); % Remove all grRule punctuation +0129 geneList = geneList(~cellfun(@isempty,geneList)); % Remove spaces and empty genes +0130 geneList = setdiff(unique(geneList),model.genes); +0131 if ~isempty(geneList) +0132 problemGrRules = model.rxns(contains(model.grRules,geneList)); +0133 problemGrRules = strjoin(problemGrRules(:),'; '); +0134 EM=['The reaction(s) "' problemGrRules '" contain the following genes in its "grRules" field, but these are not in the "genes" field:']; +0135 dispEM(EM,throwErrors,geneList); +0136 end +0137 end +0138 end +0139 if isfield(model,'rxnComps') +0140 if ~isnumeric(model.rxnComps) +0141 EM='The "rxnComps" field must be of type "double"'; +0142 dispEM(EM,throwErrors); +0143 end +0144 end +0145 if isfield(model,'inchis') +0146 if ~iscellstr(model.inchis) +0147 EM='The "inchis" field must be a cell array of strings'; +0148 dispEM(EM,throwErrors); +0149 end +0150 end +0151 if isfield(model,'metSmiles') +0152 if ~iscellstr(model.metSmiles) +0153 EM='The "metSmiles" field must be a cell array of strings'; +0154 dispEM(EM,throwErrors); +0155 end +0156 end +0157 if isfield(model,'metFormulas') +0158 if ~iscellstr(model.metFormulas) +0159 EM='The "metFormulas" field must be a cell array of strings'; +0160 dispEM(EM,throwErrors); +0161 end +0162 end +0163 if isfield(model,'metCharges') +0164 if ~isnumeric(model.metCharges) +0165 EM='The "metCharges" field must be a double'; +0166 dispEM(EM,throwErrors); +0167 end +0168 end +0169 if isfield(model,'metDeltaG') +0170 if ~isnumeric(model.metDeltaG) +0171 EM='The "metDeltaG" field must be a double'; +0172 dispEM(EM,throwErrors); +0173 end +0174 end +0175 if isfield(model,'subSystems') +0176 for i=1:numel(model.subSystems) +0177 if ~iscell(model.subSystems{i,1}) +0178 EM='The "subSystems" field must be a cell array'; +0179 dispEM(EM,throwErrors); +0180 end +0181 end +0182 end +0183 if isfield(model,'eccodes') +0184 if ~iscellstr(model.eccodes) +0185 EM='The "eccodes" field must be a cell array of strings'; +0186 dispEM(EM,throwErrors); +0187 end +0188 end +0189 if isfield(model,'unconstrained') +0190 if ~isnumeric(model.unconstrained) +0191 EM='The "unconstrained" field must be of type "double"'; +0192 dispEM(EM,throwErrors); +0193 end +0194 end +0195 if isfield(model,'rxnNotes') +0196 if ~iscellstr(model.rxnNotes) +0197 EM='The "rxnNotes" field must be a cell array of strings'; +0198 dispEM(EM,throwErrors); +0199 end +0200 end +0201 if isfield(model,'rxnReferences') +0202 if ~iscellstr(model.rxnReferences) +0203 EM='The "rxnReferences" field must be a cell array of strings'; +0204 dispEM(EM,throwErrors); +0205 end +0206 end +0207 if isfield(model,'rxnConfidenceScores') +0208 if ~isnumeric(model.rxnConfidenceScores) +0209 EM='The "rxnConfidenceScores" field must be a double'; +0210 dispEM(EM,throwErrors); +0211 end +0212 end +0213 if isfield(model,'rxnDeltaG') +0214 if ~isnumeric(model.rxnDeltaG) +0215 EM='The "rxnDeltaG" field must be a double'; +0216 dispEM(EM,throwErrors); +0217 end +0218 end +0219 +0220 %Empty strings +0221 if isempty(model.id) +0222 EM='The "id" field cannot be empty'; +0223 dispEM(EM,throwErrors); +0224 end +0225 if any(cellfun(@isempty,model.rxns)) +0226 EM='The model contains empty reaction IDs'; +0227 dispEM(EM,throwErrors); +0228 end +0229 if any(cellfun(@isempty,model.mets)) +0230 EM='The model contains empty metabolite IDs'; +0231 dispEM(EM,throwErrors); +0232 end +0233 if any(cellfun(@isempty,model.comps)) +0234 EM='The model contains empty compartment IDs'; +0235 dispEM(EM,throwErrors); +0236 end +0237 EM='The following metabolites have empty names:'; +0238 dispEM(EM,throwErrors,model.mets(cellfun(@isempty,model.metNames)),trimWarnings); +0239 +0240 if isfield(model,'genes') +0241 if any(cellfun(@isempty,model.genes)) +0242 EM='The model contains empty gene IDs'; +0243 dispEM(EM,throwErrors); +0244 end +0245 end 0246 -0247 %Elements never used (print only as warnings -0248 EM='The following reactions are empty (no involved metabolites):'; -0249 dispEM(EM,false,model.rxns(~any(model.S,1)),trimWarnings); -0250 EM='The following metabolites are never used in a reaction:'; -0251 dispEM(EM,false,model.mets(~any(model.S,2)),trimWarnings); -0252 if isfield(model,'genes') -0253 EM='The following genes are not associated to a reaction:'; -0254 dispEM(EM,false,model.genes(~any(model.rxnGeneMat,1)),trimWarnings); -0255 end -0256 I=true(numel(model.comps),1); -0257 I(model.metComps)=false; -0258 EM='The following compartments contain no metabolites:'; -0259 dispEM(EM,false,model.comps(I),trimWarnings); -0260 -0261 %Contradicting bounds -0262 EM='The following reactions have contradicting bounds:'; -0263 dispEM(EM,throwErrors,model.rxns(model.lb>model.ub),trimWarnings); -0264 EM='The following reactions have bounds contradicting their reversibility:'; -0265 dispEM(EM,throwErrors,model.rxns(model.lb<0 & model.rev==0),trimWarnings); -0266 -0267 %Multiple or no objective functions not allowed in SBML L3V1 FBCv2 -0268 if numel(find(model.c))>1 -0269 EM='Multiple objective functions found. This might be intended, but results in FBCv2 non-compliant SBML file when exported'; -0270 dispEM(EM,false,model.rxns(find(model.c)),trimWarnings); -0271 elseif ~any(model.c) -0272 EM='No objective function found. This might be intended, but results in FBCv2 non-compliant SBML file when exported'; -0273 dispEM(EM,false); -0274 end -0275 -0276 EM='The following reactions have contradicting bounds:'; -0277 dispEM(EM,throwErrors,model.rxns(model.lb>model.ub),trimWarnings); -0278 -0279 %Mapping of compartments -0280 if isfield(model,'compOutside') -0281 EM='The following compartments are in "compOutside" but not in "comps":'; -0282 dispEM(EM,throwErrors,setdiff(model.compOutside,[{''};model.comps]),trimWarnings); -0283 end +0247 %Duplicates +0248 EM='The following reaction IDs are duplicates:'; +0249 dispEM(EM,throwErrors,model.rxns(duplicates(model.rxns)),trimWarnings); +0250 EM='The following metabolite IDs are duplicates:'; +0251 dispEM(EM,throwErrors,model.mets(duplicates(model.mets)),trimWarnings); +0252 EM='The following compartment IDs are duplicates:'; +0253 dispEM(EM,throwErrors,model.comps(duplicates(model.comps)),trimWarnings); +0254 if isfield(model,'genes') +0255 EM='The following genes are duplicates:'; +0256 dispEM(EM,throwErrors,model.genes(duplicates(model.genes)),trimWarnings); +0257 end +0258 metInComp=strcat(model.metNames,'[',model.comps(model.metComps),']'); +0259 EM='The following metabolites already exist in the same compartment:'; +0260 dispEM(EM,throwErrors,metInComp(duplicates(metInComp)),trimWarnings); +0261 +0262 %Elements never used (print only as warnings +0263 EM='The following reactions are empty (no involved metabolites):'; +0264 dispEM(EM,false,model.rxns(~any(model.S,1)),trimWarnings); +0265 EM='The following metabolites are never used in a reaction:'; +0266 dispEM(EM,false,model.mets(~any(model.S,2)),trimWarnings); +0267 if isfield(model,'genes') +0268 EM='The following genes are not associated to a reaction:'; +0269 dispEM(EM,false,model.genes(~any(model.rxnGeneMat,1)),trimWarnings); +0270 end +0271 I=true(numel(model.comps),1); +0272 I(model.metComps)=false; +0273 EM='The following compartments contain no metabolites:'; +0274 dispEM(EM,false,model.comps(I),trimWarnings); +0275 +0276 %Contradicting bounds +0277 EM='The following reactions have contradicting bounds:'; +0278 dispEM(EM,throwErrors,model.rxns(model.lb>model.ub),trimWarnings); +0279 EM='The following reactions have bounds contradicting their reversibility:'; +0280 contradictBound = (model.lb < 0 & model.ub > 0 & model.rev==0) | ... % Reversible bounds, irreversible label +0281 (model.lb < 0 & model.ub <= 0 & model.rev==1) | ... % Negative bounds, reversible label +0282 (model.lb >= 0 & model.ub > 0 & model.rev==1); % Positive bounds, reversible label +0283 dispEM(EM,throwErrors,model.rxns(contradictBound),trimWarnings); 0284 -0285 %Met names which start with number -0286 I=false(numel(model.metNames),1); -0287 for i=1:numel(model.metNames) -0288 index=strfind(model.metNames{i},' '); -0289 if any(index) -0290 if any(str2double(model.metNames{i}(1:index(1)-1))) -0291 I(i)=true; -0292 end -0293 end -0294 end -0295 EM='The following metabolite IDs begin with a number directly followed by space:'; -0296 dispEM(EM,throwErrors,model.mets(I),trimWarnings); -0297 -0298 %Non-parseable composition -0299 if isfield(model,'metFormulas') -0300 [~, ~, exitFlag]=parseFormulas(model.metFormulas,true,false); -0301 EM='The composition for the following metabolites could not be parsed:'; -0302 dispEM(EM,false,model.mets(exitFlag==-1),trimWarnings); -0303 end -0304 -0305 %Check if there are metabolites with different names but the same MIRIAM -0306 %codes -0307 if isfield(model,'metMiriams') -0308 miriams=containers.Map(); -0309 for i=1:numel(model.mets) -0310 if ~isempty(model.metMiriams{i}) -0311 %Loop through and add for each miriam -0312 for j=1:numel(model.metMiriams{i}.name) -0313 %Get existing metabolite indexes -0314 current=strcat(model.metMiriams{i}.name{j},'/',model.metMiriams{i}.value{j}); -0315 if isKey(miriams,current) -0316 existing=miriams(current); -0317 else -0318 existing=[]; -0319 end -0320 miriams(current)=[existing;i]; -0321 end -0322 end -0323 end -0324 -0325 %Get all keys -0326 allMiriams=keys(miriams); -0327 -0328 hasMultiple=false(numel(allMiriams),1); -0329 for i=1:numel(allMiriams) -0330 if numel(miriams(allMiriams{i}))>1 -0331 %Check if they all have the same name -0332 if numel(unique(model.metNames(miriams(allMiriams{i}))))>1 -0333 if ~regexp(allMiriams{i},'^sbo\/SBO:') % SBO terms are expected to be multiple -0334 hasMultiple(i)=true; -0335 end -0336 end -0337 end -0338 end -0339 -0340 %Print output -0341 EM='The following MIRIAM strings are associated to more than one unique metabolite name:'; -0342 dispEM(EM,false,allMiriams(hasMultiple),trimWarnings); -0343 end -0344 -0345 %Check if there are metabolites with different names but the same InChI -0346 %codes -0347 if isfield(model,'inchis') -0348 inchis=containers.Map(); -0349 for i=1:numel(model.mets) -0350 if ~isempty(model.inchis{i}) -0351 %Get existing metabolite indexes -0352 if isKey(inchis,model.inchis{i}) -0353 existing=inchis(model.inchis{i}); -0354 else -0355 existing=[]; -0356 end -0357 inchis(model.inchis{i})=[existing;i]; -0358 end -0359 end -0360 -0361 %Get all keys -0362 allInchis=keys(inchis); -0363 -0364 hasMultiple=false(numel(allInchis),1); -0365 for i=1:numel(allInchis) -0366 if numel(inchis(allInchis{i}))>1 -0367 %Check if they all have the same name -0368 if numel(unique(model.metNames(inchis(allInchis{i}))))>1 -0369 hasMultiple(i)=true; -0370 end -0371 end -0372 end -0373 -0374 %Print output -0375 EM='The following InChI strings are associated to more than one unique metabolite name:'; -0376 dispEM(EM,false,allInchis(hasMultiple),trimWarnings); -0377 end -0378 -0379 % %Check if there are metabolites with different names but the same SMILES -0380 % if isfield(model,'metSmiles') -0381 % metSmiles=containers.Map(); -0382 % for i=1:numel(model.mets) -0383 % if ~isempty(model.metSmiles{i}) -0384 % %Get existing metabolite indexes -0385 % if isKey(metSmiles,model.metSmiles{i}) -0386 % existing=metSmiles(model.metSmiles{i}); -0387 % else -0388 % existing=[]; -0389 % end -0390 % metSmiles(model.metSmiles{i})=[existing;i]; -0391 % end -0392 % end -0393 % -0394 % %Get all keys -0395 % allmetSmiles=keys(metSmiles); -0396 % -0397 % hasMultiple=false(numel(metSmiles),1); -0398 % for i=1:numel(metSmiles) -0399 % if numel(metSmiles(metSmiles{i}))>1 -0400 % %Check if they all have the same name -0401 % if numel(unique(model.metNames(metSmiles(allmetSmiles{i}))))>1 -0402 % hasMultiple(i)=true; -0403 % end -0404 % end -0405 % end -0406 % -0407 % %Print output -0408 % EM='The following metSmiles strings are associated to more than one unique metabolite name:'; -0409 % dispEM(EM,false,allmetSmiles(hasMultiple),trimWarnings); -0410 % end -0411 end -0412 -0413 function I=duplicates(strings) -0414 I=false(numel(strings),1); -0415 [J, K]=unique(strings); -0416 if numel(J)~=numel(strings) -0417 L=1:numel(strings); -0418 L(K)=[]; -0419 I(L)=true; -0420 end -0421 end +0285 %Multiple or no objective functions not allowed in SBML L3V1 FBCv2 +0286 if numel(find(model.c))>1 +0287 EM='Multiple objective functions found. This might be intended, but results in FBCv2 non-compliant SBML file when exported'; +0288 dispEM(EM,false,model.rxns(find(model.c)),trimWarnings); +0289 elseif ~any(model.c) +0290 EM='No objective function found. This might be intended, but results in FBCv2 non-compliant SBML file when exported'; +0291 dispEM(EM,false); +0292 end +0293 +0294 EM='The following reactions have contradicting bounds:'; +0295 dispEM(EM,throwErrors,model.rxns(model.lb>model.ub),trimWarnings); +0296 +0297 %Mapping of compartments +0298 if isfield(model,'compOutside') +0299 EM='The following compartments are in "compOutside" but not in "comps":'; +0300 dispEM(EM,throwErrors,setdiff(model.compOutside,[{''};model.comps]),trimWarnings); +0301 end +0302 +0303 %Met names which start with number +0304 I=false(numel(model.metNames),1); +0305 for i=1:numel(model.metNames) +0306 index=strfind(model.metNames{i},' '); +0307 if any(index) +0308 if any(str2double(model.metNames{i}(1:index(1)-1))) +0309 I(i)=true; +0310 end +0311 end +0312 end +0313 EM='The following metabolite IDs begin with a number directly followed by space:'; +0314 dispEM(EM,throwErrors,model.mets(I),trimWarnings); +0315 +0316 %Non-parseable composition +0317 if isfield(model,'metFormulas') +0318 [~, ~, exitFlag]=parseFormulas(model.metFormulas,true,false); +0319 EM='The composition for the following metabolites could not be parsed:'; +0320 dispEM(EM,false,model.mets(exitFlag==-1),trimWarnings); +0321 end +0322 +0323 %Check if there are metabolites with different names but the same MIRIAM +0324 %codes +0325 if isfield(model,'metMiriams') +0326 miriams=containers.Map(); +0327 for i=1:numel(model.mets) +0328 if ~isempty(model.metMiriams{i}) +0329 %Loop through and add for each miriam +0330 for j=1:numel(model.metMiriams{i}.name) +0331 %Get existing metabolite indexes +0332 current=strcat(model.metMiriams{i}.name{j},'/',model.metMiriams{i}.value{j}); +0333 if isKey(miriams,current) +0334 existing=miriams(current); +0335 else +0336 existing=[]; +0337 end +0338 miriams(current)=[existing;i]; +0339 end +0340 end +0341 end +0342 +0343 %Get all keys +0344 allMiriams=keys(miriams); +0345 +0346 hasMultiple=false(numel(allMiriams),1); +0347 for i=1:numel(allMiriams) +0348 if numel(miriams(allMiriams{i}))>1 +0349 %Check if they all have the same name +0350 if numel(unique(model.metNames(miriams(allMiriams{i}))))>1 +0351 if ~regexp(allMiriams{i},'^sbo\/SBO:') % SBO terms are expected to be multiple +0352 hasMultiple(i)=true; +0353 end +0354 end +0355 end +0356 end +0357 +0358 %Print output +0359 EM='The following MIRIAM strings are associated to more than one unique metabolite name:'; +0360 dispEM(EM,false,allMiriams(hasMultiple),trimWarnings); +0361 end +0362 +0363 %Check if there are metabolites with different names but the same InChI +0364 %codes +0365 if isfield(model,'inchis') +0366 inchis=containers.Map(); +0367 for i=1:numel(model.mets) +0368 if ~isempty(model.inchis{i}) +0369 %Get existing metabolite indexes +0370 if isKey(inchis,model.inchis{i}) +0371 existing=inchis(model.inchis{i}); +0372 else +0373 existing=[]; +0374 end +0375 inchis(model.inchis{i})=[existing;i]; +0376 end +0377 end +0378 +0379 %Get all keys +0380 allInchis=keys(inchis); +0381 +0382 hasMultiple=false(numel(allInchis),1); +0383 for i=1:numel(allInchis) +0384 if numel(inchis(allInchis{i}))>1 +0385 %Check if they all have the same name +0386 if numel(unique(model.metNames(inchis(allInchis{i}))))>1 +0387 hasMultiple(i)=true; +0388 end +0389 end +0390 end +0391 +0392 %Print output +0393 EM='The following InChI strings are associated to more than one unique metabolite name:'; +0394 dispEM(EM,false,allInchis(hasMultiple),trimWarnings); +0395 end +0396 +0397 % %Check if there are metabolites with different names but the same SMILES +0398 % if isfield(model,'metSmiles') +0399 % metSmiles=containers.Map(); +0400 % for i=1:numel(model.mets) +0401 % if ~isempty(model.metSmiles{i}) +0402 % %Get existing metabolite indexes +0403 % if isKey(metSmiles,model.metSmiles{i}) +0404 % existing=metSmiles(model.metSmiles{i}); +0405 % else +0406 % existing=[]; +0407 % end +0408 % metSmiles(model.metSmiles{i})=[existing;i]; +0409 % end +0410 % end +0411 % +0412 % %Get all keys +0413 % allmetSmiles=keys(metSmiles); +0414 % +0415 % hasMultiple=false(numel(metSmiles),1); +0416 % for i=1:numel(metSmiles) +0417 % if numel(metSmiles(metSmiles{i}))>1 +0418 % %Check if they all have the same name +0419 % if numel(unique(model.metNames(metSmiles(allmetSmiles{i}))))>1 +0420 % hasMultiple(i)=true; +0421 % end +0422 % end +0423 % end +0424 % +0425 % %Print output +0426 % EM='The following metSmiles strings are associated to more than one unique metabolite name:'; +0427 % dispEM(EM,false,allmetSmiles(hasMultiple),trimWarnings); +0428 % end +0429 end +0430 +0431 function I=duplicates(strings) +0432 I=false(numel(strings),1); +0433 [J, K]=unique(strings); +0434 if numel(J)~=numel(strings) +0435 L=1:numel(strings); +0436 L(K)=[]; +0437 I(L)=true; +0438 end +0439 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/core/constructS.html b/doc/core/constructS.html index b61c0b0b..0ba99957 100644 --- a/doc/core/constructS.html +++ b/doc/core/constructS.html @@ -219,31 +219,32 @@

SOURCE CODE ^', ')],'') 0153 else 0154 missingMet = find(~metsPresent); -0155 missingMet = char(strcat(metsToS(missingMet),' (reaction:',rxns(rxnsToS(missingMet)),')\n')); -0156 error(['Could not find the following metabolites (reaction indicated) in the metabolite list: \n' ... -0157 missingMet '%s'],''); -0158 end -0159 end -0160 linearIndices=sub2ind(size(S),metsLoc,rxnsToS); -0161 S(linearIndices)=coefToS; -0162 S=sparse(S); -0163 end -0164 -0165 function equ=fixEquations(equ) -0166 %If the equation starts with "=>" or "<=>" then add a space again. This is -0167 %an alternative way to represent uptake reactions. The opposite way for -0168 %producing reactions -0169 equ=equ(:); -0170 for i=1:numel(equ) -0171 if strcmp(equ{i}(1:2),'=>') || strcmp(equ{i}(1:3),'<=>') -0172 equ{i}=[' ' equ{i}]; -0173 else -0174 if strcmp(equ{i}(end-1:end),'=>') || strcmp(equ{i}(end-2:end),'<=>') -0175 equ{i}=[equ{i} ' ']; -0176 end -0177 end -0178 end -0179 end +0155 missingMet = strcat(metsToS(missingMet),' (reaction:',rxns(rxnsToS(missingMet)),')\n'); +0156 missingMet = strjoin(missingMet,''); +0157 error(['Could not find the following metabolites (reaction indicated) in the metabolite list: \n' ... +0158 missingMet '%s'],''); +0159 end +0160 end +0161 linearIndices=sub2ind(size(S),metsLoc,rxnsToS); +0162 S(linearIndices)=coefToS; +0163 S=sparse(S); +0164 end +0165 +0166 function equ=fixEquations(equ) +0167 %If the equation starts with "=>" or "<=>" then add a space again. This is +0168 %an alternative way to represent uptake reactions. The opposite way for +0169 %producing reactions +0170 equ=equ(:); +0171 for i=1:numel(equ) +0172 if strcmp(equ{i}(1:2),'=>') || strcmp(equ{i}(1:3),'<=>') +0173 equ{i}=[' ' equ{i}]; +0174 else +0175 if strcmp(equ{i}(end-1:end),'=>') || strcmp(equ{i}(end-2:end),'<=>') +0176 equ{i}=[equ{i} ' ']; +0177 end +0178 end +0179 end +0180 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/core/getExchangeRxns.html b/doc/core/getExchangeRxns.html index 1baa037c..88e7f409 100644 --- a/doc/core/getExchangeRxns.html +++ b/doc/core/getExchangeRxns.html @@ -28,19 +28,43 @@

SYNOPSIS ^DESCRIPTION ^

 getExchangeRxns
-   Retrieves the exchange reactions from a model
+   Retrieves the exchange reactions from a model. Exchange reactions are
+   identified by having either no substrates or products.
 
+ Input:
    model               a model structure
-   reactionType        retrieve all reactions ('both'), only production
-                       ('out'), or only consumption ('in') (optional, default
-                       'both')
+   reactionType        which exchange reactions should be returned
+                       'all'     all reactions, irrespective of reaction
+                                 bounds
+                       'uptake'  reactions with bounds that imply that
+                                 only uptake are allowed. Reaction
+                                 direction, upper and lower bounds are
+                                 all considered
+                       'excrete' reactions with bounds that imply that
+                                 only excretion are allowed. Reaction
+                                 direction, upper and lower bounds are
+                                 all considered
+                       'reverse' reactions with non-zero upper and lower
+                                 bounds that imply that both uptake and
+                                 excretion are allowed
+                       'blocked' reactions that have zero upper and lower
+                                 bounds, not allowing any flux
+                       'in'      reactions where the boundary metabolite
+                                 is the substrate of the reaction, a
+                                 positive flux value would imply uptake,
+                                 but reaction bounds are not considered
+                       'out'     reactions where the boundary metabolite
+                                 is the substrate of the reaction, a
+                                 positive flux value would imply uptake,
+                                 but reaction bounds are not considered.
 
+ Output:
    exchangeRxns        cell array with the IDs of the exchange reactions
    exchangeRxnsIndexes vector with the indexes of the exchange reactions
 
-   Exchange reactions are defined as reactions which involve only products
-   or only reactants. If the unconstrained field is present, then that is
-   used instead.
+ Note:
+   The union of 'in' and 'out' equals 'all'. Also, the union of 'uptake',
+   'excrete', 'reverse' and 'blocked' equals all.
 
  Usage: [exchangeRxns,exchangeRxnsIndexes]=getExchangeRxns(model,reactionType)
@@ -59,51 +83,86 @@

CROSS-REFERENCE INFORMATION ^
 <h2><a name=SOURCE CODE ^

0001 function [exchangeRxns, exchangeRxnsIndexes]=getExchangeRxns(model,reactionType)
 0002 % getExchangeRxns
-0003 %   Retrieves the exchange reactions from a model
-0004 %
-0005 %   model               a model structure
-0006 %   reactionType        retrieve all reactions ('both'), only production
-0007 %                       ('out'), or only consumption ('in') (optional, default
-0008 %                       'both')
-0009 %
-0010 %   exchangeRxns        cell array with the IDs of the exchange reactions
-0011 %   exchangeRxnsIndexes vector with the indexes of the exchange reactions
-0012 %
-0013 %   Exchange reactions are defined as reactions which involve only products
-0014 %   or only reactants. If the unconstrained field is present, then that is
-0015 %   used instead.
-0016 %
-0017 % Usage: [exchangeRxns,exchangeRxnsIndexes]=getExchangeRxns(model,reactionType)
-0018 
-0019 if nargin<2
-0020     reactionType='both';
-0021 else
-0022     reactionType=char(reactionType);
-0023 end
-0024 
-0025 hasNoProducts=sparse(numel(model.rxns),1);
-0026 hasNoReactants=sparse(numel(model.rxns),1);
-0027 
-0028 if isfield(model,'unconstrained')
-0029     if strcmpi(reactionType,'both') || strcmpi(reactionType,'out')
-0030         [~, I]=find(model.S(model.unconstrained~=0,:)>0);
-0031         hasNoProducts(I)=true;
-0032     end
-0033     if strcmpi(reactionType,'both') || strcmpi(reactionType,'in')
-0034         [~, I]=find(model.S(model.unconstrained~=0,:)<0);
-0035         hasNoReactants(I)=true;
-0036     end
-0037 else
-0038     if strcmpi(reactionType,'both') || strcmpi(reactionType,'out')
-0039         hasNoProducts=sum((model.S>0))==0;
-0040     end
-0041     if strcmpi(reactionType,'both') || strcmpi(reactionType,'in')
-0042         hasNoReactants=sum((model.S<0))==0;
-0043     end
-0044 end
-0045 exchangeRxnsIndexes=find(hasNoProducts(:) | hasNoReactants(:));
-0046 exchangeRxns=model.rxns(exchangeRxnsIndexes);
-0047 end
+0003 % Retrieves the exchange reactions from a model. Exchange reactions are +0004 % identified by having either no substrates or products. +0005 % +0006 % Input: +0007 % model a model structure +0008 % reactionType which exchange reactions should be returned +0009 % 'all' all reactions, irrespective of reaction +0010 % bounds +0011 % 'uptake' reactions with bounds that imply that +0012 % only uptake are allowed. Reaction +0013 % direction, upper and lower bounds are +0014 % all considered +0015 % 'excrete' reactions with bounds that imply that +0016 % only excretion are allowed. Reaction +0017 % direction, upper and lower bounds are +0018 % all considered +0019 % 'reverse' reactions with non-zero upper and lower +0020 % bounds that imply that both uptake and +0021 % excretion are allowed +0022 % 'blocked' reactions that have zero upper and lower +0023 % bounds, not allowing any flux +0024 % 'in' reactions where the boundary metabolite +0025 % is the substrate of the reaction, a +0026 % positive flux value would imply uptake, +0027 % but reaction bounds are not considered +0028 % 'out' reactions where the boundary metabolite +0029 % is the substrate of the reaction, a +0030 % positive flux value would imply uptake, +0031 % but reaction bounds are not considered. +0032 % +0033 % Output: +0034 % exchangeRxns cell array with the IDs of the exchange reactions +0035 % exchangeRxnsIndexes vector with the indexes of the exchange reactions +0036 % +0037 % Note: +0038 % The union of 'in' and 'out' equals 'all'. Also, the union of 'uptake', +0039 % 'excrete', 'reverse' and 'blocked' equals all. +0040 % +0041 % Usage: [exchangeRxns,exchangeRxnsIndexes]=getExchangeRxns(model,reactionType) +0042 +0043 if nargin<2 +0044 reactionType='all'; +0045 else +0046 reactionType=char(reactionType); +0047 end +0048 +0049 % Find exchange reactions +0050 if isfield(model, 'unconstrained') +0051 [~, I]=find(model.S(model.unconstrained~=0,:)>0); +0052 hasNoProd(I)=true; +0053 [~, I]=find(model.S(model.unconstrained~=0,:)<0); +0054 hasNoSubs(I)=true; +0055 else +0056 hasNoProd = transpose(find(sum(model.S>0)==0)); +0057 hasNoSubs = transpose(find(sum(model.S<0)==0)); +0058 end +0059 allExch = [hasNoProd; hasNoSubs]; +0060 +0061 switch reactionType +0062 case {'both','all'} % For legacy reasons, 'both' is also allowed +0063 exchangeRxnsIndexes = allExch; +0064 case 'in' +0065 exchangeRxnsIndexes = hasNoSubs; +0066 case 'out' +0067 exchangeRxnsIndexes = hasNoProd; +0068 case 'blocked' +0069 exchangeRxnsIndexes = allExch(model.lb(allExch) == 0 & model.ub(allExch) == 0); +0070 case 'reverse' +0071 exchangeRxnsIndexes = allExch(model.lb(allExch) < 0 & model.ub(allExch) > 0); +0072 case 'uptake' +0073 exchangeRxnsIndexes = allExch([(model.lb(hasNoSubs) >= 0 & model.ub(hasNoSubs) > 0); ... +0074 (model.lb(hasNoProd) < 0 & model.ub(hasNoProd) <= 0)]); +0075 case 'excrete' +0076 exchangeRxnsIndexes = allExch([(model.lb(hasNoSubs) < 0 & model.ub(hasNoSubs) <= 0); ... +0077 (model.lb(hasNoProd) >= 0 & model.ub(hasNoProd) > 0)]); +0078 otherwise +0079 error('Invalid reactionType specified') +0080 end +0081 exchangeRxns = model.rxns(exchangeRxnsIndexes); +0082 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/external/kegg/getKEGGModelForOrganism.html b/doc/external/kegg/getKEGGModelForOrganism.html index 7010b5eb..e5ec6a91 100644 --- a/doc/external/kegg/getKEGGModelForOrganism.html +++ b/doc/external/kegg/getKEGGModelForOrganism.html @@ -63,15 +63,12 @@

DESCRIPTION ^SOURCE CODE ^% The hidden Markov models as generated in 2b or 0039 % downloaded from BioMet Toolbox (see below) 0040 % The final directory in dataDir should be styled as -0041 % proXXX_keggYY or eukXXX_keggYY, indicating whether +0041 % prok90_kegg105 or euk90_kegg105, indicating whether 0042 % the HMMs were trained on pro- or eukaryotic -0043 % sequences, using a sequence similarity threshold of -0044 % XXX %, fitting the KEGG version YY. E.g. -0045 % euk90_kegg105. (optional, see note about fastaFile. Note -0046 % that in order to rebuild the KEGG model from a -0047 % database dump, as opposed to using the version -0048 % supplied with RAVEN, you would still need to supply -0049 % this) -0050 % outDir directory to save the results from the quering of -0051 % the hidden Markov models. The output is specific -0052 % for the input sequences and the settings used. It -0053 % is stored in this manner so that the function can -0054 % continue if interrupted or if it should run in -0055 % parallel. Be careful not to leave output files from -0056 % different organisms or runs with different settings -0057 % in the same folder. They will not be overwritten -0058 % (optional, default is a temporary dir where all *.out -0059 % files are deleted before and after doing the -0060 % reconstruction) -0061 % keepSpontaneous include reactions labeled as "spontaneous". (optional, -0062 % default true) -0063 % keepUndefinedStoich include reactions in the form n A <=> n+1 A. These -0064 % will be dealt with as two separate metabolites -0065 % (optional, default true) -0066 % keepIncomplete include reactions which have been labelled as -0067 % "incomplete", "erroneous" or "unclear" (optional, -0068 % default true) -0069 % keepGeneral include reactions which have been labelled as -0070 % "general reaction". These are reactions on the form -0071 % "an aldehyde <=> an alcohol", and are therefore -0072 % unsuited for modelling purposes. Note that not all -0073 % reactions have this type of annotation, and the -0074 % script will therefore not be able to remove all -0075 % such reactions (optional, default false) -0076 % cutOff significance score from HMMer needed to assign -0077 % genes to a KO (optional, default 10^-50) -0078 % minScoreRatioG a gene is only assigned to KOs for which the score -0079 % is >=log(score)/log(best score) for that gene. This -0080 % is to prevent that a gene which clearly belongs to -0081 % one KO is assigned also to KOs with much lower -0082 % scores (optional, default 0.8 (lower is less strict)) -0083 % minScoreRatioKO ignore genes in a KO if their score is -0084 % <log(score)/log(best score in KO). This is to -0085 % "prune" KOs which have many genes and where some are -0086 % clearly a better fit (optional, default 0.3 (lower is -0087 % less strict)) -0088 % maxPhylDist -1: only use sequences from the same domain -0089 % (Prokaryota, Eukaryota) -0090 % other (positive) value: only use sequences for -0091 % organisms where the phylogenetic distance is at the -0092 % most this large (as calculated in getPhylDist) -0093 % (optional, default Inf, which means that all sequences -0094 % will be used) -0095 % nSequences for each KO, use up to this many sequences from the -0096 % most closely related species. This is mainly to -0097 % speed up the alignment process for KOs with very -0098 % many genes. This subsampling is performed before -0099 % running CD-HIT (optional, default inf) -0100 % seqIdentity sequence identity threshold in CD-HIT, referred as -0101 % "global sequence identity" in CD-HIT User's Guide. -0102 % If -1 is provided, CD-HIT is skipped (optional, default 0.9) -0103 % globalModel structure containing both model and KOModel -0104 % structures as generated by getModelFromKEGG. These -0105 % will otherwise be loaded by via getModelFromKEGG. -0106 % Providing globalKEGGmodel can speed up model -0107 % generation if getKEGGModelForOrganism is run -0108 % multiple times for different strains. Example: -0109 % [globalModel.model,globalModel.KOModel] = getModelFromKEGG; -0110 % (optional, default empty, global model is loaded by -0111 % getModelFromKEGG) +0043 % sequences; using which sequence similarity treshold +0044 % (first set of digits); using which KEGG version +0045 % (second set of digits). (this parameter should +0046 % ALWAYS be provided) +0047 % outDir directory to save the results from the quering of +0048 % the hidden Markov models. The output is specific +0049 % for the input sequences and the settings used. It +0050 % is stored in this manner so that the function can +0051 % continue if interrupted or if it should run in +0052 % parallel. Be careful not to leave output files from +0053 % different organisms or runs with different settings +0054 % in the same folder. They will not be overwritten +0055 % (optional, default is a temporary dir where all *.out +0056 % files are deleted before and after doing the +0057 % reconstruction) +0058 % keepSpontaneous include reactions labeled as "spontaneous". (optional, +0059 % default true) +0060 % keepUndefinedStoich include reactions in the form n A <=> n+1 A. These +0061 % will be dealt with as two separate metabolites +0062 % (optional, default true) +0063 % keepIncomplete include reactions which have been labelled as +0064 % "incomplete", "erroneous" or "unclear" (optional, +0065 % default true) +0066 % keepGeneral include reactions which have been labelled as +0067 % "general reaction". These are reactions on the form +0068 % "an aldehyde <=> an alcohol", and are therefore +0069 % unsuited for modelling purposes. Note that not all +0070 % reactions have this type of annotation, and the +0071 % script will therefore not be able to remove all +0072 % such reactions (optional, default false) +0073 % cutOff significance score from HMMer needed to assign +0074 % genes to a KO (optional, default 10^-50) +0075 % minScoreRatioG a gene is only assigned to KOs for which the score +0076 % is >=log(score)/log(best score) for that gene. This +0077 % is to prevent that a gene which clearly belongs to +0078 % one KO is assigned also to KOs with much lower +0079 % scores (optional, default 0.8 (lower is less strict)) +0080 % minScoreRatioKO ignore genes in a KO if their score is +0081 % <log(score)/log(best score in KO). This is to +0082 % "prune" KOs which have many genes and where some are +0083 % clearly a better fit (optional, default 0.3 (lower is +0084 % less strict)) +0085 % maxPhylDist -1: only use sequences from the same domain +0086 % (Prokaryota, Eukaryota) +0087 % other (positive) value: only use sequences for +0088 % organisms where the phylogenetic distance is at the +0089 % most this large (as calculated in getPhylDist) +0090 % (optional, default Inf, which means that all sequences +0091 % will be used) +0092 % nSequences for each KO, use up to this many sequences from the +0093 % most closely related species. This is mainly to +0094 % speed up the alignment process for KOs with very +0095 % many genes. This subsampling is performed before +0096 % running CD-HIT (optional, default inf) +0097 % seqIdentity sequence identity threshold in CD-HIT, referred as +0098 % "global sequence identity" in CD-HIT User's Guide. +0099 % If -1 is provided, CD-HIT is skipped (optional, default 0.9) +0100 % globalModel structure containing both model and KOModel +0101 % structures as generated by getModelFromKEGG. These +0102 % will otherwise be loaded by via getModelFromKEGG. +0103 % Providing globalKEGGmodel can speed up model +0104 % generation if getKEGGModelForOrganism is run +0105 % multiple times for different strains. Example: +0106 % [globalModel.model,globalModel.KOModel] = getModelFromKEGG; +0107 % (optional, default empty, global model is loaded by +0108 % getModelFromKEGG) +0109 % +0110 % Output: +0111 % model the reconstructed model 0112 % -0113 % Output: -0114 % model the reconstructed model -0115 % -0116 % PLEASE READ THIS: The input to this function can be confusing, because -0117 % it is intended to be run in parallel on a cluster or in multiple -0118 % sessions. It therefore saves a lot of intermediate results to storage. -0119 % This also serves the purpose of not having to do redundant -0120 % calculations. This, however, comes with the disadvantage of somewhat -0121 % trickier handling. This is what this function does: -0122 % -0123 % 1a. Loads files from a local KEGG FTP dump and constructs a general -0124 % RAVEN model representing the metabolic network. The functions -0125 % getRxnsFromKEGG, getGenesFromKEGG, getMetsFromKEGG summarise the -0126 % data into 'keggRxns.mat', 'keggGenes.mat' and 'keggMets.mat' files, -0127 % which are later merged into 'keggModel.mat' by getModelFromKEGG -0128 % function. The function getPhylDist generates 'keggPhylDist.mat' -0129 % file. KEGG FTP access requires a <a href="matlab: -0130 % web('http://www.bioinformatics.jp/en/keggftp.html')">license</a>. -0131 % 1b. Generates protein FASTA files from the KEGG FTP dump (see 1a). One -0132 % multi-FASTA file for each KO in KEGG is generated. -0133 % -0134 % The Step 1 has to be re-done every time KEGG updates their database (or -0135 % rather when the updates are large enough to warrant re-running this -0136 % part). Many users would probably never use this feature. -0137 % -0138 % 2a. Filters KO-specific protein sets. This is done by using the -0139 % settings "maxPhylDist" and "nSequences" to control which sequences -0140 % should be used for constructing Hidden Markov models (HMMs), and -0141 % later for matching your sequences to. -0142 % The most common alternatives here would be to use sequences from -0143 % only eukaryotes, only prokaryotes or all sequences in KEGG, but you -0144 % could also play around with the parameters to use e.g. only fungal -0145 % sequences. -0146 % 2b. KO-specific protein FASTA files are re-organised into -0147 % non-redundant protein sets with CD-HIT. The user can only set -0148 % seqIdentity parameter, which corresponds to '-c' parameter in -0149 % CD-HIT, described as "sequence identity threshold". CD-HIT suggsted -0150 % sequence identity specific word_length (-n) parameters are used. -0151 % 2c. Does a multi sequence alignment for multi-FASTA files obtained in -0152 % Step 2b for future use. MAFFT software with automatic selection of -0153 % alignment algorithm is used in this step ('--auto'). -0154 % 2d. Trains hidden Markov models using HMMer for each of the aligned -0155 % KO-specific FASTA files obtained in Step 2c. This is performed with -0156 % 'hmmbuild' using the default settings. -0157 % -0158 % Step 2 may be reasonable to be re-done if the user wants to tweak the -0159 % settings in proteins filtering, clustering, multi sequence alignment or -0160 % HMMs training steps. However, it requires to have KO-specific protein -0161 % FASTA files obtained in Step 1a. As such files are not provided in -0162 % RAVEN and BioMet ToolBox, the user can only generate these files from -0163 % KEGG FTP dump files, so KEGG FTP license is needed. -0164 % -0165 % 3a. Queries the HMMs with sequences for the organism you are making a -0166 % model for. This step uses both the output from step 1a and from 2d. -0167 % This is done with 'hmmsearch' function under default settings. The -0168 % significance threshold value set in 'cutOff' parameter is used -0169 % later when parsing '*.out' files to filter out KO hits with higher -0170 % value than 'cutOff' value. The results with passable E values are -0171 % summarised into KO-gene occurence matrix with E values in -0172 % intersections as 'koGeneMat'. The parameters 'minScoreRatioG' and -0173 % 'minScoreRatioKO' are then applied to 'prune' KO-gene associations -0174 % (see the function descriptions above for more details). The -0175 % intersection values for these 'prunable' associations are converted -0176 % to zeroes. -0177 % 3b. Constructs a model based on the pre-processed KO-gene association -0178 % matrix (koGeneMat). As the full KEGG model already has reaction-KO -0179 % relationships, KOs are converted into the query genes. The final -0180 % draft model contains only these reactions, which are associated -0181 % with KOs from koGeneMat. The reactions without the genes may also -0182 % be included, if the user set keepSpontaneous as 'true'. +0113 % PLEASE READ THIS: The input to this function can be confusing, because +0114 % it is intended to be run in parallel on a cluster or in multiple +0115 % sessions. It therefore saves a lot of intermediate results to storage. +0116 % This also serves the purpose of not having to do redundant +0117 % calculations. This, however, comes with the disadvantage of somewhat +0118 % trickier handling. This is what this function does: +0119 % +0120 % 1a. Loads files from a local KEGG FTP dump and constructs a general +0121 % RAVEN model representing the metabolic network. The functions +0122 % getRxnsFromKEGG, getGenesFromKEGG, getMetsFromKEGG summarise the +0123 % data into 'keggRxns.mat', 'keggGenes.mat' and 'keggMets.mat' files, +0124 % which are later merged into 'keggModel.mat' by getModelFromKEGG +0125 % function. The function getPhylDist generates 'keggPhylDist.mat' +0126 % file. KEGG FTP access requires a <a href="matlab: +0127 % web('http://www.bioinformatics.jp/en/keggftp.html')">license</a>. +0128 % 1b. Generates protein FASTA files from the KEGG FTP dump (see 1a). One +0129 % multi-FASTA file for each KO in KEGG is generated. +0130 % +0131 % The Step 1 has to be re-done every time KEGG updates their database (or +0132 % rather when the updates are large enough to warrant re-running this +0133 % part). Many users would probably never use this feature. +0134 % +0135 % 2a. Filters KO-specific protein sets. This is done by using the +0136 % settings "maxPhylDist" and "nSequences" to control which sequences +0137 % should be used for constructing Hidden Markov models (HMMs), and +0138 % later for matching your sequences to. +0139 % The most common alternatives here would be to use sequences from +0140 % only eukaryotes, only prokaryotes or all sequences in KEGG, but you +0141 % could also play around with the parameters to use e.g. only fungal +0142 % sequences. +0143 % 2b. KO-specific protein FASTA files are re-organised into +0144 % non-redundant protein sets with CD-HIT. The user can only set +0145 % seqIdentity parameter, which corresponds to '-c' parameter in +0146 % CD-HIT, described as "sequence identity threshold". CD-HIT suggsted +0147 % sequence identity specific word_length (-n) parameters are used. +0148 % 2c. Does a multi sequence alignment for multi-FASTA files obtained in +0149 % Step 2b for future use. MAFFT software with automatic selection of +0150 % alignment algorithm is used in this step ('--auto'). +0151 % 2d. Trains hidden Markov models using HMMer for each of the aligned +0152 % KO-specific FASTA files obtained in Step 2c. This is performed with +0153 % 'hmmbuild' using the default settings. +0154 % +0155 % Step 2 may be reasonable to be re-done if the user wants to tweak the +0156 % settings in proteins filtering, clustering, multi sequence alignment or +0157 % HMMs training steps. However, it requires to have KO-specific protein +0158 % FASTA files obtained in Step 1a. As such files are not provided in +0159 % RAVEN and BioMet ToolBox, the user can only generate these files from +0160 % KEGG FTP dump files, so KEGG FTP license is needed. +0161 % +0162 % 3a. Queries the HMMs with sequences for the organism you are making a +0163 % model for. This step uses both the output from step 1a and from 2d. +0164 % This is done with 'hmmsearch' function under default settings. The +0165 % significance threshold value set in 'cutOff' parameter is used +0166 % later when parsing '*.out' files to filter out KO hits with higher +0167 % value than 'cutOff' value. The results with passable E values are +0168 % summarised into KO-gene occurence matrix with E values in +0169 % intersections as 'koGeneMat'. The parameters 'minScoreRatioG' and +0170 % 'minScoreRatioKO' are then applied to 'prune' KO-gene associations +0171 % (see the function descriptions above for more details). The +0172 % intersection values for these 'prunable' associations are converted +0173 % to zeroes. +0174 % 3b. Constructs a model based on the pre-processed KO-gene association +0175 % matrix (koGeneMat). As the full KEGG model already has reaction-KO +0176 % relationships, KOs are converted into the query genes. The final +0177 % draft model contains only these reactions, which are associated +0178 % with KOs from koGeneMat. The reactions without the genes may also +0179 % be included, if the user set keepSpontaneous as 'true'. +0180 % +0181 % The Step 3 is specific to the organism for which the model is +0182 % reconstructed. 0183 % -0184 % The Step 3 is specific to the organism for which the model is -0185 % reconstructed. -0186 % -0187 % In principle the function looks at which output that is already available -0188 % and runs only the parts that are required for step 3. This means -0189 % that (see the definition of the parameters for details): -0190 % -1a is only performed if there are no KEGG model files in the -0191 % RAVEN\external\kegg directory -0192 % -1b is only performed if not all required HMMs OR aligned FASTA files -0193 % OR multi-FASTA files exist in the defined dataDir. This means that this -0194 % step is skipped if the HMMs are downloaded from BioMet Toolbox instead -0195 % (see above). If not all files exist it will try to find -0196 % the KEGG database files in dataDir. -0197 % -2a is only performed if not all required HMMs OR aligned FASTA files -0198 % files exist in the defined dataDir. This means that this step is skipped -0199 % if the HMMs are downloaded from BioMet Toolbox instead (see above). -0200 % -2b is only performed if not all required HMMs exist in the defined -0201 % dataDir. This means that this step is skipped if the FASTA files or -0202 % HMMs are downloaded from BioMet Toolbox instead (see above). -0203 % -3a is performed for the required HMMs for which no corresponding .out -0204 % file exists in outDir. This is just a way to enable the function to be -0205 % run in parallel or to resume if interrupted. -0206 % -3b is always performed. +0184 % In principle the function looks at which output that is already available +0185 % and runs only the parts that are required for step 3. This means +0186 % that (see the definition of the parameters for details): +0187 % -1a is only performed if there are no KEGG model files in the +0188 % RAVEN\external\kegg directory +0189 % -1b is only performed if not all required HMMs OR aligned FASTA files +0190 % OR multi-FASTA files exist in the defined dataDir. This means that this +0191 % step is skipped if the HMMs are downloaded from BioMet Toolbox instead +0192 % (see above). If not all files exist it will try to find +0193 % the KEGG database files in dataDir. +0194 % -2a is only performed if not all required HMMs OR aligned FASTA files +0195 % files exist in the defined dataDir. This means that this step is skipped +0196 % if the HMMs are downloaded from BioMet Toolbox instead (see above). +0197 % -2b is only performed if not all required HMMs exist in the defined +0198 % dataDir. This means that this step is skipped if the FASTA files or +0199 % HMMs are downloaded from BioMet Toolbox instead (see above). +0200 % -3a is performed for the required HMMs for which no corresponding .out +0201 % file exists in outDir. This is just a way to enable the function to be +0202 % run in parallel or to resume if interrupted. +0203 % -3b is always performed. +0204 % +0205 % These steps are specific to the organism for which you are +0206 % reconstructing the model. 0207 % -0208 % These steps are specific to the organism for which you are -0209 % reconstructing the model. -0210 % -0211 % Regarding the whole pipeline, the function checks the output that is -0212 % already available and runs only the parts that are required for step 3. -0213 % This means that (see the definition of the parameters for details): -0214 % -1a is only performed if there are no KEGG model files in the -0215 % RAVEN\external\kegg directory. -0216 % -1b is only performed if any of required KOs do not have HMMs, aligned -0217 % FASTA files, clustered FASTA files and raw FASTA files in the defined -0218 % dataDir. This means that this step is skipped if the HMMs are -0219 % downloaded from BioMet Toolbox instead (see above). If not all files -0220 % exist it will try to find the KEGG database files in dataDir. -0221 % -2ab are only performed if any of required KOs do not have HMMs, -0222 % aligned FASTA files and clustered FASTA files in the defined dataDir. -0223 % This means that this step is skipped if the HMMs are downloaded from -0224 % BioMet Toolbox instead (see above). -0225 % -2c is only performed if any of required KOs do not have HMMs and -0226 % aligned FASTA files in the defined dataDir. This means that this step -0227 % is skipped if the HMMs are downloaded from BioMet Toolbox instead (see -0228 % above). -0229 % -2d is only performed if any of required KOs do not have HMMs exist in -0230 % the defined dataDir. This means that this step is skipped if the FASTA -0231 % files or HMMs are downloaded from BioMet Toolbox instead (see above). -0232 % -3a is performed for the required HMMs for which no corresponding .out -0233 % file exists in outDir. This is just a way to enable the function to be -0234 % run in parallel or to resume if interrupted. -0235 % -3b is always performed. -0236 % -0237 % NOTE: it is also possible to obtain draft model from KEGG without -0238 % providing protein FASTA file for the target organism. In such case the -0239 % organism three-four letter abbreviation set as 'organismID' must exist -0240 % in the local KEGG database. In such case, the program just fetches all -0241 % the reactions, which are associated with given 'organismID'. -0242 % -0243 % Usage: model=getKEGGModelForOrganism(organismID,fastaFile,dataDir,... -0244 % outDir,keepSpontaneous,keepUndefinedStoich,keepIncomplete,... -0245 % keepGeneral,cutOff,minScoreRatioKO,minScoreRatioG,maxPhylDist,... -0246 % nSequences,seqIdentity) -0247 -0248 if nargin<2 || isempty(fastaFile) -0249 fastaFile=[]; -0250 else -0251 fastaFile=char(fastaFile); -0252 end -0253 if nargin<3 -0254 dataDir=[]; -0255 else -0256 dataDir=char(dataDir); -0257 end -0258 if nargin<4 || isempty(outDir) -0259 outDir=tempdir; -0260 %Delete all *.out files if any exist -0261 delete(fullfile(outDir,'*.out')); -0262 else -0263 outDir=char(outDir); +0208 % Regarding the whole pipeline, the function checks the output that is +0209 % already available and runs only the parts that are required for step 3. +0210 % This means that (see the definition of the parameters for details): +0211 % -1a is only performed if there are no KEGG model files in the +0212 % RAVEN\external\kegg directory. +0213 % -1b is only performed if any of required KOs do not have HMMs, aligned +0214 % FASTA files, clustered FASTA files and raw FASTA files in the defined +0215 % dataDir. This means that this step is skipped if the HMMs are +0216 % downloaded from BioMet Toolbox instead (see above). If not all files +0217 % exist it will try to find the KEGG database files in dataDir. +0218 % -2ab are only performed if any of required KOs do not have HMMs, +0219 % aligned FASTA files and clustered FASTA files in the defined dataDir. +0220 % This means that this step is skipped if the HMMs are downloaded from +0221 % BioMet Toolbox instead (see above). +0222 % -2c is only performed if any of required KOs do not have HMMs and +0223 % aligned FASTA files in the defined dataDir. This means that this step +0224 % is skipped if the HMMs are downloaded from BioMet Toolbox instead (see +0225 % above). +0226 % -2d is only performed if any of required KOs do not have HMMs exist in +0227 % the defined dataDir. This means that this step is skipped if the FASTA +0228 % files or HMMs are downloaded from BioMet Toolbox instead (see above). +0229 % -3a is performed for the required HMMs for which no corresponding .out +0230 % file exists in outDir. This is just a way to enable the function to be +0231 % run in parallel or to resume if interrupted. +0232 % -3b is always performed. +0233 % +0234 % NOTE: it is also possible to obtain draft model from KEGG without +0235 % providing protein FASTA file for the target organism. In such case the +0236 % organism three-four letter abbreviation set as 'organismID' must exist +0237 % in the local KEGG database. In such case, the program just fetches all +0238 % the reactions, which are associated with given 'organismID'. +0239 % +0240 % Usage: model=getKEGGModelForOrganism(organismID,fastaFile,dataDir,... +0241 % outDir,keepSpontaneous,keepUndefinedStoich,keepIncomplete,... +0242 % keepGeneral,cutOff,minScoreRatioKO,minScoreRatioG,maxPhylDist,... +0243 % nSequences,seqIdentity) +0244 +0245 if nargin<2 || isempty(fastaFile) +0246 fastaFile=[]; +0247 else +0248 fastaFile=char(fastaFile); +0249 end +0250 if nargin<3 +0251 dataDir=[]; +0252 else +0253 dataDir=char(dataDir); +0254 end +0255 if nargin<4 || isempty(outDir) +0256 outDir=tempdir; +0257 %Delete all *.out files if any exist +0258 delete(fullfile(outDir,'*.out')); +0259 else +0260 outDir=char(outDir); +0261 end +0262 if nargin<5 +0263 keepSpontaneous=true; 0264 end -0265 if nargin<5 -0266 keepSpontaneous=true; +0265 if nargin<6 +0266 keepUndefinedStoich=true; 0267 end -0268 if nargin<6 -0269 keepUndefinedStoich=true; +0268 if nargin<7 +0269 keepIncomplete=true; 0270 end -0271 if nargin<7 -0272 keepIncomplete=true; +0271 if nargin<8 +0272 keepGeneral=false; 0273 end -0274 if nargin<8 -0275 keepGeneral=false; +0274 if nargin<9 +0275 cutOff=10^-50; 0276 end -0277 if nargin<9 -0278 cutOff=10^-50; +0277 if nargin<10 +0278 minScoreRatioKO=0.3; 0279 end -0280 if nargin<10 -0281 minScoreRatioKO=0.3; +0280 if nargin<11 +0281 minScoreRatioG=0.8; 0282 end -0283 if nargin<11 -0284 minScoreRatioG=0.8; -0285 end -0286 if nargin<12 -0287 maxPhylDist=inf; -0288 %Include all sequences for each reaction -0289 end -0290 if nargin<13 -0291 nSequences=inf; -0292 %Include all sequences for each reaction +0283 if nargin<12 +0284 maxPhylDist=inf; +0285 %Include all sequences for each reaction +0286 end +0287 if nargin<13 +0288 nSequences=inf; +0289 %Include all sequences for each reaction +0290 end +0291 if nargin<14 +0292 seqIdentity=0.9; 0293 end -0294 if nargin<14 -0295 seqIdentity=0.9; -0296 end -0297 -0298 if isempty(fastaFile) -0299 fprintf(['\n*** The model reconstruction from KEGG based on the annotation available for KEGG Species <strong>' organismID '</strong> ***\n\n']); -0300 else -0301 fprintf('\n*** The model reconstruction from KEGG based on the protein homology search against KEGG Orthology specific HMMs ***\n\n'); -0302 %Check if query fasta exists -0303 fastaFile=checkFileExistence(fastaFile,2); %Copy file to temp dir -0304 end -0305 -0306 %Run the external binaries multi-threaded to use all logical cores assigned -0307 %to MATLAB -0308 cores = evalc('feature(''numcores'')'); -0309 cores = strsplit(cores, 'MATLAB was assigned: '); -0310 cores = regexp(cores{2},'^\d*','match'); -0311 cores = cores{1}; +0294 +0295 if isempty(fastaFile) +0296 fprintf(['\n*** The model reconstruction from KEGG based on the annotation available for KEGG Species <strong>' organismID '</strong> ***\n\n']); +0297 else +0298 fprintf('\n*** The model reconstruction from KEGG based on the protein homology search against KEGG Orthology specific HMMs ***\n\n'); +0299 %Check if query fasta exists +0300 fastaFile=checkFileExistence(fastaFile,2); %Copy file to temp dir +0301 end +0302 +0303 %Run the external binaries multi-threaded to use all logical cores assigned +0304 %to MATLAB +0305 cores = evalc('feature(''numcores'')'); +0306 cores = strsplit(cores, 'MATLAB was assigned: '); +0307 cores = regexp(cores{2},'^\d*','match'); +0308 cores = cores{1}; +0309 +0310 %Get the directory for RAVEN Toolbox. +0311 ravenPath=findRAVENroot(); 0312 -0313 %Get the directory for RAVEN Toolbox. -0314 ravenPath=findRAVENroot(); -0315 -0316 %Checking if dataDir is consistent. It must point to pre-trained HMMs set, -0317 %compatible with the the current RAVEN version. The user may have the -0318 %required zip file already in working directory or have it extracted. If -0319 %the zip file and directory is not here, it is downloaded from the cloud -0320 if ~isempty(dataDir) -0321 hmmOptions={'euk90_kegg105','prok90_kegg105'}; -0322 if ~endsWith(dataDir,hmmOptions) %Check if dataDir ends with any of the hmmOptions. -0323 %If not, then check whether the required folders exist anyway. -0324 if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) && ... -0325 ~isfolder(fullfile(dataDir,'fasta')) && ... -0326 ~isfolder(fullfile(dataDir,'aligned')) && ... -0327 ~isfolder(fullfile(dataDir,'hmms')) -0328 error(['Pre-trained HMMs set is not recognised. If you want download RAVEN provided sets, it should match any of the following: ' strjoin(hmmOptions,' or ')]) -0329 end -0330 else -0331 if isfolder(dataDir) && isfile(fullfile(dataDir,'hmms','K00844.hmm')) -0332 fprintf(['NOTE: Found <strong>' dataDir '</strong> directory with pre-trained HMMs, it will therefore be used during reconstruction\n']); -0333 elseif ~isfolder(dataDir) && isfile([dataDir,'.zip']) -0334 fprintf('Extracting the HMMs archive file... '); -0335 unzip([dataDir,'.zip']); -0336 fprintf('COMPLETE\n'); -0337 else -0338 hmmIndex=strcmp(dataDir,hmmOptions); -0339 if ~any(hmmIndex) -0340 error(['Pre-trained HMMs are only provided with proteins clustered at 90% sequence identity (i.e. prok90_kegg105 and euk90_kegg105). ' ... -0341 'Use either of these datasets, or otherwise download the relevant sequence data from KEGG to train HMMs with your desired sequence identity']) -0342 else -0343 fprintf('Downloading the HMMs archive file... '); -0344 try -0345 websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.8.0/',hmmOptions{hmmIndex},'.zip']); -0346 catch ME -0347 if strcmp(ME.identifier,'MATLAB:webservices:HTTP404StatusCodeError') -0348 error('Failed to download the HMMs archive file, the server returned a 404 error, try again later. If the problem persists please report it on the RAVEN GitHub Issues page: https://github.com/SysBioChalmers/RAVEN/issues') -0349 end -0350 end -0351 end -0352 +0313 %Checking if dataDir is consistent. It must point to pre-trained HMMs set, +0314 %compatible with the the current RAVEN version. The user may have the +0315 %required zip file already in working directory or have it extracted. If +0316 %the zip file and directory is not here, it is downloaded from the cloud +0317 if ~isempty(dataDir) +0318 hmmOptions={'euk90_kegg105','prok90_kegg105'}; +0319 if ~endsWith(dataDir,hmmOptions) %Check if dataDir ends with any of the hmmOptions. +0320 %If not, then check whether the required folders exist anyway. +0321 if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) && ... +0322 ~isfolder(fullfile(dataDir,'fasta')) && ... +0323 ~isfolder(fullfile(dataDir,'aligned')) && ... +0324 ~isfolder(fullfile(dataDir,'hmms')) +0325 error(['Pre-trained HMMs set is not recognised. If you want download RAVEN provided sets, it should match any of the following: ' strjoin(hmmOptions,' or ')]) +0326 end +0327 else +0328 if isfolder(dataDir) && isfile(fullfile(dataDir,'hmms','K00844.hmm')) +0329 fprintf(['NOTE: Found <strong>' dataDir '</strong> directory with pre-trained HMMs, it will therefore be used during reconstruction\n']); +0330 elseif ~isfolder(dataDir) && isfile([dataDir,'.zip']) +0331 fprintf('Extracting the HMMs archive file... '); +0332 unzip([dataDir,'.zip']); +0333 fprintf('COMPLETE\n'); +0334 else +0335 hmmIndex=strcmp(dataDir,hmmOptions); +0336 if ~any(hmmIndex) +0337 error(['Pre-trained HMMs are only provided with proteins clustered at 90% sequence identity (i.e. prok90_kegg105 and euk90_kegg105). ' ... +0338 'Use either of these datasets, or otherwise download the relevant sequence data from KEGG to train HMMs with your desired sequence identity']) +0339 else +0340 fprintf('Downloading the HMMs archive file... '); +0341 try +0342 websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.8.0/',hmmOptions{hmmIndex},'.zip']); +0343 catch ME +0344 if strcmp(ME.identifier,'MATLAB:webservices:HTTP404StatusCodeError') +0345 error('Failed to download the HMMs archive file, the server returned a 404 error, try again later. If the problem persists please report it on the RAVEN GitHub Issues page: https://github.com/SysBioChalmers/RAVEN/issues') +0346 end +0347 end +0348 end +0349 +0350 fprintf('COMPLETE\n'); +0351 fprintf('Extracting the HMMs archive file... '); +0352 unzip([dataDir,'.zip']); 0353 fprintf('COMPLETE\n'); -0354 fprintf('Extracting the HMMs archive file... '); -0355 unzip([dataDir,'.zip']); -0356 fprintf('COMPLETE\n'); -0357 end -0358 %Check if HMMs are extracted -0359 if ~isfile(fullfile(dataDir,'hmms','K00844.hmm')) -0360 error(['The HMM files seem improperly extracted and not found in ',dataDir,'/hmms. Please remove ',dataDir,' folder and rerun getKEGGModelForOrganism']); -0361 end -0362 end -0363 end -0364 -0365 %Check if the fasta-file contains '/' or'\'. If not then it's probably just -0366 %a file name. Expand to full path. -0367 if any(fastaFile) -0368 if ~any(strfind(fastaFile,'\')) && ~any(strfind(fastaFile,'/')) -0369 fastaFile=which(fastaFile); -0370 end -0371 %Create the required sub-folders in dataDir if they dont exist -0372 if ~isfolder(fullfile(dataDir,'keggdb')) -0373 mkdir(dataDir,'keggdb'); +0354 end +0355 %Check if HMMs are extracted +0356 if ~isfile(fullfile(dataDir,'hmms','K00844.hmm')) +0357 error(['The HMM files seem improperly extracted and not found in ',dataDir,'/hmms. Please remove ',dataDir,' folder and rerun getKEGGModelForOrganism']); +0358 end +0359 end +0360 end +0361 +0362 %Check if the fasta-file contains '/' or'\'. If not then it's probably just +0363 %a file name. Expand to full path. +0364 if any(fastaFile) +0365 if ~any(strfind(fastaFile,'\')) && ~any(strfind(fastaFile,'/')) +0366 fastaFile=which(fastaFile); +0367 end +0368 %Create the required sub-folders in dataDir if they dont exist +0369 if ~isfolder(fullfile(dataDir,'keggdb')) +0370 mkdir(dataDir,'keggdb'); +0371 end +0372 if ~isfolder(fullfile(dataDir,'fasta')) +0373 mkdir(dataDir,'fasta'); 0374 end -0375 if ~isfolder(fullfile(dataDir,'fasta')) -0376 mkdir(dataDir,'fasta'); +0375 if ~isfolder(fullfile(dataDir,'aligned')) +0376 mkdir(dataDir,'aligned'); 0377 end -0378 if ~isfolder(fullfile(dataDir,'aligned')) -0379 mkdir(dataDir,'aligned'); +0378 if ~isfolder(fullfile(dataDir,'hmms')) +0379 mkdir(dataDir,'hmms'); 0380 end -0381 if ~isfolder(fullfile(dataDir,'hmms')) -0382 mkdir(dataDir,'hmms'); +0381 if ~isfolder(outDir) +0382 mkdir(outDir); 0383 end -0384 if ~isfolder(outDir) -0385 mkdir(outDir); -0386 end -0387 end -0388 -0389 %First generate the full global KEGG model. Can be provided as input. -0390 %Otherwise, getModelFromKEGG is run. The dataDir must not be supplied as -0391 %there is also an internal RAVEN version available -0392 if nargin==15 -0393 model=globalModel.model; -0394 KOModel=globalModel.KOModel; -0395 elseif any(dataDir) -0396 [model, KOModel]=getModelFromKEGG(fullfile(dataDir,'keggdb'),keepSpontaneous,keepUndefinedStoich,keepIncomplete,keepGeneral); -0397 else -0398 [model, KOModel]=getModelFromKEGG([],keepSpontaneous,keepUndefinedStoich,keepIncomplete,keepGeneral); -0399 end -0400 model.id=organismID; -0401 model.c=zeros(numel(model.rxns),1); -0402 -0403 %If no FASTA file is supplied, then just remove all genes which are not for -0404 %the given organism ID -0405 if isempty(fastaFile) -0406 %Check if organismID can be found in KEGG species list or is -0407 %set to "eukaryotes" or "prokaryotes" -0408 phylDistsFull=getPhylDist(fullfile(dataDir,'keggdb'),true); -0409 if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes']) -0410 error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.'); -0411 end -0412 -0413 fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']); -0414 if ismember(organismID,{'eukaryotes','prokaryotes'}) -0415 phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); -0416 if strcmp(organismID,'eukaryotes') -0417 proxyid='hsa'; -0418 %Use H. sapiens here -0419 else -0420 proxyid='eco'; -0421 %Use E. coli here -0422 end -0423 [~, phylDistId]=ismember(proxyid,phylDists.ids); -0424 idsToKeep=phylDists.ids(~isinf(phylDists.distMat(phylDistId,:))); -0425 taxIDs=cellfun(@(x) x{1},cellfun(@(x) strsplit(x,':'),model.genes,'UniformOutput',false),'UniformOutput',false); -0426 I=ismember(upper(taxIDs),upper(idsToKeep)); -0427 else -0428 %KEGG organism IDs may have three or four letters -0429 organismID=strcat(organismID,':'); -0430 %Add colon for accurate matching -0431 if length(organismID)==4 -0432 I=cellfun(@(x) strcmpi(x(1:4),organismID),model.genes); -0433 elseif length(organismID)==5 -0434 I=cellfun(@(x) strcmpi(x(1:5),organismID),model.genes); -0435 end -0436 end -0437 %Remove those genes -0438 model.genes=model.genes(I); -0439 model.rxnGeneMat=model.rxnGeneMat(:,I); -0440 fprintf('COMPLETE\n'); -0441 end -0442 -0443 %First remove all reactions without genes -0444 if keepSpontaneous==true -0445 fprintf('Removing non-spontaneous reactions without GPR rules... '); -0446 load(fullfile(ravenPath,'external','kegg','keggRxns.mat'),'isSpontaneous'); -0447 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); -0448 spontRxnsWithGenes=model.rxns(any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous)); -0449 else -0450 fprintf('Removing reactions without GPR rules... '); -0451 I=~any(model.rxnGeneMat,2); -0452 end -0453 model=removeReactions(model,I,true); -0454 fprintf('COMPLETE\n'); -0455 -0456 %Clean gene names -0457 fprintf('Fixing gene names in the model... '); -0458 %Get rid of the prefix organism id -0459 model.genes=regexprep(model.genes,'^\w+?:',''); -0460 fprintf('COMPLETE\n'); -0461 -0462 %If no FASTA file is supplied, then we are done here -0463 if isempty(fastaFile) -0464 %Create grRules -0465 fprintf('Constructing GPR associations and annotations for the model... '); -0466 model.grRules=cell(numel(model.rxns),1); -0467 model.grRules(:)={''}; -0468 %Add the gene associations as 'or' -0469 for i=1:numel(model.rxns) -0470 %Find the involved genes -0471 I=find(model.rxnGeneMat(i,:)); -0472 if any(I) -0473 model.grRules{i}=['(' model.genes{I(1)}]; -0474 for j=2:numel(I) -0475 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; -0476 end -0477 model.grRules{i}=[model.grRules{i} ')']; -0478 end -0479 end -0480 %Fix grRules and reconstruct rxnGeneMat -0481 [grRules,rxnGeneMat] = standardizeGrRules(model); %Give detailed output -0482 model.grRules = grRules; -0483 model.rxnGeneMat = rxnGeneMat; -0484 %Add geneMiriams, assuming that it follows the syntax -0485 %kegg.genes/organismID:geneName -0486 model.geneMiriams=''; -0487 for i=1:numel(model.genes) -0488 model.geneMiriams{i,1}.name{1,1}='kegg.genes'; -0489 model.geneMiriams{i,1}.value{1,1}=strcat(lower(organismID),model.genes{i,1}); -0490 end -0491 %Add the description to the reactions -0492 for i=1:numel(model.rxns) -0493 if ~isempty(model.rxnNotes{i}) -0494 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (without HMMs).',model.rxnNotes(i)); -0495 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); -0496 else -0497 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (without HMMs)'}; -0498 end -0499 end -0500 fprintf('COMPLETE\n\n'); -0501 fprintf('*** Model reconstruction complete ***\n'); -0502 return; -0503 end -0504 -0505 %Create a phylogenetic distance structure -0506 phylDistStruct=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); -0507 [~, phylDistId]=ismember(model.id,phylDistStruct.ids); -0508 -0509 %Calculate the real maximal distance now. An abitary large number of 1000 -0510 %is used for the "all in kingdom" or "all sequences" options. This is a bit -0511 %inconvenient way to do it, but it is to make it fit with some older code -0512 if isinf(maxPhylDist) || maxPhylDist==-1 -0513 maxPhylDist=1000; -0514 end -0515 -0516 %Get the KO ids for which files have been generated. Maybe not the neatest -0517 %way.. -0518 fastaFiles=listFiles(fullfile(dataDir,'fasta','*.fa')); -0519 alignedFiles=listFiles(fullfile(dataDir,'aligned','*.fa')); -0520 alignedWorking=listFiles(fullfile(dataDir,'aligned','*.faw')); -0521 hmmFiles=listFiles(fullfile(dataDir,'hmms','*.hmm')); -0522 outFiles=listFiles(fullfile(outDir,'*.out')); -0523 -0524 %Check if multi-FASTA files should be generated. This should only be -0525 %performed if there are IDs in the KOModel structure that haven't been -0526 %parsed yet -0527 missingFASTA=setdiff(KOModel.rxns,[fastaFiles;alignedFiles;hmmFiles;outFiles]); -0528 -0529 if ~isempty(missingFASTA) -0530 if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) -0531 EM=['The file ''genes.pep'' cannot be located at ' strrep(dataDir,'\','/') '/ and should be downloaded from the KEGG FTP.\n']; -0532 dispEM(EM); -0533 end -0534 %Only construct models for KOs which don't have files already -0535 fastaModel=removeReactions(KOModel,setdiff(KOModel.rxns,missingFASTA),true,true); -0536 %Permute the order of the KOs in the model so that constructMultiFasta -0537 %can be run on several processors at once -0538 fastaModel=permuteModel(fastaModel,randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(fastaModel.rxns)),'rxns'); -0539 constructMultiFasta(fastaModel,fullfile(dataDir,'keggdb','genes.pep'),fullfile(dataDir,'fasta')); -0540 else -0541 fprintf('Generating the KEGG Orthology specific multi-FASTA files... COMPLETE\n'); -0542 end -0543 -0544 if isunix -0545 if ismac -0546 binEnd='.mac'; -0547 else -0548 binEnd=''; -0549 end -0550 elseif ispc -0551 binEnd=''; -0552 else -0553 EM='Unknown OS, exiting.'; -0554 disp(EM); -0555 return -0556 end -0557 -0558 %Check if alignment of FASTA files should be performed -0559 missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); -0560 if ~isempty(missingAligned) -0561 if seqIdentity==-1 -0562 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); -0563 else -0564 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); -0565 end -0566 missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); -0567 tmpFile=tempname; -0568 %On Windows, paths need to be translated to Unix before parsing it to WSL -0569 if ispc -0570 wslPath.tmpFile=getWSLpath(tmpFile); -0571 %mafft has problems writing to terminal (/dev/stderr) when running -0572 %on WSL via MATLAB, instead write and read progress file -0573 mafftOutput = tempname; -0574 wslPath.mafftOutput=getWSLpath(mafftOutput); -0575 wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); -0576 wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); -0577 end -0578 -0579 for i=1:numel(missingAligned) -0580 %This is checked here because it could be that it is created by a -0581 %parallel process. The faw-files are saved as temporary files to -0582 %kept track of which files are being worked on -0583 if ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])) &&... -0584 ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.fa'])) -0585 %Check that the multi-FASTA file exists. It should do so since -0586 %we are saving empty files as well. Print a warning and -0587 %continue if not -0588 if ~isfile(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])) -0589 EM=['WARNING: The multi-FASTA file for ' missingAligned{i} ' does not exist']; -0590 dispEM(EM,false); -0591 continue; -0592 end -0593 -0594 %If the multi-FASTA file is empty then save an empty aligned -0595 %file and continue -0596 s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); -0597 if s.bytes<=0 -0598 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'w'); -0599 fclose(fid); -0600 continue; -0601 end -0602 -0603 %Create an empty file to prevent other threads to start to work -0604 %on the same alignment -0605 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); -0606 fclose(fid); -0607 -0608 %First load the FASTA file, then select up to nSequences -0609 %sequences of the most closely related species, apply any -0610 %constraints from maxPhylDist, and save it as a temporary file, -0611 %and create the model from that -0612 -0613 fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); -0614 phylDist=inf(numel(fastaStruct),1); -0615 for j=1:numel(fastaStruct) -0616 %Get the organism abbreviation -0617 index=strfind(fastaStruct(j).Header,':'); -0618 if any(index) -0619 abbrev=fastaStruct(j).Header(1:index(1)-1); -0620 [~, index]=ismember(abbrev,phylDistStruct.ids); -0621 if any(index) -0622 phylDist(j)=phylDistStruct.distMat(index(1),phylDistId); -0623 end -0624 end -0625 end +0384 end +0385 +0386 %First generate the full global KEGG model. Can be provided as input. +0387 %Otherwise, getModelFromKEGG is run. The dataDir must not be supplied as +0388 %there is also an internal RAVEN version available +0389 if nargin==15 +0390 model=globalModel.model; +0391 KOModel=globalModel.KOModel; +0392 elseif any(dataDir) +0393 [model, KOModel]=getModelFromKEGG(fullfile(dataDir,'keggdb'),keepSpontaneous,keepUndefinedStoich,keepIncomplete,keepGeneral); +0394 else +0395 [model, KOModel]=getModelFromKEGG([],keepSpontaneous,keepUndefinedStoich,keepIncomplete,keepGeneral); +0396 end +0397 model.id=organismID; +0398 model.c=zeros(numel(model.rxns),1); +0399 +0400 %If no FASTA file is supplied, then just remove all genes which are not for +0401 %the given organism ID +0402 if isempty(fastaFile) +0403 %Check if organismID can be found in KEGG species list or is +0404 %set to "eukaryotes" or "prokaryotes" +0405 phylDistsFull=getPhylDist(fullfile(dataDir,'keggdb'),true); +0406 if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes']) +0407 error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.'); +0408 end +0409 +0410 fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']); +0411 if ismember(organismID,{'eukaryotes','prokaryotes'}) +0412 phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); +0413 if strcmp(organismID,'eukaryotes') +0414 proxyid='hsa'; +0415 %Use H. sapiens here +0416 else +0417 proxyid='eco'; +0418 %Use E. coli here +0419 end +0420 [~, phylDistId]=ismember(proxyid,phylDists.ids); +0421 idsToKeep=phylDists.ids(~isinf(phylDists.distMat(phylDistId,:))); +0422 taxIDs=cellfun(@(x) x{1},cellfun(@(x) strsplit(x,':'),model.genes,'UniformOutput',false),'UniformOutput',false); +0423 I=ismember(upper(taxIDs),upper(idsToKeep)); +0424 else +0425 %KEGG organism IDs may have three or four letters +0426 organismID=strcat(organismID,':'); +0427 %Add colon for accurate matching +0428 if length(organismID)==4 +0429 I=cellfun(@(x) strcmpi(x(1:4),organismID),model.genes); +0430 elseif length(organismID)==5 +0431 I=cellfun(@(x) strcmpi(x(1:5),organismID),model.genes); +0432 end +0433 end +0434 %Remove those genes +0435 model.genes=model.genes(I); +0436 model.rxnGeneMat=model.rxnGeneMat(:,I); +0437 fprintf('COMPLETE\n'); +0438 end +0439 +0440 %First remove all reactions without genes +0441 if keepSpontaneous==true +0442 fprintf('Removing non-spontaneous reactions without GPR rules... '); +0443 load(fullfile(ravenPath,'external','kegg','keggRxns.mat'),'isSpontaneous'); +0444 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); +0445 spontRxnsWithGenes=model.rxns(any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous)); +0446 else +0447 fprintf('Removing reactions without GPR rules... '); +0448 I=~any(model.rxnGeneMat,2); +0449 end +0450 model=removeReactions(model,I,true); +0451 fprintf('COMPLETE\n'); +0452 +0453 %Clean gene names +0454 fprintf('Fixing gene names in the model... '); +0455 %Get rid of the prefix organism id +0456 model.genes=regexprep(model.genes,'^\w+?:',''); +0457 fprintf('COMPLETE\n'); +0458 +0459 %If no FASTA file is supplied, then we are done here +0460 if isempty(fastaFile) +0461 %Create grRules +0462 fprintf('Constructing GPR associations and annotations for the model... '); +0463 model.grRules=cell(numel(model.rxns),1); +0464 model.grRules(:)={''}; +0465 %Add the gene associations as 'or' +0466 for i=1:numel(model.rxns) +0467 %Find the involved genes +0468 I=find(model.rxnGeneMat(i,:)); +0469 if any(I) +0470 model.grRules{i}=['(' model.genes{I(1)}]; +0471 for j=2:numel(I) +0472 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; +0473 end +0474 model.grRules{i}=[model.grRules{i} ')']; +0475 end +0476 end +0477 %Fix grRules and reconstruct rxnGeneMat +0478 [grRules,rxnGeneMat] = standardizeGrRules(model); %Give detailed output +0479 model.grRules = grRules; +0480 model.rxnGeneMat = rxnGeneMat; +0481 %Add geneMiriams, assuming that it follows the syntax +0482 %kegg.genes/organismID:geneName +0483 model.geneMiriams=''; +0484 for i=1:numel(model.genes) +0485 model.geneMiriams{i,1}.name{1,1}='kegg.genes'; +0486 model.geneMiriams{i,1}.value{1,1}=strcat(lower(organismID),model.genes{i,1}); +0487 end +0488 %Add the description to the reactions +0489 for i=1:numel(model.rxns) +0490 if ~isempty(model.rxnNotes{i}) +0491 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (without HMMs).',model.rxnNotes(i)); +0492 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); +0493 else +0494 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (without HMMs)'}; +0495 end +0496 end +0497 fprintf('COMPLETE\n\n'); +0498 fprintf('*** Model reconstruction complete ***\n'); +0499 return; +0500 end +0501 +0502 %Create a phylogenetic distance structure +0503 phylDistStruct=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); +0504 [~, phylDistId]=ismember(model.id,phylDistStruct.ids); +0505 +0506 %Calculate the real maximal distance now. An abitary large number of 1000 +0507 %is used for the "all in kingdom" or "all sequences" options. This is a bit +0508 %inconvenient way to do it, but it is to make it fit with some older code +0509 if isinf(maxPhylDist) || maxPhylDist==-1 +0510 maxPhylDist=1000; +0511 end +0512 +0513 %Get the KO ids for which files have been generated. Maybe not the neatest +0514 %way.. +0515 fastaFiles=listFiles(fullfile(dataDir,'fasta','*.fa')); +0516 alignedFiles=listFiles(fullfile(dataDir,'aligned','*.fa')); +0517 alignedWorking=listFiles(fullfile(dataDir,'aligned','*.faw')); +0518 hmmFiles=listFiles(fullfile(dataDir,'hmms','*.hmm')); +0519 outFiles=listFiles(fullfile(outDir,'*.out')); +0520 +0521 %Check if multi-FASTA files should be generated. This should only be +0522 %performed if there are IDs in the KOModel structure that haven't been +0523 %parsed yet +0524 missingFASTA=setdiff(KOModel.rxns,[fastaFiles;alignedFiles;hmmFiles;outFiles]); +0525 +0526 if ~isempty(missingFASTA) +0527 if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) +0528 EM=['The file ''genes.pep'' cannot be located at ' strrep(dataDir,'\','/') '/ and should be downloaded from the KEGG FTP.\n']; +0529 dispEM(EM); +0530 end +0531 %Only construct models for KOs which don't have files already +0532 fastaModel=removeReactions(KOModel,setdiff(KOModel.rxns,missingFASTA),true,true); +0533 %Permute the order of the KOs in the model so that constructMultiFasta +0534 %can be run on several processors at once +0535 fastaModel=permuteModel(fastaModel,randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(fastaModel.rxns)),'rxns'); +0536 constructMultiFasta(fastaModel,fullfile(dataDir,'keggdb','genes.pep'),fullfile(dataDir,'fasta')); +0537 else +0538 fprintf('Generating the KEGG Orthology specific multi-FASTA files... COMPLETE\n'); +0539 end +0540 +0541 if isunix +0542 if ismac +0543 binEnd='.mac'; +0544 else +0545 binEnd=''; +0546 end +0547 elseif ispc +0548 binEnd=''; +0549 else +0550 EM='Unknown OS, exiting.'; +0551 disp(EM); +0552 return +0553 end +0554 +0555 %Check if alignment of FASTA files should be performed +0556 missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); +0557 if ~isempty(missingAligned) +0558 if seqIdentity==-1 +0559 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); +0560 else +0561 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); +0562 end +0563 missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); +0564 tmpFile=tempname; +0565 %On Windows, paths need to be translated to Unix before parsing it to WSL +0566 if ispc +0567 wslPath.tmpFile=getWSLpath(tmpFile); +0568 %mafft has problems writing to terminal (/dev/stderr) when running +0569 %on WSL via MATLAB, instead write and read progress file +0570 mafftOutput = tempname; +0571 wslPath.mafftOutput=getWSLpath(mafftOutput); +0572 wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); +0573 wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); +0574 end +0575 +0576 for i=1:numel(missingAligned) +0577 %This is checked here because it could be that it is created by a +0578 %parallel process. The faw-files are saved as temporary files to +0579 %kept track of which files are being worked on +0580 if ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])) &&... +0581 ~isfile(fullfile(dataDir,'aligned',[missingAligned{i} '.fa'])) +0582 %Check that the multi-FASTA file exists. It should do so since +0583 %we are saving empty files as well. Print a warning and +0584 %continue if not +0585 if ~isfile(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])) +0586 EM=['WARNING: The multi-FASTA file for ' missingAligned{i} ' does not exist']; +0587 dispEM(EM,false); +0588 continue; +0589 end +0590 +0591 %If the multi-FASTA file is empty then save an empty aligned +0592 %file and continue +0593 s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); +0594 if s.bytes<=0 +0595 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'w'); +0596 fclose(fid); +0597 continue; +0598 end +0599 +0600 %Create an empty file to prevent other threads to start to work +0601 %on the same alignment +0602 fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); +0603 fclose(fid); +0604 +0605 %First load the FASTA file, then select up to nSequences +0606 %sequences of the most closely related species, apply any +0607 %constraints from maxPhylDist, and save it as a temporary file, +0608 %and create the model from that +0609 +0610 fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); +0611 phylDist=inf(numel(fastaStruct),1); +0612 for j=1:numel(fastaStruct) +0613 %Get the organism abbreviation +0614 index=strfind(fastaStruct(j).Header,':'); +0615 if any(index) +0616 abbrev=fastaStruct(j).Header(1:index(1)-1); +0617 [~, index]=ismember(abbrev,phylDistStruct.ids); +0618 if any(index) +0619 phylDist(j)=phylDistStruct.distMat(index(1),phylDistId); +0620 end +0621 end +0622 end +0623 +0624 %Inf means that it should not be included +0625 phylDist(phylDist>maxPhylDist)=[]; 0626 -0627 %Inf means that it should not be included -0628 phylDist(phylDist>maxPhylDist)=[]; +0627 %Sort based on phylDist +0628 [~, order]=sort(phylDist); 0629 -0630 %Sort based on phylDist -0631 [~, order]=sort(phylDist); -0632 -0633 %Save the first nSequences hits to a temporary FASTA file -0634 if nSequences<=numel(fastaStruct) -0635 fastaStruct=fastaStruct(order(1:nSequences)); -0636 else -0637 fastaStruct=fastaStruct(order); -0638 end -0639 -0640 %Do the clustering and alignment if there are more than one -0641 %sequences, otherwise just save the sequence (or an empty file) -0642 if numel(fastaStruct)>1 -0643 if seqIdentity~=-1 -0644 cdhitInpCustom=tempname; -0645 fastawrite(cdhitInpCustom,fastaStruct); -0646 if seqIdentity<=1 && seqIdentity>0.7 -0647 nparam='5'; -0648 elseif seqIdentity>0.6 -0649 nparam='4'; -0650 elseif seqIdentity>0.5 -0651 nparam='3'; -0652 elseif seqIdentity>0.4 -0653 nparam='2'; -0654 else -0655 EM='The provided seqIdentity must be between 0 and 1\n'; -0656 dispEM(EM); -0657 end -0658 if ispc -0659 wslPath.cdhitInpCustom=getWSLpath(cdhitInpCustom); -0660 [status, output]=system(['wsl "' wslPath.cdhit '" -T "' num2str(cores) '" -i "' wslPath.cdhitInpCustom '" -o "' wslPath.tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); -0661 elseif ismac || isunix -0662 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); -0663 end -0664 if status~=0 -0665 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; -0666 dispEM(EM); -0667 end -0668 %Remove the old tempfile -0669 if exist(cdhitInpCustom, 'file') -0670 delete([cdhitInpCustom '*']); -0671 end -0672 else -0673 %This means that CD-HIT should be skipped since -0674 %seqIdentity is equal to -1 -0675 fastawrite(tmpFile,fastaStruct); -0676 end -0677 %Do the alignment for this file -0678 if ismac -0679 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-mac','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); -0680 elseif isunix -0681 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); -0682 elseif ispc -0683 wslPath.fawFile=getWSLpath(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])); -0684 [status, ~]=system(['wsl "' wslPath.mafft '" --auto --anysymbol --progress "' wslPath.mafftOutput '" --thread "' num2str(cores) '" --out "' wslPath.fawFile '" "' wslPath.tmpFile '"']); -0685 output=fileread(mafftOutput); -0686 delete(mafftOutput); -0687 end -0688 if status~=0 -0689 %It could be that alignment failed because only one -0690 %sequence was left after clustering. If that is the -0691 %case, then the clustered file is just copied as 'faw' -0692 %file -0693 if any(regexp(output,'Only 1 sequence found')) -0694 movefile(tmpFile,fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'f'); -0695 else -0696 EM=['Error when performing alignment of ' missingAligned{i} ':\n' output]; -0697 dispEM(EM); -0698 end -0699 end -0700 %Remove the old tempfile -0701 if exist(tmpFile, 'file') -0702 delete([tmpFile '*']); -0703 end -0704 else -0705 %If there is only one sequence then it's not possible to do -0706 %a multiple alignment. Just print the sequence instead. An -0707 %empty file was written previously so that doesn't have to -0708 %be dealt with -0709 if numel(fastaStruct)==1 -0710 warnState = warning; %Save the current warning state -0711 warning('off','Bioinfo:fastawrite:AppendToFile'); -0712 fastawrite(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fastaStruct); -0713 warning(warnState) %Reset warning state to previous settings -0714 end -0715 end -0716 %Move the temporary file to the real one -0717 movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); -0718 -0719 %Print the progress every 25 files -0720 if rem(i-1,25) == 0 -0721 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns))); -0722 progress=pad(progress,3,'left'); -0723 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); -0724 end -0725 end -0726 end -0727 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0728 else -0729 if seqIdentity==-1 -0730 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); -0731 else -0732 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); -0733 end -0734 end -0735 -0736 %Check if training of Hidden Markov models should be performed -0737 missingHMMs=setdiff(KOModel.rxns,[hmmFiles;outFiles]); -0738 if ~isempty(missingHMMs) -0739 fprintf('Training the KEGG Orthology specific HMMs... 0%% complete'); -0740 missingHMMs=missingHMMs(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingHMMs))); -0741 %Train models for all missing KOs -0742 for i=1:numel(missingHMMs) -0743 %This is checked here because it could be that it is created by a -0744 %parallel process -0745 if ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm'])) && ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])) -0746 %Check that the aligned FASTA file exists. It could be that it -0747 %is still being worked on by some other instance of the program -0748 %(the .faw file should then exist). This should not happen on a -0749 %single computer. It doesn't throw an error, because it should -0750 %finalize the ones it can -0751 if ~isfile(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])) -0752 EM=['The aligned FASTA file for ' missingHMMs{i} ' does not exist']; -0753 dispEM(EM,false); -0754 continue; -0755 end -0756 -0757 %If the multi-FASTA file is empty then save an empty aligned -0758 %file and continue -0759 s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); -0760 if s.bytes<=0 -0761 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'w'); -0762 fclose(fid); -0763 continue; -0764 end -0765 %Create a temporary file to indicate that it is working on the -0766 %KO. This is because hmmbuild cannot overwrite existing files -0767 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); -0768 fclose(fid); -0769 -0770 %Create HMM -0771 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); -0772 if status~=0 -0773 EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; -0774 dispEM(EM); -0775 end -0776 -0777 %Delete the temporary file -0778 delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); -0779 -0780 %Print the progress every 25 files -0781 if rem(i-1,25) == 0 -0782 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'hmms','*.hmm')))/numel(KOModel.rxns))); -0783 progress=pad(progress,3,'left'); -0784 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); -0785 end -0786 end -0787 end -0788 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0789 else -0790 fprintf('Training the KEGG Orthology specific HMMs... COMPLETE\n'); -0791 end -0792 -0793 %Check which new .out files that should be generated. Check if training of -0794 %Hidden Markov models should be performed -0795 missingOUT=setdiff(KOModel.rxns,outFiles); -0796 if ~isempty(missingOUT) -0797 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... 0%% complete'); -0798 missingOUT=missingOUT(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingOUT))); -0799 for i=1:numel(missingOUT) -0800 %This is checked here because it could be that it is created by a -0801 %parallel process -0802 if ~isfile(fullfile(outDir,[missingOUT{i} '.out'])) -0803 %Check that the HMM file exists. It should do so since %we are -0804 %saving empty files as well. Print a warning and continue if -0805 %not -0806 if ~isfile(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])) -0807 EM=['The HMM file for ' missingOUT{i} ' does not exist']; -0808 dispEM(EM,false); -0809 continue; -0810 end -0811 -0812 %Save an empty file to prevent several threads working on the -0813 %same file -0814 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); -0815 fclose(fid); -0816 -0817 %If the HMM file is empty then save an out file and continue -0818 s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); -0819 if s.bytes<=0 -0820 continue; -0821 end -0822 -0823 %Check each gene in the input file against this model -0824 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); -0825 if status~=0 -0826 EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; -0827 dispEM(EM); -0828 end -0829 -0830 %Save the output to a file -0831 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); -0832 fwrite(fid,output); -0833 fclose(fid); -0834 -0835 %Print the progress every 25 files -0836 if rem(i-1,25) == 0 -0837 progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns))); -0838 progress=pad(progress,3,'left'); -0839 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); -0840 end -0841 end -0842 end -0843 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); -0844 else -0845 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... COMPLETE\n'); -0846 end +0630 %Save the first nSequences hits to a temporary FASTA file +0631 if nSequences<=numel(fastaStruct) +0632 fastaStruct=fastaStruct(order(1:nSequences)); +0633 else +0634 fastaStruct=fastaStruct(order); +0635 end +0636 +0637 %Do the clustering and alignment if there are more than one +0638 %sequences, otherwise just save the sequence (or an empty file) +0639 if numel(fastaStruct)>1 +0640 if seqIdentity~=-1 +0641 cdhitInpCustom=tempname; +0642 fastawrite(cdhitInpCustom,fastaStruct); +0643 if seqIdentity<=1 && seqIdentity>0.7 +0644 nparam='5'; +0645 elseif seqIdentity>0.6 +0646 nparam='4'; +0647 elseif seqIdentity>0.5 +0648 nparam='3'; +0649 elseif seqIdentity>0.4 +0650 nparam='2'; +0651 else +0652 EM='The provided seqIdentity must be between 0 and 1\n'; +0653 dispEM(EM); +0654 end +0655 if ispc +0656 wslPath.cdhitInpCustom=getWSLpath(cdhitInpCustom); +0657 [status, output]=system(['wsl "' wslPath.cdhit '" -T "' num2str(cores) '" -i "' wslPath.cdhitInpCustom '" -o "' wslPath.tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); +0658 elseif ismac || isunix +0659 [status, output]=system(['"' fullfile(ravenPath,'software','cd-hit',['cd-hit' binEnd]) '" -T "' num2str(cores) '" -i "' cdhitInpCustom '" -o "' tmpFile '" -c "' num2str(seqIdentity) '" -n ' nparam ' -M 2000']); +0660 end +0661 if status~=0 +0662 EM=['Error when performing clustering of ' missingAligned{i} ':\n' output]; +0663 dispEM(EM); +0664 end +0665 %Remove the old tempfile +0666 if exist(cdhitInpCustom, 'file') +0667 delete([cdhitInpCustom '*']); +0668 end +0669 else +0670 %This means that CD-HIT should be skipped since +0671 %seqIdentity is equal to -1 +0672 fastawrite(tmpFile,fastaStruct); +0673 end +0674 %Do the alignment for this file +0675 if ismac +0676 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-mac','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); +0677 elseif isunix +0678 [status, output]=system(['"' fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat') '" --auto --anysymbol --thread "' num2str(cores) '" "' tmpFile '" > "' fullfile(dataDir,'aligned',[missingAligned{i} '.faw']) '"']); +0679 elseif ispc +0680 wslPath.fawFile=getWSLpath(fullfile(dataDir,'aligned',[missingAligned{i} '.faw'])); +0681 [status, ~]=system(['wsl "' wslPath.mafft '" --auto --anysymbol --progress "' wslPath.mafftOutput '" --thread "' num2str(cores) '" --out "' wslPath.fawFile '" "' wslPath.tmpFile '"']); +0682 output=fileread(mafftOutput); +0683 delete(mafftOutput); +0684 end +0685 if status~=0 +0686 %It could be that alignment failed because only one +0687 %sequence was left after clustering. If that is the +0688 %case, then the clustered file is just copied as 'faw' +0689 %file +0690 if any(regexp(output,'Only 1 sequence found')) +0691 movefile(tmpFile,fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'f'); +0692 else +0693 EM=['Error when performing alignment of ' missingAligned{i} ':\n' output]; +0694 dispEM(EM); +0695 end +0696 end +0697 %Remove the old tempfile +0698 if exist(tmpFile, 'file') +0699 delete([tmpFile '*']); +0700 end +0701 else +0702 %If there is only one sequence then it's not possible to do +0703 %a multiple alignment. Just print the sequence instead. An +0704 %empty file was written previously so that doesn't have to +0705 %be dealt with +0706 if numel(fastaStruct)==1 +0707 warnState = warning; %Save the current warning state +0708 warning('off','Bioinfo:fastawrite:AppendToFile'); +0709 fastawrite(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fastaStruct); +0710 warning(warnState) %Reset warning state to previous settings +0711 end +0712 end +0713 %Move the temporary file to the real one +0714 movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); +0715 +0716 %Print the progress every 25 files +0717 if rem(i-1,25) == 0 +0718 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns))); +0719 progress=pad(progress,3,'left'); +0720 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); +0721 end +0722 end +0723 end +0724 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0725 else +0726 if seqIdentity==-1 +0727 fprintf('Performing the multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); +0728 else +0729 fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... COMPLETE\n'); +0730 end +0731 end +0732 +0733 %Check if training of Hidden Markov models should be performed +0734 missingHMMs=setdiff(KOModel.rxns,[hmmFiles;outFiles]); +0735 if ~isempty(missingHMMs) +0736 fprintf('Training the KEGG Orthology specific HMMs... 0%% complete'); +0737 missingHMMs=missingHMMs(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingHMMs))); +0738 %Train models for all missing KOs +0739 for i=1:numel(missingHMMs) +0740 %This is checked here because it could be that it is created by a +0741 %parallel process +0742 if ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm'])) && ~isfile(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])) +0743 %Check that the aligned FASTA file exists. It could be that it +0744 %is still being worked on by some other instance of the program +0745 %(the .faw file should then exist). This should not happen on a +0746 %single computer. It doesn't throw an error, because it should +0747 %finalize the ones it can +0748 if ~isfile(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])) +0749 EM=['The aligned FASTA file for ' missingHMMs{i} ' does not exist']; +0750 dispEM(EM,false); +0751 continue; +0752 end +0753 +0754 %If the multi-FASTA file is empty then save an empty aligned +0755 %file and continue +0756 s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); +0757 if s.bytes<=0 +0758 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']),'w'); +0759 fclose(fid); +0760 continue; +0761 end +0762 %Create a temporary file to indicate that it is working on the +0763 %KO. This is because hmmbuild cannot overwrite existing files +0764 fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); +0765 fclose(fid); +0766 +0767 %Create HMM +0768 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); +0769 if status~=0 +0770 EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; +0771 dispEM(EM); +0772 end +0773 +0774 %Delete the temporary file +0775 delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); +0776 +0777 %Print the progress every 25 files +0778 if rem(i-1,25) == 0 +0779 progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'hmms','*.hmm')))/numel(KOModel.rxns))); +0780 progress=pad(progress,3,'left'); +0781 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); +0782 end +0783 end +0784 end +0785 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0786 else +0787 fprintf('Training the KEGG Orthology specific HMMs... COMPLETE\n'); +0788 end +0789 +0790 %Check which new .out files that should be generated. Check if training of +0791 %Hidden Markov models should be performed +0792 missingOUT=setdiff(KOModel.rxns,outFiles); +0793 if ~isempty(missingOUT) +0794 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... 0%% complete'); +0795 missingOUT=missingOUT(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingOUT))); +0796 for i=1:numel(missingOUT) +0797 %This is checked here because it could be that it is created by a +0798 %parallel process +0799 if ~isfile(fullfile(outDir,[missingOUT{i} '.out'])) +0800 %Check that the HMM file exists. It should do so since %we are +0801 %saving empty files as well. Print a warning and continue if +0802 %not +0803 if ~isfile(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])) +0804 EM=['The HMM file for ' missingOUT{i} ' does not exist']; +0805 dispEM(EM,false); +0806 continue; +0807 end +0808 +0809 %Save an empty file to prevent several threads working on the +0810 %same file +0811 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); +0812 fclose(fid); +0813 +0814 %If the HMM file is empty then save an out file and continue +0815 s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); +0816 if s.bytes<=0 +0817 continue; +0818 end +0819 +0820 %Check each gene in the input file against this model +0821 [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); +0822 if status~=0 +0823 EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; +0824 dispEM(EM); +0825 end +0826 +0827 %Save the output to a file +0828 fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); +0829 fwrite(fid,output); +0830 fclose(fid); +0831 +0832 %Print the progress every 25 files +0833 if rem(i-1,25) == 0 +0834 progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns))); +0835 progress=pad(progress,3,'left'); +0836 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\b%s%% complete',progress); +0837 end +0838 end +0839 end +0840 fprintf('\b\b\b\b\b\b\b\b\b\b\b\b\bCOMPLETE\n'); +0841 else +0842 fprintf('Querying the user-specified FASTA file against the KEGG Orthology specific HMMs... COMPLETE\n'); +0843 end +0844 +0845 +0846 %***Begin retrieving the output and putting together the resulting model 0847 -0848 -0849 %***Begin retrieving the output and putting together the resulting model -0850 -0851 fprintf('Parsing the HMM search results... '); -0852 %Retrieve matched genes from the HMMs -0853 koGeneMat=zeros(numel(KOModel.rxns),3000); %Make room for 3000 genes -0854 genes=cell(3000,1); -0855 %Store the best score for a gene in a hash list (since it will be searching -0856 %many times) -0857 hTable = java.util.Hashtable; -0858 -0859 geneCounter=0; -0860 for i=1:numel(KOModel.rxns) -0861 if exist(fullfile(outDir,[KOModel.rxns{i} '.out']), 'file') -0862 fid=fopen(fullfile(outDir,[KOModel.rxns{i} '.out']),'r'); -0863 beginMatches=false; -0864 while 1 -0865 %Get the next line -0866 tline = fgetl(fid); -0867 -0868 %Abort at end of file -0869 if ~ischar(tline) -0870 break; -0871 end -0872 -0873 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) -0874 break; -0875 end -0876 -0877 if beginMatches==false -0878 %This is how the listing of matches begins -0879 if any(strfind(tline,'E-value ')) -0880 %Read one more line that is only padding -0881 tline = fgetl(fid); -0882 beginMatches=true; -0883 end -0884 else -0885 %If matches should be read -0886 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) -0887 elements=regexp(tline,' ','split'); -0888 elements=elements(cellfun(@any,elements)); -0889 -0890 %Check if the match is below the treshhold -0891 score=str2double(elements{1}); -0892 gene=elements{9}; -0893 if score<=cutOff -0894 %If the score is exactly 0, change it to a very -0895 %small value to avoid NaN -0896 if score==0 -0897 score=10^-250; -0898 end -0899 %Check if the gene is added already and, is so, get -0900 %the best score for it -0901 I=hTable.get(gene); -0902 if any(I) -0903 koGeneMat(i,I)=score; -0904 else -0905 geneCounter=geneCounter+1; -0906 %The gene was not present yet so add it -0907 hTable.put(gene,geneCounter); -0908 genes{geneCounter}=gene; -0909 koGeneMat(i,geneCounter)=score; -0910 end -0911 end -0912 else -0913 break; -0914 end -0915 end -0916 end -0917 fclose(fid); -0918 end -0919 end -0920 fprintf('COMPLETE\n'); +0848 fprintf('Parsing the HMM search results... '); +0849 %Retrieve matched genes from the HMMs +0850 koGeneMat=zeros(numel(KOModel.rxns),3000); %Make room for 3000 genes +0851 genes=cell(3000,1); +0852 %Store the best score for a gene in a hash list (since it will be searching +0853 %many times) +0854 hTable = java.util.Hashtable; +0855 +0856 geneCounter=0; +0857 for i=1:numel(KOModel.rxns) +0858 if exist(fullfile(outDir,[KOModel.rxns{i} '.out']), 'file') +0859 fid=fopen(fullfile(outDir,[KOModel.rxns{i} '.out']),'r'); +0860 beginMatches=false; +0861 while 1 +0862 %Get the next line +0863 tline = fgetl(fid); +0864 +0865 %Abort at end of file +0866 if ~ischar(tline) +0867 break; +0868 end +0869 +0870 if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) +0871 break; +0872 end +0873 +0874 if beginMatches==false +0875 %This is how the listing of matches begins +0876 if any(strfind(tline,'E-value ')) +0877 %Read one more line that is only padding +0878 tline = fgetl(fid); +0879 beginMatches=true; +0880 end +0881 else +0882 %If matches should be read +0883 if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) +0884 elements=regexp(tline,' ','split'); +0885 elements=elements(cellfun(@any,elements)); +0886 +0887 %Check if the match is below the treshhold +0888 score=str2double(elements{1}); +0889 gene=elements{9}; +0890 if score<=cutOff +0891 %If the score is exactly 0, change it to a very +0892 %small value to avoid NaN +0893 if score==0 +0894 score=10^-250; +0895 end +0896 %Check if the gene is added already and, is so, get +0897 %the best score for it +0898 I=hTable.get(gene); +0899 if any(I) +0900 koGeneMat(i,I)=score; +0901 else +0902 geneCounter=geneCounter+1; +0903 %The gene was not present yet so add it +0904 hTable.put(gene,geneCounter); +0905 genes{geneCounter}=gene; +0906 koGeneMat(i,geneCounter)=score; +0907 end +0908 end +0909 else +0910 break; +0911 end +0912 end +0913 end +0914 fclose(fid); +0915 end +0916 end +0917 fprintf('COMPLETE\n'); +0918 +0919 fprintf('Removing gene, KEGG Orthology associations below minScoreRatioKO, minScoreRatioG... '); +0920 koGeneMat=koGeneMat(:,1:geneCounter); 0921 -0922 fprintf('Removing gene, KEGG Orthology associations below minScoreRatioKO, minScoreRatioG... '); -0923 koGeneMat=koGeneMat(:,1:geneCounter); -0924 -0925 %Remove the genes for each KO that are below minScoreRatioKO. -0926 for i=1:size(koGeneMat,1) -0927 J=find(koGeneMat(i,:)); -0928 if any(J) -0929 koGeneMat(i,J(log(koGeneMat(i,J))/log(min(koGeneMat(i,J)))<minScoreRatioKO))=0; -0930 end -0931 end -0932 -0933 %Remove the KOs for each gene that are below minScoreRatioG -0934 for i=1:size(koGeneMat,2) -0935 J=find(koGeneMat(:,i)); -0936 if any(J) -0937 koGeneMat(J(log(koGeneMat(J,i))/log(min(koGeneMat(J,i)))<minScoreRatioG),i)=0; -0938 end -0939 end -0940 fprintf('COMPLETE\n'); -0941 -0942 fprintf('Adding gene annotations to the model... '); -0943 %Create the new model -0944 model.genes=genes(1:geneCounter); -0945 model.grRules=cell(numel(model.rxns),1); -0946 model.grRules(:)={''}; -0947 model.rxnGeneMat=sparse(numel(model.rxns),numel(model.genes)); -0948 -0949 %Loop through the reactions and add the corresponding genes -0950 for i=1:numel(model.rxns) -0951 if isstruct(model.rxnMiriams{i}) -0952 %Get all KOs -0953 I=find(strcmpi(model.rxnMiriams{i}.name,'kegg.orthology')); -0954 KOs=model.rxnMiriams{i}.value(I); -0955 %Find the KOs and the corresponding genes -0956 J=ismember(KOModel.rxns,KOs); -0957 [~, K]=find(koGeneMat(J,:)); -0958 -0959 if any(K) -0960 model.rxnGeneMat(i,K)=1; -0961 %Also delete KOs for which no genes were found. If no genes at -0962 %all were matched to the reaction it will be deleted later -0963 L=sum(koGeneMat(J,:),2)==0; -0964 model.rxnMiriams{i}.value(I(L))=[]; -0965 model.rxnMiriams{i}.name(I(L))=[]; -0966 end -0967 end -0968 end -0969 fprintf('COMPLETE\n'); -0970 -0971 %Find and delete all reactions without genes. This also removes genes that -0972 %are not used (which could happen because minScoreRatioG and -0973 %minScoreRatioKO). If keepSpontaneous==true, the spontaneous reactions -0974 %without genes are kept in the model. Spontaneous reactions with original -0975 %gene associations are treated in the same way, like the rest of the -0976 %reactions - if gene associations were removed during HMM search, such -0977 %reactions are deleted from the model -0978 if keepSpontaneous==true -0979 %Not the most comprise way to delete reactions without genes, but this -0980 %makes the code easier to understand. Firstly the non-spontaneous -0981 %reactions without genes are removed. After that, the second deletion -0982 %step removes spontaneous reactions, which had gene associations before -0983 %HMM search, but no longer have after it -0984 fprintf('Removing non-spontaneous reactions which after HMM search no longer have GPR rules... '); -0985 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); -0986 model=removeReactions(model,I,true,true); -0987 I=~any(model.rxnGeneMat,2)&ismember(model.rxns,spontRxnsWithGenes); -0988 model=removeReactions(model,I,true,true); -0989 else -0990 %Just simply check for any new reactions without genes and remove -0991 %it -0992 fprintf('Removing reactions which after HMM search no longer have GPR rules... '); -0993 I=~any(model.rxnGeneMat,2); -0994 model=removeReactions(model,I,true,true); -0995 end -0996 fprintf('COMPLETE\n'); -0997 -0998 fprintf('Constructing GPR rules and finalizing the model... '); -0999 %Add the gene associations as 'or' -1000 for i=1:numel(model.rxns) -1001 %Find the involved genes -1002 I=find(model.rxnGeneMat(i,:)); -1003 if any(I) -1004 model.grRules{i}=['(' model.genes{I(1)}]; -1005 for j=2:numel(I) -1006 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; -1007 end -1008 model.grRules{i}=[model.grRules{i} ')']; -1009 end -1010 end -1011 -1012 %Fix grRules and reconstruct rxnGeneMat -1013 [grRules,rxnGeneMat] = standardizeGrRules(model,false); %Give detailed output -1014 model.grRules = grRules; -1015 model.rxnGeneMat = rxnGeneMat; -1016 -1017 %Fix subsystems -1018 emptySubSystems=cellfun(@isempty, model.subSystems); -1019 model.subSystems(emptySubSystems)={{''}}; -1020 -1021 %Add the description to the reactions -1022 for i=1:numel(model.rxns) -1023 if ~isempty(model.rxnNotes{i}) -1024 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (using HMMs).',model.rxnNotes(i)); -1025 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); -1026 else -1027 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (using HMMs)'}; -1028 end -1029 end -1030 %Remove the temp fasta file -1031 delete(fastaFile) -1032 fprintf('COMPLETE\n\n*** Model reconstruction complete ***\n'); -1033 end -1034 -1035 function files=listFiles(directory) -1036 %Supporter function to list the files in a directory and return them as a -1037 %cell array -1038 temp=dir(directory); -1039 files=cell(numel(temp),1); -1040 for i=1:numel(temp) -1041 files{i}=temp(i,1).name; -1042 end -1043 files=strrep(files,'.fa',''); -1044 files=strrep(files,'.hmm',''); -1045 files=strrep(files,'.out',''); -1046 files=strrep(files,'.faw',''); -1047 end +0922 %Remove the genes for each KO that are below minScoreRatioKO. +0923 for i=1:size(koGeneMat,1) +0924 J=find(koGeneMat(i,:)); +0925 if any(J) +0926 koGeneMat(i,J(log(koGeneMat(i,J))/log(min(koGeneMat(i,J)))<minScoreRatioKO))=0; +0927 end +0928 end +0929 +0930 %Remove the KOs for each gene that are below minScoreRatioG +0931 for i=1:size(koGeneMat,2) +0932 J=find(koGeneMat(:,i)); +0933 if any(J) +0934 koGeneMat(J(log(koGeneMat(J,i))/log(min(koGeneMat(J,i)))<minScoreRatioG),i)=0; +0935 end +0936 end +0937 fprintf('COMPLETE\n'); +0938 +0939 fprintf('Adding gene annotations to the model... '); +0940 %Create the new model +0941 model.genes=genes(1:geneCounter); +0942 model.grRules=cell(numel(model.rxns),1); +0943 model.grRules(:)={''}; +0944 model.rxnGeneMat=sparse(numel(model.rxns),numel(model.genes)); +0945 +0946 %Loop through the reactions and add the corresponding genes +0947 for i=1:numel(model.rxns) +0948 if isstruct(model.rxnMiriams{i}) +0949 %Get all KOs +0950 I=find(strcmpi(model.rxnMiriams{i}.name,'kegg.orthology')); +0951 KOs=model.rxnMiriams{i}.value(I); +0952 %Find the KOs and the corresponding genes +0953 J=ismember(KOModel.rxns,KOs); +0954 [~, K]=find(koGeneMat(J,:)); +0955 +0956 if any(K) +0957 model.rxnGeneMat(i,K)=1; +0958 %Also delete KOs for which no genes were found. If no genes at +0959 %all were matched to the reaction it will be deleted later +0960 L=sum(koGeneMat(J,:),2)==0; +0961 model.rxnMiriams{i}.value(I(L))=[]; +0962 model.rxnMiriams{i}.name(I(L))=[]; +0963 end +0964 end +0965 end +0966 fprintf('COMPLETE\n'); +0967 +0968 %Find and delete all reactions without genes. This also removes genes that +0969 %are not used (which could happen because minScoreRatioG and +0970 %minScoreRatioKO). If keepSpontaneous==true, the spontaneous reactions +0971 %without genes are kept in the model. Spontaneous reactions with original +0972 %gene associations are treated in the same way, like the rest of the +0973 %reactions - if gene associations were removed during HMM search, such +0974 %reactions are deleted from the model +0975 if keepSpontaneous==true +0976 %Not the most comprise way to delete reactions without genes, but this +0977 %makes the code easier to understand. Firstly the non-spontaneous +0978 %reactions without genes are removed. After that, the second deletion +0979 %step removes spontaneous reactions, which had gene associations before +0980 %HMM search, but no longer have after it +0981 fprintf('Removing non-spontaneous reactions which after HMM search no longer have GPR rules... '); +0982 I=~any(model.rxnGeneMat,2)&~ismember(model.rxns,isSpontaneous); +0983 model=removeReactions(model,I,true,true); +0984 I=~any(model.rxnGeneMat,2)&ismember(model.rxns,spontRxnsWithGenes); +0985 model=removeReactions(model,I,true,true); +0986 else +0987 %Just simply check for any new reactions without genes and remove +0988 %it +0989 fprintf('Removing reactions which after HMM search no longer have GPR rules... '); +0990 I=~any(model.rxnGeneMat,2); +0991 model=removeReactions(model,I,true,true); +0992 end +0993 fprintf('COMPLETE\n'); +0994 +0995 fprintf('Constructing GPR rules and finalizing the model... '); +0996 %Add the gene associations as 'or' +0997 for i=1:numel(model.rxns) +0998 %Find the involved genes +0999 I=find(model.rxnGeneMat(i,:)); +1000 if any(I) +1001 model.grRules{i}=['(' model.genes{I(1)}]; +1002 for j=2:numel(I) +1003 model.grRules{i}=[model.grRules{i} ' or ' model.genes{I(j)}]; +1004 end +1005 model.grRules{i}=[model.grRules{i} ')']; +1006 end +1007 end +1008 +1009 %Fix grRules and reconstruct rxnGeneMat +1010 [grRules,rxnGeneMat] = standardizeGrRules(model,false); %Give detailed output +1011 model.grRules = grRules; +1012 model.rxnGeneMat = rxnGeneMat; +1013 +1014 %Fix subsystems +1015 emptySubSystems=cellfun(@isempty, model.subSystems); +1016 model.subSystems(emptySubSystems)={{''}}; +1017 +1018 %Add the description to the reactions +1019 for i=1:numel(model.rxns) +1020 if ~isempty(model.rxnNotes{i}) +1021 model.rxnNotes(i)=strcat('Included by getKEGGModelForOrganism (using HMMs).',model.rxnNotes(i)); +1022 model.rxnNotes(i)=strrep(model.rxnNotes(i),'.','. '); +1023 else +1024 model.rxnNotes(i)={'Included by getKEGGModelForOrganism (using HMMs)'}; +1025 end +1026 end +1027 %Remove the temp fasta file +1028 delete(fastaFile) +1029 fprintf('COMPLETE\n\n*** Model reconstruction complete ***\n'); +1030 end +1031 +1032 function files=listFiles(directory) +1033 %Supporter function to list the files in a directory and return them as a +1034 %cell array +1035 temp=dir(directory); +1036 files=cell(numel(temp),1); +1037 for i=1:numel(temp) +1038 files{i}=temp(i,1).name; +1039 end +1040 files=strrep(files,'.fa',''); +1041 files=strrep(files,'.hmm',''); +1042 files=strrep(files,'.out',''); +1043 files=strrep(files,'.faw',''); +1044 end
Generated by m2html © 2005
\ No newline at end of file diff --git a/doc/io/importExcelModel.html b/doc/io/importExcelModel.html index 192c63cc..0366309c 100644 --- a/doc/io/importExcelModel.html +++ b/doc/io/importExcelModel.html @@ -967,7 +967,7 @@

SOURCE CODE ^elseif any(strfind(I{j},':')) 0848 index=max(strfind(I{j},':')); 0849 end -0850 if any(index) +0850 if exist('index','var') & any(index) 0851 miriamStruct{i}.name{startIndex+j}=I{j}(1:index-1); 0852 miriamStruct{i}.value{startIndex+j}=I{j}(index+1:end); 0853 else diff --git a/doc/io/importModel.html b/doc/io/importModel.html index 9048c59e..7773dd1a 100644 --- a/doc/io/importModel.html +++ b/doc/io/importModel.html @@ -176,11 +176,11 @@

SOURCE CODE ^end 0071 end 0072 fileName=char(fileName); -0073 if nargin<2 +0073 if nargin<2 || isempty(removeExcMets) 0074 removeExcMets=true; 0075 end 0076 -0077 if nargin<3 +0077 if nargin<3 || isempty(isSBML2COBRA) 0078 isSBML2COBRA=false; 0079 end 0080 diff --git a/external/kegg/getKEGGModelForOrganism.m b/external/kegg/getKEGGModelForOrganism.m index e3abc2d2..24355620 100755 --- a/external/kegg/getKEGGModelForOrganism.m +++ b/external/kegg/getKEGGModelForOrganism.m @@ -38,15 +38,12 @@ % The hidden Markov models as generated in 2b or % downloaded from BioMet Toolbox (see below) % The final directory in dataDir should be styled as -% proXXX_keggYY or eukXXX_keggYY, indicating whether +% prok90_kegg105 or euk90_kegg105, indicating whether % the HMMs were trained on pro- or eukaryotic -% sequences, using a sequence similarity threshold of -% XXX %, fitting the KEGG version YY. E.g. -% euk90_kegg105. (optional, see note about fastaFile. Note -% that in order to rebuild the KEGG model from a -% database dump, as opposed to using the version -% supplied with RAVEN, you would still need to supply -% this) +% sequences; using which sequence similarity treshold +% (first set of digits); using which KEGG version +% (second set of digits). (this parameter should +% ALWAYS be provided) % outDir directory to save the results from the quering of % the hidden Markov models. The output is specific % for the input sequences and the settings used. It diff --git a/io/importExcelModel.m b/io/importExcelModel.m index a703b5bb..0c4c0784 100755 --- a/io/importExcelModel.m +++ b/io/importExcelModel.m @@ -847,7 +847,7 @@ elseif any(strfind(I{j},':')) index=max(strfind(I{j},':')); end - if any(index) + if exist('index','var') & any(index) miriamStruct{i}.name{startIndex+j}=I{j}(1:index-1); miriamStruct{i}.value{startIndex+j}=I{j}(index+1:end); else diff --git a/io/importModel.m b/io/importModel.m index df9b10f2..5b422873 100755 --- a/io/importModel.m +++ b/io/importModel.m @@ -70,11 +70,11 @@ end end fileName=char(fileName); -if nargin<2 +if nargin<2 || isempty(removeExcMets) removeExcMets=true; end -if nargin<3 +if nargin<3 || isempty(isSBML2COBRA) isSBML2COBRA=false; end