Home > RAVEN > getModelFromKEGG.m

getModelFromKEGG

PURPOSE ^

getModelFromKEGG

SYNOPSIS ^

function [model KOModel]=getModelFromKEGG(keggPath,keepUndefinedStoich,keepIncomplete,keepGeneral)

DESCRIPTION ^

 getModelFromKEGG
   Retrieves information stored in KEGG database and generates a model

   keggPath            this function reads data from a local FTP dump of 
                       the KEGG database. keggPath is the pathway to the 
                       root of the database
   keepUndefinedStoich include reactions in the form n A <=> n+1 A. These
                       will be dealt with as two separate metabolites 
                       (opt, default true)
   keepIncomplete      include reactions which have been labelled as
                       "incomplete", "erroneous" or "unclear" (opt,
                       default true)
   keepGeneral         include reactions which have been labelled as
                       "general reaction". These are reactions on the form
                       "an aldehyde <=> an alcohol", and are therefore
                       unsuited for modelling purposes. Note that not all
                       reactions have this type of annotation, and the
                       script will therefore not be able to remove all
                       such reactions (opt, default false)

   model               a model structure generated from the database. 
                       All reactions and the metabolites used in them 
                       will be added
   KOModel             a model structure representing the KEGG Orthology 
                       ids and their associated genes. The KO ids are 
                       saved as reactions
               
   Usage: getModelFromKEGG(keggPath,keepUndefinedStoich,keepIncomplete,keepGeneral)

   Rasmus Agren, 2013-02-06

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function [model KOModel]=getModelFromKEGG(keggPath,keepUndefinedStoich,keepIncomplete,keepGeneral)
0002 % getModelFromKEGG
0003 %   Retrieves information stored in KEGG database and generates a model
0004 %
0005 %   keggPath            this function reads data from a local FTP dump of
0006 %                       the KEGG database. keggPath is the pathway to the
0007 %                       root of the database
0008 %   keepUndefinedStoich include reactions in the form n A <=> n+1 A. These
0009 %                       will be dealt with as two separate metabolites
0010 %                       (opt, default true)
0011 %   keepIncomplete      include reactions which have been labelled as
0012 %                       "incomplete", "erroneous" or "unclear" (opt,
0013 %                       default true)
0014 %   keepGeneral         include reactions which have been labelled as
0015 %                       "general reaction". These are reactions on the form
0016 %                       "an aldehyde <=> an alcohol", and are therefore
0017 %                       unsuited for modelling purposes. Note that not all
0018 %                       reactions have this type of annotation, and the
0019 %                       script will therefore not be able to remove all
0020 %                       such reactions (opt, default false)
0021 %
0022 %   model               a model structure generated from the database.
0023 %                       All reactions and the metabolites used in them
0024 %                       will be added
0025 %   KOModel             a model structure representing the KEGG Orthology
0026 %                       ids and their associated genes. The KO ids are
0027 %                       saved as reactions
0028 %
0029 %   Usage: getModelFromKEGG(keggPath,keepUndefinedStoich,keepIncomplete,keepGeneral)
0030 %
0031 %   Rasmus Agren, 2013-02-06
0032 %
0033 
0034 if nargin<2
0035     keepUndefinedStoich=true;
0036 end
0037 if nargin<3
0038     keepIncomplete=true;
0039 end
0040 if nargin<4
0041     keepGeneral=true;
0042 end
0043 
0044 %First get all reactions
0045 model=getRxnsFromKEGG(keggPath,keepUndefinedStoich,keepIncomplete,keepGeneral);
0046 fprintf('KEGG reactions loaded\n');
0047 
0048 %Get the KO ids that are associated with any of the reactions. They will be
0049 %used later on to create a rxn-gene matrix
0050 KOs=cell(numel(model.rxns)*2,1); %Make room for two KO ids per reaction
0051 
0052 addToIndex=1;
0053 %Add all KO ids that are used
0054 for i=1:numel(model.rxns)
0055    if isstruct(model.rxnMiriams{i})
0056       for j=1:numel(model.rxnMiriams{i}.name)
0057          if strcmpi('urn:miriam:kegg.ko',model.rxnMiriams{i}.name{j});
0058             %Add the KO id
0059             KOs(addToIndex)=model.rxnMiriams{i}.value(j);
0060             addToIndex=addToIndex+1;
0061          end
0062       end
0063    end
0064 end
0065 
0066 KOs=KOs(1:addToIndex-1);
0067 KOs=unique(KOs);
0068 
0069 %Get all genes from any organism in KEGG that is associated with any of
0070 %the KOs
0071 KOModel=getGenesFromKEGG(keggPath,KOs);
0072 fprintf('KEGG genes loaded\n');
0073 
0074 model.genes=KOModel.genes;
0075 
0076 %It can be that there are KOs from the reactions that have no database
0077 %entry. These are (as far as I've seen) lumped versions of other KOs and
0078 %should be removed
0079 KOsToRemove=setdiff(KOs, KOModel.rxns);
0080 
0081 %Loop through all reactions and delete the KOs that weren't found
0082 for i=1:numel(model.rxns)
0083     if isstruct(model.rxnMiriams{i})
0084         for j=1:numel(model.rxnMiriams{i}.name)
0085             toDel=[];
0086             if strcmp(model.rxnMiriams{i}.name{j},'urn:miriam:kegg.ko')
0087                 if ismember(model.rxnMiriams{i}.value{j},KOsToRemove)
0088                    toDel=[toDel;j];
0089                 end
0090             end
0091         end
0092         %Delete the KOs
0093         if any(toDel)
0094            %If all posts are deleted, then delete the whole structure
0095            if numel(toDel)==j
0096                 model.rxnMiriams{i}=[];
0097            else
0098                 model.rxnMiriams{i}.name(toDel)=[];
0099                 model.rxnMiriams{i}.value(toDel)=[];
0100            end
0101         end
0102     end
0103 end
0104 
0105 %Create the rxnGeneMat for the reactions. This is simply done by merging
0106 %the gene associations for all the involved KOs
0107 r=zeros(10000000,1); %Store the positions since it's slow to write to a sparse array in a loop
0108 c=zeros(10000000,1);
0109 counter=1;
0110 for i=1:numel(model.rxns)
0111     if isstruct(model.rxnMiriams{i})
0112         I=strncmp('urn:miriam:kegg.ko',model.rxnMiriams{i}.name,18);
0113         [J K]=ismember(model.rxnMiriams{i}.value(I),KOModel.rxns);
0114         %Find all gene indexes that correspond to any of these KOs
0115         [crap L]=find(KOModel.rxnGeneMat(K(J),:));
0116         if any(L)
0117             %Allocate room for more elements if needed
0118             if counter+numel(L)-1>=numel(r)
0119                 r=[r;zeros(numel(r),1)];
0120                 c=[c;zeros(numel(c),1)];
0121             end
0122             r(counter:counter+numel(L)-1)=ones(numel(L),1)*i;
0123             c(counter:counter+numel(L)-1)=L(:);
0124             counter=counter+numel(L);
0125         end
0126     end
0127 end
0128 
0129 model.rxnGeneMat=sparse(r(1:counter-1),c(1:counter-1),ones(counter-1,1));
0130 if size(model.rxnGeneMat,1)~=numel(model.rxns) || size(model.rxnGeneMat,2)~=numel(KOModel.genes)
0131     model.rxnGeneMat(numel(model.rxns),numel(KOModel.genes))=0;
0132 end
0133 
0134 %Then get all metabolites
0135 metModel=getMetsFromKEGG(keggPath);
0136 fprintf('KEGG metabolites loaded\n');
0137 
0138 %Add information about all metabolites to the model
0139 [a b]=ismember(model.mets,metModel.mets);
0140 a=find(a);
0141 b=b(a);
0142 
0143 if ~isfield(model,'metNames')
0144    model.metNames=cell(numel(model.mets),1);
0145    model.metNames(:)={''};
0146 end
0147 model.metNames(a)=metModel.metNames(b);
0148 
0149 if ~isfield(model,'metFormulas')
0150    model.metFormulas=cell(numel(model.mets),1);
0151    model.metFormulas(:)={''};
0152 end
0153 model.metFormulas(a)=metModel.metFormulas(b);
0154 
0155 if ~isfield(model,'inchis')
0156    model.inchis=cell(numel(model.mets),1);
0157    model.inchis(:)={''};
0158 end
0159 model.inchis(a)=metModel.inchis(b);
0160 
0161 if ~isfield(model,'metMiriams')
0162    model.metMiriams=cell(numel(model.mets),1);
0163 end
0164 model.metMiriams(a)=metModel.metMiriams(b);
0165 
0166 %Put all metabolites in one compartment called 's' (for system). This is
0167 %done just to be more compatible with the rest of the code
0168 model.comps={'s'};
0169 model.compNames={'System'};
0170 model.compOutside={''};
0171 model.metComps=ones(numel(model.mets),1);
0172 
0173 %If reactions with undefined stoichiometry are kept, then the corresponding
0174 %metabolites will have ids such as "(n+1) C000001" and their names will be
0175 %empty. These ids are not valid SBML identifiers and are therefore replaced
0176 %with "undefined1, undefined2...". The former ids are stored as the new
0177 %names
0178 I=find(cellfun(@any,strfind(model.mets,'n')) | cellfun(@any,strfind(model.mets,'m')));
0179 model.metNames(I)=model.mets(I);
0180 repNums=1:numel(I);
0181 repIDs=strcat('undefined_',cellfun(@num2str,num2cell(repNums(:)),'UniformOutput',false));
0182 model.mets(I)=repIDs;
0183 
0184 %It could also be that the metabolite names are empty for some other
0185 %reason. In that case, use the ID instead
0186 I=cellfun(@isempty,model.metNames);
0187 model.metNames(I)=model.mets(I);
0188 end

Generated on Mon 06-Jan-2014 14:58:12 by m2html © 2005