-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathGetFilteredGOData.m
95 lines (85 loc) · 3.92 KB
/
GetFilteredGOData.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
function GOTable = GetFilteredGOData(whatSource,whatFilter,sizeFilter,ourEntrez)
% GetFilteredGOData Returns a GO table of gene annotations
%-------------------------------------------------------------------------------
%-------------------------------------------------------------------------------
% Check Inputs:
if nargin < 1
whatSource = 'mouse-direct'; % Direct annotations from GO
end
if nargin < 2
whatFilter = 'biological_process';
end
if nargin < 3
sizeFilter = [5,200];
end
if nargin < 4
ourEntrez = [];
end
%-------------------------------------------------------------------------------
% Load processed GO annotation data (i.e., direct annotations propagated up the hierarchy):
% cf. propagateHierarchy to map files generated from ReadDirectAnnotationFile (or ReadGEMMAAnnotationFile)
switch whatSource
case 'mouse-direct'
fileNameLoad = sprintf('GOAnnotationDirect-mouse-%s-Prop.mat',whatFilter);
case 'human-direct'
fileNameLoad = sprintf('GOAnnotationDirect-human-%s-Prop.mat',whatFilter);
case 'mouse-GEMMA'
fileNameLoad = 'GOAnnotationGEMMAProp.mat';
otherwise
error('Unknown annotation source: ''%s''',whatSource);
end
load(fileNameLoad,'GOTerms');
fprintf(1,'Loaded annotated GO Terms from %s\n',fileNameLoad);
%-------------------------------------------------------------------------------
% Get full GO ontology details:
GOTermsFile = 'GOTerms_BP.mat';
if strcmp(whatFilter,'biological_process') && exist(GOTermsFile,'file')
load(GOTermsFile,'GOTable');
fprintf(1,'Loaded biological process GO terms from file: %s\n',GOTermsFile);
else
% Retrieve from mySQL database:
GOTable = GetGOTerms(whatFilter);
fprintf(1,'Loaded biological process GO terms from SQL database\n');
end
%-------------------------------------------------------------------------------
% Filter categories
%-------------------------------------------------------------------------------
% Filter by ontology details:
[~,ia,ib] = intersect(GOTable.GOID,GOTerms.GOID);
fprintf(1,'Filtering to %u annotated GO categories related to %s\n',length(ia),whatFilter);
GOTable = GOTable(ia,:);
GOTerms = GOTerms(ib,:);
% Filter by category size:
numGOCategories = height(GOTerms);
if isempty(ourEntrez)
fprintf(1,'Filtering GO categories on their number of gene annotations\n');
sizeGOCategories = GOTerms.size;
fprintf(1,'%u GO categories have no annotations :-/\n',...
sum(sizeGOCategories==0));
else
fprintf(1,'Filtering on size of GO category after matching to our %u genes\n',length(ourEntrez));
% Make sure all annotations match those in our set of entrez_ids
GOTerms.annotations = cellfun(@(x)x(ismember(x,ourEntrez)),GOTerms.annotations,...
'UniformOutput',false);
sizeGOCategories = cellfun(@length,GOTerms.annotations);
GOTerms.size = sizeGOCategories;
fprintf(1,'%u GO categories have no annotations matching our %u genes\n',...
sum(sizeGOCategories==0),length(ourEntrez));
end
GOTable.size = GOTerms.size;
GOTable.annotations = GOTerms.annotations;
fprintf(1,'GO categories have between %u and %u annotations\n',...
min(sizeGOCategories),max(sizeGOCategories));
isGoodSize = (sizeGOCategories >= sizeFilter(1)) & (sizeGOCategories <= sizeFilter(2));
GOTable = GOTable(isGoodSize,:);
fprintf(1,'Filtered to %u GO categories with between %u and %u annotations\n',...
height(GOTable),sizeFilter(1),sizeFilter(2));
%-------------------------------------------------------------------------------
% Check that all annotations in GOTable are column vectors:
isRowVector = find(cellfun(@(x)size(x,2),GOTable.annotations)~=1);
numRowVectorAnnotation = length(isRowVector);
for i = 1:numRowVectorAnnotation
GOTable.annotations{isRowVector(i)} = GOTable.annotations{isRowVector(i)}';
end
fprintf(1,'Transposed %u category annotation vectors from row-vector -> column vector\n',numRowVectorAnnotation);
end