-
Notifications
You must be signed in to change notification settings - Fork 0
/
statsHomoSapiens.R
52 lines (48 loc) · 1.38 KB
/
statsHomoSapiens.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
source("./utilities.R")
source("./getGenomeVersion.R")
source("./isSingleCell.R")
organism <- "Homo sapiens"
results <- getHTSeqResultsByOrganism(organism = organism)
count <- 0
batch <- 150
n <- ceiling(results$count / batch)
outfile <- paste0("output/", tolower(gsub(" ", "_", organism)), ".out")
for(i in 1:n){
start <- (i - 1) * batch + 1
end <- min(i * batch, results$count)
print(sprintf("start: %d, end: %d", start, end))
esummaries <- entrez_summary(db="gds", id=results$ids[start:end])
for(esum in esummaries){
# Ignore GSE containing multipl organisms
if(length(strsplit(esum$taxon, ";")[[1]]) > 1){
next
}
gse <- paste0("GSE",esum$gse)
eList <- tryCatch({
getGEO(gse, getGPL = FALSE, destdir="./tmpData")
}, error = function(e) {
print(e)
NULL
})
if(is.null(eList)){
next
}
pd <- pData(eList[[1]])
if(organism == "Zea mays"){
version <- getGenomeVersion(pd)
if(is.null(version)) {
next
}
}
if(isSingleCell(pd)){
next
}
# has valid data files?
files <- getValidFiles(gse = gse, organism = organism)
if(!is.null(files)){
count <- count + 1
write(gse, file = outfile, append = TRUE)
}
}
}
print(sprintf("Total: %d, Valid: %d", results$count, count))