-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_mageck_inputs.R
142 lines (108 loc) · 5.69 KB
/
make_mageck_inputs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
##--------------------------------------------------
# generates files of counts for each sample group
# files to be used as inputs for MAGeCK
# --------------------------------------------------
library(phylotools)
library(readr)
# function for reading in tables of counts and storing as list of dataframes
read_into_df_list <- function(filename){
df <- read.table(filename, sep = '\t')
names(df) <- c('Seq', 'Count')
return(df)
}
# read in count tables specified in snakemake input
count_tables <- lapply(snakemake@input, read_into_df_list)
names(count_tables) <- snakemake@input
# store other snakemake parameters
sample_sheet_f <- snakemake@params[["sample_sheet"]]
outdir <- paste0(dirname(snakemake@output[[1]]), "/")
groups <- snakemake@params[["groups"]]
vs_plasmid_lib <- snakemake@params[["vs_plasmid_lib"]]
# read in reference file of library sequences, order alphabetically by seq
mutant_table <- read.fasta(snakemake@params[["mutant_table"]])
mutant_table <- mutant_table[order(mutant_table$seq.text),]
# create column of HGSE gene symbols from seq names, necessary formatting for MAGeCK
for (name in mutant_table$seq.name){
symbol <- unlist(strsplit(name, '_'))[1]
mutant_table$symbol[mutant_table$seq.name == name] <- symbol
}
# read in sample sheet
if(length(sample_sheet_f) >= 1) {
if(endsWith(sample_sheet_f, ".csv")) {
sample_sheet <- read_csv(sample_sheet_f)
} else if(endsWith(sample_sheet_f, ".tsv")) {
sample_sheet <- read_tsv(sample_sheet_f)
} else {
sample_sheet <- read_excel(sample_sheet_f)
}
# make flexible for capitalization
colnames(sample_sheet) <- tolower(colnames(sample_sheet))
}
sample_sheet$group <- as.character(sample_sheet$group)
sample_sheet$control <- as.character(sample_sheet$control)
# if samples are to be normalized against abundance of seqs in plasmid library
if (length(vs_plasmid_lib) > 0){
# read in provided file of seq abundances in plasmid lib, sort alphabetically by sequence
plasmid_counts <- read.csv('/home/ec2-user/arrakis-internal-data/reference/libraries/200424_plasmid_norms_RCed.csv', fileEncoding="UTF-8-BOM")
plasmid_counts <- plasmid_counts[order(plasmid_counts$Seq), ]
# for each group of samples to be averaged and/or normalized together
for (group in groups){
# begin building MAGeCK input sheet using columns for seq name, HGSE symbol, and count in plasmid library
input_sheet <- as.data.frame(cbind(mutant_table$seq.name, mutant_table$symbol))
names(input_sheet) <- c('name', 'symbol')
input_sheet$plasmid_counts <- plasmid_counts$Count
# scrape samples in current group from sample sheet
samples <- sample_sheet$name[grepl(group, sample_sheet$group) & grepl('No', sample_sheet$control)]
# for each sample in current group
for (sample in samples){
# isolate sample's count table from list, order alphabetically by sequence
count_table <- count_tables[grepl(sample, names(count_tables))]
count_table <- as.data.frame(count_table)
names(count_table) <- c('Seq', 'Count')
count_table$Seq <- as.character(count_table$Seq)
count_table$Count <- as.numeric(count_table$Count)
count_table <- count_table[order(count_table$Seq),]
# add column to input sheet containing seq counts for current sample
input_sheet[sample] <- count_table$Count
}
# write input sheet containing plasmid counts and counts from all samples in group to file
write_tsv(input_sheet, paste0(outdir, group, '.txt'))
}
# if samples are to be normalized against unsorted samples
} else {
# for each group of samples to be averaged and/or normalized together
for (group in groups){
# determine which samples in group are controls and which are experimental samples
controls <- sample_sheet$name[grepl(group, sample_sheet$group) & grepl('Yes', sample_sheet$control)]
samples <- sample_sheet$name[grepl(group, sample_sheet$group) & grepl('No', sample_sheet$control)]
# begin building MAGeCK input sheet using columns for seq name and HGSE symbol
input_sheet <- as.data.frame(cbind(mutant_table$seq.name, mutant_table$symbol))
names(input_sheet) <- c('name', 'symbol')
# for each control sample in list of control samples
for (control in controls){
# isolate control sample's count table from list, order alphabetically by sequence
count_table <- count_tables[grepl(control, names(count_tables))]
count_table <- as.data.frame(count_table)
names(count_table) <- c('Seq', 'Count')
count_table$Seq <- as.character(count_table$Seq)
count_table$Count <- as.numeric(count_table$Count)
count_table <- count_table[order(count_table$Seq),]
# add column containing seq counts for current control sample to input sheet
input_sheet[control] <- count_table$Count
}
# for each experimental sample in list of experimental samples
for (sample in samples){
# isolate experimental sample's count table from list, order alphabetically by sequence
count_table <- count_tables[grepl(sample, names(count_tables))]
count_table <- as.data.frame(count_table)
names(count_table) <- c('Seq', 'Count')
count_table$Seq <- as.character(count_table$Seq)
count_table$Count <- as.numeric(count_table$Count)
# add column containing seq counts for current experimental sample to input sheet
count_table <- count_table[order(count_table$Seq),]
input_sheet[sample] <- count_table$Count
}
# write input sheet containing all samples in group to file
write_tsv(input_sheet, paste0(outdir, group, '.txt'))
}
}