#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""main.py: This is the run script of our mutant sequence count analysis.
This script will produce the following folders/files:
- demultiplexing: a folder that contains all demultiplexed fastqs
- lanes: contains all fastqc controls foe samples and merged samples
- merged: contains all merged fastqs produced by NGmerge
- most_common_sequences: contains an estimate of the topmost common sequences
- read_counts: contains the total number of reads observed
- seq_counts contains a list of predefined sequences to be searched for and also
a couple of files per sample:
- sequence counts for all sequences present in the merged fastq files ("all_reads")
- sequence counts for all trimmed sequences present in the merged fastq files,
if a trim function was supplied ("all_reads_trimmed")
- sequence counts for only the predefined sequences ("sequence_count")
- sequence counts for sequences that were not supplied as predefined ("sequence_count_unmatched")
- summary_counts.tsv will contain a summary of all predefined sequence counts over
all samples.
"""
__author__ = "Marco Mernberger"
__copyright__ = "Copyright (c) 2020 Marco Mernberger"
__license__ = "mit"
import pypipegraph2 as ppg
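# replace_ppg1() makes libraries that still import the legacy pypipegraph
# transparently use pypipegraph2 instead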
ppg.replace_ppg1()
import pandas as pd
import mbf
import katha_tools  # project helper module; may provide get_barcode_df (see sketch below)
import mmdemultiplex
import counting_sequences
from pathlib import Path
from counting_sequences import ngmerge
########################
# instantiate pipegraph
########################
ppg.new()
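# the job declarations below are only registered on this graph; no job is
# executed until ppg.run() at the bottom of this script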
#######################
# initialize variables
#######################
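# Expected input layout (inferred from the paths used below):
#   incoming/
#       NovaSeq_incoming_sequences_Exon5678.xlsx  (sample sheet + predefined sequences)
#       fastqs/  (paired-end fastqs, file names starting with the exon prefixes below)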
# incoming/results as Path objects, since they are used with the "/" operator below
incoming = Path("incoming")
results = Path("results")
info_data_file = incoming / "NovaSeq_incoming_sequences_Exon5678.xlsx"
barcode_sheet = "Übersicht"  # "overview", the sheet holding the sample/barcode table
barcode_df = pd.read_excel(info_data_file, sheet_name=barcode_sheet, skiprows=2)
barcode_df = barcode_df.fillna(method="ffill", axis=0)  # fill blanks left by merged Excel cells
pool_col_name = "Library Name"
sample_col_name = "Probenname"  # "sample name"
fw_col_name = "Barcode fw"  # forward barcode column
rv_col_name = "Barcode rev"  # reverse barcode column
trim_start = "trim after start"  # we want to trim the constant part at the start
trim_end = "trim before end"  # we want to trim the constant part at the end
summary_output_file = results / "summary_counts.tsv"
# The exons to be analyzed. Keys are the file prefixes of the fastq files,
# values are the names of the sequencing libraries (= their sheet names in
# the info workbook).
exons = {
    "Exon5_DMSO": "Exon 5",
    "Exon5_N3a": "Exon 5",
    "Exon6_DMSO": "Exon 6",
    "Exon6_N3a": "Exon 6",
    "Exon8_DMSO": "Exon 8",
    "Exon8_N3a": "Exon 8",
    "cDNA_Exon5": "Exon 5 cDNA",
    "cDNA_Exon6": "Exon 6 cDNA",
    "cDNA_Exon8": "Exon 8 cDNA",
}
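
# get_barcode_df is called below but neither defined nor imported in this
# script; it presumably lives in a project helper (perhaps katha_tools).
# The following is a minimal, hypothetical sketch with the signature implied
# by the call site; swap in the project's own implementation if available.
def get_barcode_df(
    exon,
    barcode_df,
    pool_col_name,
    sample_col_name,
    fw_col_name,
    rv_col_name,
    trim_start,
    trim_end,
):
    """Return the barcode table of a single library from the sample sheet.

    Assumes the pool column holds the fastq prefix of each library and that
    the Demultiplexer accepts a DataFrame indexed by sample name with the
    barcode and trim columns.
    """
    # keep only the rows belonging to the requested library/pool
    df = barcode_df[barcode_df[pool_col_name] == exon]
    # index by sample name; keep barcodes and trimming instructions
    return df.set_index(sample_col_name)[
        [fw_col_name, rv_col_name, trim_start, trim_end]
    ]
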
################################
# declare the jobs for pipegraph
################################
dependencies_for_summary = []
per_sample_count_files = {}
# analyze each exon separately
for exon in exons:
    # demultiplex each exon library into replicates
    raw_pool = mmdemultiplex.samples.DemultiplexInputSample(
        exon,
        input_strategy=mbf.align.strategies.FASTQsFromPrefix(
            str(incoming / f"fastqs/{exon}_")
        ),
        reverse_reads=False,
        pairing="paired",
    )
    demultiplexer = mmdemultiplex.demultiplex.Demultiplexer(
        raw_pool,
        get_barcode_df(
            exon,
            barcode_df,
            pool_col_name,
            sample_col_name,
            fw_col_name,
            rv_col_name,
            trim_start,
            trim_end,
        ),
        output_folder=results / "demultiplexing",
    )
    demultiplexer.do_demultiplex()
    # get a dict of demultiplexed samples/replicates from the demultiplexer
    samples = demultiplexer.get_samples()
    # initialize the sequence counter for this exon
    sequence_counter = counting_sequences.SequenceCounter(
        sequence_file_path=Path(info_data_file),  # file with predefined sequences
        name=f"count_{exon}",
        seqs_to_trim_reads=None,
        seqs_to_trim_predefined=None,
        trimmed_length=None,
        result_folder=str(results / f"seq_counts/{exon}"),
        sequence_df_filter=None,
        sheet_name=exons[exon],
    )
    # for each sample/replicate
    for sample_name in samples:
        # merge overlapping R1 and R2 reads into a single read and create a
        # new merged single-end fastq file
        print(sample_name)
        sample = samples[sample_name]
        sample.save_input()
        r1, r2 = sample.get_aligner_input_filenames()
        merged_path = results / "merged"
        merged_fastq = merged_path / f"{sample_name}_merged.fastq.gz"
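        # NGmerge options: -l writes a per-read-pair stitching log, -f writes
        # the read pairs that failed stitching, -z gzip-compresses the output,
        # -d allows dovetailed alignments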
        merge_job = ngmerge(
            merged_fastq,
            r1=r1,
            r2=r2,
            dependencies=[sample.prepare_input()],
            options={
                "-l": str(merged_path / f"{sample_name}_merged.log"),
                "-f": str(merged_path / f"{sample_name}_merged.failed"),
                "-z": "",
                "-d": "",
            },
        )
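        # wrap the merged fastq as a new single-end sample; FASTQsFromJob ties
        # the sample's input to the output of merge_job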
        merged_sample = mbf.align.raw.Sample(
            f"{sample_name}_merged",
            input_strategy=mbf.align.strategies.FASTQsFromJob(merge_job),
            reverse_reads=False,
            pairing="single",
        )
        # count the number of reads remaining in the merged fastq file
        counting_sequences.get_reads_for_lanes_df(
            results / "read_counts" / f"{sample_name}_read_counts_after_merge.tsv",
            {f"{sample_name}_merged": merged_sample},
            dependencies=[merged_sample.prepare_input()],
        )
        r1, *_ = merged_sample.get_aligner_input_filenames()
        # count the most common sequences
        counting_sequences.CutadaptMatch.count_most_common_sequences(
            r1=r1,
            r2=None,
            output_file=results
            / "most_common_sequences"
            / "merged"
            / f"{sample_name}_merged_most_common.tsv",
            max=100000,
            dependencies=[merged_sample.prepare_input()],
        )
        # count the number of occurrences of each sequence in the merged reads
        # by exact matching
        job = sequence_counter.write_count_table(merged_sample)
        per_sample_count_files[sample_name] = job.outputs[0]
        dependencies_for_summary.append(job)
# combine all per-sample count tables into one summary table
sequence_counter.combine(
    per_sample_count_files, summary_output_file, dependencies_for_summary
)
###################
# run the pipegraph
###################
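# build the dependency graph and execute all declared jobs (pypipegraph
# re-runs only jobs whose inputs have changed)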
ppg.run()