Skip to content

Commit

Permalink
Merge pull request #4 from mdelcorvo/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
mdelcorvo authored Mar 4, 2023
2 parents 2c18d33 + 2378912 commit 14a849e
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 145 deletions.
9 changes: 5 additions & 4 deletions rules/calling.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
rule callable:
input:
ref=expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"]),
bam=outputdir + "recal/{sample}.bam",
ref=get_reference,
bam=outputdir + "dedup/{sample}.bam",
bed=config["filtering"]["restrict-regions"],
dict=expand("resources/reference_genome/{ref}/homo_sapiens.dict",ref=config["ref"]["build"]),
script = "scripts/callable_filt.R"
output:
out_bed=outputdir + "qc/coverage-stats/{sample}.bed",
Expand All @@ -23,8 +24,8 @@ rule callable:

rule mutect2:
input:
ref=expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"]),
bam = outputdir + "recal/{sample}.bam",
ref=get_reference,
bam = outputdir + "dedup/{sample}.bam",
pon= config["database_url"]["GRCh38"]["germline"]["PON"] if config["ref"]["build"]=='GRCh38' else config["database_url"]["GRCh37"]["germline"]["PON"],
exac= config["database_url"]["GRCh38"]["germline"]["ExAC"] if config["ref"]["build"]=='GRCh38' else config["database_url"]["GRCh37"]["germline"]["ExAC"],
bed=outputdir + "qc/coverage-stats/{sample}.final.bed"
Expand Down
20 changes: 5 additions & 15 deletions rules/database.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,13 @@

rule download_database:
output:
expand(["resources/database/{ref}/temp_ESP6500SI.vcf.tar.gz",
"resources/database/{ref}/1000G-phase_3.vcf.gz",
"resources/database/{ref}/ClinVar.vcf.gz",
"resources/database/{ref}/COSMIC.vcf.gz"],
ref=config["ref"]["build"])
multiext("{database_dir}", "temp_ESP6500SI.vcf.tar.gz", "1000G-phase_3.vcf.gz", "ClinVar.vcf.gz", "COSMIC.vcf.gz")
params:
build=config["ref"]["build"],
Cosmic = config["database_url"]["GRCh38"]["somatic"]["Cosmic"] if config["ref"]["build"]=='GRCh38' else config["database_url"]["GRCh37"]["somatic"]["Cosmic"],
Gen1K = config["database_url"]["GRCh38"]["germline"]["1000G"] if config["ref"]["build"]=='GRCh38' else config["database_url"]["GRCh37"]["germline"]["1000G"],
ESP = config["database_url"]["GRCh38"]["germline"]["ESP"] if config["ref"]["build"]=='GRCh38' else config["database_url"]["GRCh37"]["germline"]["ESP"],
Clinvar = config["database_url"]["GRCh38"]["germline"]["ClinVar"] if config["ref"]["build"]=='GRCh38' else config["database_url"]["GRCh37"]["germline"]["ClinVar"]
cache: True
shell:
"curl -k -L '{params.Cosmic}' > resources/database/{params.build}/COSMIC.vcf.gz; "
"curl -k -L '{params.Gen1K}' > resources/database/{params.build}/1000G-phase_3.vcf.gz; "
Expand All @@ -28,7 +23,6 @@ rule download_mapping:
build=config["ref"]["build"],
mappability = config["mappability"]["GRCh38"] if config["ref"]["build"]=='GRCh38' else config["mappability"]["GRCh37"],
kmer = config["kmer"]
cache: True
run:
if config["ref"]["build"]=='GRCh38':
shell("curl -L '{params.mappability}_{params.kmer}.bw' > resources/database/{params.build}/raw_mappability_{params.kmer}.bw; ")
Expand All @@ -43,7 +37,6 @@ rule ESP6500SI:
expand("resources/database/{ref}/ESP6500SI.vcf.gz",ref=config["ref"]["build"])
params:
build=config["ref"]["build"]
cache: True
conda:
"../envs/r4.yaml"
shell:
Expand All @@ -60,9 +53,8 @@ rule tabix_Cosmic:
expand("resources/database/{ref}/COSMIC.vcf.gz.tbi",ref=config["ref"]["build"])
params:
"-p vcf"
cache: True
wrapper:
"0.78.0/bio/tabix"
"v1.23.3/bio/tabix/index"

rule tabix_1000G:
input:
Expand All @@ -71,9 +63,8 @@ rule tabix_1000G:
expand("resources/database/{ref}/1000G-phase_3.vcf.gz.tbi",ref=config["ref"]["build"])
params:
"-p vcf"
cache: True
wrapper:
"0.78.0/bio/tabix"
"v1.23.3/bio/tabix/index"

rule tabix_ESP:
input:
Expand All @@ -82,14 +73,13 @@ rule tabix_ESP:
expand("resources/database/{ref}/ESP6500SI.vcf.gz.tbi",ref=config["ref"]["build"])
params:
"-p vcf"
cache: True
wrapper:
"0.78.0/bio/tabix"
"v1.23.3/bio/tabix/index"

rule edit_mappability:
input:
raw_map=expand("resources/database/{ref}/raw_mappability_{len}.bw",ref=config["ref"]["build"],len=config["kmer"]),
ref=expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"]),
ref=get_reference,
script = "scripts/edit_mappability.R"
output:
raw_bed= temp(expand("resources/database/{ref}/raw_mappability_{len}.bed",ref=config["ref"]["build"],len=config["kmer"])),
Expand Down
15 changes: 8 additions & 7 deletions rules/filtering.smk
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ rule snpsift_annotate_1000G:
benchmark:
outputdir + "benchmarks/1000G/{sample}.1000G.benchmark.txt"
wrapper:
"0.78.0/bio/snpsift/annotate"
"v1.23.3/bio/snpsift/annotate"

rule snpsift_annotate_ESP:
input:
Expand All @@ -42,7 +42,7 @@ rule snpsift_annotate_ESP:
benchmark:
outputdir + "benchmarks/ESP/{sample}.ESP.benchmark.txt"
wrapper:
"0.78.0/bio/snpsift/annotate"
"v1.23.3/bio/snpsift/annotate"

rule snpsift_annotate_ExAC:
input:
Expand All @@ -55,20 +55,21 @@ rule snpsift_annotate_ExAC:
benchmark:
outputdir + "benchmarks/ExAC/{sample}.ExAC.benchmark.txt"
wrapper:
"0.78.0/bio/snpsift/annotate"
"v1.23.3/bio/snpsift/annotate"

rule snpsift_annotate_dbSNP:
input:
call=outputdir + "variant/{sample}.filt.vcf",
database=expand("resources/database/{ref}/variation.noiupac.vcf.gz",ref=config["ref"]["build"])
database=expand("resources/database/{ref}/variation.vcf.gz",ref=config["ref"]["build"]),
tbi=expand("resources/database/{ref}/variation.vcf.gz.tbi",ref=config["ref"]["build"])
output:
call=outputdir + "annotation/{sample}.dbSNP.vcf"
resources:
mem_mb=1024
benchmark:
outputdir + "benchmarks/dbSNP/{sample}.dbSNP.benchmark.txt"
wrapper:
"0.78.0/bio/snpsift/annotate"
"v1.23.3/bio/snpsift/annotate"

rule Cosmic_annotate:
input:
Expand Down Expand Up @@ -96,12 +97,12 @@ rule snpsift_annotate_ClinVar:
benchmark:
outputdir + "benchmarks/ClinVar/{sample}.ClinVar.benchmark.txt"
wrapper:
"0.78.0/bio/snpsift/annotate"
"v1.23.3/bio/snpsift/annotate"

rule vcf_coverage:
input:
call=outputdir + "variant/{sample}.filt.vcf",
bam=outputdir + "recal/{sample}.bam"
bam=outputdir + "dedup/{sample}.bam"
params:
config["bamstats"]
output:
Expand Down
3 changes: 3 additions & 0 deletions rules/functions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ def getpath(str):
str += '/'
return str

def get_reference(wildcards):
return os.path.join('resources/reference_genome/',config["ref"]["build"], "homo_sapiens.fasta")

###########################

def get_fastq(wildcards):
Expand Down
72 changes: 27 additions & 45 deletions rules/genome.smk
Original file line number Diff line number Diff line change
@@ -1,32 +1,31 @@
rule get_genome:
output:
expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"])
expand("resources/reference_genome/{ref}/homo_sapiens.fasta",ref=config["ref"]["build"])
params:
species=config["ref"]["species"],
species="homo_sapiens",
datatype="dna",
build=config["ref"]["build"],
release=config["ref"]["release"]
log:
outputdir + "logs/ensembl/get_genome.log"
cache: True
cache: "omit-software"
wrapper:
"0.78.0/bio/reference/ensembl-sequence"
"v1.23.3/bio/reference/ensembl-sequence"

rule get_annotation:
output:
expand("resources/reference_genome/{ref}/{species}_annotation.gtf",ref=config["ref"]["build"],species=config["ref"]["species"])
expand("resources/reference_genome/{ref}/homo_sapiens.gtf",ref=config["ref"]["build"])
params:
species=config["ref"]["species"],
species="homo_sapiens",
release=config["ref"]["release"] if config["ref"]["release"]=='GRCh38' else 87,
build=config["ref"]["build"],
fmt="gtf"
cache: True # save space and time with between workflow caching (see docs)
build=config["ref"]["build"]
cache: "omit-software"
wrapper:
"0.78.0/bio/reference/ensembl-annotation"
"v1.23.3/bio/reference/ensembl-annotation"

rule gtf:
input:
gtf=expand("resources/reference_genome/{ref}/{species}_annotation.gtf",ref=config["ref"]["build"],species=config["ref"]["species"]),
gtf=expand("resources/reference_genome/{ref}/homo_sapiens.gtf",ref=config["ref"]["build"]),
script = "scripts/exon_ranking.R"
params:
build=config["ref"]["build"]
Expand All @@ -39,76 +38,59 @@ rule gtf:

rule genome_dict:
input:
expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"])
get_reference
output:
expand("resources/reference_genome/{ref}/{species}.dict",ref=config["ref"]["build"],species=config["ref"]["species"])
expand("resources/reference_genome/{ref}/homo_sapiens.dict",ref=config["ref"]["build"])
conda:
"../envs/samtools.yaml"
cache: True
shell:
"samtools dict {input} > {output} "

rule bwa_index:
input:
expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"])
get_reference
output:
expand("resources/reference_genome/{ref}/{species}.fasta.{bwa}",ref=config["ref"]["build"],species=config["ref"]["species"],bwa=["amb", "ann" ,"bwt", "pac", "sa"])
idx=expand("resources/reference_genome/{ref}/homo_sapiens.fasta.{bwa}",ref=config["ref"]["build"],bwa=["amb", "ann" ,"bwt", "pac", "sa"])
params:
algorithm="bwtsw"
log:
outputdir + "logs/bwa_index/bwa_index.log"
resources:
mem_mb=369000
cache: True
wrapper:
"0.78.0/bio/bwa/index"
"v1.23.3/bio/bwa/index"

rule genome_faidx:
input:
expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"]),
expand("resources/reference_genome/{ref}/{species}.fasta.{bwa}",ref=config["ref"]["build"],species=config["ref"]["species"],bwa=["amb", "ann" ,"bwt", "pac", "sa"])
get_reference,
expand("resources/reference_genome/{ref}/homo_sapiens.fasta.{bwa}",ref=config["ref"]["build"],bwa=["amb", "ann" ,"bwt", "pac", "sa"])
output:
expand("resources/reference_genome/{ref}/{species}.fasta.fai",ref=config["ref"]["build"],species=config["ref"]["species"])
cache: True
expand("resources/reference_genome/{ref}/homo_sapiens.fasta.fai",ref=config["ref"]["build"])
wrapper:
"0.78.0/bio/samtools/faidx"
"v1.23.3/bio/samtools/faidx"

rule get_known_variation:
input:
# use fai to annotate contig lengths for GATK BQSR
fai=expand("resources/reference_genome/{ref}/{species}.fasta.fai",ref=config["ref"]["build"],species=config["ref"]["species"])
fai=expand("resources/reference_genome/{ref}/homo_sapiens.fasta.fai",ref=config["ref"]["build"])
output:
vcf=temp(expand("resources/database/{ref}/variation.vcf.gz",ref=config["ref"]["build"]))
vcf=expand("resources/database/{ref}/variation.vcf.gz",ref=config["ref"]["build"])
params:
species=config["ref"]["species"],
species="homo_sapiens",
build=config["ref"]["build"],
release=config["ref"]["release"],
type="all"
cache: True
cache: "omit-software" # save space and time with between workflow caching (see docs)
wrapper:
"0.78.0/bio/reference/ensembl-variation"

rule remove_iupac_codes:
input:
expand("resources/database/{ref}/variation.vcf.gz",ref=config["ref"]["build"])
output:
expand("resources/database/{ref}/variation.noiupac.vcf.gz",ref=config["ref"]["build"])
conda:
"../envs/rbt.yaml"
cache: True
shell:
"rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}"
"v1.23.3/bio/reference/ensembl-variation"

rule tabix_known_variants:
input:
expand("resources/database/{ref}/variation.noiupac.vcf.gz",ref=config["ref"]["build"])
expand("resources/database/{ref}/variation.vcf.gz",ref=config["ref"]["build"])
output:
expand("resources/database/{ref}/variation.noiupac.vcf.gz.tbi",ref=config["ref"]["build"])
expand("resources/database/{ref}/variation.vcf.gz.tbi",ref=config["ref"]["build"])
params:
"-p vcf"
cache: True
wrapper:
"0.78.0/bio/tabix"
"v1.23.3/bio/tabix/index"

rule snpeff_download:
output:
Expand Down
43 changes: 6 additions & 37 deletions rules/mapping.smk
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ rule trim_reads_pe:
benchmark:
outputdir + "benchmarks/trimmomatic/{sample}.trim.benchmark.txt"
wrapper:
"0.78.0/bio/trimmomatic/pe"
"v1.23.3/bio/trimmomatic/pe"

rule map_reads:
input:
Expand All @@ -35,52 +35,21 @@ rule map_reads:
sort_order="coordinate"
threads: config["ncores"]
wrapper:
"0.78.0/bio/bwa/mem"
"v1.23.3/bio/bwa/mem"

rule mark_duplicates:
input:
outputdir + "mapped/{sample}.sorted.bam"
bams=outputdir + "mapped/{sample}.sorted.bam"
output:
bam=temp(outputdir + "dedup/{sample}.bam"),
bam=outputdir + "dedup/{sample}.bam",
metrics=outputdir + "qc/dedup/{sample}.metrics.txt"
log:
outputdir + "logs/picard/dedup/{sample}.log"
benchmark:
outputdir + "benchmarks/picard/{sample}.picard.benchmark.txt"
params:
"REMOVE_DUPLICATES=true OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT"
extra="--REMOVE_DUPLICATES true --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 --CREATE_INDEX true --VALIDATION_STRINGENCY LENIENT"
resources:
mem_mb=4096
wrapper:
"0.78.0/bio/picard/markduplicates"

rule gatk_baserecalibrator:
input:
bam=outputdir + "dedup/{sample}.bam",
ref=expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"]),
dict=expand("resources/reference_genome/{ref}/{species}.dict",ref=config["ref"]["build"],species=config["ref"]["species"]),
known=expand("resources/database/{ref}/variation.noiupac.vcf.gz",ref=config["ref"]["build"]),
tbi=expand("resources/database/{ref}/variation.noiupac.vcf.gz.tbi",ref=config["ref"]["build"])
output:
recal_table=outputdir + "recal/{sample}.table"
log:
outputdir + "logs/gatk/bqsr/{sample}.log"
benchmark:
outputdir + "benchmarks/gatk/bqsr/{sample}.gatk.bqsr.benchmark.txt"
wrapper:
"0.78.0/bio/gatk/baserecalibrator"

rule gatk_applybqsr:
input:
bam=outputdir + "dedup/{sample}.bam",
ref=expand("resources/reference_genome/{ref}/{species}.fasta",ref=config["ref"]["build"],species=config["ref"]["species"]),
dict=expand("resources/reference_genome/{ref}/{species}.dict",ref=config["ref"]["build"],species=config["ref"]["species"]),
recal_table=outputdir + "recal/{sample}.table"
output:
bam=outputdir + "recal/{sample}.bam"
log:
outputdir + "logs/gatk/gatk_applybqsr/{sample}.log"
benchmark:
outputdir + "benchmarks/gatk/gatk_applybqsr/{sample}.gatk.applybqsr.benchmark.txt"
wrapper:
"0.78.0/bio/gatk/applybqsr"
"v1.23.3/bio/picard/markduplicates"
Loading

0 comments on commit 14a849e

Please sign in to comment.