updated get indels, added transgene annotations, updated channels
nidhidav committed Sep 5, 2024
1 parent 10cde29 commit 83554db
Showing 10 changed files with 1,240 additions and 265 deletions.
649 changes: 649 additions & 0 deletions bin/extract_variant_reads.py

Large diffs are not rendered by default.

737 changes: 502 additions & 235 deletions bin/getIndelsFromBam.py

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions bin/make_hotspot_file.py
@@ -42,10 +42,10 @@ def main():
parser.add_argument('--bed', type=checkfile, help="bed hotspot file")
parser.add_argument('--csv', type=checkfile, help="csv hotspot file")
parser.add_argument('--window', type=int, default=200, help="window")
parser.add_argument('--id', help="meta id")
args = parser.parse_args()

window = args.window
# assumes csv_file ranges don't overlap
data_df = pd.DataFrame()
if (args.bed):
bed_file = os.path.realpath(args.bed)
@@ -69,12 +69,13 @@ def main():
row_df['Chromosome'] = rows['Chromosome']
vcf_df = pd.concat([vcf_df, row_df], ignore_index=True)
result_vcf = dataframe_to_vcf(vcf_df)
with open(f"hotspot.vcf", "w") as f:
id = args.id
with open(f"{id}.hotspot.vcf", "w") as f:
f.write(result_vcf)

if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python script.py --bed <bed_file> --csv <csv_file> --window [window]")
print("Usage: python script.py --bed <bed_file> --csv <csv_file> --window [window] --id <id>")
sys.exit(1)
else:
main()
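
With the new `--id` argument the output VCF is named per sample instead of the fixed `hotspot.vcf`. A hypothetical invocation matching the updated usage string (the file names and the `SAMPLE1` id are illustrative only):

    python make_hotspot_file.py --bed hotspots.bed --csv hotspots.csv --window 200 --id SAMPLE1
    # expected to write SAMPLE1.hotspot.vcf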
25 changes: 25 additions & 0 deletions modules/local/annotate_transgene.nf
@@ -0,0 +1,25 @@
process ANNOTATE_TRANSGENE_VARIANTS {
tag "$meta.id"
label 'process_low'
label 'final_output'
container "ghcr.io/dhslab/docker-vep:release_105"

input:
tuple val(meta), path(files), path("${meta.id}.transgene_out.tsv")

output:
tuple val(meta), path("${meta.id}.transgene.annotated.tsv"), emit: annotated_transgenes
path "versions.yml", emit: versions

script:
"""
awk 'BEGIN {FS=OFS="\\t"} NR > 1 {sub(/^chr/, "", \$1); for (i = 1; i < NF; i++) {printf "%s%s", \$i, (i == NF-1 ? "\\n" : OFS)}}' ${meta.id}.transgene_out.tsv > ${meta.id}.edited.transgene.tsv && \\
/opt/vep/src/ensembl-vep/vep --cache --dir /storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/VEP_cache --symbol --per_gene -i ${meta.id}.edited.transgene.tsv -o ${meta.id}.transgene.annotated.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
vep: \$(/opt/vep/src/ensembl-vep/vep 2>&1 | grep ensembl-vep | cut -d ':' -f 2 | sed 's/\s*//g')
END_VERSIONS
"""

}
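
The awk preprocessing step skips the header row (`NR > 1`), strips a leading `chr` from the first column, and drops the last column, leaving a tab-separated file that VEP can read. On a made-up input row (the column layout is illustrative, not taken from the source):

    chr2    60000000    60000001    -/AAV    +    junction_read_count

the output would be

    2    60000000    60000001    -/AAV    +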
8 changes: 4 additions & 4 deletions modules/local/get_indels.nf
@@ -2,19 +2,19 @@ process GET_INDELS {
tag "$meta.id"
label 'process_low'
label 'final_output'
container "ghcr.io/dhslab/docker-cleutils"
container "ghcr.io/dhslab/docker-baseimage:latest"
errorStrategy 'ignore'

input:
tuple val(meta), path(files)
path(targetfile)
tuple val(meta), path(files), path(hotspot_file)

output:
tuple val(meta), path("${meta.id}.indels.txt")
path "versions.yml", emit: versions

script:
"""
getIndelsFromBam.py ${targetfile} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt
extract_variant_reads.py ${hotspot_file} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
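
GET_INDELS now receives each sample's hotspot file inside the sample tuple rather than as a single shared `targetfile`, and calls the new `extract_variant_reads.py` in place of `getIndelsFromBam.py` with the same argument shape. A well-formed input element would look roughly like this (paths hypothetical):

    [ [id: 'SAMPLE1'], ['SAMPLE1_tumor.cram', 'SAMPLE1.cram'], 'SAMPLE1.hotspot.vcf' ]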
2 changes: 1 addition & 1 deletion modules/local/get_transgene_junctions.nf
@@ -8,7 +8,7 @@ process GET_TRANSGENE_JUNCTIONS {
tuple val(meta), path(files)

output:
tuple val(meta), path("${meta.id}.transgene_out.tsv")
tuple val(meta), path("${meta.id}.transgene_out.tsv"), emit: transgene_file
path "versions.yml" , emit: versions

script:
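
Naming the emit lets downstream wiring refer to this output channel by name rather than by position; the new join in workflows/scge.nf (further down in this commit) does exactly that:

    annotate_transgene_input = ch_dragen_outputs.join(GET_TRANSGENE_JUNCTIONS.out.transgene_file)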
2 changes: 1 addition & 1 deletion modules/local/make_hotspot_file.nf
@@ -25,7 +25,7 @@ process MAKE_HOTSPOT_FILE {
def bed = bed_file.name != 'NO_FILE.bed' ? "--bed $bed_file" : ''
def csv = csv_file.name != 'NO_FILE.csv' ? "--csv $csv_file" : ''
"""
make_hotspot_file.py $bed $csv --window $params.window
make_hotspot_file.py $bed $csv --window $params.window --id ${info[0].id}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
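
Because make_hotspot_file.py now writes `<id>.hotspot.vcf`, the module's output declaration (outside this diff hunk) presumably captures the id-based name, along these lines, matching the `hotspot_vcf` emit consumed in workflows/scge.nf:

    output:
    tuple val(info), path("${info[0].id}.hotspot.vcf"), emit: hotspot_vcf  // assumed; declaration not shown in the diff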
6 changes: 3 additions & 3 deletions nextflow.config
@@ -52,9 +52,9 @@ params {

// Dragen inputs
dragen_inputs {
dragen_hash = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/hg38_PLVM_CD19_CARv4_cd34"
snv_noisefile = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dragen_v1.0_systematic_noise.nextera_wgs.120920.bed.gz"
sv_noisefile = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/WGS_v2.0.0_hg38_sv_systematic_noise.bedpe.gz"
dragen_hash = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6"
snv_noisefile = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v.2.0.0_systematic_noise.snv.bed.gz"
sv_noisefile = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v3.0.0_systematic_noise.sv.bedpe.gz"
dbsnp = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz"
dbsnp_index = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz.tbi"
pop_af_vcf = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
21 changes: 19 additions & 2 deletions subworkflows/local/somatic_input_check.nf
@@ -156,8 +156,26 @@ workflow SOMATIC_INPUT_CHECK {

ch_input_data = ch_input_data.mix(ch_bam)

ch_mastersheet
.map { meta ->
if (meta.dragen_path != null){
def new_meta = [:]
new_meta['id'] = meta.id
return [new_meta]
}
}
.set{ch_ids}
ch_input_data = ch_input_data.mix(ch_ids)

ch_hotspots = ch_mastersheet
.map{ row -> [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]}
.map{ row ->
if (row.uid != null) {
return [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]
}
else {
return [row.id, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]
}
}
.unique()

ch_input_data = ch_input_data
@@ -175,7 +193,6 @@
}
}
.set { ch_dragen_outputs }
ch_dragen_outputs.dump(tag: 'dragen_output')

emit:
dragen_outputs = ch_dragen_outputs
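
With these changes, `ch_hotspots` emits one `[key, file]` pair per mastersheet row, keyed on `uid` when the sheet provides one and falling back to `id` otherwise, with the `NO_FILE.csv` placeholder standing in when no hotspot file is listed. A hypothetical result (values illustrative):

    ['case_1_uid', '/data/case_1.hotspots.csv']
    ['SAMPLE2', "$projectDir/assets/NO_FILE.csv"]  // row had no uid and no hotspot_file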
48 changes: 32 additions & 16 deletions workflows/scge.nf
Expand Up @@ -50,6 +50,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft
include { MAKE_HOTSPOT_FILE } from '../modules/local/make_hotspot_file.nf'
include { DRAGEN_SCGE } from '../modules/local/dragen_scge.nf'
include { ANNOTATE_VARIANTS } from '../modules/local/annotate_variants.nf'
include { ANNOTATE_TRANSGENE_VARIANTS } from '../modules/local/annotate_transgene.nf'
include { GET_INDELS } from '../modules/local/get_indels.nf'
include { GET_TRANSGENE_JUNCTIONS } from '../modules/local/get_transgene_junctions.nf'
include { REFORMAT_CNV_DATA } from '../modules/local/reformat_cnv_data.nf'
@@ -103,36 +104,51 @@ workflow SCGE {
SOMATIC_INPUT_CHECK(Channel.fromPath(mastersheet), data_path)

ch_input_data = SOMATIC_INPUT_CHECK.out.input_data
hotspot_input = ch_input_data.combine(hotspot_bed)
MAKE_HOTSPOT_FILE(hotspot_input)
ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf
.map{ info, hotspot_vcf ->
def newinfo = []
newinfo = info + [hotspot_vcf]
newinfo
}

ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))
ch_hotspots = ch_input_data.map{ meta, file -> return [meta[0]['id'], file] }

if (params.run_dragen == true) {

hotspot_input = ch_input_data.combine(hotspot_bed)
MAKE_HOTSPOT_FILE(hotspot_input)
ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf
.map{ info, hotspot_vcf ->
def newinfo = []
newinfo = info + [hotspot_vcf]
newinfo
}

ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))

DRAGEN_SCGE (ch_input_data, ch_dragen_inputs)
ch_versions = ch_versions.mix(DRAGEN_SCGE.out.versions)
ch_dragen_outputs = ch_dragen_outputs.mix(DRAGEN_SCGE.out.dragen_output)
} else {
ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))
}

if (params.run_analysis == true) {

ANNOTATE_VARIANTS (ch_dragen_outputs)
ANNOTATE_VARIANTS (ch_dragen_outputs, ch_assay_inputs, params.fasta)
ch_versions = ch_versions.mix(ANNOTATE_VARIANTS.out.versions)

GET_INDELS (ch_dragen_outputs)

get_indels_input = ch_dragen_outputs
.map{ meta -> [meta[0]['id'], meta] }
.combine(ch_hotspots, by: 0)
.map{id, meta, hotspot_file -> [meta[0], meta[1], hotspot_file]}
GET_INDELS (get_indels_input)
ch_versions = ch_versions.mix(GET_INDELS.out.versions)

GET_TRANSGENE_JUNCTIONS (ch_dragen_outputs)
ch_versions = ch_versions.mix(GET_TRANSGENE_JUNCTIONS.out.versions)

annotate_transgene_input = ch_dragen_outputs.join(GET_TRANSGENE_JUNCTIONS.out.transgene_file)
ANNOTATE_TRANSGENE_VARIANTS (annotate_transgene_input)
ch_versions = ch_versions.mix(ANNOTATE_TRANSGENE_VARIANTS.out.versions)

REFORMAT_CNV_DATA (ch_dragen_outputs)
ch_versions = ch_versions.mix(REFORMAT_CNV_DATA.out.versions)
}
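
The `get_indels_input` wiring above is a standard key-join: tag each dragen output with its sample id, `combine(..., by: 0)` to pair it with the matching hotspot file, then drop the key. A minimal self-contained sketch of the same pattern, with toy values rather than pipeline data:

    workflow {
        ch_outputs  = Channel.of([[id: 'S1'], ['S1.cram']], [[id: 'S2'], ['S2.cram']])
        ch_hotspots = Channel.of(['S1', 'S1.hotspot.vcf'], ['S2', 'S2.hotspot.vcf'])

        ch_outputs
            .map { meta, files -> [meta.id, meta, files] }               // key on sample id
            .combine(ch_hotspots, by: 0)                                 // pair with matching hotspot
            .map { id, meta, files, hotspot -> [meta, files, hotspot] }  // drop the key
            .view()
    }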
