Skip to content

Commit

Permalink
readme update with instructions to create genome file
Browse files Browse the repository at this point in the history
config file and code updated
- field separator for input files
- added gains and deep loss quick filters
- does not select plot data points when ratio file annotation is not
provided
  • Loading branch information
rghu committed Jan 21, 2020
1 parent 7482835 commit 2bbec11
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 16 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ optional arguments:
only if providing VCF)
```
ReconCNV has been tested on MacOS and Linux environments.
## Quickstart with an example
At the minimum we need the ratio file and genome file to generate a plot using reconCNV. First make sure values of keys within the "column_names" field in the JSON configuration file matches those seen on the header of ratio and genome files.
Expand Down Expand Up @@ -245,6 +247,12 @@ Genome file can have any number of columns in addition to those described above.
}
```
Users can create a genome file by parsing the index file generated by `samtools faidx` command and adding the corresponding header (chromosome, length, cumulative_length).
```
samtools faidx <genome>.fa # this generates <genome>.fa.fai - index file
cut -f 1,2 <genome>.fa.fai | awk 'BEGIN{CUMSUM=0}{print $0"\t"CUMSUM; CUMSUM = CUMSUM + $2}'
```
Refer to an example genome file `data/hg19_genome_length.txt`.
**Segmentation file**
Expand Down
13 changes: 10 additions & 3 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"log2FC": "log2",
"weight": "weight"
},
"field_separator": "\t",
"off_target_label": "Antitarget",
"off_target_low_conf_log2": -10,
"weight_scaling_factor": 10
Expand All @@ -19,7 +20,8 @@
"chromosome": "chromosome",
"chr_length": "length",
"chr_cumulative_length": "length_cumsum"
}
},
"field_separator": "\t"
},
"segmentation_file":{
"column_names":{
Expand All @@ -28,7 +30,8 @@
"end": "end",
"log2FC": "log2",
"gene": "gene"
}
},
"field_separator": "\t"
},
"gene_file":{
"column_names":{
Expand All @@ -38,7 +41,10 @@
"gene": "gene",
"log2FC": "log2"
},
"field_separator": "\t",
"loss_threshold": -0.4,
"deep_loss_threshold": -1.0,
"gain_threshold" : 0.5,
"amp_threshold": 1.5
},
"annotation_file":{
Expand All @@ -49,7 +55,8 @@
"tx_id": "name",
"exon_number": "exon_number",
"gene": "name2"
}
},
"field_separator": "\t"
},
"vcf_file":{
"info_fields":{
Expand Down
105 changes: 92 additions & 13 deletions reconCNV.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import vcf
import json
import logging

# module for command line prompt
import argparse
Expand Down Expand Up @@ -70,13 +71,24 @@
dest="bed_blacklist", default=None,
help="File containing variants to NOT plot VAF. (applicable only if providing VCF)")

parser.add_argument("--verbose", "-l", required=False,
dest="verb_log", default=None, action="store_true",
help="Verbose logging output")

options = parser.parse_args()

if options.verb_log:
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.DEBUG)
else:
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

outdir = options.out_dir

with open(options.config_file, "r") as json_file:
config = json.load(json_file)

logging.info("Successfully read the configuration file.")


def draw_chr_boundary(figure, chr_boundary, genome, vaf):
# first line drawn at the beginning of the plotting panel
Expand Down Expand Up @@ -155,32 +167,64 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):


# read ratio file for plotting log2(FC) points
data = pd.read_csv(options.ratio_file, sep="\t")
data = pd.read_csv(options.ratio_file, sep=config['files']['ratio_file']['field_separator'])
logging.info("Successfully read the ratio file.")

# make sure chromosome names are string data type
data[config['files']['ratio_file']['column_names']['chromosome']] = \
data[config['files']['ratio_file']['column_names']['chromosome']].astype(str)

# index generation for marking chromosomes and drawing chromosome lines
data["ind"] = range(len(data))

# check if gene annotation column is provided accurately
if config['files']['ratio_file']['column_names']['gene'] not in data.columns:
data[config['files']['ratio_file']['column_names']['gene']] = "-"
logging.warning("Column titled \"" + config['files']['ratio_file']['column_names'][
'gene'] + "\" not found in the ratio file. Using \"-\" in place of the annotation. "
"Even off-target bins might be marked as on-target!")

# check if weight column is provided accurately
if config['files']['ratio_file']['column_names']['weight'] not in data.columns:
data[config['files']['ratio_file']['column_names']['weight']] = 0.1
logging.warning("Column titled \"" + config['files']['ratio_file']['column_names'][
'weight'] + "\" not found in the ratio file. Using \"0.1\" in place of the weight.")

# color background off-target bins (aka Antitarget) differently from on-target bins (aka Target)
data['label'] = np.where(
(data[config['files']['ratio_file']['column_names']['gene']] == config['files']['ratio_file']['off_target_label']),
"Antitarget", "Target")

# do not plot antitarget bins that are below log2 -10 (low confidence points)
data[config['files']['ratio_file']['column_names']['log2FC']] = np.where(np.logical_and(
data[config['files']['ratio_file']['column_names']['log2FC']] < config['files']['ratio_file'][
'off_target_low_conf_log2'], data.label == "Antitarget"), np.nan, data[config['files']['ratio_file'][
'column_names']['log2FC']])

chr_cumsum = pd.read_csv(options.genome_file, sep="\t")
chr_cumsum = pd.read_csv(options.genome_file, sep=config['files']['genome_file']['field_separator'])
logging.info("Successfully read the genome file.")

# make sure chromosome names are string data type
chr_cumsum[config['files']['genome_file']['column_names']['chromosome']] = \
chr_cumsum[config['files']['genome_file']['column_names']['chromosome']].astype(str)

# retrieve genome level coordinates from chromosome level coordinates
data = pd.merge(data, chr_cumsum, left_on=config['files']['ratio_file']['column_names']['chromosome'],
right_on=config['files']['genome_file']['column_names']['chromosome'], how='left')
data['genome_cumsum'] = data[config['files']['ratio_file']['column_names']['start']] + data[
config['files']['genome_file']['column_names']['chr_cumulative_length']]

if (options.seg_file):
seg = pd.read_csv(options.seg_file, sep="\t")
seg = pd.read_csv(options.seg_file, sep=config['files']['segmentation_file']['field_separator'])
logging.info("Successfully read the genome file.")

# check if gene annotation column is provided accurately
if config['files']['segmentation_file']['column_names']['gene'] not in seg.columns:
seg[config['files']['segmentation_file']['column_names']['gene']] = "-"
logging.warning("Column titled \"" + config['files']['segmentation_file']['column_names'][
'gene'] + "\" not found in the segmentation file. Using \"-\" in place of the annotation.")

# make sure chromosome names are string data type
seg[config['files']['segmentation_file']['column_names']['chromosome']] = \
seg[config['files']['segmentation_file']['column_names']['chromosome']].astype(str)
seg["prob_start"] = data.merge(seg, how="right",
Expand All @@ -203,7 +247,9 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
seg['index'] = range(len(seg))

if (options.gene_file):
gen = pd.read_csv(options.gene_file, sep="\t")
gen = pd.read_csv(options.gene_file, sep=config['files']['gene_file']['field_separator'])
logging.info("Successfully read the gene CNV file.")

gen[config['files']['gene_file']['column_names']['chromosome']] = \
gen[config['files']['gene_file']['column_names']['chromosome']].astype(str)
gen["prob_start"] = data.merge(gen, how="right",
Expand All @@ -223,6 +269,7 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
gen['genome_cumsum_end'] = gen[config['files']['gene_file']['column_names']['end']] + gen[
config['files']['genome_file']['column_names']['chr_cumulative_length']]

#subsetting columns to define gene boundaries
gene_boundaries = gen[[config['files']['gene_file']['column_names']['chromosome'],
config['files']['gene_file']['column_names']['start'],
config['files']['gene_file']['column_names']['end'],
Expand All @@ -236,7 +283,9 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
config['files']['genome_file']['column_names']['chr_cumulative_length']]

if (options.annot_file):
gene_track = pd.read_csv(options.annot_file, sep="\t")
gene_track = pd.read_csv(options.annot_file, sep=config['files']['annotation_file']['field_separator'])
logging.info("Successfully read the annotation file.")

gene_track[config['files']['annotation_file']['column_names']['chromosome']] = \
gene_track[config['files']['annotation_file']['column_names']['chromosome']].astype(str)
gene_track = pd.merge(gene_track, chr_cumsum,
Expand All @@ -251,13 +300,17 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):

if (options.bed_blacklist and options.vcf_file):
bed_blacklist = pd.read_csv(options.bed_blacklist, sep="\t", header=None)
logging.info("Successfully read the BED SNP blacklist file.")

bed_blacklist = bed_blacklist[bed_blacklist.columns[0:5]]
bed_blacklist.columns = ['chromosome', 'start', 'ref', 'alt', 'count']
bed_blacklist['chromosome'] = bed_blacklist['chromosome'].astype(str)

if (options.vcf_file):
df_vaf = []
logging.info("Reading VCF file ...")
vcf_reader = vcf.Reader(open(options.vcf_file, 'r'))
logging.info("Processing VCF file ...")
for record in vcf_reader:
if record.INFO[config['files']['vcf_file']['info_fields']['depth']] >= \
config['files']['vcf_file']['thresholds']['depth'] and \
Expand All @@ -284,6 +337,7 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
record.INFO[config['files']['vcf_file']['info_fields']['reverse_alt_reads']][0]) /
record.INFO[config['files']['vcf_file']['info_fields']['depth']]})
df_vaf = pd.DataFrame(df_vaf)
logging.info("Successfully filtered VCF file.")
if (not df_vaf.empty):
df_vaf['chromosome'] = df_vaf['chromosome'].astype(str)
df_vaf['ref'] = df_vaf['ref'].astype('str')
Expand Down Expand Up @@ -348,12 +402,13 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
df_vaf_gene = df_vaf_gene.groupby(['chromosome', 'gene_start', 'gene_end', 'cluster', 'genome_cumsum_start',
'genome_cumsum_end']).mean().reset_index()
else:
print("No variants to plot! (VCF did not have any variants passing the set thresholds)")
logging.warning("No variants to plot! (VCF did not have any variants passing the set thresholds)")

# setting up output HTML file
logging.info("Setting up output file.")
output_filename = os.path.basename(options.out_file)
title_name = os.path.basename(os.path.splitext(options.ratio_file)[0])
print("Writing visualization to ", outdir, "/", output_filename, "...", sep="")
logging.info("Writing visualization to " + outdir + "/" + output_filename + "...")
output_file(outdir + "/" + output_filename, title=title_name + ' reconCNV Plot')

source = ColumnDataSource(data=dict(
Expand Down Expand Up @@ -700,8 +755,8 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
width=config['plots']['gene_table']['width'],
height=config['plots']['gene_table']['height'])

selection_options = ["---", "All", "All Amplification and Loss", "Amplification", "Loss", "---",
config['files']['ratio_file']['off_target_label']]
selection_options = ["---", "All", "All Amplification and Loss", "Amplification", "Gain", "Loss", "Deep Loss",
"---", config['files']['ratio_file']['off_target_label']]
selection_options.extend(np.unique(gen[config['files']['gene_file']['column_names']['gene']].tolist()))

select = MultiSelect(title="Gene filter", options=selection_options, max_width=300, size=15)
Expand All @@ -720,6 +775,8 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
filteredSource=filteredSource,
data_table=data_table_gene,
loss_threshold=config['files']['gene_file']['loss_threshold'],
deep_loss_threshold = config['files']['gene_file']['deep_loss_threshold'],
gain_threshold=config['files']['gene_file']['gain_threshold'],
amp_threshold=config['files']['gene_file']['amp_threshold']), code="""
var data_gene = source_gene.data;
var f = cb_obj.value;
Expand All @@ -733,15 +790,24 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
for(i = 0; i < data_gene['chrom'].length;i++){
if(f == "Loss"){
thresh = loss_threshold
if(f == "Deep Loss"){
thresh = deep_loss_threshold
if(data_gene['logFC'][i] <= thresh){
d2['chrom'].push(data_gene['chrom'][i])
d2['start'].push(data_gene['start'][i])
d2['end'].push(data_gene['end'][i])
d2['gene'].push(data_gene['gene'][i])
d2['logFC'].push(data_gene['logFC'][i])
}
}else if(f == "Loss"){
thresh = loss_threshold
if(data_gene['logFC'][i] <= thresh & data_gene['logFC'][i] > deep_loss_threshold){
d2['chrom'].push(data_gene['chrom'][i])
d2['start'].push(data_gene['start'][i])
d2['end'].push(data_gene['end'][i])
d2['gene'].push(data_gene['gene'][i])
d2['logFC'].push(data_gene['logFC'][i])
}
}else if(f == "Amplification"){
thresh = amp_threshold
if(data_gene['logFC'][i] >= thresh){
Expand All @@ -751,6 +817,14 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
d2['gene'].push(data_gene['gene'][i])
d2['logFC'].push(data_gene['logFC'][i])
}
}else if(f == "Gain"){
if(data_gene['logFC'][i] >= gain_threshold & data_gene['logFC'][i] < amp_threshold){
d2['chrom'].push(data_gene['chrom'][i])
d2['start'].push(data_gene['start'][i])
d2['end'].push(data_gene['end'][i])
d2['gene'].push(data_gene['gene'][i])
d2['logFC'].push(data_gene['logFC'][i])
}
}else if (f == "All"){
thresh = 0
d2['chrom'].push(data_gene['chrom'][i])
Expand Down Expand Up @@ -806,7 +880,7 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
d3['label'].push(data['label'][i])
d3['weight'].push(data['weight'][i])
}else if(f == "Loss" | f == "Amplification" | f == "All Amplification and Loss"){
}else if(f == "Loss" | f == "Amplification" | f == "All Amplification and Loss" | f == "Gain" | f == "Deep Loss"){
if(d2['gene'].includes(data['gene'][i])){
d3['chrom'].push(data['chrom'][i])
d3['start'].push(data['start'][i])
Expand All @@ -829,8 +903,13 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):
d3['label'].push(data['label'][i])
d3['weight'].push(data['weight'][i])
}
}
if (!d3['chrom'].length){
d3 = data
}
source.data = d3
filteredSource.change.emit()
// trigger change on datatable
Expand Down Expand Up @@ -902,4 +981,4 @@ def draw_chr_boundary(figure, chr_boundary, genome, vaf):

save(final_fig)
# show(final_fig)
print("Done!")
logging.info("Done!")
Binary file added reconCNV_latest.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 2bbec11

Please sign in to comment.