-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Creating a small python package for plotting Tapestry outputs.
- Loading branch information
1 parent
90468f6
commit be1e1aa
Showing
13 changed files
with
1,124 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ __pycache__ | |
notebooks | ||
*.ipynb | ||
*.ipynb_checkpoints | ||
*.egg | ||
*.egg-info | ||
|
||
# Run outputs | ||
results | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
name: unravel | ||
channels: | ||
- conda-forge | ||
- bioconda | ||
dependencies: | ||
- python=3.7 | ||
- seaborn | ||
- matplotlib | ||
- pandas | ||
- numpy | ||
- scipy | ||
- click | ||
- pip |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
[metadata] | ||
name = unravel | ||
description = Unravel the outputs of Tapestry | ||
author = Jason A. Hendry and Robert Verity | ||
license = MIT | ||
license_files = ../LICENSE | ||
platforms = unix, linux, osx, cygwin, win32 | ||
classifiers = | ||
Programming Language :: Python :: 3 | ||
Programming Language :: Python :: 3 :: Only | ||
Programming Language :: Python :: 3.6 | ||
Programming Language :: Python :: 3.7 | ||
Programming Language :: Python :: 3.8 | ||
Programming Language :: Python :: 3.9 | ||
|
||
[options] | ||
packages = | ||
unravel | ||
install_requires = | ||
click | ||
pandas | ||
numpy | ||
scipy | ||
seaborn | ||
matplotlib | ||
python_requires = >=3.6 | ||
#package_dir = | ||
#=unravel | ||
zip_safe = no | ||
|
||
[options.entry_points] | ||
console_scripts = | ||
unravel = unravel.cli:cli | ||
|
||
#[options.extras_requires] | ||
#testing = | ||
# pytest>=6.0 | ||
# pytest-cov>=2.0 | ||
# flake8>=3.9 | ||
# black>=22.0 | ||
|
||
[flake8] | ||
max-line-length=88 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from setuptools import setup | ||
|
||
if __name__ == "__main__": | ||
setup() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import click | ||
# from unravel.coi.commands import coi | ||
from unravel.sample.commands import sample | ||
# from unravel.population.commads import population | ||
|
||
@click.group() | ||
def cli(): | ||
""" | ||
Plot output files from Tapestry | ||
""" | ||
|
||
pass | ||
|
||
# cli.add_command(coi) | ||
cli.add_command(sample) | ||
# cli.add_command(population) | ||
|
||
|
||
if __name__ == "__main__": | ||
cli() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
import pandas as pd | ||
from typing import Dict, List | ||
|
||
import seaborn as sns | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
class ChromosomeInformation: | ||
|
||
CHROM_GAP = 10**4 | ||
EDGE_GAP = 10**5 | ||
CMAP = "Paired" # colors for chromosomes | ||
|
||
def __init__(self, site_df: pd.DataFrame): | ||
""" | ||
Hold information about individual chromosomes; | ||
including how much to shift sites to produce a genome-wide | ||
x-axis | ||
""" | ||
|
||
# Check inputs | ||
if not "pos" in site_df.columns: | ||
raise ValueError("Column 'pos' must be in dataframe.") | ||
if not "chrom" in site_df.columns: | ||
raise ValueError("Column 'chrom' must be in dataframe.") | ||
|
||
# Store | ||
self.site_df = site_df | ||
|
||
# Compute length information | ||
self.lengths = self._get_chromosome_lengths() | ||
self.genome_length = sum(self.lengths.values()) | ||
|
||
# Chromosome names & colors | ||
self.names = list(self.lengths.keys()) | ||
self.cols = self._get_chromosome_colors() | ||
|
||
# Compute shifts | ||
self._shift_lookup = self._calc_chromosome_shifts(self.lengths) | ||
|
||
|
||
def _get_chromosome_lengths(self) -> Dict[str, int]: | ||
""" | ||
Create a dictionary of chromosome lengths based on the passed sites | ||
""" | ||
|
||
dt = {} | ||
for chrom_name, chrom_df in self.site_df.groupby("chrom"): | ||
dt[chrom_name] = chrom_df["pos"].max() | ||
|
||
return dt | ||
|
||
|
||
def _get_chromosome_colors(self) -> Dict[set, tuple]: | ||
""" | ||
Create a dictionary mapping each chrosome to a color | ||
""" | ||
|
||
# Prepare colors | ||
assert len(self.names) == 14, "Only support 14 chromosomes." | ||
dt = dict(zip( | ||
self.names, | ||
sns.color_palette(self.CMAP, 10) + sns.color_palette(self.CMAP, 4) | ||
)) | ||
|
||
return dt | ||
|
||
|
||
def _calc_chromosome_shifts(self, chrom_lengths) -> Dict[str, int]: | ||
""" | ||
Compute shifted site position to make a genome-wide x-axis plot | ||
from chromosomal positions | ||
""" | ||
|
||
lookup = {} | ||
running_shift = self.EDGE_GAP | ||
for chrom_name, chrom_length in chrom_lengths.items(): | ||
lookup[chrom_name] = running_shift | ||
running_shift += chrom_length + self.CHROM_GAP | ||
|
||
return lookup | ||
|
||
|
||
def get_shifted_positions(self, chroms: List[str], pos: List[int]) -> List[int]: | ||
""" | ||
Get shifted positions given an array of chromosomes and positions | ||
""" | ||
|
||
return [ | ||
p + self._shift_lookup[c] | ||
for (c, p) in zip(chroms, pos) | ||
] | ||
|
||
|
||
def set_genome_axis(self, ax): | ||
""" | ||
Set the limits, ticks, and labels for a genome-wide axis | ||
""" | ||
# Limits | ||
xmin = 0 - self.EDGE_GAP | ||
xmax = self._shift_lookup[self.names[-1]] | ||
xmax += self.lengths[self.names[-1]] | ||
xmax += self.EDGE_GAP | ||
ax.set_xlim(xmin, xmax) | ||
|
||
# Ticks and labels | ||
ax.xaxis.set_major_locator(plt.MultipleLocator(10**6)) | ||
ax.xaxis.set_minor_locator(plt.MultipleLocator(2.5*10**5)) | ||
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda k,v: f"{int(k/10**6):00d}")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import os | ||
from typing import List, Dict | ||
|
||
# TODO | ||
# - Sanity checks and useful warnings | ||
# - Could change to using Path | ||
|
||
class TapestryCOIOutputDir: | ||
""" | ||
Interface for `Tapestry` output directory for | ||
a particular COI | ||
""" | ||
def __init__(self, coi_dir: str) -> None: | ||
self.coi_dir = coi_dir | ||
|
||
# MCMC diagnostics | ||
self.mcmc_diagnostics_csv = f"{coi_dir}/mcmc.diagnostics.csv" | ||
self.mcmc_parameters_csv = f"{coi_dir}/mcmc.parameters.csv" | ||
|
||
# Fits | ||
self.props_csv = f"{coi_dir}/fit.proportions.csv" | ||
self.ibd_pairwise_csv = f"{coi_dir}/fit.ibd.pairwise.csv" | ||
self.ibd_path_csv = f"{coi_dir}/fit.ibd.path.csv" | ||
self.ibd_segs_bed = f"{coi_dir}/fit.ibd.segments.bed" | ||
self.ibd_stats_csv = f"{coi_dir}/fit.ibd.stats.csv" | ||
|
||
|
||
class TapestrySampleOutputDir: | ||
""" | ||
Interface for outputs produced by `Tapestry` for | ||
a single sample | ||
""" | ||
|
||
def __init__(self, tapestry_dir: str) -> None: | ||
""" | ||
Initialise interface for all input files | ||
""" | ||
|
||
self.tapestry_dir = tapestry_dir | ||
|
||
# COI Information | ||
self._coi_dirs, self._cois = self._get_coi_info() | ||
|
||
# COI dictionary -- inteface for interacting with COI specific files | ||
self.coi = self._build_coi_dict() | ||
|
||
# Comparison CSVs | ||
self.compare_heuristic_csv = f"{self.tapestry_dir}/compare.heuristic.csv" | ||
self.compare_evidence_csv = f"{self.tapestry_dir}/compare.evidence.csv" | ||
|
||
def _get_coi_info(self) -> (List[str], List[int]): | ||
""" | ||
Get all COI directories, 'K[0-9]{1}' | ||
""" | ||
|
||
coi_dirs = [ | ||
f"{self.tapestry_dir}/{d}" | ||
for d in os.listdir(self.tapestry_dir) | ||
if d.startswith("K") and os.path.isdir(f"{self.tapestry_dir}/{d}") | ||
] | ||
cois = [int(coi_dir[-1]) for coi_dir in coi_dirs] | ||
|
||
return (coi_dirs, cois) | ||
|
||
def _build_coi_dict(self) -> Dict[int, TapestryCOIOutputDir]: | ||
""" | ||
Build a dictionary of COI outputs | ||
""" | ||
assert self._coi_dirs is not None | ||
assert self._cois is not None | ||
return { | ||
coi: TapestryCOIOutputDir(coi_dir) | ||
for coi, coi_dir in zip(self._cois, self._coi_dirs) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import pandas as pd | ||
from dataclasses import dataclass | ||
from typing import List, Optional | ||
|
||
|
||
@dataclass | ||
class BEDRecord: | ||
chrom: str | ||
start: int | ||
end: int | ||
name: str = "" | ||
|
||
|
||
def get_ibd_segments(chroms: List[str], | ||
pos: List[int], | ||
ibd_states: List[bool], | ||
name: Optional[str] = "" | ||
) -> pd.DataFrame: | ||
""" | ||
Get all IBD segments implied from a list indicating the IBD | ||
state at each position in a genome | ||
""" | ||
|
||
# Init | ||
t = 0 | ||
chrom = chroms[t] | ||
ibd_state = ibd_states[t] | ||
if ibd_state: | ||
start = pos[t] | ||
|
||
# Iterate | ||
bed_records = [] | ||
for t in range(1, len(pos)): | ||
|
||
# Handle end of chromosomes | ||
if chrom != chroms[t]: | ||
if ibd_state: | ||
end = pos[t-1] | ||
bed_records.append(BEDRecord(chrom, start, end, name)) | ||
if ibd_states[t]: | ||
start = pos[t] | ||
chrom = chroms[t] | ||
ibd_state = ibd_states[t] | ||
continue | ||
|
||
# Handle chromosome internal | ||
if ibd_state: | ||
if not ibd_states[t]: # segement end | ||
end = (pos[t-1] + pos[t]) / 2 | ||
bed_records.append(BEDRecord(chrom, start, end, name)) | ||
elif ibd_states[t]: # segment start | ||
start = (pos[t-1] + pos[t]) / 2 | ||
|
||
# Update memory | ||
chrom = chroms[t] | ||
ibd_state = ibd_states[t] | ||
|
||
# Terminate | ||
if ibd_state: | ||
end = pos[t-1] # as with end of chromosome | ||
bed_records.append(BEDRecord(chrom, start, end, name)) | ||
|
||
return pd.DataFrame(bed_records) |
Oops, something went wrong.