Skip to content

Commit

Permalink
Creating a small python package for plotting Tapestry outputs.
Browse files Browse the repository at this point in the history
  • Loading branch information
JasonAHendry committed Aug 3, 2023
1 parent 90468f6 commit be1e1aa
Show file tree
Hide file tree
Showing 13 changed files with 1,124 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ __pycache__
notebooks
*.ipynb
*.ipynb_checkpoints
*.egg
*.egg-info

# Run outputs
results
Expand Down
13 changes: 13 additions & 0 deletions python/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: unravel
channels:
- conda-forge
- bioconda
dependencies:
- python=3.7
- seaborn
- matplotlib
- pandas
- numpy
- scipy
- click
- pip
43 changes: 43 additions & 0 deletions python/setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
[metadata]
name = unravel
description = Unravel the outputs of Tapestry
author = Jason A. Hendry and Robert Verity
license = MIT
license_files = ../LICENSE
platforms = unix, linux, osx, cygwin, win32
classifiers =
Programming Language :: Python :: 3
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9

[options]
packages =
unravel
install_requires =
click
pandas
numpy
scipy
seaborn
matplotlib
python_requires = >=3.6
#package_dir =
#=unravel
zip_safe = no

[options.entry_points]
console_scripts =
unravel = unravel.cli:cli

#[options.extras_requires]
#testing =
# pytest>=6.0
# pytest-cov>=2.0
# flake8>=3.9
# black>=22.0

[flake8]
max-line-length=88
4 changes: 4 additions & 0 deletions python/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from setuptools import setup

if __name__ == "__main__":
setup()
21 changes: 21 additions & 0 deletions python/unravel/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import click
# from unravel.coi.commands import coi
from unravel.sample.commands import sample
# from unravel.population.commads import population

@click.group()
def cli():
"""
Plot output files from Tapestry
"""

pass

# cli.add_command(coi)
cli.add_command(sample)
# cli.add_command(population)


if __name__ == "__main__":
cli()
115 changes: 115 additions & 0 deletions python/unravel/lib/chromosomes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import pandas as pd
from typing import Dict, List

import seaborn as sns
import matplotlib.pyplot as plt


class ChromosomeInformation:

CHROM_GAP = 10**4
EDGE_GAP = 10**5
CMAP = "Paired" # colors for chromosomes

def __init__(self, site_df: pd.DataFrame):
"""
Hold information about individual chromosomes;
including how much to shift sites to produce a genome-wide
x-axis
"""

# Check inputs
if not "pos" in site_df.columns:
raise ValueError("Column 'pos' must be in dataframe.")
if not "chrom" in site_df.columns:
raise ValueError("Column 'chrom' must be in dataframe.")

# Store
self.site_df = site_df

# Compute length information
self.lengths = self._get_chromosome_lengths()
self.genome_length = sum(self.lengths.values())

# Chromosome names & colors
self.names = list(self.lengths.keys())
self.cols = self._get_chromosome_colors()

# Compute shifts
self._shift_lookup = self._calc_chromosome_shifts(self.lengths)


def _get_chromosome_lengths(self) -> Dict[str, int]:
"""
Create a dictionary of chromosome lengths based on the passed sites
"""

dt = {}
for chrom_name, chrom_df in self.site_df.groupby("chrom"):
dt[chrom_name] = chrom_df["pos"].max()

return dt


def _get_chromosome_colors(self) -> Dict[set, tuple]:
"""
Create a dictionary mapping each chrosome to a color
"""

# Prepare colors
assert len(self.names) == 14, "Only support 14 chromosomes."
dt = dict(zip(
self.names,
sns.color_palette(self.CMAP, 10) + sns.color_palette(self.CMAP, 4)
))

return dt


def _calc_chromosome_shifts(self, chrom_lengths) -> Dict[str, int]:
"""
Compute shifted site position to make a genome-wide x-axis plot
from chromosomal positions
"""

lookup = {}
running_shift = self.EDGE_GAP
for chrom_name, chrom_length in chrom_lengths.items():
lookup[chrom_name] = running_shift
running_shift += chrom_length + self.CHROM_GAP

return lookup


def get_shifted_positions(self, chroms: List[str], pos: List[int]) -> List[int]:
"""
Get shifted positions given an array of chromosomes and positions
"""

return [
p + self._shift_lookup[c]
for (c, p) in zip(chroms, pos)
]


def set_genome_axis(self, ax):
"""
Set the limits, ticks, and labels for a genome-wide axis
"""
# Limits
xmin = 0 - self.EDGE_GAP
xmax = self._shift_lookup[self.names[-1]]
xmax += self.lengths[self.names[-1]]
xmax += self.EDGE_GAP
ax.set_xlim(xmin, xmax)

# Ticks and labels
ax.xaxis.set_major_locator(plt.MultipleLocator(10**6))
ax.xaxis.set_minor_locator(plt.MultipleLocator(2.5*10**5))
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda k,v: f"{int(k/10**6):00d}"))
79 changes: 79 additions & 0 deletions python/unravel/lib/dirs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
from typing import List, Dict

# TODO
# - Sanity checks and useful warnings
# - Could change to using Path

class TapestryCOIOutputDir:
"""
Interface for `Tapestry` output directory for
a particular COI
"""
def __init__(self, coi_dir: str) -> None:
self.coi_dir = coi_dir

# MCMC diagnostics
self.mcmc_diagnostics_csv = f"{coi_dir}/mcmc.diagnostics.csv"
self.mcmc_parameters_csv = f"{coi_dir}/mcmc.parameters.csv"

# Fits
self.props_csv = f"{coi_dir}/fit.proportions.csv"
self.ibd_pairwise_csv = f"{coi_dir}/fit.ibd.pairwise.csv"
self.ibd_path_csv = f"{coi_dir}/fit.ibd.path.csv"
self.ibd_segs_bed = f"{coi_dir}/fit.ibd.segments.bed"
self.ibd_stats_csv = f"{coi_dir}/fit.ibd.stats.csv"


class TapestrySampleOutputDir:
"""
Interface for outputs produced by `Tapestry` for
a single sample
"""

def __init__(self, tapestry_dir: str) -> None:
"""
Initialise interface for all input files
"""

self.tapestry_dir = tapestry_dir

# COI Information
self._coi_dirs, self._cois = self._get_coi_info()

# COI dictionary -- inteface for interacting with COI specific files
self.coi = self._build_coi_dict()

# Comparison CSVs
self.compare_heuristic_csv = f"{self.tapestry_dir}/compare.heuristic.csv"
self.compare_evidence_csv = f"{self.tapestry_dir}/compare.evidence.csv"

def _get_coi_info(self) -> (List[str], List[int]):
"""
Get all COI directories, 'K[0-9]{1}'
"""

coi_dirs = [
f"{self.tapestry_dir}/{d}"
for d in os.listdir(self.tapestry_dir)
if d.startswith("K") and os.path.isdir(f"{self.tapestry_dir}/{d}")
]
cois = [int(coi_dir[-1]) for coi_dir in coi_dirs]

return (coi_dirs, cois)

def _build_coi_dict(self) -> Dict[int, TapestryCOIOutputDir]:
"""
Build a dictionary of COI outputs
"""
assert self._coi_dirs is not None
assert self._cois is not None
return {
coi: TapestryCOIOutputDir(coi_dir)
for coi, coi_dir in zip(self._cois, self._coi_dirs)
}

64 changes: 64 additions & 0 deletions python/unravel/lib/ibd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pandas as pd
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class BEDRecord:
chrom: str
start: int
end: int
name: str = ""


def get_ibd_segments(chroms: List[str],
pos: List[int],
ibd_states: List[bool],
name: Optional[str] = ""
) -> pd.DataFrame:
"""
Get all IBD segments implied from a list indicating the IBD
state at each position in a genome
"""

# Init
t = 0
chrom = chroms[t]
ibd_state = ibd_states[t]
if ibd_state:
start = pos[t]

# Iterate
bed_records = []
for t in range(1, len(pos)):

# Handle end of chromosomes
if chrom != chroms[t]:
if ibd_state:
end = pos[t-1]
bed_records.append(BEDRecord(chrom, start, end, name))
if ibd_states[t]:
start = pos[t]
chrom = chroms[t]
ibd_state = ibd_states[t]
continue

# Handle chromosome internal
if ibd_state:
if not ibd_states[t]: # segement end
end = (pos[t-1] + pos[t]) / 2
bed_records.append(BEDRecord(chrom, start, end, name))
elif ibd_states[t]: # segment start
start = (pos[t-1] + pos[t]) / 2

# Update memory
chrom = chroms[t]
ibd_state = ibd_states[t]

# Terminate
if ibd_state:
end = pos[t-1] # as with end of chromosome
bed_records.append(BEDRecord(chrom, start, end, name))

return pd.DataFrame(bed_records)
Loading

0 comments on commit be1e1aa

Please sign in to comment.