Creating a small python package for plotting Tapestry outputs.

mrc-ide · Aug 3, 2023 · be1e1aa · be1e1aa
1 parent 90468f6
commit be1e1aa
Show file tree

Hide file tree

Showing 13 changed files with 1,124 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,8 @@ __pycache__
 notebooks
 *.ipynb
 *.ipynb_checkpoints
+*.egg
+*.egg-info
 
 # Run outputs
 results

diff --git a/python/environment.yml b/python/environment.yml
@@ -0,0 +1,13 @@
+name: unravel
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.7
+  - seaborn
+  - matplotlib
+  - pandas
+  - numpy
+  - scipy
+  - click
+  - pip
diff --git a/python/setup.cfg b/python/setup.cfg
@@ -0,0 +1,43 @@
+[metadata]
+name = unravel
+description = Unravel the outputs of Tapestry
+author = Jason A. Hendry and Robert Verity
+license = MIT
+license_files = ../LICENSE
+platforms = unix, linux, osx, cygwin, win32
+classifiers  =
+	Programming Language :: Python :: 3
+	Programming Language :: Python :: 3 :: Only
+	Programming Language :: Python :: 3.6
+	Programming Language :: Python :: 3.7
+	Programming Language :: Python :: 3.8
+	Programming Language :: Python :: 3.9
+
+[options]
+packages =
+	unravel
+install_requires =
+	click
+	pandas
+	numpy
+	scipy
+	seaborn
+	matplotlib
+python_requires = >=3.6
+#package_dir =
+	#=unravel
+zip_safe = no
+
+[options.entry_points]
+console_scripts =
+	unravel = unravel.cli:cli
+
+#[options.extras_requires]
+#testing =
+#   pytest>=6.0
+#	pytest-cov>=2.0
+#	flake8>=3.9
+#	black>=22.0
+
+[flake8]
+max-line-length=88
diff --git a/python/setup.py b/python/setup.py
@@ -0,0 +1,4 @@
+from setuptools import setup
+
+if __name__ == "__main__":
+    setup()
diff --git a/python/unravel/cli.py b/python/unravel/cli.py
@@ -0,0 +1,21 @@
+import click
+# from unravel.coi.commands import coi
+from unravel.sample.commands import sample
+# from unravel.population.commads import population
+
+@click.group()
+def cli():
+    """
+    Plot output files from Tapestry
+    
+    """
+
+    pass
+
+# cli.add_command(coi)
+cli.add_command(sample)
+# cli.add_command(population)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/python/unravel/lib/chromosomes.py b/python/unravel/lib/chromosomes.py
@@ -0,0 +1,115 @@
+import pandas as pd
+from typing import Dict, List
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+class ChromosomeInformation:
+
+    CHROM_GAP = 10**4
+    EDGE_GAP = 10**5
+    CMAP = "Paired"  # colors for chromosomes
+
+    def __init__(self, site_df: pd.DataFrame):
+        """
+        Hold information about individual chromosomes;
+        including how much to shift sites to produce a genome-wide
+        x-axis
+        
+        """
+
+        # Check inputs
+        if not "pos" in site_df.columns:
+            raise ValueError("Column 'pos' must be in dataframe.")        
+        if not "chrom" in site_df.columns:
+            raise ValueError("Column 'chrom' must be in dataframe.")
+
+        # Store
+        self.site_df = site_df
+
+        # Compute length information
+        self.lengths = self._get_chromosome_lengths()
+        self.genome_length = sum(self.lengths.values())
+
+        # Chromosome names & colors
+        self.names = list(self.lengths.keys())
+        self.cols = self._get_chromosome_colors()
+
+        # Compute shifts
+        self._shift_lookup = self._calc_chromosome_shifts(self.lengths)
+
+
+    def _get_chromosome_lengths(self) -> Dict[str, int]:
+        """
+        Create a dictionary of chromosome lengths based on the passed sites
+        
+        """
+
+        dt = {}
+        for chrom_name, chrom_df in self.site_df.groupby("chrom"):
+            dt[chrom_name] = chrom_df["pos"].max()
+
+        return dt
+
+
+    def _get_chromosome_colors(self) -> Dict[set, tuple]:
+        """
+        Create a dictionary mapping each chrosome to a color
+        
+        """
+
+        # Prepare colors
+        assert len(self.names) == 14, "Only support 14 chromosomes."
+        dt = dict(zip(
+            self.names,
+            sns.color_palette(self.CMAP, 10) + sns.color_palette(self.CMAP, 4)
+        ))
+
+        return dt
+
+
+    def _calc_chromosome_shifts(self, chrom_lengths) -> Dict[str, int]:
+        """
+        Compute shifted site position to make a genome-wide x-axis plot
+        from chromosomal positions
+        
+        """
+
+        lookup = {}
+        running_shift = self.EDGE_GAP
+        for chrom_name, chrom_length in chrom_lengths.items():
+            lookup[chrom_name] = running_shift
+            running_shift += chrom_length + self.CHROM_GAP
+
+        return lookup
+
+
+    def get_shifted_positions(self, chroms: List[str], pos: List[int]) -> List[int]:
+        """
+        Get shifted positions given an array of chromosomes and positions
+        
+        """
+
+        return [
+            p + self._shift_lookup[c]
+            for (c, p) in zip(chroms, pos)
+        ]
+
+
+    def set_genome_axis(self, ax):
+        """
+        Set the limits, ticks, and labels for a genome-wide axis
+
+        """
+        # Limits
+        xmin = 0 - self.EDGE_GAP
+        xmax = self._shift_lookup[self.names[-1]] 
+        xmax += self.lengths[self.names[-1]] 
+        xmax += self.EDGE_GAP
+        ax.set_xlim(xmin, xmax)
+
+        # Ticks and labels
+        ax.xaxis.set_major_locator(plt.MultipleLocator(10**6))
+        ax.xaxis.set_minor_locator(plt.MultipleLocator(2.5*10**5))
+        ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda k,v: f"{int(k/10**6):00d}"))
diff --git a/python/unravel/lib/dirs.py b/python/unravel/lib/dirs.py
@@ -0,0 +1,79 @@
+import os
+from typing import List, Dict
+
+# TODO
+# - Sanity checks and useful warnings
+# - Could change to using Path
+
+class TapestryCOIOutputDir:
+	"""
+	Interface for `Tapestry` output directory for
+	a particular COI
+	"""
+	def __init__(self, coi_dir: str) -> None:
+		self.coi_dir = coi_dir
+
+		# MCMC diagnostics
+		self.mcmc_diagnostics_csv = f"{coi_dir}/mcmc.diagnostics.csv"
+		self.mcmc_parameters_csv = f"{coi_dir}/mcmc.parameters.csv"
+
+		# Fits
+		self.props_csv = f"{coi_dir}/fit.proportions.csv"
+		self.ibd_pairwise_csv = f"{coi_dir}/fit.ibd.pairwise.csv"
+		self.ibd_path_csv = f"{coi_dir}/fit.ibd.path.csv"
+		self.ibd_segs_bed = f"{coi_dir}/fit.ibd.segments.bed"
+		self.ibd_stats_csv = f"{coi_dir}/fit.ibd.stats.csv"
+
+
+class TapestrySampleOutputDir:
+	"""
+	Interface for outputs produced by `Tapestry` for
+	a single sample
+
+	"""
+
+	def __init__(self, tapestry_dir: str) -> None:
+		"""
+		Initialise interface for all input files
+
+		"""
+
+		self.tapestry_dir = tapestry_dir
+
+		# COI Information
+		self._coi_dirs, self._cois = self._get_coi_info()
+
+		# COI dictionary -- inteface for interacting with COI specific files
+		self.coi = self._build_coi_dict()
+
+		# Comparison CSVs
+		self.compare_heuristic_csv = f"{self.tapestry_dir}/compare.heuristic.csv"
+		self.compare_evidence_csv = f"{self.tapestry_dir}/compare.evidence.csv"
+
+	def _get_coi_info(self) -> (List[str], List[int]):
+		""" 
+		Get all COI directories, 'K[0-9]{1}'
+		
+		"""
+
+		coi_dirs = [
+			f"{self.tapestry_dir}/{d}"
+			for d in os.listdir(self.tapestry_dir)
+			if d.startswith("K") and os.path.isdir(f"{self.tapestry_dir}/{d}")
+		]
+		cois = [int(coi_dir[-1]) for coi_dir in coi_dirs]
+
+		return (coi_dirs, cois)
+
+	def _build_coi_dict(self) -> Dict[int, TapestryCOIOutputDir]:
+		"""
+		Build a dictionary of COI outputs
+
+		"""
+		assert self._coi_dirs is not None
+		assert self._cois is not None
+		return {
+			coi: TapestryCOIOutputDir(coi_dir)
+			for coi, coi_dir in zip(self._cois, self._coi_dirs)
+		}
+
diff --git a/python/unravel/lib/ibd.py b/python/unravel/lib/ibd.py
@@ -0,0 +1,64 @@
+import pandas as pd
+from dataclasses import dataclass
+from typing import List, Optional
+
+
+@dataclass
+class BEDRecord:
+    chrom: str
+    start: int
+    end: int
+    name: str = ""
+
+
+def get_ibd_segments(chroms: List[str],
+                     pos: List[int],
+                     ibd_states: List[bool],
+                     name: Optional[str] = ""
+                    ) -> pd.DataFrame:
+    """
+    Get all IBD segments implied from a list indicating the IBD
+    state at each position in a genome
+    
+    """
+
+    # Init
+    t = 0
+    chrom = chroms[t]
+    ibd_state = ibd_states[t]
+    if ibd_state:
+        start = pos[t]
+
+    # Iterate
+    bed_records = []
+    for t in range(1, len(pos)):
+
+        # Handle end of chromosomes
+        if chrom != chroms[t]:
+            if ibd_state:
+                end = pos[t-1]
+                bed_records.append(BEDRecord(chrom, start, end, name))
+            if ibd_states[t]:
+                start = pos[t]
+            chrom = chroms[t]
+            ibd_state = ibd_states[t]
+            continue
+
+        # Handle chromosome internal
+        if ibd_state:
+            if not ibd_states[t]: # segement end
+                end = (pos[t-1] + pos[t]) / 2
+                bed_records.append(BEDRecord(chrom, start, end, name))
+        elif ibd_states[t]: # segment start
+            start = (pos[t-1] + pos[t]) / 2
+
+        # Update memory
+        chrom = chroms[t]
+        ibd_state = ibd_states[t]
+
+    # Terminate
+    if ibd_state:
+        end = pos[t-1]  # as with end of chromosome
+        bed_records.append(BEDRecord(chrom, start, end, name))
+
+    return pd.DataFrame(bed_records)