Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hill diversity profile #535

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ Analyse clonal diversity
tl.clonal_expansion
tl.summarize_clonal_expansion
tl.alpha_diversity
tl.hill_diversity_profile
tl.convert_hill_table
tl.repertoire_overlap
tl.clonotype_modularity
tl.clonotype_imbalance
Expand Down
434 changes: 432 additions & 2 deletions docs/tutorials/tutorial_5k_bcr.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/scirpy/tl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from ._clonotype_modularity import clonotype_modularity
from ._clonotypes import clonotype_network, clonotype_network_igraph, define_clonotype_clusters, define_clonotypes
from ._convergence import clonotype_convergence
from ._diversity import alpha_diversity
from ._diversity import alpha_diversity, convert_hill_table, hill_diversity_profile
from ._group_abundance import group_abundance
from ._ir_query import ir_query, ir_query_annotate, ir_query_annotate_df
from ._repertoire_overlap import repertoire_overlap
Expand Down
132 changes: 127 additions & 5 deletions src/scirpy/tl/_diversity.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections.abc import Callable
from typing import cast
from typing import Literal, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -43,6 +43,125 @@ def _dxx(counts: np.ndarray, *, percentage: int):
return i / len(freqs) * 100


def Hill_diversity(counts: np.ndarray, q):
if q == 0:
return len(counts)
if q == 1:
p_i = counts / sum(counts)
return np.exp(np.sum(-p_i * np.log(p_i)))
else:
p_i = counts / sum(counts)
return np.power(np.sum(np.power(p_i, q)), 1 / (1 - q))


def hill_diversity_profile(
adata: DataHandler.TYPE,
groupby: str,
target_col: str = "clone_id",
airr_mod: str = "airr",
q_min=0,
q_max=2,
q_step=1,
) -> pd.DataFrame:
"""\
Calculates a Hill based diversity profile for a given range of diversity order (`q`)

Parameters
----------
{adata}
groupby
Group by this column from `obs`. E.g, sample, or group.
target_col
Column containing the clonotype annoatation
{airr_mod}
q_min
Specify lowest (start) diversity order
q_max
Specify highest (end) diversity order
q_step
Specify the fineness (steps) of diversity order calculation

Returns
-------
Returns a pd.DataFrame where columns are groups (specified by groupby) and row indices indicate each
all calculated diversity order -> allows seamless plotting with e.g. seaborn
"""
params = DataHandler(adata, airr_mod)
ir_obs = params.get_obs([target_col, groupby])
ir_obs = ir_obs.loc[~_is_na(ir_obs[target_col]), :]
clono_counts = ir_obs.groupby([groupby, target_col], observed=True).size().reset_index(name="count")
diversity = {}

for q in np.arange(q_min, q_max + q_step, q_step):
for k in sorted(ir_obs[groupby].dropna().unique()):
tmp_counts = cast(
np.ndarray,
cast(pd.Series, clono_counts.loc[clono_counts[groupby] == k, "count"]).values,
)
if k in diversity:
diversity[k].append(Hill_diversity(tmp_counts, q))
else:
diversity[k] = [Hill_diversity(tmp_counts, q)]
df = pd.DataFrame.from_dict(diversity, orient="index", columns=list(np.arange(q_min, q_max + q_step, q_step)))
df.index.name = groupby
return df.T


def convert_hill_table(
diversity_profile: pd.DataFrame,
convert_to: Literal["diversity", "evenness_factor", "relative_evenness"] = "diversity",
) -> pd.DataFrame:
"""\
Converts pd.DataFrame generated by scipry.tl.hill_diversity_profile into other relevant alpha indices
for more information regarding alpha diversity indices see https://academic.oup.com/bib/article/19/4/679/2871295


Parameters
----------
diversity_profile
pd.DataFrame generated by scipry.tl.hill_diversity_profile
convert_to
specify which conversion is desired:
* `"diversity"` - infer respective indices from Hill numbers
* `"evenness_factor"` - calculates EF for each diversity order as specified previously
* `"relative_evenness"` - calculates RLE for each diversity order as specified previously

Returns
-------
pd.DataFrame, where rows are indices/diversity orders and columns are groups (i.e. severity)
"""
if convert_to == "diversity":
df = pd.DataFrame(
columns=diversity_profile.columns,
index=["Observed richness", "Shannon entropy", "Inverse Simpson", "Gini-Simpson"],
)

df.loc["Observed richness"] = diversity_profile.loc[0]
df.loc["Shannon entropy"] = [np.log(x) for x in diversity_profile.loc[1]]
df.loc["Inverse Simpson"] = diversity_profile.loc[2]
df.loc["Gini-Simpson"] = [(1 - (1 / x)) for x in diversity_profile.loc[2]]
return df

elif convert_to == "evenness_factor":
df = diversity_profile.copy()
observed_richeness = diversity_profile.loc[0]
for _index, row in df.iterrows():
for i in range(len(row)):
row.iloc[i] = row.iloc[i] / observed_richeness.iloc[i]
return df

elif convert_to == "relative_evenness":
df = diversity_profile.copy()
observed_richeness = diversity_profile.loc[0]
for _index, row in df.iterrows():
for i in range(len(row)):
row.iloc[i] = np.log(row.iloc[i]) / np.log(observed_richeness.iloc[i])
return df

else:
raise Exception("Invalid input. Please check your input to the convert_to argument!")


@DataHandler.inject_param_docs()
def alpha_diversity(
adata: DataHandler.TYPE,
Expand All @@ -69,12 +188,15 @@ def alpha_diversity(
`normalized to group size <https://math.stackexchange.com/a/945172>`__.

D50:
D50 is a measure of the minimum number of distinct clonotypes totalling greater than 50% of total clonotype
counts in a given group, as a percentage out of the total number of clonotypes.
Adapted from `<https://patents.google.com/patent/WO2012097374A1/en>`__.
The diversity index (D50) is a measure of the diversity of an immune repertoire of J individual cells
(the total number of CDR3s) composed of S distinct CDR3s in a ranked dominance configuration where ri
is the abundance of the ith most abundant CDR3, r1 is the abundance of the most abundant CDR3, r2 is the
abundance of the second most abundant CDR3, and so on. C is the minimum number of distinct CDR3s,
amounting to >50% of the total sequencing reads. D50 therefore is given by C/S x 100.
`<https://patents.google.com/patent/WO2012097374A1/en>`__.

DXX:
Similar to D50 where XX indicates the percentage of total clonotype counts threshold.
Similar to D50 where XX indicates the percent of J (the total number of CDR3s).
Requires to pass the `percentage` keyword argument which can be within 0 and
100.

Expand Down
Loading