scverse · MKanetscheider · Aug 9, 2024 · Aug 9, 2024 · Aug 9, 2024 · Aug 13, 2024
diff --git a/docs/api.rst b/docs/api.rst
@@ -148,6 +148,8 @@ Analyse clonal diversity
    tl.clonal_expansion
    tl.summarize_clonal_expansion
    tl.alpha_diversity
+   tl.hill_diversity_profile
+   tl.convert_hill_table
    tl.repertoire_overlap
    tl.clonotype_modularity
    tl.clonotype_imbalance

diff --git a/docs/tutorials/tutorial_5k_bcr.ipynb b/docs/tutorials/tutorial_5k_bcr.ipynb
diff --git a/src/scirpy/tl/__init__.py b/src/scirpy/tl/__init__.py
@@ -4,7 +4,7 @@
 from ._clonotype_modularity import clonotype_modularity
 from ._clonotypes import clonotype_network, clonotype_network_igraph, define_clonotype_clusters, define_clonotypes
 from ._convergence import clonotype_convergence
-from ._diversity import alpha_diversity
+from ._diversity import alpha_diversity, convert_hill_table, hill_diversity_profile
 from ._group_abundance import group_abundance
 from ._ir_query import ir_query, ir_query_annotate, ir_query_annotate_df
 from ._repertoire_overlap import repertoire_overlap

diff --git a/src/scirpy/tl/_diversity.py b/src/scirpy/tl/_diversity.py
@@ -1,5 +1,5 @@
 from collections.abc import Callable
-from typing import cast
+from typing import Literal, cast
 
 import numpy as np
 import pandas as pd
@@ -43,6 +43,125 @@ def _dxx(counts: np.ndarray, *, percentage: int):
     return i / len(freqs) * 100
 
 
+def Hill_diversity(counts: np.ndarray, q):
+    if q == 0:
+        return len(counts)
+    if q == 1:
+        p_i = counts / sum(counts)
+        return np.exp(np.sum(-p_i * np.log(p_i)))
+    else:
+        p_i = counts / sum(counts)
+        return np.power(np.sum(np.power(p_i, q)), 1 / (1 - q))
+
+
+def hill_diversity_profile(
+    adata: DataHandler.TYPE,
+    groupby: str,
+    target_col: str = "clone_id",
+    airr_mod: str = "airr",
+    q_min=0,
+    q_max=2,
+    q_step=1,
+) -> pd.DataFrame:
+    """\
+    Calculates a Hill based diversity profile for a given range of diversity order (`q`)
+
+    Parameters
+    ----------
+    {adata}
+    groupby
+        Group by this column from `obs`. E.g, sample, or group.
+    target_col
+        Column containing the clonotype annoatation
+    {airr_mod}
+    q_min
+        Specify lowest (start) diversity order
+    q_max
+        Specify highest (end) diversity order
+    q_step
+        Specify the fineness (steps) of diversity order calculation
+
+    Returns
+    -------
+    Returns a pd.DataFrame where columns are groups (specified by groupby) and row indices indicate each
+    all calculated diversity order -> allows seamless plotting with e.g. seaborn
+    """
+    params = DataHandler(adata, airr_mod)
+    ir_obs = params.get_obs([target_col, groupby])
+    ir_obs = ir_obs.loc[~_is_na(ir_obs[target_col]), :]
+    clono_counts = ir_obs.groupby([groupby, target_col], observed=True).size().reset_index(name="count")
+    diversity = {}
+
+    for q in np.arange(q_min, q_max + q_step, q_step):
+        for k in sorted(ir_obs[groupby].dropna().unique()):
+            tmp_counts = cast(
+                np.ndarray,
+                cast(pd.Series, clono_counts.loc[clono_counts[groupby] == k, "count"]).values,
+            )
+            if k in diversity:
+                diversity[k].append(Hill_diversity(tmp_counts, q))
+            else:
+                diversity[k] = [Hill_diversity(tmp_counts, q)]
+    df = pd.DataFrame.from_dict(diversity, orient="index", columns=list(np.arange(q_min, q_max + q_step, q_step)))
+    df.index.name = groupby
+    return df.T
+
+
+def convert_hill_table(
+    diversity_profile: pd.DataFrame,
+    convert_to: Literal["diversity", "evenness_factor", "relative_evenness"] = "diversity",
+) -> pd.DataFrame:
+    """\
+    Converts pd.DataFrame generated by scipry.tl.hill_diversity_profile into other relevant alpha indices
+    for more information regarding alpha diversity indices see https://academic.oup.com/bib/article/19/4/679/2871295
+
+
+    Parameters
+    ----------
+    diversity_profile
+       pd.DataFrame generated by scipry.tl.hill_diversity_profile
+    convert_to
+       specify which conversion is desired:
+           * `"diversity"` - infer respective indices from Hill numbers
+           * `"evenness_factor"` - calculates EF for each diversity order as specified previously
+           * `"relative_evenness"` - calculates RLE for each diversity order as specified previously
+
+    Returns
+    -------
+    pd.DataFrame, where rows are indices/diversity orders and columns are groups (i.e. severity)
+    """
+    if convert_to == "diversity":
+        df = pd.DataFrame(
+            columns=diversity_profile.columns,
+            index=["Observed richness", "Shannon entropy", "Inverse Simpson", "Gini-Simpson"],
+        )
+
+        df.loc["Observed richness"] = diversity_profile.loc[0]
+        df.loc["Shannon entropy"] = [np.log(x) for x in diversity_profile.loc[1]]
+        df.loc["Inverse Simpson"] = diversity_profile.loc[2]
+        df.loc["Gini-Simpson"] = [(1 - (1 / x)) for x in diversity_profile.loc[2]]
+        return df
+
+    elif convert_to == "evenness_factor":
+        df = diversity_profile.copy()
+        observed_richeness = diversity_profile.loc[0]
+        for _index, row in df.iterrows():
+            for i in range(len(row)):
+                row.iloc[i] = row.iloc[i] / observed_richeness.iloc[i]
+        return df
+
+    elif convert_to == "relative_evenness":
+        df = diversity_profile.copy()
+        observed_richeness = diversity_profile.loc[0]
+        for _index, row in df.iterrows():
+            for i in range(len(row)):
+                row.iloc[i] = np.log(row.iloc[i]) / np.log(observed_richeness.iloc[i])
+        return df
+
+    else:
+        raise Exception("Invalid input. Please check your input to the convert_to argument!")
+
+
 @DataHandler.inject_param_docs()
 def alpha_diversity(
     adata: DataHandler.TYPE,
@@ -69,12 +188,15 @@ def alpha_diversity(
         `normalized to group size <https://math.stackexchange.com/a/945172>`__.
 
     D50:
-        D50 is a measure of the minimum number of distinct clonotypes totalling greater than 50% of total clonotype
-        counts in a given group, as a percentage out of the total number of clonotypes.
-        Adapted from `<https://patents.google.com/patent/WO2012097374A1/en>`__.
+        The diversity index (D50) is a measure of the diversity of an immune repertoire of J individual cells
+        (the total number of CDR3s) composed of S distinct CDR3s in a ranked dominance configuration where ri
+        is the abundance of the ith most abundant CDR3, r1 is the abundance of the most abundant CDR3, r2 is the
+        abundance of the second most abundant CDR3, and so on. C is the minimum number of distinct CDR3s,
+        amounting to >50% of the total sequencing reads. D50 therefore is given by C/S x 100.
+        `<https://patents.google.com/patent/WO2012097374A1/en>`__.
 
     DXX:
-        Similar to D50 where XX indicates the percentage of total clonotype counts threshold.
+        Similar to D50 where XX indicates the percent of J (the total number of CDR3s).
         Requires to pass the `percentage` keyword argument which can be within 0 and
         100.