update documentations

TuftsBCB · Mar 27, 2024 · 15336e3 · 15336e3
1 parent c0833d0
commit 15336e3
Show file tree

Hide file tree

Showing 4 changed files with 225 additions and 77 deletions.
diff --git a/regdiffusion/data/beeline.py b/regdiffusion/data/beeline.py
@@ -34,31 +34,35 @@ def cell_type_separator(sc_data, cell_type_element_indices=[0], sep='_'):
 
 def load_beeline(data_dir='data', benchmark_data='hESC', 
                  benchmark_setting='500_STRING'):
-    ''' Load BEELINE
-    
-    Load BEELINE data into memory (download if necessary).
+    """
+    Load BEELINE data and its ground truth (download if necessary).
+
+    Paper: Benchmarking algorithms for gene regulatory network inference from 
+        single-cell transcriptomic data
+    Paper Link: https://www.nature.com/articles/s41592-019-0690-6
+
+    BEELINE consists of 7 single-cell datasets (`hESC`, `hHep`, `mDC`, `mESC`, 
+    `mHSC`, `mHSC-GM`, and `mHSC-L`) and 3 sets of ground truth networks 
+    (`STRING`, `Non-ChIP`, `ChIP-seq`). 
     
-    Parameters
-    ----------
-    data_dir: str
-        Root folder where the BEELINE data is/will be located. 
-    benchmark_data: str
-        Benchmark datasets. Choose among `hESC`, `hHep`, `mDC`, 
-        `mESC`, `mHSC`, `mHSC-GM`, and `mHSC-L`.
-    benchmark_setting: str
-        Benchmark settings. Choose among `500_STRING`, 
-        `1000_STRING`, `500_Non-ChIP`, `1000_Non-ChIP`, 
-        `500_ChIP-seq`, `1000_ChIP-seq`, `500_lofgof`,
-        and `1000_lofgof`. If either of the `lofgof` settings
-        is choosed, only `mESC` data is available.  
-        
-    Returns
-    -------
-    tuple
-        First element is a scanpy data with cells on rows and 
-        genes on columns. Second element is the corresponding 
-        BEELINE ground truth data 
-    '''
+    Args:
+        data_dir (str): Parent directory to save and load the data. If the path
+            does not exist, it will be created. Data will be saved in a
+            subdirectory under the provided path. 
+        benchmark_data (str): Benchmark datasets. Choose among `hESC`, `hHep`, 
+            `mDC`, `mESC`, `mHSC`, `mHSC-GM`, and `mHSC-L`.
+        benchmark_setting (str): Benchmark settings. Choose among `500_STRING`, 
+            `1000_STRING`, `500_Non-ChIP`, `1000_Non-ChIP`, `500_ChIP-seq`, 
+            `1000_ChIP-seq`, `500_lofgof`, and `1000_lofgof`. If either of the 
+            `lofgof` settings is chosen, only `mESC` data is available.  
+
+    Returns:
+        tuple: A tuple containing two objects for a single BEELINE benchmark. 
+            - The first element is a scanpy AnnData with cells on rows and 
+                genes on columns. 
+            - The second element is a numpy array for the adjacency list of 
+                the ground truth network.  
+    """
     if not os.path.exists(data_dir):
         os.mkdir(data_dir)
     if not os.path.exists(f'{data_dir}/BEELINE/'):

diff --git a/regdiffusion/data/microglia.py b/regdiffusion/data/microglia.py
@@ -7,14 +7,27 @@
 from .utils import download_file
 
 
-def load_atlas_microglia(data_dir='data'):
-    ''' Load single cell for microglia from SCP795
+def load_atlas_microglia(data_dir='data') -> sc.AnnData:
+    """
+    Load single-cell data for microglia from Broad Institute SCP795
 
     Data Source: https://singlecell.broadinstitute.org/single_cell/study/SCP795/a-transcriptomic-atlas-of-the-mouse-cerebellum#study-summary
 
-    Data is just count data and has been log 
-    transformed at the end of the loading step
-    '''
+    Paper: A transcriptomic atlas of mouse cerebellar cortex comprehensively 
+        defines cell types
+    Paper Link: https://www.nature.com/articles/s41586-021-03220-z
+
+    Raw data is count data. We selected all genes that have non-zero expression. 
+    We also removed all gene models, mitochondrial genes, and ribosomal genes. 
+    We used log-plus-one to transform the count data. 
+
+    The output is an AnnData object where rows are cells and columns are genes.
+
+    Args:
+        data_dir (str): Parent directory to save and load the data. If the path
+            does not exist, it will be created. Data will be saved in a
+            subdirectory under the provided path. 
+    """
     if not os.path.exists(data_dir):
         os.mkdir(data_dir)
     file_dir = f'{data_dir}/scp795_microglia/'
@@ -34,10 +47,35 @@ def load_atlas_microglia(data_dir='data'):
     return ann_dt
 
 def load_hammond_microglia(data_dir='data'):
-    ''' Load single cell for hammond microglia
+    """
+    Load single-cell data for microglia from the Hammond Microglia dataset
+
+    Data Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE121654
+
+    Paper: Single-Cell RNA Sequencing of Microglia throughout the Mouse Lifespan
+        and in the Injured Brain Reveals Complex Cell-State Changes
+    Paper Link: https://www.cell.com/immunity/fulltext/S1074-7613(18)30485-0
+
+    IMPORTANT! This is not the complete data from the study. We only selected
+    data from the 4 adult male mice at P100. Here are their accession IDs. 
+
+    GSM3442026	P100 male no 1
+    GSM3442027	P100 male no 2
+    GSM3442030	P100 male no 3
+    GSM3442031	P100 male no 4
+    
+    Raw data has already been log transformed. We selected all genes that have 
+    non-zero expression. We also removed all gene models, mitochondrial genes, 
+    and ribosomal genes. 
+
+    The output is an AnnData object where rows are cells and columns are genes.
 
-    We selected the 4 P100 male mice data. Data has been log transformed. 
-    '''
+    Args:
+        data_dir (str): Parent directory to save and load the data. If the path
+            does not exist, it will be created. Data will be saved in a
+            subdirectory under the provided path. 
+    
+    """
     if not os.path.exists(data_dir):
         os.mkdir(data_dir)
     file_dir = f'{data_dir}/hammond_microglia/'

diff --git a/regdiffusion/models/regdiffusion.py b/regdiffusion/models/regdiffusion.py
@@ -56,8 +56,7 @@ def forward(self, x):
         return batch_gene_emb
 
 class RegDiffusion(nn.Module):
-    ''' 
-    
+    """
     A RegDiffusion model. For architecture details, please refer to our paper.
 
     > From noise to knowledge: probabilistic diffusion-based neural inference
@@ -76,7 +75,7 @@ class RegDiffusion(nn.Module):
             during training. 
         init_coef (int): Coefficient to multiply with gene regulation norm 
             (1/(n_gene - 1)) to initialize the adjacency matrix. 
-    '''
+    """
     def __init__(
         self, n_gene, time_dim, 
         n_celltype=None, celltype_dim=4,