Skip to content

Commit

Permalink
update documentations
Browse files Browse the repository at this point in the history
  • Loading branch information
haozhu233 committed Mar 27, 2024
1 parent c0833d0 commit 15336e3
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 77 deletions.
52 changes: 28 additions & 24 deletions regdiffusion/data/beeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,31 +34,35 @@ def cell_type_separator(sc_data, cell_type_element_indices=[0], sep='_'):

def load_beeline(data_dir='data', benchmark_data='hESC',
benchmark_setting='500_STRING'):
''' Load BEELINE
Load BEELINE data into memory (download if necessary).
"""
Load BEELINE data and its ground truth (download if necessary).
Paper: Benchmarking algorithms for gene regulatory network inference from
single-cell transcriptomic data
Paper Link: https://www.nature.com/articles/s41592-019-0690-6
BEELINE consists of 7 single-cell datasets (`hESC`, `hHep`, `mDC`, `mESC`,
`mHSC`, `mHSC-GM`, and `mHSC-L`) and 3 sets of ground truth networks
(`STRING`, `Non-ChIP`, `ChIP-seq`).
Parameters
----------
data_dir: str
Root folder where the BEELINE data is/will be located.
benchmark_data: str
Benchmark datasets. Choose among `hESC`, `hHep`, `mDC`,
`mESC`, `mHSC`, `mHSC-GM`, and `mHSC-L`.
benchmark_setting: str
Benchmark settings. Choose among `500_STRING`,
`1000_STRING`, `500_Non-ChIP`, `1000_Non-ChIP`,
`500_ChIP-seq`, `1000_ChIP-seq`, `500_lofgof`,
and `1000_lofgof`. If either of the `lofgof` settings
is chosen, only `mESC` data is available.
Returns
-------
tuple
First element is a scanpy data with cells on rows and
genes on columns. Second element is the corresponding
BEELINE ground truth data
'''
Args:
data_dir (str): Parent directory to save and load the data. If the path
does not exist, it will be created. Data will be saved in a
subdirectory under the provided path.
benchmark_data (str): Benchmark datasets. Choose among `hESC`, `hHep`,
`mDC`, `mESC`, `mHSC`, `mHSC-GM`, and `mHSC-L`.
benchmark_setting (str): Benchmark settings. Choose among `500_STRING`,
`1000_STRING`, `500_Non-ChIP`, `1000_Non-ChIP`, `500_ChIP-seq`,
`1000_ChIP-seq`, `500_lofgof`, and `1000_lofgof`. If either of the
`lofgof` settings is chosen, only `mESC` data is available.
Returns:
tuple: A tuple containing two objects for a single BEELINE benchmark.
- The first element is a scanpy AnnData with cells on rows and
genes on columns.
- The second element is a NumPy array for the adjacency list of the
ground truth network.
"""
if not os.path.exists(data_dir):
os.mkdir(data_dir)
if not os.path.exists(f'{data_dir}/BEELINE/'):
Expand Down
54 changes: 46 additions & 8 deletions regdiffusion/data/microglia.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,27 @@
from .utils import download_file


def load_atlas_microglia(data_dir='data'):
''' Load single cell for microglia from SCP795
def load_atlas_microglia(data_dir='data') -> sc.AnnData:
"""
Load single-cell data for microglia from Broad Institute SCP795
Data Source: https://singlecell.broadinstitute.org/single_cell/study/SCP795/a-transcriptomic-atlas-of-the-mouse-cerebellum#study-summary
Data is just count data and has been log
transformed at the end of the loading step
'''
Paper: A transcriptomic atlas of mouse cerebellar cortex comprehensively
defines cell types
Paper Link: https://www.nature.com/articles/s41586-021-03220-z
Raw data is count data. We select all genes that have non-zero expression.
We also remove all gene models, mitochondrial genes, and ribosomal genes.
We use log-plus-one to transform the count data.
The output is an AnnData object where rows are cells and columns are genes.
Args:
data_dir (str): Parent directory to save and load the data. If the path
does not exist, it will be created. Data will be saved in a
subdirectory under the provided path.
"""
if not os.path.exists(data_dir):
os.mkdir(data_dir)
file_dir = f'{data_dir}/scp795_microglia/'
Expand All @@ -34,10 +47,35 @@ def load_atlas_microglia(data_dir='data'):
return ann_dt

def load_hammond_microglia(data_dir='data'):
''' Load single cell for hammond microglia
"""
Load single-cell data for microglia from the Hammond microglia dataset
Data Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE121654
Paper: Single-Cell RNA Sequencing of Microglia throughout the Mouse Lifespan
and in the Injured Brain Reveals Complex Cell-State Changes
Paper Link: https://www.cell.com/immunity/fulltext/S1074-7613(18)30485-0
IMPORTANT! This is not the complete data from the study. We only selected
data from the 4 adult male mice at P100. Here are their accession IDs:
GSM3442026 P100 male no 1
GSM3442027 P100 male no 2
GSM3442030 P100 male no 3
GSM3442031 P100 male no 4
Raw data has already been log-transformed. We select all genes that have
non-zero expression. We also remove all gene models, mitochondrial genes,
and ribosomal genes.
The output is an AnnData object where rows are cells and columns are genes.
We selected the 4 P100 male mice data. Data has been log transformed.
'''
Args:
data_dir (str): Parent directory to save and load the data. If the path
does not exist, it will be created. Data will be saved in a
subdirectory under the provided path.
"""
if not os.path.exists(data_dir):
os.mkdir(data_dir)
file_dir = f'{data_dir}/hammond_microglia/'
Expand Down
5 changes: 2 additions & 3 deletions regdiffusion/models/regdiffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ def forward(self, x):
return batch_gene_emb

class RegDiffusion(nn.Module):
'''
"""
A RegDiffusion model. For architecture details, please refer to our paper.
> From noise to knowledge: probabilistic diffusion-based neural inference
Expand All @@ -76,7 +75,7 @@ class RegDiffusion(nn.Module):
during training.
init_coef (int): Coefficient to multiply with gene regulation norm
(1/(n_gene - 1)) to initialize the adjacency matrix.
'''
"""
def __init__(
self, n_gene, time_dim,
n_celltype=None, celltype_dim=4,
Expand Down
Loading

0 comments on commit 15336e3

Please sign in to comment.