From c5528c23684fd558c6d97a18c810054f1401a331 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Sun, 16 Jun 2024 12:58:34 +0200
Subject: [PATCH 1/2] Start singleR implementation

---
 conf/test_full.config                         |  3 +-
 .../local/celltypes/singler/environment.yml   | 10 ++++
 modules/local/celltypes/singler/main.nf       | 25 +++++++++
 .../celltypes/singler/templates/singleR.py    | 55 +++++++++++++++++++
 nextflow.config                               |  1 +
 nextflow_schema.json                          |  7 +++
 subworkflows/local/celltype_assignment.nf     |  9 +++
 7 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 modules/local/celltypes/singler/environment.yml
 create mode 100644 modules/local/celltypes/singler/main.nf
 create mode 100644 modules/local/celltypes/singler/templates/singleR.py

diff --git a/conf/test_full.config b/conf/test_full.config
index c25988a..6b246e1 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -21,7 +21,8 @@ params {
 
     // Input data for full size test
     input               = params.pipelines_testdata_base_path + 'scdownstream/samplesheet.csv'
-    integration_methods = 'scvi,harmony,bbknn,combat,seurat'
+    integration_methods = 'scvi,harmony,bbknn,combat'
     doublet_detection   = 'scrublet,doubletdetection,scds'
     celltypist_model    = 'Adult_Human_Skin'
+    celldex_reference   = 'hpca'
 }
diff --git a/modules/local/celltypes/singler/environment.yml b/modules/local/celltypes/singler/environment.yml
new file mode 100644
index 0000000..b1ae95b
--- /dev/null
+++ b/modules/local/celltypes/singler/environment.yml
@@ -0,0 +1,10 @@
+name: celltypes_singler # conda environment referenced by the `conda` directive of CELLTYPES_SINGLER
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::anndata2ri=1.3.1
+  - bioconda::bioconductor-celldex=1.12.0 # NOTE(review): singleR.py calls celldex fetchReference(); confirm this release exports it
+  - bioconda::bioconductor-singlecellexperiment=1.24.0
+  - bioconda::bioconductor-singler=2.4.0
+  - conda-forge::anndata=0.10.7
diff --git a/modules/local/celltypes/singler/main.nf b/modules/local/celltypes/singler/main.nf
new file mode 100644
index 0000000..0f6262b
--- /dev/null
+++ b/modules/local/celltypes/singler/main.nf
@@ -0,0 +1,25 @@
+process CELLTYPES_SINGLER { // renders and runs templates/singleR.py for each (meta, h5ad) / reference input pair
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/anndata2ri_bioconductor-celldex_bioconductor-singlecellexperiment_bioconductor-singler_anndata:d0dfcaede2417581':
+        'community.wave.seqera.io/library/anndata2ri_bioconductor-celldex_bioconductor-singlecellexperiment_bioconductor-singler_anndata:d6a21ee363999d21' }"
+
+    input:
+    tuple val(meta), path(h5ad) // h5ad is loaded by the template with ad.read_h5ad
+    val(reference)              // reference name handed to celldex fetchReference in the template
+
+    output:
+    tuple val(meta), path("*.h5ad"), emit: h5ad     // the template writes "<prefix>.h5ad"; NOTE(review): confirm the staged input never has that same name
+    path("*.pkl")                  , emit: obs      // NOTE(review): the template has to write a .pkl file, otherwise the task fails on a missing output
+    path "versions.yml"            , emit: versions // written at the end of the template
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    prefix = task.ext.prefix ?: "${meta.id}" // output basename used by the template; falls back to meta.id
+    template 'singleR.py'
+}
diff --git a/modules/local/celltypes/singler/templates/singleR.py b/modules/local/celltypes/singler/templates/singleR.py
new file mode 100644
index 0000000..308bd25
--- /dev/null
+++ b/modules/local/celltypes/singler/templates/singleR.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import anndata as ad
+import anndata2ri
+import rpy2
+import rpy2.robjects as ro
+import platform
+celldex = ro.packages.importr('celldex')
+singler = ro.packages.importr('SingleR')
+
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+    """Render a (possibly nested) dictionary as YAML-like text.
+
+    Args:
+        data (dict): Mapping to render; dict values are rendered recursively.
+        indent (int): Nesting depth; every level adds two spaces of padding.
+
+    Returns:
+        str: One "key: value" line per leaf and one "key:" line per nested dict.
+    """
+    pad = "  " * indent
+    chunks = []
+    for name, item in data.items():
+        if not isinstance(item, dict):
+            chunks.append(f"{pad}{name}: {item}\\n")
+            continue
+        chunks.append(f"{pad}{name}:\\n{format_yaml_like(item, indent + 1)}")
+    return "".join(chunks)
+
+# Annotate the cells of the input AnnData with SingleR, using a celldex reference.
+adata = ad.read_h5ad("${h5ad}")
+sce = anndata2ri.py2rpy(adata)
+
+# NOTE(review): celldex documents a required `version` argument for fetchReference(); confirm against the pinned release.
+reference = celldex.fetchReference("${reference}")
+# R exports SingleR() (there is no singleR()) and its `labels` argument has no default.
+run_singler = ro.r("function(sce, ref) SingleR::SingleR(test = SummarizedExperiment::assay(sce, 'X'), ref = ref, labels = ref\$label.main)\$labels")
+adata.obs["singler:${reference}"] = list(run_singler(sce, reference))  # one label per cell, in obs order
+
+# main.nf declares "*.pkl" as the `obs` output; NOTE(review): confirm the layout its consumer expects.
+adata.obs[["singler:${reference}"]].to_pickle("${prefix}.pkl")
+adata.write_h5ad("${prefix}.h5ad")
+
+versions = {
+    "${task.process}": {
+        "python": platform.python_version(),
+        "anndata": ad.__version__,
+        "anndata2ri": anndata2ri.__version__,
+        "rpy2": rpy2.__version__,
+        "celldex": celldex.__version__,
+        "singler": singler.__version__
+    }
+}
+with open("versions.yml", "w") as f:
+    f.write(format_yaml_like(versions))
diff --git a/nextflow.config b/nextflow.config
index 35d5c41..75d07a7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -20,6 +20,7 @@ params {
     integration_methods        = 'scvi'
     clustering_resolutions     = '0.5,1.0'
     celltypist_model           = ''
+    celldex_reference          = ''
 
     // MultiQC options
     multiqc_config             = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index b582262..2ee407a 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -86,6 +86,13 @@
                     "description": "Specify the models to use for the celltypist cell type annotation",
                     "help_text": "If you want to use multiple models, separate them with a comma. Available models can be found [here](https://www.celltypist.org/models).",
                     "pattern": "^([a-zA-Z0-9_]*(,[a-zA-Z0-9_]*)*)?$"
+                },
+                "celldex_reference": {
+                    "type": "string",
+                    "default": "human",
+                    "description": "Specify the reference to use for the singleR cell type annotation",
+                    "help_text": "Existing references can be found using the surveyReferences function in the celldex package.",
+                    "pattern": "^([a-zA-Z0-9_]*(,[a-zA-Z0-9_]*)*)?$"
                 }
             }
         },
diff --git a/subworkflows/local/celltype_assignment.nf b/subworkflows/local/celltype_assignment.nf
index b54df32..c3a6852 100644
--- a/subworkflows/local/celltype_assignment.nf
+++ b/subworkflows/local/celltype_assignment.nf
@@ -1,4 +1,5 @@
 include { CELLTYPES_CELLTYPIST } from '../../modules/local/celltypes/celltypist'
+include { CELLTYPES_SINGLER    } from '../../modules/local/celltypes/singler'
 
 workflow CELLTYPE_ASSIGNMENT {
     take:
@@ -16,6 +17,14 @@ workflow CELLTYPE_ASSIGNMENT {
         ch_versions = ch_versions.mix(CELLTYPES_CELLTYPIST.out.versions)
     }
 
+    if (params.celldex_reference) {
+        // Two queue channels are consumed in lockstep, so pair every h5ad with every reference explicitly
+        ch_singler = ch_h5ad.combine(Channel.fromList(params.celldex_reference.tokenize(',')))
+        CELLTYPES_SINGLER(ch_singler.map { meta, h5ad, ref -> [meta, h5ad] }, ch_singler.map { meta, h5ad, ref -> ref })
+        ch_obs = ch_obs.mix(CELLTYPES_SINGLER.out.obs)
+        ch_versions = ch_versions.mix(CELLTYPES_SINGLER.out.versions)
+    }
+
     emit:
     obs = ch_obs
 

From 1419fa75051304de41682cfb5b1ff260f31d0422 Mon Sep 17 00:00:00 2001
From: Nico Trummer <nictru32@gmail.com>
Date: Sun, 16 Jun 2024 13:04:31 +0200
Subject: [PATCH 2/2] Remove wrong default celldex_reference

---
 nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 2ee407a..ceb6fa5 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -89,7 +89,7 @@
                 },
                 "celldex_reference": {
                     "type": "string",
-                    "default": "human",
+                    "default": "",
                     "description": "Specify the reference to use for the singleR cell type annotation",
                     "help_text": "Existing references can be found using the surveyReferences function in the celldex package.",
                     "pattern": "^([a-zA-Z0-9_]*(,[a-zA-Z0-9_]*)*)?$"