Merge pull request #770 from AlexsLemonade/dev
Production Deploy
davidsmejia authored Jun 20, 2024
2 parents 18438e8 + 7d4d2c3 commit d8c4b9a
Showing 51 changed files with 2,231 additions and 1,158 deletions.
69 changes: 69 additions & 0 deletions api/scpca_portal/common.py
@@ -19,3 +19,72 @@
TEMPLATE_PATH = CODE_PATH / "scpca_portal" / "templates"

TAB = "\t"

IGNORED_INPUT_VALUES = {"", "N/A", "TBD"}
STRIPPED_INPUT_VALUES = "< >"

# Global sort order for Metadata TSVs
# Columns
METADATA_COLUMN_SORT_ORDER = [
"scpca_project_id",
# Sample metadata
"scpca_sample_id",
"scpca_library_id",
"diagnosis",
"subdiagnosis",
"disease_timing",
"age_at_diagnosis",
"sex",
"tissue_location",
"participant_id",
"submitter",
"submitter_id",
"organism",
"development_stage_ontology_term_id",
"sex_ontology_term_id",
"organism_ontology_id",
"self_reported_ethnicity_ontology_term_id",
"disease_ontology_term_id",
"tissue_ontology_term_id",
"*", # Addtional metadata
# Library metadata
"seq_unit",
"technology",
"demux_samples",
"total_reads",
"mapped_reads",
"sample_cell_count_estimate",
"sample_cell_estimates", # ONLY FOR MULTIPLEXED
"unfiltered_cells",
"filtered_cell_count",
"processed_cells",
"has_cellhash",
"includes_anndata",
"is_cell_line",
"is_multiplexed",
"is_xenograft",
# Project metadata
"pi_name",
"project_title",
# Processing metadata
"genome_assembly",
"mapping_index",
"spaceranger_version", # FOR SPATIAL ONLY
"alevin_fry_version", # REMOVED FOR SPATIAL
"salmon_version", # REMOVED FOR SPATIAL
"transcript_type", # REMOVED FOR SPATIAL
"droplet_filtering_method", # REMOVED FOR SPATIAL
"cell_filtering_method", # REMOVED FOR SPATIAL
"prob_compromised_cutoff", # REMOVED FOR SPATIAL
"min_gene_cutoff", # REMOVED FOR SPATIAL
"normalization_method", # REMOVED FOR SPATIAL
"demux_method", # ONLY FOR MULTIPLEXED
"date_processed",
"workflow",
"workflow_version",
"workflow_commit",
]
# Rows
PROJECT_ID_KEY = "scpca_project_id"
SAMPLE_ID_KEY = "scpca_sample_id"
LIBRARY_ID_KEY = "scpca_library_id"
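
The "*" entry marks where any additional, unlisted metadata columns fall in the global column order. As a minimal sketch of that idea (assuming behavior roughly like utils.get_sorted_field_names, whose implementation is not part of this diff), unknown columns can be slotted in at the wildcard position:

def sorted_field_names(field_names):
    # Sketch only: the portal's actual helper is utils.get_sorted_field_names.
    # Known columns keep their listed position; unknown columns take the
    # position of the "*" wildcard, alphabetized among themselves.
    order = METADATA_COLUMN_SORT_ORDER
    wildcard_index = order.index("*")

    def sort_key(name):
        if name in order:
            return (order.index(name), "")
        return (wildcard_index, name)

    return sorted(field_names, key=sort_key)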
99 changes: 31 additions & 68 deletions api/scpca_portal/management/commands/load_data.py
@@ -1,4 +1,3 @@
import csv
import logging
import shutil
import subprocess
@@ -12,8 +11,8 @@
import boto3
from botocore.client import Config

from scpca_portal import common, utils
from scpca_portal.models import Project
from scpca_portal import common, metadata_file
from scpca_portal.models import Contact, ExternalAccession, Project, Publication

ALLOWED_SUBMITTERS = {
"christensen",
@@ -139,52 +138,13 @@ def add_arguments(self, parser):
"--update-s3", action=BooleanOptionalAction, default=settings.UPDATE_S3_DATA
)

def clean_up_input_data(self):
shutil.rmtree(common.INPUT_DATA_PATH / self.project.scpca_id, ignore_errors=True)
def clean_up_input_data(self, project):
shutil.rmtree(common.INPUT_DATA_PATH / project.scpca_id, ignore_errors=True)

def handle(self, *args, **kwargs):
self.configure_aws_cli(**kwargs)
self.load_data(**kwargs)

def process_project_data(self, data, sample_id, **kwargs):
self.project.abstract = data["abstract"]
self.project.additional_restrictions = data["additional_restrictions"]
self.project.has_bulk_rna_seq = utils.boolean_from_string(data.get("has_bulk", False))
self.project.has_cite_seq_data = utils.boolean_from_string(data.get("has_CITE", False))
self.project.has_multiplexed_data = utils.boolean_from_string(
data.get("has_multiplex", False)
)
self.project.has_spatial_data = utils.boolean_from_string(data.get("has_spatial", False))
self.project.human_readable_pi_name = data["PI"]
self.project.includes_anndata = utils.boolean_from_string(
data.get("includes_anndata", False)
)
self.project.includes_cell_lines = utils.boolean_from_string(
data.get("includes_cell_lines", False)
)
self.project.includes_merged_anndata = utils.boolean_from_string(
data.get("includes_merged_anndata", False)
)
self.project.includes_merged_sce = utils.boolean_from_string(
data.get("includes_merged_sce", False)
)
self.project.includes_xenografts = utils.boolean_from_string(
data.get("includes_xenografts", False)
)
self.project.pi_name = data["submitter"]
self.project.title = data["project_title"]
self.project.save()

self.project.add_contacts(data["contact_email"], data["contact_name"])
self.project.add_external_accessions(
data["external_accession"],
data["external_accession_url"],
data["external_accession_raw"],
)
self.project.add_publications(data["citation"], data["citation_doi"])

self.project.load_data(sample_id=sample_id, **kwargs)

def load_data(
self,
allowed_submitters: set[str] = None,
@@ -215,9 +175,9 @@ def load_data(
if project_path.is_dir()
}

with open(Project.get_input_project_metadata_file_path()) as project_csv:
project_list = list(csv.DictReader(project_csv))

project_list = metadata_file.load_projects_metadata(
Project.get_input_project_metadata_file_path()
)
for project_data in project_list:
scpca_project_id = project_data["scpca_project_id"]
if project_id and project_id != scpca_project_id:
@@ -229,34 +189,37 @@
)
return

if project_data["submitter"] not in allowed_submitters:
logger.warning("Project submitter is not in the white list.")
continue

# Purge existing projects so they can be re-added.
if (project := Project.objects.filter(scpca_id=scpca_project_id).first()) and (
kwargs["reload_all"] or kwargs["reload_existing"]
):
logger.info(f"Purging '{project}")
project.purge(delete_from_s3=kwargs["update_s3"])

# Only import new projects. If old ones are desired they should be purged and re-added.
project, created = Project.objects.get_or_create(scpca_id=scpca_project_id)
if not created:
logger.info(f"'{project}' already exists. Use --reload-existing to re-import.")
if project_data["pi_name"] not in allowed_submitters:
logger.warning("Project submitter is not in the white list.")
continue

self.project = Project.objects.filter(scpca_id=scpca_project_id).first()
logger.info(f"Importing '{self.project}' data")
self.process_project_data(project_data, sample_id, **kwargs)
if samples_count := self.project.samples.count():
if project := Project.objects.filter(scpca_id=scpca_project_id).first():
# Purge existing projects so they can be re-added.
if kwargs["reload_all"] or kwargs["reload_existing"]:
logger.info(f"Purging '{project}")
project.purge(delete_from_s3=kwargs["update_s3"])
# Only import new projects.
# If old ones are desired they should be purged and re-added.
else:
logger.info(f"'{project}' already exists. Use --reload-existing to re-import.")
continue

logger.info(f"Importing '{project}' data")
project = Project.get_from_dict(project_data)
project.save()
Contact.bulk_create_from_project_data(project_data, project)
ExternalAccession.bulk_create_from_project_data(project_data, project)
Publication.bulk_create_from_project_data(project_data, project)

project.load_data(sample_id=sample_id, **kwargs)
if samples_count := project.samples.count():
logger.info(
f"Created {samples_count} sample{pluralize(samples_count)} for '{self.project}'"
f"Created {samples_count} sample{pluralize(samples_count)} for '{project}'"
)

if kwargs["clean_up_input_data"]:
logger.info(f"Cleaning up '{project}' input data")
self.clean_up_input_data()
self.clean_up_input_data(project)

if kwargs["clean_up_output_data"]:
logger.info("Cleaning up output directory")
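After the refactor, one pass per project handles the purge, project creation, related-object creation, and data load. A hedged usage sketch of invoking the command (the option names are inferred from the kwargs above, e.g. kwargs["reload_existing"] and the --update-s3 argument; treat them as assumptions):

from django.core.management import call_command

# Re-import an existing project, refreshing its data on S3 as well.
call_command("load_data", reload_existing=True, update_s3=True)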
112 changes: 112 additions & 0 deletions api/scpca_portal/metadata_file.py
@@ -0,0 +1,112 @@
import csv
import json
from collections import namedtuple
from pathlib import Path
from typing import Dict, List, Tuple

from scpca_portal import common, utils

PROJECT_METADATA_KEYS = [
# Fields used in Project model object creation
("has_bulk", "has_bulk_rna_seq", False),
("has_CITE", "has_cite_seq_data", False),
("has_multiplex", "has_multiplexed_data", False),
("has_spatial", "has_spatial_data", False),
("PI", "human_readable_pi_name", None),
("submitter", "pi_name", None),
("project_title", "title", None),
# Fields used in Contact model object creation
("contact_email", "email", None),
("contact_name", "name", None),
# Fields used in ExternalAccession model object creation
("external_accession", "accession", None),
("external_accession_raw", "has_raw", False),
("external_accession_url", "accession_url", None),
# Field used in Publication model object creation
("citation_doi", "doi", None),
]

SAMPLE_METADATA_KEYS = [
("age", "age_at_diagnosis", None),
]

LIBRARY_METADATA_KEYS = [
("library_id", "scpca_library_id", None),
("sample_id", "scpca_sample_id", None),
# Field only included in Single cell (and Multiplexed) libraries
("filtered_cells", "filtered_cell_count", None),
]
KeyTransform = namedtuple("KeyTransform", ["old_key", "new_key", "default_value"])


def load_projects_metadata(metadata_file_path: Path):
"""
Opens, loads, and parses the list of project metadata located at the given metadata_file_path.
Transforms keys in the data dicts to match the associated model attributes.
"""
with open(metadata_file_path) as raw_file:
data_dicts = list(csv.DictReader(raw_file))

for data_dict in data_dicts:
transform_keys(data_dict, PROJECT_METADATA_KEYS)

return data_dicts


def load_samples_metadata(metadata_file_path: Path):
"""
Opens, loads, and parses the list of sample metadata located at the given metadata_file_path.
Transforms keys in the data dicts to match the associated model attributes.
"""
with open(metadata_file_path) as raw_file:
data_dicts = list(csv.DictReader(raw_file))

for data_dict in data_dicts:
transform_keys(data_dict, SAMPLE_METADATA_KEYS)

return data_dicts


def load_library_metadata(metadata_file_path: Path):
"""
Opens, loads, and parses a single library's metadata located at the given metadata_file_path.
Transforms keys in the data dict to match the associated model attributes.
"""
with open(metadata_file_path) as raw_file:
return transform_keys(json.load(raw_file), LIBRARY_METADATA_KEYS)


def transform_keys(data_dict: Dict, key_transforms: List[Tuple]):
"""
Transforms keys in the given data dict according to the given list of key-transform tuples.
"""
for element in [KeyTransform._make(element) for element in key_transforms]:
if element.old_key in data_dict:
data_dict[element.new_key] = data_dict.pop(element.old_key, element.default_value)

return data_dict
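
For illustration, here is what transform_keys does to one raw project row (the field values below are hypothetical):

row = {"has_bulk": "TRUE", "PI": "Smith", "project_title": "Example project"}
transform_keys(row, PROJECT_METADATA_KEYS)
# row is now (keys renamed to model attribute names, values untouched):
# {"has_bulk_rna_seq": "TRUE", "human_readable_pi_name": "Smith", "title": "Example project"}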


def write_metadata_dicts(list_of_dicts: List[Dict], output_file_path: str, **kwargs) -> None:
"""
Writes a list of dictionaries to a csv-like file.
Optional modifiers to the csv.DictWriter can be passed to function as kwargs.
"""
kwargs["fieldnames"] = kwargs.get(
"fieldnames", utils.get_sorted_field_names(utils.get_keys_from_dicts(list_of_dicts))
)
kwargs["delimiter"] = kwargs.get("delimiter", common.TAB)

sorted_list_of_dicts = sorted(
list_of_dicts,
key=lambda k: (
k[common.PROJECT_ID_KEY],
k[common.SAMPLE_ID_KEY],
k[common.LIBRARY_ID_KEY],
),
)

with open(output_file_path, "w", newline="") as raw_file:
csv_writer = csv.DictWriter(raw_file, **kwargs)
csv_writer.writeheader()
csv_writer.writerows(sorted_list_of_dicts)
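
A hypothetical call (the IDs and file name are made up; real rows would also carry the metadata columns from common.METADATA_COLUMN_SORT_ORDER):

rows = [
    {"scpca_project_id": "SCPCP000001", "scpca_sample_id": "SCPCS000002",
     "scpca_library_id": "SCPCL000002", "diagnosis": "neuroblastoma"},
    {"scpca_project_id": "SCPCP000001", "scpca_sample_id": "SCPCS000001",
     "scpca_library_id": "SCPCL000001", "diagnosis": "neuroblastoma"},
]
write_metadata_dicts(rows, "metadata.tsv")
# The output is tab-delimited, rows sorted by (project, sample, library) ID,
# with column order supplied by utils.get_sorted_field_names.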
60 changes: 60 additions & 0 deletions api/scpca_portal/migrations/0043_auto_20240524_1459.py
@@ -0,0 +1,60 @@
# Generated by Django 3.2.25 on 2024-05-24 14:59

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("scpca_portal", "0042_auto_20240423_2045"),
]

operations = [
migrations.CreateModel(
name="Library",
fields=[
(
"id",
models.AutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
),
),
("created_at", models.DateTimeField(auto_now_add=True)),
("updated_at", models.DateTimeField(auto_now=True)),
(
"formats",
django.contrib.postgres.fields.ArrayField(
base_field=models.TextField(
choices=[
("ANN_DATA", "AnnData"),
("SINGLE_CELL_EXPERIMENT", "Single cell experiment"),
]
),
default=list,
size=None,
),
),
("is_multiplexed", models.BooleanField(default=False)),
("metadata", models.JSONField(default=dict)),
(
"modality",
models.TextField(
choices=[("SINGLE_CELL", "Single Cell"), ("SPATIAL", "Spatial")]
),
),
("scpca_id", models.TextField(unique=True)),
("workflow_version", models.TextField()),
],
options={
"db_table": "libraries",
"ordering": ["updated_at"],
"get_latest_by": "updated_at",
},
),
migrations.AddField(
model_name="sample",
name="libraries",
field=models.ManyToManyField(to="scpca_portal.Library"),
),
]
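
For context, a sketch of how the new relation could be queried once this migration runs (it assumes Sample is exposed from scpca_portal.models; the query itself is illustrative):

from scpca_portal.models import Sample

# All spatial libraries attached to a given sample.
sample = Sample.objects.first()
spatial_libraries = sample.libraries.filter(modality="SPATIAL")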