Merge pull request #682 from AlexsLemonade/dev

Production Deploy
AlexsLemonade · Apr 26, 2024 · 1458dfa · 1458dfa
2 parents 6384685 + 4dc3d08
commit 1458dfa
Show file tree

Hide file tree

Showing 46 changed files with 509 additions and 329 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -16,6 +16,7 @@ What types of changes does your code introduce?
 <!-- Remove any which your PR isn't -->
 
 - Bugfix (non-breaking change which fixes an issue)
+- Refactor (addresses code organization and design mentioned in the corresponding issue)
 - New feature (non-breaking change which adds functionality)
 - Breaking change (fix or feature that would cause existing functionality to not work as expected)
 

diff --git a/api/scpca_portal/migrations/0041_remove_computedfile_type.py b/api/scpca_portal/migrations/0041_remove_computedfile_type.py
@@ -0,0 +1,17 @@
+# Generated by Django 3.2.18 on 2024-04-23 20:32
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):  # migration 0041 for the "scpca_portal" app
+
+    dependencies = [
+        ("scpca_portal", "0040_auto_20240412_1531"),  # must be applied after migration 0040
+    ]
+
+    operations = [
+        migrations.RemoveField(  # removes the `type` field from the `computedfile` model
+            model_name="computedfile",
+            name="type",
+        ),
+    ]
diff --git a/api/scpca_portal/migrations/0042_auto_20240423_2045.py b/api/scpca_portal/migrations/0042_auto_20240423_2045.py
@@ -0,0 +1,25 @@
+# Generated by Django 3.2.18 on 2024-04-23 20:45
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # migration 0042 for the "scpca_portal" app
+
+    dependencies = [
+        ("scpca_portal", "0041_remove_computedfile_type"),  # must be applied after migration 0041
+    ]
+
+    operations = [
+        migrations.AddField(  # adds the `has_multiplexed_data` field to the `computedfile` model
+            model_name="computedfile",
+            name="has_multiplexed_data",
+            field=models.BooleanField(default=False),  # boolean flag, defaults to False
+        ),
+        migrations.AlterField(  # redefines `modality` as a TextField limited to the choices below
+            model_name="computedfile",
+            name="modality",
+            field=models.TextField(
+                choices=[("SINGLE_CELL", "Single Cell"), ("SPATIAL", "Spatial")]
+            ),
+        ),
+    ]
diff --git a/api/scpca_portal/models/base.py b/api/scpca_portal/models/base.py
@@ -9,6 +9,7 @@ class Meta:
 
     has_bulk_rna_seq = models.BooleanField(default=False)
     has_cite_seq_data = models.BooleanField(default=False)
+    has_multiplexed_data = models.BooleanField(default=False)
 
 
 class TimestampedModel(models.Model):

diff --git a/api/scpca_portal/models/computed_file.py b/api/scpca_portal/models/computed_file.py
@@ -27,36 +27,15 @@ class MetadataFilenames:
         SINGLE_CELL_METADATA_FILE_NAME = "single_cell_metadata.tsv"
         SPATIAL_METADATA_FILE_NAME = "spatial_metadata.tsv"
 
-    # TODO(ark): these values are redundant and need to be refactored in order not to violate DRY.
     class OutputFileModalities:
-        MULTIPLEXED = "MULTIPLEXED"
         SINGLE_CELL = "SINGLE_CELL"
         SPATIAL = "SPATIAL"
 
         CHOICES = (
-            (MULTIPLEXED, "Multiplexed"),
             (SINGLE_CELL, "Single Cell"),
             (SPATIAL, "Spatial"),
         )
 
-    class OutputFileTypes:
-        PROJECT_MULTIPLEXED_ZIP = "PROJECT_MULTIPLEXED_ZIP"
-        PROJECT_SPATIAL_ZIP = "PROJECT_SPATIAL_ZIP"
-        PROJECT_ZIP = "PROJECT_ZIP"
-
-        SAMPLE_MULTIPLEXED_ZIP = "SAMPLE_MULTIPLEXED_ZIP"
-        SAMPLE_SPATIAL_ZIP = "SAMPLE_SPATIAL_ZIP"
-        SAMPLE_ZIP = "SAMPLE_ZIP"
-
-        CHOICES = (
-            (PROJECT_MULTIPLEXED_ZIP, "Project Multiplexed ZIP"),
-            (PROJECT_SPATIAL_ZIP, "Project Spatial ZIP"),
-            (PROJECT_ZIP, "Project ZIP"),
-            (SAMPLE_MULTIPLEXED_ZIP, "Sample Multiplexed ZIP"),
-            (SAMPLE_SPATIAL_ZIP, "Sample Spatial ZIP"),
-            (SAMPLE_ZIP, "Sample ZIP"),
-        )
-
     class OutputFileFormats:
         ANN_DATA = "ANN_DATA"
         SINGLE_CELL_EXPERIMENT = "SINGLE_CELL_EXPERIMENT"
@@ -102,7 +81,6 @@ class OutputFileFormats:
     s3_bucket = models.TextField()
     s3_key = models.TextField()
     size_in_bytes = models.BigIntegerField()
-    type = models.TextField(choices=OutputFileTypes.CHOICES)
     workflow_version = models.TextField()
     includes_celltype_report = models.BooleanField(default=False)
 
@@ -139,13 +117,13 @@ def get_project_merged_file(
             computed_file_name = project.output_merged_anndata_computed_file_name
             readme_file_path = ComputedFile.README_ANNDATA_MERGED_FILE_PATH
             project_file_mapping[
-                f"{project.input_merged_data_path}/{project.scpca_id}_merged_rna.hdf5"
-            ] = f"{project.scpca_id}_merged_rna.hdf5"
+                f"{project.input_merged_data_path}/{project.scpca_id}_merged_rna.h5ad"
+            ] = f"{project.scpca_id}_merged_rna.h5ad"
 
             if project.has_cite_seq_data:
                 project_file_mapping[
-                    f"{project.input_merged_data_path}/{project.scpca_id}_merged_adt.hdf5"
-                ] = f"{project.scpca_id}_merged_adt.hdf5"
+                    f"{project.input_merged_data_path}/{project.scpca_id}_merged_adt.h5ad"
+                ] = f"{project.scpca_id}_merged_adt.h5ad"
         else:
             if not project.includes_merged_sce:
                 return None
@@ -163,7 +141,6 @@ def get_project_merged_file(
             project=project,
             s3_bucket=settings.AWS_S3_BUCKET_NAME,
             s3_key=computed_file_name,
-            type=cls.OutputFileTypes.PROJECT_ZIP,
             workflow_version=utils.join_workflow_versions(workflow_versions),
         )
 
@@ -206,11 +183,10 @@ def get_project_multiplexed_file(
 
         computed_file = cls(
             format=file_format,
-            modality=cls.OutputFileModalities.MULTIPLEXED,
+            modality=cls.OutputFileModalities.SINGLE_CELL,
             project=project,
             s3_bucket=settings.AWS_S3_BUCKET_NAME,
             s3_key=project.output_multiplexed_computed_file_name,
-            type=cls.OutputFileTypes.PROJECT_MULTIPLEXED_ZIP,
             workflow_version=utils.join_workflow_versions(workflow_versions),
         )
 
@@ -234,6 +210,7 @@ def get_project_multiplexed_file(
 
         computed_file.has_bulk_rna_seq = project.has_bulk_rna_seq
         computed_file.has_cite_seq_data = project.has_cite_seq_data
+        computed_file.has_multiplexed_data = project.has_multiplexed_data
         computed_file.size_in_bytes = computed_file.zip_file_path.stat().st_size
         computed_file.includes_celltype_report = project.samples.filter(is_cell_line=False).exists()
 
@@ -258,7 +235,6 @@ def get_project_single_cell_file(
             project=project,
             s3_bucket=settings.AWS_S3_BUCKET_NAME,
             s3_key=computed_file_name,
-            type=cls.OutputFileTypes.PROJECT_ZIP,
             workflow_version=utils.join_workflow_versions(workflow_versions),
         )
 
@@ -295,7 +271,6 @@ def get_project_spatial_file(
             project=project,
             s3_bucket=settings.AWS_S3_BUCKET_NAME,
             s3_key=project.output_spatial_computed_file_name,
-            type=cls.OutputFileTypes.PROJECT_SPATIAL_ZIP,
             workflow_version=utils.join_workflow_versions(workflow_versions),
         )
 
@@ -327,11 +302,10 @@ def get_sample_multiplexed_file(
         """
         computed_file = cls(
             format=file_format,
-            modality=cls.OutputFileModalities.MULTIPLEXED,
+            modality=cls.OutputFileModalities.SINGLE_CELL,
             s3_bucket=settings.AWS_S3_BUCKET_NAME,
             s3_key=sample.output_multiplexed_computed_file_name,
             sample=sample,
-            type=cls.OutputFileTypes.SAMPLE_MULTIPLEXED_ZIP,
             workflow_version=utils.join_workflow_versions(workflow_versions),
         )
 
@@ -387,6 +361,7 @@ def get_sample_multiplexed_file(
 
         computed_file.has_bulk_rna_seq = False  # Sample downloads can't contain bulk data.
         computed_file.has_cite_seq_data = sample.has_cite_seq_data
+        computed_file.has_multiplexed_data = sample.has_multiplexed_data
         computed_file.size_in_bytes = computed_file.zip_file_path.stat().st_size
         computed_file.includes_celltype_report = includes_celltype_report
 
@@ -406,10 +381,10 @@ def get_sample_single_cell_file(cls, sample, libraries, workflow_versions, file_
             file_name = sample.output_single_cell_anndata_computed_file_name
             readme_file_path = ComputedFile.README_ANNDATA_FILE_PATH
             common_file_suffixes = [
-                "filtered_rna.hdf5",
-                "processed_rna.hdf5",
+                "filtered_rna.h5ad",
+                "processed_rna.h5ad",
                 "qc.html",
-                "unfiltered_rna.hdf5",
+                "unfiltered_rna.h5ad",
             ]
         else:
             file_name = sample.output_single_cell_computed_file_name
@@ -425,9 +400,9 @@ def get_sample_single_cell_file(cls, sample, libraries, workflow_versions, file_
             common_file_suffixes.append("celltype-report.html")
 
         cite_seq_anndata_file_suffixes = [
-            "filtered_adt.hdf5",
-            "processed_adt.hdf5",
-            "unfiltered_adt.hdf5",
+            "filtered_adt.h5ad",
+            "processed_adt.h5ad",
+            "unfiltered_adt.h5ad",
         ]
 
         computed_file = cls(
@@ -436,7 +411,6 @@ def get_sample_single_cell_file(cls, sample, libraries, workflow_versions, file_
             s3_bucket=settings.AWS_S3_BUCKET_NAME,
             s3_key=file_name,
             sample=sample,
-            type=cls.OutputFileTypes.SAMPLE_ZIP,
             workflow_version=utils.join_workflow_versions(workflow_versions),
         )
 
@@ -484,7 +458,6 @@ def get_sample_spatial_file(cls, sample, libraries, workflow_versions, file_form
             s3_bucket=settings.AWS_S3_BUCKET_NAME,
             s3_key=sample.output_spatial_computed_file_name,
             sample=sample,
-            type=cls.OutputFileTypes.SAMPLE_SPATIAL_ZIP,
             workflow_version=utils.join_workflow_versions(workflow_versions),
         )
 
@@ -521,19 +494,25 @@ def download_url(self):
 
     @property
     def is_project_multiplexed_zip(self):
-        return self.type == ComputedFile.OutputFileTypes.PROJECT_MULTIPLEXED_ZIP
+        return (
+            self.modality == ComputedFile.OutputFileModalities.SINGLE_CELL
+            and self.has_multiplexed_data
+        )
 
     @property
-    def is_project_zip(self):
-        return self.type == ComputedFile.OutputFileTypes.PROJECT_ZIP
+    def is_project_single_cell_zip(self):
+        return (
+            self.modality == ComputedFile.OutputFileModalities.SINGLE_CELL
+            and not self.has_multiplexed_data
+        )
 
     @property
     def is_project_spatial_zip(self):
-        return self.type == ComputedFile.OutputFileTypes.PROJECT_SPATIAL_ZIP
+        return self.modality == ComputedFile.OutputFileModalities.SPATIAL
 
     @property
     def metadata_file_name(self):
-        if self.is_project_multiplexed_zip or self.is_project_zip:
+        if self.is_project_multiplexed_zip or self.is_project_single_cell_zip:
             return ComputedFile.MetadataFilenames.SINGLE_CELL_METADATA_FILE_NAME
         if self.is_project_spatial_zip:
             return ComputedFile.MetadataFilenames.SPATIAL_METADATA_FILE_NAME

diff --git a/api/scpca_portal/models/project.py b/api/scpca_portal/models/project.py
@@ -40,7 +40,6 @@ class Meta:
     diagnoses_counts = models.TextField(blank=True, null=True)
     disease_timings = models.TextField()
     downloadable_sample_count = models.IntegerField(default=0)
-    has_multiplexed_data = models.BooleanField(default=False)
     has_single_cell_data = models.BooleanField(default=False)
     has_spatial_data = models.BooleanField(default=False)
     human_readable_pi_name = models.TextField()
@@ -103,7 +102,9 @@ def input_samples_metadata_file_path(self):
     def multiplexed_computed_file(self):
         try:
             return self.project_computed_files.get(
-                type=ComputedFile.OutputFileTypes.PROJECT_MULTIPLEXED_ZIP
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
+                format=ComputedFile.OutputFileFormats.SINGLE_CELL_EXPERIMENT,
+                has_multiplexed_data=True,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -149,8 +150,8 @@ def single_cell_computed_file(self):
         try:
             return self.project_computed_files.get(
                 format=ComputedFile.OutputFileFormats.SINGLE_CELL_EXPERIMENT,
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
                 includes_merged=False,
-                type=ComputedFile.OutputFileTypes.PROJECT_ZIP,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -160,8 +161,8 @@ def single_cell_merged_computed_file(self):
         try:
             return self.project_computed_files.get(
                 format=ComputedFile.OutputFileFormats.SINGLE_CELL_EXPERIMENT,
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
                 includes_merged=True,
-                type=ComputedFile.OutputFileTypes.PROJECT_ZIP,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -171,8 +172,8 @@ def single_cell_anndata_computed_file(self):
         try:
             return self.project_computed_files.get(
                 format=ComputedFile.OutputFileFormats.ANN_DATA,
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
                 includes_merged=False,
-                type=ComputedFile.OutputFileTypes.PROJECT_ZIP,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -182,8 +183,8 @@ def single_cell_anndata_merged_computed_file(self):
         try:
             return self.project_computed_files.get(
                 format=ComputedFile.OutputFileFormats.ANN_DATA,
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
                 includes_merged=True,
-                type=ComputedFile.OutputFileTypes.PROJECT_ZIP,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -192,7 +193,7 @@ def single_cell_anndata_merged_computed_file(self):
     def spatial_computed_file(self):
         try:
             return self.project_computed_files.get(
-                type=ComputedFile.OutputFileTypes.PROJECT_SPATIAL_ZIP
+                modality=ComputedFile.OutputFileModalities.SPATIAL
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -1010,7 +1011,7 @@ def load_data(self, sample_id=None, **kwargs) -> None:
             sample_metadata["has_cite_seq_data"] = has_cite_seq_data
             sample_metadata["has_single_cell_data"] = has_single_cell_data
             sample_metadata["has_spatial_data"] = has_spatial_data
-            sample_metadata["includes_anndata"] = len(list(Path(sample_dir).glob("*.hdf5"))) > 0
+            sample_metadata["includes_anndata"] = len(list(Path(sample_dir).glob("*.h5ad"))) > 0
             sample_metadata["sample_cell_count_estimate"] = sample_cell_count_estimate
             sample_metadata["seq_units"] = ", ".join(sorted(sample_seq_units, key=str.lower))
             sample_metadata["technologies"] = ", ".join(sorted(sample_technologies, key=str.lower))

diff --git a/api/scpca_portal/models/sample.py b/api/scpca_portal/models/sample.py
@@ -165,7 +165,9 @@ def output_spatial_metadata_file_path(self):
     def multiplexed_computed_file(self):
         try:
             return self.sample_computed_files.get(
-                type=ComputedFile.OutputFileTypes.SAMPLE_MULTIPLEXED_ZIP
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
+                format=ComputedFile.OutputFileFormats.SINGLE_CELL_EXPERIMENT,
+                has_multiplexed_data=True,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -175,7 +177,8 @@ def single_cell_computed_file(self):
         try:
             return self.sample_computed_files.get(
                 format=ComputedFile.OutputFileFormats.SINGLE_CELL_EXPERIMENT,
-                type=ComputedFile.OutputFileTypes.SAMPLE_ZIP,
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
+                has_multiplexed_data=False,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -185,7 +188,8 @@ def single_cell_anndata_computed_file(self):
         try:
             return self.sample_computed_files.get(
                 format=ComputedFile.OutputFileFormats.ANN_DATA,
-                type=ComputedFile.OutputFileTypes.SAMPLE_ZIP,
+                modality=ComputedFile.OutputFileModalities.SINGLE_CELL,
+                has_multiplexed_data=False,
             )
         except ComputedFile.DoesNotExist:
             pass
@@ -194,7 +198,7 @@ def single_cell_anndata_computed_file(self):
     def spatial_computed_file(self):
         try:
             return self.sample_computed_files.get(
-                type=ComputedFile.OutputFileTypes.SAMPLE_SPATIAL_ZIP
+                modality=ComputedFile.OutputFileModalities.SPATIAL
             )
         except ComputedFile.DoesNotExist:
             pass

diff --git a/api/scpca_portal/serializers.py b/api/scpca_portal/serializers.py
@@ -30,6 +30,7 @@ class Meta:
             "format",
             "has_bulk_rna_seq",
             "has_cite_seq_data",
+            "has_multiplexed_data",
             "id",
             "includes_merged",
             "modality",
@@ -38,7 +39,6 @@ class Meta:
             "s3_key",
             "sample",
             "size_in_bytes",
-            "type",
             "updated_at",
             "workflow_version",
         )