Merge pull request #474 from AlexsLemonade/dev
Production Deploy
arkid15r authored Nov 10, 2023
2 parents cc05572 + e5b2e36 commit 6ed7164
Showing 28 changed files with 631 additions and 217 deletions.
5 changes: 5 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,5 @@
# For more information on CODEOWNERS, see:
# https://help.github.com/en/articles/about-code-owners

* @arkid15r @davidsmejia
/client/ @davidsmejia @nozomione
13 changes: 13 additions & 0 deletions api/README.md
@@ -145,6 +145,12 @@ If you would like to purge a project and remove its files from the S3 bucket, yo
sportal manage-api purge_project --scpca-id SCPCP000001 --delete-from-s3
```

The `--cleanup-input-data` flag can help you control the projects' input data size. If the flag is set, the input data cleanup process will run for each project right after its processing is over.
```
sportal load-data --cleanup-input-data --reload-all --update-s3
```

The `--cleanup-output-data` flag can help you control the projects' output data size. If the flag is set, the cleanup process for no-longer-needed output data will run for each project right after its processing is over.
```
@@ -160,6 +166,13 @@ This is to help prevent the S3 bucket data from accidentally becoming out of syn
To run a command in production, use the run_command.sh script that is created on the API instance.
It passes any arguments through to `manage.py`, so `./run_command.sh load_data --reload-all` will work nicely.

The following code can be used to process projects one by one with a minimal disk space footprint:
```
for i in $(seq -f "%02g" 1 20); do
./run_command.sh load_data --cleanup-input-data --cleanup-output-data --reload-existing --scpca-project-id SCPCP0000$i
done
```

The `purge_project` command can be run in a similar fashion: `./run_command.sh purge_project --scpca-id SCPCP000001`

## Cloud Deployments
14 changes: 14 additions & 0 deletions api/scpca_portal/management/commands/load_data.py
@@ -63,6 +63,9 @@ class Command(BaseCommand):
to a stack-specific S3 bucket."""

def add_arguments(self, parser):
parser.add_argument(
"--cleanup-input-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
parser.add_argument(
"--cleanup-output-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
@@ -75,6 +78,7 @@ def add_arguments(self, parser):

def handle(self, *args, **options):
load_data_from_s3(
cleanup_input_data=options["cleanup_input_data"],
cleanup_output_data=options["cleanup_output_data"],
reload_all=options["reload_all"],
reload_existing=options["reload_existing"],
@@ -99,6 +103,7 @@ def cleanup_output_data_dir():

def load_data_from_s3(
allowed_submitters: set = ALLOWED_SUBMITTERS,
cleanup_input_data: bool = False,
cleanup_output_data: bool = False,
input_bucket_name: str = "scpca-portal-inputs",
reload_all: bool = False,
@@ -195,6 +200,11 @@ def load_data_from_s3(
project.save()

project.add_contacts(project_data["contact_email"], project_data["contact_name"])
project.add_external_accessions(
project_data["external_accession"],
project_data["external_accession_url"],
project_data["external_accession_raw"],
)
project.add_publications(project_data["citation"], project_data["citation_doi"])

if project.scpca_id not in os.listdir(common.INPUT_DATA_DIR):
@@ -216,6 +226,10 @@ def load_data_from_s3(
computed_file.s3_key,
)

if cleanup_input_data:
logger.info(f"Cleaning up '{project}' input data")
shutil.rmtree(os.path.join(common.INPUT_DATA_DIR, project.scpca_id), ignore_errors=True)

if cleanup_output_data:
logger.info(f"Cleaning up '{project}' output data")
for computed_file in computed_files:
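The `--cleanup-input-data` and `--cleanup-output-data` arguments above use `argparse.BooleanOptionalAction`, which auto-generates a paired `--no-` flag for each option. A minimal standalone sketch of that behavior (not part of the diff; assumes Python 3.9+, and a hard-coded default stands in for `settings.PRODUCTION`):

```
import argparse

parser = argparse.ArgumentParser()
# BooleanOptionalAction registers both --cleanup-input-data and
# --no-cleanup-input-data for this single option.
parser.add_argument(
    "--cleanup-input-data", action=argparse.BooleanOptionalAction, default=False
)

print(parser.parse_args(["--cleanup-input-data"]).cleanup_input_data)     # True
print(parser.parse_args(["--no-cleanup-input-data"]).cleanup_input_data)  # False
print(parser.parse_args([]).cleanup_input_data)                           # False (the default)
```

Since the real default is `settings.PRODUCTION`, production runs clean up by default while other environments must opt in explicitly.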
31 changes: 31 additions & 0 deletions api/scpca_portal/migrations/0030_auto_20231030_2259.py
@@ -0,0 +1,31 @@
# Generated by Django 3.2.22 on 2023-10-30 22:59

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("scpca_portal", "0029_auto_20221217_0256"),
]

operations = [
migrations.CreateModel(
name="ExternalAccession",
fields=[
("created_at", models.DateTimeField(auto_now_add=True)),
("updated_at", models.DateTimeField(auto_now=True)),
("accession", models.TextField(primary_key=True, serialize=False)),
("has_raw", models.BooleanField(default=False)),
("url", models.TextField()),
],
options={
"db_table": "external_accessions",
},
),
migrations.AddField(
model_name="project",
name="external_accessions",
field=models.ManyToManyField(to="scpca_portal.ExternalAccession"),
),
]
1 change: 1 addition & 0 deletions api/scpca_portal/models/__init__.py
@@ -1,6 +1,7 @@
from scpca_portal.models.api_token import APIToken
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project import Project
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
44 changes: 31 additions & 13 deletions api/scpca_portal/models/computed_file.py
@@ -92,7 +92,8 @@ def get_project_multiplexed_file(cls, project, sample_to_file_mapping, workflow_

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
project.output_multiplexed_metadata_file_path, computed_file.metadata_file_name
@@ -125,14 +126,17 @@ def get_project_single_cell_file(cls, project, sample_to_file_mapping, workflow_
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(ComputedFile.README_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME)
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
project.output_single_cell_metadata_file_path, computed_file.metadata_file_name
)

for sample_id, file_paths in sample_to_file_mapping.items():
for file_path in file_paths:
# Nest these under thier sample id.
# Nest these under their sample id.
archive_path = os.path.join(sample_id, os.path.basename(file_path))
zip_file.write(file_path, archive_path)

@@ -158,7 +162,8 @@ def get_project_spatial_file(cls, project, sample_to_file_mapping, workflow_vers

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
project.output_spatial_metadata_file_path, computed_file.metadata_file_name
@@ -201,7 +206,8 @@ def get_sample_multiplexed_file(
if not os.path.exists(computed_file.zip_file_path):
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
sample.output_multiplexed_metadata_file_path,
@@ -230,7 +236,10 @@ def get_sample_single_cell_file(cls, sample, libraries, workflow_versions):

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(ComputedFile.README_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME)
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
sample.output_single_cell_metadata_file_path,
ComputedFile.MetadataFilenames.SINGLE_CELL_METADATA_FILE_NAME,
@@ -269,11 +278,12 @@ def get_sample_spatial_file(cls, sample, libraries, workflow_versions):
)

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_object:
zip_object.write(
ComputedFile.README_SPATIAL_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_object.write(
zip_file.write(
sample.output_spatial_metadata_file_path,
ComputedFile.MetadataFilenames.SPATIAL_METADATA_FILE_NAME,
)
@@ -286,7 +296,7 @@ def get_sample_spatial_file(cls, sample, libraries, workflow_versions):
)
)
for item in library_path.rglob("*"): # Add the entire directory contents.
zip_object.write(item, item.relative_to(library_path.parent))
zip_file.write(item, item.relative_to(library_path.parent))
file_paths.append(f"{Path(library_path, item.relative_to(library_path))}")

computed_file.size_in_bytes = os.path.getsize(computed_file.zip_file_path)
@@ -324,10 +334,18 @@ def zip_file_path(self):
def create_download_url(self):
"""Creates a temporary URL from which the file can be downloaded."""
if self.s3_bucket and self.s3_key:
# Append the download date to the filename on download.
date = utils.get_today_string()
filename, ext = os.path.splitext(self.s3_key)

return s3.generate_presigned_url(
ClientMethod="get_object",
Params={"Bucket": self.s3_bucket, "Key": self.s3_key},
ExpiresIn=(60 * 60 * 24 * 7), # 7 days in seconds.
Params={
"Bucket": self.s3_bucket,
"Key": self.s3_key,
"ResponseContentDisposition": f"attachment; filename = {filename}_{date}.{ext}",
},
ExpiresIn=60 * 60 * 24 * 7, # 7 days in seconds.
)

def delete_s3_file(self, force=False):
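A subtlety in `create_download_url` above: `os.path.splitext` keeps the leading dot on the extension, so the `ResponseContentDisposition` filename must be assembled without adding another one. A quick standalone check (the date literal is illustrative, standing in for `utils.get_today_string()`):

```
import os

# splitext returns the extension with its leading dot.
filename, ext = os.path.splitext("SCPCP000001.zip")
print(filename)  # SCPCP000001
print(ext)       # .zip

print(f"attachment; filename={filename}_2023-11-10{ext}")
# attachment; filename=SCPCP000001_2023-11-10.zip
```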
17 changes: 17 additions & 0 deletions api/scpca_portal/models/external_accession.py
@@ -0,0 +1,17 @@
from django.db import models

from scpca_portal.models.base import TimestampedModel


class ExternalAccession(TimestampedModel):
"""External accession."""

class Meta:
db_table = "external_accessions"

accession = models.TextField(primary_key=True)
has_raw = models.BooleanField(default=False)
url = models.TextField()

def __str__(self) -> str:
return self.accession
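A hypothetical Django-shell sketch of how the new model pairs with the `Project.external_accessions` many-to-many field added below; the accession ID and URL are illustrative placeholders:

```
from scpca_portal.models import ExternalAccession, Project

# get_or_create keys on the accession text, which is the primary key.
accession, _ = ExternalAccession.objects.get_or_create(accession="GSE000000")
accession.url = "https://example.com/GSE000000"
accession.has_raw = True
accession.save()

# Assumes at least one project is already loaded.
project = Project.objects.first()
project.external_accessions.add(accession)  # populates the M2M join table
```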
62 changes: 49 additions & 13 deletions api/scpca_portal/models/project.py
@@ -8,10 +8,11 @@

from django.db import models

from scpca_portal import common
from scpca_portal import common, utils
from scpca_portal.models.base import TimestampedModel
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
from scpca_portal.models.sample import Sample
@@ -48,6 +49,7 @@ class Meta:
unavailable_samples_count = models.PositiveIntegerField(default=0)

contacts = models.ManyToManyField(Contact)
external_accessions = models.ManyToManyField(ExternalAccession)
publications = models.ManyToManyField(Publication)

def __str__(self):
@@ -257,10 +259,10 @@ def combine_multiplexed_metadata(

return combined_metadata, multiplexed_sample_mapping

def add_contacts(self, contact_emails, contact_names):
def add_contacts(self, contact_email, contact_name):
"""Creates and adds project contacts."""
emails = contact_emails.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_names.split(common.CSV_MULTI_VALUE_DELIMITER)
emails = contact_email.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_name.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(emails) != len(names):
logger.error("Unable to add ambiguous contacts.")
@@ -277,10 +279,32 @@ def add_contacts(self, contact_emails, contact_names):

self.contacts.add(contact)

def add_publications(self, citations, citation_dois):
def add_external_accessions(
self, external_accession, external_accession_url, external_accession_raw
):
"""Creates and adds project external accessions."""
accessions = external_accession.split(common.CSV_MULTI_VALUE_DELIMITER)
urls = external_accession_url.split(common.CSV_MULTI_VALUE_DELIMITER)
accessions_raw = external_accession_raw.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(set((len(accessions), len(urls), len(accessions_raw)))) != 1:
logger.error("Unable to add ambiguous external accessions.")
return

for idx, accession in enumerate(accessions):
external_accession, _ = ExternalAccession.objects.get_or_create(
accession=accession.strip()
)
external_accession.url = urls[idx].strip()
external_accession.has_raw = utils.boolean_from_string(accessions_raw[idx].strip())
external_accession.save()

self.external_accessions.add(external_accession)

def add_publications(self, citation, citation_doi):
"""Creates and adds project publications."""
citations = citations.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_dois.split(common.CSV_MULTI_VALUE_DELIMITER)
citations = citation.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_doi.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(citations) != len(dois):
logger.error("Unable to add ambiguous publications.")
@@ -451,7 +475,11 @@ def create_single_cell_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_multiplexed_readme_file(self):
@@ -460,7 +488,11 @@ def create_multiplexed_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_MULTIPLEXED_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_spatial_readme_file(self):
@@ -469,7 +501,11 @@ def create_spatial_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_SPATIAL_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def get_bulk_rna_seq_sample_ids(self):
@@ -497,20 +533,20 @@ def get_computed_files(
"""Prepares ready for saving project computed files based on generated file mappings."""
computed_files = list()

# The multiplexed and single cell cases are if/else as we produce
# a single computed file for a multiplexed samples project.
if multiplexed_file_mapping:
computed_files.append(
ComputedFile.get_project_multiplexed_file(
self, multiplexed_file_mapping, multiplexed_workflow_versions
)
)
elif single_cell_file_mapping:

if single_cell_file_mapping:
computed_files.append(
ComputedFile.get_project_single_cell_file(
self, single_cell_file_mapping, single_cell_workflow_versions
)
)

if spatial_file_mapping:
computed_files.append(
ComputedFile.get_project_spatial_file(
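The guard in `add_external_accessions` collapses the three parallel list lengths into a set: one distinct length means the lists line up, anything else is ambiguous and gets rejected. A minimal sketch, assuming `common.CSV_MULTI_VALUE_DELIMITER` is a semicolon (an illustrative value):

```
CSV_MULTI_VALUE_DELIMITER = ";"  # stand-in for common.CSV_MULTI_VALUE_DELIMITER

accessions = "GSE1;GSE2".split(CSV_MULTI_VALUE_DELIMITER)
urls = "https://a.example;https://b.example".split(CSV_MULTI_VALUE_DELIMITER)
raw_flags = "TRUE".split(CSV_MULTI_VALUE_DELIMITER)  # one value short

# All three parallel lists must have the same length to pair up cleanly.
if len({len(accessions), len(urls), len(raw_flags)}) != 1:
    print("Unable to add ambiguous external accessions.")
```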