Merge pull request #474 from AlexsLemonade/dev
Production Deploy
arkid15r authored Nov 10, 2023
2 parents cc05572 + e5b2e36 commit 6ed7164
Showing 28 changed files with 631 additions and 217 deletions.
5 changes: 5 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,5 @@
# For more information on CODEOWNERS, see:
# https://help.github.com/en/articles/about-code-owners

* @arkid15r @davidsmejia
/client/ @davidsmejia @nozomione
13 changes: 13 additions & 0 deletions api/README.md
@@ -145,6 +145,12 @@ If you would like to purge a project and remove its files from the S3 bucket, yo
sportal manage-api purge_project --scpca-id SCPCP000001 --delete-from-s3
```

The `--cleanup-input-data` flag can help you control the projects' input data size. If the flag is set, the input data cleanup process will run for each project right after its processing is over.
```
sportal load-data --cleanup-input-data --reload-all --update-s3
```

The `--cleanup-output-data` flag can help you control the projects' output data size. If the flag is set, the cleanup process for no-longer-needed output data will run for each project right after its processing is over.
```
@@ -160,6 +166,13 @@ This is to help prevent the S3 bucket data from accidentally becoming out of syn
To run a command in production, use the run_command.sh script that is created on the API instance.
It passes any arguments through to `manage.py`, so `./run_command.sh load_data --reload-all` will work nicely.

The following code can be used to process projects one by one with a minimal disk space footprint:
```
for i in $(seq -f "%02g" 1 20); do
./run_command.sh load_data --cleanup-input-data --cleanup-output-data --reload-existing --scpca-project-id SCPCP0000$i
done
```

The `purge_project` command can be run in a similar fashion: `./run_command.sh purge_project --scpca-id SCPCP000001`

## Cloud Deployments
14 changes: 14 additions & 0 deletions api/scpca_portal/management/commands/load_data.py
@@ -63,6 +63,9 @@ class Command(BaseCommand):
to a stack-specific S3 bucket."""

def add_arguments(self, parser):
parser.add_argument(
"--cleanup-input-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
parser.add_argument(
"--cleanup-output-data", action=BooleanOptionalAction, default=settings.PRODUCTION
)
@@ -75,6 +78,7 @@ def add_arguments(self, parser):

def handle(self, *args, **options):
load_data_from_s3(
cleanup_input_data=options["cleanup_input_data"],
cleanup_output_data=options["cleanup_output_data"],
reload_all=options["reload_all"],
reload_existing=options["reload_existing"],
@@ -99,6 +103,7 @@ def cleanup_output_data_dir():

def load_data_from_s3(
allowed_submitters: set = ALLOWED_SUBMITTERS,
cleanup_input_data: bool = False,
cleanup_output_data: bool = False,
input_bucket_name: str = "scpca-portal-inputs",
reload_all: bool = False,
@@ -195,6 +200,11 @@ def load_data_from_s3(
project.save()

project.add_contacts(project_data["contact_email"], project_data["contact_name"])
project.add_external_accessions(
project_data["external_accession"],
project_data["external_accession_url"],
project_data["external_accession_raw"],
)
project.add_publications(project_data["citation"], project_data["citation_doi"])

if project.scpca_id not in os.listdir(common.INPUT_DATA_DIR):
@@ -216,6 +226,10 @@ def load_data_from_s3(
computed_file.s3_key,
)

if cleanup_input_data:
logger.info(f"Cleaning up '{project}' input data")
shutil.rmtree(os.path.join(common.INPUT_DATA_DIR, project.scpca_id), ignore_errors=True)

if cleanup_output_data:
logger.info(f"Cleaning up '{project}' output data")
for computed_file in computed_files:
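The `--cleanup-input-data` and `--cleanup-output-data` arguments above use `argparse.BooleanOptionalAction`, which auto-generates a paired `--no-` flag for each option. A minimal standalone sketch of that behavior (not part of the diff; assumes Python 3.9+, and a hard-coded default stands in for `settings.PRODUCTION`):

```
import argparse

parser = argparse.ArgumentParser()
# BooleanOptionalAction registers both --cleanup-input-data and
# --no-cleanup-input-data for this single option.
parser.add_argument(
    "--cleanup-input-data", action=argparse.BooleanOptionalAction, default=False
)

print(parser.parse_args(["--cleanup-input-data"]).cleanup_input_data)     # True
print(parser.parse_args(["--no-cleanup-input-data"]).cleanup_input_data)  # False
print(parser.parse_args([]).cleanup_input_data)                           # False (the default)
```

Since the real default is `settings.PRODUCTION`, production runs clean up by default while other environments must opt in explicitly.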
31 changes: 31 additions & 0 deletions api/scpca_portal/migrations/0030_auto_20231030_2259.py
@@ -0,0 +1,31 @@
# Generated by Django 3.2.22 on 2023-10-30 22:59

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("scpca_portal", "0029_auto_20221217_0256"),
]

operations = [
migrations.CreateModel(
name="ExternalAccession",
fields=[
("created_at", models.DateTimeField(auto_now_add=True)),
("updated_at", models.DateTimeField(auto_now=True)),
("accession", models.TextField(primary_key=True, serialize=False)),
("has_raw", models.BooleanField(default=False)),
("url", models.TextField()),
],
options={
"db_table": "external_accessions",
},
),
migrations.AddField(
model_name="project",
name="external_accessions",
field=models.ManyToManyField(to="scpca_portal.ExternalAccession"),
),
]
1 change: 1 addition & 0 deletions api/scpca_portal/models/__init__.py
@@ -1,6 +1,7 @@
from scpca_portal.models.api_token import APIToken
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project import Project
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
44 changes: 31 additions & 13 deletions api/scpca_portal/models/computed_file.py
@@ -92,7 +92,8 @@ def get_project_multiplexed_file(cls, project, sample_to_file_mapping, workflow_

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
project.output_multiplexed_metadata_file_path, computed_file.metadata_file_name
@@ -125,14 +126,17 @@ def get_project_single_cell_file(cls, project, sample_to_file_mapping, workflow_
)

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(ComputedFile.README_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME)
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
project.output_single_cell_metadata_file_path, computed_file.metadata_file_name
)

for sample_id, file_paths in sample_to_file_mapping.items():
for file_path in file_paths:
# Nest these under thier sample id.
# Nest these under their sample id.
archive_path = os.path.join(sample_id, os.path.basename(file_path))
zip_file.write(file_path, archive_path)

@@ -158,7 +162,8 @@ def get_project_spatial_file(cls, project, sample_to_file_mapping, workflow_vers

with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
project.output_spatial_metadata_file_path, computed_file.metadata_file_name
@@ -201,7 +206,8 @@ def get_sample_multiplexed_file(
if not os.path.exists(computed_file.zip_file_path):
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_MULTIPLEXED_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
ComputedFile.README_MULTIPLEXED_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
sample.output_multiplexed_metadata_file_path,
@@ -230,7 +236,10 @@ def get_sample_single_cell_file(cls, sample, libraries, workflow_versions):

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(ComputedFile.README_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME)
zip_file.write(
ComputedFile.README_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_file.write(
sample.output_single_cell_metadata_file_path,
ComputedFile.MetadataFilenames.SINGLE_CELL_METADATA_FILE_NAME,
@@ -269,11 +278,12 @@ def get_sample_spatial_file(cls, sample, libraries, workflow_versions):
)

file_paths = []
with ZipFile(computed_file.zip_file_path, "w") as zip_object:
zip_object.write(
ComputedFile.README_SPATIAL_FILE_PATH, ComputedFile.OUTPUT_README_FILE_NAME
with ZipFile(computed_file.zip_file_path, "w") as zip_file:
zip_file.write(
ComputedFile.README_SPATIAL_FILE_PATH,
ComputedFile.OUTPUT_README_FILE_NAME,
)
zip_object.write(
zip_file.write(
sample.output_spatial_metadata_file_path,
ComputedFile.MetadataFilenames.SPATIAL_METADATA_FILE_NAME,
)
@@ -286,7 +296,7 @@ def get_sample_spatial_file(cls, sample, libraries, workflow_versions):
)
)
for item in library_path.rglob("*"): # Add the entire directory contents.
zip_object.write(item, item.relative_to(library_path.parent))
zip_file.write(item, item.relative_to(library_path.parent))
file_paths.append(f"{Path(library_path, item.relative_to(library_path))}")

computed_file.size_in_bytes = os.path.getsize(computed_file.zip_file_path)
@@ -324,10 +334,18 @@ def zip_file_path(self):
def create_download_url(self):
"""Creates a temporary URL from which the file can be downloaded."""
if self.s3_bucket and self.s3_key:
# Append the download date to the filename on download.
date = utils.get_today_string()
filename, ext = os.path.splitext(self.s3_key)

return s3.generate_presigned_url(
ClientMethod="get_object",
Params={"Bucket": self.s3_bucket, "Key": self.s3_key},
ExpiresIn=(60 * 60 * 24 * 7), # 7 days in seconds.
Params={
"Bucket": self.s3_bucket,
"Key": self.s3_key,
"ResponseContentDisposition": f"attachment; filename = {filename}_{date}.{ext}",
},
ExpiresIn=60 * 60 * 24 * 7, # 7 days in seconds.
)

def delete_s3_file(self, force=False):
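A subtlety in `create_download_url` above: `os.path.splitext` keeps the leading dot on the extension, so the `ResponseContentDisposition` filename must be assembled without adding another one. A quick standalone check (the date literal is illustrative, standing in for `utils.get_today_string()`):

```
import os

# splitext returns the extension with its leading dot.
filename, ext = os.path.splitext("SCPCP000001.zip")
print(filename)  # SCPCP000001
print(ext)       # .zip

print(f"attachment; filename={filename}_2023-11-10{ext}")
# attachment; filename=SCPCP000001_2023-11-10.zip
```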
17 changes: 17 additions & 0 deletions api/scpca_portal/models/external_accession.py
@@ -0,0 +1,17 @@
from django.db import models

from scpca_portal.models.base import TimestampedModel


class ExternalAccession(TimestampedModel):
"""External accession."""

class Meta:
db_table = "external_accessions"

accession = models.TextField(primary_key=True)
has_raw = models.BooleanField(default=False)
url = models.TextField()

def __str__(self) -> str:
return self.accession
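A hypothetical Django-shell sketch of how the new model pairs with the `Project.external_accessions` many-to-many field added below; the accession ID and URL are illustrative placeholders:

```
from scpca_portal.models import ExternalAccession, Project

# get_or_create keys on the accession text, which is the primary key.
accession, _ = ExternalAccession.objects.get_or_create(accession="GSE000000")
accession.url = "https://example.com/GSE000000"
accession.has_raw = True
accession.save()

# Assumes at least one project is already loaded.
project = Project.objects.first()
project.external_accessions.add(accession)  # populates the M2M join table
```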
62 changes: 49 additions & 13 deletions api/scpca_portal/models/project.py
@@ -8,10 +8,11 @@

from django.db import models

from scpca_portal import common
from scpca_portal import common, utils
from scpca_portal.models.base import TimestampedModel
from scpca_portal.models.computed_file import ComputedFile
from scpca_portal.models.contact import Contact
from scpca_portal.models.external_accession import ExternalAccession
from scpca_portal.models.project_summary import ProjectSummary
from scpca_portal.models.publication import Publication
from scpca_portal.models.sample import Sample
@@ -48,6 +49,7 @@ class Meta:
unavailable_samples_count = models.PositiveIntegerField(default=0)

contacts = models.ManyToManyField(Contact)
external_accessions = models.ManyToManyField(ExternalAccession)
publications = models.ManyToManyField(Publication)

def __str__(self):
@@ -257,10 +259,10 @@ def combine_multiplexed_metadata(

return combined_metadata, multiplexed_sample_mapping

def add_contacts(self, contact_emails, contact_names):
def add_contacts(self, contact_email, contact_name):
"""Creates and adds project contacts."""
emails = contact_emails.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_names.split(common.CSV_MULTI_VALUE_DELIMITER)
emails = contact_email.split(common.CSV_MULTI_VALUE_DELIMITER)
names = contact_name.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(emails) != len(names):
logger.error("Unable to add ambiguous contacts.")
@@ -277,10 +279,32 @@ def add_contacts(self, contact_emails, contact_names):

self.contacts.add(contact)

def add_publications(self, citations, citation_dois):
def add_external_accessions(
self, external_accession, external_accession_url, external_accession_raw
):
"""Creates and adds project external accessions."""
accessions = external_accession.split(common.CSV_MULTI_VALUE_DELIMITER)
urls = external_accession_url.split(common.CSV_MULTI_VALUE_DELIMITER)
accessions_raw = external_accession_raw.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(set((len(accessions), len(urls), len(accessions_raw)))) != 1:
logger.error("Unable to add ambiguous external accessions.")
return

for idx, accession in enumerate(accessions):
external_accession, _ = ExternalAccession.objects.get_or_create(
accession=accession.strip()
)
external_accession.url = urls[idx].strip()
external_accession.has_raw = utils.boolean_from_string(accessions_raw[idx].strip())
external_accession.save()

self.external_accessions.add(external_accession)

def add_publications(self, citation, citation_doi):
"""Creates and adds project publications."""
citations = citations.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_dois.split(common.CSV_MULTI_VALUE_DELIMITER)
citations = citation.split(common.CSV_MULTI_VALUE_DELIMITER)
dois = citation_doi.split(common.CSV_MULTI_VALUE_DELIMITER)

if len(citations) != len(dois):
logger.error("Unable to add ambiguous publications.")
@@ -451,7 +475,11 @@ def create_single_cell_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_multiplexed_readme_file(self):
@@ -460,7 +488,11 @@ def create_multiplexed_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_MULTIPLEXED_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def create_spatial_readme_file(self):
@@ -469,7 +501,11 @@ def create_spatial_readme_file(self):
readme_template = readme_template_file.read()
with open(ComputedFile.README_SPATIAL_FILE_PATH, "w") as readme_file:
readme_file.write(
readme_template.format(project_accession=self.scpca_id, project_url=self.url)
readme_template.format(
project_accession=self.scpca_id,
project_url=self.url,
date=utils.get_today_string(),
)
)

def get_bulk_rna_seq_sample_ids(self):
@@ -497,20 +533,20 @@ def get_computed_files(
"""Prepares ready for saving project computed files based on generated file mappings."""
computed_files = list()

# The multiplexed and single cell cases are if/else as we produce
# a single computed file for a multiplexed samples project.
if multiplexed_file_mapping:
computed_files.append(
ComputedFile.get_project_multiplexed_file(
self, multiplexed_file_mapping, multiplexed_workflow_versions
)
)
elif single_cell_file_mapping:

if single_cell_file_mapping:
computed_files.append(
ComputedFile.get_project_single_cell_file(
self, single_cell_file_mapping, single_cell_workflow_versions
)
)

if spatial_file_mapping:
computed_files.append(
ComputedFile.get_project_spatial_file(
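The guard in `add_external_accessions` collapses the three parallel list lengths into a set: one distinct length means the lists line up, anything else is ambiguous and gets rejected. A minimal sketch, assuming `common.CSV_MULTI_VALUE_DELIMITER` is a semicolon (an illustrative value):

```
CSV_MULTI_VALUE_DELIMITER = ";"  # stand-in for common.CSV_MULTI_VALUE_DELIMITER

accessions = "GSE1;GSE2".split(CSV_MULTI_VALUE_DELIMITER)
urls = "https://a.example;https://b.example".split(CSV_MULTI_VALUE_DELIMITER)
raw_flags = "TRUE".split(CSV_MULTI_VALUE_DELIMITER)  # one value short

# All three parallel lists must have the same length to pair up cleanly.
if len({len(accessions), len(urls), len(raw_flags)}) != 1:
    print("Unable to add ambiguous external accessions.")
```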