Merge pull request #770 from AlexsLemonade/dev
Production Deploy
davidsmejia authored Jun 20, 2024
2 parents 18438e8 + 7d4d2c3 commit d8c4b9a
Showing 51 changed files with 2,231 additions and 1,158 deletions.
69 changes: 69 additions & 0 deletions api/scpca_portal/common.py
@@ -19,3 +19,72 @@
TEMPLATE_PATH = CODE_PATH / "scpca_portal" / "templates"

TAB = "\t"

IGNORED_INPUT_VALUES = {"", "N/A", "TBD"}
STRIPPED_INPUT_VALUES = "< >"

# Global sort order for Metadata TSVs
# Columns
METADATA_COLUMN_SORT_ORDER = [
"scpca_project_id",
# Sample metadata
"scpca_sample_id",
"scpca_library_id",
"diagnosis",
"subdiagnosis",
"disease_timing",
"age_at_diagnosis",
"sex",
"tissue_location",
"participant_id",
"submitter",
"submitter_id",
"organism",
"development_stage_ontology_term_id",
"sex_ontology_term_id",
"organism_ontology_id",
"self_reported_ethnicity_ontology_term_id",
"disease_ontology_term_id",
"tissue_ontology_term_id",
"*", # Addtional metadata
# Library metadata
"seq_unit",
"technology",
"demux_samples",
"total_reads",
"mapped_reads",
"sample_cell_count_estimate",
"sample_cell_estimates", # ONLY FOR MULTIPLEXED
"unfiltered_cells",
"filtered_cell_count",
"processed_cells",
"has_cellhash",
"includes_anndata",
"is_cell_line",
"is_multiplexed",
"is_xenograft",
# Project metadata
"pi_name",
"project_title",
# Processing metadata
"genome_assembly",
"mapping_index",
"spaceranger_version", # FOR SPATIAL ONLY
"alevin_fry_version", # REMOVED FOR SPATIAL
"salmon_version", # REMOVED FOR SPATIAL
"transcript_type", # REMOVED FOR SPATIAL
"droplet_filtering_method", # REMOVED FOR SPATIAL
"cell_filtering_method", # REMOVED FOR SPATIAL
"prob_compromised_cutoff", # REMOVED FOR SPATIAL
"min_gene_cutoff", # REMOVED FOR SPATIAL
"normalization_method", # REMOVED FOR SPATIAL
"demux_method", # ONLY FOR MULTIPLEXED
"date_processed",
"workflow",
"workflow_version",
"workflow_commit",
]
# Rows
PROJECT_ID_KEY = "scpca_project_id"
SAMPLE_ID_KEY = "scpca_sample_id"
LIBRARY_ID_KEY = "scpca_library_id"
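
The "*" entry marks where any additional, unlisted metadata columns fall in the global column order. As a minimal sketch of that idea (assuming behavior roughly like utils.get_sorted_field_names, whose implementation is not part of this diff), unknown columns can be slotted in at the wildcard position:

def sorted_field_names(field_names):
    # Sketch only: the portal's actual helper is utils.get_sorted_field_names.
    # Known columns keep their listed position; unknown columns take the
    # position of the "*" wildcard, alphabetized among themselves.
    order = METADATA_COLUMN_SORT_ORDER
    wildcard_index = order.index("*")

    def sort_key(name):
        if name in order:
            return (order.index(name), "")
        return (wildcard_index, name)

    return sorted(field_names, key=sort_key)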
99 changes: 31 additions & 68 deletions api/scpca_portal/management/commands/load_data.py
@@ -1,4 +1,3 @@
import csv
import logging
import shutil
import subprocess
@@ -12,8 +11,8 @@
import boto3
from botocore.client import Config

from scpca_portal import common, utils
from scpca_portal.models import Project
from scpca_portal import common, metadata_file
from scpca_portal.models import Contact, ExternalAccession, Project, Publication

ALLOWED_SUBMITTERS = {
"christensen",
@@ -139,52 +138,13 @@ def add_arguments(self, parser):
"--update-s3", action=BooleanOptionalAction, default=settings.UPDATE_S3_DATA
)

def clean_up_input_data(self):
shutil.rmtree(common.INPUT_DATA_PATH / self.project.scpca_id, ignore_errors=True)
def clean_up_input_data(self, project):
shutil.rmtree(common.INPUT_DATA_PATH / project.scpca_id, ignore_errors=True)

def handle(self, *args, **kwargs):
self.configure_aws_cli(**kwargs)
self.load_data(**kwargs)

def process_project_data(self, data, sample_id, **kwargs):
self.project.abstract = data["abstract"]
self.project.additional_restrictions = data["additional_restrictions"]
self.project.has_bulk_rna_seq = utils.boolean_from_string(data.get("has_bulk", False))
self.project.has_cite_seq_data = utils.boolean_from_string(data.get("has_CITE", False))
self.project.has_multiplexed_data = utils.boolean_from_string(
data.get("has_multiplex", False)
)
self.project.has_spatial_data = utils.boolean_from_string(data.get("has_spatial", False))
self.project.human_readable_pi_name = data["PI"]
self.project.includes_anndata = utils.boolean_from_string(
data.get("includes_anndata", False)
)
self.project.includes_cell_lines = utils.boolean_from_string(
data.get("includes_cell_lines", False)
)
self.project.includes_merged_anndata = utils.boolean_from_string(
data.get("includes_merged_anndata", False)
)
self.project.includes_merged_sce = utils.boolean_from_string(
data.get("includes_merged_sce", False)
)
self.project.includes_xenografts = utils.boolean_from_string(
data.get("includes_xenografts", False)
)
self.project.pi_name = data["submitter"]
self.project.title = data["project_title"]
self.project.save()

self.project.add_contacts(data["contact_email"], data["contact_name"])
self.project.add_external_accessions(
data["external_accession"],
data["external_accession_url"],
data["external_accession_raw"],
)
self.project.add_publications(data["citation"], data["citation_doi"])

self.project.load_data(sample_id=sample_id, **kwargs)

def load_data(
self,
allowed_submitters: set[str] = None,
@@ -215,9 +175,9 @@ def load_data(
if project_path.is_dir()
}

with open(Project.get_input_project_metadata_file_path()) as project_csv:
project_list = list(csv.DictReader(project_csv))

project_list = metadata_file.load_projects_metadata(
Project.get_input_project_metadata_file_path()
)
for project_data in project_list:
scpca_project_id = project_data["scpca_project_id"]
if project_id and project_id != scpca_project_id:
@@ -229,34 +189,37 @@
)
return

if project_data["submitter"] not in allowed_submitters:
logger.warning("Project submitter is not in the white list.")
continue

# Purge existing projects so they can be re-added.
if (project := Project.objects.filter(scpca_id=scpca_project_id).first()) and (
kwargs["reload_all"] or kwargs["reload_existing"]
):
logger.info(f"Purging '{project}")
project.purge(delete_from_s3=kwargs["update_s3"])

# Only import new projects. If old ones are desired they should be purged and re-added.
project, created = Project.objects.get_or_create(scpca_id=scpca_project_id)
if not created:
logger.info(f"'{project}' already exists. Use --reload-existing to re-import.")
if project_data["pi_name"] not in allowed_submitters:
logger.warning("Project submitter is not in the white list.")
continue

self.project = Project.objects.filter(scpca_id=scpca_project_id).first()
logger.info(f"Importing '{self.project}' data")
self.process_project_data(project_data, sample_id, **kwargs)
if samples_count := self.project.samples.count():
if project := Project.objects.filter(scpca_id=scpca_project_id).first():
# Purge existing projects so they can be re-added.
if kwargs["reload_all"] or kwargs["reload_existing"]:
logger.info(f"Purging '{project}")
project.purge(delete_from_s3=kwargs["update_s3"])
# Only import new projects.
# If old ones are desired they should be purged and re-added.
else:
logger.info(f"'{project}' already exists. Use --reload-existing to re-import.")
continue

logger.info(f"Importing '{project}' data")
project = Project.get_from_dict(project_data)
project.save()
Contact.bulk_create_from_project_data(project_data, project)
ExternalAccession.bulk_create_from_project_data(project_data, project)
Publication.bulk_create_from_project_data(project_data, project)

project.load_data(sample_id=sample_id, **kwargs)
if samples_count := project.samples.count():
logger.info(
f"Created {samples_count} sample{pluralize(samples_count)} for '{self.project}'"
f"Created {samples_count} sample{pluralize(samples_count)} for '{project}'"
)

if kwargs["clean_up_input_data"]:
logger.info(f"Cleaning up '{project}' input data")
self.clean_up_input_data()
self.clean_up_input_data(project)

if kwargs["clean_up_output_data"]:
logger.info("Cleaning up output directory")
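After the refactor, one pass per project handles the purge, project creation, related-object creation, and data load. A hedged usage sketch of invoking the command (the option names are inferred from the kwargs above, e.g. kwargs["reload_existing"] and the --update-s3 argument; treat them as assumptions):

from django.core.management import call_command

# Re-import an existing project, refreshing its data on S3 as well.
call_command("load_data", reload_existing=True, update_s3=True)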
112 changes: 112 additions & 0 deletions api/scpca_portal/metadata_file.py
@@ -0,0 +1,112 @@
import csv
import json
from collections import namedtuple
from pathlib import Path
from typing import Dict, List, Tuple

from scpca_portal import common, utils

PROJECT_METADATA_KEYS = [
# Fields used in Project model object creation
("has_bulk", "has_bulk_rna_seq", False),
("has_CITE", "has_cite_seq_data", False),
("has_multiplex", "has_multiplexed_data", False),
("has_spatial", "has_spatial_data", False),
("PI", "human_readable_pi_name", None),
("submitter", "pi_name", None),
("project_title", "title", None),
# Fields used in Contact model object creation
("contact_email", "email", None),
("contact_name", "name", None),
# Fields used in ExternalAccession model object creation
("external_accession", "accession", None),
("external_accession_raw", "has_raw", False),
("external_accession_url", "accession_url", None),
# Field used in Publication model object creation
("citation_doi", "doi", None),
]

SAMPLE_METADATA_KEYS = [
("age", "age_at_diagnosis", None),
]

LIBRARY_METADATA_KEYS = [
("library_id", "scpca_library_id", None),
("sample_id", "scpca_sample_id", None),
# Field only included in Single cell (and Multiplexed) libraries
("filtered_cells", "filtered_cell_count", None),
]
KeyTransform = namedtuple("KeyTransform", ["old_key", "new_key", "default_value"])


def load_projects_metadata(metadata_file_path: Path):
"""
Opens, loads, and parses the list of project metadata located at the given metadata_file_path.
Transforms keys in the data dicts to match the associated model attributes.
"""
with open(metadata_file_path) as raw_file:
data_dicts = list(csv.DictReader(raw_file))

for data_dict in data_dicts:
transform_keys(data_dict, PROJECT_METADATA_KEYS)

return data_dicts


def load_samples_metadata(metadata_file_path: Path):
"""
Opens, loads, and parses the list of sample metadata located at the given metadata_file_path.
Transforms keys in the data dicts to match the associated model attributes.
"""
with open(metadata_file_path) as raw_file:
data_dicts = list(csv.DictReader(raw_file))

for data_dict in data_dicts:
transform_keys(data_dict, SAMPLE_METADATA_KEYS)

return data_dicts


def load_library_metadata(metadata_file_path: Path):
"""
Opens, loads, and parses a single library's metadata located at the given metadata_file_path.
Transforms keys in the data dict to match the associated model attributes.
"""
with open(metadata_file_path) as raw_file:
return transform_keys(json.load(raw_file), LIBRARY_METADATA_KEYS)


def transform_keys(data_dict: Dict, key_transforms: List[Tuple]):
"""
Transforms keys in the given data dict according to the given list of key-transform tuples.
"""
for element in [KeyTransform._make(element) for element in key_transforms]:
if element.old_key in data_dict:
data_dict[element.new_key] = data_dict.pop(element.old_key, element.default_value)

return data_dict
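
For illustration, here is what transform_keys does to one raw project row (the field values below are hypothetical):

row = {"has_bulk": "TRUE", "PI": "Smith", "project_title": "Example project"}
transform_keys(row, PROJECT_METADATA_KEYS)
# row is now (keys renamed to model attribute names, values untouched):
# {"has_bulk_rna_seq": "TRUE", "human_readable_pi_name": "Smith", "title": "Example project"}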


def write_metadata_dicts(list_of_dicts: List[Dict], output_file_path: str, **kwargs) -> None:
"""
Writes a list of dictionaries to a csv-like file.
Optional modifiers to the csv.DictWriter can be passed to function as kwargs.
"""
kwargs["fieldnames"] = kwargs.get(
"fieldnames", utils.get_sorted_field_names(utils.get_keys_from_dicts(list_of_dicts))
)
kwargs["delimiter"] = kwargs.get("delimiter", common.TAB)

sorted_list_of_dicts = sorted(
list_of_dicts,
key=lambda k: (
k[common.PROJECT_ID_KEY],
k[common.SAMPLE_ID_KEY],
k[common.LIBRARY_ID_KEY],
),
)

with open(output_file_path, "w", newline="") as raw_file:
csv_writer = csv.DictWriter(raw_file, **kwargs)
csv_writer.writeheader()
csv_writer.writerows(sorted_list_of_dicts)
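
A hypothetical call (the IDs and file name are made up; real rows would also carry the metadata columns from common.METADATA_COLUMN_SORT_ORDER):

rows = [
    {"scpca_project_id": "SCPCP000001", "scpca_sample_id": "SCPCS000002",
     "scpca_library_id": "SCPCL000002", "diagnosis": "neuroblastoma"},
    {"scpca_project_id": "SCPCP000001", "scpca_sample_id": "SCPCS000001",
     "scpca_library_id": "SCPCL000001", "diagnosis": "neuroblastoma"},
]
write_metadata_dicts(rows, "metadata.tsv")
# The output is tab-delimited, rows sorted by (project, sample, library) ID,
# with column order supplied by utils.get_sorted_field_names.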
60 changes: 60 additions & 0 deletions api/scpca_portal/migrations/0043_auto_20240524_1459.py
@@ -0,0 +1,60 @@
# Generated by Django 3.2.25 on 2024-05-24 14:59

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
("scpca_portal", "0042_auto_20240423_2045"),
]

operations = [
migrations.CreateModel(
name="Library",
fields=[
(
"id",
models.AutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name="ID"
),
),
("created_at", models.DateTimeField(auto_now_add=True)),
("updated_at", models.DateTimeField(auto_now=True)),
(
"formats",
django.contrib.postgres.fields.ArrayField(
base_field=models.TextField(
choices=[
("ANN_DATA", "AnnData"),
("SINGLE_CELL_EXPERIMENT", "Single cell experiment"),
]
),
default=list,
size=None,
),
),
("is_multiplexed", models.BooleanField(default=False)),
("metadata", models.JSONField(default=dict)),
(
"modality",
models.TextField(
choices=[("SINGLE_CELL", "Single Cell"), ("SPATIAL", "Spatial")]
),
),
("scpca_id", models.TextField(unique=True)),
("workflow_version", models.TextField()),
],
options={
"db_table": "libraries",
"ordering": ["updated_at"],
"get_latest_by": "updated_at",
},
),
migrations.AddField(
model_name="sample",
name="libraries",
field=models.ManyToManyField(to="scpca_portal.Library"),
),
]
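
For context, a sketch of how the new relation could be queried once this migration runs (it assumes Sample is exposed from scpca_portal.models; the query itself is illustrative):

from scpca_portal.models import Sample

# All spatial libraries attached to a given sample.
sample = Sample.objects.first()
spatial_libraries = sample.libraries.filter(modality="SPATIAL")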