-
Notifications
You must be signed in to change notification settings - Fork 34
Serratus SQL Database Management
serratus-aurora
is the main instance. It is an Aurora Serverless instance and can be scaled up to meet ingestion demand during batch uploads. This instance "sleeps" at 0 capacity units while inactive.
Periodically, a publicly accessible, always-on, Aurora Provisioned instance is created from a snapshot of serratus-aurora
. See Creating a Public Instance.
- Serratus summary info. Created by serratus-summary-uploader.
- Nucleotide
- nsra
- nfamily
- nsequence
- Protein
- psra
- pfamily
- protein
- psequence
- RdRP
- rsra
- rphylum
- rfamily
- rsequence
- Nucleotide
- BioSample/SRA metadata. Created by biosample-sql and sraruninfo-sql.
- biosample: all BioSamples (as of 2021/02/23) with geospatial data extracted if available
- biosample_geocode: internal table for mapping geospatial text values to x/y coordinate values
- biosample_geo_coordinates: all from biosample with x/y coordinates combined with biosample_geocode
- srarun: metadata for all SRA run accessions in the Serratus search space
- srarun_geo_coordinates: all from srarun with x/y coordinates from biosample_geo_coordinates
- rdrp_pos: uploaded from Artem's rdrp_pos.csv
-
analysis_index_uploaded_cols: uploaded from Artem's index for
assembly
,micro
serratus
is the main user. Password stored in Secrets Manager. This user is used for table curation and creating the other users below.
-- revoke default access for all users
REVOKE ALL ON SCHEMA public FROM public;
-- read-only group
CREATE ROLE viewer NOSUPERUSER NOINHERIT NOCREATEDB NOCREATEROLE NOREPLICATION;
GRANT CONNECT ON DATABASE summary TO viewer;
GRANT USAGE ON SCHEMA public TO viewer;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO viewer; -- re-run any time a table/view is replaced
-- users
CREATE USER public_reader WITH PASSWORD 'serratus' IN ROLE viewer;
CREATE USER tantalus WITH PASSWORD 'serratus' IN ROLE viewer;
CREATE USER web_api WITH PASSWORD 'serratus' IN ROLE viewer;
CREATE MATERIALIZED VIEW nfamily_counts AS
SELECT family_name, score, percent_identity, COUNT(*)
FROM nfamily
GROUP BY family_name, score, percent_identity;
CREATE MATERIALIZED VIEW nsequence_counts AS
SELECT sequence_accession, score, percent_identity, COUNT(*)
FROM nsequence
GROUP BY sequence_accession, score, percent_identity;
CREATE MATERIALIZED VIEW nfamily_list AS
SELECT family_name
FROM nfamily
GROUP BY family_name
ORDER BY family_name;
CREATE MATERIALIZED VIEW nsequence_list AS
SELECT sequence_accession, virus_name
FROM nsequence
GROUP BY sequence_accession, virus_name
ORDER BY sequence_accession;
CREATE MATERIALIZED VIEW analysis_index AS
SELECT
run_id,
s.run_id IS NOT NULL AS srarun,
nsra.run_id IS NOT NULL AS nsra,
psra.run_id IS NOT NULL AS psra,
sgc.run_id IS NOT NULL AS geo,
upl.micro IS TRUE AS micro,
upl.assembly IS TRUE AS assembly
FROM (SELECT run AS run_id FROM srarun) AS s
FULL JOIN nsra USING (run_id)
FULL JOIN rsra USING (run_id)
FULL JOIN psra USING (run_id)
FULL JOIN srarun_geo_coordinates sgc USING (run_id)
FULL JOIN analysis_index_uploaded_cols upl USING (run_id);
CREATE MATERIALIZED VIEW rphylum_counts AS
SELECT phylum_name, score, percent_identity, COUNT(*)
FROM rphylum
GROUP BY phylum_name, score, percent_identity;
CREATE MATERIALIZED VIEW rfamily_counts AS
SELECT family_name, score, percent_identity, COUNT(*)
FROM rfamily
GROUP BY family_name, score, percent_identity;
CREATE MATERIALIZED VIEW rsequence_counts AS
SELECT sequence_accession, score, percent_identity, COUNT(*)
FROM rsequence
GROUP BY sequence_accession, score, percent_identity;
CREATE MATERIALIZED VIEW rphylum_list AS
SELECT phylum_name
FROM rphylum
GROUP BY phylum_name
ORDER BY phylum_name;
CREATE MATERIALIZED VIEW rfamily_list AS
SELECT family_name
FROM rfamily
GROUP BY family_name
ORDER BY family_name;
CREATE MATERIALIZED VIEW rsequence_list AS
SELECT sequence_accession, virus_name
FROM rsequence
GROUP BY sequence_accession, virus_name
ORDER BY sequence_accession;
CREATE MATERIALIZED VIEW biosample_geo_coordinates AS
SELECT biosample_id,
coordinate_x,
coordinate_y,
b.geo_text_extracted AS from_text
FROM biosample b
INNER JOIN biosample_geocode bgeo
ON (b.geo_text_extracted = bgeo.geo_text_extracted)
WHERE coordinate_x IS NOT NULL
UNION ALL
SELECT biosample_id,
geo_coordinate_x AS coordinate_x,
geo_coordinate_y AS coordinate_y,
NULL AS FROM_text
FROM biosample
WHERE geo_coordinate_x IS NOT NULL;
CREATE MATERIALIZED VIEW srarun_geo_coordinates AS
SELECT run AS run_id,
bio_sample AS biosample_id,
release_date,
coordinate_x, coordinate_y, from_text
FROM srarun
INNER JOIN biosample_geo_coordinates bgeo
ON (srarun.bio_sample = bgeo.biosample_id);
When referenced tables are replaced, these views should be dropped and recreated. REFRESH MATERIALIZED VIEW
doesn't seem to work in this case.
Sometimes, biosample_geocode
may be uploaded with text data type for numeric columns. If that happens, run this before creating views:
ALTER TABLE biosample_geocode
ALTER COLUMN coordinate_x TYPE DOUBLE PRECISION,
ALTER COLUMN coordinate_y TYPE DOUBLE PRECISION;
Indexes are used for optimizing queries.
CREATE INDEX nfamily_run_id_index ON nfamily (run_id);
CREATE INDEX nfamily_family_name_index ON nfamily (family_name);
CREATE INDEX nfamily_score_index ON nfamily (score);
CREATE INDEX nfamily_percent_identity_index ON nfamily (percent_identity);
CREATE INDEX nsequence_run_id_index ON nsequence (run_id);
CREATE INDEX nsequence_sequence_accession_index ON nsequence (sequence_accession);
CREATE INDEX nsequence_score_index ON nsequence (score);
CREATE INDEX nsequence_percent_identity_index ON nsequence (percent_identity);
CREATE INDEX nsequence_sequence_accession_score_index ON nsequence (sequence_accession, score);
-- views
CREATE INDEX nfamily_counts_family_name_index ON nfamily_counts (family_name);
CREATE INDEX nsequence_counts_sequence_accession_index ON nsequence_counts (sequence_accession);
CREATE INDEX rphylum_run_id_index ON rphylum (run_id);
CREATE INDEX rphylum_phylum_name_index ON rphylum (phylum_name);
CREATE INDEX rphylum_score_index ON rphylum (score);
CREATE INDEX rphylum_percent_identity_index ON rphylum (percent_identity);
CREATE INDEX rfamily_run_id_index ON rfamily (run_id);
CREATE INDEX rfamily_family_name_index ON rfamily (family_name);
CREATE INDEX rfamily_score_index ON rfamily (score);
CREATE INDEX rfamily_percent_identity_index ON rfamily (percent_identity);
CREATE INDEX rsequence_run_id_index ON rsequence (run_id);
CREATE INDEX rsequence_sequence_accession_index ON rsequence (sequence_accession);
CREATE INDEX rsequence_score_index ON rsequence (score);
CREATE INDEX rsequence_percent_identity_index ON rsequence (percent_identity);
-- views
CREATE INDEX rphylum_counts_phylum_name_index ON rphylum_counts (phylum_name);
CREATE INDEX rfamily_counts_family_name_index ON rfamily_counts (family_name);
CREATE INDEX rsequence_counts_sequence_accession_index ON rsequence_counts (sequence_accession);
CREATE INDEX srarun_run_index ON srarun (run);
CREATE INDEX srarun_bio_sample_index ON srarun (bio_sample);
CREATE INDEX biosample_geo_coordinates_biosample_id_index ON biosample_geo_coordinates (biosample_id);
CREATE INDEX srarun_geo_coordinates_run_id_index ON srarun_geo_coordinates (run_id);
CREATE INDEX rdrp_pos_run_id_index ON rdrp_pos (run_id);
CREATE INDEX analysis_index_run_id_index ON analysis_index (run_id);
export ORIGINAL_CLUSTER_ID="serratus-aurora"
export SNAPSHOT_ID="serratus-aurora-20210315"
export RESTORE_CLUSTER_ID="serratus-aurora-20210315"
export RESTORE_INSTANCE_ID="serratus-aurora-20210315-main"
# create snapshot
aws rds create-db-cluster-snapshot \
--db-cluster-snapshot-identifier $SNAPSHOT_ID \
--db-cluster-identifier $ORIGINAL_CLUSTER_ID
# restore from snapshot
aws rds restore-db-cluster-from-snapshot \
--db-cluster-identifier $RESTORE_CLUSTER_ID \
--snapshot-identifier $SNAPSHOT_ID \
--db-subnet-group-name default-vpc-025ef5ccc841b5b86 \
--engine-mode provisioned \
--engine aurora-postgresql \
--vpc-security-group-ids sg-07e2ad8dccb1d4ba6 \
--engine-version 10.14
aws rds create-db-instance \
--db-instance-identifier $RESTORE_INSTANCE_ID \
--db-instance-class db.t3.medium \
--engine aurora-postgresql \
--db-cluster-identifier $RESTORE_CLUSTER_ID \
--publicly-accessible