Skip to content

Commit

Permalink
Merge pull request #36 from Ensembl/hotfix/alter_unreleased_logic
Browse files Browse the repository at this point in the history
Fetch unreleased data only if required internally (API changes)
  • Loading branch information
marcoooo authored Sep 21, 2023
2 parents c31c87d + 461a95f commit 41a4791
Show file tree
Hide file tree
Showing 13 changed files with 228 additions and 137 deletions.
192 changes: 109 additions & 83 deletions src/ensembl/production/metadata/api/genome.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
6 7e8ed3a8-d724-4cba-92e1-e968719b7a18 \N GCA_000146045.2 chromosome R64-1-1 \N R64-1-1 \N 2023-05-12 13:32:46 R64-1-1 \N 0 \N
7 f7de35c9-e0e8-4e81-b186-2962098d6361 \N GCA_000002985.3 chromosome WBcel235 \N WBcel235 \N 2023-05-12 13:32:52 WBcel235 \N 0 \N
8 eeaaa233-151c-4848-8b85-a05a9993101e \N GCA_000001499.28 chromosome GRCh38 t2t \N GRCh38 t2t \N 2023-09-07 14:30:58 GRCh38_t2t \N 1 \N
9 34372aad-5bb1-4304-8a13-28cb4afc601e \N GCA_000001735.1 chromosome TAIR10 \N TAIR10 \N 2023-08-18 12:22:34 TAIR10 \N 1 \N
Original file line number Diff line number Diff line change
Expand Up @@ -2021,4 +2021,5 @@
2021 IV 7 BX284604.4 1 17493829 \N SO:0000738 \N \N
2022 V 7 BX284605.5 1 20924180 \N SO:0000738 \N \N
2023 X 7 BX284606.5 1 17718942 \N SO:0000738 \N \N
2024 MtDNA 7 X54252.1 1 13794 \N SO:0000737 \N \N
2024 MtDNA 7 X54252.1 1 13794 \N SO:0000737 \N \N
2025 Mt 6 Mt 1 366924 \N SO:0000737 \N \N
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,5 @@
69 genebuild.last_geneset_update last_geneset_update last_geneset_update string
70 genebuild.version version version string
71 sample.gene_param sample.gene_param sample.gene_param string
72 sample.location_param sample.location_param sample.location_param string
72 sample.location_param sample.location_param sample.location_param string
73 assembly.date assembly.date assembly.date string
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@
42 ea044d8e-33f1-4c9f-9b9f-8c0bd1dcf642 6 homologies \N 2023-06-02 13:32:52 11 Manual Add Submitted
44 feaa37ea-4217-4d9d-afca-599bdae11b36 2 genebuild \N 2023-09-12 13:32:52.00 7 2023-05 Submitted
45 feaa37ea-4217-4d9d-afca-600bdae11b36 1 asssembly \N 2023-09-12 13:32:52.00 7 2023-05 Submitted
46 385f1ec2-bd06-40ce-873a-98e199f10505 1 asssembly \N 2023-08-18 12:22:34 13 GCA_000001735.1 Submitted

Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,4 @@
339 57.7 56 38
399 17461 55 42
400 87.4 56 42
401 2008-04 73 46
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
10 variation gathered_data_1
11 compara gathered_data_2
12 regulation gathered_data_3
13 core arabidopsis_thaliana_core_57_110_11
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
4 evidence Variation Evidence Variation Annotation \N \N
5 regulation_build Regulations Regulatory Annotation \N \N
6 homologies Comparative homologies Comparative Annotation \N \N
7 regulatory_features Regulations Regulatory Annotation \N \N
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
6 a733574a-93e7-11ec-a39d-005056b38ce3 6 5 2023-05-12 13:32:46 0
7 a733550b-93e7-11ec-a39d-005056b38ce3 7 6 2023-05-12 13:32:52 0
8 a7335667-93e7-11ec-a39d-00aasab38ce3 8 1 2023-09-07 16:30:58 0

9 90720316-006c-470b-a7dd-82d28f952264 9 8 2023-08-18 12:22:34 0
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@
56 42 7 1 1
57 44 7 \N 0
58 45 7 \N 0
59 46 9 \N 0
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
5 0dc46f87-0b61-403a-8cd3-86b7e0cce8f0 559292 4932 Saccharomyces cerevisiae S288C Saccharomyces cerevisiae S288c Saccharomyces_cerevisiae saccharomyces_cerevisiae \N
6 0f4aad7b-db15-4a72-af1e-82bbae54226 6239 6239 Caenorhabditis elegans (PRJNA13758) N2 Caenorhabditis elegans Caenorhabditis_elegans caenorhabditis_elegans \N
7 dbbsaf09-2db8-429b-a407-c15a4ca2876d 9606 9606 Human T2T \N Homo sapiens Homo_sapiens homo_sapiens_t2t \N
8 02b934c5-83af-4b3c-9fc1-5a0f01823396 3702 3702 thale-cress \N Arabidopsis thaliana arabidopsis_thaliana arabidopsis_thaliana \N
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
9 0 3 7 3
10 0 4 7 4
11 0 5 7 5
12 0 6 7 6
12 0 6 7 6
13 0 8 4 1
155 changes: 105 additions & 50 deletions src/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,44 +29,64 @@ def test_load_database(self, multi_dbs):
db_test = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url)
assert db_test, "DB should not be empty"

def fetch_all_genomes(self, multi_dbs):
    """Assert that ``ReleaseAdaptor.fetch_genomes()`` called without filters returns seven rows."""
    # NOTE(review): the name has no ``test_`` prefix, so default pytest
    # discovery will presumably skip it -- confirm whether that is intended.
    metadata_url = multi_dbs['ensembl_metadata'].dbc.url
    genomes = ReleaseAdaptor(metadata_url).fetch_genomes()
    assert len(genomes) == 7
@pytest.mark.parametrize(
    "allow_unreleased, unreleased_only, current_only, output_count",
    [
        # released and unreleased together (7 released + 2 unreleased)
        (True, False, True, 9),
        # released genomes only, with current_only switched off
        (False, False, False, 7),
        # released genomes only, with current_only switched on
        (False, False, True, 6),
        # unreleased genomes only
        (False, True, True, 2),
    ]
)
def test_fetch_all_genomes(
        self,
        multi_dbs,
        allow_unreleased,
        unreleased_only,
        current_only,
        output_count,
):
    """Check the number of rows ``fetch_genomes`` returns for each release-filter combination.

    Each parametrized case supplies the three release-related flags and the
    row count expected from the test database.
    """
    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    release_filters = {
        "allow_unreleased": allow_unreleased,
        "unreleased_only": unreleased_only,
        "current_only": current_only,
    }
    genomes = adaptor.fetch_genomes(**release_filters)
    assert len(genomes) == output_count

def fetch_with_all_args_no_conflict(self, multi_dbs):
conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url)
def test_fetch_with_all_args_no_conflict(self, multi_dbs):
    """Call ``fetch_genomes`` with every filter set to C. elegans values and expect an empty result."""
    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    # NOTE(review): the filters agree with one another, yet no row is expected;
    # presumably the release filters exclude this genome in the test DB -- confirm.
    filters = dict(
        genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3",
        assembly_accession="GCA_000002985.3",
        assembly_name="WBcel235",
        ensembl_name="caenorhabditis_elegans",
        taxonomy_id="6239",
        group="EnsemblMetazoa",
        unreleased_only=False,
        allow_unreleased=False,
        site_name="Ensembl",
        release_type="integrated",
        release_version="108.0",
        current_only=True,
    )
    genomes = adaptor.fetch_genomes(**filters)
    assert len(genomes) == 0

def fetch_with_all_args_conflict(self, multi_dbs):
conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url)
def test_fetch_with_all_args_conflict(self, multi_dbs):
    """Call ``fetch_genomes`` with contradictory filters and expect an empty result.

    The genome UUID, assembly and ensembl name describe C. elegans, while
    ``taxonomy_id`` and ``group`` deliberately point elsewhere.
    """
    conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
                         taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url)
    test = conn.fetch_genomes(
        genome_uuid="a733550b-93e7-11ec-a39d-005056b38ce3",
        assembly_accession="GCA_000002985.3",
        assembly_name="WBcel235",
        ensembl_name="caenorhabditis_elegans",
        taxonomy_id="9606",  # Conflicting taxonomy_id
        group="EnsemblBacteria",  # Conflicting group
        unreleased_only=False,
        allow_unreleased=False,
        site_name="Ensembl",
        release_type="integrated",
        release_version="108.0",
        current_only=True
    )
    # Only the emptiness check is kept: indexing ``test[0]`` on the expected
    # empty result would raise IndexError before this assertion could run.
    assert len(test) == 0

def test_fetch_releases(self, multi_dbs):
conn = ReleaseAdaptor(multi_dbs['ensembl_metadata'].dbc.url)
Expand Down Expand Up @@ -106,7 +126,6 @@ def test_fetch_genomes(self, multi_dbs):
test = conn.fetch_genomes(genome_uuid='a7335667-93e7-11ec-a39d-005056b38ce3')
assert test[0].Organism.scientific_name == 'Homo sapiens'


# def test_fetch_genomes_by_group_division(self, multi_dbs):
# conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
# taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url)
Expand All @@ -116,7 +135,6 @@ def test_fetch_genomes(self, multi_dbs):
# Other PR will likely change this drastically, so the effort is not really necessary. There are 7 groups.
# assert division_filter in division_results


def test_fetch_genomes_by_genome_uuid(self, multi_dbs):
conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url)
Expand Down Expand Up @@ -206,44 +224,37 @@ def test_fetch_sequences_chromosomal_only(self, multi_dbs):
)
assert test[-1].AssemblySequence.chromosomal == 1

def test_fetch_genome_dataset_default_topic_assembly(self, multi_dbs):
    """Check that the first dataset fetched by genome UUID has the 'Core Annotation' topic."""
    genome_uuid = 'a73357ab-93e7-11ec-a39d-005056b38ce3'
    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    datasets = adaptor.fetch_genome_datasets(genome_uuid=genome_uuid)
    first_row = datasets[0]
    assert first_row.DatasetType.topic == 'Core Annotation'

def test_fetch_genome_dataset_uuid(self, multi_dbs):
    """Fetch a 'genebuild' dataset by its UUID and check the first row carries that UUID."""
    wanted_dataset_uuid = '0dc05c6e-2910-4dbd-879a-719ba97d5824'
    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    datasets = adaptor.fetch_genome_datasets(
        dataset_uuid=wanted_dataset_uuid,
        dataset_name='genebuild',
    )
    assert datasets[0].Dataset.dataset_uuid == wanted_dataset_uuid

def test_fetch_genome_dataset_genome_uuid(self, multi_dbs):
    """Fetch datasets by genome UUID and check the first row belongs to that genome."""
    wanted_genome_uuid = 'a73357ab-93e7-11ec-a39d-005056b38ce3'
    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    datasets = adaptor.fetch_genome_datasets(genome_uuid=wanted_genome_uuid)
    assert datasets[0].Genome.genome_uuid == wanted_genome_uuid

def test_fetch_genome_datasets(self, multi_dbs):
@pytest.mark.parametrize(
    "genome_uuid, dataset_uuid, allow_unreleased, unreleased_only, expected_dataset_uuid, expected_count",
    [
        # nothing specified + allow_unreleased -> fetches everything
        (None, None, True, False, "559d7660-d92d-47e1-924e-e741151c2cef", 33),
        # specifying genome_uuid
        ("a73357ab-93e7-11ec-a39d-005056b38ce3", None, False, False, "b4ff55e3-d06a-4772-bb13-81c3207669e3", 5),
        # specifying dataset_uuid
        (None, "0dc05c6e-2910-4dbd-879a-719ba97d5824", False, False, "0dc05c6e-2910-4dbd-879a-719ba97d5824", 1),
        # fetch unreleased datasets only
        (None, None, False, True, "feaa37ea-4217-4d9d-afca-600bdae11b36", 3),
    ]
)
def test_fetch_genome_dataset_all(
        self, multi_dbs, genome_uuid,
        dataset_uuid, allow_unreleased,
        unreleased_only, expected_dataset_uuid,
        expected_count
):
    """Check ``fetch_genome_datasets`` across genome/dataset/release filter combinations.

    Each case passes its filters with ``dataset_name="all"`` and verifies both
    the number of rows returned and the dataset UUID of the first row.
    """
    conn = GenomeAdaptor(metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
                         taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url)
    test = conn.fetch_genome_datasets(
        genome_uuid=genome_uuid,
        dataset_uuid=dataset_uuid,
        unreleased_only=unreleased_only,
        allow_unreleased=allow_unreleased,
        # fetch all datasets (default: dataset_name="assembly")
        dataset_name="all"
    )
    # Count first, so an unexpectedly empty result fails with a clear
    # assertion instead of an IndexError on ``test[0]``.
    assert len(test) == expected_count
    assert test[0].Dataset.dataset_uuid == expected_dataset_uuid

@pytest.mark.parametrize(
"ensembl_name, assembly_name, use_default_assembly, expected_output",
Expand All @@ -258,7 +269,28 @@ def test_fetch_genome_uuid(self, multi_dbs, ensembl_name, assembly_name, use_def
test = conn.fetch_genomes(
ensembl_name=ensembl_name,
assembly_name=assembly_name,
use_default_assembly=use_default_assembly
use_default_assembly=use_default_assembly,
allow_unreleased=False,
current_only=False
)
assert len(test) == 1
assert test[0].Genome.genome_uuid == expected_output

@pytest.mark.parametrize(
    "ensembl_name, assembly_name, use_default_assembly, expected_output",
    [
        ("homo_sapiens", "GRCh38.p13", False, "a7335667-93e7-11ec-a39d-005056b38ce3"),
        ("homo_sapiens", "GRCh38", True, "a7335667-93e7-11ec-a39d-005056b38ce3"),
    ]
)
def test_fetch_genome_uuid_is_current(
        self,
        multi_dbs,
        ensembl_name,
        assembly_name,
        use_default_assembly,
        expected_output,
):
    """Resolve a genome by ensembl name and assembly name, leaving ``current_only`` unset.

    Exactly one row must come back, and its genome UUID must equal
    ``expected_output``.
    """
    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    lookup = {
        "ensembl_name": ensembl_name,
        "assembly_name": assembly_name,
        "use_default_assembly": use_default_assembly,
        "allow_unreleased": False,
    }
    genomes = adaptor.fetch_genomes(**lookup)
    assert len(genomes) == 1
    assert genomes[0].Genome.genome_uuid == expected_output
Expand Down Expand Up @@ -291,3 +323,26 @@ def test_popular_species(self, multi_dbs):
for data in test[1:]:
# All others have only one genome in test DB
assert data[5] == 1

@pytest.mark.parametrize(
    "allow_unreleased, output_count, expected_genome_uuid",
    [
        # released and unreleased together
        (True, 9, "90720316-006c-470b-a7dd-82d28f952264"),
        # released datasets and genomes only, current_only left untouched
        (False, 6, "a733550b-93e7-11ec-a39d-005056b38ce3"),
    ]
)
def test_fetch_genomes_info(
        self,
        multi_dbs,
        allow_unreleased,
        output_count,
        expected_genome_uuid,
):
    """Check the size and first genome UUID of what ``fetch_genomes_info`` produces.

    The same ``allow_unreleased`` flag is applied to both genomes and
    datasets; the result is materialised into a list before being inspected.
    """
    adaptor = GenomeAdaptor(
        metadata_uri=multi_dbs['ensembl_metadata'].dbc.url,
        taxonomy_uri=multi_dbs['ncbi_taxonomy'].dbc.url,
    )
    info_rows = list(
        adaptor.fetch_genomes_info(
            allow_unreleased_genomes=allow_unreleased,
            allow_unreleased_datasets=allow_unreleased,
            group_type=['division', 'internal'],
        )
    )
    assert len(info_rows) == output_count
    first_genome = info_rows[0][0]['genome']
    assert first_genome.Genome.genome_uuid == expected_genome_uuid


0 comments on commit 41a4791

Please sign in to comment.