diff --git a/.coveragerc b/.coveragerc index 21704d6..2b755b1 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,7 +1,8 @@ [run] omit = - *tests* - *migrations* + */tests/* + */migrations/* *settings* *asgi.py - *wsgi.py \ No newline at end of file + *wsgi.py + *generate_dev_data.py \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1af8a97..ec007a5 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ __pycache__ .coverage .pytest_cache test-results +screenshot_website* # Elastic Beanstalk Files .elasticbeanstalk/* diff --git a/docs/img/website/mag-annotations.png b/docs/img/website/mag-annotations.png new file mode 100644 index 0000000..62310ef Binary files /dev/null and b/docs/img/website/mag-annotations.png differ diff --git a/docs/img/website/mag-catalogue.png b/docs/img/website/mag-catalogue.png index 7b7a980..510eab2 100644 Binary files a/docs/img/website/mag-catalogue.png and b/docs/img/website/mag-catalogue.png differ diff --git a/docs/img/website/mag-containment.png b/docs/img/website/mag-containment.png new file mode 100644 index 0000000..fdb738a Binary files /dev/null and b/docs/img/website/mag-containment.png differ diff --git a/docs/website.qmd b/docs/website.qmd index 88b7d96..84005d3 100644 --- a/docs/website.qmd +++ b/docs/website.qmd @@ -125,6 +125,37 @@ MAGs can be found by searching on accession or taxonomy, or for the accession of The MAGs in a catalogue can be downloaded as a TSV file, using the "Download all as TSV" button. +#### MAG Annotations +MGnify Genomes catalogues use a standardised pipeline ([Gurbich et al. 2023](https://europepmc.org/article/MED/36806692)) to annotate +the assembled genomes with various tools. +These annotations are performed on the species-level cluster representative genomes. +These annotations can all be accessed via the data portal’s links to MGnify. 
+ +Given the HoloFood project's aims, [CAZy](http://www.cazy.org) (Carbohydrate-Active enZymes) annotations are particularly relevant to +the HoloFood MAG catalogues. + +A summary of the CAZy annotations, in the form of counts per CAZy category, is therefore shown on the detail view of each MAG. +(Note that, like all MAG annotations, these CAZy annotations refer to the MAG's cluster representative genome – not necessarily the HoloFood-data-derived MAG itself.) + +![Screenshot of a MAG’s detail page, including CAZy annotations](/img/website/mag-annotations.png) + +These annotations are also available via each genome's [API](api.ipynb) endpoint. + +#### MAG containment within samples +To facilitate the linking of MAGs to other samples within the HoloFood dataset, the data portal also includes a list of "containments" for each MAG within all of the project’s metagenomic samples. + +For each MAG, a sample list was found using Mastiff (a tool based on [sourmash](https://sourmash.bio), [Irber et al. 2022](https://www.biorxiv.org/content/10.1101/2022.11.02.514947v1)). +Each sample in this list contains the MAG at some level, equivalent to the fraction of the MAG’s kmers that are present in the sample’s sequencing reads. +The list can be filtered to find only the samples that contain the MAG above some minimum containment threshold. + +![Screenshot of a MAG’s sample containment list](/img/website/mag-containment.png) + +These sample containment lists are also available via each genome's [API](api.ipynb) endpoint, as well as via the TSV export option above the table. + +Together with the [MAG’s CAZy annotations](website.qmd#mag-annotations), this feature means the prevalence of +carbohydrate-active enzymes can be compared at the genome level between samples originating from animals +under different experimental conditions. 
+ ### Viral Catalogues ![Screenshot of a viral catalogue](/img/website/viral-catalogue.png) Viral catalogues are lists of the unique (at species-level) viruses found in HoloFood samples. diff --git a/holofood/api.py b/holofood/api.py index 9299f53..38b99f2 100644 --- a/holofood/api.py +++ b/holofood/api.py @@ -192,7 +192,13 @@ def resolve_representative_url(obj: Genome): class Config: model = Genome - model_fields = ["accession", "cluster_representative", "taxonomy", "metadata"] + model_fields = [ + "accession", + "cluster_representative", + "taxonomy", + "metadata", + "annotations", + ] class GenomeSampleContainmentSchema(ModelSchema): diff --git a/holofood/filters.py b/holofood/filters.py index 4ed07d2..2cabdf4 100644 --- a/holofood/filters.py +++ b/holofood/filters.py @@ -16,6 +16,7 @@ from django.forms import NumberInput from django.utils.safestring import mark_safe +from holofood.forms import CazyAnnotationsFilterForm from holofood.models import ( Sample, Genome, @@ -123,6 +124,7 @@ class Meta: class GenomeFilter(django_filters.FilterSet): class Meta: model = Genome + form = CazyAnnotationsFilterForm fields = { "accession": ["icontains"], @@ -130,6 +132,19 @@ class Meta: "taxonomy": ["icontains"], } + def filter_queryset(self, queryset): + qs = queryset + for name, value in self.form.cleaned_data.items(): + if name in self.filters: + qs = self.filters[name].filter(qs, value) + + filters = Q() + if self.data: + cazy_annotations = self.data.getlist("cazy_annotations") + for key in cazy_annotations: + filters &= Q(**{f"annotations__cazy__{key}__gt": 0}) + return qs.filter(filters) + class GenomeSampleContainmentFilter(django_filters.FilterSet): minimum_containment = django_filters.NumberFilter( diff --git a/holofood/forms.py b/holofood/forms.py new file mode 100644 index 0000000..b2294a6 --- /dev/null +++ b/holofood/forms.py @@ -0,0 +1,75 @@ +from django import forms +from django.forms.widgets import SelectMultiple +from django.utils.html import format_html +from 
django.utils.encoding import force_str + + +class CazyCheckboxesWidget(SelectMultiple): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.choices = ( + [ + ("GH", "Glycoside Hydrolase"), + ("CB", "Carbohydrate-Binding"), + ("PL", "Polysaccharide Lyases"), + ("CE", "Carbohydrate Esterases"), + ("AA", "Auxiliary Activities"), + ("GT", "GlycosylTransferases"), + ], + ) + + def render(self, name, value, attrs=None, renderer=None): + output = [] + if not isinstance(value, list): + value = [value] + for option_value, option_label in self.choices: + final_attrs = self.build_attrs(self.attrs, attrs) + final_attrs["type"] = "checkbox" + final_attrs["name"] = name + final_attrs["value"] = option_value + final_attrs["id"] = "id_%s_%s" % (name, option_value) + + if value and force_str(option_value) in value: + final_attrs["checked"] = "checked" + else: + final_attrs.pop("checked", None) + + output.append( + format_html( + '
' + ' ' + "
", + format_html( + "".join( + f' {key}="{value}"' for key, value in final_attrs.items() + ) + ), + final_attrs["id"], + option_label, + ) + ) + + return format_html("".join(output)) + + +class CazyAnnotationsFilterForm(forms.Form): + field_order = [ + "accession__icontains", + "cluster_representative__icontains", + "taxonomy__icontains", + "cazy_annotations", + ] + cazy_annotations = forms.MultipleChoiceField( + choices=[ + ("GH", "Glycoside Hydrolase"), + ("CB", "Carbohydrate-Binding"), + ("PL", "Polysaccharide Lyases"), + ("CE", "Carbohydrate Esterases"), + ("AA", "Auxiliary Activities"), + ("GT", "GlycosylTransferases"), + ], + widget=CazyCheckboxesWidget, + required=False, + label="CAZy Annotations present", + help_text="Annotated on species rep.", + ) diff --git a/holofood/management/commands/import_mag_catalogue.py b/holofood/management/commands/import_mag_catalogue.py index ea28dd9..309c3fa 100644 --- a/holofood/management/commands/import_mag_catalogue.py +++ b/holofood/management/commands/import_mag_catalogue.py @@ -42,6 +42,11 @@ def add_arguments(self, parser): type=str, help="System (chicken/salmon) of the catalogue (or None to copy from related MAG catalogue)", ) + parser.add_argument( + "--representatives_cazy_annotations_file", + type=argparse.FileType("r"), + help="Optional path to a TSV file listing cazy annotations for the cluster representative MAGs.", + ) @staticmethod def _parse_taxonomic_lineage(lineage_string: str) -> str: @@ -79,6 +84,17 @@ def handle(self, *args, **options): system=options["system"], ) logging.info(f"Created MAG {catalogue=}") + + cazy_file = options["representatives_cazy_annotations_file"] + cazy_annotations = {} + if cazy_file: + cazy_reader = DictReader(cazy_file, delimiter="\t") + for row in cazy_reader: + cazy_annotations.setdefault(row["Genome"], {})[ + row["CAZy_category"] + ] = int(row["Counts"]) + cazy_file.close() + for mag in reader: mag_data = { field_name: mag[col_name] @@ -90,6 +106,10 @@ def handle(self, 
*args, **options): mag_data["taxonomy"] ) + mag_data["annotations"] = { + "cazy": cazy_annotations.get(mag_data["cluster_representative"], {}) + } + metadata = { col_name: col_val for col_name, col_val in mag.items() diff --git a/holofood/migrations/0038_genome_annotations.py b/holofood/migrations/0038_genome_annotations.py new file mode 100644 index 0000000..ab22663 --- /dev/null +++ b/holofood/migrations/0038_genome_annotations.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2 on 2024-07-02 16:01 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("holofood", "0037_genomesamplecontainment"), + ] + + operations = [ + migrations.AddField( + model_name="genome", + name="annotations", + field=models.JSONField(blank=True, default=dict), + ), + ] diff --git a/holofood/models.py b/holofood/models.py index 699a45f..e7d1dfe 100644 --- a/holofood/models.py +++ b/holofood/models.py @@ -456,6 +456,7 @@ class Genome(models.Model): ) taxonomy = models.CharField(max_length=200) metadata = models.JSONField(default=dict, blank=True) + annotations = models.JSONField(default=dict, blank=True) class Meta: ordering = ("accession",) diff --git a/holofood/tests/conftest.py b/holofood/tests/conftest.py index b723ee2..c266245 100644 --- a/holofood/tests/conftest.py +++ b/holofood/tests/conftest.py @@ -966,6 +966,9 @@ def create_genome_objects(sample: Sample) -> GenomeCatalogue: "catalogue": catalogue, "taxonomy": "Root > Foods > Donuts > Sugar Monster", "metadata": {}, + "annotations": { + "cazy": {"GH": 6, "PL": 5, "CE": 4, "AA": 3, "CB": 2, "GT": 1, "CL": 0} + }, }, ) diff --git a/holofood/tests/static_fixtures/mag-catalogue-cazy.tsv b/holofood/tests/static_fixtures/mag-catalogue-cazy.tsv new file mode 100644 index 0000000..9f2d4ee --- /dev/null +++ b/holofood/tests/static_fixtures/mag-catalogue-cazy.tsv @@ -0,0 +1,8 @@ + Genome CAZy_category Counts +0 MGYG000290000 GH 6 +1 MGYG000290000 PL 5 +2 MGYG000290000 CE 4 +3 MGYG000290000 
AA 3 +4 MGYG000290000 CB 2 +5 MGYG000290000 GT 1 +6 MGYG000290000 CL 0 \ No newline at end of file diff --git a/holofood/tests/test_api.py b/holofood/tests/test_api.py index 6e33020..1b93921 100644 --- a/holofood/tests/test_api.py +++ b/holofood/tests/test_api.py @@ -388,6 +388,7 @@ def test_mag_catalogues(client, chicken_mag_catalogue): "sample": "SAMEA00000006", "containment": 0.7, } + assert data.get("annotations", {}).get("cazy", {}).get("GH") == 6 @pytest.mark.django_db diff --git a/holofood/tests/test_import_comands.py b/holofood/tests/test_import_comands.py index a07f575..c3c2c1f 100644 --- a/holofood/tests/test_import_comands.py +++ b/holofood/tests/test_import_comands.py @@ -161,6 +161,7 @@ def test_import_mag_catalogue(): "public-donut-v1-0", "Donut Surface", "chicken", + f"--representatives_cazy_annotations_file={tests_path}/static_fixtures/mag-catalogue-cazy.tsv", ) logging.info(out) diff --git a/holofood/tests/test_website.py b/holofood/tests/test_website.py index b15e640..f0a203e 100644 --- a/holofood/tests/test_website.py +++ b/holofood/tests/test_website.py @@ -60,7 +60,7 @@ def test_web(self, m): wait = WebDriverWait(self.selenium, 10) - # # ---- Home page ---- # + # ---- Home page ---- # self.selenium.get(self.live_server_url) self.selenium.add_cookie( { @@ -381,8 +381,27 @@ def test_web(self, m): ) self.assertIn("export", export_link.get_attribute("href")) + # chart should be showing cazys. test accessible/aria version of chart rather than svg. 
+ cazy_accessible_table = self.selenium.find_element( + by=By.XPATH, value="//*[@id='cazy_chart']//table/tbody" + ) + self.assertEqual( + len(cazy_accessible_table.find_elements(by=By.TAG_NAME, value="tr")), 6 + ) + + # element is hidden so use selenium script to get text + first_cazy_label = self.selenium.execute_script( + "return document.evaluate(\"//*[@id='cazy_chart']//table/tbody/tr[1]/td[1]\", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.textContent;" + ) + self.assertEqual(first_cazy_label.strip(), "Glycoside Hydrolase") + + first_cazy_count = self.selenium.execute_script( + "return document.evaluate(\"//*[@id='cazy_chart']//table/tbody/tr[1]/td[2]\", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.textContent;" + ) + self.assertEqual(first_cazy_count.strip(), "6") + # should be one sample containing this MAG - table = self.selenium.find_element(by=By.TAG_NAME, value="tbody") + table = self.selenium.find_element(by=By.CLASS_NAME, value="vf-table__body") self.assertEqual(len(table.find_elements(by=By.TAG_NAME, value="tr")), 1) # change containment to very high, so MAG is contained sufficiently in NO samples @@ -397,7 +416,7 @@ def test_web(self, m): "minimum_containment=0.9", self.selenium.current_url, ) - table = self.selenium.find_element(by=By.TAG_NAME, value="tbody") + table = self.selenium.find_element(by=By.CLASS_NAME, value="vf-table__body") self.assertEqual(table.size["height"], 0) # ---- Viral catalogues ---- # diff --git a/holofood/views.py b/holofood/views.py index f0c05c1..1bae246 100644 --- a/holofood/views.py +++ b/holofood/views.py @@ -326,6 +326,23 @@ def get_context_data(self, **kwargs): context["catalogue"] = get_object_or_404( GenomeCatalogue, id=self.kwargs.get("catalogue_pk") ) + cazy = self.object.annotations.get("cazy") + if cazy: + cazy_categories = { + "GH": "Glycoside Hydrolase", + "CB": "Carbohydrate-Binding", + "PL": "Polysaccharide Lyases", + "CE": "Carbohydrate Esterases", 
+ "AA": "Auxiliary Activities", + "GT": "GlycosylTransferases", + } + context["cazy_annotations"] = { + cazy_categories[cat]: count + for cat, count in cazy.items() + if cat in cazy_categories + } + else: + context["cazy_annotations"] = {} return context diff --git a/requirements-docs.txt b/requirements-docs.txt index 3ae1577..2e14016 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -1,3 +1,3 @@ -jupyterlab==3.4.8 -pandas==1.5.1 -matplotlib==3.6.1 \ No newline at end of file +jupyterlab==4.2.3 +pandas==2.2.2 +matplotlib==3.9.1 \ No newline at end of file diff --git a/static/scss/site.scss b/static/scss/site.scss index dd62286..4ef6d0a 100644 --- a/static/scss/site.scss +++ b/static/scss/site.scss @@ -217,4 +217,56 @@ img.hf-hero-logo { input[type="range"] { width: 100%; } +} + +.hf-checkbox { + display: flex; + align-items: center; + margin-bottom: 10px; + + input[type="checkbox"] { + display: none; + } + + label { + position: relative; + padding-left: 25px; + cursor: pointer; + + &::before { + content: ""; + position: absolute; + left: 0; + top: 0; + width: 18px; + height: 18px; + border: 2px solid var(--vf-color--blue); + border-radius: 4px; + background-color: #fff; + } + + &::after { + content: ""; + position: absolute; + left: 8px; + top: 3px; + width: 4px; + height: 10px; + border: solid #fff; + border-width: 0 2px 2px 0; + transform: rotate(45deg); + opacity: 0; + } + } + + input[type="checkbox"]:checked + label { + &::before { + background-color: var(--vf-color--blue); + border-color: var(--vf-color--blue); + } + + &::after { + opacity: 1; + } + } } \ No newline at end of file diff --git a/templates/holofood/pages/genome_detail.html b/templates/holofood/pages/genome_detail.html index 73bf6d9..e002f15 100644 --- a/templates/holofood/pages/genome_detail.html +++ b/templates/holofood/pages/genome_detail.html @@ -62,6 +62,17 @@

{{ genome.accession }}


 Show other genomes in cluster + +
+ Annotations +
+
+

+ {{ genome.accession }}’s cluster representative genome {{ genome.cluster_representative }} has been annotated by MGnify’s Genomes pipeline. + {# djlint:off #}CAZy (Carbohydrate-Active enZymes){# djlint:on #} annotations are particularly relevant to HoloFood’s experimental goals, + and this graphic shows the number of CAZy annotations on {{ genome.cluster_representative }} from each CAZy category. +

+ {% include "holofood/scripts/cazy_chart.html" with cazy_annotations=cazy_annotations only %}
@@ -95,7 +106,7 @@

{{ genome.accession }}

{{ genome.accession }}’s cluster representative ({{ genome.cluster_representative }}) - has been searched for in all HoloFood samples, using a sourmash-based tool. + has been searched for in all HoloFood samples, using a sourmash-based tool. These samples contain some or all of the kmers in {{ genome.cluster_representative }}’s sequence. Because {{ genome.accession }} has been clustered with {{ genome.cluster_representative }} at 95% sequence similarity, this indicates that these samples are likely to contain {{ genome.accession }}. diff --git a/templates/holofood/scripts/cazy_chart.html b/templates/holofood/scripts/cazy_chart.html new file mode 100644 index 0000000..56a025d --- /dev/null +++ b/templates/holofood/scripts/cazy_chart.html @@ -0,0 +1,33 @@ + +

+{{ cazy_annotations | json_script:"cazy_annotations_data" }} + \ No newline at end of file