From 57d0957b549387c2fbc109dd3e4eecf2ed5b99bc Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 May 2024 17:56:16 +0100 Subject: [PATCH 01/39] Initial set-up for port of Northern Ireland to dagster --- python/popgetter/__init__.py | 9 +- python/popgetter/assets/__init__.py | 2 +- python/popgetter/assets/ni/__init__.py | 399 +++++++++++++++++++++++++ python/popgetter/utils.py | 22 ++ 4 files changed, 430 insertions(+), 2 deletions(-) create mode 100644 python/popgetter/assets/ni/__init__.py diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 5eeab89..3ee36c8 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -44,6 +44,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), + *load_assets_from_package_module(assets.ni, group_name="ni"), *load_assets_from_modules([cloud_outputs], group_name="cloud_assets"), ] @@ -66,6 +67,12 @@ description="Downloads UK data.", ) +job_ni: UnresolvedAssetJobDefinition = define_asset_job( + name="job_ni", + selection=AssetSelection.groups("ni"), + partitions_def=assets.ni.dataset_node_partition, +) + resources_by_env = { "prod": { "general_io_manager": AzureGeneralIOManager(".bin"), @@ -92,5 +99,5 @@ schedules=[], sensors=[cloud_outputs.country_outputs_sensor], resources=resources, - jobs=[job_be, job_us, job_uk], + jobs=[job_be, job_us, job_uk, job_ni], ) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index e050bf8..c16f0c1 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,3 +1,3 @@ from __future__ import annotations -from . import be, uk, us # noqa: F401 +from . 
import be, ni, uk, us # noqa: F401 diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py new file mode 100644 index 0000000..7fdbe3f --- /dev/null +++ b/python/popgetter/assets/ni/__init__.py @@ -0,0 +1,399 @@ +from __future__ import annotations + +import io +from abc import ABC +from dataclasses import dataclass +from datetime import date + +import geopandas as gpd +import matplotlib.pyplot as plt +import pandas as pd +import requests +from bs4 import BeautifulSoup +from dagster import ( + AssetExecutionContext, + DynamicPartitionsDefinition, + MetadataValue, + asset, +) + +from popgetter.metadata import ( + CountryMetadata, + DataPublisher, + GeometryMetadata, + MetricMetadata, + SourceDataRelease, + metadata_to_dataframe, +) +from popgetter.utils import add_metadata, markdown_from_plot + +PARTITION_NAME = "uk-ni_dataset_nodes" +REQUIRED_TABLES = [ + "MS-A09", +] +REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) +REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] +GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" + +# TODO: get correct dates +CENSUS_REFERENCE_DATE = date(2011, 3, 27) +CENSUS_COLLECTION_DATE = date(2011, 3, 27) +CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) +CENSUS_REFERENCE_DATE = date(2021, 3, 1) +CENSUS_PUBLICATION_DATE = date(2021, 3, 1) + + +@dataclass +class NIGeometryLevel: + level: str + hxl_tag: str + geo_id_column: str + name_columns: dict[str, str] # keys = language codes, values = column names + url: str + + +NI_GEO_LEVELS = { + "DZ21": NIGeometryLevel( + level="DZ21", + hxl_tag="TBD", + geo_id_column="DZ2021_cd", + name_columns={"eng": "DZ2021_nm"}, + url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", + ) +} + +# Full list of geographies, see metadata: +# https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE +GEO_LEVELS = [ + "LGD14", # Local Government District 2014 + # "URBAN_STATUS", # Urban Status + # "HEALTH_TRUST", # Health and Social Care Trust + # "PARLCON24", # Parliamentary Constituency 2024 + # "DEA14", # District Electoral Area 2014 + "SDZ21", # Census 2021 Super Data Zone + "DZ21", # Census 2021 Data Zone +] + + +class Country(ABC): + def catalog(self, context) -> pd.DataFrame: + ... + + def source_table(self, context): + ... + + def census_table(self, context): + ... + + def derived_table(self, context): + ... + + +# async fn population(&self) -> anyhow::Result { +# let url = +# "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=DZ21&v=UR_SEX&v=AGE_SYOA_85"; +# let data: Vec = reqwest::get(url).await?.text().await?.bytes().collect(); +# Ok(CsvReader::new(Cursor::new(data)) +# .has_header(true) +# .finish()?) +# } +# async fn geojson(&self) -> anyhow::Result { +# let url = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-geojson.zip"; +# let mut tmpfile = tempfile::tempfile()?; +# tmpfile.write_all(&reqwest::get(url).await?.bytes().await?)?; +# let mut zip = zip::ZipArchive::new(tmpfile)?; +# let mut file = zip.by_name("DZ2021.geojson")?; +# let mut buffer = String::from(""); +# file.read_to_string(&mut buffer)?; +# Ok(buffer.parse()?) 
+# } + + +def get_nodes_and_links() -> dict[str, dict[str, str]]: + SCHEME_AND_HOST = "https://build.nisra.gov.uk" + urls = [ + "".join([SCHEME_AND_HOST, url.get("href")]) + for url in BeautifulSoup( + requests.get(SCHEME_AND_HOST + "/en/standard").content, features="lxml" + ).find_all("a") + if str(url.get("href")).startswith("/en/standard") + ] + nodes: dict[str, dict[str, str]] = {} + for url in urls: + soup = BeautifulSoup(requests.get(url).content, features="lxml") + nodes[url] = { + "table_url": list( + set( + [ + "".join([SCHEME_AND_HOST, link.get("href")]) + for link in soup.find_all("a") + if "table.csv?" in link.get("href") + ] + ) + )[0], + "metadata_url": list( + set( + [ + "".join([SCHEME_AND_HOST, link.get("href")]) + for link in soup.find_all("a") + if "table.csv-metadata" in link.get("href") + ] + ) + )[0], + } + return nodes + + +class NorthernIreland(Country): + partition_name: str = PARTITION_NAME + geo_levels: list[str] = GEO_LEVELS + required_tables: list[str] = REQUIRED_TABLES + + def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: + """ + A catalog for NI can be generated in two ways: + 1. With flexible table builder: + https://build.nisra.gov.uk/en/ + with metadata chosen from: + https://build.nisra.gov.uk/en/metadata + 2. Or through enumerating the ready-made tables: + https://build.nisra.gov.uk/en/standard + However, some level of + """ + catalog_summary = { + "node": [], + "partition_key": [], + "table_id": [], + "geo_level": [], + "human_readable_name": [], + "description": [], + "metric_parquet_file_url": [], + "parquet_column_name": [], + "parquet_margin_of_error_column": [], + "parquet_margin_of_error_file": [], + "potential_denominator_ids": [], + "parent_metric_id": [], + "source_data_release_id": [], + "source_download_url": [], + "source_format": [], + "source_archive_file_path": [], + "source_documentation_url": [], + "table_schema": [], + } + nodes = get_nodes_and_links() + + def add_resolution(s: str, geo_level: str) -> str: + s_split = s.split("?") + return "?".join([s_split[0], f"v={geo_level}&" + s_split[1]]) + + for node_url, node_items in nodes.items(): + for geo_level in self.geo_levels: + metadata = requests.get(node_items["metadata_url"]).json() + table_id = metadata["dc:title"].split(":")[0] + # Skip if not required + if table_id not in self.required_tables: + continue + + catalog_summary["node"].append(node_url) + catalog_summary["table_id"].append(table_id) + catalog_summary["geo_level"].append(geo_level) + catalog_summary["partition_key"].append(f"{geo_level}/{table_id}") + catalog_summary["human_readable_name"].append(metadata["dc:title"]) + catalog_summary["description"].append(metadata["dc:description"]) + catalog_summary["metric_parquet_file_url"].append(None) + catalog_summary["parquet_column_name"].append(None) + catalog_summary["parquet_margin_of_error_column"].append(None) + catalog_summary["parquet_margin_of_error_file"].append(None) + catalog_summary["potential_denominator_ids"].append(None) + catalog_summary["parent_metric_id"].append(None) + catalog_summary["source_data_release_id"].append(None) + catalog_summary["source_download_url"].append( + add_resolution(metadata["url"], geo_level) + ) + catalog_summary["source_format"].append(None) + catalog_summary["source_archive_file_path"].append(None) + catalog_summary["source_documentation_url"].append(node_url) + catalog_summary["table_schema"].append(metadata["tableSchema"]) + + catalog_df = pd.DataFrame.from_records(catalog_summary) + 
context.instance.add_dynamic_partitions( + partitions_def_name=self.partition_name, + partition_keys=catalog_df["partition_key"].to_list(), + ) + + add_metadata(context, catalog_df, "Catalog") + return catalog_df + + def census_tables( + self, context: AssetExecutionContext, catalog: pd.DataFrame, partition + ) -> pd.DataFrame: + url = catalog.loc[ + catalog["partition_key"].eq(partition), "source_download_url" + ].iloc[0] + return pd.read_csv(io.BytesIO(requests.get(url).content), encoding="utf8") + + def source_table(self) -> pd.DataFrame: + return pd.DataFrame() + + +country: CountryMetadata = CountryMetadata( + name_short_en="Northern Ireland", + name_official="Northern Ireland", + iso3="GBR", + iso2="GB", + iso3166_2="GB-NIR", +) + +publisher: DataPublisher = DataPublisher( + name="NISRA", + url="https://www.nisra.gov.uk/", + description="The Northern Ireland Statistics and Research Agency (NISRA), which incorporates the General Register Office (GRO), is an executive agency within the Department of Finance (NI) and was established on 1 April 1996.", + countries_of_interest=[country.id], +) + + +@asset +def source_data_release( + context: AssetExecutionContext, + geographies: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], +) -> list[SourceDataRelease]: + source_data_releases = [] + for geo_level in geographies[0]: + source_data_release: SourceDataRelease = SourceDataRelease( + name="Census 2021", + date_published=date(2014, 2, 27), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", + data_publisher_id=publisher.id, + description="TBC", + # geography_file="TBC", + # geography_level="TBC", + # countries_of_interest=[country.id], + geometry_metadata_id="tbd", + ) + source_data_releases.append(source_data_release) + return source_data_releases + + +key_prefix = "uk-ni" + +ni = NorthernIreland() + +dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + + +@asset(key_prefix=key_prefix) +def catalog(context) -> pd.DataFrame: + return ni.catalog(context) + + +@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) +def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: + census_table = ni.census_tables( + context, catalog, context.asset_partition_key_for_output() + ) + add_metadata(context, census_table, title=context.asset_partition_key_for_output()) + return census_table + + +@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) +def source_tables( + context: AssetExecutionContext, census_tables: pd.DataFrame +) -> pd.DataFrame: + return census_tables + + +def source_metadata_from_catalog(catalog) -> MetricMetadata: + ... 
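For orientation, the partition keys registered above follow the scheme "geo_level/table_id", and `census_tables` later resolves a key back to its download URL with a plain lookup on the catalog. A minimal, self-contained sketch of that lookup (the catalog rows here are invented; the URL shape follows the NISRA flexible table builder referenced elsewhere in this patch):

```python
# Sketch only: how a "geo_level/table_id" partition key resolves to a URL,
# mirroring the lookup in NorthernIreland.census_tables. Rows are invented.
import pandas as pd

toy_catalog = pd.DataFrame(
    {
        "partition_key": ["DZ21/MS-A09", "SDZ21/MS-A09"],
        "source_download_url": [
            "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=DZ21",
            "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=SDZ21",
        ],
    }
)


def url_for_partition(catalog: pd.DataFrame, partition: str) -> str:
    # Filter on the partition key and take the single matching row's URL.
    return catalog.loc[
        catalog["partition_key"].eq(partition), "source_download_url"
    ].iloc[0]


assert url_for_partition(toy_catalog, "DZ21/MS-A09").endswith("v=DZ21")
```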
+ + +geometry_metadata: GeometryMetadata = GeometryMetadata( + validity_period_start=date(2023, 1, 1), + validity_period_end=date(2023, 12, 31), + level="municipality", + # country -> province -> region -> arrondisement -> municipality + hxl_tag="adm4", +) + + +@asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) +def geographies( + context: AssetExecutionContext, +) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: + level_details = NI_GEO_LEVELS["DZ21"] + + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=date(2023, 1, 1), + validity_period_end=date(2023, 12, 31), + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + region_geometries_raw = ( + gpd.read_file(level_details.url) + .dissolve(by=level_details.geo_id_column) + .reset_index() + ) + region_geometries = region_geometries_raw.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + region_names = ( + region_geometries_raw.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["eng"]: "eng", + } + ) + .loc[:, ["GEO_ID", "eng"]] + .drop_duplicates() + ) + + # Generate a plot and convert the image to Markdown to preview it within + # Dagster + joined_gdf = region_geometries.merge(region_names, on="GEO_ID") + ax = joined_gdf.plot(column="eng", legend=False) + ax.set_title(f"Northern Ireland 2023 {level_details.level}") + md_plot = markdown_from_plot(plt) + + geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) + + context.add_output_metadata( + metadata={ + "num_records": len(region_geometries), + "geometry_plot": MetadataValue.md(md_plot), + "names_preview": MetadataValue.md(region_names.head().to_markdown()), + "metadata_preview": MetadataValue.md( + geometry_metadata_df.head().to_markdown() + ), + }, + ) + + context.add_output_metadata( + metadata={ + "num_records": len(region_geometries), + "geometry_plot": MetadataValue.md(md_plot), + "names_preview": MetadataValue.md(region_names.head().to_markdown()), + "metadata_preview": MetadataValue.md( + geometry_metadata_df.head().to_markdown() + ), + }, + ) + return geometry_metadata_df, region_geometries, region_names + + +# @asset(partitions_def=dataset_node_partition, key_prefix=asset_prefix) +# def source_mmd(context: AssetExecutionContext, catalog: pd.DataFrame) -> list[MetricMetadata]: +# # return census_tables +# source_metadata_from_catalog(catalog) + +# @asset +# def source_tables() -> pd.DataFrame: +# return ni.catalog() + +# @asset +# def derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: +# # return ni.catalog() diff --git a/python/popgetter/utils.py b/python/popgetter/utils.py index 724377f..48070cc 100644 --- a/python/popgetter/utils.py +++ b/python/popgetter/utils.py @@ -11,10 +11,13 @@ from tempfile import TemporaryDirectory import fsspec +import geopandas as gpd +import pandas as pd import requests from dagster import ( ConfigurableResource, EnvVar, + MetadataValue, get_dagster_logger, ) @@ -297,6 +300,25 @@ def download_zipped_files(zipfile_url: str, output_dir: str) -> None: z.extractall(output_dpath) +def add_metadata( + context, + df: pd.DataFrame | gpd.GeoDataFrame, + title: str | list[str], + output_name: str | None = None, +): + context.add_output_metadata( + metadata={ + "title": title, + "num_records": len(df), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + }, + 
output_name=output_name, + ) + + if __name__ == "__main__": pass # This is for testing only From 649ed1e8f05b3c7adb99eb5db93e0fba6857adce Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 16 May 2024 17:20:58 +0100 Subject: [PATCH 02/39] Update NI port --- python/popgetter/__init__.py | 2 +- python/popgetter/assets/ni/__init__.py | 292 ++++++++++----------- python/popgetter/cloud_outputs/__init__.py | 2 + python/popgetter/io_managers/__init__.py | 17 +- 4 files changed, 150 insertions(+), 163 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 2312895..f12c1a5 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -45,7 +45,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module(assets.ni, group_name="ni"), + *load_assets_from_package_module(assets.ni, group_name="ni", key_prefix="uk-ni"), *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"), ] diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 7fdbe3f..3975c8a 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -1,7 +1,6 @@ from __future__ import annotations import io -from abc import ABC from dataclasses import dataclass from datetime import date @@ -16,14 +15,15 @@ MetadataValue, asset, ) +from icecream import ic +from popgetter.assets.common import Country from popgetter.metadata import ( CountryMetadata, DataPublisher, GeometryMetadata, MetricMetadata, SourceDataRelease, - metadata_to_dataframe, ) from popgetter.utils import add_metadata, markdown_from_plot @@ -32,15 +32,19 @@ "MS-A09", ] REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) -REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] -GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" -# TODO: get correct dates -CENSUS_REFERENCE_DATE = date(2011, 3, 27) -CENSUS_COLLECTION_DATE = date(2011, 3, 27) -CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) -CENSUS_REFERENCE_DATE = date(2021, 3, 1) -CENSUS_PUBLICATION_DATE = date(2021, 3, 1) +# TODO +REQUIRED_RELEASES = [""] +# GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" + +# TODO: get these are correct dates +CENSUS_REFERENCE_DATE = date(2021, 3, 21) +CENSUS_COLLECTION_DATE = date(2021, 3, 21) +CENSUS_EXPECT_NEXT_UPDATE = date(2031, 1, 1) +CENSUS_REFERENCE_DATE = date(2021, 3, 21) +# https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: +# 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas +CENSUS_PUBLICATION_DATE = date(2023, 2, 21) @dataclass @@ -57,7 +61,7 @@ class NIGeometryLevel: level="DZ21", hxl_tag="TBD", geo_id_column="DZ2021_cd", - name_columns={"eng": "DZ2021_nm"}, + name_columns={"en": "DZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", ) } @@ -75,40 +79,6 @@ class NIGeometryLevel: ] -class Country(ABC): - def catalog(self, context) -> pd.DataFrame: - ... - - def source_table(self, context): - ... - - def census_table(self, context): - ... - - def derived_table(self, context): - ... 
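This hunk drops the module-local `Country` ABC in favour of the shared `popgetter.assets.common.Country` imported above. Based only on the local class removed here, the shared base class presumably exposes an interface along these lines (a sketch, not the actual definition in `assets/common.py`):

```python
# Assumed shape of popgetter.assets.common.Country, inferred from the local
# ABC removed in this hunk; the real base class may define additional hooks.
from __future__ import annotations

from abc import ABC, abstractmethod

import pandas as pd


class Country(ABC):
    @abstractmethod
    def catalog(self, context) -> pd.DataFrame:
        """Return one catalog row per 'geo_level/table_id' partition."""

    @abstractmethod
    def census_tables(
        self, context, catalog: pd.DataFrame, partition: str
    ) -> pd.DataFrame:
        """Download and return the raw census table for one partition."""
```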
- - -# async fn population(&self) -> anyhow::Result { -# let url = -# "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=DZ21&v=UR_SEX&v=AGE_SYOA_85"; -# let data: Vec = reqwest::get(url).await?.text().await?.bytes().collect(); -# Ok(CsvReader::new(Cursor::new(data)) -# .has_header(true) -# .finish()?) -# } -# async fn geojson(&self) -> anyhow::Result { -# let url = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-geojson.zip"; -# let mut tmpfile = tempfile::tempfile()?; -# tmpfile.write_all(&reqwest::get(url).await?.bytes().await?)?; -# let mut zip = zip::ZipArchive::new(tmpfile)?; -# let mut file = zip.by_name("DZ2021.geojson")?; -# let mut buffer = String::from(""); -# file.read_to_string(&mut buffer)?; -# Ok(buffer.parse()?) -# } - - def get_nodes_and_links() -> dict[str, dict[str, str]]: SCHEME_AND_HOST = "https://build.nisra.gov.uk" urls = [ @@ -149,7 +119,8 @@ class NorthernIreland(Country): geo_levels: list[str] = GEO_LEVELS required_tables: list[str] = REQUIRED_TABLES - def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: + # def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: + def catalog(self, context) -> pd.DataFrame: """ A catalog for NI can be generated in two ways: 1. With flexible table builder: @@ -184,7 +155,14 @@ def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: def add_resolution(s: str, geo_level: str) -> str: s_split = s.split("?") - return "?".join([s_split[0], f"v={geo_level}&" + s_split[1]]) + query_params = s_split[1].split("&") + if query_params[0].startswith("d="): + query_params = "&".join( + [query_params[0], f"v={geo_level}", *query_params[1:]] + ) + else: + query_params = "&".join([f"v={geo_level}", *query_params[:]]) + return "?".join([s_split[0], query_params]) for node_url, node_items in nodes.items(): for geo_level in self.geo_levels: @@ -224,9 +202,7 @@ def add_resolution(s: str, geo_level: str) -> str: add_metadata(context, catalog_df, "Catalog") return catalog_df - def census_tables( - self, context: AssetExecutionContext, catalog: pd.DataFrame, partition - ) -> pd.DataFrame: + def census_tables(self, context, catalog: pd.DataFrame, partition) -> pd.DataFrame: url = catalog.loc[ catalog["partition_key"].eq(partition), "source_download_url" ].iloc[0] @@ -252,33 +228,6 @@ def source_table(self) -> pd.DataFrame: ) -@asset -def source_data_release( - context: AssetExecutionContext, - geographies: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], -) -> list[SourceDataRelease]: - source_data_releases = [] - for geo_level in geographies[0]: - source_data_release: SourceDataRelease = SourceDataRelease( - name="Census 2021", - date_published=date(2014, 2, 27), - reference_period_start=CENSUS_REFERENCE_DATE, - reference_period_end=CENSUS_REFERENCE_DATE, - collection_period_start=CENSUS_COLLECTION_DATE, - collection_period_end=CENSUS_COLLECTION_DATE, - expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, - url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", - data_publisher_id=publisher.id, - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - # countries_of_interest=[country.id], - geometry_metadata_id="tbd", - ) - source_data_releases.append(source_data_release) - return source_data_releases - - key_prefix = "uk-ni" ni = NorthernIreland() @@ -286,13 +235,14 @@ def source_data_release( dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) -@asset(key_prefix=key_prefix) +@asset def catalog(context) -> pd.DataFrame: return 
ni.catalog(context) -@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) -def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: +@asset(partitions_def=dataset_node_partition) +# def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: +def census_tables(context, catalog) -> pd.DataFrame: census_table = ni.census_tables( context, catalog, context.asset_partition_key_for_output() ) @@ -300,100 +250,126 @@ def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: return census_table -@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) -def source_tables( - context: AssetExecutionContext, census_tables: pd.DataFrame -) -> pd.DataFrame: +@asset(partitions_def=dataset_node_partition) +def source_tables(context, census_tables: pd.DataFrame) -> pd.DataFrame: return census_tables -def source_metadata_from_catalog(catalog) -> MetricMetadata: - ... - - -geometry_metadata: GeometryMetadata = GeometryMetadata( - validity_period_start=date(2023, 1, 1), - validity_period_end=date(2023, 12, 31), - level="municipality", - # country -> province -> region -> arrondisement -> municipality - hxl_tag="adm4", -) +def source_metadata_from_catalog( + catalog: pd.DataFrame, parition_key: str, source_data_release: SourceDataRelease +) -> MetricMetadata: + catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] + return MetricMetadata( + human_readable_name=catalog_row["human_readable_name"], + source_download_url=catalog_row["source_download_url"], + source_archive_file_path=catalog_row["source_archive_file_path"], + source_documentation_url=catalog_row["source_documentation_url"], + source_data_release_id=source_data_release.id, + # TODO - this is a placeholder + parent_metric_id="unknown_at_this_stage", + potential_denominator_ids=None, + parquet_margin_of_error_file=None, + parquet_margin_of_error_column=None, + parquet_column_name=catalog_row["source_column"], + # TODO - this is a placeholder + metric_parquet_file_url="unknown_at_this_stage", + hxl_tag=catalog_row["hxltag"], + description=catalog_row["description"], + source_metric_id=catalog_row["hxltag"], + ) -@asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) -def geographies( - context: AssetExecutionContext, -) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: +@asset +# @asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) +def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + # TODO: This is almost identical to Belgium so can probably be refactored to common + # function with config of releases and languages level_details = NI_GEO_LEVELS["DZ21"] - # TODO: get correct values - geometry_metadata = GeometryMetadata( - validity_period_start=date(2023, 1, 1), - validity_period_end=date(2023, 12, 31), - level=level_details.level, - hxl_tag=level_details.hxl_tag, - ) - region_geometries_raw = ( - gpd.read_file(level_details.url) - .dissolve(by=level_details.geo_id_column) - .reset_index() - ) - region_geometries = region_geometries_raw.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] - region_names = ( - region_geometries_raw.rename( - columns={ - level_details.geo_id_column: "GEO_ID", - level_details.name_columns["eng"]: "eng", - } + geometries_to_return = [] + for level_details in NI_GEO_LEVELS.values(): + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=CENSUS_COLLECTION_DATE, + 
validity_period_end=CENSUS_COLLECTION_DATE, + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + region_geometries_raw = ( + gpd.read_file(level_details.url) + .dissolve(by=level_details.geo_id_column) + .reset_index() + ) + context.log.debug(ic(region_geometries_raw.head())) + region_geometries = region_geometries_raw.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + region_names = ( + region_geometries_raw.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["en"]: "en", + } + ) + .loc[:, ["GEO_ID", "en"]] + .drop_duplicates() + ) + geometries_to_return.append( + (geometry_metadata, region_geometries, region_names) ) - .loc[:, ["GEO_ID", "eng"]] - .drop_duplicates() - ) - # Generate a plot and convert the image to Markdown to preview it within - # Dagster - joined_gdf = region_geometries.merge(region_names, on="GEO_ID") - ax = joined_gdf.plot(column="eng", legend=False) - ax.set_title(f"Northern Ireland 2023 {level_details.level}") + # Add output metadata + first_metadata, first_gdf, first_names = geometries_to_return[0] + first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + ax = first_joined_gdf.plot(column="en", legend=False) + ax.set_title(f"NI 2023 {first_metadata.level}") md_plot = markdown_from_plot(plt) - - geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) - context.add_output_metadata( metadata={ - "num_records": len(region_geometries), - "geometry_plot": MetadataValue.md(md_plot), - "names_preview": MetadataValue.md(region_names.head().to_markdown()), - "metadata_preview": MetadataValue.md( - geometry_metadata_df.head().to_markdown() + "all_geom_levels": MetadataValue.md( + ",".join([metadata.level for metadata, _, _ in geometries_to_return]) ), - }, + "first_geometry_plot": MetadataValue.md(md_plot), + "first_names_preview": MetadataValue.md(first_names.head().to_markdown()), + } ) - context.add_output_metadata( - metadata={ - "num_records": len(region_geometries), - "geometry_plot": MetadataValue.md(md_plot), - "names_preview": MetadataValue.md(region_names.head().to_markdown()), - "metadata_preview": MetadataValue.md( - geometry_metadata_df.head().to_markdown() - ), - }, - ) - return geometry_metadata_df, region_geometries, region_names + return geometries_to_return -# @asset(partitions_def=dataset_node_partition, key_prefix=asset_prefix) -# def source_mmd(context: AssetExecutionContext, catalog: pd.DataFrame) -> list[MetricMetadata]: -# # return census_tables -# source_metadata_from_catalog(catalog) +@asset() +def source_data_release( + context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] +) -> list[SourceDataRelease]: + source_data_releases = [] + for geo_metadata, _, _ in geometry: + # TODO: update with dates from config + source_data_release: SourceDataRelease = SourceDataRelease( + name="Census 2021", + date_published=date(2014, 2, 27), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus", + data_publisher_id=publisher.id, + description="TBC", + geometry_metadata_id=geo_metadata.id, + ) + source_data_releases.append(source_data_release) + return source_data_releases -# @asset -# def source_tables() -> pd.DataFrame: -# return ni.catalog() -# @asset -# def 
derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: -# # return ni.catalog() +@asset(partitions_def=dataset_node_partition) +def source_mmd( + context: AssetExecutionContext, + catalog: pd.DataFrame, + source_data_release: list[SourceDataRelease], +) -> list[MetricMetadata]: + source_metadata_from_catalog(catalog) + + +@asset +def derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: + return ni.catalog() diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index 96ba6f8..cf7b589 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -7,6 +7,7 @@ "be/country_metadata", "be/data_publisher", "be/source_data_release_munip", + "uk-ni/source_data_release", ], io_manager_key="metadata_io_manager", prefix="metadata", @@ -19,6 +20,7 @@ geometry_factory = CloudAssetSensor( asset_names_to_monitor=[ "be/geometry", + "uk-ni/geometry", ], io_manager_key="geometry_io_manager", prefix="geometry", diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 5bf6dd1..97b008e 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -33,13 +33,17 @@ def load_input(self, _context: InputContext) -> pd.DataFrame: class MetadataIOManager(PopgetterIOManager): def get_output_filename( - self, obj: CountryMetadata | DataPublisher | SourceDataRelease + self, + obj: CountryMetadata + | DataPublisher + | SourceDataRelease + | list[SourceDataRelease], ) -> str: if isinstance(obj, CountryMetadata): return "country_metadata.parquet" if isinstance(obj, DataPublisher): return "data_publishers.parquet" - if isinstance(obj, SourceDataRelease): + if isinstance(obj, SourceDataRelease) or isinstance(obj, list): return "source_data_releases.parquet" err_msg = "This IO manager only accepts CountryMetadata, DataPublisher, and SourceDataRelease" @@ -57,11 +61,16 @@ def get_full_path( def handle_output( self, context: OutputContext, - obj: CountryMetadata | DataPublisher | SourceDataRelease, + obj: CountryMetadata + | DataPublisher + | SourceDataRelease + | list[SourceDataRelease], ): + if not isinstance(obj, list): + obj = [obj] full_path = self.get_full_path(context, obj) context.add_output_metadata(metadata={"parquet_path": str(full_path)}) - self.handle_df(context, metadata_to_dataframe([obj]), full_path) + self.handle_df(context, metadata_to_dataframe(obj), full_path) class GeoIOManager(PopgetterIOManager): From bf3ea60dd04fbd7e055ee3270d18934a4670c80c Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 16 May 2024 18:12:05 +0100 Subject: [PATCH 03/39] Adding transformations to match updated Belgium --- python/popgetter/assets/ni/__init__.py | 292 +++++++++++++++++++++---- 1 file changed, 247 insertions(+), 45 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 3975c8a..651fe7e 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -1,8 +1,10 @@ from __future__ import annotations import io +from collections.abc import Callable from dataclasses import dataclass from datetime import date +from functools import reduce import geopandas as gpd import matplotlib.pyplot as plt @@ -10,9 +12,12 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetExecutionContext, + AssetIn, DynamicPartitionsDefinition, + IdentityPartitionMapping, MetadataValue, + SpecificPartitionsPartitionMapping, + 
StaticPartitionsDefinition, asset, ) from icecream import ic @@ -24,6 +29,7 @@ GeometryMetadata, MetricMetadata, SourceDataRelease, + metadata_to_dataframe, ) from popgetter.utils import add_metadata, markdown_from_plot @@ -158,11 +164,13 @@ def add_resolution(s: str, geo_level: str) -> str: query_params = s_split[1].split("&") if query_params[0].startswith("d="): query_params = "&".join( - [query_params[0], f"v={geo_level}", *query_params[1:]] + [query_params[0], f"v={geo_level}", *query_params[2:]] ) else: - query_params = "&".join([f"v={geo_level}", *query_params[:]]) - return "?".join([s_split[0], query_params]) + query_params = "&".join([f"v={geo_level}", *query_params[1:]]) + out_url = "?".join([s_split[0], query_params]) + ic(out_url) + return out_url for node_url, node_items in nodes.items(): for geo_level in self.geo_levels: @@ -202,7 +210,11 @@ def add_resolution(s: str, geo_level: str) -> str: add_metadata(context, catalog_df, "Catalog") return catalog_df - def census_tables(self, context, catalog: pd.DataFrame, partition) -> pd.DataFrame: + def census_tables( + self, context, catalog: pd.DataFrame, partition: str + ) -> pd.DataFrame: + ic(partition) + ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) url = catalog.loc[ catalog["partition_key"].eq(partition), "source_download_url" ].iloc[0] @@ -250,35 +262,6 @@ def census_tables(context, catalog) -> pd.DataFrame: return census_table -@asset(partitions_def=dataset_node_partition) -def source_tables(context, census_tables: pd.DataFrame) -> pd.DataFrame: - return census_tables - - -def source_metadata_from_catalog( - catalog: pd.DataFrame, parition_key: str, source_data_release: SourceDataRelease -) -> MetricMetadata: - catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] - return MetricMetadata( - human_readable_name=catalog_row["human_readable_name"], - source_download_url=catalog_row["source_download_url"], - source_archive_file_path=catalog_row["source_archive_file_path"], - source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=source_data_release.id, - # TODO - this is a placeholder - parent_metric_id="unknown_at_this_stage", - potential_denominator_ids=None, - parquet_margin_of_error_file=None, - parquet_margin_of_error_column=None, - parquet_column_name=catalog_row["source_column"], - # TODO - this is a placeholder - metric_parquet_file_url="unknown_at_this_stage", - hxl_tag=catalog_row["hxltag"], - description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], - ) - - @asset # @asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: @@ -340,7 +323,7 @@ def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataF @asset() def source_data_release( context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] -) -> list[SourceDataRelease]: +) -> SourceDataRelease: source_data_releases = [] for geo_metadata, _, _ in geometry: # TODO: update with dates from config @@ -358,18 +341,237 @@ def source_data_release( geometry_metadata_id=geo_metadata.id, ) source_data_releases.append(source_data_release) - return source_data_releases + # TODO: update for multiple source data releases + return source_data_releases[0] -@asset(partitions_def=dataset_node_partition) -def source_mmd( - context: AssetExecutionContext, +# @asset(partitions_def=dataset_node_partition) +# def source_mmd( +# context, +# 
catalog: pd.DataFrame, +# source_data_release: SourceDataRelease, +# ) -> list[MetricMetadata]: +# source_metadata_from_catalog(catalog) + + +# TODO: check if this is a simpler approach? +# @asset(partitions_def=dataset_node_partition) +# def source_tables( +# context: AssetExecutionContext, census_tables: pd.DataFrame +# ) -> pd.DataFrame: +# if context.partition_key not in DERIVED_COLUMN_SPECIFICATIONS.keys(): +# raise ValueError(f"Specified partition '{context.partition_key}' not handled") +# return census_tables + + +@dataclass +class DerivedColumn: + hxltag: str + filter_func: Callable[[pd.DataFrame], pd.DataFrame] + output_column_name: str + human_readable_name: str + + +# The keys of this dict are the nodes (i.e. partition keys). The values are a +# list of all columns of data derived from this node. +age_code = "Age Code" +sex_code = "Sex Code" +DERIVED_COLUMN_SPECIFICATIONS: dict[str, (str, list[DerivedColumn])] = { # type: ignore + "DZ21/MS-A09": ( + "Census 2021 Data Zone Code", + [ + DerivedColumn( + hxltag="#population+children+age5_17", + filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"), + output_column_name="children_5_17", + human_readable_name="Children aged 5 to 17", + ), + DerivedColumn( + hxltag="#population+infants+age0_4", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), + output_column_name="infants_0_4", + human_readable_name="Infants aged 0 to 4", + ), + DerivedColumn( + hxltag="#population+children+age0_17", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), + output_column_name="children_0_17", + human_readable_name="Children aged 0 to 17", + ), + DerivedColumn( + hxltag="#population+adults+f", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_code} == 'F'" + ), + output_column_name="adults_f", + human_readable_name="Female adults", + ), + DerivedColumn( + hxltag="#population+adults+m", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_code} == 'M'" + ), + output_column_name="adults_m", + human_readable_name="Male adults", + ), + DerivedColumn( + hxltag="#population+adults", + filter_func=lambda df: df.query(f"{age_code} >= 18"), + output_column_name="adults", + human_readable_name="Adults", + ), + DerivedColumn( + hxltag="#population+ind", + filter_func=lambda df: df, + output_column_name="individuals", + human_readable_name="Total individuals", + ), + ], + ) +} +_needed_dataset_nodes = list(set([key for key in DERIVED_COLUMN_SPECIFICATIONS.keys()])) +needed_dataset_mapping = SpecificPartitionsPartitionMapping(_needed_dataset_nodes) +needed_dataset_partition = StaticPartitionsDefinition(_needed_dataset_nodes) + + +def census_table_metadata( + catalog_row: dict[str, str], source_data_release: SourceDataRelease +) -> MetricMetadata: + return MetricMetadata( + human_readable_name=catalog_row["human_readable_name"], + source_download_url=catalog_row["source_download_url"], + source_archive_file_path=catalog_row["source_archive_file_path"], + source_documentation_url=catalog_row["source_documentation_url"], + source_data_release_id=source_data_release.id, + # TODO - this is a placeholder + parent_metric_id="unknown_at_this_stage", + potential_denominator_ids=None, + parquet_margin_of_error_file=None, + parquet_margin_of_error_column=None, + parquet_column_name=catalog_row["source_column"], + # TODO - this is a placeholder + metric_parquet_path="unknown_at_this_stage", + hxl_tag=catalog_row["hxltag"], + description=catalog_row["description"], + 
source_metric_id=catalog_row["hxltag"], + ) + + +@asset( + ins={ + "census_tables": AssetIn(partition_mapping=needed_dataset_mapping), + "catalog": AssetIn(), + "source_data_release": AssetIn(), + }, + partitions_def=dataset_node_partition, +) +def source_metrics_by_partition( + context, + census_tables: dict[str, pd.DataFrame], catalog: pd.DataFrame, - source_data_release: list[SourceDataRelease], -) -> list[MetricMetadata]: - source_metadata_from_catalog(catalog) + # TODO: generalise to list or dict of SourceDataReleases as there may be + # tables in here that are not at the same release level + source_data_release: SourceDataRelease, + # TODO: return an intermediate type instead of MetricMetadata +) -> tuple[MetricMetadata, pd.DataFrame]: + input_partition_keys = context.asset_partition_keys_for_input( + input_name="census_tables" + ) + output_partition_key = context.partition_key + + if output_partition_key not in input_partition_keys: + skip_reason = f"Skipping as requested partition {output_partition_key} is not part of the 'needed' partitions {input_partition_keys}" + context.log.warning(skip_reason) + raise RuntimeError(skip_reason) + + try: + result_df = census_tables[output_partition_key] + except KeyError: + err_msg = ( + f"Partition key {output_partition_key} not found in census_tables\n" + f"Available keys are {census_tables.keys()}" + ) + raise ValueError(err_msg) from None + catalog_row = catalog[catalog["node"] == output_partition_key].to_dict( + orient="records" + )[0] -@asset -def derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: - return ni.catalog() + # catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] + result_mmd = census_table_metadata(catalog_row, source_data_release) + + return result_mmd, result_df + + +@asset( + partitions_def=dataset_node_partition, + ins={ + "source_metrics_by_partition": AssetIn( + partition_mapping=IdentityPartitionMapping() + ), + }, +) +def derived_metrics_by_partition( + context, + source_metrics_by_partition: tuple[MetricMetadata, pd.DataFrame], +) -> tuple[list[MetricMetadata], pd.DataFrame]: + node = context.partition_key + + source_mmd, source_table = source_metrics_by_partition + source_column = source_mmd.parquet_column_name + assert source_column in source_table.columns + assert len(source_table) > 0 + + try: + geo_id_col_name, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[node] + except KeyError: + skip_reason = ( + f"Skipping as no derived columns are to be created for node {node}" + ) + context.log.warning(skip_reason) + raise RuntimeError(skip_reason) + + # Rename the geoID column to GEO_ID + source_table = source_table.rename(columns={geo_id_col_name: "GEO_ID"}) + + derived_metrics: list[pd.DataFrame] = [] + derived_mmd: list[MetricMetadata] = [] + + parquet_file_name = "".join(c for c in node if c.isalnum()) + ".parquet" + + for metric_spec in metric_specs: + new_table = ( + source_table.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + + joined_metrics = reduce( + 
lambda left, right: left.merge( + right, on="GEO_ID", how="inner", validate="one_to_one" + ), + derived_metrics, + ) + + context.add_output_metadata( + metadata={ + "metadata_preview": MetadataValue.md( + metadata_to_dataframe(derived_mmd).head().to_markdown() + ), + "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + "metrics_preview": MetadataValue.md(joined_metrics.head().to_markdown()), + }, + ) + + return derived_mmd, joined_metrics From eb883babdf0caccbd449ad1e2624e3b9aa0f4cd1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 20 May 2024 11:18:29 +0100 Subject: [PATCH 04/39] Complete and revise DAG for NI, metadata for source table --- python/popgetter/assets/ni/__init__.py | 185 ++++++++++--------------- 1 file changed, 75 insertions(+), 110 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 651fe7e..512442f 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -12,12 +12,8 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetIn, DynamicPartitionsDefinition, - IdentityPartitionMapping, MetadataValue, - SpecificPartitionsPartitionMapping, - StaticPartitionsDefinition, asset, ) from icecream import ic @@ -98,24 +94,24 @@ def get_nodes_and_links() -> dict[str, dict[str, str]]: for url in urls: soup = BeautifulSoup(requests.get(url).content, features="lxml") nodes[url] = { - "table_url": list( - set( + "table_url": next( + iter( [ "".join([SCHEME_AND_HOST, link.get("href")]) for link in soup.find_all("a") if "table.csv?" in link.get("href") ] ) - )[0], - "metadata_url": list( - set( + ), + "metadata_url": next( + iter( [ "".join([SCHEME_AND_HOST, link.get("href")]) for link in soup.find_all("a") if "table.csv-metadata" in link.get("href") ] ) - )[0], + ), } return nodes @@ -211,7 +207,7 @@ def add_resolution(s: str, geo_level: str) -> str: return catalog_df def census_tables( - self, context, catalog: pd.DataFrame, partition: str + self, _context, catalog: pd.DataFrame, partition: str ) -> pd.DataFrame: ic(partition) ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) @@ -321,10 +317,10 @@ def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataF @asset() -def source_data_release( - context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] -) -> SourceDataRelease: - source_data_releases = [] +def source_data_releases( + _context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] +) -> dict[str, SourceDataRelease]: + source_data_releases = {} for geo_metadata, _, _ in geometry: # TODO: update with dates from config source_data_release: SourceDataRelease = SourceDataRelease( @@ -340,28 +336,8 @@ def source_data_release( description="TBC", geometry_metadata_id=geo_metadata.id, ) - source_data_releases.append(source_data_release) - # TODO: update for multiple source data releases - return source_data_releases[0] - - -# @asset(partitions_def=dataset_node_partition) -# def source_mmd( -# context, -# catalog: pd.DataFrame, -# source_data_release: SourceDataRelease, -# ) -> list[MetricMetadata]: -# source_metadata_from_catalog(catalog) - - -# TODO: check if this is a simpler approach? 
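The `DerivedColumn` specifications that follow pair a pandas query with an output column, and `derived_metrics` later applies each filter and aggregates per `GEO_ID`. Note the backticks in `age_code` and `sex_label` below: `DataFrame.query` needs them to reference column names containing spaces. A toy, self-contained run of that filter-then-aggregate pattern, with invented data:

```python
# Toy run of the DerivedColumn pattern: filter the raw table with
# DataFrame.query (backticks quote column names containing spaces), then
# sum counts per GEO_ID. All values here are invented.
import pandas as pd

raw = pd.DataFrame(
    {
        "GEO_ID": ["N001", "N001", "N002", "N002"],
        "Age Code": [3, 40, 10, 70],
        "Sex Label": ["Female", "Female", "Male", "Female"],
        "Count": [5, 7, 11, 13],
    }
)

adults_f = (
    raw.query("`Age Code` >= 18 and `Sex Label` == 'Female'")
    .groupby("GEO_ID", as_index=True)
    .sum(numeric_only=True)
    .rename(columns={"Count": "adults_f"})
    .filter(items=["adults_f"])
)
print(adults_f)  # adults_f per GEO_ID: N001 -> 7, N002 -> 13
```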
-# @asset(partitions_def=dataset_node_partition) -# def source_tables( -# context: AssetExecutionContext, census_tables: pd.DataFrame -# ) -> pd.DataFrame: -# if context.partition_key not in DERIVED_COLUMN_SPECIFICATIONS.keys(): -# raise ValueError(f"Specified partition '{context.partition_key}' not handled") -# return census_tables + source_data_releases[geo_metadata.level] = source_data_release + return source_data_releases @dataclass @@ -372,13 +348,28 @@ class DerivedColumn: human_readable_name: str +@dataclass +class SourceTable: + hxltag: str + geo_level: str + geo_column: str + source_column: str + + # The keys of this dict are the nodes (i.e. partition keys). The values are a # list of all columns of data derived from this node. -age_code = "Age Code" -sex_code = "Sex Code" -DERIVED_COLUMN_SPECIFICATIONS: dict[str, (str, list[DerivedColumn])] = { # type: ignore +age_code = "`Age Code`" +sex_label = "`Sex Label`" + +# Config for each partition to be derived +DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { "DZ21/MS-A09": ( - "Census 2021 Data Zone Code", + SourceTable( + hxltag="#population+dz21+2021", + geo_level="DZ21", + geo_column="Census 2021 Data Zone Code", + source_column="Count", + ), [ DerivedColumn( hxltag="#population+children+age5_17", @@ -401,7 +392,7 @@ class DerivedColumn: DerivedColumn( hxltag="#population+adults+f", filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_code} == 'F'" + f"{age_code} >= 18 and {sex_label} == 'Female'" ), output_column_name="adults_f", human_readable_name="Female adults", @@ -409,7 +400,7 @@ class DerivedColumn: DerivedColumn( hxltag="#population+adults+m", filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_code} == 'M'" + f"{age_code} >= 18 and {sex_label} == 'Male'" ), output_column_name="adults_m", human_readable_name="Male adults", @@ -429,116 +420,90 @@ class DerivedColumn: ], ) } -_needed_dataset_nodes = list(set([key for key in DERIVED_COLUMN_SPECIFICATIONS.keys()])) -needed_dataset_mapping = SpecificPartitionsPartitionMapping(_needed_dataset_nodes) -needed_dataset_partition = StaticPartitionsDefinition(_needed_dataset_nodes) def census_table_metadata( - catalog_row: dict[str, str], source_data_release: SourceDataRelease + catalog_row: dict[str, str], + source_table: SourceTable, + source_data_releases: dict[str, SourceDataRelease], ) -> MetricMetadata: return MetricMetadata( human_readable_name=catalog_row["human_readable_name"], source_download_url=catalog_row["source_download_url"], source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=source_data_release.id, + source_data_release_id=source_data_releases[source_table.geo_level].id, # TODO - this is a placeholder parent_metric_id="unknown_at_this_stage", potential_denominator_ids=None, parquet_margin_of_error_file=None, parquet_margin_of_error_column=None, - parquet_column_name=catalog_row["source_column"], + # parquet_column_name=catalog_row["source_column"], + parquet_column_name=source_table.source_column, # TODO - this is a placeholder metric_parquet_path="unknown_at_this_stage", - hxl_tag=catalog_row["hxltag"], + hxl_tag=source_table.hxltag, description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], + source_metric_id=source_table.hxltag, ) -@asset( - ins={ - "census_tables": AssetIn(partition_mapping=needed_dataset_mapping), - "catalog": AssetIn(), - "source_data_release": AssetIn(), - }, - 
partitions_def=dataset_node_partition, -) -def source_metrics_by_partition( +@asset(partitions_def=dataset_node_partition) +def source_metric_metadata( context, - census_tables: dict[str, pd.DataFrame], catalog: pd.DataFrame, - # TODO: generalise to list or dict of SourceDataReleases as there may be - # tables in here that are not at the same release level - source_data_release: SourceDataRelease, - # TODO: return an intermediate type instead of MetricMetadata -) -> tuple[MetricMetadata, pd.DataFrame]: - input_partition_keys = context.asset_partition_keys_for_input( - input_name="census_tables" - ) - output_partition_key = context.partition_key - - if output_partition_key not in input_partition_keys: - skip_reason = f"Skipping as requested partition {output_partition_key} is not part of the 'needed' partitions {input_partition_keys}" + source_data_releases: dict[str, SourceDataRelease], +) -> MetricMetadata: + partition_key = context.partition_key + if partition_key not in DERIVED_COLUMN_SPECIFICATIONS: + skip_reason = ( + f"Skipping as requested partition {partition_key} is configured " + f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}" + ) context.log.warning(skip_reason) raise RuntimeError(skip_reason) - try: - result_df = census_tables[output_partition_key] - except KeyError: - err_msg = ( - f"Partition key {output_partition_key} not found in census_tables\n" - f"Available keys are {census_tables.keys()}" - ) - raise ValueError(err_msg) from None - - catalog_row = catalog[catalog["node"] == output_partition_key].to_dict( + catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict( orient="records" )[0] - # catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] - result_mmd = census_table_metadata(catalog_row, source_data_release) - - return result_mmd, result_df + return census_table_metadata( + catalog_row, + DERIVED_COLUMN_SPECIFICATIONS[partition_key][0], + source_data_releases, + ) -@asset( - partitions_def=dataset_node_partition, - ins={ - "source_metrics_by_partition": AssetIn( - partition_mapping=IdentityPartitionMapping() - ), - }, -) -def derived_metrics_by_partition( +@asset(partitions_def=dataset_node_partition) +def derived_metrics( context, - source_metrics_by_partition: tuple[MetricMetadata, pd.DataFrame], + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, ) -> tuple[list[MetricMetadata], pd.DataFrame]: - node = context.partition_key - - source_mmd, source_table = source_metrics_by_partition + partition_key = context.partition_key + source_table = census_tables + source_mmd = source_metric_metadata source_column = source_mmd.parquet_column_name assert source_column in source_table.columns assert len(source_table) > 0 try: - geo_id_col_name, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[node] - except KeyError: + source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[ + partition_key + ] + except KeyError as err: skip_reason = ( - f"Skipping as no derived columns are to be created for node {node}" + f"Skipping as no derived columns are to be created for node {partition_key}" ) context.log.warning(skip_reason) - raise RuntimeError(skip_reason) - - # Rename the geoID column to GEO_ID - source_table = source_table.rename(columns={geo_id_col_name: "GEO_ID"}) + raise RuntimeError(skip_reason) from err + source_table = source_table.rename( + columns={source_table_metadata.geo_column: "GEO_ID"} + ) derived_metrics: list[pd.DataFrame] = [] derived_mmd: list[MetricMetadata] = [] - - parquet_file_name 
= "".join(c for c in node if c.isalnum()) + ".parquet" - + parquet_file_name = "".join(c for c in partition_key if c.isalnum()) + ".parquet" for metric_spec in metric_specs: new_table = ( source_table.pipe(metric_spec.filter_func) From 6374fb4ac22da9f5093a4c564ec910cad421b7d7 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 20 May 2024 11:33:47 +0100 Subject: [PATCH 05/39] Add remaining required assets, update cloud outputs --- python/popgetter/assets/ni/__init__.py | 44 ++++++++++++++++++---- python/popgetter/cloud_outputs/__init__.py | 4 ++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 512442f..31e6474 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -220,6 +220,12 @@ def source_table(self) -> pd.DataFrame: return pd.DataFrame() +key_prefix = "uk-ni" + +ni = NorthernIreland() + +dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + country: CountryMetadata = CountryMetadata( name_short_en="Northern Ireland", name_official="Northern Ireland", @@ -236,11 +242,14 @@ def source_table(self) -> pd.DataFrame: ) -key_prefix = "uk-ni" +@asset +def country_metadata() -> CountryMetadata: + return country -ni = NorthernIreland() -dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) +@asset +def data_publisher() -> DataPublisher: + return publisher @asset @@ -356,12 +365,9 @@ class SourceTable: source_column: str -# The keys of this dict are the nodes (i.e. partition keys). The values are a -# list of all columns of data derived from this node. +# Config for each partition to be derived age_code = "`Age Code`" sex_label = "`Sex Label`" - -# Config for each partition to be derived DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { "DZ21/MS-A09": ( SourceTable( @@ -540,3 +546,27 @@ def derived_metrics( ) return derived_mmd, joined_metrics + + +@asset(partitions_def=dataset_node_partition) +def metrics( + context, derived_metrics: tuple[list[MetricMetadata], pd.DataFrame] +) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + """ + This asset exists solely to aggregate all the derived tables into one + single unpartitioned asset, which the downstream publishing tasks can use. + + Right now it is a bit boring because it only relies on one partition, but + it could be extended when we have more data products. 
+ """ + mmds, table = derived_metrics + filepath = mmds[0].metric_parquet_path + + context.add_output_metadata( + metadata={ + "num_metrics": len(mmds), + "num_parquets": 1, + }, + ) + + return [(filepath, mmds, table)] diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index a480713..ef2ad42 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -7,6 +7,9 @@ "be/country_metadata", "be/data_publisher", "be/source_data_releases", + "uk-ni/country_metadata", + "uk-ni/data_publisher", + "uk-ni/source_data_releases", ], io_manager_key="metadata_io_manager", prefix="metadata", @@ -32,6 +35,7 @@ metrics_factory = CloudAssetSensor( asset_names_to_monitor=[ "be/metrics", + "uk-ni/metrics", ], io_manager_key="metrics_io_manager", prefix="metrics", From 5ad8d470af0275accfb0438723cc2a140013e497 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 22 May 2024 11:45:40 +0100 Subject: [PATCH 06/39] Fix metrics asset --- python/popgetter/assets/ni/__init__.py | 50 +++++++++++++++++++------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 31e6474..78f2103 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -12,8 +12,10 @@ import requests from bs4 import BeautifulSoup from dagster import ( + AssetIn, DynamicPartitionsDefinition, MetadataValue, + SpecificPartitionsPartitionMapping, asset, ) from icecream import ic @@ -237,7 +239,11 @@ def source_table(self) -> pd.DataFrame: publisher: DataPublisher = DataPublisher( name="NISRA", url="https://www.nisra.gov.uk/", - description="The Northern Ireland Statistics and Research Agency (NISRA), which incorporates the General Register Office (GRO), is an executive agency within the Department of Finance (NI) and was established on 1 April 1996.", + description=( + "The Northern Ireland Statistics and Research Agency (NISRA), which " + "incorporates the General Register Office (GRO), is an executive agency " + "within the Department of Finance (NI) and was established on 1 April 1996." 
+ ), countries_of_interest=[country.id], ) @@ -258,7 +264,6 @@ def catalog(context) -> pd.DataFrame: @asset(partitions_def=dataset_node_partition) -# def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: def census_tables(context, catalog) -> pd.DataFrame: census_table = ni.census_tables( context, catalog, context.asset_partition_key_for_output() @@ -268,7 +273,6 @@ def census_tables(context, catalog) -> pd.DataFrame: @asset -# @asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: # TODO: This is almost identical to Belgium so can probably be refactored to common # function with config of releases and languages @@ -548,9 +552,20 @@ def derived_metrics( return derived_mmd, joined_metrics -@asset(partitions_def=dataset_node_partition) +@asset( + ins={ + "derived_metrics": AssetIn( + partition_mapping=SpecificPartitionsPartitionMapping( + list(DERIVED_COLUMN_SPECIFICATIONS.keys()) + ), + ), + }, +) def metrics( - context, derived_metrics: tuple[list[MetricMetadata], pd.DataFrame] + # Note dagster does not seem to allow a union type for `derived_metrics` for + # the cases of one or many partitions + context, + derived_metrics, ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: """ This asset exists solely to aggregate all the derived tables into one @@ -559,14 +574,25 @@ def metrics( Right now it is a bit boring because it only relies on one partition, but it could be extended when we have more data products. """ - mmds, table = derived_metrics - filepath = mmds[0].metric_parquet_path - + if len(DERIVED_COLUMN_SPECIFICATIONS) == 1: + # Make into same type for the case of multiple partitions + derived_metrics_dict: dict[str, tuple[list[MetricMetadata], pd.DataFrame]] = { + next(iter(DERIVED_COLUMN_SPECIFICATIONS.keys())): derived_metrics + } + else: + derived_metrics_dict: dict[ + str, tuple[list[MetricMetadata], pd.DataFrame] + ] = derived_metrics + + # Combine outputs across partitions + outputs = [ + (mmds[0].metric_parquet_path, mmds, table) + for (mmds, table) in derived_metrics_dict.values() + ] context.add_output_metadata( metadata={ - "num_metrics": len(mmds), - "num_parquets": 1, + "num_metrics": sum(len(output[1]) for output in outputs), + "num_parquets": len(outputs), }, ) - - return [(filepath, mmds, table)] + return outputs From 145166dc2f17639b76d838c2305fb9dbb353530b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 22 May 2024 12:10:35 +0100 Subject: [PATCH 07/39] Add super data zones --- python/popgetter/assets/ni/__init__.py | 121 ++++++++++++++----------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 78f2103..507f6d5 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -35,7 +35,6 @@ REQUIRED_TABLES = [ "MS-A09", ] -REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) # TODO REQUIRED_RELEASES = [""] @@ -67,7 +66,14 @@ class NIGeometryLevel: geo_id_column="DZ2021_cd", name_columns={"en": "DZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", - ) + ), + "SDZ21": NIGeometryLevel( + level="SDZ21", + hxl_tag="TBD", + geo_id_column="SDZ2021_cd", + name_columns={"en": "SDZ2021_nm"}, + url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-sdz2021-esri-shapefile.zip", + ), } # Full list of geographies, see metadata: 
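For reference, a minimal sketch of how one of these NIGeometryLevel entries is consumed downstream by the geometry asset (the standalone framing here is illustrative; the asset code above remains authoritative):

    import geopandas as gpd

    level = NI_GEO_LEVELS["SDZ21"]
    # geopandas reads the zipped ESRI shapefile straight from the NISRA URL
    gdf = gpd.read_file(level.url)
    # one row per Super Data Zone, with the id column standardised to "GEO_ID"
    gdf = gdf.dissolve(by=level.geo_id_column).reset_index()
    gdf = gdf.rename(columns={level.geo_id_column: "GEO_ID"})[["geometry", "GEO_ID"]]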
@@ -222,8 +228,6 @@ def source_table(self) -> pd.DataFrame: return pd.DataFrame() -key_prefix = "uk-ni" - ni = NorthernIreland() dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) @@ -276,8 +280,6 @@ def census_tables(context, catalog) -> pd.DataFrame: def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: # TODO: This is almost identical to Belgium so can probably be refactored to common # function with config of releases and languages - level_details = NI_GEO_LEVELS["DZ21"] - geometries_to_return = [] for level_details in NI_GEO_LEVELS.values(): # TODO: get correct values @@ -372,6 +374,53 @@ class SourceTable: # Config for each partition to be derived age_code = "`Age Code`" sex_label = "`Sex Label`" +DERIVED_COLUMNS = [ + DerivedColumn( + hxltag="#population+children+age5_17", + filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"), + output_column_name="children_5_17", + human_readable_name="Children aged 5 to 17", + ), + DerivedColumn( + hxltag="#population+infants+age0_4", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), + output_column_name="infants_0_4", + human_readable_name="Infants aged 0 to 4", + ), + DerivedColumn( + hxltag="#population+children+age0_17", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), + output_column_name="children_0_17", + human_readable_name="Children aged 0 to 17", + ), + DerivedColumn( + hxltag="#population+adults+f", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_label} == 'Female'" + ), + output_column_name="adults_f", + human_readable_name="Female adults", + ), + DerivedColumn( + hxltag="#population+adults+m", + filter_func=lambda df: df.query(f"{age_code} >= 18 and {sex_label} == 'Male'"), + output_column_name="adults_m", + human_readable_name="Male adults", + ), + DerivedColumn( + hxltag="#population+adults", + filter_func=lambda df: df.query(f"{age_code} >= 18"), + output_column_name="adults", + human_readable_name="Adults", + ), + DerivedColumn( + hxltag="#population+ind", + filter_func=lambda df: df, + output_column_name="individuals", + human_readable_name="Total individuals", + ), +] + DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { "DZ21/MS-A09": ( SourceTable( @@ -380,55 +429,17 @@ class SourceTable: geo_column="Census 2021 Data Zone Code", source_column="Count", ), - [ - DerivedColumn( - hxltag="#population+children+age5_17", - filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"), - output_column_name="children_5_17", - human_readable_name="Children aged 5 to 17", - ), - DerivedColumn( - hxltag="#population+infants+age0_4", - filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), - output_column_name="infants_0_4", - human_readable_name="Infants aged 0 to 4", - ), - DerivedColumn( - hxltag="#population+children+age0_17", - filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), - output_column_name="children_0_17", - human_readable_name="Children aged 0 to 17", - ), - DerivedColumn( - hxltag="#population+adults+f", - filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_label} == 'Female'" - ), - output_column_name="adults_f", - human_readable_name="Female adults", - ), - DerivedColumn( - hxltag="#population+adults+m", - filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_label} == 'Male'" - ), - output_column_name="adults_m", - human_readable_name="Male adults", - ), - DerivedColumn( - 
hxltag="#population+adults", - filter_func=lambda df: df.query(f"{age_code} >= 18"), - output_column_name="adults", - human_readable_name="Adults", - ), - DerivedColumn( - hxltag="#population+ind", - filter_func=lambda df: df, - output_column_name="individuals", - human_readable_name="Total individuals", - ), - ], - ) + DERIVED_COLUMNS, + ), + "SDZ21/MS-A09": ( + SourceTable( + hxltag="#population+sdz21+2021", + geo_level="SDZ21", + geo_column="Census 2021 Super Data Zone Code", + source_column="Count", + ), + DERIVED_COLUMNS, + ), } From 7927b16316e83d1e9d240c72b0bb5902e988f3c8 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 22 May 2024 12:33:48 +0100 Subject: [PATCH 08/39] Refactor, fix source data release --- python/popgetter/assets/ni/__init__.py | 40 +++++++++++--------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 507f6d5..a9ece60 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -31,24 +31,6 @@ ) from popgetter.utils import add_metadata, markdown_from_plot -PARTITION_NAME = "uk-ni_dataset_nodes" -REQUIRED_TABLES = [ - "MS-A09", -] - -# TODO -REQUIRED_RELEASES = [""] -# GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" - -# TODO: get these are correct dates -CENSUS_REFERENCE_DATE = date(2021, 3, 21) -CENSUS_COLLECTION_DATE = date(2021, 3, 21) -CENSUS_EXPECT_NEXT_UPDATE = date(2031, 1, 1) -CENSUS_REFERENCE_DATE = date(2021, 3, 21) -# https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: -# 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas -CENSUS_PUBLICATION_DATE = date(2023, 2, 21) - @dataclass class NIGeometryLevel: @@ -59,6 +41,10 @@ class NIGeometryLevel: url: str +# Name for census tables partition +PARTITION_NAME = "uk-ni_dataset_nodes" + +# Geometry levels to include NI_GEO_LEVELS = { "DZ21": NIGeometryLevel( level="DZ21", @@ -76,6 +62,9 @@ class NIGeometryLevel: ), } +# Required tables +REQUIRED_TABLES = ["MS-A09"] + # Full list of geographies, see metadata: # https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE GEO_LEVELS = [ @@ -89,6 +78,10 @@ class NIGeometryLevel: ] +# 2021 census collection date +CENSUS_COLLECTION_DATE = date(2021, 3, 21) + + def get_nodes_and_links() -> dict[str, dict[str, str]]: SCHEME_AND_HOST = "https://build.nisra.gov.uk" urls = [ @@ -337,15 +330,16 @@ def source_data_releases( ) -> dict[str, SourceDataRelease]: source_data_releases = {} for geo_metadata, _, _ in geometry: - # TODO: update with dates from config source_data_release: SourceDataRelease = SourceDataRelease( name="Census 2021", - date_published=date(2014, 2, 27), - reference_period_start=CENSUS_REFERENCE_DATE, - reference_period_end=CENSUS_REFERENCE_DATE, + # https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: + # 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas + date_published=date(2023, 2, 21), + reference_period_start=date(2021, 3, 21), + reference_period_end=date(2021, 3, 21), collection_period_start=CENSUS_COLLECTION_DATE, collection_period_end=CENSUS_COLLECTION_DATE, - expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + expect_next_update=date(2031, 1, 1), url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus", data_publisher_id=publisher.id, description="TBC", From 017ea207232cfb85a2cb78931da12b6acc53032b Mon Sep 17 00:00:00 2001 From: Sam Greenbury 
Date: Wed, 22 May 2024 12:35:11 +0100
Subject: [PATCH 09/39] Fix asset job definition

---
 python/popgetter/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py
index 24b9be0..9a356ed 100644
--- a/python/popgetter/__init__.py
+++ b/python/popgetter/__init__.py
@@ -84,7 +84,7 @@
 job_ni: UnresolvedAssetJobDefinition = define_asset_job(
     name="job_ni",
-    selection=AssetSelection.groups("uk-ni"),
+    selection=AssetSelection.groups("ni"),
     description="Downloads UK data.",
 )

From e400dce4d40a6dedc294c758652e091492ce4d7e Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 22 May 2024 18:09:35 +0100
Subject: [PATCH 10/39] Initial implementation using a country base class

---
 python/popgetter/assets/common.py      | 134 ++++
 python/popgetter/assets/ni/__init__.py | 617 ++++++++++++-------------
 2 files changed, 438 insertions(+), 313 deletions(-)
 create mode 100644 python/popgetter/assets/common.py

diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py
new file mode 100644
index 0000000..0420013
--- /dev/null
+++ b/python/popgetter/assets/common.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import geopandas as gpd
+import pandas as pd
+from dagster import DynamicPartitionsDefinition, asset
+
+from popgetter.metadata import (
+    CountryMetadata,
+    DataPublisher,
+    GeometryMetadata,
+    MetricMetadata,
+    SourceDataRelease,
+)
+
+
+class Country(ABC):
+    dataset_node_partition: DynamicPartitionsDefinition
+
+    def create_catalog(self):
+        @asset()
+        def catalog(context):
+            return self._catalog(context)
+
+        return catalog
+
+    @abstractmethod
+    def _catalog(self, context) -> pd.DataFrame:
+        ...
+
+    def create_country_metadata(self):
+        @asset()
+        def country_metadata(context):
+            return self._country_metadata(context)
+
+        return country_metadata
+
+    @abstractmethod
+    def _country_metadata(self, context) -> CountryMetadata:
+        ...
+
+    def create_data_publisher(self):
+        @asset
+        def data_publisher(context, country_metadata: CountryMetadata):
+            return self._data_publisher(context, country_metadata)
+
+        return data_publisher
+
+    @abstractmethod
+    def _data_publisher(
+        self, context, country_metadata: CountryMetadata
+    ) -> DataPublisher:
+        ...
+
+    def create_geometry(self):
+        @asset()
+        def geometry(context):
+            return self._geometry(context)
+
+        return geometry
+
+    @abstractmethod
+    def _geometry(
+        self, context
+    ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]:
+        ...
+
+    def create_source_data_releases(self):
+        @asset()
+        def source_data_releases(
+            context,
+            geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
+            data_publisher: DataPublisher,
+        ):
+            return self._source_data_releases(context, geometry, data_publisher)
+
+        return source_data_releases
+
+    @abstractmethod
+    def _source_data_releases(
+        self,
+        context,
+        geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
+        data_publisher: DataPublisher,
+    ) -> dict[str, SourceDataRelease]:
+        ...
+
+    def create_census_tables(self):
+        @asset(partitions_def=self.dataset_node_partition)
+        def census_tables(context, catalog):
+            return self._census_tables(context, catalog)
+
+        return census_tables
+
+    @abstractmethod
+    def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame:
+        ...
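The create_* factories above close over self, so each country package can materialise its own independently decorated Dagster assets at module level. A minimal sketch of the intended wiring under that assumption (DemoCountry and its stubbed hooks are illustrative only, not part of the patch):

    from dagster import DynamicPartitionsDefinition
    import pandas as pd

    class DemoCountry(Country):
        dataset_node_partition = DynamicPartitionsDefinition(name="demo_nodes")

        def _catalog(self, context) -> pd.DataFrame:
            # toy catalog with a single partition key
            return pd.DataFrame({"partition_key": ["DEMO/TABLE-1"]})

        # remaining hooks stubbed out so the ABC can be instantiated
        def _country_metadata(self, context): ...
        def _data_publisher(self, context, country_metadata): ...
        def _geometry(self, context): ...
        def _source_data_releases(self, context, geometry, data_publisher): ...
        def _census_tables(self, context, catalog): ...
        def _source_metric_metadata(self, context, catalog, source_data_releases): ...
        def _derived_metrics(self, context, census_tables, source_metric_metadata): ...

    demo = DemoCountry()
    catalog = demo.create_catalog()  # an @asset-decorated function bound to this instance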
+ + def create_source_metric_metadata(self): + @asset(partitions_def=self.dataset_node_partition) + def source_metric_metadata(context, catalog, source_data_releases): + return self._source_metric_metadata(context, catalog, source_data_releases) + + return source_metric_metadata + + @abstractmethod + def _source_metric_metadata( + self, + context, + catalog: pd.DataFrame, + source_data_releases: dict[str, SourceDataRelease], + ) -> MetricMetadata: + ... + + def create_derived_metrics(self): + @asset(partitions_def=self.dataset_node_partition) + def derived_metrics( + context, + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, + ) -> tuple[list[MetricMetadata], pd.DataFrame]: + return self._derived_metrics(context, census_tables, source_metric_metadata) + + return derived_metrics + + @abstractmethod + def _derived_metrics( + self, + context, + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, + ) -> tuple[list[MetricMetadata], pd.DataFrame]: + ... diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index a9ece60..60a9039 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -117,238 +117,6 @@ def get_nodes_and_links() -> dict[str, dict[str, str]]: return nodes -class NorthernIreland(Country): - partition_name: str = PARTITION_NAME - geo_levels: list[str] = GEO_LEVELS - required_tables: list[str] = REQUIRED_TABLES - - # def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: - def catalog(self, context) -> pd.DataFrame: - """ - A catalog for NI can be generated in two ways: - 1. With flexible table builder: - https://build.nisra.gov.uk/en/ - with metadata chosen from: - https://build.nisra.gov.uk/en/metadata - 2. 
Or through enumerating the ready-made tables: - https://build.nisra.gov.uk/en/standard - However, some level of - """ - catalog_summary = { - "node": [], - "partition_key": [], - "table_id": [], - "geo_level": [], - "human_readable_name": [], - "description": [], - "metric_parquet_file_url": [], - "parquet_column_name": [], - "parquet_margin_of_error_column": [], - "parquet_margin_of_error_file": [], - "potential_denominator_ids": [], - "parent_metric_id": [], - "source_data_release_id": [], - "source_download_url": [], - "source_format": [], - "source_archive_file_path": [], - "source_documentation_url": [], - "table_schema": [], - } - nodes = get_nodes_and_links() - - def add_resolution(s: str, geo_level: str) -> str: - s_split = s.split("?") - query_params = s_split[1].split("&") - if query_params[0].startswith("d="): - query_params = "&".join( - [query_params[0], f"v={geo_level}", *query_params[2:]] - ) - else: - query_params = "&".join([f"v={geo_level}", *query_params[1:]]) - out_url = "?".join([s_split[0], query_params]) - ic(out_url) - return out_url - - for node_url, node_items in nodes.items(): - for geo_level in self.geo_levels: - metadata = requests.get(node_items["metadata_url"]).json() - table_id = metadata["dc:title"].split(":")[0] - # Skip if not required - if table_id not in self.required_tables: - continue - - catalog_summary["node"].append(node_url) - catalog_summary["table_id"].append(table_id) - catalog_summary["geo_level"].append(geo_level) - catalog_summary["partition_key"].append(f"{geo_level}/{table_id}") - catalog_summary["human_readable_name"].append(metadata["dc:title"]) - catalog_summary["description"].append(metadata["dc:description"]) - catalog_summary["metric_parquet_file_url"].append(None) - catalog_summary["parquet_column_name"].append(None) - catalog_summary["parquet_margin_of_error_column"].append(None) - catalog_summary["parquet_margin_of_error_file"].append(None) - catalog_summary["potential_denominator_ids"].append(None) - catalog_summary["parent_metric_id"].append(None) - catalog_summary["source_data_release_id"].append(None) - catalog_summary["source_download_url"].append( - add_resolution(metadata["url"], geo_level) - ) - catalog_summary["source_format"].append(None) - catalog_summary["source_archive_file_path"].append(None) - catalog_summary["source_documentation_url"].append(node_url) - catalog_summary["table_schema"].append(metadata["tableSchema"]) - - catalog_df = pd.DataFrame.from_records(catalog_summary) - context.instance.add_dynamic_partitions( - partitions_def_name=self.partition_name, - partition_keys=catalog_df["partition_key"].to_list(), - ) - - add_metadata(context, catalog_df, "Catalog") - return catalog_df - - def census_tables( - self, _context, catalog: pd.DataFrame, partition: str - ) -> pd.DataFrame: - ic(partition) - ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) - url = catalog.loc[ - catalog["partition_key"].eq(partition), "source_download_url" - ].iloc[0] - return pd.read_csv(io.BytesIO(requests.get(url).content), encoding="utf8") - - def source_table(self) -> pd.DataFrame: - return pd.DataFrame() - - -ni = NorthernIreland() - -dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) - -country: CountryMetadata = CountryMetadata( - name_short_en="Northern Ireland", - name_official="Northern Ireland", - iso3="GBR", - iso2="GB", - iso3166_2="GB-NIR", -) - -publisher: DataPublisher = DataPublisher( - name="NISRA", - url="https://www.nisra.gov.uk/", - description=( - "The Northern 
Ireland Statistics and Research Agency (NISRA), which " - "incorporates the General Register Office (GRO), is an executive agency " - "within the Department of Finance (NI) and was established on 1 April 1996." - ), - countries_of_interest=[country.id], -) - - -@asset -def country_metadata() -> CountryMetadata: - return country - - -@asset -def data_publisher() -> DataPublisher: - return publisher - - -@asset -def catalog(context) -> pd.DataFrame: - return ni.catalog(context) - - -@asset(partitions_def=dataset_node_partition) -def census_tables(context, catalog) -> pd.DataFrame: - census_table = ni.census_tables( - context, catalog, context.asset_partition_key_for_output() - ) - add_metadata(context, census_table, title=context.asset_partition_key_for_output()) - return census_table - - -@asset -def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: - # TODO: This is almost identical to Belgium so can probably be refactored to common - # function with config of releases and languages - geometries_to_return = [] - for level_details in NI_GEO_LEVELS.values(): - # TODO: get correct values - geometry_metadata = GeometryMetadata( - validity_period_start=CENSUS_COLLECTION_DATE, - validity_period_end=CENSUS_COLLECTION_DATE, - level=level_details.level, - hxl_tag=level_details.hxl_tag, - ) - region_geometries_raw = ( - gpd.read_file(level_details.url) - .dissolve(by=level_details.geo_id_column) - .reset_index() - ) - context.log.debug(ic(region_geometries_raw.head())) - region_geometries = region_geometries_raw.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] - region_names = ( - region_geometries_raw.rename( - columns={ - level_details.geo_id_column: "GEO_ID", - level_details.name_columns["en"]: "en", - } - ) - .loc[:, ["GEO_ID", "en"]] - .drop_duplicates() - ) - geometries_to_return.append( - (geometry_metadata, region_geometries, region_names) - ) - - # Add output metadata - first_metadata, first_gdf, first_names = geometries_to_return[0] - first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") - ax = first_joined_gdf.plot(column="en", legend=False) - ax.set_title(f"NI 2023 {first_metadata.level}") - md_plot = markdown_from_plot(plt) - context.add_output_metadata( - metadata={ - "all_geom_levels": MetadataValue.md( - ",".join([metadata.level for metadata, _, _ in geometries_to_return]) - ), - "first_geometry_plot": MetadataValue.md(md_plot), - "first_names_preview": MetadataValue.md(first_names.head().to_markdown()), - } - ) - - return geometries_to_return - - -@asset() -def source_data_releases( - _context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] -) -> dict[str, SourceDataRelease]: - source_data_releases = {} - for geo_metadata, _, _ in geometry: - source_data_release: SourceDataRelease = SourceDataRelease( - name="Census 2021", - # https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: - # 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas - date_published=date(2023, 2, 21), - reference_period_start=date(2021, 3, 21), - reference_period_end=date(2021, 3, 21), - collection_period_start=CENSUS_COLLECTION_DATE, - collection_period_end=CENSUS_COLLECTION_DATE, - expect_next_update=date(2031, 1, 1), - url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus", - data_publisher_id=publisher.id, - description="TBC", - geometry_metadata_id=geo_metadata.id, - ) - source_data_releases[geo_metadata.level] = source_data_release - return 
source_data_releases - - @dataclass class DerivedColumn: hxltag: str @@ -463,98 +231,321 @@ def census_table_metadata( ) -@asset(partitions_def=dataset_node_partition) -def source_metric_metadata( - context, - catalog: pd.DataFrame, - source_data_releases: dict[str, SourceDataRelease], -) -> MetricMetadata: - partition_key = context.partition_key - if partition_key not in DERIVED_COLUMN_SPECIFICATIONS: - skip_reason = ( - f"Skipping as requested partition {partition_key} is configured " - f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}" +class NorthernIreland(Country): + partition_name: str = PARTITION_NAME + geo_levels: list[str] = GEO_LEVELS + required_tables: list[str] = REQUIRED_TABLES + dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + + def _country_metadata(self, _context) -> CountryMetadata: + return CountryMetadata( + name_short_en="Northern Ireland", + name_official="Northern Ireland", + iso3="GBR", + iso2="GB", + iso3166_2="GB-NIR", ) - context.log.warning(skip_reason) - raise RuntimeError(skip_reason) - catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict( - orient="records" - )[0] + def _data_publisher( + self, _context, country_metadata: CountryMetadata + ) -> DataPublisher: + return DataPublisher( + name="NISRA", + url="https://www.nisra.gov.uk/", + description=( + "The Northern Ireland Statistics and Research Agency (NISRA), which " + "incorporates the General Register Office (GRO), is an executive agency " + "within the Department of Finance (NI) and was established on 1 April 1996." + ), + countries_of_interest=[country_metadata.id], + ) - return census_table_metadata( - catalog_row, - DERIVED_COLUMN_SPECIFICATIONS[partition_key][0], - source_data_releases, - ) + def _catalog(self, context) -> pd.DataFrame: + """ + A catalog for NI can be generated in two ways: + 1. With flexible table builder: + https://build.nisra.gov.uk/en/ + with metadata chosen from: + https://build.nisra.gov.uk/en/metadata + 2. 
Or through enumerating the ready-made tables: + https://build.nisra.gov.uk/en/standard + However, some level of + """ + catalog_summary = { + "node": [], + "partition_key": [], + "table_id": [], + "geo_level": [], + "human_readable_name": [], + "description": [], + "metric_parquet_file_url": [], + "parquet_column_name": [], + "parquet_margin_of_error_column": [], + "parquet_margin_of_error_file": [], + "potential_denominator_ids": [], + "parent_metric_id": [], + "source_data_release_id": [], + "source_download_url": [], + "source_format": [], + "source_archive_file_path": [], + "source_documentation_url": [], + "table_schema": [], + } + nodes = get_nodes_and_links() + def add_resolution(s: str, geo_level: str) -> str: + s_split = s.split("?") + query_params = s_split[1].split("&") + if query_params[0].startswith("d="): + query_params = "&".join( + [query_params[0], f"v={geo_level}", *query_params[2:]] + ) + else: + query_params = "&".join([f"v={geo_level}", *query_params[1:]]) + out_url = "?".join([s_split[0], query_params]) + ic(out_url) + return out_url -@asset(partitions_def=dataset_node_partition) -def derived_metrics( - context, - census_tables: pd.DataFrame, - source_metric_metadata: MetricMetadata, -) -> tuple[list[MetricMetadata], pd.DataFrame]: - partition_key = context.partition_key - source_table = census_tables - source_mmd = source_metric_metadata - source_column = source_mmd.parquet_column_name - assert source_column in source_table.columns - assert len(source_table) > 0 - - try: - source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[ - partition_key - ] - except KeyError as err: - skip_reason = ( - f"Skipping as no derived columns are to be created for node {partition_key}" + for node_url, node_items in nodes.items(): + for geo_level in self.geo_levels: + metadata = requests.get(node_items["metadata_url"]).json() + table_id = metadata["dc:title"].split(":")[0] + # Skip if not required + if table_id not in self.required_tables: + continue + + catalog_summary["node"].append(node_url) + catalog_summary["table_id"].append(table_id) + catalog_summary["geo_level"].append(geo_level) + catalog_summary["partition_key"].append(f"{geo_level}/{table_id}") + catalog_summary["human_readable_name"].append(metadata["dc:title"]) + catalog_summary["description"].append(metadata["dc:description"]) + catalog_summary["metric_parquet_file_url"].append(None) + catalog_summary["parquet_column_name"].append(None) + catalog_summary["parquet_margin_of_error_column"].append(None) + catalog_summary["parquet_margin_of_error_file"].append(None) + catalog_summary["potential_denominator_ids"].append(None) + catalog_summary["parent_metric_id"].append(None) + catalog_summary["source_data_release_id"].append(None) + catalog_summary["source_download_url"].append( + add_resolution(metadata["url"], geo_level) + ) + catalog_summary["source_format"].append(None) + catalog_summary["source_archive_file_path"].append(None) + catalog_summary["source_documentation_url"].append(node_url) + catalog_summary["table_schema"].append(metadata["tableSchema"]) + + catalog_df = pd.DataFrame.from_records(catalog_summary) + context.instance.add_dynamic_partitions( + partitions_def_name=self.partition_name, + partition_keys=catalog_df["partition_key"].to_list(), ) - context.log.warning(skip_reason) - raise RuntimeError(skip_reason) from err - source_table = source_table.rename( - columns={source_table_metadata.geo_column: "GEO_ID"} - ) - derived_metrics: list[pd.DataFrame] = [] - derived_mmd: list[MetricMetadata] = [] 
- parquet_file_name = "".join(c for c in partition_key if c.isalnum()) + ".parquet" - for metric_spec in metric_specs: - new_table = ( - source_table.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) - .sum() - .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) + add_metadata(context, catalog_df, "Catalog") + return catalog_df + + def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame: + partition = context.asset_partition_key_for_output() + ic(partition) + ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) + url = catalog.loc[ + catalog["partition_key"].eq(partition), "source_download_url" + ].iloc[0] + census_table = pd.read_csv( + io.BytesIO(requests.get(url).content), encoding="utf8" ) - derived_metrics.append(new_table) - - new_mmd = source_mmd.copy() - new_mmd.parent_metric_id = source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - new_mmd.hxl_tag = metric_spec.hxltag - new_mmd.parquet_column_name = metric_spec.output_column_name - new_mmd.human_readable_name = metric_spec.human_readable_name - derived_mmd.append(new_mmd) - - joined_metrics = reduce( - lambda left, right: left.merge( - right, on="GEO_ID", how="inner", validate="one_to_one" - ), - derived_metrics, - ) + add_metadata( + context, census_table, title=context.asset_partition_key_for_output() + ) + return census_table + + def _geometry( + self, context + ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + # TODO: This is almost identical to Belgium so can probably be refactored to common + # function with config of releases and languages + geometries_to_return = [] + for level_details in NI_GEO_LEVELS.values(): + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=CENSUS_COLLECTION_DATE, + validity_period_end=CENSUS_COLLECTION_DATE, + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + region_geometries_raw = ( + gpd.read_file(level_details.url) + .dissolve(by=level_details.geo_id_column) + .reset_index() + ) + context.log.debug(ic(region_geometries_raw.head())) + region_geometries = region_geometries_raw.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + region_names = ( + region_geometries_raw.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["en"]: "en", + } + ) + .loc[:, ["GEO_ID", "en"]] + .drop_duplicates() + ) + geometries_to_return.append( + (geometry_metadata, region_geometries, region_names) + ) - context.add_output_metadata( - metadata={ - "metadata_preview": MetadataValue.md( - metadata_to_dataframe(derived_mmd).head().to_markdown() + # Add output metadata + first_metadata, first_gdf, first_names = geometries_to_return[0] + first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + ax = first_joined_gdf.plot(column="en", legend=False) + ax.set_title(f"NI 2023 {first_metadata.level}") + md_plot = markdown_from_plot(plt) + context.add_output_metadata( + metadata={ + "all_geom_levels": MetadataValue.md( + ",".join( + [metadata.level for metadata, _, _ in geometries_to_return] + ) + ), + "first_geometry_plot": MetadataValue.md(md_plot), + "first_names_preview": MetadataValue.md( + first_names.head().to_markdown() + ), + } + ) + + return geometries_to_return + + def _source_data_releases( + self, _context, geometry, data_publisher + ) -> dict[str, SourceDataRelease]: + source_data_releases = {} + 
for geo_metadata, _, _ in geometry:
+            source_data_release: SourceDataRelease = SourceDataRelease(
+                name="Census 2021",
+                # https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus:
+                # 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas
+                date_published=date(2023, 2, 21),
+                reference_period_start=date(2021, 3, 21),
+                reference_period_end=date(2021, 3, 21),
+                collection_period_start=CENSUS_COLLECTION_DATE,
+                collection_period_end=CENSUS_COLLECTION_DATE,
+                expect_next_update=date(2031, 1, 1),
+                url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus",
+                data_publisher_id=data_publisher.id,
+                description="TBC",
+                geometry_metadata_id=geo_metadata.id,
+            )
+            source_data_releases[geo_metadata.level] = source_data_release
+        return source_data_releases
+
+    def _source_metric_metadata(
+        self,
+        context,
+        catalog: pd.DataFrame,
+        source_data_releases: dict[str, SourceDataRelease],
+    ) -> MetricMetadata:
+        partition_key = context.partition_key
+        if partition_key not in DERIVED_COLUMN_SPECIFICATIONS:
+            skip_reason = (
+                f"Skipping as requested partition {partition_key} is not configured "
+                f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}"
+            )
+            context.log.warning(skip_reason)
+            raise RuntimeError(skip_reason)
+
+        catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict(
+            orient="records"
+        )[0]
+
+        return census_table_metadata(
+            catalog_row,
+            DERIVED_COLUMN_SPECIFICATIONS[partition_key][0],
+            source_data_releases,
+        )
+
+    def _derived_metrics(
+        self,
+        context,
+        census_tables: pd.DataFrame,
+        source_metric_metadata: MetricMetadata,
+    ) -> tuple[list[MetricMetadata], pd.DataFrame]:
+        partition_key = context.partition_key
+        source_table = census_tables
+        source_mmd = source_metric_metadata
+        source_column = source_mmd.parquet_column_name
+        assert source_column in source_table.columns
+        assert len(source_table) > 0
+
+        try:
+            source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[
+                partition_key
+            ]
+        except KeyError as err:
+            skip_reason = f"Skipping as no derived columns are to be created for node {partition_key}"
+            context.log.warning(skip_reason)
+            raise RuntimeError(skip_reason) from err
+
+        source_table = source_table.rename(
+            columns={source_table_metadata.geo_column: "GEO_ID"}
+        )
+        derived_metrics: list[pd.DataFrame] = []
+        derived_mmd: list[MetricMetadata] = []
+        parquet_file_name = (
+            "".join(c for c in partition_key if c.isalnum()) + ".parquet"
+        )
+        for metric_spec in metric_specs:
+            new_table = (
+                source_table.pipe(metric_spec.filter_func)
+                .groupby(by="GEO_ID", as_index=True)
+                .sum()
+                .rename(columns={source_column: metric_spec.output_column_name})
+                .filter(items=["GEO_ID", metric_spec.output_column_name])
+            )
+            derived_metrics.append(new_table)
+
+            new_mmd = source_mmd.copy()
+            new_mmd.parent_metric_id = source_mmd.source_metric_id
+            new_mmd.metric_parquet_path = parquet_file_name
+            new_mmd.hxl_tag = metric_spec.hxltag
+            new_mmd.parquet_column_name = metric_spec.output_column_name
+            new_mmd.human_readable_name = metric_spec.human_readable_name
+            derived_mmd.append(new_mmd)
+
+        joined_metrics = reduce(
+            lambda left, right: left.merge(
+                right, on="GEO_ID", how="inner", validate="one_to_one"
             ),
-        derived_metrics,
-    )
+            derived_metrics,
+        )
+
+        context.add_output_metadata(
+            metadata={
+                "metadata_preview": MetadataValue.md(
metadata_to_dataframe(derived_mmd).head().to_markdown()
+                ),
+                "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns",
+                "metrics_preview": MetadataValue.md(
+                    joined_metrics.head().to_markdown()
+                ),
+            },
+        )
 
-    return derived_mmd, joined_metrics
+        return derived_mmd, joined_metrics
+
+
+# Assets
+ni = NorthernIreland()
+country_metadata = ni.create_country_metadata()
+data_publisher = ni.create_data_publisher()
+geometry = ni.create_geometry()
+source_data_releases = ni.create_source_data_releases()
+catalog = ni.create_catalog()
+census_tables = ni.create_census_tables()
+source_metric_metadata = ni.create_source_metric_metadata()
+derived_metrics = ni.create_derived_metrics()
 
 
 @asset(

From 83668b90cb8a1c1164e656118407dd885324acf5 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 23 May 2024 11:32:25 +0100
Subject: [PATCH 11/39] Add README for NI

---
 python/popgetter/assets/ni/README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 python/popgetter/assets/ni/README.md

diff --git a/python/popgetter/assets/ni/README.md b/python/popgetter/assets/ni/README.md
new file mode 100644
index 0000000..2faa303
--- /dev/null
+++ b/python/popgetter/assets/ni/README.md
@@ -0,0 +1,17 @@
+# Northern Ireland
+
+## Summary
+
+Census 2021 is available from
+[https://build.nisra.gov.uk](https://build.nisra.gov.uk/en/).
+
+The processing pipeline involves the following steps:
+
+- Gets the corresponding geography files and outputs them in standard
+  geometry formats
+- Generates metadata associated with Northern Ireland and its data releases
+- Generates a catalog by identifying all the tables
+  [available](https://build.nisra.gov.uk/en/standard)
+- Reads table metadata and census tables across the different geography
+  levels (currently only Data Zone 2021 and Super Data Zone 2021)
+- Constructs a set of pre-defined derived metrics

From 69f74d99f034466c98b581f1c92918612a3cc6b1 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 23 May 2024 12:03:57 +0100
Subject: [PATCH 12/39] Add country outputs class

---
 python/popgetter/__init__.py               |  4 ++-
 python/popgetter/assets/common.py          | 16 ++++++++++
 python/popgetter/assets/ni/__init__.py     | 24 ++++++++++-----
 python/popgetter/cloud_outputs/__init__.py | 30 ++++++++++----------
 4 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py
index 9a356ed..2da6302 100644
--- a/python/popgetter/__init__.py
+++ b/python/popgetter/__init__.py
@@ -54,7 +54,9 @@
     *load_assets_from_package_module(assets.us, group_name="us"),
     *load_assets_from_package_module(assets.be, group_name="be"),
     *load_assets_from_package_module(assets.uk, group_name="uk"),
-    *load_assets_from_package_module(assets.ni, group_name="ni", key_prefix="uk-ni"),
+    *load_assets_from_package_module(
+        assets.ni, group_name="ni", key_prefix=assets.ni.ni.key_prefix
+    ),
     *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"),
     *(
         load_assets_from_modules([azure_test], group_name="azure_test")
diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py
index 0420013..daa8ae0 100644
--- a/python/popgetter/assets/common.py
+++ b/python/popgetter/assets/common.py
@@ -15,6 +15,20 @@
 )
 
+class CountryAssetOuputs(ABC):
+    @abstractmethod
+    def get_metadata_asset_keys(self) -> list[str]:
+        ...
+
+    @abstractmethod
+    def get_geo_asset_keys(self) -> list[str]:
+        ...
+
+    @abstractmethod
+    def get_metric_asset_keys(self) -> list[str]:
+        ...
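For concreteness, a sketch of how these asset-key hooks resolve for an implementing country; the uk-ni values below mirror the NorthernIreland implementation later in this patch:

    ni = NorthernIreland()
    ni.get_metadata_asset_keys()
    # ["uk-ni/country_metadata", "uk-ni/data_publisher", "uk-ni/source_data_releases"]
    ni.get_geo_asset_keys()
    # ["uk-ni/geometry"]
    ni.get_metric_asset_keys()
    # ["uk-ni/metrics"]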
+ + class Country(ABC): dataset_node_partition: DynamicPartitionsDefinition @@ -83,6 +97,8 @@ def _source_data_releases( context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], data_publisher: DataPublisher, + # TODO: consider version without inputs so only output type specified + # **kwargs, ) -> dict[str, SourceDataRelease]: ... diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 60a9039..3440c9e 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -20,7 +20,7 @@ ) from icecream import ic -from popgetter.assets.common import Country +from popgetter.assets.common import Country, CountryAssetOuputs from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -41,9 +41,6 @@ class NIGeometryLevel: url: str -# Name for census tables partition -PARTITION_NAME = "uk-ni_dataset_nodes" - # Geometry levels to include NI_GEO_LEVELS = { "DZ21": NIGeometryLevel( @@ -231,11 +228,24 @@ def census_table_metadata( ) -class NorthernIreland(Country): - partition_name: str = PARTITION_NAME +class NorthernIreland(Country, CountryAssetOuputs): + key_prefix: str = "uk-ni" + partition_name: str = "uk-ni_dataset_nodes" geo_levels: list[str] = GEO_LEVELS required_tables: list[str] = REQUIRED_TABLES - dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes") + + def get_metadata_asset_keys(self) -> list[str]: + return [ + f"{self.key_prefix}/{el}" + for el in ["country_metadata", "data_publisher", "source_data_releases"] + ] + + def get_geo_asset_keys(self) -> list[str]: + return [f"{self.key_prefix}/{el}" for el in ["geometry"]] + + def get_metric_asset_keys(self) -> list[str]: + return [f"{self.key_prefix}/{el}" for el in ["metrics"]] def _country_metadata(self, _context) -> CountryMetadata: return CountryMetadata( diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index ef2ad42..d4d7cbc 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -1,16 +1,20 @@ from __future__ import annotations +import popgetter.assets as assets + from .sensor_class import CloudAssetSensor +METADATA_ASSETS = [ + "be/country_metadata", + "be/data_publisher", + "be/source_data_releases", + *assets.ni.ni.get_metadata_asset_keys(), +] +GEOMETRY_ASSETS = ["be/geometry", *assets.ni.ni.get_geo_asset_keys()] +METRIC_ASSETS = ["be/metrics", *assets.ni.ni.get_metric_asset_keys()] + metadata_factory = CloudAssetSensor( - asset_names_to_monitor=[ - "be/country_metadata", - "be/data_publisher", - "be/source_data_releases", - "uk-ni/country_metadata", - "uk-ni/data_publisher", - "uk-ni/source_data_releases", - ], + asset_names_to_monitor=METADATA_ASSETS, io_manager_key="metadata_io_manager", prefix="metadata", interval=20, @@ -20,10 +24,7 @@ metadata_asset = metadata_factory.create_publishing_asset() geometry_factory = CloudAssetSensor( - asset_names_to_monitor=[ - "be/geometry", - "uk-ni/geometry", - ], + asset_names_to_monitor=GEOMETRY_ASSETS, io_manager_key="geometry_io_manager", prefix="geometry", interval=60, @@ -33,10 +34,7 @@ geometry_asset = geometry_factory.create_publishing_asset() metrics_factory = CloudAssetSensor( - asset_names_to_monitor=[ - "be/metrics", - "uk-ni/metrics", - ], + asset_names_to_monitor=METRIC_ASSETS, io_manager_key="metrics_io_manager", prefix="metrics", interval=60, From 
852c8f82c29610779aee2137ca689a1caa65d861 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 23 May 2024 12:15:44 +0100 Subject: [PATCH 13/39] Add dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 00743ca..441b5d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ dependencies = [ "icecream >=2.1.3", # General debugging tool "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names "jcs >=0.2.1", # For generating IDs from class attributes + "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages ] From b489dfb0618d6f4148fa1ef5ea39ce6d1e75fd82 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 23 May 2024 17:49:48 +0100 Subject: [PATCH 14/39] Begin update to cover all census tables --- python/popgetter/assets/ni/__init__.py | 76 ++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 3440c9e..4b2d162 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations import io +import os from collections.abc import Callable from dataclasses import dataclass from datetime import date @@ -37,8 +38,13 @@ class NIGeometryLevel: level: str hxl_tag: str geo_id_column: str + census_table_column: str name_columns: dict[str, str] # keys = language codes, values = column names url: str + lookup_url: str | None + lookup_sheet: str | None + left_on: str | None + right_on: str | None # Geometry levels to include @@ -47,20 +53,42 @@ class NIGeometryLevel: level="DZ21", hxl_tag="TBD", geo_id_column="DZ2021_cd", + census_table_column="Census 2021 Data Zone Code", name_columns={"en": "DZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", + lookup_url=None, + lookup_sheet=None, + left_on=None, + right_on=None, ), "SDZ21": NIGeometryLevel( level="SDZ21", hxl_tag="TBD", geo_id_column="SDZ2021_cd", + census_table_column="Census 2021 Super Data Zone Code", name_columns={"en": "SDZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-sdz2021-esri-shapefile.zip", + lookup_url=None, + lookup_sheet=None, + left_on=None, + right_on=None, + ), + "LGD14": NIGeometryLevel( + level="LGD14", + hxl_tag="TBD", + geo_id_column="LGD2014_cd", + census_table_column="Local Government District 2014 Code", + name_columns={"en": "LGD2014_name"}, + url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", + lookup_url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-data-zone-and-super-data-zone-lookups.xlsx", + lookup_sheet="DZ2021_lookup", + left_on="DZ2021_cd", + right_on="DZ2021_code", ), } # Required tables -REQUIRED_TABLES = ["MS-A09"] +REQUIRED_TABLES = ["MS-A09"] if os.getenv("ENV") == "dev" else None # Full list of geographies, see metadata: # https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE @@ -232,7 +260,7 @@ class NorthernIreland(Country, CountryAssetOuputs): key_prefix: str = "uk-ni" partition_name: str = "uk-ni_dataset_nodes" geo_levels: list[str] = GEO_LEVELS - required_tables: list[str] = REQUIRED_TABLES + required_tables: list[str] | None = REQUIRED_TABLES dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes") def get_metadata_asset_keys(self) -> list[str]: @@ -321,7 
+349,10 @@ def add_resolution(s: str, geo_level: str) -> str:
             metadata = requests.get(node_items["metadata_url"]).json()
             table_id = metadata["dc:title"].split(":")[0]
             # Skip if not required
-            if table_id not in self.required_tables:
+            if (
+                self.required_tables is not None
+                and table_id not in self.required_tables
+            ):
                 continue
 
             catalog_summary["node"].append(node_url)
@@ -383,11 +414,22 @@ def _geometry(
             level=level_details.level,
             hxl_tag=level_details.hxl_tag,
         )
-        region_geometries_raw = (
-            gpd.read_file(level_details.url)
-            .dissolve(by=level_details.geo_id_column)
-            .reset_index()
-        )
+        region_geometries_raw: gpd.GeoDataFrame = gpd.read_file(level_details.url)
+        if level_details.lookup_url is not None:
+            lookup = pd.read_excel(
+                level_details.lookup_url, sheet_name=level_details.lookup_sheet
+            )
+            region_geometries_raw = region_geometries_raw.merge(
+                lookup,
+                left_on=level_details.left_on,
+                right_on=level_details.right_on,
+                how="outer",
+            )
+
+        region_geometries_raw = region_geometries_raw.dissolve(
+            by=level_details.geo_id_column
+        ).reset_index()
+
         context.log.debug(ic(region_geometries_raw.head()))
         region_geometries = region_geometries_raw.rename(
             columns={level_details.geo_id_column: "GEO_ID"}
@@ -410,7 +452,7 @@ def _geometry(
         first_metadata, first_gdf, first_names = geometries_to_return[0]
         first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID")
         ax = first_joined_gdf.plot(column="en", legend=False)
-        ax.set_title(f"NI 2023 {first_metadata.level}")
+        ax.set_title(f"NI 2021 {first_metadata.level}")
         md_plot = markdown_from_plot(plt)
         context.add_output_metadata(
             metadata={
@@ -458,7 +500,10 @@ def _source_metric_metadata(
         source_data_releases: dict[str, SourceDataRelease],
     ) -> MetricMetadata:
         partition_key = context.partition_key
-        if partition_key not in DERIVED_COLUMN_SPECIFICATIONS:
+        if (
+            self.required_tables is not None
+            and partition_key not in self.required_tables
+        ):
             skip_reason = (
                 f"Skipping as requested partition {partition_key} is not configured "
                 f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}"
             )
@@ -470,9 +515,18 @@ def _source_metric_metadata(
             orient="records"
         )[0]
 
+        geo_level = partition_key.split("/")[0]
+        source_table = SourceTable(
+            # TODO: how to do this programmatically
+            hxltag="TBD",
+            geo_level=geo_level,
+            geo_column=NI_GEO_LEVELS["geo_level"].geo_id_column,
+            source_column="Count",
+        )
+
         return census_table_metadata(
             catalog_row,
-            DERIVED_COLUMN_SPECIFICATIONS[partition_key][0],
+            source_table,
             source_data_releases,
         )

From 0d42d27057b8074d71ee88938296886afd42decf Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Fri, 24 May 2024 09:27:18 +0100
Subject: [PATCH 15/39] Add processing for pivoting arbitrary census tables

---
 python/popgetter/assets/ni/__init__.py | 122 +++++++++++++++++++------
 1 file changed, 94 insertions(+), 28 deletions(-)

diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py
index 4b2d162..75ddb27 100644
--- a/python/popgetter/assets/ni/__init__.py
+++ b/python/popgetter/assets/ni/__init__.py
@@ -520,7 +520,7 @@ def _source_metric_metadata(
             # TODO: how to do this programmatically
             hxltag="TBD",
             geo_level=geo_level,
-            geo_column=NI_GEO_LEVELS["geo_level"].geo_id_column,
+            geo_column=NI_GEO_LEVELS[geo_level].geo_id_column,
             source_column="Count",
         )
 
@@ -536,47 +536,113 @@ def _derived_metrics(
         census_tables: pd.DataFrame,
         source_metric_metadata: MetricMetadata,
     ) -> tuple[list[MetricMetadata], pd.DataFrame]:
+        SEP = "_"
         partition_key = context.partition_key
+        geo_level = 
partition_key.split("/")[0] source_table = census_tables source_mmd = source_metric_metadata source_column = source_mmd.parquet_column_name assert source_column in source_table.columns assert len(source_table) > 0 + geo_id = NI_GEO_LEVELS[geo_level].census_table_column + source_table = source_table.rename(columns={geo_id: "GEO_ID"}).drop( + columns=geo_id.replace("Code", "Label") + ) + + parquet_file_name = ( + "".join(c for c in partition_key if c.isalnum()) + ".parquet" + ) + derived_metrics: list[pd.DataFrame] = [] + derived_mmd: list[MetricMetadata] = [] + try: + # TODO: check whether to drop unused source_table_metadata source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[ partition_key ] - except KeyError as err: + for metric_spec in metric_specs: + new_table = ( + source_table.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + except KeyError: skip_reason = f"Skipping as no derived columns are to be created for node {partition_key}" context.log.warning(skip_reason) - raise RuntimeError(skip_reason) from err - - source_table = source_table.rename( - columns={source_table_metadata.geo_column: "GEO_ID"} - ) - derived_metrics: list[pd.DataFrame] = [] - derived_mmd: list[MetricMetadata] = [] - parquet_file_name = ( - "".join(c for c in partition_key if c.isalnum()) + ".parquet" - ) - for metric_spec in metric_specs: - new_table = ( - source_table.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) - .sum() - .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) + # raise RuntimeError(skip_reason) from err + + # Get all other metrics from table as is pivoted + def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: + # Variables are either code or label, only keep the case for given 'end' + cols = ( + [col for col in df.columns if col.endswith(end)] + + ["GEO_ID"] + + ["Count"] + ) + pivot_cols = [col for col in cols if col not in ["GEO_ID", "Count"]] + ic(cols) + ic(pivot_cols) + ic(df.columns) + ic(df.head()) + pivot = df[cols].pivot_table( + index="GEO_ID", + columns=pivot_cols, + values="Count", ) - derived_metrics.append(new_table) - - new_mmd = source_mmd.copy() - new_mmd.parent_metric_id = source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - new_mmd.hxl_tag = metric_spec.hxltag - new_mmd.parquet_column_name = metric_spec.output_column_name - new_mmd.human_readable_name = metric_spec.human_readable_name - derived_mmd.append(new_mmd) + + # FLattent multi-index + if isinstance(pivot.columns, pd.MultiIndex): + pivot.columns = [ + SEP.join(list(map(str, col))).strip() + for col in pivot.columns.to_numpy() + ] + # Ensure columns are string + else: + pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] + out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] + return out_cols, pivot + + # Pivot for codes and labels + for var_type in ["Code", "Label"]: + out_cols, new_table = 
pivot_df(source_table, var_type) + ic(new_table) + for metric_col in new_table.columns: + metric_df = new_table.loc[:, metric_col].to_frame() + ic(metric_df) + derived_metrics.append(metric_df) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + def gen_hxltag(kv: dict[str, str]) -> str: + out = ["#population"] + for key, value in kv.items(): + out += ["".join(c for c in key if c.isalnum())] + out += ["_"] + out += ["".join(c for c in value if c.isalnum())] + return "+".join(out) + + new_mmd.hxl_tag = gen_hxltag(key_val) + new_mmd.parquet_column_name = metric_col + new_mmd.human_readable_name = "; ".join( + [ + f"Variable: '{key}'; Value: '{value}'" + for key, value in key_val.items() + ] + ) + derived_mmd.append(new_mmd) joined_metrics = reduce( lambda left, right: left.merge( From ad6440622de240cfaca871d63fa113f2ce4ea015 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 09:41:28 +0100 Subject: [PATCH 16/39] Add new metrics asset across all partitions --- python/popgetter/assets/common.py | 20 ++++ python/popgetter/assets/ni/__init__.py | 154 ++++++++++++++++++++----- 2 files changed, 146 insertions(+), 28 deletions(-) diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py index daa8ae0..cadb5f5 100644 --- a/python/popgetter/assets/common.py +++ b/python/popgetter/assets/common.py @@ -148,3 +148,23 @@ def _derived_metrics( source_metric_metadata: MetricMetadata, ) -> tuple[list[MetricMetadata], pd.DataFrame]: ... + + # def create_reshaped_metrics(self): + # @asset(partitions_def=self.dataset_node_partition) + # def reshaped_metrics( + # context, + # census_tables: pd.DataFrame, + # source_metric_metadata: MetricMetadata, + # ) -> tuple[list[MetricMetadata], pd.DataFrame]: + # return self._reshaped_metrics(context, census_tables, source_metric_metadata) + + # return reshaped_metrics + + # @abstractmethod + # def _reshaped_metrics( + # self, + # context, + # census_tables: pd.DataFrame, + # source_metric_metadata: MetricMetadata, + # ) -> tuple[list[MetricMetadata], pd.DataFrame]: + # ... 
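A toy illustration of the pivot_table reshaping introduced in the previous patch and earmarked above for a possible standalone reshaping asset; the data values are invented, and aggfunc="sum" is assumed here because it keeps integer counts (the pandas default "mean" would cast them to float):

    import pandas as pd

    df = pd.DataFrame(
        {
            "GEO_ID": ["N001", "N001", "N002", "N002"],
            "Age Code": [0, 1, 0, 1],
            "Count": [10, 12, 7, 9],
        }
    )
    # one metric column per distinct value of the pivoted variable column
    pivot = df.pivot_table(
        index="GEO_ID", columns=["Age Code"], values="Count", aggfunc="sum"
    )
    # pivot.columns -> [0, 1]; pivot.loc["N001", 1] == 12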
diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 75ddb27..c45ad35 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -13,14 +13,14 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetIn, + AssetDep, DynamicPartitionsDefinition, MetadataValue, - SpecificPartitionsPartitionMapping, asset, ) from icecream import ic +import popgetter from popgetter.assets.common import Country, CountryAssetOuputs from popgetter.metadata import ( CountryMetadata, @@ -580,7 +580,6 @@ def _derived_metrics( except KeyError: skip_reason = f"Skipping as no derived columns are to be created for node {partition_key}" context.log.warning(skip_reason) - # raise RuntimeError(skip_reason) from err # Get all other metrics from table as is pivoted def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: @@ -662,9 +661,117 @@ def gen_hxltag(kv: dict[str, str]) -> str: ), }, ) - return derived_mmd, joined_metrics + # TODO: consider splitting the reshaping of census table data into a separate asset + # def _reshaped_metrics( + # self, + # context, + # census_tables: pd.DataFrame, + # source_metric_metadata: MetricMetadata, + # ) -> tuple[list[MetricMetadata], pd.DataFrame]: + # SEP = "_" + # partition_key = context.partition_key + # geo_level = partition_key.split("/")[0] + # source_table = census_tables + # source_mmd = source_metric_metadata + # source_column = source_mmd.parquet_column_name + # assert source_column in source_table.columns + # assert len(source_table) > 0 + + # geo_id = NI_GEO_LEVELS[geo_level].census_table_column + # source_table = source_table.rename(columns={geo_id: "GEO_ID"}).drop( + # columns=geo_id.replace("Code", "Label") + # ) + + # parquet_file_name = ( + # "".join(c for c in partition_key if c.isalnum()) + ".parquet" + # ) + # derived_metrics: list[pd.DataFrame] = [] + # derived_mmd: list[MetricMetadata] = [] + + # # Get all other metrics from table as is pivoted + # def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: + # # Variables are either code or label, only keep the case for given 'end' + # cols = ( + # [col for col in df.columns if col.endswith(end)] + # + ["GEO_ID"] + # + ["Count"] + # ) + # pivot_cols = [col for col in cols if col not in ["GEO_ID", "Count"]] + # ic(cols) + # ic(pivot_cols) + # ic(df.columns) + # ic(df.head()) + # pivot = df[cols].pivot_table( + # index="GEO_ID", + # columns=pivot_cols, + # values="Count", + # ) + + # # FLattent multi-index + # if isinstance(pivot.columns, pd.MultiIndex): + # pivot.columns = [ + # SEP.join(list(map(str, col))).strip() + # for col in pivot.columns.to_numpy() + # ] + # # Ensure columns are string + # else: + # pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] + # out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] + # return out_cols, pivot + + # # Pivot for codes and labels + # for var_type in ["Code", "Label"]: + # out_cols, new_table = pivot_df(source_table, var_type) + # ic(new_table) + # for metric_col in new_table.columns: + # metric_df = new_table.loc[:, metric_col].to_frame() + # ic(metric_df) + # derived_metrics.append(metric_df) + # new_mmd = source_mmd.copy() + # new_mmd.parent_metric_id = source_mmd.source_metric_id + # new_mmd.metric_parquet_path = parquet_file_name + # key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + # def gen_hxltag(kv: dict[str, str]) -> str: + # out = ["#population"] + # for key, value 
in kv.items(): + # out += ["".join(c for c in key if c.isalnum())] + # out += ["_"] + # out += ["".join(c for c in value if c.isalnum())] + # return "+".join(out) + + # new_mmd.hxl_tag = gen_hxltag(key_val) + # new_mmd.parquet_column_name = metric_col + # new_mmd.human_readable_name = "; ".join( + # [ + # f"Variable: '{key}'; Value: '{value}'" + # for key, value in key_val.items() + # ] + # ) + # derived_mmd.append(new_mmd) + + # joined_metrics = reduce( + # lambda left, right: left.merge( + # right, on="GEO_ID", how="inner", validate="one_to_one" + # ), + # derived_metrics, + # ) + + # context.add_output_metadata( + # metadata={ + # "metadata_preview": MetadataValue.md( + # metadata_to_dataframe(derived_mmd).head().to_markdown() + # ), + # "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + # "metrics_preview": MetadataValue.md( + # joined_metrics.head().to_markdown() + # ), + # }, + # ) + # return derived_mmd, joined_metrics + # Assets ni = NorthernIreland() @@ -675,40 +782,31 @@ def gen_hxltag(kv: dict[str, str]) -> str: catalog = ni.create_catalog() census_tables = ni.create_census_tables() source_metric_metadata = ni.create_source_metric_metadata() +# reshaped_metrics = ni.create_reshaped_metrics() derived_metrics = ni.create_derived_metrics() -@asset( - ins={ - "derived_metrics": AssetIn( - partition_mapping=SpecificPartitionsPartitionMapping( - list(DERIVED_COLUMN_SPECIFICATIONS.keys()) - ), - ), - }, -) +# Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: +# See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi +@asset(deps=[AssetDep("derived_metrics")]) def metrics( - # Note dagster does not seem to allow a union type for `derived_metrics` for - # the cases of one or many partitions context, - derived_metrics, + catalog: pd.DataFrame, ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: """ This asset exists solely to aggregate all the derived tables into one single unpartitioned asset, which the downstream publishing tasks can use. - - Right now it is a bit boring because it only relies on one partition, but - it could be extended when we have more data products. 
""" - if len(DERIVED_COLUMN_SPECIFICATIONS) == 1: - # Make into same type for the case of multiple partitions - derived_metrics_dict: dict[str, tuple[list[MetricMetadata], pd.DataFrame]] = { - next(iter(DERIVED_COLUMN_SPECIFICATIONS.keys())): derived_metrics - } - else: - derived_metrics_dict: dict[ - str, tuple[list[MetricMetadata], pd.DataFrame] - ] = derived_metrics + # Get derived_metrics asset for partitions that were successful + derived_metrics_dict = {} + for partition_key in catalog["partition_key"].to_list(): + try: + derived_metrics_partition = popgetter.defs.load_asset_value( + ["uk-ni", "derived_metrics"], partition_key=partition_key + ) + derived_metrics_dict[partition_key] = derived_metrics_partition + except FileNotFoundError as err: + context.log.debug(ic(f"Failed partition key {partition_key}: {err}")) # Combine outputs across partitions outputs = [ From 1ef97ea096f7e40b3d43a0a7a55854ac5306a81e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 09:58:51 +0100 Subject: [PATCH 17/39] Change aggfunc to sum to prevent cast as float --- python/popgetter/assets/ni/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index c45ad35..e3e9029 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -595,9 +595,7 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: ic(df.columns) ic(df.head()) pivot = df[cols].pivot_table( - index="GEO_ID", - columns=pivot_cols, - values="Count", + index="GEO_ID", columns=pivot_cols, values="Count", aggfunc="sum" ) # FLattent multi-index From 3661f68fc7478d090c3537a4abd5abc332071413 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 10:00:08 +0100 Subject: [PATCH 18/39] Fix hxltag construction --- python/popgetter/assets/ni/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index e3e9029..713d7e1 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -626,9 +626,11 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: def gen_hxltag(kv: dict[str, str]) -> str: out = ["#population"] for key, value in kv.items(): - out += ["".join(c for c in key if c.isalnum())] - out += ["_"] - out += ["".join(c for c in value if c.isalnum())] + out += [ + "".join(c for c in key if c.isalnum()) + + "_" + + "".join(c for c in value if c.isalnum()) + ] return "+".join(out) new_mmd.hxl_tag = gen_hxltag(key_val) From f069cea118e927b9411c8ed31aad51d496f18004 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 17:18:59 +0100 Subject: [PATCH 19/39] Move metrics into abc --- python/popgetter/__init__.py | 4 +- python/popgetter/assets/common.py | 46 ++++++++++++---- python/popgetter/assets/ni/__init__.py | 72 ++++++++++++-------------- 3 files changed, 69 insertions(+), 53 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 2da6302..2300abd 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -54,9 +54,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module( - assets.ni, group_name="ni", key_prefix=assets.ni.ni.key_prefix - ), + 
*load_assets_from_package_module(assets.ni, group_name="ni"), *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"), *( load_assets_from_modules([azure_test], group_name="azure_test") diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py index 5233825..a99f8e5 100644 --- a/python/popgetter/assets/common.py +++ b/python/popgetter/assets/common.py @@ -4,9 +4,13 @@ import geopandas as gpd import pandas as pd -from dagster import DynamicPartitionsDefinition, asset +from dagster import AssetDep, DynamicPartitionsDefinition, asset -from popgetter.cloud_outputs import send_to_geometry_sensor, send_to_metadata_sensor +from popgetter.cloud_outputs import ( + send_to_geometry_sensor, + send_to_metadata_sensor, + send_to_metrics_sensor, +) from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -17,10 +21,11 @@ class Country(ABC): + key_prefix: str dataset_node_partition: DynamicPartitionsDefinition def create_catalog(self): - @asset() + @asset(key_prefix=self.key_prefix) def catalog(context): return self._catalog(context) @@ -32,7 +37,7 @@ def _catalog(self, context) -> pd.DataFrame: def create_country_metadata(self): @send_to_metadata_sensor - @asset() + @asset(key_prefix=self.key_prefix) def country_metadata(context): return self._country_metadata(context) @@ -44,7 +49,7 @@ def _country_metadata(self, context) -> CountryMetadata: def create_data_publisher(self): @send_to_metadata_sensor - @asset + @asset(key_prefix=self.key_prefix) def data_publisher(context, country_metadata: CountryMetadata): return self._data_publisher(context, country_metadata) @@ -58,7 +63,7 @@ def _data_publisher( def create_geometry(self): @send_to_geometry_sensor - @asset() + @asset(key_prefix=self.key_prefix) def geometry(context): return self._geometry(context) @@ -72,7 +77,7 @@ def _geometry( def create_source_data_releases(self): @send_to_metadata_sensor - @asset() + @asset(key_prefix=self.key_prefix) def source_data_releases( context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], @@ -94,7 +99,7 @@ def _source_data_releases( ... def create_census_tables(self): - @asset(partitions_def=self.dataset_node_partition) + @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix) def census_tables(context, catalog): return self._census_tables(context, catalog) @@ -105,7 +110,7 @@ def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame: ... def create_source_metric_metadata(self): - @asset(partitions_def=self.dataset_node_partition) + @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix) def source_metric_metadata(context, catalog, source_data_releases): return self._source_metric_metadata(context, catalog, source_data_releases) @@ -121,7 +126,7 @@ def _source_metric_metadata( ... def create_derived_metrics(self): - @asset(partitions_def=self.dataset_node_partition) + @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix) def derived_metrics( context, census_tables: pd.DataFrame, @@ -159,3 +164,24 @@ def _derived_metrics( # source_metric_metadata: MetricMetadata, # ) -> tuple[list[MetricMetadata], pd.DataFrame]: # ... 
+ + def create_metrics(self): + @send_to_metrics_sensor + # Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: + # See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi + @asset(deps=[AssetDep("derived_metrics")], key_prefix=self.key_prefix) + def metrics( + context, + catalog: pd.DataFrame, + ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + return self._metrics(context, catalog) + + return metrics + + @abstractmethod + def _metrics( + self, + context, + catalog: pd.DataFrame, + ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + ... diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index f8e4cb2..4b80d4f 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -13,16 +13,13 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetDep, DynamicPartitionsDefinition, MetadataValue, - asset, ) from icecream import ic import popgetter from popgetter.assets.common import Country -from popgetter.cloud_outputs import send_to_metrics_sensor from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -761,6 +758,37 @@ def gen_hxltag(kv: dict[str, str]) -> str: # ) # return derived_mmd, joined_metrics + def _metrics( + self, context, catalog: pd.DataFrame + ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + """ + This asset exists solely to aggregate all the derived tables into one + single unpartitioned asset, which the downstream publishing tasks can use. + """ + # Get derived_metrics asset for partitions that were successful + derived_metrics_dict = {} + for partition_key in catalog["partition_key"].to_list(): + try: + derived_metrics_partition = popgetter.defs.load_asset_value( + ["uk-ni", "derived_metrics"], partition_key=partition_key + ) + derived_metrics_dict[partition_key] = derived_metrics_partition + except FileNotFoundError as err: + context.log.debug(ic(f"Failed partition key {partition_key}: {err}")) + + # Combine outputs across partitions + outputs = [ + (mmds[0].metric_parquet_path, mmds, table) + for (mmds, table) in derived_metrics_dict.values() + ] + context.add_output_metadata( + metadata={ + "num_metrics": sum(len(output[1]) for output in outputs), + "num_parquets": len(outputs), + }, + ) + return outputs + # Assets ni = NorthernIreland() @@ -773,40 +801,4 @@ def gen_hxltag(kv: dict[str, str]) -> str: source_metric_metadata = ni.create_source_metric_metadata() # reshaped_metrics = ni.create_reshaped_metrics() derived_metrics = ni.create_derived_metrics() - - -@send_to_metrics_sensor -# Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: -# See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi -@asset(deps=[AssetDep("derived_metrics")]) -def metrics( - context, - catalog: pd.DataFrame, -) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: - """ - This asset exists solely to aggregate all the derived tables into one - single unpartitioned asset, which the downstream publishing tasks can use. 
- """ - # Get derived_metrics asset for partitions that were successful - derived_metrics_dict = {} - for partition_key in catalog["partition_key"].to_list(): - try: - derived_metrics_partition = popgetter.defs.load_asset_value( - ["uk-ni", "derived_metrics"], partition_key=partition_key - ) - derived_metrics_dict[partition_key] = derived_metrics_partition - except FileNotFoundError as err: - context.log.debug(ic(f"Failed partition key {partition_key}: {err}")) - - # Combine outputs across partitions - outputs = [ - (mmds[0].metric_parquet_path, mmds, table) - for (mmds, table) in derived_metrics_dict.values() - ] - context.add_output_metadata( - metadata={ - "num_metrics": sum(len(output[1]) for output in outputs), - "num_parquets": len(outputs), - }, - ) - return outputs +metrics = ni.create_metrics() From 01836c045a42bc3ad3534f7340c7dd5d1f7838ac Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 29 May 2024 10:07:40 +0100 Subject: [PATCH 20/39] Refactor to remove obsolete geo levels dict --- notebooks/explore.ipynb | 2 +- python/popgetter/assets/ni/__init__.py | 27 ++++++++++++-------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/notebooks/explore.ipynb b/notebooks/explore.ipynb index 44c18f0..8d9a91c 100644 --- a/notebooks/explore.ipynb +++ b/notebooks/explore.ipynb @@ -66,7 +66,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 4b80d4f..297a398 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -6,6 +6,7 @@ from dataclasses import dataclass from datetime import date from functools import reduce +from typing import ClassVar import geopandas as gpd import matplotlib.pyplot as plt @@ -46,6 +47,15 @@ class NIGeometryLevel: # Geometry levels to include +# Full list of geographies, see metadata: +# https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE +# - "LGD14", # Local Government District 2014 +# - "URBAN_STATUS", # Urban Status +# - "HEALTH_TRUST", # Health and Social Care Trust +# - "PARLCON24", # Parliamentary Constituency 2024 +# - "DEA14", # District Electoral Area 2014 +# - "SDZ21", # Census 2021 Super Data Zone +# - "DZ21", # Census 2021 Data Zone NI_GEO_LEVELS = { "DZ21": NIGeometryLevel( level="DZ21", @@ -88,24 +98,12 @@ class NIGeometryLevel: # Required tables REQUIRED_TABLES = ["MS-A09"] if os.getenv("ENV") == "dev" else None -# Full list of geographies, see metadata: -# https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE -GEO_LEVELS = [ - "LGD14", # Local Government District 2014 - # "URBAN_STATUS", # Urban Status - # "HEALTH_TRUST", # Health and Social Care Trust - # "PARLCON24", # Parliamentary Constituency 2024 - # "DEA14", # District Electoral Area 2014 - "SDZ21", # Census 2021 Super Data Zone - "DZ21", # Census 2021 Data Zone -] - - # 2021 census collection date CENSUS_COLLECTION_DATE = date(2021, 3, 21) def get_nodes_and_links() -> dict[str, dict[str, str]]: + """Extracts the URLs for census tables and metadata for ready-made tables.""" SCHEME_AND_HOST = "https://build.nisra.gov.uk" urls = [ "".join([SCHEME_AND_HOST, url.get("href")]) @@ -244,7 +242,6 @@ def census_table_metadata( potential_denominator_ids=None, parquet_margin_of_error_file=None, parquet_margin_of_error_column=None, - # parquet_column_name=catalog_row["source_column"], parquet_column_name=source_table.source_column, # TODO - 
this is a placeholder metric_parquet_path="unknown_at_this_stage", @@ -257,7 +254,7 @@ def census_table_metadata( class NorthernIreland(Country): key_prefix: str = "uk-ni" partition_name: str = "uk-ni_dataset_nodes" - geo_levels: list[str] = GEO_LEVELS + geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys()) required_tables: list[str] | None = REQUIRED_TABLES dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes") From 6c33d266d29920729f77348e9638736571708897 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 29 May 2024 10:20:56 +0100 Subject: [PATCH 21/39] Remove unused reshape metrics method --- python/popgetter/assets/common.py | 20 ----- python/popgetter/assets/ni/__init__.py | 110 ------------------------- 2 files changed, 130 deletions(-) diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py index a99f8e5..b5983a2 100644 --- a/python/popgetter/assets/common.py +++ b/python/popgetter/assets/common.py @@ -145,26 +145,6 @@ def _derived_metrics( ) -> tuple[list[MetricMetadata], pd.DataFrame]: ... - # def create_reshaped_metrics(self): - # @asset(partitions_def=self.dataset_node_partition) - # def reshaped_metrics( - # context, - # census_tables: pd.DataFrame, - # source_metric_metadata: MetricMetadata, - # ) -> tuple[list[MetricMetadata], pd.DataFrame]: - # return self._reshaped_metrics(context, census_tables, source_metric_metadata) - - # return reshaped_metrics - - # @abstractmethod - # def _reshaped_metrics( - # self, - # context, - # census_tables: pd.DataFrame, - # source_metric_metadata: MetricMetadata, - # ) -> tuple[list[MetricMetadata], pd.DataFrame]: - # ... - def create_metrics(self): @send_to_metrics_sensor # Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 297a398..73a263a 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -646,115 +646,6 @@ def gen_hxltag(kv: dict[str, str]) -> str: ) return derived_mmd, joined_metrics - # TODO: consider splitting the reshaping of census table data into a separate asset - # def _reshaped_metrics( - # self, - # context, - # census_tables: pd.DataFrame, - # source_metric_metadata: MetricMetadata, - # ) -> tuple[list[MetricMetadata], pd.DataFrame]: - # SEP = "_" - # partition_key = context.partition_key - # geo_level = partition_key.split("/")[0] - # source_table = census_tables - # source_mmd = source_metric_metadata - # source_column = source_mmd.parquet_column_name - # assert source_column in source_table.columns - # assert len(source_table) > 0 - - # geo_id = NI_GEO_LEVELS[geo_level].census_table_column - # source_table = source_table.rename(columns={geo_id: "GEO_ID"}).drop( - # columns=geo_id.replace("Code", "Label") - # ) - - # parquet_file_name = ( - # "".join(c for c in partition_key if c.isalnum()) + ".parquet" - # ) - # derived_metrics: list[pd.DataFrame] = [] - # derived_mmd: list[MetricMetadata] = [] - - # # Get all other metrics from table as is pivoted - # def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: - # # Variables are either code or label, only keep the case for given 'end' - # cols = ( - # [col for col in df.columns if col.endswith(end)] - # + ["GEO_ID"] - # + ["Count"] - # ) - # pivot_cols = [col for col in cols if col not in ["GEO_ID", "Count"]] - # ic(cols) - # ic(pivot_cols) - # ic(df.columns) - # ic(df.head()) - # pivot = 
df[cols].pivot_table( - # index="GEO_ID", - # columns=pivot_cols, - # values="Count", - # ) - - # # FLattent multi-index - # if isinstance(pivot.columns, pd.MultiIndex): - # pivot.columns = [ - # SEP.join(list(map(str, col))).strip() - # for col in pivot.columns.to_numpy() - # ] - # # Ensure columns are string - # else: - # pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] - # out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] - # return out_cols, pivot - - # # Pivot for codes and labels - # for var_type in ["Code", "Label"]: - # out_cols, new_table = pivot_df(source_table, var_type) - # ic(new_table) - # for metric_col in new_table.columns: - # metric_df = new_table.loc[:, metric_col].to_frame() - # ic(metric_df) - # derived_metrics.append(metric_df) - # new_mmd = source_mmd.copy() - # new_mmd.parent_metric_id = source_mmd.source_metric_id - # new_mmd.metric_parquet_path = parquet_file_name - # key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) - - # def gen_hxltag(kv: dict[str, str]) -> str: - # out = ["#population"] - # for key, value in kv.items(): - # out += ["".join(c for c in key if c.isalnum())] - # out += ["_"] - # out += ["".join(c for c in value if c.isalnum())] - # return "+".join(out) - - # new_mmd.hxl_tag = gen_hxltag(key_val) - # new_mmd.parquet_column_name = metric_col - # new_mmd.human_readable_name = "; ".join( - # [ - # f"Variable: '{key}'; Value: '{value}'" - # for key, value in key_val.items() - # ] - # ) - # derived_mmd.append(new_mmd) - - # joined_metrics = reduce( - # lambda left, right: left.merge( - # right, on="GEO_ID", how="inner", validate="one_to_one" - # ), - # derived_metrics, - # ) - - # context.add_output_metadata( - # metadata={ - # "metadata_preview": MetadataValue.md( - # metadata_to_dataframe(derived_mmd).head().to_markdown() - # ), - # "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", - # "metrics_preview": MetadataValue.md( - # joined_metrics.head().to_markdown() - # ), - # }, - # ) - # return derived_mmd, joined_metrics - def _metrics( self, context, catalog: pd.DataFrame ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: @@ -796,6 +687,5 @@ def _metrics( catalog = ni.create_catalog() census_tables = ni.create_census_tables() source_metric_metadata = ni.create_source_metric_metadata() -# reshaped_metrics = ni.create_reshaped_metrics() derived_metrics = ni.create_derived_metrics() metrics = ni.create_metrics() From 26bf2be4a7c682d784746084955c0852a85cb7c3 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 29 May 2024 10:24:04 +0100 Subject: [PATCH 22/39] Remove source table from metric specification --- python/popgetter/assets/ni/__init__.py | 27 ++++---------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 73a263a..abcbab9 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -204,25 +204,9 @@ class SourceTable: ), ] -DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { - "DZ21/MS-A09": ( - SourceTable( - hxltag="#population+dz21+2021", - geo_level="DZ21", - geo_column="Census 2021 Data Zone Code", - source_column="Count", - ), - DERIVED_COLUMNS, - ), - "SDZ21/MS-A09": ( - SourceTable( - hxltag="#population+sdz21+2021", - geo_level="SDZ21", - geo_column="Census 2021 Super Data Zone Code", - source_column="Count", - ), - DERIVED_COLUMNS, - ), 
+DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = {
+    "DZ21/MS-A09": DERIVED_COLUMNS,
+    "SDZ21/MS-A09": DERIVED_COLUMNS,
 }
@@ -540,10 +524,7 @@ def _derived_metrics(
         derived_mmd: list[MetricMetadata] = []

         try:
-            # TODO: check whether to drop unused source_table_metadata
-            source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[
-                partition_key
-            ]
+            metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key]
             for metric_spec in metric_specs:
                 new_table = (
                     source_table.pipe(metric_spec.filter_func)

From 1428420f7d80cc02f703faca6bc05e489f7f1054 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 29 May 2024 12:01:31 +0100
Subject: [PATCH 23/39] Update README

---
 python/popgetter/assets/ni/README.md | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/python/popgetter/assets/ni/README.md b/python/popgetter/assets/ni/README.md
index 2faa303..b1d3ea9 100644
--- a/python/popgetter/assets/ni/README.md
+++ b/python/popgetter/assets/ni/README.md
@@ -1,17 +1,22 @@
-# Northern Ireland
+# Northern Ireland

 ## Summary

 Census 2021 is available from
 [https://build.nisra.gov.uk](https://build.nisra.gov.uk/en/).

-The processing pipeline involves the following steps:
+The processing pipeline involves the following steps, achieved by implementing
+the [`Country`](../country.py) base class:

-- Gets the corresponding geography files and outputs as standard geometry
-  formats
-- Generates metadata associated with Northern Ireland and data releases
-- Generates a catalog by identifying all tables
-  [available](https://build.nisra.gov.uk/en/standard).
-- Read table metadata and census tables, across different geography levels
-  (currently only Data Zone 2021 and Super Data Zone 2021)
-- Construct a set of pre-defined derived metrics
+- Retrieve the geography data and output it in standard geometry formats
+  (`geometry` asset)
+- Generate metadata associated with country, data publisher and source data
+  releases (`country_metadata`, `data_publisher` and `source_data_releases`
+  assets)
+- Generate a catalog by identifying all tables
+  [available](https://build.nisra.gov.uk/en/standard) (`catalog` asset)
+- Read table metadata and census tables, across different geography levels,
+  currently for Data Zone 2021, Super Data Zone 2021 and Local Government
+  District 2014 (`census_tables` and `source_metric_metadata` assets)
+- Process census tables into metrics per geography ID and any other pre-defined
+  derived metrics (`metrics` asset)

From ba5d00126cc1c2458dd07273c1329eccde928c88 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 29 May 2024 12:06:53 +0100
Subject: [PATCH 24/39] Rename module, add doc strings for country class

---
 .../assets/{common.py => country.py}   | 58 +++++++++++++++++--
 python/popgetter/assets/ni/__init__.py |  4 +-
 2 files changed, 56 insertions(+), 6 deletions(-)
 rename python/popgetter/assets/{common.py => country.py} (72%)

diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/country.py
similarity index 72%
rename from python/popgetter/assets/common.py
rename to python/popgetter/assets/country.py
index b5983a2..2dc3fec 100644
--- a/python/popgetter/assets/common.py
+++ b/python/popgetter/assets/country.py
@@ -21,12 +21,25 @@ class Country(ABC):
+    """
+    A general class providing asset factories and abstract methods that serve
+    as a template for implementing a given country.
+
+    Attributes:
+        key_prefix (str): the prefix for the asset keys (e.g. "be" for Belgium)
+        dataset_node_partition (DynamicPartitionsDefinition): a dynamic partitions
+            definition populated at runtime with a partition per census table.
+
+    """
+
     key_prefix: str
     dataset_node_partition: DynamicPartitionsDefinition

     def create_catalog(self):
+        """Creates an asset providing a census metadata catalog."""
+
         @asset(key_prefix=self.key_prefix)
-        def catalog(context):
+        def catalog(context) -> pd.DataFrame:
             return self._catalog(context)

         return catalog
@@ -36,6 +49,8 @@ def _catalog(self, context) -> pd.DataFrame:
         ...

     def create_country_metadata(self):
+        """Creates an asset providing the country metadata."""
+
         @send_to_metadata_sensor
         @asset(key_prefix=self.key_prefix)
         def country_metadata(context):
@@ -48,6 +63,8 @@ def _country_metadata(self, context) -> CountryMetadata:
         ...

     def create_data_publisher(self):
+        """Creates an asset providing the data publisher metadata."""
+
         @send_to_metadata_sensor
         @asset(key_prefix=self.key_prefix)
         def data_publisher(context, country_metadata: CountryMetadata):
@@ -62,6 +79,11 @@ def _data_publisher(
         ...

     def create_geometry(self):
+        """
+        Creates an asset providing a list of geometries, metadata and names
+        at different resolutions.
+        """
+
         @send_to_geometry_sensor
         @asset(key_prefix=self.key_prefix)
         def geometry(context):
             return self._geometry(context)
@@ -76,13 +98,18 @@ def _geometry(
         ...

     def create_source_data_releases(self):
+        """
+        Creates an asset providing the corresponding source data release metadata for
+        each geometry.
+        """
+
         @send_to_metadata_sensor
         @asset(key_prefix=self.key_prefix)
         def source_data_releases(
             context,
             geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
             data_publisher: DataPublisher,
-        ):
+        ) -> dict[str, SourceDataRelease]:
             return self._source_data_releases(context, geometry, data_publisher)

         return source_data_releases
@@ -99,8 +126,13 @@ def _source_data_releases(
         ...

     def create_census_tables(self):
+        """
+        Creates an asset providing each census table as a dataframe for each
+        partition.
+        """
+
         @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix)
-        def census_tables(context, catalog):
+        def census_tables(context, catalog: pd.DataFrame) -> pd.DataFrame:
             return self._census_tables(context, catalog)

         return census_tables
@@ -110,8 +142,15 @@ def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame:
         ...

     def create_source_metric_metadata(self):
+        """
+        Creates an asset providing the metadata required for downstream metric
+        derivation.
+        """
+
         @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix)
-        def source_metric_metadata(context, catalog, source_data_releases):
+        def source_metric_metadata(
+            context, catalog, source_data_releases: dict[str, SourceDataRelease]
+        ) -> MetricMetadata:
             return self._source_metric_metadata(context, catalog, source_data_releases)

         return source_metric_metadata
@@ -126,6 +165,11 @@ def _source_metric_metadata(
         ...

     def create_derived_metrics(self):
+        """
+        Creates an asset providing the metrics derived from the census tables and the
+        corresponding source metric metadata.
+        """
+
         @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix)
         def derived_metrics(
             context,
             census_tables: pd.DataFrame,
@@ -146,6 +190,12 @@ def _derived_metrics(
         ...

     def create_metrics(self):
+        """
+        Creates an asset combining all partitions across census tables into a
+        single list of tuples of metric parquet file name (for output), list of
+        metric metadata and dataframe of metrics.
+ """ + @send_to_metrics_sensor # Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: # See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index abcbab9..9a15c6d 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -20,7 +20,7 @@ from icecream import ic import popgetter -from popgetter.assets.common import Country +from popgetter.assets.country import Country from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -639,7 +639,7 @@ def _metrics( for partition_key in catalog["partition_key"].to_list(): try: derived_metrics_partition = popgetter.defs.load_asset_value( - ["uk-ni", "derived_metrics"], partition_key=partition_key + [ni.key_prefix, "derived_metrics"], partition_key=partition_key ) derived_metrics_dict[partition_key] = derived_metrics_partition except FileNotFoundError as err: From 49bb7591a4512c757d2f71714e4267f03bb78104 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:38:25 +0100 Subject: [PATCH 25/39] Add openpyxl dep for reading Excel files Co-authored-by: Penelope Yong --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 441b5d7..ba11cd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages + "openpyxl >=3.1.3", # For reading Excel files ] From 2462c09b3253dc7253afafe3f55802abcd050137 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:43:26 +0100 Subject: [PATCH 26/39] Fix job description --- python/popgetter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 2300abd..9d544a5 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -85,7 +85,7 @@ job_ni: UnresolvedAssetJobDefinition = define_asset_job( name="job_ni", selection=AssetSelection.groups("ni"), - description="Downloads UK data.", + description="Downloads Northern Ireland data.", ) From 01f1ecbb0946af4b521851d0fc35c8dd00eb92f5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:45:24 +0100 Subject: [PATCH 27/39] Add key_prefix to init method Co-authored-by: Penelope Yong --- python/popgetter/assets/country.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py index 2dc3fec..5c0dba8 100644 --- a/python/popgetter/assets/country.py +++ b/python/popgetter/assets/country.py @@ -32,9 +32,14 @@ class Country(ABC): """ - key_prefix: str + key_prefix: ClassVar[str] + partition_name: str dataset_node_partition: DynamicPartitionsDefinition + def __init__(self, key_prefix: str): + self.partition_name = f"{self.key_prefix}_nodes" + self.dataset_node_partition = DynamicPartitionsDefinition(name=self.partition_name) + def create_catalog(self): """Creates an asset providing a census metedata catalog.""" From 2387f75ff11a91cccfebe659335eb3703ec20af9 Mon Sep 17 00:00:00 2001 
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 16:53:34 +0100
Subject: [PATCH 28/39] Add methods for adding and removing partition keys

Having methods to add and remove partition keys simplifies the Country
API for subclassing.

Co-authored-by: Penelope Yong
---
 python/popgetter/assets/country.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py
index 5c0dba8..fadbfa7 100644
--- a/python/popgetter/assets/country.py
+++ b/python/popgetter/assets/country.py
@@ -36,6 +36,15 @@ class Country(ABC):
     partition_name: str
     dataset_node_partition: DynamicPartitionsDefinition

+    def add_partition_keys(self, context, keys: list[str]):
+        context.instance.add_dynamic_partitions(
+            partitions_def_name=self.partition_name,
+            partition_keys=keys,
+        )
+
+    def remove_all_partition_keys(self, context):
+        for partition_key in context.instance.get_dynamic_partitions(self.partition_name):
+            context.instance.delete_dynamic_partition(self.partition_name, partition_key)
     def __init__(self, key_prefix: str):
         self.partition_name = f"{self.key_prefix}_nodes"
         self.dataset_node_partition = DynamicPartitionsDefinition(name=self.partition_name)

From 49948df791d14778e570d1fab764ae2321929291 Mon Sep 17 00:00:00 2001
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 16:59:15 +0100
Subject: [PATCH 29/39] Remove obsolete attributes

Co-authored-by: Penelope Yong
---
 python/popgetter/assets/ni/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py
index 9a15c6d..2b989ab 100644
--- a/python/popgetter/assets/ni/__init__.py
+++ b/python/popgetter/assets/ni/__init__.py
@@ -236,11 +236,9 @@


 class NorthernIreland(Country):
-    key_prefix: str = "uk-ni"
-    partition_name: str = "uk-ni_dataset_nodes"
+    key_prefix: ClassVar[str] = "uk-ni"
     geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys())
     required_tables: list[str] | None = REQUIRED_TABLES
-    dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes")

     def _country_metadata(self, _context) -> CountryMetadata:
         return CountryMetadata(

From eab49b6ec505bfae1c62cc6714ce04e4524602ae Mon Sep 17 00:00:00 2001
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:00:41 +0100
Subject: [PATCH 30/39] Remove update to python version in notebook

---
 notebooks/explore.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/explore.ipynb b/notebooks/explore.ipynb
index 8d9a91c..44c18f0 100644
--- a/notebooks/explore.ipynb
+++ b/notebooks/explore.ipynb
@@ -66,7 +66,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.3"
   }
 },
 "nbformat": 4,

From adf5456bb191516aa721f9d0d571728d6798214f Mon Sep 17 00:00:00 2001
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:06:04 +0100
Subject: [PATCH 31/39] Complete comment

---
 python/popgetter/assets/ni/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py
index 2b989ab..5fd78cd 100644
--- a/python/popgetter/assets/ni/__init__.py
+++ b/python/popgetter/assets/ni/__init__.py
@@ -272,7 +272,8 @@ def _catalog(self, context) -> pd.DataFrame:
            https://build.nisra.gov.uk/en/metadata
2. Or through enumerating the ready-made tables: https://build.nisra.gov.uk/en/standard - However, some level of + However, for some geographical resolutions, ready-made tables may + not be available due to data confidentiality. """ catalog_summary = { "node": [], From b801e0e69602c45a5fc5a0c04555811937814ef4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:08:09 +0100 Subject: [PATCH 32/39] Remove all partitions during materialization This is to ensure that only partitions from the latest materialization are included. Co-authored-by: Penelope Yong --- python/popgetter/assets/ni/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 5fd78cd..5f51c19 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -296,6 +296,7 @@ def _catalog(self, context) -> pd.DataFrame: "table_schema": [], } nodes = get_nodes_and_links() + self.remove_all_partition_keys(context) def add_resolution(s: str, geo_level: str) -> str: s_split = s.split("?") From 4667efd0f086b33c8e584b0ed8f197fb2287026c Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:09:40 +0100 Subject: [PATCH 33/39] Use country method to add partitions Co-authored-by: Penelope Yong --- python/popgetter/assets/ni/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 5f51c19..76b4329 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -344,10 +344,7 @@ def add_resolution(s: str, geo_level: str) -> str: catalog_summary["table_schema"].append(metadata["tableSchema"]) catalog_df = pd.DataFrame.from_records(catalog_summary) - context.instance.add_dynamic_partitions( - partitions_def_name=self.partition_name, - partition_keys=catalog_df["partition_key"].to_list(), - ) + self.add_partition_keys(context, catalog_df["partition_key"].to_list()) add_metadata(context, catalog_df, "Catalog") return catalog_df From 0ca5bb485ae63aaa21bda2db7c8e830903eed241 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 17:40:20 +0100 Subject: [PATCH 34/39] Add key_prefix to init --- python/popgetter/assets/ni/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 76b4329..1d02b26 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -14,7 +14,6 @@ import requests from bs4 import BeautifulSoup from dagster import ( - DynamicPartitionsDefinition, MetadataValue, ) from icecream import ic @@ -236,7 +235,7 @@ def census_table_metadata( class NorthernIreland(Country): - key_prefix: ClassVar[str] = "uk-ni" + key_prefix: str geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys()) required_tables: list[str] | None = REQUIRED_TABLES @@ -657,7 +656,7 @@ def _metrics( # Assets -ni = NorthernIreland() +ni = NorthernIreland("uk-ni") country_metadata = ni.create_country_metadata() data_publisher = ni.create_data_publisher() geometry = ni.create_geometry() From f2fbe857d0fcf40a7a51919c5e7c95d7669b0cee Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 18:05:53 +0100 Subject: [PATCH 35/39] Update key_prefix type and add assignment in init --- 
python/popgetter/assets/country.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py index fadbfa7..a64325d 100644 --- a/python/popgetter/assets/country.py +++ b/python/popgetter/assets/country.py @@ -32,10 +32,17 @@ class Country(ABC): """ - key_prefix: ClassVar[str] + key_prefix: str partition_name: str dataset_node_partition: DynamicPartitionsDefinition + def __init__(self, key_prefix: str): + self.key_prefix = key_prefix + self.partition_name = f"{self.key_prefix}_nodes" + self.dataset_node_partition = DynamicPartitionsDefinition( + name=self.partition_name + ) + def add_partition_keys(self, context, keys: list[str]): context.instance.add_dynamic_partitions( partitions_def_name=self.partition_name, @@ -43,11 +50,12 @@ def add_partition_keys(self, context, keys: list[str]): ) def remove_all_partition_keys(self, context): - for partition_key in context.instance.get_dynamic_partitions(self.partition_name): - context.instance.delete_dynamic_partition(self.partition_name, partition_key) - def __init__(self, key_prefix: str): - self.partition_name = f"{self.key_prefix}_nodes" - self.dataset_node_partition = DynamicPartitionsDefinition(name=self.partition_name) + for partition_key in context.instance.get_dynamic_partitions( + self.partition_name + ): + context.instance.delete_dynamic_partition( + self.partition_name, partition_key + ) def create_catalog(self): """Creates an asset providing a census metedata catalog.""" From baaecbb2c9919d604d3168a57af4da415688ead6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Fri, 7 Jun 2024 08:57:08 +0100 Subject: [PATCH 36/39] Fix condition for partition keys included in metadata construction Co-authored-by: Penelope Yong --- python/popgetter/assets/ni/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 1d02b26..98bb27e 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -465,10 +465,10 @@ def _source_metric_metadata( partition_key = context.partition_key if ( self.required_tables is not None - and partition_key not in self.required_tables + and partition_key not in DERIVED_COLUMN_SPECIFICATIONS.keys() ): skip_reason = ( - f"Skipping as requested partition {partition_key} is configured " + f"Skipping as requested partition {partition_key} is not configured " f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}" ) context.log.warning(skip_reason) From 56fff8a68642bca9274be58eb4192e5b118a9d95 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 21:08:22 +0100 Subject: [PATCH 37/39] Revert key_prefix to ClassVar over arg of init --- python/popgetter/assets/country.py | 6 +++--- python/popgetter/assets/ni/__init__.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py index a64325d..2cd43c4 100644 --- a/python/popgetter/assets/country.py +++ b/python/popgetter/assets/country.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +from typing import ClassVar import geopandas as gpd import pandas as pd @@ -32,12 +33,11 @@ class Country(ABC): """ - key_prefix: str + key_prefix: ClassVar[str] partition_name: str dataset_node_partition: DynamicPartitionsDefinition - def __init__(self, key_prefix: 
str): - self.key_prefix = key_prefix + def __init__(self): self.partition_name = f"{self.key_prefix}_nodes" self.dataset_node_partition = DynamicPartitionsDefinition( name=self.partition_name diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 98bb27e..2f060ca 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -235,7 +235,7 @@ def census_table_metadata( class NorthernIreland(Country): - key_prefix: str + key_prefix: ClassVar[str] = "uk-ni" geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys()) required_tables: list[str] | None = REQUIRED_TABLES @@ -656,7 +656,7 @@ def _metrics( # Assets -ni = NorthernIreland("uk-ni") +ni = NorthernIreland() country_metadata = ni.create_country_metadata() data_publisher = ni.create_data_publisher() geometry = ni.create_geometry() From a58a086f7911fe71f1efc3024a3de2f537180776 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 21:13:29 +0100 Subject: [PATCH 38/39] Add jobs list towards simplifying popgetter init --- python/popgetter/__init__.py | 49 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 9d544a5..17febfc 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -63,30 +63,29 @@ ), ] -job_be: UnresolvedAssetJobDefinition = define_asset_job( - name="job_be", - selection=AssetSelection.groups("be"), - description="Downloads Belgian data.", - partitions_def=assets.be.census_tables.dataset_node_partition, -) - -job_us: UnresolvedAssetJobDefinition = define_asset_job( - name="job_us", - selection=AssetSelection.groups("us"), - description="Downloads USA data.", -) - -job_uk: UnresolvedAssetJobDefinition = define_asset_job( - name="job_uk", - selection=AssetSelection.groups("uk"), - description="Downloads UK data.", -) - -job_ni: UnresolvedAssetJobDefinition = define_asset_job( - name="job_ni", - selection=AssetSelection.groups("ni"), - description="Downloads Northern Ireland data.", -) +jobs: list[UnresolvedAssetJobDefinition] = [ + define_asset_job( + name="job_be", + selection=AssetSelection.groups("be"), + description="Downloads Belgian data.", + partitions_def=assets.be.census_tables.dataset_node_partition, + ), + define_asset_job( + name="job_us", + selection=AssetSelection.groups("us"), + description="Downloads USA data.", + ), + define_asset_job( + name="job_uk", + selection=AssetSelection.groups("uk"), + description="Downloads UK data.", + ), + define_asset_job( + name="job_ni", + selection=AssetSelection.groups("ni"), + description="Downloads Northern Ireland data.", + ), +] def resources_by_env(): @@ -127,5 +126,5 @@ def resources_by_env(): cloud_outputs.metrics_sensor, ], resources=resources, - jobs=[job_be, job_us, job_uk, job_ni], + jobs=jobs, ) From 10c2126d3dd49d06e0b377872939d29f79d682a5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 7 Jun 2024 09:05:59 +0100 Subject: [PATCH 39/39] Fix ruff lint --- python/popgetter/assets/ni/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 2f060ca..6fc1242 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -465,7 +465,7 @@ def _source_metric_metadata( partition_key = context.partition_key if ( self.required_tables is not None - and partition_key not in 
DERIVED_COLUMN_SPECIFICATIONS.keys() + and partition_key not in DERIVED_COLUMN_SPECIFICATIONS ): skip_reason = ( f"Skipping as requested partition {partition_key} is not configured "
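
The sketch below is not part of the patch series; it illustrates how the
Country base class introduced above is intended to be subclassed for a new
country port. The country prefix "xx", the class name and the stub body are
hypothetical; only Country, its create_* asset factories and the partition key
helpers come from the patches above.

    from __future__ import annotations

    from typing import ClassVar

    import pandas as pd

    from popgetter.assets.country import Country


    class ExampleCountry(Country):
        # Mirrors NorthernIreland's ClassVar prefix "uk-ni"
        key_prefix: ClassVar[str] = "xx"

        def _catalog(self, context) -> pd.DataFrame:
            # One row per dataset node, keyed "<geo level>/<table id>"; the
            # keys registered here drive the partitioned census_tables,
            # source_metric_metadata and derived_metrics assets.
            catalog_df = pd.DataFrame({"partition_key": ["GEO1/TABLE1"]})
            self.remove_all_partition_keys(context)
            self.add_partition_keys(context, catalog_df["partition_key"].to_list())
            return catalog_df

        # The remaining abstract methods (_country_metadata, _data_publisher,
        # _geometry, _source_data_releases, _census_tables,
        # _source_metric_metadata, _derived_metrics and _metrics) must also be
        # implemented before the class can be instantiated.


    # Module-level asset construction then mirrors python/popgetter/assets/ni:
    # xx = ExampleCountry()
    # catalog = xx.create_catalog()
    # census_tables = xx.create_census_tables()
    # derived_metrics = xx.create_derived_metrics()
    # metrics = xx.create_metrics()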