From 57d0957b549387c2fbc109dd3e4eecf2ed5b99bc Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 May 2024 17:56:16 +0100 Subject: [PATCH 01/39] Initial set-up for port of Northern Ireland to dagster --- python/popgetter/__init__.py | 9 +- python/popgetter/assets/__init__.py | 2 +- python/popgetter/assets/ni/__init__.py | 399 +++++++++++++++++++++++++ python/popgetter/utils.py | 22 ++ 4 files changed, 430 insertions(+), 2 deletions(-) create mode 100644 python/popgetter/assets/ni/__init__.py diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 5eeab89..3ee36c8 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -44,6 +44,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), + *load_assets_from_package_module(assets.ni, group_name="ni"), *load_assets_from_modules([cloud_outputs], group_name="cloud_assets"), ] @@ -66,6 +67,12 @@ description="Downloads UK data.", ) +job_ni: UnresolvedAssetJobDefinition = define_asset_job( + name="job_ni", + selection=AssetSelection.groups("ni"), + partitions_def=assets.ni.dataset_node_partition, +) + resources_by_env = { "prod": { "general_io_manager": AzureGeneralIOManager(".bin"), @@ -92,5 +99,5 @@ schedules=[], sensors=[cloud_outputs.country_outputs_sensor], resources=resources, - jobs=[job_be, job_us, job_uk], + jobs=[job_be, job_us, job_uk, job_ni], ) diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index e050bf8..c16f0c1 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,3 +1,3 @@ from __future__ import annotations -from . import be, uk, us # noqa: F401 +from . 
import be, ni, uk, us # noqa: F401 diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py new file mode 100644 index 0000000..7fdbe3f --- /dev/null +++ b/python/popgetter/assets/ni/__init__.py @@ -0,0 +1,399 @@ +from __future__ import annotations + +import io +from abc import ABC +from dataclasses import dataclass +from datetime import date + +import geopandas as gpd +import matplotlib.pyplot as plt +import pandas as pd +import requests +from bs4 import BeautifulSoup +from dagster import ( + AssetExecutionContext, + DynamicPartitionsDefinition, + MetadataValue, + asset, +) + +from popgetter.metadata import ( + CountryMetadata, + DataPublisher, + GeometryMetadata, + MetricMetadata, + SourceDataRelease, + metadata_to_dataframe, +) +from popgetter.utils import add_metadata, markdown_from_plot + +PARTITION_NAME = "uk-ni_dataset_nodes" +REQUIRED_TABLES = [ + "MS-A09", +] +REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) +REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] +GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" + +# TODO: get correct dates +CENSUS_REFERENCE_DATE = date(2011, 3, 27) +CENSUS_COLLECTION_DATE = date(2011, 3, 27) +CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) +CENSUS_REFERENCE_DATE = date(2021, 3, 1) +CENSUS_PUBLICATION_DATE = date(2021, 3, 1) + + +@dataclass +class NIGeometryLevel: + level: str + hxl_tag: str + geo_id_column: str + name_columns: dict[str, str] # keys = language codes, values = column names + url: str + + +NI_GEO_LEVELS = { + "DZ21": NIGeometryLevel( + level="DZ21", + hxl_tag="TBD", + geo_id_column="DZ2021_cd", + name_columns={"eng": "DZ2021_nm"}, + url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", + ) +} + +# Full list of geographies, see metadata: +# https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE +GEO_LEVELS = [ + "LGD14", # Local Government District 2014 + # "URBAN_STATUS", # Urban Status + # "HEALTH_TRUST", # Health and Social Care Trust + # "PARLCON24", # Parliamentary Constituency 2024 + # "DEA14", # District Electoral Area 2014 + "SDZ21", # Census 2021 Super Data Zone + "DZ21", # Census 2021 Data Zone +] + + +class Country(ABC): + def catalog(self, context) -> pd.DataFrame: + ... + + def source_table(self, context): + ... + + def census_table(self, context): + ... + + def derived_table(self, context): + ... + + +# async fn population(&self) -> anyhow::Result { +# let url = +# "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=DZ21&v=UR_SEX&v=AGE_SYOA_85"; +# let data: Vec = reqwest::get(url).await?.text().await?.bytes().collect(); +# Ok(CsvReader::new(Cursor::new(data)) +# .has_header(true) +# .finish()?) +# } +# async fn geojson(&self) -> anyhow::Result { +# let url = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-geojson.zip"; +# let mut tmpfile = tempfile::tempfile()?; +# tmpfile.write_all(&reqwest::get(url).await?.bytes().await?)?; +# let mut zip = zip::ZipArchive::new(tmpfile)?; +# let mut file = zip.by_name("DZ2021.geojson")?; +# let mut buffer = String::from(""); +# file.read_to_string(&mut buffer)?; +# Ok(buffer.parse()?) 
+# } + + +def get_nodes_and_links() -> dict[str, dict[str, str]]: + SCHEME_AND_HOST = "https://build.nisra.gov.uk" + urls = [ + "".join([SCHEME_AND_HOST, url.get("href")]) + for url in BeautifulSoup( + requests.get(SCHEME_AND_HOST + "/en/standard").content, features="lxml" + ).find_all("a") + if str(url.get("href")).startswith("/en/standard") + ] + nodes: dict[str, dict[str, str]] = {} + for url in urls: + soup = BeautifulSoup(requests.get(url).content, features="lxml") + nodes[url] = { + "table_url": list( + set( + [ + "".join([SCHEME_AND_HOST, link.get("href")]) + for link in soup.find_all("a") + if "table.csv?" in link.get("href") + ] + ) + )[0], + "metadata_url": list( + set( + [ + "".join([SCHEME_AND_HOST, link.get("href")]) + for link in soup.find_all("a") + if "table.csv-metadata" in link.get("href") + ] + ) + )[0], + } + return nodes + + +class NorthernIreland(Country): + partition_name: str = PARTITION_NAME + geo_levels: list[str] = GEO_LEVELS + required_tables: list[str] = REQUIRED_TABLES + + def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: + """ + A catalog for NI can be generated in two ways: + 1. With flexible table builder: + https://build.nisra.gov.uk/en/ + with metadata chosen from: + https://build.nisra.gov.uk/en/metadata + 2. Or through enumerating the ready-made tables: + https://build.nisra.gov.uk/en/standard + However, some level of + """ + catalog_summary = { + "node": [], + "partition_key": [], + "table_id": [], + "geo_level": [], + "human_readable_name": [], + "description": [], + "metric_parquet_file_url": [], + "parquet_column_name": [], + "parquet_margin_of_error_column": [], + "parquet_margin_of_error_file": [], + "potential_denominator_ids": [], + "parent_metric_id": [], + "source_data_release_id": [], + "source_download_url": [], + "source_format": [], + "source_archive_file_path": [], + "source_documentation_url": [], + "table_schema": [], + } + nodes = get_nodes_and_links() + + def add_resolution(s: str, geo_level: str) -> str: + s_split = s.split("?") + return "?".join([s_split[0], f"v={geo_level}&" + s_split[1]]) + + for node_url, node_items in nodes.items(): + for geo_level in self.geo_levels: + metadata = requests.get(node_items["metadata_url"]).json() + table_id = metadata["dc:title"].split(":")[0] + # Skip if not required + if table_id not in self.required_tables: + continue + + catalog_summary["node"].append(node_url) + catalog_summary["table_id"].append(table_id) + catalog_summary["geo_level"].append(geo_level) + catalog_summary["partition_key"].append(f"{geo_level}/{table_id}") + catalog_summary["human_readable_name"].append(metadata["dc:title"]) + catalog_summary["description"].append(metadata["dc:description"]) + catalog_summary["metric_parquet_file_url"].append(None) + catalog_summary["parquet_column_name"].append(None) + catalog_summary["parquet_margin_of_error_column"].append(None) + catalog_summary["parquet_margin_of_error_file"].append(None) + catalog_summary["potential_denominator_ids"].append(None) + catalog_summary["parent_metric_id"].append(None) + catalog_summary["source_data_release_id"].append(None) + catalog_summary["source_download_url"].append( + add_resolution(metadata["url"], geo_level) + ) + catalog_summary["source_format"].append(None) + catalog_summary["source_archive_file_path"].append(None) + catalog_summary["source_documentation_url"].append(node_url) + catalog_summary["table_schema"].append(metadata["tableSchema"]) + + catalog_df = pd.DataFrame.from_records(catalog_summary) + 
context.instance.add_dynamic_partitions( + partitions_def_name=self.partition_name, + partition_keys=catalog_df["partition_key"].to_list(), + ) + + add_metadata(context, catalog_df, "Catalog") + return catalog_df + + def census_tables( + self, context: AssetExecutionContext, catalog: pd.DataFrame, partition + ) -> pd.DataFrame: + url = catalog.loc[ + catalog["partition_key"].eq(partition), "source_download_url" + ].iloc[0] + return pd.read_csv(io.BytesIO(requests.get(url).content), encoding="utf8") + + def source_table(self) -> pd.DataFrame: + return pd.DataFrame() + + +country: CountryMetadata = CountryMetadata( + name_short_en="Northern Ireland", + name_official="Northern Ireland", + iso3="GBR", + iso2="GB", + iso3166_2="GB-NIR", +) + +publisher: DataPublisher = DataPublisher( + name="NISRA", + url="https://www.nisra.gov.uk/", + description="The Northern Ireland Statistics and Research Agency (NISRA), which incorporates the General Register Office (GRO), is an executive agency within the Department of Finance (NI) and was established on 1 April 1996.", + countries_of_interest=[country.id], +) + + +@asset +def source_data_release( + context: AssetExecutionContext, + geographies: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], +) -> list[SourceDataRelease]: + source_data_releases = [] + for geo_level in geographies[0]: + source_data_release: SourceDataRelease = SourceDataRelease( + name="Census 2021", + date_published=date(2014, 2, 27), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", + data_publisher_id=publisher.id, + description="TBC", + # geography_file="TBC", + # geography_level="TBC", + # countries_of_interest=[country.id], + geometry_metadata_id="tbd", + ) + source_data_releases.append(source_data_release) + return source_data_releases + + +key_prefix = "uk-ni" + +ni = NorthernIreland() + +dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + + +@asset(key_prefix=key_prefix) +def catalog(context) -> pd.DataFrame: + return ni.catalog(context) + + +@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) +def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: + census_table = ni.census_tables( + context, catalog, context.asset_partition_key_for_output() + ) + add_metadata(context, census_table, title=context.asset_partition_key_for_output()) + return census_table + + +@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) +def source_tables( + context: AssetExecutionContext, census_tables: pd.DataFrame +) -> pd.DataFrame: + return census_tables + + +def source_metadata_from_catalog(catalog) -> MetricMetadata: + ... 
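For orientation, the partition keys registered above follow the scheme "geo_level/table_id", and `census_tables` later resolves a key back to its download URL with a plain lookup on the catalog. A minimal, self-contained sketch of that lookup (the catalog rows here are invented; the URL shape follows the NISRA flexible table builder referenced elsewhere in this patch):

```python
# Sketch only: how a "geo_level/table_id" partition key resolves to a URL,
# mirroring the lookup in NorthernIreland.census_tables. Rows are invented.
import pandas as pd

toy_catalog = pd.DataFrame(
    {
        "partition_key": ["DZ21/MS-A09", "SDZ21/MS-A09"],
        "source_download_url": [
            "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=DZ21",
            "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=SDZ21",
        ],
    }
)


def url_for_partition(catalog: pd.DataFrame, partition: str) -> str:
    # Filter on the partition key and take the single matching row's URL.
    return catalog.loc[
        catalog["partition_key"].eq(partition), "source_download_url"
    ].iloc[0]


assert url_for_partition(toy_catalog, "DZ21/MS-A09").endswith("v=DZ21")
```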
+ + +geometry_metadata: GeometryMetadata = GeometryMetadata( + validity_period_start=date(2023, 1, 1), + validity_period_end=date(2023, 12, 31), + level="municipality", + # country -> province -> region -> arrondisement -> municipality + hxl_tag="adm4", +) + + +@asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) +def geographies( + context: AssetExecutionContext, +) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: + level_details = NI_GEO_LEVELS["DZ21"] + + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=date(2023, 1, 1), + validity_period_end=date(2023, 12, 31), + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + region_geometries_raw = ( + gpd.read_file(level_details.url) + .dissolve(by=level_details.geo_id_column) + .reset_index() + ) + region_geometries = region_geometries_raw.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + region_names = ( + region_geometries_raw.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["eng"]: "eng", + } + ) + .loc[:, ["GEO_ID", "eng"]] + .drop_duplicates() + ) + + # Generate a plot and convert the image to Markdown to preview it within + # Dagster + joined_gdf = region_geometries.merge(region_names, on="GEO_ID") + ax = joined_gdf.plot(column="eng", legend=False) + ax.set_title(f"Northern Ireland 2023 {level_details.level}") + md_plot = markdown_from_plot(plt) + + geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) + + context.add_output_metadata( + metadata={ + "num_records": len(region_geometries), + "geometry_plot": MetadataValue.md(md_plot), + "names_preview": MetadataValue.md(region_names.head().to_markdown()), + "metadata_preview": MetadataValue.md( + geometry_metadata_df.head().to_markdown() + ), + }, + ) + + context.add_output_metadata( + metadata={ + "num_records": len(region_geometries), + "geometry_plot": MetadataValue.md(md_plot), + "names_preview": MetadataValue.md(region_names.head().to_markdown()), + "metadata_preview": MetadataValue.md( + geometry_metadata_df.head().to_markdown() + ), + }, + ) + return geometry_metadata_df, region_geometries, region_names + + +# @asset(partitions_def=dataset_node_partition, key_prefix=asset_prefix) +# def source_mmd(context: AssetExecutionContext, catalog: pd.DataFrame) -> list[MetricMetadata]: +# # return census_tables +# source_metadata_from_catalog(catalog) + +# @asset +# def source_tables() -> pd.DataFrame: +# return ni.catalog() + +# @asset +# def derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: +# # return ni.catalog() diff --git a/python/popgetter/utils.py b/python/popgetter/utils.py index 724377f..48070cc 100644 --- a/python/popgetter/utils.py +++ b/python/popgetter/utils.py @@ -11,10 +11,13 @@ from tempfile import TemporaryDirectory import fsspec +import geopandas as gpd +import pandas as pd import requests from dagster import ( ConfigurableResource, EnvVar, + MetadataValue, get_dagster_logger, ) @@ -297,6 +300,25 @@ def download_zipped_files(zipfile_url: str, output_dir: str) -> None: z.extractall(output_dpath) +def add_metadata( + context, + df: pd.DataFrame | gpd.GeoDataFrame, + title: str | list[str], + output_name: str | None = None, +): + context.add_output_metadata( + metadata={ + "title": title, + "num_records": len(df), + "columns": MetadataValue.md( + "\n".join([f"- '`{col}`'" for col in df.columns.to_list()]) + ), + "preview": MetadataValue.md(df.head().to_markdown()), + }, + 
output_name=output_name, + ) + + if __name__ == "__main__": pass # This is for testing only From 649ed1e8f05b3c7adb99eb5db93e0fba6857adce Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 16 May 2024 17:20:58 +0100 Subject: [PATCH 02/39] Update NI port --- python/popgetter/__init__.py | 2 +- python/popgetter/assets/ni/__init__.py | 292 ++++++++++----------- python/popgetter/cloud_outputs/__init__.py | 2 + python/popgetter/io_managers/__init__.py | 17 +- 4 files changed, 150 insertions(+), 163 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 2312895..f12c1a5 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -45,7 +45,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module(assets.ni, group_name="ni"), + *load_assets_from_package_module(assets.ni, group_name="ni", key_prefix="uk-ni"), *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"), ] diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 7fdbe3f..3975c8a 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -1,7 +1,6 @@ from __future__ import annotations import io -from abc import ABC from dataclasses import dataclass from datetime import date @@ -16,14 +15,15 @@ MetadataValue, asset, ) +from icecream import ic +from popgetter.assets.common import Country from popgetter.metadata import ( CountryMetadata, DataPublisher, GeometryMetadata, MetricMetadata, SourceDataRelease, - metadata_to_dataframe, ) from popgetter.utils import add_metadata, markdown_from_plot @@ -32,15 +32,19 @@ "MS-A09", ] REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) -REQUIRED_RELEASES = ["3A", "3I", "2A", "3C"] -GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" -# TODO: get correct dates -CENSUS_REFERENCE_DATE = date(2011, 3, 27) -CENSUS_COLLECTION_DATE = date(2011, 3, 27) -CENSUS_EXPECT_NEXT_UPDATE = date(2022, 1, 1) -CENSUS_REFERENCE_DATE = date(2021, 3, 1) -CENSUS_PUBLICATION_DATE = date(2021, 3, 1) +# TODO +REQUIRED_RELEASES = [""] +# GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" + +# TODO: get these are correct dates +CENSUS_REFERENCE_DATE = date(2021, 3, 21) +CENSUS_COLLECTION_DATE = date(2021, 3, 21) +CENSUS_EXPECT_NEXT_UPDATE = date(2031, 1, 1) +CENSUS_REFERENCE_DATE = date(2021, 3, 21) +# https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: +# 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas +CENSUS_PUBLICATION_DATE = date(2023, 2, 21) @dataclass @@ -57,7 +61,7 @@ class NIGeometryLevel: level="DZ21", hxl_tag="TBD", geo_id_column="DZ2021_cd", - name_columns={"eng": "DZ2021_nm"}, + name_columns={"en": "DZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", ) } @@ -75,40 +79,6 @@ class NIGeometryLevel: ] -class Country(ABC): - def catalog(self, context) -> pd.DataFrame: - ... - - def source_table(self, context): - ... - - def census_table(self, context): - ... - - def derived_table(self, context): - ... 
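This hunk drops the module-local `Country` ABC in favour of the shared `popgetter.assets.common.Country` imported above. Based only on the local class removed here, the shared base class presumably exposes an interface along these lines (a sketch, not the actual definition in `assets/common.py`):

```python
# Assumed shape of popgetter.assets.common.Country, inferred from the local
# ABC removed in this hunk; the real base class may define additional hooks.
from __future__ import annotations

from abc import ABC, abstractmethod

import pandas as pd


class Country(ABC):
    @abstractmethod
    def catalog(self, context) -> pd.DataFrame:
        """Return one catalog row per 'geo_level/table_id' partition."""

    @abstractmethod
    def census_tables(
        self, context, catalog: pd.DataFrame, partition: str
    ) -> pd.DataFrame:
        """Download and return the raw census table for one partition."""
```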
- - -# async fn population(&self) -> anyhow::Result { -# let url = -# "https://build.nisra.gov.uk/en/custom/table.csv?d=PEOPLE&v=DZ21&v=UR_SEX&v=AGE_SYOA_85"; -# let data: Vec = reqwest::get(url).await?.text().await?.bytes().collect(); -# Ok(CsvReader::new(Cursor::new(data)) -# .has_header(true) -# .finish()?) -# } -# async fn geojson(&self) -> anyhow::Result { -# let url = "https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-geojson.zip"; -# let mut tmpfile = tempfile::tempfile()?; -# tmpfile.write_all(&reqwest::get(url).await?.bytes().await?)?; -# let mut zip = zip::ZipArchive::new(tmpfile)?; -# let mut file = zip.by_name("DZ2021.geojson")?; -# let mut buffer = String::from(""); -# file.read_to_string(&mut buffer)?; -# Ok(buffer.parse()?) -# } - - def get_nodes_and_links() -> dict[str, dict[str, str]]: SCHEME_AND_HOST = "https://build.nisra.gov.uk" urls = [ @@ -149,7 +119,8 @@ class NorthernIreland(Country): geo_levels: list[str] = GEO_LEVELS required_tables: list[str] = REQUIRED_TABLES - def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: + # def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: + def catalog(self, context) -> pd.DataFrame: """ A catalog for NI can be generated in two ways: 1. With flexible table builder: @@ -184,7 +155,14 @@ def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: def add_resolution(s: str, geo_level: str) -> str: s_split = s.split("?") - return "?".join([s_split[0], f"v={geo_level}&" + s_split[1]]) + query_params = s_split[1].split("&") + if query_params[0].startswith("d="): + query_params = "&".join( + [query_params[0], f"v={geo_level}", *query_params[1:]] + ) + else: + query_params = "&".join([f"v={geo_level}", *query_params[:]]) + return "?".join([s_split[0], query_params]) for node_url, node_items in nodes.items(): for geo_level in self.geo_levels: @@ -224,9 +202,7 @@ def add_resolution(s: str, geo_level: str) -> str: add_metadata(context, catalog_df, "Catalog") return catalog_df - def census_tables( - self, context: AssetExecutionContext, catalog: pd.DataFrame, partition - ) -> pd.DataFrame: + def census_tables(self, context, catalog: pd.DataFrame, partition) -> pd.DataFrame: url = catalog.loc[ catalog["partition_key"].eq(partition), "source_download_url" ].iloc[0] @@ -252,33 +228,6 @@ def source_table(self) -> pd.DataFrame: ) -@asset -def source_data_release( - context: AssetExecutionContext, - geographies: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], -) -> list[SourceDataRelease]: - source_data_releases = [] - for geo_level in geographies[0]: - source_data_release: SourceDataRelease = SourceDataRelease( - name="Census 2021", - date_published=date(2014, 2, 27), - reference_period_start=CENSUS_REFERENCE_DATE, - reference_period_end=CENSUS_REFERENCE_DATE, - collection_period_start=CENSUS_COLLECTION_DATE, - collection_period_end=CENSUS_COLLECTION_DATE, - expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, - url="https://www.nrscotland.gov.uk/news/2014/census-2011-release-3a", - data_publisher_id=publisher.id, - description="TBC", - # geography_file="TBC", - # geography_level="TBC", - # countries_of_interest=[country.id], - geometry_metadata_id="tbd", - ) - source_data_releases.append(source_data_release) - return source_data_releases - - key_prefix = "uk-ni" ni = NorthernIreland() @@ -286,13 +235,14 @@ def source_data_release( dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) -@asset(key_prefix=key_prefix) +@asset def catalog(context) -> pd.DataFrame: return 
ni.catalog(context) -@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) -def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: +@asset(partitions_def=dataset_node_partition) +# def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: +def census_tables(context, catalog) -> pd.DataFrame: census_table = ni.census_tables( context, catalog, context.asset_partition_key_for_output() ) @@ -300,100 +250,126 @@ def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: return census_table -@asset(partitions_def=dataset_node_partition, key_prefix=key_prefix) -def source_tables( - context: AssetExecutionContext, census_tables: pd.DataFrame -) -> pd.DataFrame: +@asset(partitions_def=dataset_node_partition) +def source_tables(context, census_tables: pd.DataFrame) -> pd.DataFrame: return census_tables -def source_metadata_from_catalog(catalog) -> MetricMetadata: - ... - - -geometry_metadata: GeometryMetadata = GeometryMetadata( - validity_period_start=date(2023, 1, 1), - validity_period_end=date(2023, 12, 31), - level="municipality", - # country -> province -> region -> arrondisement -> municipality - hxl_tag="adm4", -) +def source_metadata_from_catalog( + catalog: pd.DataFrame, parition_key: str, source_data_release: SourceDataRelease +) -> MetricMetadata: + catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] + return MetricMetadata( + human_readable_name=catalog_row["human_readable_name"], + source_download_url=catalog_row["source_download_url"], + source_archive_file_path=catalog_row["source_archive_file_path"], + source_documentation_url=catalog_row["source_documentation_url"], + source_data_release_id=source_data_release.id, + # TODO - this is a placeholder + parent_metric_id="unknown_at_this_stage", + potential_denominator_ids=None, + parquet_margin_of_error_file=None, + parquet_margin_of_error_column=None, + parquet_column_name=catalog_row["source_column"], + # TODO - this is a placeholder + metric_parquet_file_url="unknown_at_this_stage", + hxl_tag=catalog_row["hxltag"], + description=catalog_row["description"], + source_metric_id=catalog_row["hxltag"], + ) -@asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) -def geographies( - context: AssetExecutionContext, -) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: +@asset +# @asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) +def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + # TODO: This is almost identical to Belgium so can probably be refactored to common + # function with config of releases and languages level_details = NI_GEO_LEVELS["DZ21"] - # TODO: get correct values - geometry_metadata = GeometryMetadata( - validity_period_start=date(2023, 1, 1), - validity_period_end=date(2023, 12, 31), - level=level_details.level, - hxl_tag=level_details.hxl_tag, - ) - region_geometries_raw = ( - gpd.read_file(level_details.url) - .dissolve(by=level_details.geo_id_column) - .reset_index() - ) - region_geometries = region_geometries_raw.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] - region_names = ( - region_geometries_raw.rename( - columns={ - level_details.geo_id_column: "GEO_ID", - level_details.name_columns["eng"]: "eng", - } + geometries_to_return = [] + for level_details in NI_GEO_LEVELS.values(): + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=CENSUS_COLLECTION_DATE, + 
validity_period_end=CENSUS_COLLECTION_DATE, + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + region_geometries_raw = ( + gpd.read_file(level_details.url) + .dissolve(by=level_details.geo_id_column) + .reset_index() + ) + context.log.debug(ic(region_geometries_raw.head())) + region_geometries = region_geometries_raw.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + region_names = ( + region_geometries_raw.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["en"]: "en", + } + ) + .loc[:, ["GEO_ID", "en"]] + .drop_duplicates() + ) + geometries_to_return.append( + (geometry_metadata, region_geometries, region_names) ) - .loc[:, ["GEO_ID", "eng"]] - .drop_duplicates() - ) - # Generate a plot and convert the image to Markdown to preview it within - # Dagster - joined_gdf = region_geometries.merge(region_names, on="GEO_ID") - ax = joined_gdf.plot(column="eng", legend=False) - ax.set_title(f"Northern Ireland 2023 {level_details.level}") + # Add output metadata + first_metadata, first_gdf, first_names = geometries_to_return[0] + first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + ax = first_joined_gdf.plot(column="en", legend=False) + ax.set_title(f"NI 2023 {first_metadata.level}") md_plot = markdown_from_plot(plt) - - geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) - context.add_output_metadata( metadata={ - "num_records": len(region_geometries), - "geometry_plot": MetadataValue.md(md_plot), - "names_preview": MetadataValue.md(region_names.head().to_markdown()), - "metadata_preview": MetadataValue.md( - geometry_metadata_df.head().to_markdown() + "all_geom_levels": MetadataValue.md( + ",".join([metadata.level for metadata, _, _ in geometries_to_return]) ), - }, + "first_geometry_plot": MetadataValue.md(md_plot), + "first_names_preview": MetadataValue.md(first_names.head().to_markdown()), + } ) - context.add_output_metadata( - metadata={ - "num_records": len(region_geometries), - "geometry_plot": MetadataValue.md(md_plot), - "names_preview": MetadataValue.md(region_names.head().to_markdown()), - "metadata_preview": MetadataValue.md( - geometry_metadata_df.head().to_markdown() - ), - }, - ) - return geometry_metadata_df, region_geometries, region_names + return geometries_to_return -# @asset(partitions_def=dataset_node_partition, key_prefix=asset_prefix) -# def source_mmd(context: AssetExecutionContext, catalog: pd.DataFrame) -> list[MetricMetadata]: -# # return census_tables -# source_metadata_from_catalog(catalog) +@asset() +def source_data_release( + context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] +) -> list[SourceDataRelease]: + source_data_releases = [] + for geo_metadata, _, _ in geometry: + # TODO: update with dates from config + source_data_release: SourceDataRelease = SourceDataRelease( + name="Census 2021", + date_published=date(2014, 2, 27), + reference_period_start=CENSUS_REFERENCE_DATE, + reference_period_end=CENSUS_REFERENCE_DATE, + collection_period_start=CENSUS_COLLECTION_DATE, + collection_period_end=CENSUS_COLLECTION_DATE, + expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus", + data_publisher_id=publisher.id, + description="TBC", + geometry_metadata_id=geo_metadata.id, + ) + source_data_releases.append(source_data_release) + return source_data_releases -# @asset -# def source_tables() -> pd.DataFrame: -# return ni.catalog() -# @asset -# def 
derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: -# # return ni.catalog() +@asset(partitions_def=dataset_node_partition) +def source_mmd( + context: AssetExecutionContext, + catalog: pd.DataFrame, + source_data_release: list[SourceDataRelease], +) -> list[MetricMetadata]: + source_metadata_from_catalog(catalog) + + +@asset +def derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: + return ni.catalog() diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index 96ba6f8..cf7b589 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -7,6 +7,7 @@ "be/country_metadata", "be/data_publisher", "be/source_data_release_munip", + "uk-ni/source_data_release", ], io_manager_key="metadata_io_manager", prefix="metadata", @@ -19,6 +20,7 @@ geometry_factory = CloudAssetSensor( asset_names_to_monitor=[ "be/geometry", + "uk-ni/geometry", ], io_manager_key="geometry_io_manager", prefix="geometry", diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 5bf6dd1..97b008e 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -33,13 +33,17 @@ def load_input(self, _context: InputContext) -> pd.DataFrame: class MetadataIOManager(PopgetterIOManager): def get_output_filename( - self, obj: CountryMetadata | DataPublisher | SourceDataRelease + self, + obj: CountryMetadata + | DataPublisher + | SourceDataRelease + | list[SourceDataRelease], ) -> str: if isinstance(obj, CountryMetadata): return "country_metadata.parquet" if isinstance(obj, DataPublisher): return "data_publishers.parquet" - if isinstance(obj, SourceDataRelease): + if isinstance(obj, SourceDataRelease) or isinstance(obj, list): return "source_data_releases.parquet" err_msg = "This IO manager only accepts CountryMetadata, DataPublisher, and SourceDataRelease" @@ -57,11 +61,16 @@ def get_full_path( def handle_output( self, context: OutputContext, - obj: CountryMetadata | DataPublisher | SourceDataRelease, + obj: CountryMetadata + | DataPublisher + | SourceDataRelease + | list[SourceDataRelease], ): + if not isinstance(obj, list): + obj = [obj] full_path = self.get_full_path(context, obj) context.add_output_metadata(metadata={"parquet_path": str(full_path)}) - self.handle_df(context, metadata_to_dataframe([obj]), full_path) + self.handle_df(context, metadata_to_dataframe(obj), full_path) class GeoIOManager(PopgetterIOManager): From bf3ea60dd04fbd7e055ee3270d18934a4670c80c Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 16 May 2024 18:12:05 +0100 Subject: [PATCH 03/39] Adding transformations to match updated Belgium --- python/popgetter/assets/ni/__init__.py | 292 +++++++++++++++++++++---- 1 file changed, 247 insertions(+), 45 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 3975c8a..651fe7e 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -1,8 +1,10 @@ from __future__ import annotations import io +from collections.abc import Callable from dataclasses import dataclass from datetime import date +from functools import reduce import geopandas as gpd import matplotlib.pyplot as plt @@ -10,9 +12,12 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetExecutionContext, + AssetIn, DynamicPartitionsDefinition, + IdentityPartitionMapping, MetadataValue, + SpecificPartitionsPartitionMapping, + 
StaticPartitionsDefinition, asset, ) from icecream import ic @@ -24,6 +29,7 @@ GeometryMetadata, MetricMetadata, SourceDataRelease, + metadata_to_dataframe, ) from popgetter.utils import add_metadata, markdown_from_plot @@ -158,11 +164,13 @@ def add_resolution(s: str, geo_level: str) -> str: query_params = s_split[1].split("&") if query_params[0].startswith("d="): query_params = "&".join( - [query_params[0], f"v={geo_level}", *query_params[1:]] + [query_params[0], f"v={geo_level}", *query_params[2:]] ) else: - query_params = "&".join([f"v={geo_level}", *query_params[:]]) - return "?".join([s_split[0], query_params]) + query_params = "&".join([f"v={geo_level}", *query_params[1:]]) + out_url = "?".join([s_split[0], query_params]) + ic(out_url) + return out_url for node_url, node_items in nodes.items(): for geo_level in self.geo_levels: @@ -202,7 +210,11 @@ def add_resolution(s: str, geo_level: str) -> str: add_metadata(context, catalog_df, "Catalog") return catalog_df - def census_tables(self, context, catalog: pd.DataFrame, partition) -> pd.DataFrame: + def census_tables( + self, context, catalog: pd.DataFrame, partition: str + ) -> pd.DataFrame: + ic(partition) + ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) url = catalog.loc[ catalog["partition_key"].eq(partition), "source_download_url" ].iloc[0] @@ -250,35 +262,6 @@ def census_tables(context, catalog) -> pd.DataFrame: return census_table -@asset(partitions_def=dataset_node_partition) -def source_tables(context, census_tables: pd.DataFrame) -> pd.DataFrame: - return census_tables - - -def source_metadata_from_catalog( - catalog: pd.DataFrame, parition_key: str, source_data_release: SourceDataRelease -) -> MetricMetadata: - catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] - return MetricMetadata( - human_readable_name=catalog_row["human_readable_name"], - source_download_url=catalog_row["source_download_url"], - source_archive_file_path=catalog_row["source_archive_file_path"], - source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=source_data_release.id, - # TODO - this is a placeholder - parent_metric_id="unknown_at_this_stage", - potential_denominator_ids=None, - parquet_margin_of_error_file=None, - parquet_margin_of_error_column=None, - parquet_column_name=catalog_row["source_column"], - # TODO - this is a placeholder - metric_parquet_file_url="unknown_at_this_stage", - hxl_tag=catalog_row["hxltag"], - description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], - ) - - @asset # @asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: @@ -340,7 +323,7 @@ def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataF @asset() def source_data_release( context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] -) -> list[SourceDataRelease]: +) -> SourceDataRelease: source_data_releases = [] for geo_metadata, _, _ in geometry: # TODO: update with dates from config @@ -358,18 +341,237 @@ def source_data_release( geometry_metadata_id=geo_metadata.id, ) source_data_releases.append(source_data_release) - return source_data_releases + # TODO: update for multiple source data releases + return source_data_releases[0] -@asset(partitions_def=dataset_node_partition) -def source_mmd( - context: AssetExecutionContext, +# @asset(partitions_def=dataset_node_partition) +# def source_mmd( +# context, +# 
catalog: pd.DataFrame, +# source_data_release: SourceDataRelease, +# ) -> list[MetricMetadata]: +# source_metadata_from_catalog(catalog) + + +# TODO: check if this is a simpler approach? +# @asset(partitions_def=dataset_node_partition) +# def source_tables( +# context: AssetExecutionContext, census_tables: pd.DataFrame +# ) -> pd.DataFrame: +# if context.partition_key not in DERIVED_COLUMN_SPECIFICATIONS.keys(): +# raise ValueError(f"Specified partition '{context.partition_key}' not handled") +# return census_tables + + +@dataclass +class DerivedColumn: + hxltag: str + filter_func: Callable[[pd.DataFrame], pd.DataFrame] + output_column_name: str + human_readable_name: str + + +# The keys of this dict are the nodes (i.e. partition keys). The values are a +# list of all columns of data derived from this node. +age_code = "Age Code" +sex_code = "Sex Code" +DERIVED_COLUMN_SPECIFICATIONS: dict[str, (str, list[DerivedColumn])] = { # type: ignore + "DZ21/MS-A09": ( + "Census 2021 Data Zone Code", + [ + DerivedColumn( + hxltag="#population+children+age5_17", + filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"), + output_column_name="children_5_17", + human_readable_name="Children aged 5 to 17", + ), + DerivedColumn( + hxltag="#population+infants+age0_4", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), + output_column_name="infants_0_4", + human_readable_name="Infants aged 0 to 4", + ), + DerivedColumn( + hxltag="#population+children+age0_17", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), + output_column_name="children_0_17", + human_readable_name="Children aged 0 to 17", + ), + DerivedColumn( + hxltag="#population+adults+f", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_code} == 'F'" + ), + output_column_name="adults_f", + human_readable_name="Female adults", + ), + DerivedColumn( + hxltag="#population+adults+m", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_code} == 'M'" + ), + output_column_name="adults_m", + human_readable_name="Male adults", + ), + DerivedColumn( + hxltag="#population+adults", + filter_func=lambda df: df.query(f"{age_code} >= 18"), + output_column_name="adults", + human_readable_name="Adults", + ), + DerivedColumn( + hxltag="#population+ind", + filter_func=lambda df: df, + output_column_name="individuals", + human_readable_name="Total individuals", + ), + ], + ) +} +_needed_dataset_nodes = list(set([key for key in DERIVED_COLUMN_SPECIFICATIONS.keys()])) +needed_dataset_mapping = SpecificPartitionsPartitionMapping(_needed_dataset_nodes) +needed_dataset_partition = StaticPartitionsDefinition(_needed_dataset_nodes) + + +def census_table_metadata( + catalog_row: dict[str, str], source_data_release: SourceDataRelease +) -> MetricMetadata: + return MetricMetadata( + human_readable_name=catalog_row["human_readable_name"], + source_download_url=catalog_row["source_download_url"], + source_archive_file_path=catalog_row["source_archive_file_path"], + source_documentation_url=catalog_row["source_documentation_url"], + source_data_release_id=source_data_release.id, + # TODO - this is a placeholder + parent_metric_id="unknown_at_this_stage", + potential_denominator_ids=None, + parquet_margin_of_error_file=None, + parquet_margin_of_error_column=None, + parquet_column_name=catalog_row["source_column"], + # TODO - this is a placeholder + metric_parquet_path="unknown_at_this_stage", + hxl_tag=catalog_row["hxltag"], + description=catalog_row["description"], + 
source_metric_id=catalog_row["hxltag"], + ) + + +@asset( + ins={ + "census_tables": AssetIn(partition_mapping=needed_dataset_mapping), + "catalog": AssetIn(), + "source_data_release": AssetIn(), + }, + partitions_def=dataset_node_partition, +) +def source_metrics_by_partition( + context, + census_tables: dict[str, pd.DataFrame], catalog: pd.DataFrame, - source_data_release: list[SourceDataRelease], -) -> list[MetricMetadata]: - source_metadata_from_catalog(catalog) + # TODO: generalise to list or dict of SourceDataReleases as there may be + # tables in here that are not at the same release level + source_data_release: SourceDataRelease, + # TODO: return an intermediate type instead of MetricMetadata +) -> tuple[MetricMetadata, pd.DataFrame]: + input_partition_keys = context.asset_partition_keys_for_input( + input_name="census_tables" + ) + output_partition_key = context.partition_key + + if output_partition_key not in input_partition_keys: + skip_reason = f"Skipping as requested partition {output_partition_key} is not part of the 'needed' partitions {input_partition_keys}" + context.log.warning(skip_reason) + raise RuntimeError(skip_reason) + + try: + result_df = census_tables[output_partition_key] + except KeyError: + err_msg = ( + f"Partition key {output_partition_key} not found in census_tables\n" + f"Available keys are {census_tables.keys()}" + ) + raise ValueError(err_msg) from None + catalog_row = catalog[catalog["node"] == output_partition_key].to_dict( + orient="records" + )[0] -@asset -def derived_tables() -> tuple[pd.DataFrame, list[MetricMetadata]]: - return ni.catalog() + # catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] + result_mmd = census_table_metadata(catalog_row, source_data_release) + + return result_mmd, result_df + + +@asset( + partitions_def=dataset_node_partition, + ins={ + "source_metrics_by_partition": AssetIn( + partition_mapping=IdentityPartitionMapping() + ), + }, +) +def derived_metrics_by_partition( + context, + source_metrics_by_partition: tuple[MetricMetadata, pd.DataFrame], +) -> tuple[list[MetricMetadata], pd.DataFrame]: + node = context.partition_key + + source_mmd, source_table = source_metrics_by_partition + source_column = source_mmd.parquet_column_name + assert source_column in source_table.columns + assert len(source_table) > 0 + + try: + geo_id_col_name, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[node] + except KeyError: + skip_reason = ( + f"Skipping as no derived columns are to be created for node {node}" + ) + context.log.warning(skip_reason) + raise RuntimeError(skip_reason) + + # Rename the geoID column to GEO_ID + source_table = source_table.rename(columns={geo_id_col_name: "GEO_ID"}) + + derived_metrics: list[pd.DataFrame] = [] + derived_mmd: list[MetricMetadata] = [] + + parquet_file_name = "".join(c for c in node if c.isalnum()) + ".parquet" + + for metric_spec in metric_specs: + new_table = ( + source_table.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + + joined_metrics = reduce( + 
lambda left, right: left.merge( + right, on="GEO_ID", how="inner", validate="one_to_one" + ), + derived_metrics, + ) + + context.add_output_metadata( + metadata={ + "metadata_preview": MetadataValue.md( + metadata_to_dataframe(derived_mmd).head().to_markdown() + ), + "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + "metrics_preview": MetadataValue.md(joined_metrics.head().to_markdown()), + }, + ) + + return derived_mmd, joined_metrics From eb883babdf0caccbd449ad1e2624e3b9aa0f4cd1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 20 May 2024 11:18:29 +0100 Subject: [PATCH 04/39] Complete and revise DAG for NI, metadata for source table --- python/popgetter/assets/ni/__init__.py | 185 ++++++++++--------------- 1 file changed, 75 insertions(+), 110 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 651fe7e..512442f 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -12,12 +12,8 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetIn, DynamicPartitionsDefinition, - IdentityPartitionMapping, MetadataValue, - SpecificPartitionsPartitionMapping, - StaticPartitionsDefinition, asset, ) from icecream import ic @@ -98,24 +94,24 @@ def get_nodes_and_links() -> dict[str, dict[str, str]]: for url in urls: soup = BeautifulSoup(requests.get(url).content, features="lxml") nodes[url] = { - "table_url": list( - set( + "table_url": next( + iter( [ "".join([SCHEME_AND_HOST, link.get("href")]) for link in soup.find_all("a") if "table.csv?" in link.get("href") ] ) - )[0], - "metadata_url": list( - set( + ), + "metadata_url": next( + iter( [ "".join([SCHEME_AND_HOST, link.get("href")]) for link in soup.find_all("a") if "table.csv-metadata" in link.get("href") ] ) - )[0], + ), } return nodes @@ -211,7 +207,7 @@ def add_resolution(s: str, geo_level: str) -> str: return catalog_df def census_tables( - self, context, catalog: pd.DataFrame, partition: str + self, _context, catalog: pd.DataFrame, partition: str ) -> pd.DataFrame: ic(partition) ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) @@ -321,10 +317,10 @@ def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataF @asset() -def source_data_release( - context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] -) -> SourceDataRelease: - source_data_releases = [] +def source_data_releases( + _context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] +) -> dict[str, SourceDataRelease]: + source_data_releases = {} for geo_metadata, _, _ in geometry: # TODO: update with dates from config source_data_release: SourceDataRelease = SourceDataRelease( @@ -340,28 +336,8 @@ def source_data_release( description="TBC", geometry_metadata_id=geo_metadata.id, ) - source_data_releases.append(source_data_release) - # TODO: update for multiple source data releases - return source_data_releases[0] - - -# @asset(partitions_def=dataset_node_partition) -# def source_mmd( -# context, -# catalog: pd.DataFrame, -# source_data_release: SourceDataRelease, -# ) -> list[MetricMetadata]: -# source_metadata_from_catalog(catalog) - - -# TODO: check if this is a simpler approach? 
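The `DerivedColumn` specifications that follow pair a pandas query with an output column, and `derived_metrics` later applies each filter and aggregates per `GEO_ID`. Note the backticks in `age_code` and `sex_label` below: `DataFrame.query` needs them to reference column names containing spaces. A toy, self-contained run of that filter-then-aggregate pattern, with invented data:

```python
# Toy run of the DerivedColumn pattern: filter the raw table with
# DataFrame.query (backticks quote column names containing spaces), then
# sum counts per GEO_ID. All values here are invented.
import pandas as pd

raw = pd.DataFrame(
    {
        "GEO_ID": ["N001", "N001", "N002", "N002"],
        "Age Code": [3, 40, 10, 70],
        "Sex Label": ["Female", "Female", "Male", "Female"],
        "Count": [5, 7, 11, 13],
    }
)

adults_f = (
    raw.query("`Age Code` >= 18 and `Sex Label` == 'Female'")
    .groupby("GEO_ID", as_index=True)
    .sum(numeric_only=True)
    .rename(columns={"Count": "adults_f"})
    .filter(items=["adults_f"])
)
print(adults_f)  # adults_f per GEO_ID: N001 -> 7, N002 -> 13
```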
-# @asset(partitions_def=dataset_node_partition) -# def source_tables( -# context: AssetExecutionContext, census_tables: pd.DataFrame -# ) -> pd.DataFrame: -# if context.partition_key not in DERIVED_COLUMN_SPECIFICATIONS.keys(): -# raise ValueError(f"Specified partition '{context.partition_key}' not handled") -# return census_tables + source_data_releases[geo_metadata.level] = source_data_release + return source_data_releases @dataclass @@ -372,13 +348,28 @@ class DerivedColumn: human_readable_name: str +@dataclass +class SourceTable: + hxltag: str + geo_level: str + geo_column: str + source_column: str + + # The keys of this dict are the nodes (i.e. partition keys). The values are a # list of all columns of data derived from this node. -age_code = "Age Code" -sex_code = "Sex Code" -DERIVED_COLUMN_SPECIFICATIONS: dict[str, (str, list[DerivedColumn])] = { # type: ignore +age_code = "`Age Code`" +sex_label = "`Sex Label`" + +# Config for each partition to be derived +DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { "DZ21/MS-A09": ( - "Census 2021 Data Zone Code", + SourceTable( + hxltag="#population+dz21+2021", + geo_level="DZ21", + geo_column="Census 2021 Data Zone Code", + source_column="Count", + ), [ DerivedColumn( hxltag="#population+children+age5_17", @@ -401,7 +392,7 @@ class DerivedColumn: DerivedColumn( hxltag="#population+adults+f", filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_code} == 'F'" + f"{age_code} >= 18 and {sex_label} == 'Female'" ), output_column_name="adults_f", human_readable_name="Female adults", @@ -409,7 +400,7 @@ class DerivedColumn: DerivedColumn( hxltag="#population+adults+m", filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_code} == 'M'" + f"{age_code} >= 18 and {sex_label} == 'Male'" ), output_column_name="adults_m", human_readable_name="Male adults", @@ -429,116 +420,90 @@ class DerivedColumn: ], ) } -_needed_dataset_nodes = list(set([key for key in DERIVED_COLUMN_SPECIFICATIONS.keys()])) -needed_dataset_mapping = SpecificPartitionsPartitionMapping(_needed_dataset_nodes) -needed_dataset_partition = StaticPartitionsDefinition(_needed_dataset_nodes) def census_table_metadata( - catalog_row: dict[str, str], source_data_release: SourceDataRelease + catalog_row: dict[str, str], + source_table: SourceTable, + source_data_releases: dict[str, SourceDataRelease], ) -> MetricMetadata: return MetricMetadata( human_readable_name=catalog_row["human_readable_name"], source_download_url=catalog_row["source_download_url"], source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=source_data_release.id, + source_data_release_id=source_data_releases[source_table.geo_level].id, # TODO - this is a placeholder parent_metric_id="unknown_at_this_stage", potential_denominator_ids=None, parquet_margin_of_error_file=None, parquet_margin_of_error_column=None, - parquet_column_name=catalog_row["source_column"], + # parquet_column_name=catalog_row["source_column"], + parquet_column_name=source_table.source_column, # TODO - this is a placeholder metric_parquet_path="unknown_at_this_stage", - hxl_tag=catalog_row["hxltag"], + hxl_tag=source_table.hxltag, description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], + source_metric_id=source_table.hxltag, ) -@asset( - ins={ - "census_tables": AssetIn(partition_mapping=needed_dataset_mapping), - "catalog": AssetIn(), - "source_data_release": AssetIn(), - }, - 
partitions_def=dataset_node_partition, -) -def source_metrics_by_partition( +@asset(partitions_def=dataset_node_partition) +def source_metric_metadata( context, - census_tables: dict[str, pd.DataFrame], catalog: pd.DataFrame, - # TODO: generalise to list or dict of SourceDataReleases as there may be - # tables in here that are not at the same release level - source_data_release: SourceDataRelease, - # TODO: return an intermediate type instead of MetricMetadata -) -> tuple[MetricMetadata, pd.DataFrame]: - input_partition_keys = context.asset_partition_keys_for_input( - input_name="census_tables" - ) - output_partition_key = context.partition_key - - if output_partition_key not in input_partition_keys: - skip_reason = f"Skipping as requested partition {output_partition_key} is not part of the 'needed' partitions {input_partition_keys}" + source_data_releases: dict[str, SourceDataRelease], +) -> MetricMetadata: + partition_key = context.partition_key + if partition_key not in DERIVED_COLUMN_SPECIFICATIONS: + skip_reason = ( + f"Skipping as requested partition {partition_key} is configured " + f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}" + ) context.log.warning(skip_reason) raise RuntimeError(skip_reason) - try: - result_df = census_tables[output_partition_key] - except KeyError: - err_msg = ( - f"Partition key {output_partition_key} not found in census_tables\n" - f"Available keys are {census_tables.keys()}" - ) - raise ValueError(err_msg) from None - - catalog_row = catalog[catalog["node"] == output_partition_key].to_dict( + catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict( orient="records" )[0] - # catalog_row = catalog[catalog["partition_key"].eq(parition_key)].iloc[0, :] - result_mmd = census_table_metadata(catalog_row, source_data_release) - - return result_mmd, result_df + return census_table_metadata( + catalog_row, + DERIVED_COLUMN_SPECIFICATIONS[partition_key][0], + source_data_releases, + ) -@asset( - partitions_def=dataset_node_partition, - ins={ - "source_metrics_by_partition": AssetIn( - partition_mapping=IdentityPartitionMapping() - ), - }, -) -def derived_metrics_by_partition( +@asset(partitions_def=dataset_node_partition) +def derived_metrics( context, - source_metrics_by_partition: tuple[MetricMetadata, pd.DataFrame], + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, ) -> tuple[list[MetricMetadata], pd.DataFrame]: - node = context.partition_key - - source_mmd, source_table = source_metrics_by_partition + partition_key = context.partition_key + source_table = census_tables + source_mmd = source_metric_metadata source_column = source_mmd.parquet_column_name assert source_column in source_table.columns assert len(source_table) > 0 try: - geo_id_col_name, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[node] - except KeyError: + source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[ + partition_key + ] + except KeyError as err: skip_reason = ( - f"Skipping as no derived columns are to be created for node {node}" + f"Skipping as no derived columns are to be created for node {partition_key}" ) context.log.warning(skip_reason) - raise RuntimeError(skip_reason) - - # Rename the geoID column to GEO_ID - source_table = source_table.rename(columns={geo_id_col_name: "GEO_ID"}) + raise RuntimeError(skip_reason) from err + source_table = source_table.rename( + columns={source_table_metadata.geo_column: "GEO_ID"} + ) derived_metrics: list[pd.DataFrame] = [] derived_mmd: list[MetricMetadata] = [] - - parquet_file_name 
= "".join(c for c in node if c.isalnum()) + ".parquet" - + parquet_file_name = "".join(c for c in partition_key if c.isalnum()) + ".parquet" for metric_spec in metric_specs: new_table = ( source_table.pipe(metric_spec.filter_func) From 6374fb4ac22da9f5093a4c564ec910cad421b7d7 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Mon, 20 May 2024 11:33:47 +0100 Subject: [PATCH 05/39] Add remaining required assets, update cloud outputs --- python/popgetter/assets/ni/__init__.py | 44 ++++++++++++++++++---- python/popgetter/cloud_outputs/__init__.py | 4 ++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 512442f..31e6474 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -220,6 +220,12 @@ def source_table(self) -> pd.DataFrame: return pd.DataFrame() +key_prefix = "uk-ni" + +ni = NorthernIreland() + +dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + country: CountryMetadata = CountryMetadata( name_short_en="Northern Ireland", name_official="Northern Ireland", @@ -236,11 +242,14 @@ def source_table(self) -> pd.DataFrame: ) -key_prefix = "uk-ni" +@asset +def country_metadata() -> CountryMetadata: + return country -ni = NorthernIreland() -dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) +@asset +def data_publisher() -> DataPublisher: + return publisher @asset @@ -356,12 +365,9 @@ class SourceTable: source_column: str -# The keys of this dict are the nodes (i.e. partition keys). The values are a -# list of all columns of data derived from this node. +# Config for each partition to be derived age_code = "`Age Code`" sex_label = "`Sex Label`" - -# Config for each partition to be derived DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { "DZ21/MS-A09": ( SourceTable( @@ -540,3 +546,27 @@ def derived_metrics( ) return derived_mmd, joined_metrics + + +@asset(partitions_def=dataset_node_partition) +def metrics( + context, derived_metrics: tuple[list[MetricMetadata], pd.DataFrame] +) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + """ + This asset exists solely to aggregate all the derived tables into one + single unpartitioned asset, which the downstream publishing tasks can use. + + Right now it is a bit boring because it only relies on one partition, but + it could be extended when we have more data products. 
+ """ + mmds, table = derived_metrics + filepath = mmds[0].metric_parquet_path + + context.add_output_metadata( + metadata={ + "num_metrics": len(mmds), + "num_parquets": 1, + }, + ) + + return [(filepath, mmds, table)] diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index a480713..ef2ad42 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -7,6 +7,9 @@ "be/country_metadata", "be/data_publisher", "be/source_data_releases", + "uk-ni/country_metadata", + "uk-ni/data_publisher", + "uk-ni/source_data_releases", ], io_manager_key="metadata_io_manager", prefix="metadata", @@ -32,6 +35,7 @@ metrics_factory = CloudAssetSensor( asset_names_to_monitor=[ "be/metrics", + "uk-ni/metrics", ], io_manager_key="metrics_io_manager", prefix="metrics", From 5ad8d470af0275accfb0438723cc2a140013e497 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 22 May 2024 11:45:40 +0100 Subject: [PATCH 06/39] Fix metrics asset --- python/popgetter/assets/ni/__init__.py | 50 +++++++++++++++++++------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 31e6474..78f2103 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -12,8 +12,10 @@ import requests from bs4 import BeautifulSoup from dagster import ( + AssetIn, DynamicPartitionsDefinition, MetadataValue, + SpecificPartitionsPartitionMapping, asset, ) from icecream import ic @@ -237,7 +239,11 @@ def source_table(self) -> pd.DataFrame: publisher: DataPublisher = DataPublisher( name="NISRA", url="https://www.nisra.gov.uk/", - description="The Northern Ireland Statistics and Research Agency (NISRA), which incorporates the General Register Office (GRO), is an executive agency within the Department of Finance (NI) and was established on 1 April 1996.", + description=( + "The Northern Ireland Statistics and Research Agency (NISRA), which " + "incorporates the General Register Office (GRO), is an executive agency " + "within the Department of Finance (NI) and was established on 1 April 1996." 
+ ), countries_of_interest=[country.id], ) @@ -258,7 +264,6 @@ def catalog(context) -> pd.DataFrame: @asset(partitions_def=dataset_node_partition) -# def census_tables(context: AssetExecutionContext, catalog) -> pd.DataFrame: def census_tables(context, catalog) -> pd.DataFrame: census_table = ni.census_tables( context, catalog, context.asset_partition_key_for_output() @@ -268,7 +273,6 @@ def census_tables(context, catalog) -> pd.DataFrame: @asset -# @asset(io_manager_key="geometry_io_manager", key_prefix=key_prefix) def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: # TODO: This is almost identical to Belgium so can probably be refactored to common # function with config of releases and languages @@ -548,9 +552,20 @@ def derived_metrics( return derived_mmd, joined_metrics -@asset(partitions_def=dataset_node_partition) +@asset( + ins={ + "derived_metrics": AssetIn( + partition_mapping=SpecificPartitionsPartitionMapping( + list(DERIVED_COLUMN_SPECIFICATIONS.keys()) + ), + ), + }, +) def metrics( - context, derived_metrics: tuple[list[MetricMetadata], pd.DataFrame] + # Note dagster does not seem to allow a union type for `derived_metrics` for + # the cases of one or many partitions + context, + derived_metrics, ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: """ This asset exists solely to aggregate all the derived tables into one @@ -559,14 +574,25 @@ def metrics( Right now it is a bit boring because it only relies on one partition, but it could be extended when we have more data products. """ - mmds, table = derived_metrics - filepath = mmds[0].metric_parquet_path - + if len(DERIVED_COLUMN_SPECIFICATIONS) == 1: + # Make into same type for the case of multiple partitions + derived_metrics_dict: dict[str, tuple[list[MetricMetadata], pd.DataFrame]] = { + next(iter(DERIVED_COLUMN_SPECIFICATIONS.keys())): derived_metrics + } + else: + derived_metrics_dict: dict[ + str, tuple[list[MetricMetadata], pd.DataFrame] + ] = derived_metrics + + # Combine outputs across partitions + outputs = [ + (mmds[0].metric_parquet_path, mmds, table) + for (mmds, table) in derived_metrics_dict.values() + ] context.add_output_metadata( metadata={ - "num_metrics": len(mmds), - "num_parquets": 1, + "num_metrics": sum(len(output[1]) for output in outputs), + "num_parquets": len(outputs), }, ) - - return [(filepath, mmds, table)] + return outputs From 145166dc2f17639b76d838c2305fb9dbb353530b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 22 May 2024 12:10:35 +0100 Subject: [PATCH 07/39] Add super data zones --- python/popgetter/assets/ni/__init__.py | 121 ++++++++++++++----------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 78f2103..507f6d5 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -35,7 +35,6 @@ REQUIRED_TABLES = [ "MS-A09", ] -REQUIRED_TABLES_REGEX = "|".join(REQUIRED_TABLES) # TODO REQUIRED_RELEASES = [""] @@ -67,7 +66,14 @@ class NIGeometryLevel: geo_id_column="DZ2021_cd", name_columns={"en": "DZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", - ) + ), + "SDZ21": NIGeometryLevel( + level="SDZ21", + hxl_tag="TBD", + geo_id_column="SDZ2021_cd", + name_columns={"en": "SDZ2021_nm"}, + url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-sdz2021-esri-shapefile.zip", + ), } # Full list of geographies, see metadata: 
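For reference, a minimal sketch of how one of these NIGeometryLevel entries is consumed downstream by the geometry asset (the standalone framing here is illustrative; the asset code above remains authoritative):

    import geopandas as gpd

    level = NI_GEO_LEVELS["SDZ21"]
    # geopandas reads the zipped ESRI shapefile straight from the NISRA URL
    gdf = gpd.read_file(level.url)
    # one row per Super Data Zone, with the id column standardised to "GEO_ID"
    gdf = gdf.dissolve(by=level.geo_id_column).reset_index()
    gdf = gdf.rename(columns={level.geo_id_column: "GEO_ID"})[["geometry", "GEO_ID"]]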
@@ -222,8 +228,6 @@ def source_table(self) -> pd.DataFrame: return pd.DataFrame() -key_prefix = "uk-ni" - ni = NorthernIreland() dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) @@ -276,8 +280,6 @@ def census_tables(context, catalog) -> pd.DataFrame: def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: # TODO: This is almost identical to Belgium so can probably be refactored to common # function with config of releases and languages - level_details = NI_GEO_LEVELS["DZ21"] - geometries_to_return = [] for level_details in NI_GEO_LEVELS.values(): # TODO: get correct values @@ -372,6 +374,53 @@ class SourceTable: # Config for each partition to be derived age_code = "`Age Code`" sex_label = "`Sex Label`" +DERIVED_COLUMNS = [ + DerivedColumn( + hxltag="#population+children+age5_17", + filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"), + output_column_name="children_5_17", + human_readable_name="Children aged 5 to 17", + ), + DerivedColumn( + hxltag="#population+infants+age0_4", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), + output_column_name="infants_0_4", + human_readable_name="Infants aged 0 to 4", + ), + DerivedColumn( + hxltag="#population+children+age0_17", + filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), + output_column_name="children_0_17", + human_readable_name="Children aged 0 to 17", + ), + DerivedColumn( + hxltag="#population+adults+f", + filter_func=lambda df: df.query( + f"{age_code} >= 18 and {sex_label} == 'Female'" + ), + output_column_name="adults_f", + human_readable_name="Female adults", + ), + DerivedColumn( + hxltag="#population+adults+m", + filter_func=lambda df: df.query(f"{age_code} >= 18 and {sex_label} == 'Male'"), + output_column_name="adults_m", + human_readable_name="Male adults", + ), + DerivedColumn( + hxltag="#population+adults", + filter_func=lambda df: df.query(f"{age_code} >= 18"), + output_column_name="adults", + human_readable_name="Adults", + ), + DerivedColumn( + hxltag="#population+ind", + filter_func=lambda df: df, + output_column_name="individuals", + human_readable_name="Total individuals", + ), +] + DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { "DZ21/MS-A09": ( SourceTable( @@ -380,55 +429,17 @@ class SourceTable: geo_column="Census 2021 Data Zone Code", source_column="Count", ), - [ - DerivedColumn( - hxltag="#population+children+age5_17", - filter_func=lambda df: df.query(f"{age_code} >= 5 and {age_code} < 18"), - output_column_name="children_5_17", - human_readable_name="Children aged 5 to 17", - ), - DerivedColumn( - hxltag="#population+infants+age0_4", - filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 5"), - output_column_name="infants_0_4", - human_readable_name="Infants aged 0 to 4", - ), - DerivedColumn( - hxltag="#population+children+age0_17", - filter_func=lambda df: df.query(f"{age_code} >= 0 and {age_code} < 18"), - output_column_name="children_0_17", - human_readable_name="Children aged 0 to 17", - ), - DerivedColumn( - hxltag="#population+adults+f", - filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_label} == 'Female'" - ), - output_column_name="adults_f", - human_readable_name="Female adults", - ), - DerivedColumn( - hxltag="#population+adults+m", - filter_func=lambda df: df.query( - f"{age_code} >= 18 and {sex_label} == 'Male'" - ), - output_column_name="adults_m", - human_readable_name="Male adults", - ), - DerivedColumn( - 
hxltag="#population+adults", - filter_func=lambda df: df.query(f"{age_code} >= 18"), - output_column_name="adults", - human_readable_name="Adults", - ), - DerivedColumn( - hxltag="#population+ind", - filter_func=lambda df: df, - output_column_name="individuals", - human_readable_name="Total individuals", - ), - ], - ) + DERIVED_COLUMNS, + ), + "SDZ21/MS-A09": ( + SourceTable( + hxltag="#population+sdz21+2021", + geo_level="SDZ21", + geo_column="Census 2021 Super Data Zone Code", + source_column="Count", + ), + DERIVED_COLUMNS, + ), } From 7927b16316e83d1e9d240c72b0bb5902e988f3c8 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 22 May 2024 12:33:48 +0100 Subject: [PATCH 08/39] Refactor, fix source data release --- python/popgetter/assets/ni/__init__.py | 40 +++++++++++--------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 507f6d5..a9ece60 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -31,24 +31,6 @@ ) from popgetter.utils import add_metadata, markdown_from_plot -PARTITION_NAME = "uk-ni_dataset_nodes" -REQUIRED_TABLES = [ - "MS-A09", -] - -# TODO -REQUIRED_RELEASES = [""] -# GENERAL_METHODS_URL = "https://www.scotlandscensus.gov.uk/media/jx2lz54n/scotland-s_census_2011_general_report.pdf" - -# TODO: get these are correct dates -CENSUS_REFERENCE_DATE = date(2021, 3, 21) -CENSUS_COLLECTION_DATE = date(2021, 3, 21) -CENSUS_EXPECT_NEXT_UPDATE = date(2031, 1, 1) -CENSUS_REFERENCE_DATE = date(2021, 3, 21) -# https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: -# 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas -CENSUS_PUBLICATION_DATE = date(2023, 2, 21) - @dataclass class NIGeometryLevel: @@ -59,6 +41,10 @@ class NIGeometryLevel: url: str +# Name for census tables partition +PARTITION_NAME = "uk-ni_dataset_nodes" + +# Geometry levels to include NI_GEO_LEVELS = { "DZ21": NIGeometryLevel( level="DZ21", @@ -76,6 +62,9 @@ class NIGeometryLevel: ), } +# Required tables +REQUIRED_TABLES = ["MS-A09"] + # Full list of geographies, see metadata: # https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE GEO_LEVELS = [ @@ -89,6 +78,10 @@ class NIGeometryLevel: ] +# 2021 census collection date +CENSUS_COLLECTION_DATE = date(2021, 3, 21) + + def get_nodes_and_links() -> dict[str, dict[str, str]]: SCHEME_AND_HOST = "https://build.nisra.gov.uk" urls = [ @@ -337,15 +330,16 @@ def source_data_releases( ) -> dict[str, SourceDataRelease]: source_data_releases = {} for geo_metadata, _, _ in geometry: - # TODO: update with dates from config source_data_release: SourceDataRelease = SourceDataRelease( name="Census 2021", - date_published=date(2014, 2, 27), - reference_period_start=CENSUS_REFERENCE_DATE, - reference_period_end=CENSUS_REFERENCE_DATE, + # https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: + # 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas + date_published=date(2023, 2, 21), + reference_period_start=date(2021, 3, 21), + reference_period_end=date(2021, 3, 21), collection_period_start=CENSUS_COLLECTION_DATE, collection_period_end=CENSUS_COLLECTION_DATE, - expect_next_update=CENSUS_EXPECT_NEXT_UPDATE, + expect_next_update=date(2031, 1, 1), url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus", data_publisher_id=publisher.id, description="TBC", From 017ea207232cfb85a2cb78931da12b6acc53032b Mon Sep 17 00:00:00 2001 From: Sam Greenbury 
Date: Wed, 22 May 2024 12:35:11 +0100
Subject: [PATCH 09/39] Fix asset job definition

---
 python/popgetter/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py
index 24b9be0..9a356ed 100644
--- a/python/popgetter/__init__.py
+++ b/python/popgetter/__init__.py
@@ -84,7 +84,7 @@
 job_ni: UnresolvedAssetJobDefinition = define_asset_job(
     name="job_ni",
-    selection=AssetSelection.groups("uk-ni"),
+    selection=AssetSelection.groups("ni"),
     description="Downloads UK data.",
 )

From e400dce4d40a6dedc294c758652e091492ce4d7e Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 22 May 2024 18:09:35 +0100
Subject: [PATCH 10/39] Initial implementation using a country base class

---
 python/popgetter/assets/common.py      | 134 ++++
 python/popgetter/assets/ni/__init__.py | 617 ++++++++++++-------------
 2 files changed, 438 insertions(+), 313 deletions(-)
 create mode 100644 python/popgetter/assets/common.py

diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py
new file mode 100644
index 0000000..0420013
--- /dev/null
+++ b/python/popgetter/assets/common.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+import geopandas as gpd
+import pandas as pd
+from dagster import DynamicPartitionsDefinition, asset
+
+from popgetter.metadata import (
+    CountryMetadata,
+    DataPublisher,
+    GeometryMetadata,
+    MetricMetadata,
+    SourceDataRelease,
+)
+
+
+class Country(ABC):
+    dataset_node_partition: DynamicPartitionsDefinition
+
+    def create_catalog(self):
+        @asset()
+        def catalog(context):
+            return self._catalog(context)
+
+        return catalog
+
+    @abstractmethod
+    def _catalog(self, context) -> pd.DataFrame:
+        ...
+
+    def create_country_metadata(self):
+        @asset()
+        def country_metadata(context):
+            return self._country_metadata(context)
+
+        return country_metadata
+
+    @abstractmethod
+    def _country_metadata(self, context) -> CountryMetadata:
+        ...
+
+    def create_data_publisher(self):
+        @asset
+        def data_publisher(context, country_metadata: CountryMetadata):
+            return self._data_publisher(context, country_metadata)
+
+        return data_publisher
+
+    @abstractmethod
+    def _data_publisher(
+        self, context, country_metadata: CountryMetadata
+    ) -> DataPublisher:
+        ...
+
+    def create_geometry(self):
+        @asset()
+        def geometry(context):
+            return self._geometry(context)
+
+        return geometry
+
+    @abstractmethod
+    def _geometry(
+        self, context
+    ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]:
+        ...
+
+    def create_source_data_releases(self):
+        @asset()
+        def source_data_releases(
+            context,
+            geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
+            data_publisher: DataPublisher,
+        ):
+            return self._source_data_releases(context, geometry, data_publisher)
+
+        return source_data_releases
+
+    @abstractmethod
+    def _source_data_releases(
+        self,
+        context,
+        geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
+        data_publisher: DataPublisher,
+    ) -> dict[str, SourceDataRelease]:
+        ...
+
+    def create_census_tables(self):
+        @asset(partitions_def=self.dataset_node_partition)
+        def census_tables(context, catalog):
+            return self._census_tables(context, catalog)
+
+        return census_tables
+
+    @abstractmethod
+    def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame:
+        ...
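The create_* factories above close over self, so each country package can materialise its own independently decorated Dagster assets at module level. A minimal sketch of the intended wiring under that assumption (DemoCountry and its stubbed hooks are illustrative only, not part of the patch):

    from dagster import DynamicPartitionsDefinition
    import pandas as pd

    class DemoCountry(Country):
        dataset_node_partition = DynamicPartitionsDefinition(name="demo_nodes")

        def _catalog(self, context) -> pd.DataFrame:
            # toy catalog with a single partition key
            return pd.DataFrame({"partition_key": ["DEMO/TABLE-1"]})

        # remaining hooks stubbed out so the ABC can be instantiated
        def _country_metadata(self, context): ...
        def _data_publisher(self, context, country_metadata): ...
        def _geometry(self, context): ...
        def _source_data_releases(self, context, geometry, data_publisher): ...
        def _census_tables(self, context, catalog): ...
        def _source_metric_metadata(self, context, catalog, source_data_releases): ...
        def _derived_metrics(self, context, census_tables, source_metric_metadata): ...

    demo = DemoCountry()
    catalog = demo.create_catalog()  # an @asset-decorated function bound to this instance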
+ + def create_source_metric_metadata(self): + @asset(partitions_def=self.dataset_node_partition) + def source_metric_metadata(context, catalog, source_data_releases): + return self._source_metric_metadata(context, catalog, source_data_releases) + + return source_metric_metadata + + @abstractmethod + def _source_metric_metadata( + self, + context, + catalog: pd.DataFrame, + source_data_releases: dict[str, SourceDataRelease], + ) -> MetricMetadata: + ... + + def create_derived_metrics(self): + @asset(partitions_def=self.dataset_node_partition) + def derived_metrics( + context, + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, + ) -> tuple[list[MetricMetadata], pd.DataFrame]: + return self._derived_metrics(context, census_tables, source_metric_metadata) + + return derived_metrics + + @abstractmethod + def _derived_metrics( + self, + context, + census_tables: pd.DataFrame, + source_metric_metadata: MetricMetadata, + ) -> tuple[list[MetricMetadata], pd.DataFrame]: + ... diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index a9ece60..60a9039 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -117,238 +117,6 @@ def get_nodes_and_links() -> dict[str, dict[str, str]]: return nodes -class NorthernIreland(Country): - partition_name: str = PARTITION_NAME - geo_levels: list[str] = GEO_LEVELS - required_tables: list[str] = REQUIRED_TABLES - - # def catalog(self, context: AssetExecutionContext) -> pd.DataFrame: - def catalog(self, context) -> pd.DataFrame: - """ - A catalog for NI can be generated in two ways: - 1. With flexible table builder: - https://build.nisra.gov.uk/en/ - with metadata chosen from: - https://build.nisra.gov.uk/en/metadata - 2. 
Or through enumerating the ready-made tables: - https://build.nisra.gov.uk/en/standard - However, some level of - """ - catalog_summary = { - "node": [], - "partition_key": [], - "table_id": [], - "geo_level": [], - "human_readable_name": [], - "description": [], - "metric_parquet_file_url": [], - "parquet_column_name": [], - "parquet_margin_of_error_column": [], - "parquet_margin_of_error_file": [], - "potential_denominator_ids": [], - "parent_metric_id": [], - "source_data_release_id": [], - "source_download_url": [], - "source_format": [], - "source_archive_file_path": [], - "source_documentation_url": [], - "table_schema": [], - } - nodes = get_nodes_and_links() - - def add_resolution(s: str, geo_level: str) -> str: - s_split = s.split("?") - query_params = s_split[1].split("&") - if query_params[0].startswith("d="): - query_params = "&".join( - [query_params[0], f"v={geo_level}", *query_params[2:]] - ) - else: - query_params = "&".join([f"v={geo_level}", *query_params[1:]]) - out_url = "?".join([s_split[0], query_params]) - ic(out_url) - return out_url - - for node_url, node_items in nodes.items(): - for geo_level in self.geo_levels: - metadata = requests.get(node_items["metadata_url"]).json() - table_id = metadata["dc:title"].split(":")[0] - # Skip if not required - if table_id not in self.required_tables: - continue - - catalog_summary["node"].append(node_url) - catalog_summary["table_id"].append(table_id) - catalog_summary["geo_level"].append(geo_level) - catalog_summary["partition_key"].append(f"{geo_level}/{table_id}") - catalog_summary["human_readable_name"].append(metadata["dc:title"]) - catalog_summary["description"].append(metadata["dc:description"]) - catalog_summary["metric_parquet_file_url"].append(None) - catalog_summary["parquet_column_name"].append(None) - catalog_summary["parquet_margin_of_error_column"].append(None) - catalog_summary["parquet_margin_of_error_file"].append(None) - catalog_summary["potential_denominator_ids"].append(None) - catalog_summary["parent_metric_id"].append(None) - catalog_summary["source_data_release_id"].append(None) - catalog_summary["source_download_url"].append( - add_resolution(metadata["url"], geo_level) - ) - catalog_summary["source_format"].append(None) - catalog_summary["source_archive_file_path"].append(None) - catalog_summary["source_documentation_url"].append(node_url) - catalog_summary["table_schema"].append(metadata["tableSchema"]) - - catalog_df = pd.DataFrame.from_records(catalog_summary) - context.instance.add_dynamic_partitions( - partitions_def_name=self.partition_name, - partition_keys=catalog_df["partition_key"].to_list(), - ) - - add_metadata(context, catalog_df, "Catalog") - return catalog_df - - def census_tables( - self, _context, catalog: pd.DataFrame, partition: str - ) -> pd.DataFrame: - ic(partition) - ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) - url = catalog.loc[ - catalog["partition_key"].eq(partition), "source_download_url" - ].iloc[0] - return pd.read_csv(io.BytesIO(requests.get(url).content), encoding="utf8") - - def source_table(self) -> pd.DataFrame: - return pd.DataFrame() - - -ni = NorthernIreland() - -dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) - -country: CountryMetadata = CountryMetadata( - name_short_en="Northern Ireland", - name_official="Northern Ireland", - iso3="GBR", - iso2="GB", - iso3166_2="GB-NIR", -) - -publisher: DataPublisher = DataPublisher( - name="NISRA", - url="https://www.nisra.gov.uk/", - description=( - "The Northern 
Ireland Statistics and Research Agency (NISRA), which " - "incorporates the General Register Office (GRO), is an executive agency " - "within the Department of Finance (NI) and was established on 1 April 1996." - ), - countries_of_interest=[country.id], -) - - -@asset -def country_metadata() -> CountryMetadata: - return country - - -@asset -def data_publisher() -> DataPublisher: - return publisher - - -@asset -def catalog(context) -> pd.DataFrame: - return ni.catalog(context) - - -@asset(partitions_def=dataset_node_partition) -def census_tables(context, catalog) -> pd.DataFrame: - census_table = ni.census_tables( - context, catalog, context.asset_partition_key_for_output() - ) - add_metadata(context, census_table, title=context.asset_partition_key_for_output()) - return census_table - - -@asset -def geometry(context) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: - # TODO: This is almost identical to Belgium so can probably be refactored to common - # function with config of releases and languages - geometries_to_return = [] - for level_details in NI_GEO_LEVELS.values(): - # TODO: get correct values - geometry_metadata = GeometryMetadata( - validity_period_start=CENSUS_COLLECTION_DATE, - validity_period_end=CENSUS_COLLECTION_DATE, - level=level_details.level, - hxl_tag=level_details.hxl_tag, - ) - region_geometries_raw = ( - gpd.read_file(level_details.url) - .dissolve(by=level_details.geo_id_column) - .reset_index() - ) - context.log.debug(ic(region_geometries_raw.head())) - region_geometries = region_geometries_raw.rename( - columns={level_details.geo_id_column: "GEO_ID"} - ).loc[:, ["geometry", "GEO_ID"]] - region_names = ( - region_geometries_raw.rename( - columns={ - level_details.geo_id_column: "GEO_ID", - level_details.name_columns["en"]: "en", - } - ) - .loc[:, ["GEO_ID", "en"]] - .drop_duplicates() - ) - geometries_to_return.append( - (geometry_metadata, region_geometries, region_names) - ) - - # Add output metadata - first_metadata, first_gdf, first_names = geometries_to_return[0] - first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") - ax = first_joined_gdf.plot(column="en", legend=False) - ax.set_title(f"NI 2023 {first_metadata.level}") - md_plot = markdown_from_plot(plt) - context.add_output_metadata( - metadata={ - "all_geom_levels": MetadataValue.md( - ",".join([metadata.level for metadata, _, _ in geometries_to_return]) - ), - "first_geometry_plot": MetadataValue.md(md_plot), - "first_names_preview": MetadataValue.md(first_names.head().to_markdown()), - } - ) - - return geometries_to_return - - -@asset() -def source_data_releases( - _context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] -) -> dict[str, SourceDataRelease]: - source_data_releases = {} - for geo_metadata, _, _ in geometry: - source_data_release: SourceDataRelease = SourceDataRelease( - name="Census 2021", - # https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus: - # 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas - date_published=date(2023, 2, 21), - reference_period_start=date(2021, 3, 21), - reference_period_end=date(2021, 3, 21), - collection_period_start=CENSUS_COLLECTION_DATE, - collection_period_end=CENSUS_COLLECTION_DATE, - expect_next_update=date(2031, 1, 1), - url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus", - data_publisher_id=publisher.id, - description="TBC", - geometry_metadata_id=geo_metadata.id, - ) - source_data_releases[geo_metadata.level] = source_data_release - return 
source_data_releases - - @dataclass class DerivedColumn: hxltag: str @@ -463,98 +231,321 @@ def census_table_metadata( ) -@asset(partitions_def=dataset_node_partition) -def source_metric_metadata( - context, - catalog: pd.DataFrame, - source_data_releases: dict[str, SourceDataRelease], -) -> MetricMetadata: - partition_key = context.partition_key - if partition_key not in DERIVED_COLUMN_SPECIFICATIONS: - skip_reason = ( - f"Skipping as requested partition {partition_key} is configured " - f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}" +class NorthernIreland(Country): + partition_name: str = PARTITION_NAME + geo_levels: list[str] = GEO_LEVELS + required_tables: list[str] = REQUIRED_TABLES + dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + + def _country_metadata(self, _context) -> CountryMetadata: + return CountryMetadata( + name_short_en="Northern Ireland", + name_official="Northern Ireland", + iso3="GBR", + iso2="GB", + iso3166_2="GB-NIR", ) - context.log.warning(skip_reason) - raise RuntimeError(skip_reason) - catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict( - orient="records" - )[0] + def _data_publisher( + self, _context, country_metadata: CountryMetadata + ) -> DataPublisher: + return DataPublisher( + name="NISRA", + url="https://www.nisra.gov.uk/", + description=( + "The Northern Ireland Statistics and Research Agency (NISRA), which " + "incorporates the General Register Office (GRO), is an executive agency " + "within the Department of Finance (NI) and was established on 1 April 1996." + ), + countries_of_interest=[country_metadata.id], + ) - return census_table_metadata( - catalog_row, - DERIVED_COLUMN_SPECIFICATIONS[partition_key][0], - source_data_releases, - ) + def _catalog(self, context) -> pd.DataFrame: + """ + A catalog for NI can be generated in two ways: + 1. With flexible table builder: + https://build.nisra.gov.uk/en/ + with metadata chosen from: + https://build.nisra.gov.uk/en/metadata + 2. 
Or through enumerating the ready-made tables: + https://build.nisra.gov.uk/en/standard + However, some level of + """ + catalog_summary = { + "node": [], + "partition_key": [], + "table_id": [], + "geo_level": [], + "human_readable_name": [], + "description": [], + "metric_parquet_file_url": [], + "parquet_column_name": [], + "parquet_margin_of_error_column": [], + "parquet_margin_of_error_file": [], + "potential_denominator_ids": [], + "parent_metric_id": [], + "source_data_release_id": [], + "source_download_url": [], + "source_format": [], + "source_archive_file_path": [], + "source_documentation_url": [], + "table_schema": [], + } + nodes = get_nodes_and_links() + def add_resolution(s: str, geo_level: str) -> str: + s_split = s.split("?") + query_params = s_split[1].split("&") + if query_params[0].startswith("d="): + query_params = "&".join( + [query_params[0], f"v={geo_level}", *query_params[2:]] + ) + else: + query_params = "&".join([f"v={geo_level}", *query_params[1:]]) + out_url = "?".join([s_split[0], query_params]) + ic(out_url) + return out_url -@asset(partitions_def=dataset_node_partition) -def derived_metrics( - context, - census_tables: pd.DataFrame, - source_metric_metadata: MetricMetadata, -) -> tuple[list[MetricMetadata], pd.DataFrame]: - partition_key = context.partition_key - source_table = census_tables - source_mmd = source_metric_metadata - source_column = source_mmd.parquet_column_name - assert source_column in source_table.columns - assert len(source_table) > 0 - - try: - source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[ - partition_key - ] - except KeyError as err: - skip_reason = ( - f"Skipping as no derived columns are to be created for node {partition_key}" + for node_url, node_items in nodes.items(): + for geo_level in self.geo_levels: + metadata = requests.get(node_items["metadata_url"]).json() + table_id = metadata["dc:title"].split(":")[0] + # Skip if not required + if table_id not in self.required_tables: + continue + + catalog_summary["node"].append(node_url) + catalog_summary["table_id"].append(table_id) + catalog_summary["geo_level"].append(geo_level) + catalog_summary["partition_key"].append(f"{geo_level}/{table_id}") + catalog_summary["human_readable_name"].append(metadata["dc:title"]) + catalog_summary["description"].append(metadata["dc:description"]) + catalog_summary["metric_parquet_file_url"].append(None) + catalog_summary["parquet_column_name"].append(None) + catalog_summary["parquet_margin_of_error_column"].append(None) + catalog_summary["parquet_margin_of_error_file"].append(None) + catalog_summary["potential_denominator_ids"].append(None) + catalog_summary["parent_metric_id"].append(None) + catalog_summary["source_data_release_id"].append(None) + catalog_summary["source_download_url"].append( + add_resolution(metadata["url"], geo_level) + ) + catalog_summary["source_format"].append(None) + catalog_summary["source_archive_file_path"].append(None) + catalog_summary["source_documentation_url"].append(node_url) + catalog_summary["table_schema"].append(metadata["tableSchema"]) + + catalog_df = pd.DataFrame.from_records(catalog_summary) + context.instance.add_dynamic_partitions( + partitions_def_name=self.partition_name, + partition_keys=catalog_df["partition_key"].to_list(), ) - context.log.warning(skip_reason) - raise RuntimeError(skip_reason) from err - source_table = source_table.rename( - columns={source_table_metadata.geo_column: "GEO_ID"} - ) - derived_metrics: list[pd.DataFrame] = [] - derived_mmd: list[MetricMetadata] = [] 
- parquet_file_name = "".join(c for c in partition_key if c.isalnum()) + ".parquet" - for metric_spec in metric_specs: - new_table = ( - source_table.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) - .sum() - .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) + add_metadata(context, catalog_df, "Catalog") + return catalog_df + + def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame: + partition = context.asset_partition_key_for_output() + ic(partition) + ic(catalog.loc[catalog["partition_key"].eq(partition), "source_download_url"]) + url = catalog.loc[ + catalog["partition_key"].eq(partition), "source_download_url" + ].iloc[0] + census_table = pd.read_csv( + io.BytesIO(requests.get(url).content), encoding="utf8" ) - derived_metrics.append(new_table) - - new_mmd = source_mmd.copy() - new_mmd.parent_metric_id = source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - new_mmd.hxl_tag = metric_spec.hxltag - new_mmd.parquet_column_name = metric_spec.output_column_name - new_mmd.human_readable_name = metric_spec.human_readable_name - derived_mmd.append(new_mmd) - - joined_metrics = reduce( - lambda left, right: left.merge( - right, on="GEO_ID", how="inner", validate="one_to_one" - ), - derived_metrics, - ) + add_metadata( + context, census_table, title=context.asset_partition_key_for_output() + ) + return census_table + + def _geometry( + self, context + ) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: + # TODO: This is almost identical to Belgium so can probably be refactored to common + # function with config of releases and languages + geometries_to_return = [] + for level_details in NI_GEO_LEVELS.values(): + # TODO: get correct values + geometry_metadata = GeometryMetadata( + validity_period_start=CENSUS_COLLECTION_DATE, + validity_period_end=CENSUS_COLLECTION_DATE, + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + region_geometries_raw = ( + gpd.read_file(level_details.url) + .dissolve(by=level_details.geo_id_column) + .reset_index() + ) + context.log.debug(ic(region_geometries_raw.head())) + region_geometries = region_geometries_raw.rename( + columns={level_details.geo_id_column: "GEO_ID"} + ).loc[:, ["geometry", "GEO_ID"]] + region_names = ( + region_geometries_raw.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["en"]: "en", + } + ) + .loc[:, ["GEO_ID", "en"]] + .drop_duplicates() + ) + geometries_to_return.append( + (geometry_metadata, region_geometries, region_names) + ) - context.add_output_metadata( - metadata={ - "metadata_preview": MetadataValue.md( - metadata_to_dataframe(derived_mmd).head().to_markdown() + # Add output metadata + first_metadata, first_gdf, first_names = geometries_to_return[0] + first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + ax = first_joined_gdf.plot(column="en", legend=False) + ax.set_title(f"NI 2023 {first_metadata.level}") + md_plot = markdown_from_plot(plt) + context.add_output_metadata( + metadata={ + "all_geom_levels": MetadataValue.md( + ",".join( + [metadata.level for metadata, _, _ in geometries_to_return] + ) + ), + "first_geometry_plot": MetadataValue.md(md_plot), + "first_names_preview": MetadataValue.md( + first_names.head().to_markdown() + ), + } + ) + + return geometries_to_return + + def _source_data_releases( + self, _context, geometry, data_publisher + ) -> dict[str, SourceDataRelease]: + source_data_releases = {} + 
for geo_metadata, _, _ in geometry:
+            source_data_release: SourceDataRelease = SourceDataRelease(
+                name="Census 2021",
+                # https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus:
+                # 9.30 am on 21 February 2023 for DZ and SDZ and District Electoral Areas
+                date_published=date(2023, 2, 21),
+                reference_period_start=date(2021, 3, 21),
+                reference_period_end=date(2021, 3, 21),
+                collection_period_start=CENSUS_COLLECTION_DATE,
+                collection_period_end=CENSUS_COLLECTION_DATE,
+                expect_next_update=date(2031, 1, 1),
+                url="https://www.nisra.gov.uk/publications/census-2021-outputs-prospectus",
+                data_publisher_id=data_publisher.id,
+                description="TBC",
+                geometry_metadata_id=geo_metadata.id,
+            )
+            source_data_releases[geo_metadata.level] = source_data_release
+        return source_data_releases
+
+    def _source_metric_metadata(
+        self,
+        context,
+        catalog: pd.DataFrame,
+        source_data_releases: dict[str, SourceDataRelease],
+    ) -> MetricMetadata:
+        partition_key = context.partition_key
+        if partition_key not in DERIVED_COLUMN_SPECIFICATIONS:
+            skip_reason = (
+                f"Skipping as requested partition {partition_key} is not configured "
+                f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}"
+            )
+            context.log.warning(skip_reason)
+            raise RuntimeError(skip_reason)
+
+        catalog_row = catalog[catalog["partition_key"] == partition_key].to_dict(
+            orient="records"
+        )[0]
+
+        return census_table_metadata(
+            catalog_row,
+            DERIVED_COLUMN_SPECIFICATIONS[partition_key][0],
+            source_data_releases,
+        )
+
+    def _derived_metrics(
+        self,
+        context,
+        census_tables: pd.DataFrame,
+        source_metric_metadata: MetricMetadata,
+    ) -> tuple[list[MetricMetadata], pd.DataFrame]:
+        partition_key = context.partition_key
+        source_table = census_tables
+        source_mmd = source_metric_metadata
+        source_column = source_mmd.parquet_column_name
+        assert source_column in source_table.columns
+        assert len(source_table) > 0
+
+        try:
+            source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[
+                partition_key
+            ]
+        except KeyError as err:
+            skip_reason = f"Skipping as no derived columns are to be created for node {partition_key}"
+            context.log.warning(skip_reason)
+            raise RuntimeError(skip_reason) from err
+
+        source_table = source_table.rename(
+            columns={source_table_metadata.geo_column: "GEO_ID"}
+        )
+        derived_metrics: list[pd.DataFrame] = []
+        derived_mmd: list[MetricMetadata] = []
+        parquet_file_name = (
+            "".join(c for c in partition_key if c.isalnum()) + ".parquet"
+        )
+        for metric_spec in metric_specs:
+            new_table = (
+                source_table.pipe(metric_spec.filter_func)
+                .groupby(by="GEO_ID", as_index=True)
+                .sum()
+                .rename(columns={source_column: metric_spec.output_column_name})
+                .filter(items=["GEO_ID", metric_spec.output_column_name])
+            )
+            derived_metrics.append(new_table)
+
+            new_mmd = source_mmd.copy()
+            new_mmd.parent_metric_id = source_mmd.source_metric_id
+            new_mmd.metric_parquet_path = parquet_file_name
+            new_mmd.hxl_tag = metric_spec.hxltag
+            new_mmd.parquet_column_name = metric_spec.output_column_name
+            new_mmd.human_readable_name = metric_spec.human_readable_name
+            derived_mmd.append(new_mmd)
+
+        joined_metrics = reduce(
+            lambda left, right: left.merge(
+                right, on="GEO_ID", how="inner", validate="one_to_one"
             ),
-        derived_metrics,
-    )
+            derived_metrics,
+        )
+
+        context.add_output_metadata(
+            metadata={
+                "metadata_preview": MetadataValue.md(
metadata_to_dataframe(derived_mmd).head().to_markdown()
+                ),
+                "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns",
+                "metrics_preview": MetadataValue.md(
+                    joined_metrics.head().to_markdown()
+                ),
+            },
+        )
 
-    return derived_mmd, joined_metrics
+        return derived_mmd, joined_metrics
+
+
+# Assets
+ni = NorthernIreland()
+country_metadata = ni.create_country_metadata()
+data_publisher = ni.create_data_publisher()
+geometry = ni.create_geometry()
+source_data_releases = ni.create_source_data_releases()
+catalog = ni.create_catalog()
+census_tables = ni.create_census_tables()
+source_metric_metadata = ni.create_source_metric_metadata()
+derived_metrics = ni.create_derived_metrics()
 
 
 @asset(

From 83668b90cb8a1c1164e656118407dd885324acf5 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 23 May 2024 11:32:25 +0100
Subject: [PATCH 11/39] Add README for NI

---
 python/popgetter/assets/ni/README.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 python/popgetter/assets/ni/README.md

diff --git a/python/popgetter/assets/ni/README.md b/python/popgetter/assets/ni/README.md
new file mode 100644
index 0000000..2faa303
--- /dev/null
+++ b/python/popgetter/assets/ni/README.md
@@ -0,0 +1,17 @@
+# Northern Ireland
+
+## Summary
+
+Census 2021 is available from
+[https://build.nisra.gov.uk](https://build.nisra.gov.uk/en/).
+
+The processing pipeline involves the following steps:
+
+- Gets the corresponding geography files and outputs them in standard
+  geometry formats
+- Generates metadata associated with Northern Ireland and its data releases
+- Generates a catalog by identifying all the tables
+  [available](https://build.nisra.gov.uk/en/standard)
+- Reads table metadata and census tables across the different geography
+  levels (currently only Data Zone 2021 and Super Data Zone 2021)
+- Constructs a set of pre-defined derived metrics

From 69f74d99f034466c98b581f1c92918612a3cc6b1 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Thu, 23 May 2024 12:03:57 +0100
Subject: [PATCH 12/39] Add country outputs class

---
 python/popgetter/__init__.py               |  4 ++-
 python/popgetter/assets/common.py          | 16 ++++++++++
 python/popgetter/assets/ni/__init__.py     | 24 ++++++++++-----
 python/popgetter/cloud_outputs/__init__.py | 30 ++++++++++----------
 4 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py
index 9a356ed..2da6302 100644
--- a/python/popgetter/__init__.py
+++ b/python/popgetter/__init__.py
@@ -54,7 +54,9 @@
     *load_assets_from_package_module(assets.us, group_name="us"),
     *load_assets_from_package_module(assets.be, group_name="be"),
     *load_assets_from_package_module(assets.uk, group_name="uk"),
-    *load_assets_from_package_module(assets.ni, group_name="ni", key_prefix="uk-ni"),
+    *load_assets_from_package_module(
+        assets.ni, group_name="ni", key_prefix=assets.ni.ni.key_prefix
+    ),
     *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"),
     *(
         load_assets_from_modules([azure_test], group_name="azure_test")
diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py
index 0420013..daa8ae0 100644
--- a/python/popgetter/assets/common.py
+++ b/python/popgetter/assets/common.py
@@ -15,6 +15,20 @@
 )
 
+class CountryAssetOuputs(ABC):
+    @abstractmethod
+    def get_metadata_asset_keys(self) -> list[str]:
+        ...
+
+    @abstractmethod
+    def get_geo_asset_keys(self) -> list[str]:
+        ...
+
+    @abstractmethod
+    def get_metric_asset_keys(self) -> list[str]:
+        ...
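For concreteness, a sketch of how these asset-key hooks resolve for an implementing country; the uk-ni values below mirror the NorthernIreland implementation later in this patch:

    ni = NorthernIreland()
    ni.get_metadata_asset_keys()
    # ["uk-ni/country_metadata", "uk-ni/data_publisher", "uk-ni/source_data_releases"]
    ni.get_geo_asset_keys()
    # ["uk-ni/geometry"]
    ni.get_metric_asset_keys()
    # ["uk-ni/metrics"]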
+ + class Country(ABC): dataset_node_partition: DynamicPartitionsDefinition @@ -83,6 +97,8 @@ def _source_data_releases( context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], data_publisher: DataPublisher, + # TODO: consider version without inputs so only output type specified + # **kwargs, ) -> dict[str, SourceDataRelease]: ... diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 60a9039..3440c9e 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -20,7 +20,7 @@ ) from icecream import ic -from popgetter.assets.common import Country +from popgetter.assets.common import Country, CountryAssetOuputs from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -41,9 +41,6 @@ class NIGeometryLevel: url: str -# Name for census tables partition -PARTITION_NAME = "uk-ni_dataset_nodes" - # Geometry levels to include NI_GEO_LEVELS = { "DZ21": NIGeometryLevel( @@ -231,11 +228,24 @@ def census_table_metadata( ) -class NorthernIreland(Country): - partition_name: str = PARTITION_NAME +class NorthernIreland(Country, CountryAssetOuputs): + key_prefix: str = "uk-ni" + partition_name: str = "uk-ni_dataset_nodes" geo_levels: list[str] = GEO_LEVELS required_tables: list[str] = REQUIRED_TABLES - dataset_node_partition = DynamicPartitionsDefinition(name=PARTITION_NAME) + dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes") + + def get_metadata_asset_keys(self) -> list[str]: + return [ + f"{self.key_prefix}/{el}" + for el in ["country_metadata", "data_publisher", "source_data_releases"] + ] + + def get_geo_asset_keys(self) -> list[str]: + return [f"{self.key_prefix}/{el}" for el in ["geometry"]] + + def get_metric_asset_keys(self) -> list[str]: + return [f"{self.key_prefix}/{el}" for el in ["metrics"]] def _country_metadata(self, _context) -> CountryMetadata: return CountryMetadata( diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index ef2ad42..d4d7cbc 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -1,16 +1,20 @@ from __future__ import annotations +import popgetter.assets as assets + from .sensor_class import CloudAssetSensor +METADATA_ASSETS = [ + "be/country_metadata", + "be/data_publisher", + "be/source_data_releases", + *assets.ni.ni.get_metadata_asset_keys(), +] +GEOMETRY_ASSETS = ["be/geometry", *assets.ni.ni.get_geo_asset_keys()] +METRIC_ASSETS = ["be/metrics", *assets.ni.ni.get_metric_asset_keys()] + metadata_factory = CloudAssetSensor( - asset_names_to_monitor=[ - "be/country_metadata", - "be/data_publisher", - "be/source_data_releases", - "uk-ni/country_metadata", - "uk-ni/data_publisher", - "uk-ni/source_data_releases", - ], + asset_names_to_monitor=METADATA_ASSETS, io_manager_key="metadata_io_manager", prefix="metadata", interval=20, @@ -20,10 +24,7 @@ metadata_asset = metadata_factory.create_publishing_asset() geometry_factory = CloudAssetSensor( - asset_names_to_monitor=[ - "be/geometry", - "uk-ni/geometry", - ], + asset_names_to_monitor=GEOMETRY_ASSETS, io_manager_key="geometry_io_manager", prefix="geometry", interval=60, @@ -33,10 +34,7 @@ geometry_asset = geometry_factory.create_publishing_asset() metrics_factory = CloudAssetSensor( - asset_names_to_monitor=[ - "be/metrics", - "uk-ni/metrics", - ], + asset_names_to_monitor=METRIC_ASSETS, io_manager_key="metrics_io_manager", prefix="metrics", interval=60, From 
852c8f82c29610779aee2137ca689a1caa65d861 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 23 May 2024 12:15:44 +0100 Subject: [PATCH 13/39] Add dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 00743ca..441b5d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ dependencies = [ "icecream >=2.1.3", # General debugging tool "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names "jcs >=0.2.1", # For generating IDs from class attributes + "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages ] From b489dfb0618d6f4148fa1ef5ea39ce6d1e75fd82 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 23 May 2024 17:49:48 +0100 Subject: [PATCH 14/39] Begin update to cover all census tables --- python/popgetter/assets/ni/__init__.py | 76 ++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 3440c9e..4b2d162 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations import io +import os from collections.abc import Callable from dataclasses import dataclass from datetime import date @@ -37,8 +38,13 @@ class NIGeometryLevel: level: str hxl_tag: str geo_id_column: str + census_table_column: str name_columns: dict[str, str] # keys = language codes, values = column names url: str + lookup_url: str | None + lookup_sheet: str | None + left_on: str | None + right_on: str | None # Geometry levels to include @@ -47,20 +53,42 @@ class NIGeometryLevel: level="DZ21", hxl_tag="TBD", geo_id_column="DZ2021_cd", + census_table_column="Census 2021 Data Zone Code", name_columns={"en": "DZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", + lookup_url=None, + lookup_sheet=None, + left_on=None, + right_on=None, ), "SDZ21": NIGeometryLevel( level="SDZ21", hxl_tag="TBD", geo_id_column="SDZ2021_cd", + census_table_column="Census 2021 Super Data Zone Code", name_columns={"en": "SDZ2021_nm"}, url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-sdz2021-esri-shapefile.zip", + lookup_url=None, + lookup_sheet=None, + left_on=None, + right_on=None, + ), + "LGD14": NIGeometryLevel( + level="LGD14", + hxl_tag="TBD", + geo_id_column="LGD2014_cd", + census_table_column="Local Government District 2014 Code", + name_columns={"en": "LGD2014_name"}, + url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-dz2021-esri-shapefile.zip", + lookup_url="https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/geography-data-zone-and-super-data-zone-lookups.xlsx", + lookup_sheet="DZ2021_lookup", + left_on="DZ2021_cd", + right_on="DZ2021_code", ), } # Required tables -REQUIRED_TABLES = ["MS-A09"] +REQUIRED_TABLES = ["MS-A09"] if os.getenv("ENV") == "dev" else None # Full list of geographies, see metadata: # https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE @@ -232,7 +260,7 @@ class NorthernIreland(Country, CountryAssetOuputs): key_prefix: str = "uk-ni" partition_name: str = "uk-ni_dataset_nodes" geo_levels: list[str] = GEO_LEVELS - required_tables: list[str] = REQUIRED_TABLES + required_tables: list[str] | None = REQUIRED_TABLES dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes") def get_metadata_asset_keys(self) -> list[str]: @@ -321,7 
+349,10 @@ def add_resolution(s: str, geo_level: str) -> str:
             metadata = requests.get(node_items["metadata_url"]).json()
             table_id = metadata["dc:title"].split(":")[0]
             # Skip if not required
-            if table_id not in self.required_tables:
+            if (
+                self.required_tables is not None
+                and table_id not in self.required_tables
+            ):
                 continue
 
             catalog_summary["node"].append(node_url)
@@ -383,11 +414,22 @@ def _geometry(
             level=level_details.level,
             hxl_tag=level_details.hxl_tag,
         )
-        region_geometries_raw = (
-            gpd.read_file(level_details.url)
-            .dissolve(by=level_details.geo_id_column)
-            .reset_index()
-        )
+        region_geometries_raw: gpd.GeoDataFrame = gpd.read_file(level_details.url)
+        if level_details.lookup_url is not None:
+            lookup = pd.read_excel(
+                level_details.lookup_url, sheet_name=level_details.lookup_sheet
+            )
+            region_geometries_raw = region_geometries_raw.merge(
+                lookup,
+                left_on=level_details.left_on,
+                right_on=level_details.right_on,
+                how="outer",
+            )
+
+        region_geometries_raw = region_geometries_raw.dissolve(
+            by=level_details.geo_id_column
+        ).reset_index()
+
         context.log.debug(ic(region_geometries_raw.head()))
         region_geometries = region_geometries_raw.rename(
             columns={level_details.geo_id_column: "GEO_ID"}
@@ -410,7 +452,7 @@ def _geometry(
         first_metadata, first_gdf, first_names = geometries_to_return[0]
         first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID")
         ax = first_joined_gdf.plot(column="en", legend=False)
-        ax.set_title(f"NI 2023 {first_metadata.level}")
+        ax.set_title(f"NI 2021 {first_metadata.level}")
         md_plot = markdown_from_plot(plt)
         context.add_output_metadata(
             metadata={
@@ -458,7 +500,10 @@ def _source_metric_metadata(
         source_data_releases: dict[str, SourceDataRelease],
     ) -> MetricMetadata:
         partition_key = context.partition_key
-        if partition_key not in DERIVED_COLUMN_SPECIFICATIONS:
+        if (
+            self.required_tables is not None
+            and partition_key not in self.required_tables
+        ):
             skip_reason = (
                 f"Skipping as requested partition {partition_key} is not configured "
                 f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}"
             )
@@ -470,9 +515,18 @@ def _source_metric_metadata(
             orient="records"
         )[0]
 
+        geo_level = partition_key.split("/")[0]
+        source_table = SourceTable(
+            # TODO: how to do this programmatically
+            hxltag="TBD",
+            geo_level=geo_level,
+            geo_column=NI_GEO_LEVELS["geo_level"].geo_id_column,
+            source_column="Count",
+        )
+
         return census_table_metadata(
             catalog_row,
-            DERIVED_COLUMN_SPECIFICATIONS[partition_key][0],
+            source_table,
             source_data_releases,
         )

From 0d42d27057b8074d71ee88938296886afd42decf Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Fri, 24 May 2024 09:27:18 +0100
Subject: [PATCH 15/39] Add processing for pivoting arbitrary census tables

---
 python/popgetter/assets/ni/__init__.py | 122 +++++++++++++++++++------
 1 file changed, 94 insertions(+), 28 deletions(-)

diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py
index 4b2d162..75ddb27 100644
--- a/python/popgetter/assets/ni/__init__.py
+++ b/python/popgetter/assets/ni/__init__.py
@@ -520,7 +520,7 @@ def _source_metric_metadata(
             # TODO: how to do this programmatically
             hxltag="TBD",
             geo_level=geo_level,
-            geo_column=NI_GEO_LEVELS["geo_level"].geo_id_column,
+            geo_column=NI_GEO_LEVELS[geo_level].geo_id_column,
             source_column="Count",
         )
 
@@ -536,47 +536,113 @@ def _derived_metrics(
         census_tables: pd.DataFrame,
         source_metric_metadata: MetricMetadata,
     ) -> tuple[list[MetricMetadata], pd.DataFrame]:
+        SEP = "_"
         partition_key = context.partition_key
+        geo_level = 
partition_key.split("/")[0] source_table = census_tables source_mmd = source_metric_metadata source_column = source_mmd.parquet_column_name assert source_column in source_table.columns assert len(source_table) > 0 + geo_id = NI_GEO_LEVELS[geo_level].census_table_column + source_table = source_table.rename(columns={geo_id: "GEO_ID"}).drop( + columns=geo_id.replace("Code", "Label") + ) + + parquet_file_name = ( + "".join(c for c in partition_key if c.isalnum()) + ".parquet" + ) + derived_metrics: list[pd.DataFrame] = [] + derived_mmd: list[MetricMetadata] = [] + try: + # TODO: check whether to drop unused source_table_metadata source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[ partition_key ] - except KeyError as err: + for metric_spec in metric_specs: + new_table = ( + source_table.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + except KeyError: skip_reason = f"Skipping as no derived columns are to be created for node {partition_key}" context.log.warning(skip_reason) - raise RuntimeError(skip_reason) from err - - source_table = source_table.rename( - columns={source_table_metadata.geo_column: "GEO_ID"} - ) - derived_metrics: list[pd.DataFrame] = [] - derived_mmd: list[MetricMetadata] = [] - parquet_file_name = ( - "".join(c for c in partition_key if c.isalnum()) + ".parquet" - ) - for metric_spec in metric_specs: - new_table = ( - source_table.pipe(metric_spec.filter_func) - .groupby(by="GEO_ID", as_index=True) - .sum() - .rename(columns={source_column: metric_spec.output_column_name}) - .filter(items=["GEO_ID", metric_spec.output_column_name]) + # raise RuntimeError(skip_reason) from err + + # Get all other metrics from table as is pivoted + def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: + # Variables are either code or label, only keep the case for given 'end' + cols = ( + [col for col in df.columns if col.endswith(end)] + + ["GEO_ID"] + + ["Count"] + ) + pivot_cols = [col for col in cols if col not in ["GEO_ID", "Count"]] + ic(cols) + ic(pivot_cols) + ic(df.columns) + ic(df.head()) + pivot = df[cols].pivot_table( + index="GEO_ID", + columns=pivot_cols, + values="Count", ) - derived_metrics.append(new_table) - - new_mmd = source_mmd.copy() - new_mmd.parent_metric_id = source_mmd.source_metric_id - new_mmd.metric_parquet_path = parquet_file_name - new_mmd.hxl_tag = metric_spec.hxltag - new_mmd.parquet_column_name = metric_spec.output_column_name - new_mmd.human_readable_name = metric_spec.human_readable_name - derived_mmd.append(new_mmd) + + # FLattent multi-index + if isinstance(pivot.columns, pd.MultiIndex): + pivot.columns = [ + SEP.join(list(map(str, col))).strip() + for col in pivot.columns.to_numpy() + ] + # Ensure columns are string + else: + pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] + out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] + return out_cols, pivot + + # Pivot for codes and labels + for var_type in ["Code", "Label"]: + out_cols, new_table = 
pivot_df(source_table, var_type) + ic(new_table) + for metric_col in new_table.columns: + metric_df = new_table.loc[:, metric_col].to_frame() + ic(metric_df) + derived_metrics.append(metric_df) + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + def gen_hxltag(kv: dict[str, str]) -> str: + out = ["#population"] + for key, value in kv.items(): + out += ["".join(c for c in key if c.isalnum())] + out += ["_"] + out += ["".join(c for c in value if c.isalnum())] + return "+".join(out) + + new_mmd.hxl_tag = gen_hxltag(key_val) + new_mmd.parquet_column_name = metric_col + new_mmd.human_readable_name = "; ".join( + [ + f"Variable: '{key}'; Value: '{value}'" + for key, value in key_val.items() + ] + ) + derived_mmd.append(new_mmd) joined_metrics = reduce( lambda left, right: left.merge( From ad6440622de240cfaca871d63fa113f2ce4ea015 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 09:41:28 +0100 Subject: [PATCH 16/39] Add new metrics asset across all partitions --- python/popgetter/assets/common.py | 20 ++++ python/popgetter/assets/ni/__init__.py | 154 ++++++++++++++++++++----- 2 files changed, 146 insertions(+), 28 deletions(-) diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py index daa8ae0..cadb5f5 100644 --- a/python/popgetter/assets/common.py +++ b/python/popgetter/assets/common.py @@ -148,3 +148,23 @@ def _derived_metrics( source_metric_metadata: MetricMetadata, ) -> tuple[list[MetricMetadata], pd.DataFrame]: ... + + # def create_reshaped_metrics(self): + # @asset(partitions_def=self.dataset_node_partition) + # def reshaped_metrics( + # context, + # census_tables: pd.DataFrame, + # source_metric_metadata: MetricMetadata, + # ) -> tuple[list[MetricMetadata], pd.DataFrame]: + # return self._reshaped_metrics(context, census_tables, source_metric_metadata) + + # return reshaped_metrics + + # @abstractmethod + # def _reshaped_metrics( + # self, + # context, + # census_tables: pd.DataFrame, + # source_metric_metadata: MetricMetadata, + # ) -> tuple[list[MetricMetadata], pd.DataFrame]: + # ... 
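A toy illustration of the pivot_table reshaping introduced in the previous patch and earmarked above for a possible standalone reshaping asset; the data values are invented, and aggfunc="sum" is assumed here because it keeps integer counts (the pandas default "mean" would cast them to float):

    import pandas as pd

    df = pd.DataFrame(
        {
            "GEO_ID": ["N001", "N001", "N002", "N002"],
            "Age Code": [0, 1, 0, 1],
            "Count": [10, 12, 7, 9],
        }
    )
    # one metric column per distinct value of the pivoted variable column
    pivot = df.pivot_table(
        index="GEO_ID", columns=["Age Code"], values="Count", aggfunc="sum"
    )
    # pivot.columns -> [0, 1]; pivot.loc["N001", 1] == 12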
diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 75ddb27..c45ad35 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -13,14 +13,14 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetIn, + AssetDep, DynamicPartitionsDefinition, MetadataValue, - SpecificPartitionsPartitionMapping, asset, ) from icecream import ic +import popgetter from popgetter.assets.common import Country, CountryAssetOuputs from popgetter.metadata import ( CountryMetadata, @@ -580,7 +580,6 @@ def _derived_metrics( except KeyError: skip_reason = f"Skipping as no derived columns are to be created for node {partition_key}" context.log.warning(skip_reason) - # raise RuntimeError(skip_reason) from err # Get all other metrics from table as is pivoted def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: @@ -662,9 +661,117 @@ def gen_hxltag(kv: dict[str, str]) -> str: ), }, ) - return derived_mmd, joined_metrics + # TODO: consider splitting the reshaping of census table data into a separate asset + # def _reshaped_metrics( + # self, + # context, + # census_tables: pd.DataFrame, + # source_metric_metadata: MetricMetadata, + # ) -> tuple[list[MetricMetadata], pd.DataFrame]: + # SEP = "_" + # partition_key = context.partition_key + # geo_level = partition_key.split("/")[0] + # source_table = census_tables + # source_mmd = source_metric_metadata + # source_column = source_mmd.parquet_column_name + # assert source_column in source_table.columns + # assert len(source_table) > 0 + + # geo_id = NI_GEO_LEVELS[geo_level].census_table_column + # source_table = source_table.rename(columns={geo_id: "GEO_ID"}).drop( + # columns=geo_id.replace("Code", "Label") + # ) + + # parquet_file_name = ( + # "".join(c for c in partition_key if c.isalnum()) + ".parquet" + # ) + # derived_metrics: list[pd.DataFrame] = [] + # derived_mmd: list[MetricMetadata] = [] + + # # Get all other metrics from table as is pivoted + # def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: + # # Variables are either code or label, only keep the case for given 'end' + # cols = ( + # [col for col in df.columns if col.endswith(end)] + # + ["GEO_ID"] + # + ["Count"] + # ) + # pivot_cols = [col for col in cols if col not in ["GEO_ID", "Count"]] + # ic(cols) + # ic(pivot_cols) + # ic(df.columns) + # ic(df.head()) + # pivot = df[cols].pivot_table( + # index="GEO_ID", + # columns=pivot_cols, + # values="Count", + # ) + + # # FLattent multi-index + # if isinstance(pivot.columns, pd.MultiIndex): + # pivot.columns = [ + # SEP.join(list(map(str, col))).strip() + # for col in pivot.columns.to_numpy() + # ] + # # Ensure columns are string + # else: + # pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] + # out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] + # return out_cols, pivot + + # # Pivot for codes and labels + # for var_type in ["Code", "Label"]: + # out_cols, new_table = pivot_df(source_table, var_type) + # ic(new_table) + # for metric_col in new_table.columns: + # metric_df = new_table.loc[:, metric_col].to_frame() + # ic(metric_df) + # derived_metrics.append(metric_df) + # new_mmd = source_mmd.copy() + # new_mmd.parent_metric_id = source_mmd.source_metric_id + # new_mmd.metric_parquet_path = parquet_file_name + # key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) + + # def gen_hxltag(kv: dict[str, str]) -> str: + # out = ["#population"] + # for key, value 
in kv.items(): + # out += ["".join(c for c in key if c.isalnum())] + # out += ["_"] + # out += ["".join(c for c in value if c.isalnum())] + # return "+".join(out) + + # new_mmd.hxl_tag = gen_hxltag(key_val) + # new_mmd.parquet_column_name = metric_col + # new_mmd.human_readable_name = "; ".join( + # [ + # f"Variable: '{key}'; Value: '{value}'" + # for key, value in key_val.items() + # ] + # ) + # derived_mmd.append(new_mmd) + + # joined_metrics = reduce( + # lambda left, right: left.merge( + # right, on="GEO_ID", how="inner", validate="one_to_one" + # ), + # derived_metrics, + # ) + + # context.add_output_metadata( + # metadata={ + # "metadata_preview": MetadataValue.md( + # metadata_to_dataframe(derived_mmd).head().to_markdown() + # ), + # "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + # "metrics_preview": MetadataValue.md( + # joined_metrics.head().to_markdown() + # ), + # }, + # ) + # return derived_mmd, joined_metrics + # Assets ni = NorthernIreland() @@ -675,40 +782,31 @@ def gen_hxltag(kv: dict[str, str]) -> str: catalog = ni.create_catalog() census_tables = ni.create_census_tables() source_metric_metadata = ni.create_source_metric_metadata() +# reshaped_metrics = ni.create_reshaped_metrics() derived_metrics = ni.create_derived_metrics() -@asset( - ins={ - "derived_metrics": AssetIn( - partition_mapping=SpecificPartitionsPartitionMapping( - list(DERIVED_COLUMN_SPECIFICATIONS.keys()) - ), - ), - }, -) +# Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: +# See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi +@asset(deps=[AssetDep("derived_metrics")]) def metrics( - # Note dagster does not seem to allow a union type for `derived_metrics` for - # the cases of one or many partitions context, - derived_metrics, + catalog: pd.DataFrame, ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: """ This asset exists solely to aggregate all the derived tables into one single unpartitioned asset, which the downstream publishing tasks can use. - - Right now it is a bit boring because it only relies on one partition, but - it could be extended when we have more data products. 
""" - if len(DERIVED_COLUMN_SPECIFICATIONS) == 1: - # Make into same type for the case of multiple partitions - derived_metrics_dict: dict[str, tuple[list[MetricMetadata], pd.DataFrame]] = { - next(iter(DERIVED_COLUMN_SPECIFICATIONS.keys())): derived_metrics - } - else: - derived_metrics_dict: dict[ - str, tuple[list[MetricMetadata], pd.DataFrame] - ] = derived_metrics + # Get derived_metrics asset for partitions that were successful + derived_metrics_dict = {} + for partition_key in catalog["partition_key"].to_list(): + try: + derived_metrics_partition = popgetter.defs.load_asset_value( + ["uk-ni", "derived_metrics"], partition_key=partition_key + ) + derived_metrics_dict[partition_key] = derived_metrics_partition + except FileNotFoundError as err: + context.log.debug(ic(f"Failed partition key {partition_key}: {err}")) # Combine outputs across partitions outputs = [ From 1ef97ea096f7e40b3d43a0a7a55854ac5306a81e Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 09:58:51 +0100 Subject: [PATCH 17/39] Change aggfunc to sum to prevent cast as float --- python/popgetter/assets/ni/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index c45ad35..e3e9029 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -595,9 +595,7 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: ic(df.columns) ic(df.head()) pivot = df[cols].pivot_table( - index="GEO_ID", - columns=pivot_cols, - values="Count", + index="GEO_ID", columns=pivot_cols, values="Count", aggfunc="sum" ) # FLattent multi-index From 3661f68fc7478d090c3537a4abd5abc332071413 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 10:00:08 +0100 Subject: [PATCH 18/39] Fix hxltag construction --- python/popgetter/assets/ni/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index e3e9029..713d7e1 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -626,9 +626,11 @@ def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: def gen_hxltag(kv: dict[str, str]) -> str: out = ["#population"] for key, value in kv.items(): - out += ["".join(c for c in key if c.isalnum())] - out += ["_"] - out += ["".join(c for c in value if c.isalnum())] + out += [ + "".join(c for c in key if c.isalnum()) + + "_" + + "".join(c for c in value if c.isalnum()) + ] return "+".join(out) new_mmd.hxl_tag = gen_hxltag(key_val) From f069cea118e927b9411c8ed31aad51d496f18004 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 28 May 2024 17:18:59 +0100 Subject: [PATCH 19/39] Move metrics into abc --- python/popgetter/__init__.py | 4 +- python/popgetter/assets/common.py | 46 ++++++++++++---- python/popgetter/assets/ni/__init__.py | 72 ++++++++++++-------------- 3 files changed, 69 insertions(+), 53 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 2da6302..2300abd 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -54,9 +54,7 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_package_module( - assets.ni, group_name="ni", key_prefix=assets.ni.ni.key_prefix - ), + 
*load_assets_from_package_module(assets.ni, group_name="ni"), *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"), *( load_assets_from_modules([azure_test], group_name="azure_test") diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py index 5233825..a99f8e5 100644 --- a/python/popgetter/assets/common.py +++ b/python/popgetter/assets/common.py @@ -4,9 +4,13 @@ import geopandas as gpd import pandas as pd -from dagster import DynamicPartitionsDefinition, asset +from dagster import AssetDep, DynamicPartitionsDefinition, asset -from popgetter.cloud_outputs import send_to_geometry_sensor, send_to_metadata_sensor +from popgetter.cloud_outputs import ( + send_to_geometry_sensor, + send_to_metadata_sensor, + send_to_metrics_sensor, +) from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -17,10 +21,11 @@ class Country(ABC): + key_prefix: str dataset_node_partition: DynamicPartitionsDefinition def create_catalog(self): - @asset() + @asset(key_prefix=self.key_prefix) def catalog(context): return self._catalog(context) @@ -32,7 +37,7 @@ def _catalog(self, context) -> pd.DataFrame: def create_country_metadata(self): @send_to_metadata_sensor - @asset() + @asset(key_prefix=self.key_prefix) def country_metadata(context): return self._country_metadata(context) @@ -44,7 +49,7 @@ def _country_metadata(self, context) -> CountryMetadata: def create_data_publisher(self): @send_to_metadata_sensor - @asset + @asset(key_prefix=self.key_prefix) def data_publisher(context, country_metadata: CountryMetadata): return self._data_publisher(context, country_metadata) @@ -58,7 +63,7 @@ def _data_publisher( def create_geometry(self): @send_to_geometry_sensor - @asset() + @asset(key_prefix=self.key_prefix) def geometry(context): return self._geometry(context) @@ -72,7 +77,7 @@ def _geometry( def create_source_data_releases(self): @send_to_metadata_sensor - @asset() + @asset(key_prefix=self.key_prefix) def source_data_releases( context, geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], @@ -94,7 +99,7 @@ def _source_data_releases( ... def create_census_tables(self): - @asset(partitions_def=self.dataset_node_partition) + @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix) def census_tables(context, catalog): return self._census_tables(context, catalog) @@ -105,7 +110,7 @@ def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame: ... def create_source_metric_metadata(self): - @asset(partitions_def=self.dataset_node_partition) + @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix) def source_metric_metadata(context, catalog, source_data_releases): return self._source_metric_metadata(context, catalog, source_data_releases) @@ -121,7 +126,7 @@ def _source_metric_metadata( ... def create_derived_metrics(self): - @asset(partitions_def=self.dataset_node_partition) + @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix) def derived_metrics( context, census_tables: pd.DataFrame, @@ -159,3 +164,24 @@ def _derived_metrics( # source_metric_metadata: MetricMetadata, # ) -> tuple[list[MetricMetadata], pd.DataFrame]: # ... 
+ + def create_metrics(self): + @send_to_metrics_sensor + # Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: + # See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi + @asset(deps=[AssetDep("derived_metrics")], key_prefix=self.key_prefix) + def metrics( + context, + catalog: pd.DataFrame, + ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + return self._metrics(context, catalog) + + return metrics + + @abstractmethod + def _metrics( + self, + context, + catalog: pd.DataFrame, + ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + ... diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index f8e4cb2..4b80d4f 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -13,16 +13,13 @@ import requests from bs4 import BeautifulSoup from dagster import ( - AssetDep, DynamicPartitionsDefinition, MetadataValue, - asset, ) from icecream import ic import popgetter from popgetter.assets.common import Country -from popgetter.cloud_outputs import send_to_metrics_sensor from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -761,6 +758,37 @@ def gen_hxltag(kv: dict[str, str]) -> str: # ) # return derived_mmd, joined_metrics + def _metrics( + self, context, catalog: pd.DataFrame + ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: + """ + This asset exists solely to aggregate all the derived tables into one + single unpartitioned asset, which the downstream publishing tasks can use. + """ + # Get derived_metrics asset for partitions that were successful + derived_metrics_dict = {} + for partition_key in catalog["partition_key"].to_list(): + try: + derived_metrics_partition = popgetter.defs.load_asset_value( + ["uk-ni", "derived_metrics"], partition_key=partition_key + ) + derived_metrics_dict[partition_key] = derived_metrics_partition + except FileNotFoundError as err: + context.log.debug(ic(f"Failed partition key {partition_key}: {err}")) + + # Combine outputs across partitions + outputs = [ + (mmds[0].metric_parquet_path, mmds, table) + for (mmds, table) in derived_metrics_dict.values() + ] + context.add_output_metadata( + metadata={ + "num_metrics": sum(len(output[1]) for output in outputs), + "num_parquets": len(outputs), + }, + ) + return outputs + # Assets ni = NorthernIreland() @@ -773,40 +801,4 @@ def gen_hxltag(kv: dict[str, str]) -> str: source_metric_metadata = ni.create_source_metric_metadata() # reshaped_metrics = ni.create_reshaped_metrics() derived_metrics = ni.create_derived_metrics() - - -@send_to_metrics_sensor -# Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: -# See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi -@asset(deps=[AssetDep("derived_metrics")]) -def metrics( - context, - catalog: pd.DataFrame, -) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: - """ - This asset exists solely to aggregate all the derived tables into one - single unpartitioned asset, which the downstream publishing tasks can use. 
- """ - # Get derived_metrics asset for partitions that were successful - derived_metrics_dict = {} - for partition_key in catalog["partition_key"].to_list(): - try: - derived_metrics_partition = popgetter.defs.load_asset_value( - ["uk-ni", "derived_metrics"], partition_key=partition_key - ) - derived_metrics_dict[partition_key] = derived_metrics_partition - except FileNotFoundError as err: - context.log.debug(ic(f"Failed partition key {partition_key}: {err}")) - - # Combine outputs across partitions - outputs = [ - (mmds[0].metric_parquet_path, mmds, table) - for (mmds, table) in derived_metrics_dict.values() - ] - context.add_output_metadata( - metadata={ - "num_metrics": sum(len(output[1]) for output in outputs), - "num_parquets": len(outputs), - }, - ) - return outputs +metrics = ni.create_metrics() From 01836c045a42bc3ad3534f7340c7dd5d1f7838ac Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 29 May 2024 10:07:40 +0100 Subject: [PATCH 20/39] Refactor to remove obsolete geo levels dict --- notebooks/explore.ipynb | 2 +- python/popgetter/assets/ni/__init__.py | 27 ++++++++++++-------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/notebooks/explore.ipynb b/notebooks/explore.ipynb index 44c18f0..8d9a91c 100644 --- a/notebooks/explore.ipynb +++ b/notebooks/explore.ipynb @@ -66,7 +66,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 4b80d4f..297a398 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -6,6 +6,7 @@ from dataclasses import dataclass from datetime import date from functools import reduce +from typing import ClassVar import geopandas as gpd import matplotlib.pyplot as plt @@ -46,6 +47,15 @@ class NIGeometryLevel: # Geometry levels to include +# Full list of geographies, see metadata: +# https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE +# - "LGD14", # Local Government District 2014 +# - "URBAN_STATUS", # Urban Status +# - "HEALTH_TRUST", # Health and Social Care Trust +# - "PARLCON24", # Parliamentary Constituency 2024 +# - "DEA14", # District Electoral Area 2014 +# - "SDZ21", # Census 2021 Super Data Zone +# - "DZ21", # Census 2021 Data Zone NI_GEO_LEVELS = { "DZ21": NIGeometryLevel( level="DZ21", @@ -88,24 +98,12 @@ class NIGeometryLevel: # Required tables REQUIRED_TABLES = ["MS-A09"] if os.getenv("ENV") == "dev" else None -# Full list of geographies, see metadata: -# https://build.nisra.gov.uk/en/metadata/dataset?d=PEOPLE -GEO_LEVELS = [ - "LGD14", # Local Government District 2014 - # "URBAN_STATUS", # Urban Status - # "HEALTH_TRUST", # Health and Social Care Trust - # "PARLCON24", # Parliamentary Constituency 2024 - # "DEA14", # District Electoral Area 2014 - "SDZ21", # Census 2021 Super Data Zone - "DZ21", # Census 2021 Data Zone -] - - # 2021 census collection date CENSUS_COLLECTION_DATE = date(2021, 3, 21) def get_nodes_and_links() -> dict[str, dict[str, str]]: + """Extracts the URLs for census tables and metadata for ready-made tables.""" SCHEME_AND_HOST = "https://build.nisra.gov.uk" urls = [ "".join([SCHEME_AND_HOST, url.get("href")]) @@ -244,7 +242,6 @@ def census_table_metadata( potential_denominator_ids=None, parquet_margin_of_error_file=None, parquet_margin_of_error_column=None, - # parquet_column_name=catalog_row["source_column"], parquet_column_name=source_table.source_column, # TODO - 
this is a placeholder metric_parquet_path="unknown_at_this_stage", @@ -257,7 +254,7 @@ def census_table_metadata( class NorthernIreland(Country): key_prefix: str = "uk-ni" partition_name: str = "uk-ni_dataset_nodes" - geo_levels: list[str] = GEO_LEVELS + geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys()) required_tables: list[str] | None = REQUIRED_TABLES dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes") From 6c33d266d29920729f77348e9638736571708897 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 29 May 2024 10:20:56 +0100 Subject: [PATCH 21/39] Remove unused reshape metrics method --- python/popgetter/assets/common.py | 20 ----- python/popgetter/assets/ni/__init__.py | 110 ------------------------- 2 files changed, 130 deletions(-) diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/common.py index a99f8e5..b5983a2 100644 --- a/python/popgetter/assets/common.py +++ b/python/popgetter/assets/common.py @@ -145,26 +145,6 @@ def _derived_metrics( ) -> tuple[list[MetricMetadata], pd.DataFrame]: ... - # def create_reshaped_metrics(self): - # @asset(partitions_def=self.dataset_node_partition) - # def reshaped_metrics( - # context, - # census_tables: pd.DataFrame, - # source_metric_metadata: MetricMetadata, - # ) -> tuple[list[MetricMetadata], pd.DataFrame]: - # return self._reshaped_metrics(context, census_tables, source_metric_metadata) - - # return reshaped_metrics - - # @abstractmethod - # def _reshaped_metrics( - # self, - # context, - # census_tables: pd.DataFrame, - # source_metric_metadata: MetricMetadata, - # ) -> tuple[list[MetricMetadata], pd.DataFrame]: - # ... - def create_metrics(self): @send_to_metrics_sensor # Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 297a398..73a263a 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -646,115 +646,6 @@ def gen_hxltag(kv: dict[str, str]) -> str: ) return derived_mmd, joined_metrics - # TODO: consider splitting the reshaping of census table data into a separate asset - # def _reshaped_metrics( - # self, - # context, - # census_tables: pd.DataFrame, - # source_metric_metadata: MetricMetadata, - # ) -> tuple[list[MetricMetadata], pd.DataFrame]: - # SEP = "_" - # partition_key = context.partition_key - # geo_level = partition_key.split("/")[0] - # source_table = census_tables - # source_mmd = source_metric_metadata - # source_column = source_mmd.parquet_column_name - # assert source_column in source_table.columns - # assert len(source_table) > 0 - - # geo_id = NI_GEO_LEVELS[geo_level].census_table_column - # source_table = source_table.rename(columns={geo_id: "GEO_ID"}).drop( - # columns=geo_id.replace("Code", "Label") - # ) - - # parquet_file_name = ( - # "".join(c for c in partition_key if c.isalnum()) + ".parquet" - # ) - # derived_metrics: list[pd.DataFrame] = [] - # derived_mmd: list[MetricMetadata] = [] - - # # Get all other metrics from table as is pivoted - # def pivot_df(df: pd.DataFrame, end: str) -> tuple[list[str], pd.DataFrame]: - # # Variables are either code or label, only keep the case for given 'end' - # cols = ( - # [col for col in df.columns if col.endswith(end)] - # + ["GEO_ID"] - # + ["Count"] - # ) - # pivot_cols = [col for col in cols if col not in ["GEO_ID", "Count"]] - # ic(cols) - # ic(pivot_cols) - # ic(df.columns) - # ic(df.head()) - # pivot = 
df[cols].pivot_table( - # index="GEO_ID", - # columns=pivot_cols, - # values="Count", - # ) - - # # FLattent multi-index - # if isinstance(pivot.columns, pd.MultiIndex): - # pivot.columns = [ - # SEP.join(list(map(str, col))).strip() - # for col in pivot.columns.to_numpy() - # ] - # # Ensure columns are string - # else: - # pivot.columns = [str(col).strip() for col in pivot.columns.to_numpy()] - # out_cols = [col.replace(var_type, "").strip() for col in pivot_cols] - # return out_cols, pivot - - # # Pivot for codes and labels - # for var_type in ["Code", "Label"]: - # out_cols, new_table = pivot_df(source_table, var_type) - # ic(new_table) - # for metric_col in new_table.columns: - # metric_df = new_table.loc[:, metric_col].to_frame() - # ic(metric_df) - # derived_metrics.append(metric_df) - # new_mmd = source_mmd.copy() - # new_mmd.parent_metric_id = source_mmd.source_metric_id - # new_mmd.metric_parquet_path = parquet_file_name - # key_val = dict(zip(out_cols, metric_col.split(SEP), strict=True)) - - # def gen_hxltag(kv: dict[str, str]) -> str: - # out = ["#population"] - # for key, value in kv.items(): - # out += ["".join(c for c in key if c.isalnum())] - # out += ["_"] - # out += ["".join(c for c in value if c.isalnum())] - # return "+".join(out) - - # new_mmd.hxl_tag = gen_hxltag(key_val) - # new_mmd.parquet_column_name = metric_col - # new_mmd.human_readable_name = "; ".join( - # [ - # f"Variable: '{key}'; Value: '{value}'" - # for key, value in key_val.items() - # ] - # ) - # derived_mmd.append(new_mmd) - - # joined_metrics = reduce( - # lambda left, right: left.merge( - # right, on="GEO_ID", how="inner", validate="one_to_one" - # ), - # derived_metrics, - # ) - - # context.add_output_metadata( - # metadata={ - # "metadata_preview": MetadataValue.md( - # metadata_to_dataframe(derived_mmd).head().to_markdown() - # ), - # "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", - # "metrics_preview": MetadataValue.md( - # joined_metrics.head().to_markdown() - # ), - # }, - # ) - # return derived_mmd, joined_metrics - def _metrics( self, context, catalog: pd.DataFrame ) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: @@ -796,6 +687,5 @@ def _metrics( catalog = ni.create_catalog() census_tables = ni.create_census_tables() source_metric_metadata = ni.create_source_metric_metadata() -# reshaped_metrics = ni.create_reshaped_metrics() derived_metrics = ni.create_derived_metrics() metrics = ni.create_metrics() From 26bf2be4a7c682d784746084955c0852a85cb7c3 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 29 May 2024 10:24:04 +0100 Subject: [PATCH 22/39] Remove source table from metric specification --- python/popgetter/assets/ni/__init__.py | 27 ++++---------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 73a263a..abcbab9 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -204,25 +204,9 @@ class SourceTable: ), ] -DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[SourceTable, list[DerivedColumn]]] = { - "DZ21/MS-A09": ( - SourceTable( - hxltag="#population+dz21+2021", - geo_level="DZ21", - geo_column="Census 2021 Data Zone Code", - source_column="Count", - ), - DERIVED_COLUMNS, - ), - "SDZ21/MS-A09": ( - SourceTable( - hxltag="#population+sdz21+2021", - geo_level="SDZ21", - geo_column="Census 2021 Super Data Zone Code", - source_column="Count", - ), - DERIVED_COLUMNS, - ), 
+DERIVED_COLUMN_SPECIFICATIONS: dict[str, list[DerivedColumn]] = {
+    "DZ21/MS-A09": DERIVED_COLUMNS,
+    "SDZ21/MS-A09": DERIVED_COLUMNS,
 }
@@ -540,10 +524,7 @@ def _derived_metrics(
         derived_mmd: list[MetricMetadata] = []

         try:
-            # TODO: check whether to drop unused source_table_metadata
-            source_table_metadata, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[
-                partition_key
-            ]
+            metric_specs = DERIVED_COLUMN_SPECIFICATIONS[partition_key]
             for metric_spec in metric_specs:
                 new_table = (
                     source_table.pipe(metric_spec.filter_func)

From 1428420f7d80cc02f703faca6bc05e489f7f1054 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 29 May 2024 12:01:31 +0100
Subject: [PATCH 23/39] Update README

---
 python/popgetter/assets/ni/README.md | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/python/popgetter/assets/ni/README.md b/python/popgetter/assets/ni/README.md
index 2faa303..b1d3ea9 100644
--- a/python/popgetter/assets/ni/README.md
+++ b/python/popgetter/assets/ni/README.md
@@ -1,17 +1,22 @@
-# Northern Ireland
+# Northern Ireland

 ## Summary

 Census 2021 is available from
 [https://build.nisra.gov.uk](https://build.nisra.gov.uk/en/).

-The processing pipeline involves the following steps:
+The processing pipeline involves the following steps, achieved by implementing
+the [`Country`](../country.py) base class:

-- Gets the corresponding geography files and outputs as standard geometry
-  formats
-- Generates metadata associated with Northern Ireland and data releases
-- Generates a catalog by identifying all tables
-  [available](https://build.nisra.gov.uk/en/standard).
-- Read table metadata and census tables, across different geography levels
-  (currently only Data Zone 2021 and Super Data Zone 2021)
-- Construct a set of pre-defined derived metrics
+- Retrieve the geography data and output it in standard geometry formats
+  (`geometry` asset)
+- Generate metadata associated with country, data publisher and source data
+  releases (`country_metadata`, `data_publisher` and `source_data_releases`
+  assets)
+- Generate a catalog by identifying all tables
+  [available](https://build.nisra.gov.uk/en/standard) (`catalog` asset)
+- Read table metadata and census tables, across different geography levels,
+  currently for Data Zone 2021, Super Data Zone 2021 and Local Government
+  District 2014 (`census_tables` and `source_metric_metadata` assets)
+- Process census tables into metrics per geography ID and any other pre-defined
+  derived metrics (`metrics` asset)

From ba5d00126cc1c2458dd07273c1329eccde928c88 Mon Sep 17 00:00:00 2001
From: Sam Greenbury
Date: Wed, 29 May 2024 12:06:53 +0100
Subject: [PATCH 24/39] Rename module, add doc strings for country class

---
 .../assets/{common.py => country.py}   | 58 +++++++++++++++++--
 python/popgetter/assets/ni/__init__.py |  4 +-
 2 files changed, 56 insertions(+), 6 deletions(-)
 rename python/popgetter/assets/{common.py => country.py} (72%)

diff --git a/python/popgetter/assets/common.py b/python/popgetter/assets/country.py
similarity index 72%
rename from python/popgetter/assets/common.py
rename to python/popgetter/assets/country.py
index b5983a2..2dc3fec 100644
--- a/python/popgetter/assets/common.py
+++ b/python/popgetter/assets/country.py
@@ -21,12 +21,25 @@ class Country(ABC):
+    """
+    A general class providing asset factories and abstract methods that serve
+    as a template for implementing a given country.
+
+    Attributes:
+        key_prefix (str): the prefix for the asset keys (e.g. "be" for Belgium)
+        dataset_node_partition (DynamicPartitionsDefinition): a dynamic partitions
+            definition populated at runtime with a partition per census table.
+
+    """
+
     key_prefix: str
     dataset_node_partition: DynamicPartitionsDefinition

     def create_catalog(self):
+        """Creates an asset providing a census metadata catalog."""
+
         @asset(key_prefix=self.key_prefix)
-        def catalog(context):
+        def catalog(context) -> pd.DataFrame:
             return self._catalog(context)

         return catalog
@@ -36,6 +49,8 @@ def _catalog(self, context) -> pd.DataFrame:
         ...

     def create_country_metadata(self):
+        """Creates an asset providing the country metadata."""
+
         @send_to_metadata_sensor
         @asset(key_prefix=self.key_prefix)
         def country_metadata(context):
@@ -48,6 +63,8 @@ def _country_metadata(self, context) -> CountryMetadata:
         ...

     def create_data_publisher(self):
+        """Creates an asset providing the data publisher metadata."""
+
         @send_to_metadata_sensor
         @asset(key_prefix=self.key_prefix)
         def data_publisher(context, country_metadata: CountryMetadata):
@@ -62,6 +79,11 @@ def _data_publisher(
         ...

     def create_geometry(self):
+        """
+        Creates an asset providing a list of geometries, metadata and names
+        at different resolutions.
+        """
+
         @send_to_geometry_sensor
         @asset(key_prefix=self.key_prefix)
         def geometry(context):
             return self._geometry(context)
@@ -76,13 +98,18 @@ def _geometry(
         ...

     def create_source_data_releases(self):
+        """
+        Creates an asset providing the corresponding source data release metadata for
+        each geometry.
+        """
+
         @send_to_metadata_sensor
         @asset(key_prefix=self.key_prefix)
         def source_data_releases(
             context,
             geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]],
             data_publisher: DataPublisher,
-        ):
+        ) -> dict[str, SourceDataRelease]:
             return self._source_data_releases(context, geometry, data_publisher)

         return source_data_releases
@@ -99,8 +126,13 @@ def _source_data_releases(
         ...

     def create_census_tables(self):
+        """
+        Creates an asset providing each census table as a dataframe for each
+        partition.
+        """
+
         @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix)
-        def census_tables(context, catalog):
+        def census_tables(context, catalog: pd.DataFrame) -> pd.DataFrame:
             return self._census_tables(context, catalog)

         return census_tables
@@ -110,8 +142,15 @@ def _census_tables(self, context, catalog: pd.DataFrame) -> pd.DataFrame:
         ...

     def create_source_metric_metadata(self):
+        """
+        Creates an asset providing the metadata required for downstream metric
+        derivation.
+        """
+
         @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix)
-        def source_metric_metadata(context, catalog, source_data_releases):
+        def source_metric_metadata(
+            context, catalog, source_data_releases: dict[str, SourceDataRelease]
+        ) -> MetricMetadata:
             return self._source_metric_metadata(context, catalog, source_data_releases)

         return source_metric_metadata
@@ -126,6 +165,11 @@ def _source_metric_metadata(
         ...

     def create_derived_metrics(self):
+        """
+        Creates an asset providing the metrics derived from the census tables and the
+        corresponding source metric metadata.
+        """
+
         @asset(partitions_def=self.dataset_node_partition, key_prefix=self.key_prefix)
         def derived_metrics(
             context,
             census_tables: pd.DataFrame,
@@ -146,6 +190,12 @@ def _derived_metrics(
         ...

     def create_metrics(self):
+        """
+        Creates an asset combining all partitions across census tables into a
+        single list of tuples of metric parquet file name (for output), list of
+        metric metadata and dataframe of metrics.
+ """ + @send_to_metrics_sensor # Note: does not seem possible to specify a StaticPartition derived from a DynamicPartition: # See: https://discuss.dagster.io/t/16717119/i-want-to-be-able-to-populate-a-dagster-staticpartitionsdefi diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index abcbab9..9a15c6d 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -20,7 +20,7 @@ from icecream import ic import popgetter -from popgetter.assets.common import Country +from popgetter.assets.country import Country from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -639,7 +639,7 @@ def _metrics( for partition_key in catalog["partition_key"].to_list(): try: derived_metrics_partition = popgetter.defs.load_asset_value( - ["uk-ni", "derived_metrics"], partition_key=partition_key + [ni.key_prefix, "derived_metrics"], partition_key=partition_key ) derived_metrics_dict[partition_key] = derived_metrics_partition except FileNotFoundError as err: From 49bb7591a4512c757d2f71714e4267f03bb78104 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:38:25 +0100 Subject: [PATCH 25/39] Add openpyxl dep for reading Excel files Co-authored-by: Penelope Yong --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 441b5d7..ba11cd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ dependencies = [ "python-slugify >=8.0.4", # Required for generating asset names from GBR Ordnance Survey OpenData Product names "jcs >=0.2.1", # For generating IDs from class attributes "beautifulsoup4 >=4.12.3", # For extracting catalogs from web pages + "openpyxl >=3.1.3", # For reading Excel files ] From 2462c09b3253dc7253afafe3f55802abcd050137 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:43:26 +0100 Subject: [PATCH 26/39] Fix job description --- python/popgetter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 2300abd..9d544a5 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -85,7 +85,7 @@ job_ni: UnresolvedAssetJobDefinition = define_asset_job( name="job_ni", selection=AssetSelection.groups("ni"), - description="Downloads UK data.", + description="Downloads Northern Ireland data.", ) From 01f1ecbb0946af4b521851d0fc35c8dd00eb92f5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 16:45:24 +0100 Subject: [PATCH 27/39] Add key_prefix to init method Co-authored-by: Penelope Yong --- python/popgetter/assets/country.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py index 2dc3fec..5c0dba8 100644 --- a/python/popgetter/assets/country.py +++ b/python/popgetter/assets/country.py @@ -32,9 +32,14 @@ class Country(ABC): """ - key_prefix: str + key_prefix: ClassVar[str] + partition_name: str dataset_node_partition: DynamicPartitionsDefinition + def __init__(self, key_prefix: str): + self.partition_name = f"{self.key_prefix}_nodes" + self.dataset_node_partition = DynamicPartitionsDefinition(name=self.partition_name) + def create_catalog(self): """Creates an asset providing a census metedata catalog.""" From 2387f75ff11a91cccfebe659335eb3703ec20af9 Mon Sep 17 00:00:00 2001 
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 16:53:34 +0100
Subject: [PATCH 28/39] Add methods for adding and removing partition keys

Having methods to add and remove partition keys simplifies the Country
API for subclassing.

Co-authored-by: Penelope Yong
---
 python/popgetter/assets/country.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py
index 5c0dba8..fadbfa7 100644
--- a/python/popgetter/assets/country.py
+++ b/python/popgetter/assets/country.py
@@ -36,6 +36,15 @@ class Country(ABC):
     partition_name: str
     dataset_node_partition: DynamicPartitionsDefinition

+    def add_partition_keys(self, context, keys: list[str]):
+        context.instance.add_dynamic_partitions(
+            partitions_def_name=self.partition_name,
+            partition_keys=keys,
+        )
+
+    def remove_all_partition_keys(self, context):
+        for partition_key in context.instance.get_dynamic_partitions(self.partition_name):
+            context.instance.delete_dynamic_partition(self.partition_name, partition_key)
     def __init__(self, key_prefix: str):
         self.partition_name = f"{self.key_prefix}_nodes"
         self.dataset_node_partition = DynamicPartitionsDefinition(name=self.partition_name)

From 49948df791d14778e570d1fab764ae2321929291 Mon Sep 17 00:00:00 2001
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 16:59:15 +0100
Subject: [PATCH 29/39] Remove obsolete attributes

Co-authored-by: Penelope Yong
---
 python/popgetter/assets/ni/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py
index 9a15c6d..2b989ab 100644
--- a/python/popgetter/assets/ni/__init__.py
+++ b/python/popgetter/assets/ni/__init__.py
@@ -236,11 +236,9 @@


 class NorthernIreland(Country):
-    key_prefix: str = "uk-ni"
-    partition_name: str = "uk-ni_dataset_nodes"
+    key_prefix: ClassVar[str] = "uk-ni"
     geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys())
     required_tables: list[str] | None = REQUIRED_TABLES
-    dataset_node_partition = DynamicPartitionsDefinition(name="uk-ni_dataset_nodes")

     def _country_metadata(self, _context) -> CountryMetadata:
         return CountryMetadata(

From eab49b6ec505bfae1c62cc6714ce04e4524602ae Mon Sep 17 00:00:00 2001
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:00:41 +0100
Subject: [PATCH 30/39] Remove update to python version in notebook

---
 notebooks/explore.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/explore.ipynb b/notebooks/explore.ipynb
index 8d9a91c..44c18f0 100644
--- a/notebooks/explore.ipynb
+++ b/notebooks/explore.ipynb
@@ -66,7 +66,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.3"
   }
 },
 "nbformat": 4,

From adf5456bb191516aa721f9d0d571728d6798214f Mon Sep 17 00:00:00 2001
From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:06:04 +0100
Subject: [PATCH 31/39] Complete comment

---
 python/popgetter/assets/ni/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py
index 2b989ab..5fd78cd 100644
--- a/python/popgetter/assets/ni/__init__.py
+++ b/python/popgetter/assets/ni/__init__.py
@@ -272,7 +272,8 @@ def _catalog(self, context) -> pd.DataFrame:
            https://build.nisra.gov.uk/en/metadata
2. Or through enumerating the ready-made tables: https://build.nisra.gov.uk/en/standard - However, some level of + However, for some geographical resolutions, ready-made tables may + not be available due to data confidentiality. """ catalog_summary = { "node": [], From b801e0e69602c45a5fc5a0c04555811937814ef4 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:08:09 +0100 Subject: [PATCH 32/39] Remove all partitions during materialization This is to ensure that only partitions from the latest materialization are included. Co-authored-by: Penelope Yong --- python/popgetter/assets/ni/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 5fd78cd..5f51c19 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -296,6 +296,7 @@ def _catalog(self, context) -> pd.DataFrame: "table_schema": [], } nodes = get_nodes_and_links() + self.remove_all_partition_keys(context) def add_resolution(s: str, geo_level: str) -> str: s_split = s.split("?") From 4667efd0f086b33c8e584b0ed8f197fb2287026c Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:09:40 +0100 Subject: [PATCH 33/39] Use country method to add partitions Co-authored-by: Penelope Yong --- python/popgetter/assets/ni/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 5f51c19..76b4329 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -344,10 +344,7 @@ def add_resolution(s: str, geo_level: str) -> str: catalog_summary["table_schema"].append(metadata["tableSchema"]) catalog_df = pd.DataFrame.from_records(catalog_summary) - context.instance.add_dynamic_partitions( - partitions_def_name=self.partition_name, - partition_keys=catalog_df["partition_key"].to_list(), - ) + self.add_partition_keys(context, catalog_df["partition_key"].to_list()) add_metadata(context, catalog_df, "Catalog") return catalog_df From 0ca5bb485ae63aaa21bda2db7c8e830903eed241 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 17:40:20 +0100 Subject: [PATCH 34/39] Add key_prefix to init --- python/popgetter/assets/ni/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 76b4329..1d02b26 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -14,7 +14,6 @@ import requests from bs4 import BeautifulSoup from dagster import ( - DynamicPartitionsDefinition, MetadataValue, ) from icecream import ic @@ -236,7 +235,7 @@ def census_table_metadata( class NorthernIreland(Country): - key_prefix: ClassVar[str] = "uk-ni" + key_prefix: str geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys()) required_tables: list[str] | None = REQUIRED_TABLES @@ -657,7 +656,7 @@ def _metrics( # Assets -ni = NorthernIreland() +ni = NorthernIreland("uk-ni") country_metadata = ni.create_country_metadata() data_publisher = ni.create_data_publisher() geometry = ni.create_geometry() From f2fbe857d0fcf40a7a51919c5e7c95d7669b0cee Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 18:05:53 +0100 Subject: [PATCH 35/39] Update key_prefix type and add assignment in init --- 
python/popgetter/assets/country.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py index fadbfa7..a64325d 100644 --- a/python/popgetter/assets/country.py +++ b/python/popgetter/assets/country.py @@ -32,10 +32,17 @@ class Country(ABC): """ - key_prefix: ClassVar[str] + key_prefix: str partition_name: str dataset_node_partition: DynamicPartitionsDefinition + def __init__(self, key_prefix: str): + self.key_prefix = key_prefix + self.partition_name = f"{self.key_prefix}_nodes" + self.dataset_node_partition = DynamicPartitionsDefinition( + name=self.partition_name + ) + def add_partition_keys(self, context, keys: list[str]): context.instance.add_dynamic_partitions( partitions_def_name=self.partition_name, @@ -43,11 +50,12 @@ def add_partition_keys(self, context, keys: list[str]): ) def remove_all_partition_keys(self, context): - for partition_key in context.instance.get_dynamic_partitions(self.partition_name): - context.instance.delete_dynamic_partition(self.partition_name, partition_key) - def __init__(self, key_prefix: str): - self.partition_name = f"{self.key_prefix}_nodes" - self.dataset_node_partition = DynamicPartitionsDefinition(name=self.partition_name) + for partition_key in context.instance.get_dynamic_partitions( + self.partition_name + ): + context.instance.delete_dynamic_partition( + self.partition_name, partition_key + ) def create_catalog(self): """Creates an asset providing a census metedata catalog.""" From baaecbb2c9919d604d3168a57af4da415688ead6 Mon Sep 17 00:00:00 2001 From: Sam Greenbury <50113363+sgreenbury@users.noreply.github.com> Date: Fri, 7 Jun 2024 08:57:08 +0100 Subject: [PATCH 36/39] Fix condition for partition keys included in metadata construction Co-authored-by: Penelope Yong --- python/popgetter/assets/ni/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 1d02b26..98bb27e 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -465,10 +465,10 @@ def _source_metric_metadata( partition_key = context.partition_key if ( self.required_tables is not None - and partition_key not in self.required_tables + and partition_key not in DERIVED_COLUMN_SPECIFICATIONS.keys() ): skip_reason = ( - f"Skipping as requested partition {partition_key} is configured " + f"Skipping as requested partition {partition_key} is not configured " f"for derived metrics {DERIVED_COLUMN_SPECIFICATIONS.keys()}" ) context.log.warning(skip_reason) From 56fff8a68642bca9274be58eb4192e5b118a9d95 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 21:08:22 +0100 Subject: [PATCH 37/39] Revert key_prefix to ClassVar over arg of init --- python/popgetter/assets/country.py | 6 +++--- python/popgetter/assets/ni/__init__.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py index a64325d..2cd43c4 100644 --- a/python/popgetter/assets/country.py +++ b/python/popgetter/assets/country.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +from typing import ClassVar import geopandas as gpd import pandas as pd @@ -32,12 +33,11 @@ class Country(ABC): """ - key_prefix: str + key_prefix: ClassVar[str] partition_name: str dataset_node_partition: DynamicPartitionsDefinition - def __init__(self, key_prefix: 
str): - self.key_prefix = key_prefix + def __init__(self): self.partition_name = f"{self.key_prefix}_nodes" self.dataset_node_partition = DynamicPartitionsDefinition( name=self.partition_name diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 98bb27e..2f060ca 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -235,7 +235,7 @@ def census_table_metadata( class NorthernIreland(Country): - key_prefix: str + key_prefix: ClassVar[str] = "uk-ni" geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys()) required_tables: list[str] | None = REQUIRED_TABLES @@ -656,7 +656,7 @@ def _metrics( # Assets -ni = NorthernIreland("uk-ni") +ni = NorthernIreland() country_metadata = ni.create_country_metadata() data_publisher = ni.create_data_publisher() geometry = ni.create_geometry() From a58a086f7911fe71f1efc3024a3de2f537180776 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 6 Jun 2024 21:13:29 +0100 Subject: [PATCH 38/39] Add jobs list towards simplifying popgetter init --- python/popgetter/__init__.py | 49 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 9d544a5..17febfc 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -63,30 +63,29 @@ ), ] -job_be: UnresolvedAssetJobDefinition = define_asset_job( - name="job_be", - selection=AssetSelection.groups("be"), - description="Downloads Belgian data.", - partitions_def=assets.be.census_tables.dataset_node_partition, -) - -job_us: UnresolvedAssetJobDefinition = define_asset_job( - name="job_us", - selection=AssetSelection.groups("us"), - description="Downloads USA data.", -) - -job_uk: UnresolvedAssetJobDefinition = define_asset_job( - name="job_uk", - selection=AssetSelection.groups("uk"), - description="Downloads UK data.", -) - -job_ni: UnresolvedAssetJobDefinition = define_asset_job( - name="job_ni", - selection=AssetSelection.groups("ni"), - description="Downloads Northern Ireland data.", -) +jobs: list[UnresolvedAssetJobDefinition] = [ + define_asset_job( + name="job_be", + selection=AssetSelection.groups("be"), + description="Downloads Belgian data.", + partitions_def=assets.be.census_tables.dataset_node_partition, + ), + define_asset_job( + name="job_us", + selection=AssetSelection.groups("us"), + description="Downloads USA data.", + ), + define_asset_job( + name="job_uk", + selection=AssetSelection.groups("uk"), + description="Downloads UK data.", + ), + define_asset_job( + name="job_ni", + selection=AssetSelection.groups("ni"), + description="Downloads Northern Ireland data.", + ), +] def resources_by_env(): @@ -127,5 +126,5 @@ def resources_by_env(): cloud_outputs.metrics_sensor, ], resources=resources, - jobs=[job_be, job_us, job_uk, job_ni], + jobs=jobs, ) From 10c2126d3dd49d06e0b377872939d29f79d682a5 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 7 Jun 2024 09:05:59 +0100 Subject: [PATCH 39/39] Fix ruff lint --- python/popgetter/assets/ni/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/ni/__init__.py index 2f060ca..6fc1242 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/ni/__init__.py @@ -465,7 +465,7 @@ def _source_metric_metadata( partition_key = context.partition_key if ( self.required_tables is not None - and partition_key not in 
DERIVED_COLUMN_SPECIFICATIONS.keys() + and partition_key not in DERIVED_COLUMN_SPECIFICATIONS ): skip_reason = ( f"Skipping as requested partition {partition_key} is not configured "
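
The sketch below is not part of the patch series; it illustrates how the
Country base class introduced above is intended to be subclassed for a new
country port. The country prefix "xx", the class name and the stub body are
hypothetical; only Country, its create_* asset factories and the partition key
helpers come from the patches above.

    from __future__ import annotations

    from typing import ClassVar

    import pandas as pd

    from popgetter.assets.country import Country


    class ExampleCountry(Country):
        # Mirrors NorthernIreland's ClassVar prefix "uk-ni"
        key_prefix: ClassVar[str] = "xx"

        def _catalog(self, context) -> pd.DataFrame:
            # One row per dataset node, keyed "<geo level>/<table id>"; the
            # keys registered here drive the partitioned census_tables,
            # source_metric_metadata and derived_metrics assets.
            catalog_df = pd.DataFrame({"partition_key": ["GEO1/TABLE1"]})
            self.remove_all_partition_keys(context)
            self.add_partition_keys(context, catalog_df["partition_key"].to_list())
            return catalog_df

        # The remaining abstract methods (_country_metadata, _data_publisher,
        # _geometry, _source_data_releases, _census_tables,
        # _source_metric_metadata, _derived_metrics and _metrics) must also be
        # implemented before the class can be instantiated.


    # Module-level asset construction then mirrors python/popgetter/assets/ni:
    # xx = ExampleCountry()
    # catalog = xx.create_catalog()
    # census_tables = xx.create_census_tables()
    # derived_metrics = xx.create_derived_metrics()
    # metrics = xx.create_metrics()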