From 3dc30ffa662864e764a9bdf63ccee01aba6d2136 Mon Sep 17 00:00:00 2001 From: spwoodcock Date: Thu, 9 Jan 2025 00:57:18 +0000 Subject: [PATCH] feat: work in progress, replace pure python --> PostGIS functions --- Dockerfile | 7 ++ README.md | 29 ++++--- compose.yml | 33 ++++++- geojson_aoi/merge.py | 182 --------------------------------------- geojson_aoi/normalize.py | 114 ------------------------ geojson_aoi/parser.py | 127 +++++++++++++-------------- geojson_aoi/postgis.py | 169 ++++++++++++++++++++++++++++++++++++ geojson_aoi/types.py | 12 +-- mkdocs.yml | 2 +- pyproject.toml | 6 +- tests/conftest.py | 6 ++ tests/test_parser.py | 86 +++++++++--------- uv.lock | 35 ++++++++ 13 files changed, 377 insertions(+), 431 deletions(-) delete mode 100644 geojson_aoi/merge.py delete mode 100644 geojson_aoi/normalize.py create mode 100644 geojson_aoi/postgis.py diff --git a/Dockerfile b/Dockerfile index 4d1ce30..5209f74 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,6 +70,7 @@ RUN apt-get update --quiet \ apt-get install -y --quiet --no-install-recommends \ "build-essential" \ "gcc" \ + "libpq-dev" \ && rm -rf /var/lib/apt/lists/* COPY --from=uv /uv /usr/local/bin/uv COPY pyproject.toml uv.lock README.md /_lock/ @@ -103,6 +104,12 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt \ REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt \ CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt +RUN apt-get update --quiet \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y --quiet --no-install-recommends \ + "postgresql-client" \ + && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \ + && rm -rf /var/lib/apt/lists/* # Copy Python deps from build to runtime COPY --from=build /opt/python /opt/python WORKDIR /data diff --git a/README.md b/README.md index df1cae0..087705c 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@

- HOT + HOT

- Parse and normalize a GeoJSON area of interest, using pure Python. + Parse and normalize a GeoJSON area of interest, using PostGIS.

@@ -54,25 +54,16 @@ - **Flexible geometry input**: - Polygon - MultiPolygons + - GeometryCollection - Feature - FeatureCollection - Handle multigeometries with an optional merge to single polygon, or split into featcol of individual polygons. -- Handle geometries nested inside GeometryCollection*. +- Handle geometries nested inside GeometryCollection. - Remove any z-dimension coordinates. - Warn user if CRS is provided, in a coordinate system other than EPSG:4326. - **Normalised output**: FeatureCollection containing Polygon geoms. -> [!WARNING] -> *We typically advise against using the GeometryCollection type, and support -> in this library may not be fully functional. -> -> However sometimes geometries may need to be returned wrapped in -> GeometryCollection, for example due to idiosyncrasies of PostGIS. -> -> In this scenario, we support stripping out the first geometry from inside -> each GeometryCollection object (that may be nested in a FeatureCollection). - ## Capturing The Warnings If the GeoJSON has an invalid CRS, or coordinates seem off, a warning @@ -104,3 +95,15 @@ if recorded_warnings: # do stuff with warning logger.warning(f"A warning was encountered: {warning.message}") ``` + +## History + +- Initially I tried to write a pure-Python implementation of this, no dependencies. +- I underestimated the amount of work that is! It could be possible to reverse + engineer C++ Geos or georust/geos, but it's more hassle than it's worth. +- As all of the target install candidates for this package use a db driver + anyway, I thought it wisest (and most time efficient) to use the PostGIS + Geos implementation (specifically for the unary_union and convex_hull + algorithms). +- An additional advantage is the potential to port this to PGLite when the + PostGIS extension is available, meaning AOI processing could easily run in the browser. 
diff --git a/compose.yml b/compose.yml index 9f9f75c..91fcca9 100644 --- a/compose.yml +++ b/compose.yml @@ -15,8 +15,12 @@ # along with geojson-aoi-parser. If not, see . # +networks: + net: + name: aoi-parser + services: - aoi-parser: + parser: image: "ghcr.io/hotosm/geojson-aoi-parser:${TAG_OVERRIDE:-ci}" build: target: ci @@ -26,6 +30,29 @@ services: - ./geojson_aoi:/opt/python/lib/python3.10/site-packages/geojson_aoi # Mount local tests - ./tests:/data/tests - network_mode: none - restart: "unless-stopped" + depends_on: + db: + condition: service_healthy + networks: + - net + restart: "no" command: "pytest" + + db: + image: "postgis/postgis:17-3.5-alpine" + container_name: aoi-parser-db + environment: + - POSTGRES_USER=aoi + - POSTGRES_PASSWORD=dummycipassword + - POSTGRES_DB=aoi + ports: + - "5439:5432" + networks: + - net + restart: "unless-stopped" + healthcheck: + test: pg_isready -U ${FMTM_DB_USER:-aoi} -d ${FMTM_DB_NAME:-aoi} + start_period: 5s + interval: 10s + timeout: 5s + retries: 3 diff --git a/geojson_aoi/merge.py b/geojson_aoi/merge.py deleted file mode 100644 index 7af4a52..0000000 --- a/geojson_aoi/merge.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Functions for Polygon merging.""" - -from itertools import chain - -from geojson_aoi.types import FeatureCollection, PointCoords, PolygonCoords - - -def merge_polygons(featcol: FeatureCollection) -> FeatureCollection: - """Merge multiple Polygons or MultiPolygons into a single Polygon. - - It is used to create a single polygon boundary. - - Automatically determine whether to use union (for overlapping polygons) - or convex hull (for disjoint polygons). - - As a result of the processing, any Feature properties will be lost. - - Args: - featcol (FeatureCollection): a FeatureCollection containing geometries. - - Returns: - FeatureCollection: a FeatureCollection of a single Polygon. 
- """ - if not featcol.get("features"): - raise ValueError("FeatureCollection must contain at least one feature") - - polygons = [] - for feature in featcol.get("features", []): - geom = feature["geometry"] - if geom["type"] == "Polygon": - polygons.append([_remove_holes(geom["coordinates"])]) - elif geom["type"] == "MultiPolygon": - for polygon in geom["coordinates"]: - polygons.append([_remove_holes(polygon)]) - - polygons = [_ensure_right_hand_rule(polygon[0]) for polygon in polygons] - - if all( - _polygons_disjoint(p1[0], p2[0]) - for i, p1 in enumerate(polygons) - for p2 in polygons[i + 1 :] - ): - merged_coordinates = _create_convex_hull(list(chain.from_iterable(polygons))) - else: - merged_coordinates = _create_unary_union(polygons) - - return { - "type": "FeatureCollection", - "features": [ - { - "type": "Feature", - "geometry": {"type": "Polygon", "coordinates": [merged_coordinates]}, - "properties": {}, - } - ], - } - - -def _ensure_right_hand_rule( - coordinates: PolygonCoords, -) -> PolygonCoords: - """Ensure the outer ring follows the right-hand rule (clockwise).""" - - def is_clockwise(ring: list[PointCoords]) -> bool: - """Check coords are in clockwise direction.""" - return ( - sum( - (ring[i][0] - ring[i - 1][0]) * (ring[i][1] + ring[i - 1][1]) - for i in range(len(ring)) - ) - > 0 - ) - - # Validate input - if not isinstance(coordinates[0], list) or not all( - isinstance(pt, list) and len(pt) == 2 for pt in coordinates[0] - ): - raise ValueError( - "Invalid input: coordinates[0] must be a list " - f"of [x, y] points. 
Got: {coordinates[0]}" - ) - - # Ensure the first ring is the exterior ring and follows clockwise direction - if not is_clockwise(coordinates[0]): - coordinates[0] = coordinates[0][::-1] - - # Ensure any holes follow counter-clockwise direction - for i in range(1, len(coordinates)): - if is_clockwise(coordinates[i]): - coordinates[i] = coordinates[i][::-1] - - return coordinates - - -def _remove_holes(polygon: list) -> list: - """Remove holes from a polygon by keeping only the exterior ring. - - Args: - polygon: A list of coordinate rings, where the first is the exterior - and subsequent ones are interior holes. - - Returns: - list: A list containing only the exterior ring coordinates. - """ - if not polygon: - return [] # Return an empty list if the polygon is empty - return polygon[0] # Only return the exterior ring - - -def _create_convex_hull(points: list[PointCoords]) -> list[PointCoords]: - """Create a convex hull from a list of polygons. - - This essentially draws a boundary around the outside of the polygons. - - Most appropriate when the boundaries are not touching (disjoint). - """ - - def cross(o: PointCoords, a: PointCoords, b: PointCoords) -> float: - return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0]) - - points = sorted(set(points)) - if len(points) <= 1: - return points - - lower, upper = [], [] - for p in points: - while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0: - lower.pop() - lower.append(p) - for p in reversed(points): - while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0: - upper.pop() - upper.append(p) - - return lower[:-1] + upper[:-1] - - -def _polygons_disjoint(poly1: list[list[float]], poly2: list[list[float]]) -> bool: - """Check if two polygons are disjoint. - - Test bounding boxes and edge intersections. 
- """ - - def bounding_box(polygon: list[list[float]]) -> tuple: - xs, ys = zip(*polygon, strict=False) - return min(xs), min(ys), max(xs), max(ys) - - def bounding_boxes_overlap(bb1: tuple, bb2: tuple) -> bool: - return not ( - bb1[2] < bb2[0] or bb2[2] < bb1[0] or bb1[3] < bb2[1] or bb2[3] < bb1[1] - ) - - bb1, bb2 = bounding_box(poly1), bounding_box(poly2) - if not bounding_boxes_overlap(bb1, bb2): - return True - - def line_segments_intersect(p1, p2, q1, q2) -> bool: - def ccw(a, b, c): - return (c[1] - a[1]) * (b[0] - a[0]) > (b[1] - a[1]) * (c[0] - a[0]) - - return ccw(p1, q1, q2) != ccw(p2, q1, q2) and ccw(p1, p2, q1) != ccw(p1, p2, q2) - - for i in range(len(poly1)): - p1, p2 = poly1[i], poly1[(i + 1) % len(poly1)] - for j in range(len(poly2)): - q1, q2 = poly2[j], poly2[(j + 1) % len(poly2)] - if line_segments_intersect(p1, p2, q1, q2): - return False - - return True - - -def _create_unary_union(polygons: list[list[list[float]]]) -> list[list[list[float]]]: - """Create a unary union from a list of polygons. - - This merges the polygons by their boundaries exactly. - Most appropriate when the boundaries are touching (not disjoint). - """ - # Pure Python union implementation is non-trivial, so this is simplified: - # Merge all coordinates into a single outer ring. - all_points = chain.from_iterable(polygon[0] for polygon in polygons) - return [list(set(all_points))] diff --git a/geojson_aoi/normalize.py b/geojson_aoi/normalize.py deleted file mode 100644 index 0f8a477..0000000 --- a/geojson_aoi/normalize.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Functions to normalize a GeoJSON to FeatureCollection.""" - -from geojson_aoi.types import ( - Feature, - FeatureCollection, - Geometry, - PolygonCoords, - Properties, -) - - -def normalize_featcol(featcol: FeatureCollection) -> FeatureCollection: - """Normalize a FeatureCollection into a standardised format. 
- - The final FeatureCollection will only contain: - - Polygon - - LineString - - Point - - Processed: - - MultiPolygons will be divided out into individual polygons. - - GeometryCollections wrappers will be stripped out. - - Removes any z-dimension coordinates, e.g. [43, 32, 0.0] - - Args: - featcol: A parsed FeatureCollection. - - Returns: - FeatureCollection: A normalized FeatureCollection. - """ - for feat in featcol.get("features", []): - geom = feat.get("geometry") - if not geom or "type" not in geom: - continue # Skip invalid features - - # Strip out GeometryCollection wrappers - if ( - geom.get("type") == "GeometryCollection" - and len(geom.get("geometries", [])) == 1 - ): - feat["geometry"] = geom.get("geometries")[0] - - # Remove any z-dimension coordinates - coords = geom.get("coordinates") - if coords: - geom["coordinates"] = _remove_z_dimension(coords) - - # Convert MultiPolygon type --> individual Polygons - return _multigeom_to_singlegeom(featcol) - - -def _remove_z_dimension(coords: PolygonCoords) -> PolygonCoords: - """Recursively remove the Z dimension from coordinates.""" - if isinstance(coords[0], (list, tuple)): - # If the first element is a list, recurse into each sub-list - return [_remove_z_dimension(sub_coord) for sub_coord in coords] - else: - # If the first element is not a list, it's a coordinate pair (x, y, z) - return coords[:2] # Return only [x, y] - - -def _multigeom_to_singlegeom(featcol: FeatureCollection) -> FeatureCollection: - """Converts any Multi(xxx) geometry types to list of individual geometries. - - Args: - featcol : A GeoJSON FeatureCollection of geometries. - - Returns: - FeatureCollection: A GeoJSON FeatureCollection containing - single geometry types only: Polygon, LineString, Point. 
- """ - - def split_multigeom(geom: Geometry, properties: Properties) -> list[Feature]: - """Splits multi-geometries into individual geometries.""" - geom_type = geom["type"] - coordinates = geom["coordinates"] - - # Handle MultiPolygon correctly - if geom_type == "MultiPolygon": - return [ - { - "type": "Feature", - "geometry": {"type": "Polygon", "coordinates": polygon}, - "properties": properties, - } - for polygon in coordinates - ] - - # Handle other MultiXXX types - return [ - { - "type": "Feature", - "geometry": {"type": geom_type[5:], "coordinates": coord}, - "properties": properties, - } - for coord in coordinates - ] - - final_features = [] - - for feature in featcol.get("features", []): - properties = feature.get("properties", {}) - geom = feature.get("geometry") - if not geom or "type" not in geom: - continue - - if geom["type"].startswith("Multi"): - # Handle all MultiXXX types - final_features.extend(split_multigeom(geom, properties)) - else: - # Handle single geometry types - final_features.append(feature) - - return {"type": "FeatureCollection", "features": final_features} diff --git a/geojson_aoi/parser.py b/geojson_aoi/parser.py index fd03535..dd5cf0a 100644 --- a/geojson_aoi/parser.py +++ b/geojson_aoi/parser.py @@ -1,3 +1,20 @@ +# Copyright (c) Humanitarian OpenStreetMap Team +# This file is part of geojson-aoi-parser. +# +# geojson-aoi-parser is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# geojson-aoi-parser is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with geojson-aoi-parser. If not, see . 
+# + """Parse various AOI GeoJSON formats and normalize.""" import json @@ -5,9 +22,10 @@ import warnings from pathlib import Path -from geojson_aoi.merge import merge_polygons -from geojson_aoi.normalize import normalize_featcol -from geojson_aoi.types import FeatureCollection +from psycopg import Connection + +from geojson_aoi.postgis import PostGis +from geojson_aoi.types import Feature, FeatureCollection, GeoJSON AllowedInputTypes = [ "Polygon", @@ -20,20 +38,20 @@ log = logging.getLogger(__name__) -def check_crs(featcol: FeatureCollection) -> None: +def check_crs(geojson: GeoJSON) -> None: """Warn the user if an invalid CRS is detected. Also does a rough check for one geometry, to determine if the coordinates are range 90/180 degree range. Args: - featcol (FeatureCollection): a FeatureCollection. + geojson (GeoJSON): a GeoJSON. Returns: - FeatureCollection: a FeatureCollection. + None """ - def is_valid_crs(crs_name): + def is_valid_crs(crs_name: str) -> bool: valid_crs_list = [ "urn:ogc:def:crs:OGC:1.3:CRS84", "urn:ogc:def:crs:EPSG::4326", @@ -41,78 +59,66 @@ def is_valid_crs(crs_name): ] return crs_name in valid_crs_list - def is_valid_coordinate(coord): - if coord is None: - return False - return -180 <= coord[0] <= 180 and -90 <= coord[1] <= 90 - - if "crs" in featcol: - crs = featcol.get("crs", {}).get("properties", {}).get("name") - if not is_valid_crs(crs): - warning_msg = ( - "Unsupported coordinate system, it is recommended to use a " - "GeoJSON file in WGS84(EPSG 4326) standard." 
- ) - log.warning(warning_msg) - warnings.warn(UserWarning(warning_msg), stacklevel=2) - - features = featcol.get("features", []) - coordinates = ( - features[-1].get("geometry", {}).get("coordinates", []) if features else [] - ) + def is_valid_coordinate(coord: list[float]) -> bool: + return len(coord) == 2 and -180 <= coord[0] <= 180 and -90 <= coord[1] <= 90 - first_coordinate = None - if coordinates: - while isinstance(coordinates, list): - first_coordinate = coordinates - coordinates = coordinates[0] - - if not is_valid_coordinate(first_coordinate): + crs = geojson.get("crs", {}).get("properties", {}).get("name") + if crs and not is_valid_crs(crs): warning_msg = ( - "The coordinates within the GeoJSON file are not valid. " - "Is the file empty?" + "Unsupported coordinate system. Use WGS84 (EPSG 4326) for best results." ) log.warning(warning_msg) warnings.warn(UserWarning(warning_msg), stacklevel=2) + geom = geojson.get("geometry") or geojson.get("features", [{}])[-1].get( + "geometry", {} + ) + coordinates = geom.get("coordinates", []) + + # Drill down into nested coordinates to find the first coordinate + while isinstance(coordinates, list) and len(coordinates) > 0: + coordinates = coordinates[0] -def geojson_to_featcol(geojson_obj: dict) -> FeatureCollection: - """Enforce GeoJSON is wrapped in FeatureCollection. + if not is_valid_coordinate(coordinates): + warning_msg = "Invalid coordinates in GeoJSON. Ensure the file is not empty." + log.warning(warning_msg) + warnings.warn(UserWarning(warning_msg), stacklevel=2) - The type check is done directly from the GeoJSON to allow parsing - from different upstream libraries (e.g. geojson_pydantic). + +def strip_featcol(geojson_obj: GeoJSON | Feature | FeatureCollection) -> list[GeoJSON]: + """Remove FeatureCollection and Feature wrapping. Args: - geojson_obj (dict): a parsed geojson, to wrap in a FeatureCollection. + geojson_obj (dict): a parsed geojson. Returns: - FeatureCollection: a FeatureCollection. 
+ list[GeoJSON]: a list of geometries. """ + # FIXME possibly add logic to retain and existing properties? + + if geojson_obj.get("crs"): + # Warn the user if invalid CRS detected + check_crs(geojson_obj) + geojson_type = geojson_obj.get("type") - geojson_crs = geojson_obj.get("crs") if geojson_type == "FeatureCollection": - log.debug("Already in FeatureCollection format, reparsing") - features = geojson_obj.get("features", []) + geoms = [feature["geometry"] for feature in geojson_obj.get("features", [])] elif geojson_type == "Feature": - log.debug("Converting Feature to FeatureCollection") - features = [geojson_obj] + geoms = [geojson_obj.get("geometry")] else: - log.debug("Converting Geometry to FeatureCollection") - features = [{"type": "Feature", "geometry": geojson_obj, "properties": {}}] + geoms = [geojson_obj] - featcol = {"type": "FeatureCollection", "features": features} - if geojson_crs: - featcol["crs"] = geojson_crs - return featcol + return geoms def parse_aoi( - geojson_raw: str | bytes | dict, merge: bool = False + db: str | Connection, geojson_raw: str | bytes | dict, merge: bool = False ) -> FeatureCollection: """Parse a GeoJSON file or data struc into a normalized FeatureCollection. Args: + db (str | Connection): Existing db connection, or connection string. geojson_raw (str | bytes | dict): GeoJSON file path, JSON string, dict, or file bytes. merge (bool): If any nested Polygons / MultiPolygon should be merged. 
@@ -143,14 +149,9 @@ def parse_aoi( if geojson_parsed["type"] not in AllowedInputTypes: raise ValueError(f"The GeoJSON type must be one of: {AllowedInputTypes}") - # Convert to FeatureCollection - featcol = geojson_to_featcol(geojson_parsed) - if not featcol.get("features", []): - raise ValueError("Failed parsing geojson") - - # Warn the user if invalid CRS detected - check_crs(featcol) + # Extract from FeatureCollection + geoms = strip_featcol(geojson_parsed) - if not merge: - return normalize_featcol(featcol) - return merge_polygons(normalize_featcol(featcol)) + with PostGis(db, geoms, merge) as result: + print(result.featcol) + return result.featcol diff --git a/geojson_aoi/postgis.py b/geojson_aoi/postgis.py new file mode 100644 index 0000000..be303da --- /dev/null +++ b/geojson_aoi/postgis.py @@ -0,0 +1,169 @@ +# Copyright (c) Humanitarian OpenStreetMap Team +# This file is part of geojson-aoi-parser. +# +# geojson-aoi-parser is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# geojson-aoi-parser is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with geojson-aoi-parser. If not, see . +# +"""Wrapper around PostGIS geometry functions.""" + +import json +import logging +from uuid import uuid4 + +from psycopg import Connection, connect + +from geojson_aoi.types import GeoJSON + +log = logging.getLogger(__name__) + + +class Normalize: + """Normalise the geometry. + + - Strip z-dimension (force 2D). + - Remove geoms from GeometryCollection. + - Multi geometries to single geometries. 
+ """ + + @staticmethod + def init_table(table_id: str) -> str: + """Create the table for geometry processing.""" + return f""" + CREATE TEMP TABLE "{table_id}" ( + id SERIAL PRIMARY KEY, + geometry GEOMETRY(Polygon, 4326) + ); + """ + + @staticmethod + def insert(geoms: list[GeoJSON], table_id: str) -> str: + """Insert geometries into db, normalising where possible.""" + values = [] + for geom in geoms: + # ST_Force2D strings z-coordinates + val = ( + "ST_Force2D(ST_SetSRID(" + f"ST_GeomFromGeoJSON('{json.dumps(geom)}'), 4326))" + ) + + # ST_CollectionExtract converts any GeometryCollections + # into MultiXXX geoms + if geom.get("type") == "GeometryCollection": + val = f"ST_CollectionExtract({val})" + + # ST_Dump extracts all MultiXXX geoms to single geom equivalents + # TODO ST_Dump (complex, as it returns multiple geometries!) + + # ST_ForcePolygonCW forces clockwise orientation for + # their exterior ring + if geom.get("type") == "Polygon" or geom.get("type") == "MultiPolygon": + val = f"ST_ForcePolygonCW({val})" + + values.append(val) + + value_string = ", ".join(values) + return f""" + INSERT INTO "{table_id}" (geometry) + VALUES {value_string}; + """ + + +class Merge: + """Merge polygons. + + - MultiPolygon to a single Polygon. + - Remove interior rings from all polygons (holes). + + Automatically determine whether to use union (for overlapping polygons) + or convex hull (for disjoint polygons). + """ + + pass + # ST_UnaryUnion + # ST_ConvexHull + + +class PostGis: + """A synchronous database connection. + + Typically called standalone. + Can reuse an existing upstream connection. 
+ """ + + def __init__(self, db: str | Connection, geoms: list[GeoJSON], merge: bool = False): + """Initialise variables and compose classes.""" + self.table_id = uuid4().hex + self.geoms = geoms + self.db = db + self.featcol = None + + self.normalize = Normalize() + if merge: + self.merge = Merge() + + def __enter__(self) -> "PostGis": + """Initialise the database via context manager.""" + self.create_connection() + with self.connection.cursor() as cur: + cur.execute(self.normalize.init_table(self.table_id)) + cur.execute(self.normalize.insert(self.geoms, self.table_id)) + # if self.merge: + # cur.execute(self.merge.unary_union(self.geoms, self.table_id)) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Execute the SQL and optionally close the db connection.""" + self.close_connection() + + def create_connection(self) -> None: + """Get a new database connection.""" + # Create new connection + if isinstance(self.db, str): + self.connection = connect(self.db) + self.is_new_connection = True + # Reuse existing connection + elif isinstance(self.db, Connection): + self.connection = self.db + self.is_new_connection = False + # Else, error + else: + msg = ( + "The `db` variable is not a valid string or " + "existing psycopg connection." + ) + log.error(msg) + raise ValueError(msg) + + def close_connection(self) -> None: + """Close the database connection.""" + if not self.connection: + return + + # Execute all commands in a transaction before closing + try: + self.connection.commit() + except Exception as e: + log.error(e) + log.error("Error committing psycopg transaction to db") + finally: + # Only close the connection if it was newly created + if self.is_new_connection: + self.connection.close() + + +class PostGisAsync: + """An asynchronous database connection. + + Typically called from an async web server. + Can reuse an existing upstream connection. 
+ """ diff --git a/geojson_aoi/types.py b/geojson_aoi/types.py index bd49c50..8face2c 100644 --- a/geojson_aoi/types.py +++ b/geojson_aoi/types.py @@ -2,15 +2,7 @@ from typing import Any -# Coordinates -Coordinate = float | int -PointCoords = tuple[Coordinate, Coordinate] -PolygonCoords = list[list[PointCoords]] - -# GeoJSON -Geometry = dict[str, Any] -Properties = dict[str, Any] - -# Features +# FIXME these should be improved +GeoJSON = dict[str, Any] Feature = dict[str, Any] FeatureCollection = dict[str, Any] diff --git a/mkdocs.yml b/mkdocs.yml index d88ef65..5c2c569 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: geojson-aoi-parser -site_description: Parse and normalize a GeoJSON area of interest, using pure Python. +site_description: Parse and normalize a GeoJSON area of interest, using PostGIS. # strict: true site_url: "https://www.hotosm.org" diff --git a/pyproject.toml b/pyproject.toml index 852ae0b..5831b10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "geojson-aoi-parser" version = "0.2.0" -description = "Parse and normalize a GeoJSON area of interest, using pure Python." +description = "Parse and normalize a GeoJSON area of interest, using PostGIS." 
authors = [ {name = "Sam Woodcock", email = "sam.woodcock@hotosm.org"}, ] @@ -17,7 +17,9 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] -dependencies = [] +dependencies = [ + "psycopg>=3.1", +] [dependency-groups] test = [ diff --git a/tests/conftest.py b/tests/conftest.py index b1abfdd..973ce2b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,12 @@ import pytest +@pytest.fixture(scope="session") +def db(): + """Database URI.""" + return "postgresql://aoi:dummycipassword@db:5432/aoi" + + @pytest.fixture def polygon_geojson(): """Polygon.""" diff --git a/tests/test_parser.py b/tests/test_parser.py index 7b9663e..beb243b 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -17,38 +17,38 @@ def is_featcol_nested_polygon(geojson) -> bool: return False -def test_polygon(polygon_geojson): +def test_polygon(db, polygon_geojson): """A single Polygon.""" - result = parse_aoi(polygon_geojson) + result = parse_aoi(db, polygon_geojson) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 -def test_polygon_with_holes(polygon_holes_geojson): +def test_polygon_with_holes(db, polygon_holes_geojson): """A single Polygon with holes, should remain unchanged.""" - result = parse_aoi(polygon_holes_geojson) + result = parse_aoi(db, polygon_holes_geojson) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 # We have three rings inside polygon (1 exterior, 2 interior) assert len(result["features"][0]["geometry"]["coordinates"]) == 3 -def test_polygon_merge_with_holes(polygon_holes_geojson): +def test_polygon_merge_with_holes(db, polygon_holes_geojson): """A single Polygon with holes, where the holes should be removed.""" - result = parse_aoi(polygon_holes_geojson, merge=True) + result = parse_aoi(db, polygon_holes_geojson, merge=True) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 # As we specify 'merge', only the exterior 
ring should be remaining assert len(result["features"][0]["geometry"]["coordinates"]) == 1 -def test_z_dimension_polygon(polygon_geojson): +def test_z_dimension_polygon(db, polygon_geojson): """A single Polygon, with z-dimension coord stripped out.""" geojson_data = { "type": "Polygon", "coordinates": [[[0, 0, 0], [1, 0, 0], [1, 1, 0], [0, 1, 0], [0, 0, 0]]], } - result = parse_aoi(geojson_data) + result = parse_aoi(db, geojson_data) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 assert result == { @@ -63,21 +63,21 @@ def test_z_dimension_polygon(polygon_geojson): } -def test_feature(feature_geojson): +def test_feature(db, feature_geojson): """A Polygon nested in a Feature.""" - result = parse_aoi(feature_geojson) + result = parse_aoi(db, feature_geojson) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 -def test_feature_collection(featcol_geojson): +def test_feature_collection(db, featcol_geojson): """A Polygon nested in a Feature, inside a FeatureCollection.""" - result = parse_aoi(featcol_geojson) + result = parse_aoi(db, featcol_geojson) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 -def test_feature_collection_multiple_geoms(feature_geojson): +def test_feature_collection_multiple_geoms(db, feature_geojson): """Multiple Polygon nested in Features, inside a FeatureCollection. Intentionally no merging in this test. 
@@ -86,12 +86,12 @@ def test_feature_collection_multiple_geoms(feature_geojson): "type": "FeatureCollection", "features": [feature_geojson, feature_geojson, feature_geojson], } - result = parse_aoi(geojson_data) + result = parse_aoi(db, geojson_data) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 3 -def test_nested_geometrycollection(geomcol_geojson): +def test_nested_geometrycollection(db, geomcol_geojson): """A GeometryCollection nested inside a FeatureCollection.""" geojson_data = { "type": "FeatureCollection", @@ -103,12 +103,12 @@ def test_nested_geometrycollection(geomcol_geojson): } ], } - result = parse_aoi(geojson_data) + result = parse_aoi(db, geojson_data) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 -def test_multiple_nested_geometrycollection(geomcol_geojson): +def test_multiple_nested_geometrycollection(db, geomcol_geojson): """Multiple GeometryCollection nested inside a FeatureCollection.""" geojson_data = { "type": "FeatureCollection", @@ -125,7 +125,7 @@ def test_multiple_nested_geometrycollection(geomcol_geojson): }, ], } - result = parse_aoi(geojson_data) + result = parse_aoi(db, geojson_data) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 2 @@ -138,12 +138,12 @@ def test_multiple_nested_geometrycollection(geomcol_geojson): # "geometries": [polygon_geojson, polygon_geojson, polygon_geojson], # } -# result = parse_aoi(geojson_data) +# result = parse_aoi(db, geojson_data) # assert is_featcol_nested_polygon(result) # assert len(result["features"]) == 3 -def test_featcol_merge_multiple_polygons(): +def test_featcol_merge_multiple_polygons(db): """Merge multiple polygons inside a FeatureCollection.""" geojson_data = { "type": "FeatureCollection", @@ -166,12 +166,12 @@ def test_featcol_merge_multiple_polygons(): }, ], } - result = parse_aoi(geojson_data, merge=True) + result = parse_aoi(db, geojson_data, merge=True) assert is_featcol_nested_polygon(result) assert 
len(result["features"]) == 1 -def test_featcol_no_merge_polygons(): +def test_featcol_no_merge_polygons(db): """Do not merge multiple polygons inside a FeatureCollection.""" geojson_data = { "type": "FeatureCollection", @@ -194,14 +194,14 @@ def test_featcol_no_merge_polygons(): }, ], } - result = parse_aoi(geojson_data) + result = parse_aoi(db, geojson_data) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 2 -def test_merge_multipolygon(multipolygon_geojson): +def test_merge_multipolygon(db, multipolygon_geojson): """Merge multiple polygons inside a MultiPolygon.""" - result = parse_aoi(multipolygon_geojson, merge=True) + result = parse_aoi(db, multipolygon_geojson, merge=True) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 @@ -211,45 +211,45 @@ def test_merge_multipolygon(multipolygon_geojson): # assert False -def test_multipolygon_no_merge(multipolygon_geojson): +def test_multipolygon_no_merge(db, multipolygon_geojson): """Do not merge multiple polygons inside a MultiPolygon.""" - result = parse_aoi(multipolygon_geojson) + result = parse_aoi(db, multipolygon_geojson) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 3 -def test_multipolygon_with_holes(multipolygon_holes_geojson): +def test_multipolygon_with_holes(db, multipolygon_holes_geojson): """MultiPolygon --> Polygon, with holes remaining.""" # FIXME this should not removed the holes from the polygon geom # FIXME Instead the polygon should simply be extrated from the MultiPolygon # FIXME (we only remove holes if merge=True) - result = parse_aoi(multipolygon_holes_geojson) + result = parse_aoi(db, multipolygon_holes_geojson) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 3 -def test_multipolygon_with_holes_merged(multipolygon_holes_geojson): +def test_multipolygon_with_holes_merged(db, multipolygon_holes_geojson): """Merge multipolygon, including holes.""" - result = 
parse_aoi(multipolygon_holes_geojson, merge=True) + result = parse_aoi(db, multipolygon_holes_geojson, merge=True) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 -def test_invalid_input(): +def test_invalid_input(db): """Invalud input for parse_aoi function.""" with pytest.raises( ValueError, match="GeoJSON input must be a valid dict, str, or bytes" ): - parse_aoi(123) + parse_aoi(db, 123) with pytest.raises(ValueError, match="Provided GeoJSON is empty"): - parse_aoi("{}") + parse_aoi(db, "{}") with pytest.raises(ValueError, match="The GeoJSON type must be one of:"): - parse_aoi({"type": "Point"}) + parse_aoi(db, {"type": "Point"}) -def test_file_input(tmp_path): +def test_file_input(db, tmp_path): """GeoJSON file input for parse_aoi function.""" geojson_file = tmp_path / "test.geojson" geojson_data = { @@ -267,12 +267,12 @@ def test_file_input(tmp_path): } geojson_file.write_text(json.dumps(geojson_data)) - result = parse_aoi(str(geojson_file)) + result = parse_aoi(db, str(geojson_file)) assert is_featcol_nested_polygon(result) assert len(result["features"]) == 1 -def test_no_warnings_valid_crs(): +def test_no_warnings_valid_crs(db): """Test including a valid CRS.""" geojson_data = { "type": "FeatureCollection", @@ -293,7 +293,7 @@ def test_no_warnings_valid_crs(): } with warnings.catch_warnings(record=True) as recorded_warnings: - result = parse_aoi(geojson_data) + result = parse_aoi(db, geojson_data) if recorded_warnings: raise AssertionError( f"Warnings should not be raised here: {recorded_warnings[0].message}" @@ -303,7 +303,7 @@ def test_no_warnings_valid_crs(): assert len(result["features"]) == 1 -def test_warnings_raised_invalid_crs(): +def test_warnings_raised_invalid_crs(db): """Test including an invalid CRS, raising warnings.""" geojson_data = { "type": "FeatureCollection", @@ -320,10 +320,10 @@ def test_warnings_raised_invalid_crs(): "crs": {"type": "name", "properties": {"name": "invalid!!"}}, } with 
pytest.warns(UserWarning): - parse_aoi(geojson_data) + parse_aoi(db, geojson_data) -def test_warnings_raised_invalid_coords(): +def test_warnings_raised_invalid_coords(db): """Test including an invalid coordinates, raising warnings.""" geojson_data = { "type": "FeatureCollection", @@ -340,4 +340,4 @@ def test_warnings_raised_invalid_coords(): "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::4326"}}, } with pytest.warns(UserWarning): - parse_aoi(geojson_data) + parse_aoi(db, geojson_data) diff --git a/uv.lock b/uv.lock index 7911b69..d7f3110 100644 --- a/uv.lock +++ b/uv.lock @@ -114,6 +114,9 @@ wheels = [ name = "geojson-aoi-parser" version = "0.2.0" source = { editable = "." } +dependencies = [ + { name = "psycopg" }, +] [package.dev-dependencies] docs = [ @@ -128,6 +131,7 @@ test = [ ] [package.metadata] +requires-dist = [{ name = "psycopg", specifier = ">=3.1" }] [package.metadata.requires-dev] docs = [ @@ -478,6 +482,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] +[[package]] +name = "psycopg" +version = "3.2.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "tzdata", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/ad/7ce016ae63e231575df0498d2395d15f005f05e32d3a2d439038e1bd0851/psycopg-3.2.3.tar.gz", hash = "sha256:a5764f67c27bec8bfac85764d23c534af2c27b893550377e37ce59c12aac47a2", size = 155550 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/21/534b8f5bd9734b7a2fcd3a16b1ee82ef6cad81a4796e95ebf4e0c6a24119/psycopg-3.2.3-py3-none-any.whl", hash = "sha256:644d3973fe26908c73d4be746074f6e5224b03c1101d302d9a53bf565ad64907", size = 197934 }, +] + 
[[package]] name = "pygments" version = "2.19.1" @@ -735,6 +752,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, ] +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + +[[package]] +name = "tzdata" +version = "2024.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/34/943888654477a574a86a98e9896bae89c7aa15078ec29f490fef2f1e5384/tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc", size = 193282 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, +] + [[package]] name = "urllib3" version = "2.3.0"