datasciencecampus · CBROWN-ONS · Jun 25, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 6, 2024
diff --git a/docs/tutorials/osm/index.qmd b/docs/tutorials/osm/index.qmd
@@ -128,7 +128,7 @@ xmin, ymin, xmax, ymax format. Call the list `BBOX_LIST`.
 BBOX_LIST = [<INSERT_VALUES_HERE>]
 ```
 
-### Exercise
+### Solution
 
 ```{python}
 BBOX_LIST = [-3.002175, 51.587035, -2.994271, 51.59095]
@@ -353,6 +353,14 @@ To read more on `osmosis` filtering strategies, refer to the `completeWays` and
 `completeRelations` flag descriptions in the
 [Osmosis detailed usage documentation](https://wiki.openstreetmap.org/wiki/Osmosis/Detailed_Usage_0.48).
 
+
+Note that additional metadata can be added to the map tooltips by setting `include_tags=True`. Adding this rich contextual data to the map can be useful but is also computationally expensive. This operation should be avoided for large OSM files, for example anything over 50 KB, which is the size above which `FindTags` raises a `PerformanceWarning`.
+
+```{python}
+loc_finder.plot_ids(id_finder.id_dict["way_ids"], feature_type="way", include_tags=True)
+
+```
+
 ## Conclusion
 
 Congratulations, you have successfully completed this tutorial on OpenStreetMap

diff --git a/requirements.txt b/requirements.txt
@@ -28,11 +28,12 @@ seaborn
 haversine
 pretty_html_table
 kaleido
-numpy>=1.25.0 # test suite will fail if user installed lower than this
+numpy==1.26.4 # pinned to avoid "ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject"
 dask[dataframe]
-pyarrow >= 14.0.1 # 14.0.0 has security vulnerability
+pyarrow>=14.0.1 # 14.0.0 has security vulnerability
 osmium # has dependencies on `cmake` and `boost` which require brew install
 tqdm
 quartodoc
 jupyterlab
+xyzservices
 -e .
diff --git a/src/transport_performance/osm/validate_osm.py b/src/transport_performance/osm/validate_osm.py
@@ -18,6 +18,8 @@
 * Find coordinates for node or way features
 * Plot the coordinates of a given list of node or way IDs
 """
+import os
+import warnings
 from pathlib import Path
 from typing import Union
 
@@ -26,6 +28,7 @@
 import osmium
 import pandas as pd
 from shapely import Point
+from xyzservices import TileProvider
 
 from transport_performance.utils.defence import (
     _check_item_in_iter,
@@ -37,6 +40,12 @@
 # ---------utilities-----------
 
 
+class PerformanceWarning(Warning):
+    """Warning category flagging an operation that may be slow."""
+
+    # Deliberately empty: the docstring is a sufficient class body.
+
+
 def _compile_tags(osmium_feature):
     """Return tag name value pairs.
 
@@ -532,6 +541,9 @@ class FindTags:
         Tags found for OSM relation features.
     __area_tags : dict
         Tags found for OSM area features.
+    __LARGE_FILE_THRESH : int
+        Size threshold in bytes. A PerformanceWarning is raised when the
+        osm file on disk exceeds this size.
 
     """
 
@@ -541,6 +553,16 @@ def __init__(
         _is_expected_filetype(
             osm_pth, "osm_pth", check_existing=True, exp_ext=".pbf"
         )
+        self.__LARGE_FILE_THRESH = 50000  # 50 KB
+        # implement performance warning on large OSM files.
+        osm_size = os.path.getsize(osm_pth)
+        if osm_size > self.__LARGE_FILE_THRESH:
+            warnings.warn(
+                f"PBF file is {osm_size} bytes. Tag operations are expensive."
+                " Consider filtering the pbf file smaller than"
+                f" {self.__LARGE_FILE_THRESH} bytes",
+                PerformanceWarning,
+            )
         tags = tag_collator()
         classnm = tags.__class__.__name__
         if classnm != "_TagHandler":
@@ -614,6 +636,9 @@ class FindLocations:
         Locations of nodes.
     __way_node_locs : dict
         Locations of nodes that belong to a way.
+    _osm_pth : Union[Path, str]
+        Path to osm file on disk. Used for method plot_ids() when include_tags
+        is True.
 
     """
 
@@ -630,6 +655,7 @@ def __init__(
         self.__node_locs = locs.node_locs
         self.__way_node_locs = locs.way_node_locs
         self.found_locs = dict()
+        self._osm_pth = osm_pth
 
     def _check_is_implemented(self, user_feature: str, param_nm: str) -> None:
         """If the requested feature is not node or way, raise."""
@@ -678,11 +704,123 @@ def check_locs_for_ids(self, ids: list, feature_type: str) -> dict:
         )
         return self.found_locs
 
+    def _merge_dicts_retain_dupe_keys(
+        self, dict1: dict, dict2: dict, prepend_pattern: str = "parent_"
+    ) -> dict:
+        """Merge parent tags into each child's tags, keeping duplicated keys.
+
+        Parameters
+        ----------
+        dict1 : dict
+            Nested dictionary of (child or node) tags, keyed by ID.
+        dict2 : dict
+            Parent tags. Keys clashing with dict1 tags are renamed in place.
+        prepend_pattern : str
+            Prefix given to any dict2 key that clashes, by default "parent_".
+
+        Returns
+        -------
+        dict
+            One entry per dict1 ID: that ID's tags merged with dict2.
+
+        Raises
+        ------
+        TypeError
+            If dict1 or dict2 is not a dictionary.
+
+        """
+        for arg in (dict1, dict2):
+            if not isinstance(arg, dict):
+                raise TypeError(f"Expected dict but found {type(arg)}: {arg}")
+        merged = {}
+        for child_id, child_tags in dict1.items():
+            clashes = set(child_tags.keys()).intersection(dict2.keys())
+            for clash in clashes:  # rename on dict2 so both values survive
+                dict2[f"{prepend_pattern}{clash}"] = dict2.pop(clash)
+            merged[child_id] = child_tags | dict2
+        return merged
+
+    def _add_tag_context_to_coord_gdf(  # noqa: C901
+        self, ids: list, feature_type: str, tooltip_nm: str
+    ) -> None:
+        """Add a column of HTML tooltips to the coord_gdf attribute.
+
+        The column is seeded with index values, then mapped to tag HTML.
+
+        Parameters
+        ----------
+        ids : list
+            IDs to look up with `self.tagfinder.check_tags_for_ids()`.
+        feature_type : str
+            "way" (member node tags merged with way tags) or "node".
+        tooltip_nm : str
+            Name of the column to use for the tooltips.
+
+        Returns
+        -------
+        None
+            Updates `coord_gdf` attribute.
+
+        """
+        mapping = {}
+        parent_tags = self.tagfinder.check_tags_for_ids(ids, feature_type)
+        self.coord_gdf[tooltip_nm] = self.coord_gdf.index.to_list()
+        if feature_type == "way":
+            parent_child_mapping = self.coord_gdf.index
+            # last element of each index entry is a member node ID: get tags
+            child_tags = self.tagfinder.check_tags_for_ids(
+                [i[-1] for i in parent_child_mapping], feature_type="node"
+            )
+            # record the parent way ID in each child's tags as "parent_id"
+            for k, v in child_tags.items():
+                for t in parent_child_mapping.to_flat_index():
+                    if k == t[-1]:
+                        v["parent_id"] = t[0]
+            # placeholder dict keyed by index entry; values are replaced
+            # below with the merged parent way and child node tags
+            all_tags = parent_child_mapping.to_series().to_dict()
+            for k, v in parent_tags.items():
+                # iterate over only the children of each parent way
+                for id_ in [i for i in parent_child_mapping if i[0] == k]:
+                    all_tags[id_] = self._merge_dicts_retain_dupe_keys(
+                        {id_[-1]: child_tags[id_[-1]]}, v
+                    )
+            # add combined tags as custom tooltips to coord_gdf. Use map
+            # method to avoid lexsort performance warning
+            for _, v in all_tags.items():
+                for k, val in v.items():
+                    tooltips = [
+                        f"<b>{tag}:</b> {val_}<br>"
+                        for tag, val_ in val.items()
+                    ]
+                    mapping[(val["parent_id"], k)] = "".join(tooltips)
+
+        elif feature_type == "node":
+            for k, val in self.tagfinder.found_tags.items():
+                tooltips = [
+                    f"<b>{tag}:</b> {val_}<br>" for tag, val_ in val.items()
+                ]
+                mapping[k] = "".join(tooltips)
+        # swap each seeded index value for its tooltip string
+        self.coord_gdf[tooltip_nm] = self.coord_gdf[tooltip_nm].map(mapping)
+        return None
+
     def plot_ids(
         self,
         ids: list,
         feature_type: str,
         crs: Union[str, int] = "epsg:4326",
+        include_tags: bool = False,
+        tooltip_nm: str = "custom_tooltip",
+        tooltip_kwds: dict = {"labels": False},
+        tiles: str = "CartoDB positron",
+        style_kwds: dict = {
+            "color": "#3f5277",
+            "fill": True,
+            "fillOpacity": 0.3,
+            "fillColor": "#3f5277",
+            "weight": 4,
+        },
     ) -> folium.Map:
         """Plot coordinates for nodes or node members of a way.
 
@@ -698,6 +836,26 @@ def plot_ids(
             Whether the type of OSM feature to plot is node or way.
         crs : Union[str, int], optional
             The projection of the spatial features, by default "epsg:4326"
+        include_tags : bool
+            Should tag metadata be included in the map tooltips, by default
+            False
+        tooltip_nm : str
+            Name to use for tooltip column in coord_gdf attribute, by default
+            "custom_tooltip"
+        tooltip_kwds : dict
+            Additional tooltip styling arguments to pass to gpd explore(), by
+            default {"labels": False}
+        tiles : Union[str, xyzservices.TileProvider]
+            Basemap provider tiles to use, by default "CartoDB positron"
+        style_kwds : dict
+            Additional map styling arguments to pass to gpd explore(), by
+            default {
+                "color": "#3f5277",
+                "fill": True,
+                "fillOpacity": 0.3,
+                "fillColor": "#3f5277",
+                "weight": 4,
+                }
 
         Returns
         -------
@@ -719,6 +877,11 @@ def plot_ids(
         _type_defence(ids, "ids", list)
         _type_defence(feature_type, "feature_type", str)
         _type_defence(crs, "crs", (str, int))
+        _type_defence(include_tags, "include_tags", bool)
+        _type_defence(tooltip_nm, "tooltip_nm", str)
+        _type_defence(tooltip_kwds, "tooltip_kwds", dict)
+        _type_defence(tiles, "tiles", (str, TileProvider))
+        _type_defence(style_kwds, "style_kwds", dict)
         self._check_is_implemented(
             user_feature=feature_type, param_nm="feature_type"
         )
@@ -728,4 +891,19 @@ def plot_ids(
             feature_type=feature_type,
             crs=crs,
         )
-        return self.coord_gdf.explore()
+        if not include_tags:
+            imap = self.coord_gdf.explore(tiles=tiles, style_kwds=style_kwds)
+        else:
+            # retrieve tags for IDs and add them to self.coord_gdf
+            self.tagfinder = FindTags(self._osm_pth)
+            self._add_tag_context_to_coord_gdf(
+                ids, feature_type, tooltip_nm=tooltip_nm
+            )
+            imap = self.coord_gdf.explore(
+                tooltip=tooltip_nm,
+                tooltip_kwds=tooltip_kwds,
+                tiles=tiles,
+                style_kwds=style_kwds,
+            )
+
+        return imap
diff --git a/tests/osm/test_validate_osm.py b/tests/osm/test_validate_osm.py
@@ -11,6 +11,7 @@
     FindIds,
     FindLocations,
     FindTags,
+    PerformanceWarning,
     _convert_osm_dict_to_gdf,
     _filter_target_dict_with_list,
 )
@@ -321,6 +322,19 @@ def test_find_locations_init(self, _tiny_osm_locs):
             way_len == 2
         ), f"Expected way with length 2, instead found {way_len}"
 
+    def test__merge_dicts_retain_dupe_keys_raises(self, _tiny_osm_locs):
+        """Assert a non-dict argument raises a descriptive TypeError."""
+        # dict2 is deliberately a list so the type check must reject it
+        not_a_dict = ["not a key", 2]
+        # escape the message as it contains regex metacharacters
+        expected_msg = re.escape(
+            "Expected dict but found <class 'list'>: ['not a key', 2]"
+        )
+        with pytest.raises(TypeError, match=expected_msg):
+            _tiny_osm_locs._merge_dicts_retain_dupe_keys(
+                dict1={"some_key": 1}, dict2=not_a_dict
+            )
+
     def test_check_locs_for_ids(self, _tiny_osm_locs, _tiny_osm_ids):
         """Assert check_locs_for_ids."""
         ids = _tiny_osm_ids
@@ -368,8 +382,42 @@ def test_plot_ids_on_pass(self, _tiny_osm_locs, _tiny_osm_ids):
             ids=ids._FindIds__node_ids[0:1], feature_type="node"
         )
         assert isinstance(plt, folium.Map)
+        plt = locs.plot_ids(
+            ids=ids._FindIds__node_ids[0:1],
+            feature_type="node",
+            include_tags=True,
+        )
+        assert isinstance(plt, folium.Map)
+        # check the tag column is as expected - for nodes, this example should
+        # be empty, nodes often contain no tags, but not always
+        pd.testing.assert_series_equal(
+            locs.coord_gdf["custom_tooltip"],
+            pd.Series([""], index=[7727955], name="custom_tooltip"),
+        )
+        assert locs.coord_gdf["custom_tooltip"].values == [""]
         plt = locs.plot_ids(ids=ids._FindIds__way_ids[0:1], feature_type="way")
         assert isinstance(plt, folium.Map)
+        plt = locs.plot_ids(
+            ids=ids._FindIds__way_ids[0:1],
+            feature_type="way",
+            include_tags=True,
+        )
+        # check the tag column is as expected - for ways, these should always
+        # include at least the parent_id tag.
+        pd.testing.assert_series_equal(
+            locs.coord_gdf["custom_tooltip"],
+            pd.Series(
+                [
+                    "<b>crossing:</b> marked<br><b>highway:</b> crossing<br><b>tactile_paving:</b> yes<br><b>parent_id:</b> 4811009<br><b>lanes:</b> 2<br><b>name:</b> Kingsway<br><b>oneway:</b> yes<br><b>postal_code:</b> NP20<br><b>ref:</b> A4042<br><b>parent_highway:</b> primary<br>",  # noqa E501
+                    "<b>parent_id:</b> 4811009<br><b>lanes:</b> 2<br><b>name:</b> Kingsway<br><b>oneway:</b> yes<br><b>postal_code:</b> NP20<br><b>ref:</b> A4042<br><b>parent_highway:</b> primary<br>",  # noqa E501
+                ],
+                index=pd.MultiIndex.from_tuples(
+                    [(4811009, 7447008812), (4811009, 443158788)],
+                    names=["parent_id", "member_id"],
+                ),
+                name="custom_tooltip",
+            ),
+        )
 
     def test_plot_ids_not_implemented(self, _tiny_osm_locs):
         """Assert asking for relation or area riases not implemented error."""
@@ -419,6 +467,18 @@ def test_find_tags_init(self, _tiny_osm_tags):
         ]
         _class_atttribute_assertions(tags, expected_attrs, expected_methods)
 
+    @pytest.mark.runexpensive
+    def test_find_tags_init_warning(self):
+        """Assert FindTags warns when given a large OSM file.
+
+        Marked runexpensive as execution duration is c.80 seconds.
+        """
+        # the full Newport extract is expected to exceed the size threshold
+        big_pbf = here("tests/data/newport-2023-06-13.osm.pbf")
+        msg = ".*Consider filtering the pbf file smaller than 50000 bytes"
+        with pytest.warns(PerformanceWarning, match=msg):
+            FindTags(big_pbf)
+
     def test_find_tags_check_tags_for_ids(self, _tiny_osm_tags, _tiny_osm_ids):
         """Test FindTags.check_tags_for_ids()."""
         ids = _tiny_osm_ids