From 818c3646beadc17c85e43b1a8b8a229198749c26 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 09:56:41 -0400 Subject: [PATCH 01/38] Script to update the namning convention of eo tif files --- scripts/20220418_renaming.py | 51 ++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 scripts/20220418_renaming.py diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py new file mode 100644 index 0000000..80a7825 --- /dev/null +++ b/scripts/20220418_renaming.py @@ -0,0 +1,51 @@ +""" +A script to shift the naming convention of EO data +from a _ format to a _ +format. +This decouples the EO data from the labels.geojson +""" +from pathlib import Path +import shutil +import geopandas + +from cropharvest.columns import RequiredColumns +from cropharvest.eo.eo import EarthEngineExporter +from cropharvest.eo.ee_boundingbox import EEBoundingBox +from cropharvest.utils import DATAFOLDER_PATH + + +SURROUNDING_METRES = 80 + + +def construct_new_name(labels: geopandas.GeoDataFrame, old_name: str) -> str: + + identifier = old_name.split("_")[0] + relevant_rows = labels[labels["export_identifier"] == identifier] + assert len(relevant_rows) == 1 + row = relevant_rows.iloc[0] + + # make a bounding box + ee_bbox = EEBoundingBox.from_centre( + mid_lat=row[RequiredColumns.LAT], + mid_lon=row[RequiredColumns.LON], + surrounding_metres=SURROUNDING_METRES, + ) + export_identifier = EarthEngineExporter.make_identifier( + ee_bbox, row["start_date"], row["end_date"] + ) + + return f"{export_identifier}.tif" + + +def copy_and_rename_dataset(org_folder: Path, new_folder: Path): + + original_tif_files = list(org_folder.glob("*.tif")) + labels = EarthEngineExporter.load_default_labels() + + for tif_file in original_tif_files: + new_name = construct_new_name(labels, tif_file.name) + shutil.copy(tif_file, new_folder / new_name) + + +if __name__ == "__main__": + copy_and_rename_dataset(DATAFOLDER_PATH / "eo_data", DATAFOLDER_PATH / "renamed_eo_data") From e09fc93cbc127c2daa7705879d63319b5ae57c81 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 10:23:38 -0400 Subject: [PATCH 02/38] Make the output folder --- scripts/20220418_renaming.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py index 80a7825..d4526b7 100644 --- a/scripts/20220418_renaming.py +++ b/scripts/20220418_renaming.py @@ -48,4 +48,6 @@ def copy_and_rename_dataset(org_folder: Path, new_folder: Path): if __name__ == "__main__": - copy_and_rename_dataset(DATAFOLDER_PATH / "eo_data", DATAFOLDER_PATH / "renamed_eo_data") + output_folder = DATAFOLDER_PATH / "renamed_eo_data" + output_folder.mkdir(exist_ok=True) + copy_and_rename_dataset(DATAFOLDER_PATH / "eo_data", output_folder) From d8ce1a2dafb56da62a43ec858d4405f3d5a5c0f9 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 10:24:59 -0400 Subject: [PATCH 03/38] Track progress using tqdm --- scripts/20220418_renaming.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py index d4526b7..a167871 100644 --- a/scripts/20220418_renaming.py +++ b/scripts/20220418_renaming.py @@ -7,6 +7,7 @@ from pathlib import Path import shutil import geopandas +from tqdm import tqdm from cropharvest.columns import RequiredColumns from cropharvest.eo.eo import EarthEngineExporter @@ -42,7 +43,7 @@ def copy_and_rename_dataset(org_folder: Path, new_folder: Path): original_tif_files = list(org_folder.glob("*.tif")) labels = EarthEngineExporter.load_default_labels() - for tif_file in original_tif_files: + for tif_file in tqdm(original_tif_files): new_name = construct_new_name(labels, tif_file.name) shutil.copy(tif_file, new_folder / new_name) From 78f326bc99497a248f255a8674a4365c2f577a05 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 10:26:48 -0400 Subject: [PATCH 04/38] Add the right arguments to load_default_labels --- scripts/20220418_renaming.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py index a167871..6a965f9 100644 --- a/scripts/20220418_renaming.py +++ b/scripts/20220418_renaming.py @@ -41,7 +41,9 @@ def construct_new_name(labels: geopandas.GeoDataFrame, old_name: str) -> str: def copy_and_rename_dataset(org_folder: Path, new_folder: Path): original_tif_files = list(org_folder.glob("*.tif")) - labels = EarthEngineExporter.load_default_labels() + labels = EarthEngineExporter.load_default_labels( + dataset=None, start_from_last=False, checkpoint=None + ) for tif_file in tqdm(original_tif_files): new_name = construct_new_name(labels, tif_file.name) From f27e94df0b158a33e768e20de0deed9de1dc2f4c Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 10:43:00 -0400 Subject: [PATCH 05/38] Remove conflicting column name --- cropharvest/columns.py | 2 +- cropharvest/eo/eo.py | 2 +- test/process_labels/test_datasets.py | 4 ++-- test/process_labels/test_process_labels_utils.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cropharvest/columns.py b/cropharvest/columns.py index a04b282..852daab 100644 --- a/cropharvest/columns.py +++ b/cropharvest/columns.py @@ -18,7 +18,7 @@ def date_columns(cls) -> List[str]: class RequiredColumns(Columns): - INDEX = "index" + INDEX = "dataset_index" IS_CROP = "is_crop" LAT = "lat" LON = "lon" diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py index 2290373..a6461c2 100644 --- a/cropharvest/eo/eo.py +++ b/cropharvest/eo/eo.py @@ -144,7 +144,7 @@ def load_default_labels( - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS) ) labels = labels.assign( - export_identifier=lambda x: f"{x['index']}-{x[RequiredColumns.DATASET]}" + export_identifier=lambda x: f"{x[RequiredColumns.INDEX]}-{x[RequiredColumns.DATASET]}" ) if dataset: labels = labels[labels.dataset == dataset] diff --git a/test/process_labels/test_datasets.py b/test/process_labels/test_datasets.py index df190f2..8dfc8cd 100644 --- a/test/process_labels/test_datasets.py +++ b/test/process_labels/test_datasets.py @@ -17,7 +17,7 @@ def _check_columns_and_types(df: geopandas.GeoDataFrame) -> None: for expected_column, expected_type in [ - ("index", int), + ("dataset_index", int), ("lat", float), ("lon", float), ("is_crop", int), @@ -50,7 +50,7 @@ def _check_export_end_date(df: geopandas.GeoDataFrame) -> None: def _check_index(df: geopandas.GeoDataFrame) -> None: - assert len(df["index"].unique()) == len(df) + assert len(df["dataset_index"].unique()) == len(df) def _check_labels(df: geopandas.GeoDataFrame) -> None: diff --git a/test/process_labels/test_process_labels_utils.py b/test/process_labels/test_process_labels_utils.py index 3a788c7..8d51b27 100644 --- a/test/process_labels/test_process_labels_utils.py +++ b/test/process_labels/test_process_labels_utils.py @@ -8,7 +8,7 @@ def test_is_test_column(): labels = geopandas.GeoDataFrame( data={ "dataset": ["togo-eval", "geowiki", "geowiki"], - "index": [1, 2, 3], + "dataset_index": [1, 2, 3], "lat": [7.5817201079726511, -12.17, 1.11], "lon": [1.3954393874414535, -45.8, 8.29], } From 674944cf73d915e348fb33b4753f202c40adb19a Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 11:46:28 -0400 Subject: [PATCH 06/38] Replace 'index' string with RequiredColumns.INDEX --- process_labels/loading_funcs/central_asia.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/process_labels/loading_funcs/central_asia.py b/process_labels/loading_funcs/central_asia.py index f7456ca..8c88f26 100644 --- a/process_labels/loading_funcs/central_asia.py +++ b/process_labels/loading_funcs/central_asia.py @@ -65,7 +65,7 @@ def make_harvest_date(row) -> datetime: # two manual changes to replace multipolygons with polygons. # the first polygon is 10^5 times smaller than the second, so # we use the second - df.loc[df["index"] == 5162, "geometry"] = df.iloc[5162].geometry[1] - df.loc[df["index"] == 4049, "geometry"] = df.iloc[4049].geometry[1] + df.loc[df[RequiredColumns.INDEX] == 5162, "geometry"] = df.iloc[5162].geometry[1] + df.loc[df[RequiredColumns.INDEX] == 4049, "geometry"] = df.iloc[4049].geometry[1] return df From 3f529e1c563126a03bf10ae307cc8e8c79c5d4a3 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 11:50:49 -0400 Subject: [PATCH 07/38] Replace 'index' string with RequiredColumns.INDEX --- process_labels/loading_funcs/kenya.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/process_labels/loading_funcs/kenya.py b/process_labels/loading_funcs/kenya.py index 26b3619..d0b57b1 100644 --- a/process_labels/loading_funcs/kenya.py +++ b/process_labels/loading_funcs/kenya.py @@ -72,7 +72,7 @@ def load_kenya(): df = pd.concat(dfs) df = df.reset_index(drop=True) - df["index"] = df.index + df[RequiredColumns.INDEX] = df.index return df From ddaab9c019a49e20ba052b6603e44fbdd4f7ba58 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 19 Apr 2022 12:04:42 -0400 Subject: [PATCH 08/38] Fix assignment of export_identifier when loading default labels --- cropharvest/eo/eo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py index a6461c2..4c2e210 100644 --- a/cropharvest/eo/eo.py +++ b/cropharvest/eo/eo.py @@ -144,7 +144,9 @@ def load_default_labels( - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS) ) labels = labels.assign( - export_identifier=lambda x: f"{x[RequiredColumns.INDEX]}-{x[RequiredColumns.DATASET]}" + export_identifier=labels[RequiredColumns.INDEX].map(str) + + "-" + + labels[RequiredColumns.DATASET] ) if dataset: labels = labels[labels.dataset == dataset] From 171bb73a139199d6c95749a37e82be1bda59e0a9 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 09:11:57 -0400 Subject: [PATCH 09/38] Add a script to test the renaming has happened correctly --- scripts/20220420_check_renaming.py | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 scripts/20220420_check_renaming.py diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py new file mode 100644 index 0000000..6c31541 --- /dev/null +++ b/scripts/20220420_check_renaming.py @@ -0,0 +1,53 @@ +""" +After 20220418_renaming.py was run, +this script was used to check that +the renaming happened correctly +""" +from pathlib import Path +import xarray as xr +import re +import numpy as np +from random import shuffle +from tqdm import tqdm + +from cropharvest.countries import BBox +from cropharvest.utils import DATAFOLDER_PATH + + +def _bbox_from_filepath(p: Path) -> BBox: + """ + https://github.com/nasaharvest/crop-mask/blob/master/src/ETL/boundingbox.py#L24 + """ + decimals_in_p = re.findall(r"=-?\d*\.?\d*", p.stem) + coords = [float(d[1:]) for d in decimals_in_p[0:4]] + return BBox(min_lat=coords[0], min_lon=coords[1], max_lat=coords[2], max_lon=coords[3]) + + +def isin(x: np.ndarray, val: float) -> bool: + return (val >= x.min()) & (val <= x.max()) + + +def check_file(path: Path) -> None: + + # extract expected info from the path + bbox = _bbox_from_filepath(path) + tif_file = xr.open_rasterio(path) + + # x is lon, y is lat + lat, lon = bbox.get_centre(in_radians=False) + x, y = tif_file.x.values, tif_file.y.values + + assert isin(lon, x) & isin(lat, y) + + +def main(renamed_path: Path, num_to_check: int = 1000): + + all_files = list(renamed_path.glob("*.tif")) + shuffle(all_files) + + for path_to_check in tqdm(all_files[:num_to_check]): + check_file(path_to_check) + + +if __name__ == "__main__": + main(Path(DATAFOLDER_PATH / "renamed_eo_data")) From 50bf35e7cab02d6d2292ca199867811df6739fc2 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 09:13:07 -0400 Subject: [PATCH 10/38] Correct argument ordering in the renaming script --- scripts/20220420_check_renaming.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py index 6c31541..e3b66a1 100644 --- a/scripts/20220420_check_renaming.py +++ b/scripts/20220420_check_renaming.py @@ -37,7 +37,7 @@ def check_file(path: Path) -> None: lat, lon = bbox.get_centre(in_radians=False) x, y = tif_file.x.values, tif_file.y.values - assert isin(lon, x) & isin(lat, y) + assert isin(x, lon) & isin(y, lat) def main(renamed_path: Path, num_to_check: int = 1000): From 52b0cc38cabc0a631d43c8bf73da34b0ef79fe1d Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 09:14:31 -0400 Subject: [PATCH 11/38] Add information to the assert statement --- scripts/20220420_check_renaming.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py index e3b66a1..a183fdb 100644 --- a/scripts/20220420_check_renaming.py +++ b/scripts/20220420_check_renaming.py @@ -37,7 +37,7 @@ def check_file(path: Path) -> None: lat, lon = bbox.get_centre(in_radians=False) x, y = tif_file.x.values, tif_file.y.values - assert isin(x, lon) & isin(y, lat) + assert isin(x, lon) & isin(y, lat), f"{path} failed with {x}, {y} and {lat}, {lon}" def main(renamed_path: Path, num_to_check: int = 1000): From eff87864f2ddd3d733bcf4d861056a7c55952d11 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 09:18:08 -0400 Subject: [PATCH 12/38] Keep track of failures --- scripts/20220420_check_renaming.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py index a183fdb..635bd2a 100644 --- a/scripts/20220420_check_renaming.py +++ b/scripts/20220420_check_renaming.py @@ -45,8 +45,15 @@ def main(renamed_path: Path, num_to_check: int = 1000): all_files = list(renamed_path.glob("*.tif")) shuffle(all_files) + failed = 0 for path_to_check in tqdm(all_files[:num_to_check]): - check_file(path_to_check) + try: + check_file(path_to_check) + except AssertionError as e: + print(e) + failed += 1 + + print(f"{failed} files failed check out of {num_to_check}") if __name__ == "__main__": From 7eff7d432e2ff87bcb8d1080e27eecbbb5fba0da Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 09:20:34 -0400 Subject: [PATCH 13/38] Final addition to check_renaming docstring --- scripts/20220420_check_renaming.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py index 635bd2a..471d2cb 100644 --- a/scripts/20220420_check_renaming.py +++ b/scripts/20220420_check_renaming.py @@ -2,6 +2,12 @@ After 20220418_renaming.py was run, this script was used to check that the renaming happened correctly + +From 3 runs, this had a failure rate of 8, 10 and 6 +(out of 1000), which is <1%. In all cases, I looked at +the failures and the corresponding latitudes and longitudes +were just outside of the margins, which I am happy to attribute +to rounding errors. """ from pathlib import Path import xarray as xr From d2f737e6e286cd0f8bc55dbf6024d7852ba7b780 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 10:43:29 -0400 Subject: [PATCH 14/38] Correctly calculate the export end date for the default labels --- cropharvest/eo/eo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py index 4c2e210..5330e3e 100644 --- a/cropharvest/eo/eo.py +++ b/cropharvest/eo/eo.py @@ -137,8 +137,7 @@ def load_default_labels( dataset: Optional[str], start_from_last, checkpoint: Optional[Path] ) -> geopandas.GeoDataFrame: labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME) - export_end_year = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE]).dt.year - labels["end_date"] = export_end_year.apply(lambda x: date(x, 12, 12)) + labels["end_date"] = labels[RequiredColumns.EXPORT_END_DATE] labels = labels.assign( start_date=lambda x: x["end_date"] - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS) From db9148737d78e7084349ef00cbdb6a4529e6ace2 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 10:46:00 -0400 Subject: [PATCH 15/38] Keep the end_date as a datetime, not a string --- cropharvest/eo/eo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py index 5330e3e..098e7b1 100644 --- a/cropharvest/eo/eo.py +++ b/cropharvest/eo/eo.py @@ -137,7 +137,7 @@ def load_default_labels( dataset: Optional[str], start_from_last, checkpoint: Optional[Path] ) -> geopandas.GeoDataFrame: labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME) - labels["end_date"] = labels[RequiredColumns.EXPORT_END_DATE] + labels["end_date"] = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE]) labels = labels.assign( start_date=lambda x: x["end_date"] - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS) From 429baf6956bb33b082cc8f4ae2c50f93444e5818 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 10:48:21 -0400 Subject: [PATCH 16/38] Store it as a date, not a datetime --- cropharvest/eo/eo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py index 098e7b1..c1fe916 100644 --- a/cropharvest/eo/eo.py +++ b/cropharvest/eo/eo.py @@ -137,7 +137,7 @@ def load_default_labels( dataset: Optional[str], start_from_last, checkpoint: Optional[Path] ) -> geopandas.GeoDataFrame: labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME) - labels["end_date"] = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE]) + labels["end_date"] = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE]).dt.date labels = labels.assign( start_date=lambda x: x["end_date"] - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS) From dd9a3d203798a9015f0f6c2797ce584311407743 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 11:03:06 -0400 Subject: [PATCH 17/38] [WIP] match tifs to labels based on lat/lon instead of dataset/idx --- cropharvest/engineer.py | 61 +++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index e183da3..511a0f8 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -1,5 +1,5 @@ from pathlib import Path -from datetime import datetime, timedelta +from datetime import datetime, timedelta, date import geopandas from dataclasses import dataclass import numpy as np @@ -10,6 +10,7 @@ from tqdm import tqdm import warnings import h5py +import re from sklearn.metrics import roc_auc_score, f1_score @@ -43,6 +44,7 @@ class DataInstance: instance_lon: float array: np.ndarray is_crop: int + year: int label: Optional[str] = None @property @@ -172,7 +174,7 @@ def __init__(self, data_folder: Path = DATAFOLDER_PATH) -> None: self.test_eo_files = data_folder / "test_eo_data" self.labels = geopandas.read_file(data_folder / LABELS_FILENAME) - self.labels["export_end_date"] = pd.to_datetime(self.labels.export_end_date) + self.labels["export_end_date"] = pd.to_datetime(self.labels.export_end_date).dt.date self.savedir = data_folder / "features" self.savedir.mkdir(exist_ok=True) @@ -201,7 +203,7 @@ def process_filename(filename: str) -> Tuple[int, str]: @staticmethod def load_tif( - filepath: Path, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS + ds: xr.Dataset, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS ) -> Tuple[xr.DataArray, float]: r""" The sentinel files exported from google earth have all the timesteps @@ -210,9 +212,7 @@ def load_tif( Returns: The loaded xr.DataArray, and the average slope (used for filling nan slopes) """ - - da = xr.open_rasterio(filepath).rename("FEATURES") - + da = da.rename("FEATURES") da_split_by_time: List[xr.DataArray] = [] bands_per_timestep = len(DYNAMIC_BANDS) @@ -400,7 +400,7 @@ def remove_bands(array: np.ndarray) -> np.ndarray: def process_test_file( path_to_file: Path, start_date: datetime ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - da, slope = Engineer.load_tif(path_to_file, start_date=start_date) + da, slope = Engineer.load_tif(xr.open_rasterio(path_to_file), start_date=start_date) # Process remote sensing data x_np = da.values @@ -473,12 +473,35 @@ def process_test_file_with_region( return identifier_plus_idx, TestInstance(x=final_x, y=y, lats=flat_lat, lons=flat_lon) + @staticmethod + def _year_from_filepath(p: Path) -> date: + dates_in_p = re.findall(r"(\d+-\d+-\d+)", p.stem) + end_date = dates_in_p[-1].split("-") + return date(end_date[0], end_date[1], end_date[2]) + + def find_row_from_path( + self, lat: np.ndarray, lon: np.ndarray, export_end_date: date + ) -> pd.Series: + relevant_labels = self.labels[ + ( + (self.labels[RequiredColumns.LAT] >= lat.min()) + & (self.labels[RequiredColumns.LAT] <= lat.max()) + & (self.labels[RequiredColumns.LON] <= lon.max()) + & (self.labels[RequiredColumns.LON] >= lon.min()) + & (self.labels[RequiredColumns.EXPORT_END_DATE] == export_end_date) + ) + ] + # TODO - check for the most central row if there is more than 1 row + return relevant_labels.iloc[0] + def process_single_file( self, path_to_file: Path, - row: pd.Series, num_timesteps: int = DEFAULT_NUM_TIMESTEPS, ) -> Optional[DataInstance]: + ds = xr.open_rasterio(path_to_file) + year = self._year_from_filepath(path_to_file) + row = self.find_row_from_path(ds.y, ds.x, year) start_date = row.export_end_date - timedelta(days=num_timesteps * DAYS_PER_TIMESTEP) da, average_slope = self.load_tif(path_to_file, start_date=start_date) closest_lon = self.find_nearest(da.x, row[RequiredColumns.LON]) @@ -505,6 +528,7 @@ def process_single_file( is_crop=row[RequiredColumns.IS_CROP], label=row[NullableColumns.LABEL], dataset=row[RequiredColumns.DATASET], + year=start_date.year, ) def create_h5_test_instances( @@ -572,23 +596,12 @@ def create_h5_dataset(self, checkpoint: bool = True) -> None: skipped_files: int = 0 num_new_files: int = 0 for file_path in tqdm(list(self.eo_files.glob("*.tif"))): - file_index, dataset = self.process_filename(file_path.name) - file_name = f"{file_index}_{dataset}.h5" - if (checkpoint) & ((arrays_dir / file_name).exists()): - # we check if the file has already been written - continue - - file_row = self.labels[ - ( - (self.labels[RequiredColumns.DATASET] == dataset) - & (self.labels[RequiredColumns.INDEX] == file_index) - ) - ].iloc[0] - - instance = self.process_single_file(file_path, row=file_row) + instance = self.process_single_file(file_path) if instance is not None: - - hf = h5py.File(arrays_dir / file_name, "w") + filename = ( + f"lat={instance.label_lat}_lon={instance.label_lon}_year={instance.year}.h5" + ) + hf = h5py.File(arrays_dir / filename, "w") hf.create_dataset("array", data=instance.array) for key, val in instance.attrs.items(): From 30c70514dc78468409e67fea9e68a759b72da302 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 20 Apr 2022 15:00:20 -0400 Subject: [PATCH 18/38] Update Engineer for labelled tifs --- cropharvest/boundingbox.py | 77 ++++++++++++++ cropharvest/columns.py | 12 +++ cropharvest/config.py | 12 ++- cropharvest/countries.py | 69 +------------ cropharvest/engineer.py | 167 ++++++++++++++++++++----------- cropharvest/eo/ee_boundingbox.py | 2 +- cropharvest/eo/eo.py | 9 +- cropharvest/utils.py | 2 - 8 files changed, 217 insertions(+), 133 deletions(-) create mode 100644 cropharvest/boundingbox.py diff --git a/cropharvest/boundingbox.py b/cropharvest/boundingbox.py new file mode 100644 index 0000000..3c972a3 --- /dev/null +++ b/cropharvest/boundingbox.py @@ -0,0 +1,77 @@ +from dataclasses import dataclass +from pathlib import Path +from shapely.geometry import Polygon +from math import sin, cos, radians +from typing import List, Tuple +import re + +from typing import Optional + + +@dataclass +class BBox: + + min_lat: float + max_lat: float + min_lon: float + max_lon: float + + name: Optional[str] = None + + def __post_init__(self): + if self.max_lon < self.min_lon: + raise ValueError("max_lon should be larger than min_lon") + if self.max_lat < self.min_lat: + raise ValueError("max_lat should be larger than min_lat") + + self.url = ( + f"http://bboxfinder.com/#{self.min_lat},{self.min_lon},{self.max_lat},{self.max_lon}" + ) + + def contains(self, lat: float, lon: float) -> bool: + return ( + (lat >= self.min_lat) + & (lat <= self.max_lat) + & (lon >= self.min_lon) + & (lon <= self.max_lon) + ) + + def contains_bbox(self, bbox: "BBox") -> bool: + return ( + (bbox.min_lat >= self.min_lat) + & (bbox.max_lat <= self.max_lat) + & (bbox.min_lon >= self.min_lon) + & (bbox.max_lon <= self.max_lon) + ) + + @property + def three_dimensional_points(self) -> List[float]: + r""" + If we are passing the central latitude and longitude to + an ML model, we want it to know the extremes are close together. + Mapping them to 3d space allows us to do that + """ + lat, lon = self.get_centre(in_radians=True) + return [cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)] + + def get_centre(self, in_radians: bool = True) -> Tuple[float, float]: + + # roughly calculate the centres + lat = self.min_lat + ((self.max_lat - self.min_lat) / 2) + lon = self.min_lon + ((self.max_lon - self.min_lon) / 2) + if in_radians: + return radians(lat), radians(lon) + else: + return lat, lon + + @classmethod + def polygon_to_bbox(cls, polygon: Polygon, name: Optional[str] = None): + (min_lon, min_lat, max_lon, max_lat) = polygon.bounds + return cls(min_lat, max_lat, min_lon, max_lon, name) + + @classmethod + def from_eo_tif_file(cls, path: Path) -> "BBox": + decimals_in_p = re.findall(r"=-?\d*\.?\d*", path.stem) + coords = [float(d[1:]) for d in decimals_in_p[0:4]] + bbox = cls(min_lat=coords[0], min_lon=coords[1], max_lat=coords[2], max_lon=coords[3]) + return bbox diff --git a/cropharvest/columns.py b/cropharvest/columns.py index 852daab..2c4488e 100644 --- a/cropharvest/columns.py +++ b/cropharvest/columns.py @@ -42,3 +42,15 @@ class NullableColumns(Columns): @classmethod def date_columns(cls) -> List[str]: return [cls.HARVEST_DATE, cls.PLANTING_DATE] + + +class EngColumns: + """ + Some columns uniquely created & used by the labels + as loaded by the Engineer + """ + + FEATURES_FILENAME = "features_filename" + FEATURES_PATH = "features_path" + EXISTS = "feature_exists" + TIF_FILEPATH = "tif_path" diff --git a/cropharvest/config.py b/cropharvest/config.py index 65261d0..587616b 100644 --- a/cropharvest/config.py +++ b/cropharvest/config.py @@ -1,4 +1,6 @@ -from .countries import BBox +from pathlib import Path + +from .boundingbox import BBox from typing import Dict @@ -22,6 +24,14 @@ FEATURES_DIR = "features" TEST_FEATURES_DIR = "test_features" +# These values describe the structure of the data folder +DATAFOLDER_PATH = Path(__file__).parent.parent / "data" +EO_FILEPATH = DATAFOLDER_PATH / "eo_data" +TEST_EO_FILEPATH = DATAFOLDER_PATH / "test_eo_data" +FEATURES_FILEPATH = DATAFOLDER_PATH / FEATURES_DIR +ARRAYS_FILEPATH = FEATURES_FILEPATH / "arrays" +TEST_FEATURES_FILEPATH = DATAFOLDER_PATH / TEST_FEATURES_DIR + # the default seed is useful because it also seeds the deterministic # shuffling algorithm we use (in cropharvest.utils.deterministic_shuffle) # so fixing this ensures the evaluation sets consist of the same data no matter diff --git a/cropharvest/countries.py b/cropharvest/countries.py index b053ac8..777b301 100644 --- a/cropharvest/countries.py +++ b/cropharvest/countries.py @@ -1,75 +1,12 @@ -from dataclasses import dataclass import geopandas from shapely.geometry import Polygon, MultiPolygon -from math import sin, cos, radians -from typing import List, Tuple +from typing import List from pathlib import Path -from typing import Optional +from cropharvest.boundingbox import BBox -COUNTRY_SHAPEFILE = geopandas.read_file(str(Path(__file__).parent / "country_shapefile")) - - -@dataclass -class BBox: - - min_lat: float - max_lat: float - min_lon: float - max_lon: float - - name: Optional[str] = None - - def __post_init__(self): - if self.max_lon < self.min_lon: - raise ValueError("max_lon should be larger than min_lon") - if self.max_lat < self.min_lat: - raise ValueError("max_lat should be larger than min_lat") - - self.url = ( - f"http://bboxfinder.com/#{self.min_lat},{self.min_lon},{self.max_lat},{self.max_lon}" - ) - def contains(self, lat: float, lon: float) -> bool: - return ( - (lat >= self.min_lat) - & (lat <= self.max_lat) - & (lon >= self.min_lon) - & (lon <= self.max_lon) - ) - - def contains_bbox(self, bbox) -> bool: - return ( - (bbox.min_lat >= self.min_lat) - & (bbox.max_lat <= self.max_lat) - & (bbox.min_lon >= self.min_lon) - & (bbox.max_lon <= self.max_lon) - ) - - @property - def three_dimensional_points(self) -> List[float]: - r""" - If we are passing the central latitude and longitude to - an ML model, we want it to know the extremes are close together. - Mapping them to 3d space allows us to do that - """ - lat, lon = self.get_centre(in_radians=True) - return [cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)] - - def get_centre(self, in_radians: bool = True) -> Tuple[float, float]: - - # roughly calculate the centres - lat = self.min_lat + ((self.max_lat - self.min_lat) / 2) - lon = self.min_lon + ((self.max_lon - self.min_lon) / 2) - if in_radians: - return radians(lat), radians(lon) - else: - return lat, lon - - @classmethod - def polygon_to_bbox(cls, polygon: Polygon, name: Optional[str] = None): - (min_lon, min_lat, max_lon, max_lat) = polygon.bounds - return cls(min_lat, max_lat, min_lon, max_lon, name) +COUNTRY_SHAPEFILE = geopandas.read_file(str(Path(__file__).parent / "country_shapefile")) def get_country_bbox(country_name: str) -> List[BBox]: diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 511a0f8..7aa1506 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -15,8 +15,9 @@ from sklearn.metrics import roc_auc_score, f1_score from cropharvest.bands import STATIC_BANDS, DYNAMIC_BANDS -from cropharvest.columns import RequiredColumns, NullableColumns -from .config import ( +from cropharvest.columns import RequiredColumns, NullableColumns, EngColumns +from cropharvest.boundingbox import BBox +from cropharvest.config import ( EXPORT_END_DAY, EXPORT_END_MONTH, LABELS_FILENAME, @@ -24,8 +25,14 @@ DEFAULT_NUM_TIMESTEPS, TEST_REGIONS, TEST_DATASETS, + DATAFOLDER_PATH, + FEATURES_FILEPATH, + EO_FILEPATH, + TEST_EO_FILEPATH, + ARRAYS_FILEPATH, + TEST_FEATURES_FILEPATH, ) -from .utils import DATAFOLDER_PATH, load_normalizing_dict +from cropharvest.utils import load_normalizing_dict from typing import cast, Optional, Dict, Union, Tuple, List, Sequence @@ -168,22 +175,37 @@ def __len__(self) -> int: class Engineer: - def __init__(self, data_folder: Path = DATAFOLDER_PATH) -> None: - self.data_folder = data_folder - self.eo_files = data_folder / "eo_data" - self.test_eo_files = data_folder / "test_eo_data" + def __init__(self) -> None: - self.labels = geopandas.read_file(data_folder / LABELS_FILENAME) - self.labels["export_end_date"] = pd.to_datetime(self.labels.export_end_date).dt.date - - self.savedir = data_folder / "features" - self.savedir.mkdir(exist_ok=True) - - self.test_savedir = data_folder / "test_features" - self.test_savedir.mkdir(exist_ok=True) + self.labels = self.load_labels() + FEATURES_FILEPATH.mkdir(exist_ok=True) + ARRAYS_FILEPATH.mkdir(exist_ok=True) + TEST_FEATURES_FILEPATH.mkdir(exist_ok=True) self.norm_interim: Dict[str, Union[np.ndarray, int]] = {"n": 0} + @staticmethod + def load_labels() -> geopandas.GeoDataFrame: + labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME) + labels[RequiredColumns.EXPORT_END_DATE] = pd.to_datetime( + labels[RequiredColumns.EXPORT_END_DATE] + ).dt.date + labels[EngColumns.FEATURES_FILENAME] = ( + "lat=" + + labels[RequiredColumns.LAT].round(8).astype(str) + + "_lon=" + + labels[RequiredColumns.LON].round(8).astype(str) + + "_date=" + + labels[RequiredColumns.EXPORT_END_DATE].astype(str) + ) + labels[EngColumns.FEATURES_PATH] = ( + str(ARRAYS_FILEPATH) + labels[EngColumns.FEATURES_FILENAME] + ) + labels[EngColumns.EXISTS] = np.vectorize(lambda p: Path(p).exists())( + labels[EngColumns.FEATURES_PATH] + ) + return labels + @staticmethod def find_nearest(array, value: float) -> float: array = np.asarray(array) @@ -203,7 +225,7 @@ def process_filename(filename: str) -> Tuple[int, str]: @staticmethod def load_tif( - ds: xr.Dataset, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS + filepath: Path, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS ) -> Tuple[xr.DataArray, float]: r""" The sentinel files exported from google earth have all the timesteps @@ -212,7 +234,7 @@ def load_tif( Returns: The loaded xr.DataArray, and the average slope (used for filling nan slopes) """ - da = da.rename("FEATURES") + da = xr.open_rasterio(filepath).rename("FEATURES") da_split_by_time: List[xr.DataArray] = [] bands_per_timestep = len(DYNAMIC_BANDS) @@ -473,37 +495,15 @@ def process_test_file_with_region( return identifier_plus_idx, TestInstance(x=final_x, y=y, lats=flat_lat, lons=flat_lon) - @staticmethod - def _year_from_filepath(p: Path) -> date: - dates_in_p = re.findall(r"(\d+-\d+-\d+)", p.stem) - end_date = dates_in_p[-1].split("-") - return date(end_date[0], end_date[1], end_date[2]) - - def find_row_from_path( - self, lat: np.ndarray, lon: np.ndarray, export_end_date: date - ) -> pd.Series: - relevant_labels = self.labels[ - ( - (self.labels[RequiredColumns.LAT] >= lat.min()) - & (self.labels[RequiredColumns.LAT] <= lat.max()) - & (self.labels[RequiredColumns.LON] <= lon.max()) - & (self.labels[RequiredColumns.LON] >= lon.min()) - & (self.labels[RequiredColumns.EXPORT_END_DATE] == export_end_date) - ) - ] - # TODO - check for the most central row if there is more than 1 row - return relevant_labels.iloc[0] - def process_single_file( self, - path_to_file: Path, + row: pd.Series, num_timesteps: int = DEFAULT_NUM_TIMESTEPS, ) -> Optional[DataInstance]: - ds = xr.open_rasterio(path_to_file) - year = self._year_from_filepath(path_to_file) - row = self.find_row_from_path(ds.y, ds.x, year) - start_date = row.export_end_date - timedelta(days=num_timesteps * DAYS_PER_TIMESTEP) - da, average_slope = self.load_tif(path_to_file, start_date=start_date) + start_date = row[RequiredColumns.EXPORT_END_DATE] - timedelta( + days=num_timesteps * DAYS_PER_TIMESTEP + ) + da, average_slope = self.load_tif(row[EngColumns.TIF_FILEPATH], start_date=start_date) closest_lon = self.find_nearest(da.x, row[RequiredColumns.LON]) closest_lat = self.find_nearest(da.y, row[RequiredColumns.LAT]) @@ -535,7 +535,7 @@ def create_h5_test_instances( self, ) -> None: for region_identifier, _ in TEST_REGIONS.items(): - all_region_files = list(self.test_eo_files.glob(f"{region_identifier}*.tif")) + all_region_files = list(TEST_EO_FILEPATH.glob(f"{region_identifier}*.tif")) if len(all_region_files) == 0: print(f"No downloaded files for {region_identifier}") continue @@ -544,7 +544,7 @@ def create_h5_test_instances( filepath, region_idx ) if test_instance is not None: - hf = h5py.File(self.test_savedir / f"{instance_name}.h5", "w") + hf = h5py.File(TEST_FEATURES_FILEPATH / f"{instance_name}.h5", "w") for key, val in test_instance.datasets.items(): hf.create_dataset(key, data=val) @@ -581,27 +581,74 @@ def create_h5_test_instances( hf.create_dataset(key, data=val) hf.close() - def create_h5_dataset(self, checkpoint: bool = True) -> None: - arrays_dir = self.savedir / "arrays" - arrays_dir.mkdir(exist_ok=True) + @staticmethod + def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]: + return { + p: BBox.from_path(p) + for p in tqdm(filepath.glob("**/*.tif"), desc="Generating BoundingBoxes from paths") + } + + @staticmethod + def get_tif_paths(path_to_bbox, lat, lon, end_date, pbar): + candidate_paths = [] + for p, bbox in path_to_bbox.items(): + if bbox.contains(lat, lon) and f"dates=*_{end_date}" in p.stem: + candidate_paths.append(p) + pbar.update(1) + return candidate_paths + + @classmethod + def match_labels_to_tifs(cls, labels: geopandas.GeoDataFrame) -> pd.Series: + bbox_for_labels = BBox( + min_lon=labels[RequiredColumns.LON].min(), + min_lat=labels[RequiredColumns.LAT].min(), + max_lon=labels[RequiredColumns.LON].max(), + max_lat=labels[RequiredColumns.LAT].max(), + ) + # Get all tif paths and bboxes + path_to_bbox = { + p: bbox + for p, bbox in cls.generate_bbox_from_paths(EO_FILEPATH).items() + if bbox_for_labels.contains_bbox(bbox) + } + + # Match labels to tif files + # Faster than going through bboxes + with tqdm(total=len(labels), desc="Matching labels to tif paths") as pbar: + tif_paths = np.vectorize(cls.get_tif_paths, otypes=[np.ndarray])( + path_to_bbox, + labels[RequiredColumns.LAT], + labels[RequiredColumns.LON], + labels[RequiredColumns.EXPORT_END_DATE], + pbar, + ) + return tif_paths + + def create_h5_dataset(self) -> None: old_normalizing_dict: Optional[Tuple[int, Optional[Dict[str, np.ndarray]]]] = None - if checkpoint: - # check for an already existing normalizing dict - if (self.savedir / "normalizing_dict.h5").exists(): - old_nd = load_normalizing_dict(self.savedir / "normalizing_dict.hf") - num_existing_files = len(list(arrays_dir.glob("*"))) - old_normalizing_dict = (num_existing_files, old_nd) + # check for an already existing normalizing dict + if (FEATURES_FILEPATH / "normalizing_dict.h5").exists(): + old_nd = load_normalizing_dict(FEATURES_FILEPATH / "normalizing_dict.hf") + num_existing_files = len(list(ARRAYS_FILEPATH.glob("*"))) + old_normalizing_dict = (num_existing_files, old_nd) + + labels_with_no_features = self.labels[~self.labels[EngColumns.EXISTS]].copy() + labels_with_no_features[EngColumns.TIF_FILEPATH] = self.match_labels_to_tifs( + labels_with_no_features + ) + tifs_found = labels_with_no_features[EngColumns.TIF_FILEPATH].str.len() > 0 + labels_with_tifs_but_no_features = labels_with_no_features.loc[tifs_found] skipped_files: int = 0 num_new_files: int = 0 - for file_path in tqdm(list(self.eo_files.glob("*.tif"))): - instance = self.process_single_file(file_path) + for _, row in tqdm(labels_with_tifs_but_no_features.iterrows()): + instance = self.process_single_file(row) if instance is not None: filename = ( f"lat={instance.label_lat}_lon={instance.label_lon}_year={instance.year}.h5" ) - hf = h5py.File(arrays_dir / filename, "w") + hf = h5py.File(ARRAYS_FILEPATH / filename, "w") hf.create_dataset("array", data=instance.array) for key, val in instance.attrs.items(): @@ -616,11 +663,11 @@ def create_h5_dataset(self, checkpoint: bool = True) -> None: normalizing_dict = self.calculate_normalizing_dict() - if checkpoint and (old_normalizing_dict is not None): + if old_normalizing_dict is not None: normalizing_dicts = [old_normalizing_dict, (num_new_files, normalizing_dict)] normalizing_dict = self.adjust_normalizing_dict(normalizing_dicts) if normalizing_dict is not None: - save_path = self.savedir / "normalizing_dict.h5" + save_path = FEATURES_FILEPATH / "normalizing_dict.h5" hf = h5py.File(save_path, "w") for key, val in normalizing_dict.items(): hf.create_dataset(key, data=val) diff --git a/cropharvest/eo/ee_boundingbox.py b/cropharvest/eo/ee_boundingbox.py index df4d6cc..f121ef3 100644 --- a/cropharvest/eo/ee_boundingbox.py +++ b/cropharvest/eo/ee_boundingbox.py @@ -3,7 +3,7 @@ from typing import List, Tuple, Union import ee -from cropharvest.countries import BBox +from cropharvest.boundingbox import BBox @dataclass diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py index c1fe916..51e5b5d 100644 --- a/cropharvest/eo/eo.py +++ b/cropharvest/eo/eo.py @@ -23,7 +23,7 @@ from .utils import make_combine_bands_function from cropharvest.bands import DYNAMIC_BANDS -from cropharvest.utils import DATAFOLDER_PATH, memoized +from cropharvest.utils import memoized from cropharvest.countries import BBox from cropharvest.config import ( EXPORT_END_DAY, @@ -32,6 +32,9 @@ DEFAULT_NUM_TIMESTEPS, LABELS_FILENAME, TEST_REGIONS, + DATAFOLDER_PATH, + EO_FILEPATH, + TEST_EO_FILEPATH, ) from cropharvest.columns import RequiredColumns @@ -95,8 +98,8 @@ class EarthEngineExporter: :param dest_bucket: The bucket to export to, google-cloud-storage must be installed. """ - output_folder_name = "eo_data" - test_output_folder_name = "test_eo_data" + output_folder_name = EO_FILEPATH.name + test_output_folder_name = TEST_EO_FILEPATH.name def __init__( self, diff --git a/cropharvest/utils.py b/cropharvest/utils.py index 48a8e2f..fef5770 100644 --- a/cropharvest/utils.py +++ b/cropharvest/utils.py @@ -20,8 +20,6 @@ except ImportError: TORCH_INSTALLED = False -DATAFOLDER_PATH = Path(__file__).parent.parent / "data" - def set_seed(seed: int = 42) -> None: np.random.seed(seed) From a8042148a53c9fabc458460fa973a3dafeee6208 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Mon, 25 Apr 2022 16:34:58 -0400 Subject: [PATCH 19/38] Use the default identifier for the default labels --- cropharvest/eo/eo.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py index 51e5b5d..2d925c5 100644 --- a/cropharvest/eo/eo.py +++ b/cropharvest/eo/eo.py @@ -145,11 +145,6 @@ def load_default_labels( start_date=lambda x: x["end_date"] - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS) ) - labels = labels.assign( - export_identifier=labels[RequiredColumns.INDEX].map(str) - + "-" - + labels[RequiredColumns.DATASET] - ) if dataset: labels = labels[labels.dataset == dataset] From 270e3afed518cdacef074609e1d0909ae95c1838 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Mon, 25 Apr 2022 17:12:38 -0400 Subject: [PATCH 20/38] Mypy fixes --- benchmarks/config.py | 2 +- benchmarks/deep_learning.py | 2 +- benchmarks/random_forest.py | 2 +- cropharvest/engineer.py | 5 ++--- process_labels/datasets.py | 3 +-- scripts/20220418_renaming.py | 2 +- scripts/20220420_check_renaming.py | 2 +- 7 files changed, 8 insertions(+), 10 deletions(-) diff --git a/benchmarks/config.py b/benchmarks/config.py index f97c656..4667d81 100644 --- a/benchmarks/config.py +++ b/benchmarks/config.py @@ -11,7 +11,7 @@ # Model names -RANDOM_FOREST = "RF" +RANDOM_FOREST = "RF_GRID_SEARCH" DL_RANDOM = "DL_RANDOM" DL_PRETRAINED = "DL_PRETRAINED" DL_MAML = "DL_MAML" diff --git a/benchmarks/deep_learning.py b/benchmarks/deep_learning.py index c6b5422..4763520 100644 --- a/benchmarks/deep_learning.py +++ b/benchmarks/deep_learning.py @@ -4,7 +4,7 @@ import json from cropharvest.datasets import CropHarvest -from cropharvest.utils import DATAFOLDER_PATH +from cropharvest.config import DATAFOLDER_PATH from cropharvest.engineer import TestInstance from config import ( diff --git a/benchmarks/random_forest.py b/benchmarks/random_forest.py index c8a2360..d9a4cf0 100644 --- a/benchmarks/random_forest.py +++ b/benchmarks/random_forest.py @@ -3,7 +3,7 @@ from sklearn.ensemble import RandomForestClassifier from cropharvest.datasets import CropHarvest -from cropharvest.utils import DATAFOLDER_PATH +from cropharvest.config import DATAFOLDER_PATH from cropharvest.engineer import TestInstance from config import SHUFFLE_SEEDS, DATASET_TO_SIZES, RANDOM_FOREST diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 7aa1506..d350102 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -1,5 +1,5 @@ from pathlib import Path -from datetime import datetime, timedelta, date +from datetime import datetime, timedelta import geopandas from dataclasses import dataclass import numpy as np @@ -10,7 +10,6 @@ from tqdm import tqdm import warnings import h5py -import re from sklearn.metrics import roc_auc_score, f1_score @@ -584,7 +583,7 @@ def create_h5_test_instances( @staticmethod def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]: return { - p: BBox.from_path(p) + p: BBox.from_eo_tif_file(p) for p in tqdm(filepath.glob("**/*.tif"), desc="Generating BoundingBoxes from paths") } diff --git a/process_labels/datasets.py b/process_labels/datasets.py index 6ae3ebd..c785179 100644 --- a/process_labels/datasets.py +++ b/process_labels/datasets.py @@ -6,8 +6,7 @@ from .utils import add_is_test_column from cropharvest.columns import NullableColumns, RequiredColumns -from cropharvest.utils import DATAFOLDER_PATH -from cropharvest.config import LABELS_FILENAME +from cropharvest.config import LABELS_FILENAME, DATAFOLDER_PATH from typing import cast, Callable, List, Optional diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py index 6a965f9..11c2ec5 100644 --- a/scripts/20220418_renaming.py +++ b/scripts/20220418_renaming.py @@ -12,7 +12,7 @@ from cropharvest.columns import RequiredColumns from cropharvest.eo.eo import EarthEngineExporter from cropharvest.eo.ee_boundingbox import EEBoundingBox -from cropharvest.utils import DATAFOLDER_PATH +from cropharvest.config import DATAFOLDER_PATH SURROUNDING_METRES = 80 diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py index 471d2cb..09c1fd0 100644 --- a/scripts/20220420_check_renaming.py +++ b/scripts/20220420_check_renaming.py @@ -17,7 +17,7 @@ from tqdm import tqdm from cropharvest.countries import BBox -from cropharvest.utils import DATAFOLDER_PATH +from cropharvest.config import DATAFOLDER_PATH def _bbox_from_filepath(p: Path) -> BBox: From b02a2bfe4c47791980509b42114ebc04392cad67 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 26 Apr 2022 07:23:57 -0400 Subject: [PATCH 21/38] Update the processing of test files to work with the new approach too --- benchmarks/dl/maml.py | 19 ++++++------------- cropharvest/config.py | 18 ++++++++++++++++++ cropharvest/datasets.py | 15 +++------------ cropharvest/engineer.py | 23 +++++++++-------------- cropharvest/utils.py | 13 +++++++++++++ 5 files changed, 49 insertions(+), 39 deletions(-) diff --git a/benchmarks/dl/maml.py b/benchmarks/dl/maml.py index c1f824b..d8ab58f 100644 --- a/benchmarks/dl/maml.py +++ b/benchmarks/dl/maml.py @@ -19,7 +19,7 @@ from cropharvest.datasets import CropHarvest, CropHarvestLabels, Task from cropharvest import countries -from cropharvest.config import TEST_DATASETS, TEST_REGIONS +from cropharvest.config import TEST_DATASETS, TEST_REGIONS, TEST_COUNTRIES_TO_CROPS from cropharvest.utils import NoDataForBoundingBoxError from typing import Dict, Tuple, Optional, List, DefaultDict @@ -363,18 +363,11 @@ def _make_tasks( ) -> Tuple[Dict[str, CropHarvest], Dict[str, CropHarvest]]: labels = CropHarvestLabels(self.root) - # remove any test regions, and collect the countries / crops - test_countries_to_crops: DefaultDict[str, List[str]] = defaultdict(list) - - # reshuffle the test_regions dict so its a little easier to - # manipulate in this function - for identifier, _ in TEST_REGIONS.items(): - country, crop, _, _ = identifier.split("_") - test_countries_to_crops[country].append(crop) - label_to_task: Dict[str, CropHarvest] = {} - countries_to_ignore = [country for country, _ in TEST_DATASETS.items() if crop is None] + countries_to_ignore = [ + country for country, crop in TEST_COUNTRIES_TO_CROPS.items() if crop is not None + ] for country in tqdm(countries.get_countries()): if country in countries_to_ignore: @@ -394,8 +387,8 @@ def _make_tasks( label_to_task[task.id] = task for label in labels.classes_in_bbox(country_bbox): - if country in test_countries_to_crops: - if label in test_countries_to_crops[country]: + if country in TEST_COUNTRIES_TO_CROPS: + if label in TEST_COUNTRIES_TO_CROPS[country]: continue try: task = CropHarvest( diff --git a/cropharvest/config.py b/cropharvest/config.py index 587616b..f2e0e93 100644 --- a/cropharvest/config.py +++ b/cropharvest/config.py @@ -57,3 +57,21 @@ } TEST_DATASETS = {"Togo": "togo-eval"} + + +def test_countries_to_crops(): + output_dict = {} + for identifier, _ in TEST_REGIONS.items(): + country, crop, _, _ = identifier.split("_") + if country in output_dict.keys(): + assert output_dict[country] == crop + else: + output_dict[country].append(crop) + + for country, _ in TEST_DATASETS.items(): + output_dict[country].append(None) + + return output_dict + + +TEST_COUNTRIES_TO_CROPS = test_countries_to_crops() diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py index ca082a9..85204d6 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -13,6 +13,7 @@ load_normalizing_dict, sample_with_memory, NoDataForBoundingBoxError, + filter_geojson, ) from cropharvest.config import ( FEATURES_DIR, @@ -89,18 +90,8 @@ def __init__(self, root, download=False): def as_geojson(self) -> geopandas.GeoDataFrame: return self._labels - @staticmethod - def filter_geojson(gpdf: geopandas.GeoDataFrame, bounding_box: BBox) -> geopandas.GeoDataFrame: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # warning: invalid value encountered in ? (vectorized) - in_bounding_box = np.vectorize(bounding_box.contains)( - gpdf[RequiredColumns.LAT], gpdf[RequiredColumns.LON] - ) - return gpdf[in_bounding_box] - def classes_in_bbox(self, bounding_box: BBox) -> List[str]: - bbox_geojson = self.filter_geojson(self.as_geojson(), bounding_box) + bbox_geojson = filter_geojson(self.as_geojson(), bounding_box) unique_labels = [x for x in bbox_geojson.label.unique() if x is not None] return unique_labels @@ -117,7 +108,7 @@ def construct_positive_and_negative_labels( if filter_test: gpdf = gpdf[gpdf[RequiredColumns.IS_TEST] == False] if task.bounding_box is not None: - gpdf = self.filter_geojson(gpdf, task.bounding_box) + gpdf = filter_geojson(gpdf, task.bounding_box) if len(gpdf) == 0: raise NoDataForBoundingBoxError diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index d350102..11d4bae 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -15,6 +15,7 @@ from cropharvest.bands import STATIC_BANDS, DYNAMIC_BANDS from cropharvest.columns import RequiredColumns, NullableColumns, EngColumns +from cropharvest.countries import get_country_bbox from cropharvest.boundingbox import BBox from cropharvest.config import ( EXPORT_END_DAY, @@ -31,7 +32,7 @@ ARRAYS_FILEPATH, TEST_FEATURES_FILEPATH, ) -from cropharvest.utils import load_normalizing_dict +from cropharvest.utils import load_normalizing_dict, filter_geojson from typing import cast, Optional, Dict, Union, Tuple, List, Sequence @@ -549,24 +550,18 @@ def create_h5_test_instances( hf.create_dataset(key, data=val) hf.close() - for _, dataset in TEST_DATASETS.items(): + for country, dataset in TEST_DATASETS.items(): x: List[np.ndarray] = [] y: List[int] = [] lats: List[float] = [] lons: List[float] = [] - relevant_labels = self.labels[self.labels[RequiredColumns.DATASET] == dataset] + country_bboxes = get_country_bbox(country) + relevant_labels = pd.concat( + [filter_geojson(self.labels, box) for box in country_bboxes] + ) for _, row in tqdm(relevant_labels.iterrows()): - tif_paths = list( - self.eo_files.glob( - f"{row[RequiredColumns.INDEX]}-{row[RequiredColumns.DATASET]}_*.tif" - ) - ) - if len(tif_paths) == 0: - continue - else: - tif_path = tif_paths[0] - instance = self.process_single_file(tif_path, row) + instance = self.process_single_file(row) if instance is not None: x.append(instance.array) y.append(instance.is_crop) @@ -575,7 +570,7 @@ def create_h5_test_instances( # then, combine the instances into a test instance test_instance = TestInstance(np.stack(x), np.stack(y), np.stack(lats), np.stack(lons)) - hf = h5py.File(self.test_savedir / f"{dataset}.h5", "w") + hf = h5py.File(TEST_FEATURES_FILEPATH / f"{dataset}.h5", "w") for key, val in test_instance.datasets.items(): hf.create_dataset(key, data=val) hf.close() diff --git a/cropharvest/utils.py b/cropharvest/utils.py index fef5770..673370b 100644 --- a/cropharvest/utils.py +++ b/cropharvest/utils.py @@ -8,10 +8,13 @@ import collections import functools import tarfile +import warnings from typing import Dict, List, Tuple, Optional from cropharvest.config import DATASET_URL +from cropharvest.boundingbox import BBox +from cropharvest.columns import RequiredColumns try: import torch @@ -146,3 +149,13 @@ def read_geopandas(file_path) -> geopandas.GeoDataFrame: class NoDataForBoundingBoxError(Exception): pass + + +def filter_geojson(gpdf: geopandas.GeoDataFrame, bounding_box: BBox) -> geopandas.GeoDataFrame: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # warning: invalid value encountered in ? (vectorized) + in_bounding_box = np.vectorize(bounding_box.contains)( + gpdf[RequiredColumns.LAT], gpdf[RequiredColumns.LON] + ) + return gpdf[in_bounding_box] From d16905ac969efb6f5397b66df1b0c3db81d46492 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 26 Apr 2022 07:32:40 -0400 Subject: [PATCH 22/38] Make output_dict a defaultdict --- cropharvest/config.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cropharvest/config.py b/cropharvest/config.py index f2e0e93..67654e7 100644 --- a/cropharvest/config.py +++ b/cropharvest/config.py @@ -1,4 +1,5 @@ from pathlib import Path +from collections import defaultdict from .boundingbox import BBox @@ -60,13 +61,10 @@ def test_countries_to_crops(): - output_dict = {} + output_dict = defaultdict(list) for identifier, _ in TEST_REGIONS.items(): country, crop, _, _ = identifier.split("_") - if country in output_dict.keys(): - assert output_dict[country] == crop - else: - output_dict[country].append(crop) + output_dict[country].append(crop) for country, _ in TEST_DATASETS.items(): output_dict[country].append(None) From e05e3053da173e0eaf4bf8813526663aac6f3499 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 26 Apr 2022 08:44:55 -0400 Subject: [PATCH 23/38] Use fnmatch instead of (incorrect) string matching --- cropharvest/engineer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 11d4bae..907b7f3 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -2,6 +2,7 @@ from datetime import datetime, timedelta import geopandas from dataclasses import dataclass +from fnmatch import fnmatch import numpy as np import pandas as pd import xarray as xr @@ -199,7 +200,7 @@ def load_labels() -> geopandas.GeoDataFrame: + labels[RequiredColumns.EXPORT_END_DATE].astype(str) ) labels[EngColumns.FEATURES_PATH] = ( - str(ARRAYS_FILEPATH) + labels[EngColumns.FEATURES_FILENAME] + str(ARRAYS_FILEPATH) + "/" + labels[EngColumns.FEATURES_FILENAME] ) labels[EngColumns.EXISTS] = np.vectorize(lambda p: Path(p).exists())( labels[EngColumns.FEATURES_PATH] @@ -586,7 +587,7 @@ def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]: def get_tif_paths(path_to_bbox, lat, lon, end_date, pbar): candidate_paths = [] for p, bbox in path_to_bbox.items(): - if bbox.contains(lat, lon) and f"dates=*_{end_date}" in p.stem: + if bbox.contains(lat, lon) and fnmatch(p.stem, f"dates=*_{end_date}*"): candidate_paths.append(p) pbar.update(1) return candidate_paths From 30a55d5f844a469c2a1887e1ff5e31a3fef2b6f8 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 26 Apr 2022 10:52:19 -0400 Subject: [PATCH 24/38] Fix incorrect pattern in fnmatch --- cropharvest/engineer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 907b7f3..01b4e65 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -587,7 +587,7 @@ def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]: def get_tif_paths(path_to_bbox, lat, lon, end_date, pbar): candidate_paths = [] for p, bbox in path_to_bbox.items(): - if bbox.contains(lat, lon) and fnmatch(p.stem, f"dates=*_{end_date}*"): + if bbox.contains(lat, lon) and fnmatch(p.stem, f"*dates=*_{end_date}*"): candidate_paths.append(p) pbar.update(1) return candidate_paths From b391d3b8b3c792ce06f6c79f7a15e6b3b28cd229 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 27 Apr 2022 10:19:33 -0400 Subject: [PATCH 25/38] Correctly find the correct tif_filepath per label --- cropharvest/columns.py | 2 +- cropharvest/engineer.py | 70 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/cropharvest/columns.py b/cropharvest/columns.py index 2c4488e..7675138 100644 --- a/cropharvest/columns.py +++ b/cropharvest/columns.py @@ -53,4 +53,4 @@ class EngColumns: FEATURES_FILENAME = "features_filename" FEATURES_PATH = "features_path" EXISTS = "feature_exists" - TIF_FILEPATH = "tif_path" + TIF_FILEPATHS = "tif_path" diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 01b4e65..971afe3 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -53,6 +53,7 @@ class DataInstance: array: np.ndarray is_crop: int year: int + source_tif_file: str label: Optional[str] = None @property @@ -208,10 +209,30 @@ def load_labels() -> geopandas.GeoDataFrame: return labels @staticmethod - def find_nearest(array, value: float) -> float: + def distance_from_degrees(lat1: float, lon1: float, lat2: float, lon2: float) -> float: + """ + haversince formula, inspired by: + https://stackoverflow.com/questions/41336756/find-the-closest-latitude-and-longitude/41337005 + """ + p = 0.017453292519943295 + a = ( + 0.5 + - np.cos((lat2 - lat1) * p) / 2 + + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2 + ) + return 12742 * np.arcsin(np.sqrt(a)) + + @staticmethod + def distance_point_from_center(lat_idx: int, lon_idx: int, tif) -> int: + x_dist = np.abs((len(tif.x) - 1) / 2 - lon_idx) + y_dist = np.abs((len(tif.y) - 1) / 2 - lat_idx) + return x_dist + y_dist + + @staticmethod + def find_nearest(array, value: float) -> Tuple[float, int]: array = np.asarray(array) idx = (np.abs(array - value)).argmin() - return array[idx] + return array[idx], idx @staticmethod def process_filename(filename: str) -> Tuple[int, str]: @@ -504,11 +525,43 @@ def process_single_file( start_date = row[RequiredColumns.EXPORT_END_DATE] - timedelta( days=num_timesteps * DAYS_PER_TIMESTEP ) - da, average_slope = self.load_tif(row[EngColumns.TIF_FILEPATH], start_date=start_date) - closest_lon = self.find_nearest(da.x, row[RequiredColumns.LON]) - closest_lat = self.find_nearest(da.y, row[RequiredColumns.LAT]) - labelled_np = da.sel(x=closest_lon).sel(y=closest_lat).values + tif_slope_tuples = [ + self.load_tif(filepath, start_date=start_date) + for filepath in row[EngColumns.TIF_FILEPATHS] + ] + if len(tif_slope_tuples) == 1: + tif, average_slope = tif_slope_tuples[0] + + closest_lon, _ = self.find_nearest(tif.x, row[RequiredColumns.LON]) + closest_lat, _ = self.find_nearest(tif.y, row[RequiredColumns.LAT]) + + labelled_np = tif.sel(x=closest_lon).sel(y=closest_lat).values + tif_file = row[EngColumns.TIF_FILEPATHS].iloc[0].name + + else: + min_distance_from_point = np.inf + min_distance_from_center = np.inf + for i, tif_slope_tuple in enumerate(tif_slope_tuples): + tif, slope = tif_slope_tuple + lon, lon_idx = self.find_nearest(tif.x, row[RequiredColumns.LON]) + lat, lat_idx = self.find_nearest(tif.y, row[RequiredColumns.LAT]) + distance_from_point = self.distance_from_degrees( + row[RequiredColumns.LAT], row[RequiredColumns.LON], lat, lon + ) + distance_from_center = self.distance_point_from_center(lat_idx, lon_idx, tif) + if (distance_from_point < min_distance_from_point) or ( + distance_from_point == min_distance_from_point + and distance_from_center < min_distance_from_center + ): + closest_lon = lon + closest_lat = lat + min_distance_from_center = distance_from_center + min_distance_from_point = distance_from_point + + labelled_np = tif.sel(x=lon).sel(y=lat).values + average_slope = slope + tif_file = row[EngColumns.TIF_FILEPATHS].iloc[i].name labelled_np = self.calculate_ndvi(labelled_np) labelled_np = self.remove_bands(labelled_np) @@ -530,6 +583,7 @@ def process_single_file( label=row[NullableColumns.LABEL], dataset=row[RequiredColumns.DATASET], year=start_date.year, + source_tif_file=tif_file, ) def create_h5_test_instances( @@ -629,10 +683,10 @@ def create_h5_dataset(self) -> None: old_normalizing_dict = (num_existing_files, old_nd) labels_with_no_features = self.labels[~self.labels[EngColumns.EXISTS]].copy() - labels_with_no_features[EngColumns.TIF_FILEPATH] = self.match_labels_to_tifs( + labels_with_no_features[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs( labels_with_no_features ) - tifs_found = labels_with_no_features[EngColumns.TIF_FILEPATH].str.len() > 0 + tifs_found = labels_with_no_features[EngColumns.TIF_FILEPATHS].str.len() > 0 labels_with_tifs_but_no_features = labels_with_no_features.loc[tifs_found] skipped_files: int = 0 From 127947a67ee43bee12b3bed466a97062d1b0447c Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 27 Apr 2022 10:26:08 -0400 Subject: [PATCH 26/38] Don't use iloc to index a list --- cropharvest/engineer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 971afe3..e402266 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -537,7 +537,7 @@ def process_single_file( closest_lat, _ = self.find_nearest(tif.y, row[RequiredColumns.LAT]) labelled_np = tif.sel(x=closest_lon).sel(y=closest_lat).values - tif_file = row[EngColumns.TIF_FILEPATHS].iloc[0].name + tif_file = row[EngColumns.TIF_FILEPATHS][0].name else: min_distance_from_point = np.inf @@ -561,7 +561,7 @@ def process_single_file( labelled_np = tif.sel(x=lon).sel(y=lat).values average_slope = slope - tif_file = row[EngColumns.TIF_FILEPATHS].iloc[i].name + tif_file = row[EngColumns.TIF_FILEPATHS][i].name labelled_np = self.calculate_ndvi(labelled_np) labelled_np = self.remove_bands(labelled_np) From f6a2c06a56f2005715e637602ce7dd158c172ecd Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 27 Apr 2022 13:19:35 -0400 Subject: [PATCH 27/38] Fix failing engineer tests --- test/cropharvest/engineer/test_engineer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cropharvest/engineer/test_engineer.py b/test/cropharvest/engineer/test_engineer.py index a6b16da..8f3622e 100644 --- a/test/cropharvest/engineer/test_engineer.py +++ b/test/cropharvest/engineer/test_engineer.py @@ -82,7 +82,7 @@ def test_find_nearest(): target = 1.1 - assert Engineer.find_nearest(array, target) == 1 + assert Engineer.find_nearest(array, target) == (1, 0) def test_filename_correctly_processed(): From f5438ccea60d3451c7c489d985d9db3361094f3f Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 27 Apr 2022 13:23:29 -0400 Subject: [PATCH 28/38] Remove unused function --- cropharvest/engineer.py | 13 +------------ test/cropharvest/engineer/test_engineer.py | 8 -------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index e402266..25f9761 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -234,17 +234,6 @@ def find_nearest(array, value: float) -> Tuple[float, int]: idx = (np.abs(array - value)).argmin() return array[idx], idx - @staticmethod - def process_filename(filename: str) -> Tuple[int, str]: - r""" - Given an exported sentinel file, process it to get the dataset - it came from, and the index of that dataset - """ - parts = filename.split("_")[0].split("-") - index = parts[0] - dataset = "-".join(parts[1:]) - return int(index), dataset - @staticmethod def load_tif( filepath: Path, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS @@ -444,7 +433,7 @@ def remove_bands(array: np.ndarray) -> np.ndarray: def process_test_file( path_to_file: Path, start_date: datetime ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - da, slope = Engineer.load_tif(xr.open_rasterio(path_to_file), start_date=start_date) + da, slope = Engineer.load_tif(path_to_file, start_date=start_date) # Process remote sensing data x_np = da.values diff --git a/test/cropharvest/engineer/test_engineer.py b/test/cropharvest/engineer/test_engineer.py index 8f3622e..2a12d44 100644 --- a/test/cropharvest/engineer/test_engineer.py +++ b/test/cropharvest/engineer/test_engineer.py @@ -92,14 +92,6 @@ def test_filename_correctly_processed(): assert dataset == "togo" -def test_filename_correctly_processed_2(): - - filename = "98-geowiki-landcover-2017_2019-02-06_2020-02-01.tif" - idx, dataset = Engineer.process_filename(filename) - assert idx == 98 - assert dataset == "geowiki-landcover-2017" - - def test_process_test_file(): x_np, flat_lat, flat_lon = Engineer.process_test_file( TIF_FILE, start_date=datetime(2019, 2, 6, 0, 0) From 3bffd7096a6622b9aac70be503a7686f335c5ecb Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Mon, 2 May 2022 15:00:40 -0400 Subject: [PATCH 29/38] Remove unused imports --- benchmarks/dl/maml.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/dl/maml.py b/benchmarks/dl/maml.py index d8ab58f..113edb6 100644 --- a/benchmarks/dl/maml.py +++ b/benchmarks/dl/maml.py @@ -3,7 +3,6 @@ import dill import warnings from random import shuffle, random -from collections import defaultdict import torch from torch import nn @@ -19,10 +18,10 @@ from cropharvest.datasets import CropHarvest, CropHarvestLabels, Task from cropharvest import countries -from cropharvest.config import TEST_DATASETS, TEST_REGIONS, TEST_COUNTRIES_TO_CROPS +from cropharvest.config import TEST_COUNTRIES_TO_CROPS from cropharvest.utils import NoDataForBoundingBoxError -from typing import Dict, Tuple, Optional, List, DefaultDict +from typing import Dict, Tuple, Optional, List class TrainDataLoader: From 765aec0a7daac398e515c09fc20dd84a8f4ea603 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 3 May 2022 07:43:08 -0400 Subject: [PATCH 30/38] For the labels in a test area, associate the labels to the right tif files --- cropharvest/engineer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 25f9761..0cbbe86 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -604,7 +604,11 @@ def create_h5_test_instances( [filter_geojson(self.labels, box) for box in country_bboxes] ) - for _, row in tqdm(relevant_labels.iterrows()): + relevant_labels[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs(relevant_labels) + tifs_found = relevant_labels[EngColumns.TIF_FILEPATHS].str.len() > 0 + labels_with_tifs = labels_with_tifs.loc[tifs_found] + + for _, row in tqdm(labels_with_tifs.iterrows()): instance = self.process_single_file(row) if instance is not None: x.append(instance.array) From cee6c20242a345885753ce0338931439428ba9fa Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 3 May 2022 08:03:09 -0400 Subject: [PATCH 31/38] Fix variable naming when creating test instances --- cropharvest/engineer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 0cbbe86..1f36285 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -606,7 +606,7 @@ def create_h5_test_instances( relevant_labels[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs(relevant_labels) tifs_found = relevant_labels[EngColumns.TIF_FILEPATHS].str.len() > 0 - labels_with_tifs = labels_with_tifs.loc[tifs_found] + labels_with_tifs = relevant_labels.loc[tifs_found] for _, row in tqdm(labels_with_tifs.iterrows()): instance = self.process_single_file(row) From 3698c15339576f63c34d64b3c770e4bce6cbe23a Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 3 May 2022 09:00:40 -0400 Subject: [PATCH 32/38] Only use the test data for Togo --- cropharvest/engineer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 1f36285..28c397f 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -594,15 +594,12 @@ def create_h5_test_instances( hf.create_dataset(key, data=val) hf.close() - for country, dataset in TEST_DATASETS.items(): + for _, dataset in TEST_DATASETS.items(): x: List[np.ndarray] = [] y: List[int] = [] lats: List[float] = [] lons: List[float] = [] - country_bboxes = get_country_bbox(country) - relevant_labels = pd.concat( - [filter_geojson(self.labels, box) for box in country_bboxes] - ) + relevant_labels = self.labels[self.labels[RequiredColumns.DATASET] == dataset] relevant_labels[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs(relevant_labels) tifs_found = relevant_labels[EngColumns.TIF_FILEPATHS].str.len() > 0 From ac39be3d33e6d90021bd22458215485280a47c36 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 3 May 2022 09:25:50 -0400 Subject: [PATCH 33/38] Use the new .h5 filenames in the datasets --- cropharvest/datasets.py | 34 ++++++++++++++++------------------ cropharvest/engineer.py | 4 ++-- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py index 85204d6..2bfa76a 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -23,8 +23,8 @@ TEST_REGIONS, TEST_DATASETS, ) -from cropharvest.columns import NullableColumns, RequiredColumns -from cropharvest.engineer import TestInstance +from cropharvest.columns import NullableColumns, RequiredColumns, EngColumns +from cropharvest.engineer import TestInstance, Engineer from cropharvest import countries from typing import cast, List, Optional, Tuple, Generator @@ -84,8 +84,13 @@ def __init__(self, root, download=False): super().__init__(root, download, filenames=(LABELS_FILENAME,)) # self._labels will always contain the original dataframe; - # the CropHarvestLabels class should not modify it - self._labels = read_geopandas(self.root / LABELS_FILENAME) + # the CropHarvestLabels class should not modify it. + # We use the Engineer load_labels since this also loads the .h5 + # filenames for each row + # TODO: The load_labels doesn't actually allow the root to be + # modified. We should probably do this at a package level, not + # at a class level + self._labels = Engineer.load_labels(root=root) def as_geojson(self) -> geopandas.GeoDataFrame: return self._labels @@ -128,15 +133,17 @@ def construct_positive_and_negative_labels( # then we can just collect all classes which either # 1) are crop, or 2) are a different non crop class (e.g. forest) negative_labels = gpdf[((is_null & is_crop) | (~is_null & ~is_target))] - negative_paths = self._dataframe_to_paths(negative_labels) + negative_paths = negative_labels[EngColumns.FEATURES_PATH].tolist() else: # otherwise, the target label is a crop. If balance_negative_crops is # true, then we want an equal number of (other) crops and non crops in # the negative labels negative_non_crop_labels = gpdf[~is_crop] negative_other_crop_labels = gpdf[(is_crop & ~is_null & ~is_target)] - negative_non_crop_paths = self._dataframe_to_paths(negative_non_crop_labels) - negative_paths = self._dataframe_to_paths(negative_other_crop_labels) + negative_paths = negative_other_crop_labels[EngColumns.FEATURES_PATH].tolist() + negative_non_crop_paths = negative_non_crop_labels[ + EngColumns.FEATURES_PATH + ].tolist() if task.balance_negative_crops: negative_paths.extend( @@ -150,26 +157,17 @@ def construct_positive_and_negative_labels( # otherwise, we will just filter by crop and non crop positive_labels = gpdf[is_crop] negative_labels = gpdf[~is_crop] - negative_paths = self._dataframe_to_paths(negative_labels) + negative_paths = negative_labels[EngColumns.FEATURES_PATH].tolist() except IndexError: raise NoDataForBoundingBoxError - positive_paths = self._dataframe_to_paths(positive_labels) + positive_paths = positive_paths[EngColumns.FEATURES_PATH].tolist() if (len(positive_paths) == 0) or (len(negative_paths) == 0): raise NoDataForBoundingBoxError return [x for x in positive_paths if x.exists()], [x for x in negative_paths if x.exists()] - def _path_from_row(self, row: geopandas.GeoSeries) -> Path: - return ( - self.root - / f"features/arrays/{row[RequiredColumns.INDEX]}_{row[RequiredColumns.DATASET]}.h5" - ) - - def _dataframe_to_paths(self, df: geopandas.GeoDataFrame) -> List[Path]: - return [self._path_from_row(row) for _, row in df.iterrows()] - class CropHarvestTifs(BaseDataset): def __init__(self, root, download=False): diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 28c397f..83d9d9a 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -187,8 +187,8 @@ def __init__(self) -> None: self.norm_interim: Dict[str, Union[np.ndarray, int]] = {"n": 0} @staticmethod - def load_labels() -> geopandas.GeoDataFrame: - labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME) + def load_labels(root=DATAFOLDER_PATH) -> geopandas.GeoDataFrame: + labels = geopandas.read_file(root / LABELS_FILENAME) labels[RequiredColumns.EXPORT_END_DATE] = pd.to_datetime( labels[RequiredColumns.EXPORT_END_DATE] ).dt.date From 26aacf562637b71c05b699dd6418ccd84706164b Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 3 May 2022 10:29:38 -0400 Subject: [PATCH 34/38] Fix variable names in the dataset --- cropharvest/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py index 2bfa76a..c82b88b 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -161,7 +161,7 @@ def construct_positive_and_negative_labels( except IndexError: raise NoDataForBoundingBoxError - positive_paths = positive_paths[EngColumns.FEATURES_PATH].tolist() + positive_paths = positive_labels[EngColumns.FEATURES_PATH].tolist() if (len(positive_paths) == 0) or (len(negative_paths) == 0): raise NoDataForBoundingBoxError From 84aa0221f9767800750205228513b308a5a4df0e Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 3 May 2022 10:42:58 -0400 Subject: [PATCH 35/38] Remove unused imports --- cropharvest/datasets.py | 1 - cropharvest/engineer.py | 1 - 2 files changed, 2 deletions(-) diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py index c82b88b..34f214a 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -9,7 +9,6 @@ from cropharvest.utils import ( download_and_extract_archive, deterministic_shuffle, - read_geopandas, load_normalizing_dict, sample_with_memory, NoDataForBoundingBoxError, diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 83d9d9a..cb202bd 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -16,7 +16,6 @@ from cropharvest.bands import STATIC_BANDS, DYNAMIC_BANDS from cropharvest.columns import RequiredColumns, NullableColumns, EngColumns -from cropharvest.countries import get_country_bbox from cropharvest.boundingbox import BBox from cropharvest.config import ( EXPORT_END_DAY, From 96625b47ac281323e0e926258900829bbbee278c Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Tue, 3 May 2022 12:56:00 -0400 Subject: [PATCH 36/38] Don't hardcode the seed in the CropHarvest dataset class --- cropharvest/datasets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py index 34f214a..87659e0 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -21,6 +21,7 @@ DEFAULT_SEED, TEST_REGIONS, TEST_DATASETS, + DEFAULT_SEED, ) from cropharvest.columns import NullableColumns, RequiredColumns, EngColumns from cropharvest.engineer import TestInstance, Engineer @@ -206,8 +207,8 @@ def __init__( if val_ratio > 0.0: # the fixed seed is to ensure the validation set is always # different from the training set - positive_paths = deterministic_shuffle(positive_paths, seed=42) - negative_paths = deterministic_shuffle(negative_paths, seed=42) + positive_paths = deterministic_shuffle(positive_paths, seed=DEFAULT_SEED) + negative_paths = deterministic_shuffle(negative_paths, seed=DEFAULT_SEED) if is_val: positive_paths = positive_paths[: int(len(positive_paths) * val_ratio)] negative_paths = negative_paths[: int(len(negative_paths) * val_ratio)] From eaa7d187868c4d0b24d66b5c485f457aa242f4fb Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 4 May 2022 08:18:47 -0400 Subject: [PATCH 37/38] Fix random forest name in the config --- benchmarks/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/config.py b/benchmarks/config.py index 4667d81..f97c656 100644 --- a/benchmarks/config.py +++ b/benchmarks/config.py @@ -11,7 +11,7 @@ # Model names -RANDOM_FOREST = "RF_GRID_SEARCH" +RANDOM_FOREST = "RF" DL_RANDOM = "DL_RANDOM" DL_PRETRAINED = "DL_PRETRAINED" DL_MAML = "DL_MAML" From ea3acfd18773739e3e1af60044387f0bbd635af0 Mon Sep 17 00:00:00 2001 From: Gabriel Tseng Date: Wed, 13 Jul 2022 15:29:55 +0100 Subject: [PATCH 38/38] Fix imports --- cropharvest/datasets.py | 1 - cropharvest/engineer.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py index 87659e0..3050897 100644 --- a/cropharvest/datasets.py +++ b/cropharvest/datasets.py @@ -18,7 +18,6 @@ FEATURES_DIR, TEST_FEATURES_DIR, LABELS_FILENAME, - DEFAULT_SEED, TEST_REGIONS, TEST_DATASETS, DEFAULT_SEED, diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py index 12a06aa..e49e8cb 100644 --- a/cropharvest/engineer.py +++ b/cropharvest/engineer.py @@ -32,7 +32,7 @@ ARRAYS_FILEPATH, TEST_FEATURES_FILEPATH, ) -from cropharvest.utils import load_normalizing_dict, filter_geojson +from cropharvest.utils import load_normalizing_dict from typing import cast, Optional, Dict, Union, Tuple, List, Sequence