From 818c3646beadc17c85e43b1a8b8a229198749c26 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 09:56:41 -0400
Subject: [PATCH 01/38] Script to update the naming convention of eo tif files

---
 scripts/20220418_renaming.py | 51 ++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 scripts/20220418_renaming.py
diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py
new file mode 100644
index 0000000..80a7825
--- /dev/null
+++ b/scripts/20220418_renaming.py
@@ -0,0 +1,51 @@
+"""
+A script to shift the naming convention of EO data
+from a <id>_<dataset> format to a <date>_<location>
+format.
+This decouples the EO data from the labels.geojson
+"""
+from pathlib import Path
+import shutil
+import geopandas
+
+from cropharvest.columns import RequiredColumns
+from cropharvest.eo.eo import EarthEngineExporter
+from cropharvest.eo.ee_boundingbox import EEBoundingBox
+from cropharvest.utils import DATAFOLDER_PATH
+
+
+SURROUNDING_METRES = 80
+
+
+def construct_new_name(labels: geopandas.GeoDataFrame, old_name: str) -> str:
+
+    identifier = old_name.split("_")[0]
+    relevant_rows = labels[labels["export_identifier"] == identifier]
+    assert len(relevant_rows) == 1
+    row = relevant_rows.iloc[0]
+
+    # make a bounding box
+    ee_bbox = EEBoundingBox.from_centre(
+        mid_lat=row[RequiredColumns.LAT],
+        mid_lon=row[RequiredColumns.LON],
+        surrounding_metres=SURROUNDING_METRES,
+    )
+    export_identifier = EarthEngineExporter.make_identifier(
+        ee_bbox, row["start_date"], row["end_date"]
+    )
+
+    return f"{export_identifier}.tif"
+
+
+def copy_and_rename_dataset(org_folder: Path, new_folder: Path):
+
+    original_tif_files = list(org_folder.glob("*.tif"))
+    labels = EarthEngineExporter.load_default_labels()
+
+    for tif_file in original_tif_files:
+        new_name = construct_new_name(labels, tif_file.name)
+        shutil.copy(tif_file, new_folder / new_name)
+
+
+if __name__ == "__main__":
+    copy_and_rename_dataset(DATAFOLDER_PATH / "eo_data", DATAFOLDER_PATH / "renamed_eo_data")

From e09fc93cbc127c2daa7705879d63319b5ae57c81 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 10:23:38 -0400
Subject: [PATCH 02/38] Make the output folder

---
 scripts/20220418_renaming.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py
index 80a7825..d4526b7 100644
--- a/scripts/20220418_renaming.py
+++ b/scripts/20220418_renaming.py
@@ -48,4 +48,6 @@ def copy_and_rename_dataset(org_folder: Path, new_folder: Path):
 
 
 if __name__ == "__main__":
-    copy_and_rename_dataset(DATAFOLDER_PATH / "eo_data", DATAFOLDER_PATH / "renamed_eo_data")
+    output_folder = DATAFOLDER_PATH / "renamed_eo_data"
+    output_folder.mkdir(exist_ok=True)
+    copy_and_rename_dataset(DATAFOLDER_PATH / "eo_data", output_folder)

From d8ce1a2dafb56da62a43ec858d4405f3d5a5c0f9 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 10:24:59 -0400
Subject: [PATCH 03/38] Track progress using tqdm

---
 scripts/20220418_renaming.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py
index d4526b7..a167871 100644
--- a/scripts/20220418_renaming.py
+++ b/scripts/20220418_renaming.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 import shutil
 import geopandas
+from tqdm import tqdm
 
 from cropharvest.columns import RequiredColumns
 from cropharvest.eo.eo import EarthEngineExporter
@@ -42,7 +43,7 @@ def copy_and_rename_dataset(org_folder: Path, new_folder: Path):
     original_tif_files = list(org_folder.glob("*.tif"))
     labels = EarthEngineExporter.load_default_labels()
 
-    for tif_file in original_tif_files:
+    for tif_file in tqdm(original_tif_files):
         new_name = construct_new_name(labels, tif_file.name)
         shutil.copy(tif_file, new_folder / new_name)
 

From 78f326bc99497a248f255a8674a4365c2f577a05 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 10:26:48 -0400
Subject: [PATCH 04/38] Add the right arguments to load_default_labels

---
 scripts/20220418_renaming.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py
index a167871..6a965f9 100644
--- a/scripts/20220418_renaming.py
+++ b/scripts/20220418_renaming.py
@@ -41,7 +41,9 @@ def construct_new_name(labels: geopandas.GeoDataFrame, old_name: str) -> str:
 def copy_and_rename_dataset(org_folder: Path, new_folder: Path):
 
     original_tif_files = list(org_folder.glob("*.tif"))
-    labels = EarthEngineExporter.load_default_labels()
+    labels = EarthEngineExporter.load_default_labels(
+        dataset=None, start_from_last=False, checkpoint=None
+    )
 
     for tif_file in tqdm(original_tif_files):
         new_name = construct_new_name(labels, tif_file.name)

From f27e94df0b158a33e768e20de0deed9de1dc2f4c Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 10:43:00 -0400
Subject: [PATCH 05/38] Remove conflicting column name

---
 cropharvest/columns.py                           | 2 +-
 cropharvest/eo/eo.py                             | 2 +-
 test/process_labels/test_datasets.py             | 4 ++--
 test/process_labels/test_process_labels_utils.py | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cropharvest/columns.py b/cropharvest/columns.py
index a04b282..852daab 100644
--- a/cropharvest/columns.py
+++ b/cropharvest/columns.py
@@ -18,7 +18,7 @@ def date_columns(cls) -> List[str]:
 
 class RequiredColumns(Columns):
 
-    INDEX = "index"
+    INDEX = "dataset_index"
     IS_CROP = "is_crop"
     LAT = "lat"
     LON = "lon"
diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py
index 2290373..a6461c2 100644
--- a/cropharvest/eo/eo.py
+++ b/cropharvest/eo/eo.py
@@ -144,7 +144,7 @@ def load_default_labels(
             - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS)
         )
         labels = labels.assign(
-            export_identifier=lambda x: f"{x['index']}-{x[RequiredColumns.DATASET]}"
+            export_identifier=lambda x: f"{x[RequiredColumns.INDEX]}-{x[RequiredColumns.DATASET]}"
         )
         if dataset:
             labels = labels[labels.dataset == dataset]
diff --git a/test/process_labels/test_datasets.py b/test/process_labels/test_datasets.py
index df190f2..8dfc8cd 100644
--- a/test/process_labels/test_datasets.py
+++ b/test/process_labels/test_datasets.py
@@ -17,7 +17,7 @@
 def _check_columns_and_types(df: geopandas.GeoDataFrame) -> None:
 
     for expected_column, expected_type in [
-        ("index", int),
+        ("dataset_index", int),
         ("lat", float),
         ("lon", float),
         ("is_crop", int),
@@ -50,7 +50,7 @@ def _check_export_end_date(df: geopandas.GeoDataFrame) -> None:
 
 
 def _check_index(df: geopandas.GeoDataFrame) -> None:
-    assert len(df["index"].unique()) == len(df)
+    assert len(df["dataset_index"].unique()) == len(df)
 
 
 def _check_labels(df: geopandas.GeoDataFrame) -> None:
diff --git a/test/process_labels/test_process_labels_utils.py b/test/process_labels/test_process_labels_utils.py
index 3a788c7..8d51b27 100644
--- a/test/process_labels/test_process_labels_utils.py
+++ b/test/process_labels/test_process_labels_utils.py
@@ -8,7 +8,7 @@ def test_is_test_column():
     labels = geopandas.GeoDataFrame(
         data={
             "dataset": ["togo-eval", "geowiki", "geowiki"],
-            "index": [1, 2, 3],
+            "dataset_index": [1, 2, 3],
             "lat": [7.5817201079726511, -12.17, 1.11],
             "lon": [1.3954393874414535, -45.8, 8.29],
         }

From 674944cf73d915e348fb33b4753f202c40adb19a Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 11:46:28 -0400
Subject: [PATCH 06/38] Replace 'index' string with RequiredColumns.INDEX

---
 process_labels/loading_funcs/central_asia.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/process_labels/loading_funcs/central_asia.py b/process_labels/loading_funcs/central_asia.py
index f7456ca..8c88f26 100644
--- a/process_labels/loading_funcs/central_asia.py
+++ b/process_labels/loading_funcs/central_asia.py
@@ -65,7 +65,7 @@ def make_harvest_date(row) -> datetime:
     # two manual changes to replace multipolygons with polygons.
     # the first polygon is 10^5 times smaller than the second, so
     # we use the second
-    df.loc[df["index"] == 5162, "geometry"] = df.iloc[5162].geometry[1]
-    df.loc[df["index"] == 4049, "geometry"] = df.iloc[4049].geometry[1]
+    df.loc[df[RequiredColumns.INDEX] == 5162, "geometry"] = df.iloc[5162].geometry[1]
+    df.loc[df[RequiredColumns.INDEX] == 4049, "geometry"] = df.iloc[4049].geometry[1]
 
     return df

From 3f529e1c563126a03bf10ae307cc8e8c79c5d4a3 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 11:50:49 -0400
Subject: [PATCH 07/38] Replace 'index' string with RequiredColumns.INDEX

---
 process_labels/loading_funcs/kenya.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/process_labels/loading_funcs/kenya.py b/process_labels/loading_funcs/kenya.py
index 26b3619..d0b57b1 100644
--- a/process_labels/loading_funcs/kenya.py
+++ b/process_labels/loading_funcs/kenya.py
@@ -72,7 +72,7 @@ def load_kenya():
 
     df = pd.concat(dfs)
     df = df.reset_index(drop=True)
-    df["index"] = df.index
+    df[RequiredColumns.INDEX] = df.index
     return df
 
 

From ddaab9c019a49e20ba052b6603e44fbdd4f7ba58 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 19 Apr 2022 12:04:42 -0400
Subject: [PATCH 08/38] Fix assignment of export_identifier when loading
 default labels

---
 cropharvest/eo/eo.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py
index a6461c2..4c2e210 100644
--- a/cropharvest/eo/eo.py
+++ b/cropharvest/eo/eo.py
@@ -144,7 +144,9 @@ def load_default_labels(
             - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS)
         )
         labels = labels.assign(
-            export_identifier=lambda x: f"{x[RequiredColumns.INDEX]}-{x[RequiredColumns.DATASET]}"
+            export_identifier=labels[RequiredColumns.INDEX].map(str)
+            + "-"
+            + labels[RequiredColumns.DATASET]
         )
         if dataset:
             labels = labels[labels.dataset == dataset]

From 171bb73a139199d6c95749a37e82be1bda59e0a9 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 09:11:57 -0400
Subject: [PATCH 09/38] Add a script to test the renaming has happened
 correctly

---
 scripts/20220420_check_renaming.py | 53 ++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 scripts/20220420_check_renaming.py

diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py
new file mode 100644
index 0000000..6c31541
--- /dev/null
+++ b/scripts/20220420_check_renaming.py
@@ -0,0 +1,53 @@
+"""
+After 20220418_renaming.py was run,
+this script was used to check that
+the renaming happened correctly
+"""
+from pathlib import Path
+import xarray as xr
+import re
+import numpy as np
+from random import shuffle
+from tqdm import tqdm
+
+from cropharvest.countries import BBox
+from cropharvest.utils import DATAFOLDER_PATH
+
+
+def _bbox_from_filepath(p: Path) -> BBox:
+    """
+    https://github.com/nasaharvest/crop-mask/blob/master/src/ETL/boundingbox.py#L24
+    """
+    decimals_in_p = re.findall(r"=-?\d*\.?\d*", p.stem)
+    coords = [float(d[1:]) for d in decimals_in_p[0:4]]
+    return BBox(min_lat=coords[0], min_lon=coords[1], max_lat=coords[2], max_lon=coords[3])
+
+
+def isin(x: np.ndarray, val: float) -> bool:
+    return (val >= x.min()) & (val <= x.max())
+
+
+def check_file(path: Path) -> None:
+
+    # extract expected info from the path
+    bbox = _bbox_from_filepath(path)
+    tif_file = xr.open_rasterio(path)
+
+    # x is lon, y is lat
+    lat, lon = bbox.get_centre(in_radians=False)
+    x, y = tif_file.x.values, tif_file.y.values
+
+    assert isin(lon, x) & isin(lat, y)
+
+
+def main(renamed_path: Path, num_to_check: int = 1000):
+
+    all_files = list(renamed_path.glob("*.tif"))
+    shuffle(all_files)
+
+    for path_to_check in tqdm(all_files[:num_to_check]):
+        check_file(path_to_check)
+
+
+if __name__ == "__main__":
+    main(Path(DATAFOLDER_PATH / "renamed_eo_data"))

From 50bf35e7cab02d6d2292ca199867811df6739fc2 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 09:13:07 -0400
Subject: [PATCH 10/38] Correct argument ordering in the check_renaming script

---
 scripts/20220420_check_renaming.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py
index 6c31541..e3b66a1 100644
--- a/scripts/20220420_check_renaming.py
+++ b/scripts/20220420_check_renaming.py
@@ -37,7 +37,7 @@ def check_file(path: Path) -> None:
     lat, lon = bbox.get_centre(in_radians=False)
     x, y = tif_file.x.values, tif_file.y.values
 
-    assert isin(lon, x) & isin(lat, y)
+    assert isin(x, lon) & isin(y, lat)
 
 
 def main(renamed_path: Path, num_to_check: int = 1000):

From 52b0cc38cabc0a631d43c8bf73da34b0ef79fe1d Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 09:14:31 -0400
Subject: [PATCH 11/38] Add information to the assert statement

---
 scripts/20220420_check_renaming.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py
index e3b66a1..a183fdb 100644
--- a/scripts/20220420_check_renaming.py
+++ b/scripts/20220420_check_renaming.py
@@ -37,7 +37,7 @@ def check_file(path: Path) -> None:
     lat, lon = bbox.get_centre(in_radians=False)
     x, y = tif_file.x.values, tif_file.y.values
 
-    assert isin(x, lon) & isin(y, lat)
+    assert isin(x, lon) & isin(y, lat), f"{path} failed with {x}, {y} and {lat}, {lon}"
 
 
 def main(renamed_path: Path, num_to_check: int = 1000):

From eff87864f2ddd3d733bcf4d861056a7c55952d11 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 09:18:08 -0400
Subject: [PATCH 12/38] Keep track of failures

---
 scripts/20220420_check_renaming.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py
index a183fdb..635bd2a 100644
--- a/scripts/20220420_check_renaming.py
+++ b/scripts/20220420_check_renaming.py
@@ -45,8 +45,15 @@ def main(renamed_path: Path, num_to_check: int = 1000):
     all_files = list(renamed_path.glob("*.tif"))
     shuffle(all_files)
 
+    failed = 0
     for path_to_check in tqdm(all_files[:num_to_check]):
-        check_file(path_to_check)
+        try:
+            check_file(path_to_check)
+        except AssertionError as e:
+            print(e)
+            failed += 1
+
+    print(f"{failed} files failed check out of {num_to_check}")
 
 
 if __name__ == "__main__":

From 7eff7d432e2ff87bcb8d1080e27eecbbb5fba0da Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 09:20:34 -0400
Subject: [PATCH 13/38] Final addition to check_renaming docstring

---
 scripts/20220420_check_renaming.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py
index 635bd2a..471d2cb 100644
--- a/scripts/20220420_check_renaming.py
+++ b/scripts/20220420_check_renaming.py
@@ -2,6 +2,12 @@
 After 20220418_renaming.py was run,
 this script was used to check that
 the renaming happened correctly
+
+From 3 runs, this had a failure rate of 8, 10 and 6
+(out of 1000), which is <=1%. In all cases, I looked at
+the failures and the corresponding latitudes and longitudes
+were just outside of the margins, which I am happy to attribute
+to rounding errors.
 """
 from pathlib import Path
 import xarray as xr

From d2f737e6e286cd0f8bc55dbf6024d7852ba7b780 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 10:43:29 -0400
Subject: [PATCH 14/38] Correctly calculate the export end date for the default
 labels

---
 cropharvest/eo/eo.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py
index 4c2e210..5330e3e 100644
--- a/cropharvest/eo/eo.py
+++ b/cropharvest/eo/eo.py
@@ -137,8 +137,7 @@ def load_default_labels(
         dataset: Optional[str], start_from_last, checkpoint: Optional[Path]
     ) -> geopandas.GeoDataFrame:
         labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME)
-        export_end_year = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE]).dt.year
-        labels["end_date"] = export_end_year.apply(lambda x: date(x, 12, 12))
+        labels["end_date"] = labels[RequiredColumns.EXPORT_END_DATE]
         labels = labels.assign(
             start_date=lambda x: x["end_date"]
             - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS)

From db9148737d78e7084349ef00cbdb6a4529e6ace2 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 10:46:00 -0400
Subject: [PATCH 15/38] Keep the end_date as a datetime, not a string

---
 cropharvest/eo/eo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py
index 5330e3e..098e7b1 100644
--- a/cropharvest/eo/eo.py
+++ b/cropharvest/eo/eo.py
@@ -137,7 +137,7 @@ def load_default_labels(
         dataset: Optional[str], start_from_last, checkpoint: Optional[Path]
     ) -> geopandas.GeoDataFrame:
         labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME)
-        labels["end_date"] = labels[RequiredColumns.EXPORT_END_DATE]
+        labels["end_date"] = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE])
         labels = labels.assign(
             start_date=lambda x: x["end_date"]
             - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS)

From 429baf6956bb33b082cc8f4ae2c50f93444e5818 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 10:48:21 -0400
Subject: [PATCH 16/38] Store end_date as a date, not a datetime

---
 cropharvest/eo/eo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py
index 098e7b1..c1fe916 100644
--- a/cropharvest/eo/eo.py
+++ b/cropharvest/eo/eo.py
@@ -137,7 +137,7 @@ def load_default_labels(
         dataset: Optional[str], start_from_last, checkpoint: Optional[Path]
     ) -> geopandas.GeoDataFrame:
         labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME)
-        labels["end_date"] = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE])
+        labels["end_date"] = pd.to_datetime(labels[RequiredColumns.EXPORT_END_DATE]).dt.date
         labels = labels.assign(
             start_date=lambda x: x["end_date"]
             - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS)

From dd9a3d203798a9015f0f6c2797ce584311407743 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 11:03:06 -0400
Subject: [PATCH 17/38] [WIP] match tifs to labels based on lat/lon instead of
 dataset/idx

---
 cropharvest/engineer.py | 61 +++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index e183da3..511a0f8 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, date
 import geopandas
 from dataclasses import dataclass
 import numpy as np
@@ -10,6 +10,7 @@
 from tqdm import tqdm
 import warnings
 import h5py
+import re
 
 from sklearn.metrics import roc_auc_score, f1_score
 
@@ -43,6 +44,7 @@ class DataInstance:
     instance_lon: float
     array: np.ndarray
     is_crop: int
+    year: int
     label: Optional[str] = None
 
     @property
@@ -172,7 +174,7 @@ def __init__(self, data_folder: Path = DATAFOLDER_PATH) -> None:
         self.test_eo_files = data_folder / "test_eo_data"
 
         self.labels = geopandas.read_file(data_folder / LABELS_FILENAME)
-        self.labels["export_end_date"] = pd.to_datetime(self.labels.export_end_date)
+        self.labels["export_end_date"] = pd.to_datetime(self.labels.export_end_date).dt.date
 
         self.savedir = data_folder / "features"
         self.savedir.mkdir(exist_ok=True)
@@ -201,7 +203,7 @@ def process_filename(filename: str) -> Tuple[int, str]:
 
     @staticmethod
     def load_tif(
-        filepath: Path, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS
+        ds: xr.Dataset, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS
     ) -> Tuple[xr.DataArray, float]:
         r"""
         The sentinel files exported from google earth have all the timesteps
@@ -210,9 +212,7 @@ def load_tif(
 
         Returns: The loaded xr.DataArray, and the average slope (used for filling nan slopes)
         """
-
-        da = xr.open_rasterio(filepath).rename("FEATURES")
-
+        da = da.rename("FEATURES")
         da_split_by_time: List[xr.DataArray] = []
 
         bands_per_timestep = len(DYNAMIC_BANDS)
@@ -400,7 +400,7 @@ def remove_bands(array: np.ndarray) -> np.ndarray:
     def process_test_file(
         path_to_file: Path, start_date: datetime
     ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        da, slope = Engineer.load_tif(path_to_file, start_date=start_date)
+        da, slope = Engineer.load_tif(xr.open_rasterio(path_to_file), start_date=start_date)
 
         # Process remote sensing data
         x_np = da.values
@@ -473,12 +473,35 @@ def process_test_file_with_region(
 
         return identifier_plus_idx, TestInstance(x=final_x, y=y, lats=flat_lat, lons=flat_lon)
 
+    @staticmethod
+    def _year_from_filepath(p: Path) -> date:
+        dates_in_p = re.findall(r"(\d+-\d+-\d+)", p.stem)
+        end_date = dates_in_p[-1].split("-")
+        return date(end_date[0], end_date[1], end_date[2])
+
+    def find_row_from_path(
+        self, lat: np.ndarray, lon: np.ndarray, export_end_date: date
+    ) -> pd.Series:
+        relevant_labels = self.labels[
+            (
+                (self.labels[RequiredColumns.LAT] >= lat.min())
+                & (self.labels[RequiredColumns.LAT] <= lat.max())
+                & (self.labels[RequiredColumns.LON] <= lon.max())
+                & (self.labels[RequiredColumns.LON] >= lon.min())
+                & (self.labels[RequiredColumns.EXPORT_END_DATE] == export_end_date)
+            )
+        ]
+        # TODO - check for the most central row if there is more than 1 row
+        return relevant_labels.iloc[0]
+
     def process_single_file(
         self,
         path_to_file: Path,
-        row: pd.Series,
         num_timesteps: int = DEFAULT_NUM_TIMESTEPS,
     ) -> Optional[DataInstance]:
+        ds = xr.open_rasterio(path_to_file)
+        year = self._year_from_filepath(path_to_file)
+        row = self.find_row_from_path(ds.y, ds.x, year)
         start_date = row.export_end_date - timedelta(days=num_timesteps * DAYS_PER_TIMESTEP)
         da, average_slope = self.load_tif(path_to_file, start_date=start_date)
         closest_lon = self.find_nearest(da.x, row[RequiredColumns.LON])
@@ -505,6 +528,7 @@ def process_single_file(
             is_crop=row[RequiredColumns.IS_CROP],
             label=row[NullableColumns.LABEL],
             dataset=row[RequiredColumns.DATASET],
+            year=start_date.year,
         )
 
     def create_h5_test_instances(
@@ -572,23 +596,12 @@ def create_h5_dataset(self, checkpoint: bool = True) -> None:
         skipped_files: int = 0
         num_new_files: int = 0
         for file_path in tqdm(list(self.eo_files.glob("*.tif"))):
-            file_index, dataset = self.process_filename(file_path.name)
-            file_name = f"{file_index}_{dataset}.h5"
-            if (checkpoint) & ((arrays_dir / file_name).exists()):
-                # we check if the file has already been written
-                continue
-
-            file_row = self.labels[
-                (
-                    (self.labels[RequiredColumns.DATASET] == dataset)
-                    & (self.labels[RequiredColumns.INDEX] == file_index)
-                )
-            ].iloc[0]
-
-            instance = self.process_single_file(file_path, row=file_row)
+            instance = self.process_single_file(file_path)
             if instance is not None:
-
-                hf = h5py.File(arrays_dir / file_name, "w")
+                filename = (
+                    f"lat={instance.label_lat}_lon={instance.label_lon}_year={instance.year}.h5"
+                )
+                hf = h5py.File(arrays_dir / filename, "w")
                 hf.create_dataset("array", data=instance.array)
 
                 for key, val in instance.attrs.items():

From 30c70514dc78468409e67fea9e68a759b72da302 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 20 Apr 2022 15:00:20 -0400
Subject: [PATCH 18/38] Update Engineer for labelled tifs

---
 cropharvest/boundingbox.py       |  77 ++++++++++++++
 cropharvest/columns.py           |  12 +++
 cropharvest/config.py            |  12 ++-
 cropharvest/countries.py         |  69 +------------
 cropharvest/engineer.py          | 167 ++++++++++++++++++++-----------
 cropharvest/eo/ee_boundingbox.py |   2 +-
 cropharvest/eo/eo.py             |   9 +-
 cropharvest/utils.py             |   2 -
 8 files changed, 217 insertions(+), 133 deletions(-)
 create mode 100644 cropharvest/boundingbox.py

diff --git a/cropharvest/boundingbox.py b/cropharvest/boundingbox.py
new file mode 100644
index 0000000..3c972a3
--- /dev/null
+++ b/cropharvest/boundingbox.py
@@ -0,0 +1,77 @@
+from dataclasses import dataclass
+from pathlib import Path
+from shapely.geometry import Polygon
+from math import sin, cos, radians
+from typing import List, Tuple
+import re
+
+from typing import Optional
+
+
+@dataclass
+class BBox:
+
+    min_lat: float
+    max_lat: float
+    min_lon: float
+    max_lon: float
+
+    name: Optional[str] = None
+
+    def __post_init__(self):
+        if self.max_lon < self.min_lon:
+            raise ValueError("max_lon should be larger than min_lon")
+        if self.max_lat < self.min_lat:
+            raise ValueError("max_lat should be larger than min_lat")
+
+        self.url = (
+            f"http://bboxfinder.com/#{self.min_lat},{self.min_lon},{self.max_lat},{self.max_lon}"
+        )
+
+    def contains(self, lat: float, lon: float) -> bool:
+        return (
+            (lat >= self.min_lat)
+            & (lat <= self.max_lat)
+            & (lon >= self.min_lon)
+            & (lon <= self.max_lon)
+        )
+
+    def contains_bbox(self, bbox: "BBox") -> bool:
+        return (
+            (bbox.min_lat >= self.min_lat)
+            & (bbox.max_lat <= self.max_lat)
+            & (bbox.min_lon >= self.min_lon)
+            & (bbox.max_lon <= self.max_lon)
+        )
+
+    @property
+    def three_dimensional_points(self) -> List[float]:
+        r"""
+        If we are passing the central latitude and longitude to
+        an ML model, we want it to know the extremes are close together.
+        Mapping them to 3d space allows us to do that
+        """
+        lat, lon = self.get_centre(in_radians=True)
+        return [cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)]
+
+    def get_centre(self, in_radians: bool = True) -> Tuple[float, float]:
+
+        # roughly calculate the centres
+        lat = self.min_lat + ((self.max_lat - self.min_lat) / 2)
+        lon = self.min_lon + ((self.max_lon - self.min_lon) / 2)
+        if in_radians:
+            return radians(lat), radians(lon)
+        else:
+            return lat, lon
+
+    @classmethod
+    def polygon_to_bbox(cls, polygon: Polygon, name: Optional[str] = None):
+        (min_lon, min_lat, max_lon, max_lat) = polygon.bounds
+        return cls(min_lat, max_lat, min_lon, max_lon, name)
+
+    @classmethod
+    def from_eo_tif_file(cls, path: Path) -> "BBox":
+        decimals_in_p = re.findall(r"=-?\d*\.?\d*", path.stem)
+        coords = [float(d[1:]) for d in decimals_in_p[0:4]]
+        bbox = cls(min_lat=coords[0], min_lon=coords[1], max_lat=coords[2], max_lon=coords[3])
+        return bbox
diff --git a/cropharvest/columns.py b/cropharvest/columns.py
index 852daab..2c4488e 100644
--- a/cropharvest/columns.py
+++ b/cropharvest/columns.py
@@ -42,3 +42,15 @@ class NullableColumns(Columns):
     @classmethod
     def date_columns(cls) -> List[str]:
         return [cls.HARVEST_DATE, cls.PLANTING_DATE]
+
+
+class EngColumns:
+    """
+    Some columns uniquely created & used by the labels
+    as loaded by the Engineer
+    """
+
+    FEATURES_FILENAME = "features_filename"
+    FEATURES_PATH = "features_path"
+    EXISTS = "feature_exists"
+    TIF_FILEPATH = "tif_path"
diff --git a/cropharvest/config.py b/cropharvest/config.py
index 65261d0..587616b 100644
--- a/cropharvest/config.py
+++ b/cropharvest/config.py
@@ -1,4 +1,6 @@
-from .countries import BBox
+from pathlib import Path
+
+from .boundingbox import BBox
 
 from typing import Dict
 
@@ -22,6 +24,14 @@
 FEATURES_DIR = "features"
 TEST_FEATURES_DIR = "test_features"
 
+# These values describe the structure of the data folder
+DATAFOLDER_PATH = Path(__file__).parent.parent / "data"
+EO_FILEPATH = DATAFOLDER_PATH / "eo_data"
+TEST_EO_FILEPATH = DATAFOLDER_PATH / "test_eo_data"
+FEATURES_FILEPATH = DATAFOLDER_PATH / FEATURES_DIR
+ARRAYS_FILEPATH = FEATURES_FILEPATH / "arrays"
+TEST_FEATURES_FILEPATH = DATAFOLDER_PATH / TEST_FEATURES_DIR
+
 # the default seed is useful because it also seeds the deterministic
 # shuffling algorithm we use (in cropharvest.utils.deterministic_shuffle)
 # so fixing this ensures the evaluation sets consist of the same data no matter
diff --git a/cropharvest/countries.py b/cropharvest/countries.py
index b053ac8..777b301 100644
--- a/cropharvest/countries.py
+++ b/cropharvest/countries.py
@@ -1,75 +1,12 @@
-from dataclasses import dataclass
 import geopandas
 from shapely.geometry import Polygon, MultiPolygon
-from math import sin, cos, radians
-from typing import List, Tuple
+from typing import List
 from pathlib import Path
 
-from typing import Optional
+from cropharvest.boundingbox import BBox
 
-COUNTRY_SHAPEFILE = geopandas.read_file(str(Path(__file__).parent / "country_shapefile"))
-
-
-@dataclass
-class BBox:
-
-    min_lat: float
-    max_lat: float
-    min_lon: float
-    max_lon: float
-
-    name: Optional[str] = None
-
-    def __post_init__(self):
-        if self.max_lon < self.min_lon:
-            raise ValueError("max_lon should be larger than min_lon")
-        if self.max_lat < self.min_lat:
-            raise ValueError("max_lat should be larger than min_lat")
-
-        self.url = (
-            f"http://bboxfinder.com/#{self.min_lat},{self.min_lon},{self.max_lat},{self.max_lon}"
-        )
 
-    def contains(self, lat: float, lon: float) -> bool:
-        return (
-            (lat >= self.min_lat)
-            & (lat <= self.max_lat)
-            & (lon >= self.min_lon)
-            & (lon <= self.max_lon)
-        )
-
-    def contains_bbox(self, bbox) -> bool:
-        return (
-            (bbox.min_lat >= self.min_lat)
-            & (bbox.max_lat <= self.max_lat)
-            & (bbox.min_lon >= self.min_lon)
-            & (bbox.max_lon <= self.max_lon)
-        )
-
-    @property
-    def three_dimensional_points(self) -> List[float]:
-        r"""
-        If we are passing the central latitude and longitude to
-        an ML model, we want it to know the extremes are close together.
-        Mapping them to 3d space allows us to do that
-        """
-        lat, lon = self.get_centre(in_radians=True)
-        return [cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)]
-
-    def get_centre(self, in_radians: bool = True) -> Tuple[float, float]:
-
-        # roughly calculate the centres
-        lat = self.min_lat + ((self.max_lat - self.min_lat) / 2)
-        lon = self.min_lon + ((self.max_lon - self.min_lon) / 2)
-        if in_radians:
-            return radians(lat), radians(lon)
-        else:
-            return lat, lon
-
-    @classmethod
-    def polygon_to_bbox(cls, polygon: Polygon, name: Optional[str] = None):
-        (min_lon, min_lat, max_lon, max_lat) = polygon.bounds
-        return cls(min_lat, max_lat, min_lon, max_lon, name)
+COUNTRY_SHAPEFILE = geopandas.read_file(str(Path(__file__).parent / "country_shapefile"))
 
 
 def get_country_bbox(country_name: str) -> List[BBox]:
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 511a0f8..7aa1506 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -15,8 +15,9 @@
 from sklearn.metrics import roc_auc_score, f1_score
 
 from cropharvest.bands import STATIC_BANDS, DYNAMIC_BANDS
-from cropharvest.columns import RequiredColumns, NullableColumns
-from .config import (
+from cropharvest.columns import RequiredColumns, NullableColumns, EngColumns
+from cropharvest.boundingbox import BBox
+from cropharvest.config import (
     EXPORT_END_DAY,
     EXPORT_END_MONTH,
     LABELS_FILENAME,
@@ -24,8 +25,14 @@
     DEFAULT_NUM_TIMESTEPS,
     TEST_REGIONS,
     TEST_DATASETS,
+    DATAFOLDER_PATH,
+    FEATURES_FILEPATH,
+    EO_FILEPATH,
+    TEST_EO_FILEPATH,
+    ARRAYS_FILEPATH,
+    TEST_FEATURES_FILEPATH,
 )
-from .utils import DATAFOLDER_PATH, load_normalizing_dict
+from cropharvest.utils import load_normalizing_dict
 
 from typing import cast, Optional, Dict, Union, Tuple, List, Sequence
 
@@ -168,22 +175,37 @@ def __len__(self) -> int:
 
 
 class Engineer:
-    def __init__(self, data_folder: Path = DATAFOLDER_PATH) -> None:
-        self.data_folder = data_folder
-        self.eo_files = data_folder / "eo_data"
-        self.test_eo_files = data_folder / "test_eo_data"
+    def __init__(self) -> None:
 
-        self.labels = geopandas.read_file(data_folder / LABELS_FILENAME)
-        self.labels["export_end_date"] = pd.to_datetime(self.labels.export_end_date).dt.date
-
-        self.savedir = data_folder / "features"
-        self.savedir.mkdir(exist_ok=True)
-
-        self.test_savedir = data_folder / "test_features"
-        self.test_savedir.mkdir(exist_ok=True)
+        self.labels = self.load_labels()
+        FEATURES_FILEPATH.mkdir(exist_ok=True)
+        ARRAYS_FILEPATH.mkdir(exist_ok=True)
+        TEST_FEATURES_FILEPATH.mkdir(exist_ok=True)
 
         self.norm_interim: Dict[str, Union[np.ndarray, int]] = {"n": 0}
 
+    @staticmethod
+    def load_labels() -> geopandas.GeoDataFrame:
+        labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME)
+        labels[RequiredColumns.EXPORT_END_DATE] = pd.to_datetime(
+            labels[RequiredColumns.EXPORT_END_DATE]
+        ).dt.date
+        labels[EngColumns.FEATURES_FILENAME] = (
+            "lat="
+            + labels[RequiredColumns.LAT].round(8).astype(str)
+            + "_lon="
+            + labels[RequiredColumns.LON].round(8).astype(str)
+            + "_date="
+            + labels[RequiredColumns.EXPORT_END_DATE].astype(str)
+        )
+        labels[EngColumns.FEATURES_PATH] = (
+            str(ARRAYS_FILEPATH) + labels[EngColumns.FEATURES_FILENAME]
+        )
+        labels[EngColumns.EXISTS] = np.vectorize(lambda p: Path(p).exists())(
+            labels[EngColumns.FEATURES_PATH]
+        )
+        return labels
+
     @staticmethod
     def find_nearest(array, value: float) -> float:
         array = np.asarray(array)
@@ -203,7 +225,7 @@ def process_filename(filename: str) -> Tuple[int, str]:
 
     @staticmethod
     def load_tif(
-        ds: xr.Dataset, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS
+        filepath: Path, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS
     ) -> Tuple[xr.DataArray, float]:
         r"""
         The sentinel files exported from google earth have all the timesteps
@@ -212,7 +234,7 @@ def load_tif(
 
         Returns: The loaded xr.DataArray, and the average slope (used for filling nan slopes)
         """
-        da = da.rename("FEATURES")
+        da = xr.open_rasterio(filepath).rename("FEATURES")
         da_split_by_time: List[xr.DataArray] = []
 
         bands_per_timestep = len(DYNAMIC_BANDS)
@@ -473,37 +495,15 @@ def process_test_file_with_region(
 
         return identifier_plus_idx, TestInstance(x=final_x, y=y, lats=flat_lat, lons=flat_lon)
 
-    @staticmethod
-    def _year_from_filepath(p: Path) -> date:
-        dates_in_p = re.findall(r"(\d+-\d+-\d+)", p.stem)
-        end_date = dates_in_p[-1].split("-")
-        return date(end_date[0], end_date[1], end_date[2])
-
-    def find_row_from_path(
-        self, lat: np.ndarray, lon: np.ndarray, export_end_date: date
-    ) -> pd.Series:
-        relevant_labels = self.labels[
-            (
-                (self.labels[RequiredColumns.LAT] >= lat.min())
-                & (self.labels[RequiredColumns.LAT] <= lat.max())
-                & (self.labels[RequiredColumns.LON] <= lon.max())
-                & (self.labels[RequiredColumns.LON] >= lon.min())
-                & (self.labels[RequiredColumns.EXPORT_END_DATE] == export_end_date)
-            )
-        ]
-        # TODO - check for the most central row if there is more than 1 row
-        return relevant_labels.iloc[0]
-
     def process_single_file(
         self,
-        path_to_file: Path,
+        row: pd.Series,
         num_timesteps: int = DEFAULT_NUM_TIMESTEPS,
     ) -> Optional[DataInstance]:
-        ds = xr.open_rasterio(path_to_file)
-        year = self._year_from_filepath(path_to_file)
-        row = self.find_row_from_path(ds.y, ds.x, year)
-        start_date = row.export_end_date - timedelta(days=num_timesteps * DAYS_PER_TIMESTEP)
-        da, average_slope = self.load_tif(path_to_file, start_date=start_date)
+        start_date = row[RequiredColumns.EXPORT_END_DATE] - timedelta(
+            days=num_timesteps * DAYS_PER_TIMESTEP
+        )
+        da, average_slope = self.load_tif(row[EngColumns.TIF_FILEPATH], start_date=start_date)
         closest_lon = self.find_nearest(da.x, row[RequiredColumns.LON])
         closest_lat = self.find_nearest(da.y, row[RequiredColumns.LAT])
 
@@ -535,7 +535,7 @@ def create_h5_test_instances(
         self,
     ) -> None:
         for region_identifier, _ in TEST_REGIONS.items():
-            all_region_files = list(self.test_eo_files.glob(f"{region_identifier}*.tif"))
+            all_region_files = list(TEST_EO_FILEPATH.glob(f"{region_identifier}*.tif"))
             if len(all_region_files) == 0:
                 print(f"No downloaded files for {region_identifier}")
                 continue
@@ -544,7 +544,7 @@ def create_h5_test_instances(
                     filepath, region_idx
                 )
                 if test_instance is not None:
-                    hf = h5py.File(self.test_savedir / f"{instance_name}.h5", "w")
+                    hf = h5py.File(TEST_FEATURES_FILEPATH / f"{instance_name}.h5", "w")
 
                     for key, val in test_instance.datasets.items():
                         hf.create_dataset(key, data=val)
@@ -581,27 +581,74 @@ def create_h5_test_instances(
                 hf.create_dataset(key, data=val)
             hf.close()
 
-    def create_h5_dataset(self, checkpoint: bool = True) -> None:
-        arrays_dir = self.savedir / "arrays"
-        arrays_dir.mkdir(exist_ok=True)
+    @staticmethod
+    def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]:
+        return {
+            p: BBox.from_path(p)
+            for p in tqdm(filepath.glob("**/*.tif"), desc="Generating BoundingBoxes from paths")
+        }
+
+    @staticmethod
+    def get_tif_paths(path_to_bbox, lat, lon, end_date, pbar):
+        candidate_paths = []
+        for p, bbox in path_to_bbox.items():
+            if bbox.contains(lat, lon) and f"dates=*_{end_date}" in p.stem:
+                candidate_paths.append(p)
+        pbar.update(1)
+        return candidate_paths
+
+    @classmethod
+    def match_labels_to_tifs(cls, labels: geopandas.GeoDataFrame) -> pd.Series:
+        bbox_for_labels = BBox(
+            min_lon=labels[RequiredColumns.LON].min(),
+            min_lat=labels[RequiredColumns.LAT].min(),
+            max_lon=labels[RequiredColumns.LON].max(),
+            max_lat=labels[RequiredColumns.LAT].max(),
+        )
+        # Get all tif paths and bboxes
+        path_to_bbox = {
+            p: bbox
+            for p, bbox in cls.generate_bbox_from_paths(EO_FILEPATH).items()
+            if bbox_for_labels.contains_bbox(bbox)
+        }
+
+        # Match labels to tif files
+        # Faster than going through bboxes
+        with tqdm(total=len(labels), desc="Matching labels to tif paths") as pbar:
+            tif_paths = np.vectorize(cls.get_tif_paths, otypes=[np.ndarray])(
+                path_to_bbox,
+                labels[RequiredColumns.LAT],
+                labels[RequiredColumns.LON],
+                labels[RequiredColumns.EXPORT_END_DATE],
+                pbar,
+            )
+        return tif_paths
+
+    def create_h5_dataset(self) -> None:
 
         old_normalizing_dict: Optional[Tuple[int, Optional[Dict[str, np.ndarray]]]] = None
-        if checkpoint:
-            # check for an already existing normalizing dict
-            if (self.savedir / "normalizing_dict.h5").exists():
-                old_nd = load_normalizing_dict(self.savedir / "normalizing_dict.hf")
-                num_existing_files = len(list(arrays_dir.glob("*")))
-                old_normalizing_dict = (num_existing_files, old_nd)
+        # check for an already existing normalizing dict
+        if (FEATURES_FILEPATH / "normalizing_dict.h5").exists():
+            old_nd = load_normalizing_dict(FEATURES_FILEPATH / "normalizing_dict.hf")
+            num_existing_files = len(list(ARRAYS_FILEPATH.glob("*")))
+            old_normalizing_dict = (num_existing_files, old_nd)
+
+        labels_with_no_features = self.labels[~self.labels[EngColumns.EXISTS]].copy()
+        labels_with_no_features[EngColumns.TIF_FILEPATH] = self.match_labels_to_tifs(
+            labels_with_no_features
+        )
+        tifs_found = labels_with_no_features[EngColumns.TIF_FILEPATH].str.len() > 0
+        labels_with_tifs_but_no_features = labels_with_no_features.loc[tifs_found]
 
         skipped_files: int = 0
         num_new_files: int = 0
-        for file_path in tqdm(list(self.eo_files.glob("*.tif"))):
-            instance = self.process_single_file(file_path)
+        for _, row in tqdm(labels_with_tifs_but_no_features.iterrows()):
+            instance = self.process_single_file(row)
             if instance is not None:
                 filename = (
                     f"lat={instance.label_lat}_lon={instance.label_lon}_year={instance.year}.h5"
                 )
-                hf = h5py.File(arrays_dir / filename, "w")
+                hf = h5py.File(ARRAYS_FILEPATH / filename, "w")
                 hf.create_dataset("array", data=instance.array)
 
                 for key, val in instance.attrs.items():
@@ -616,11 +663,11 @@ def create_h5_dataset(self, checkpoint: bool = True) -> None:
 
         normalizing_dict = self.calculate_normalizing_dict()
 
-        if checkpoint and (old_normalizing_dict is not None):
+        if old_normalizing_dict is not None:
             normalizing_dicts = [old_normalizing_dict, (num_new_files, normalizing_dict)]
             normalizing_dict = self.adjust_normalizing_dict(normalizing_dicts)
         if normalizing_dict is not None:
-            save_path = self.savedir / "normalizing_dict.h5"
+            save_path = FEATURES_FILEPATH / "normalizing_dict.h5"
             hf = h5py.File(save_path, "w")
             for key, val in normalizing_dict.items():
                 hf.create_dataset(key, data=val)
diff --git a/cropharvest/eo/ee_boundingbox.py b/cropharvest/eo/ee_boundingbox.py
index df4d6cc..f121ef3 100644
--- a/cropharvest/eo/ee_boundingbox.py
+++ b/cropharvest/eo/ee_boundingbox.py
@@ -3,7 +3,7 @@
 from typing import List, Tuple, Union
 import ee
 
-from cropharvest.countries import BBox
+from cropharvest.boundingbox import BBox
 
 
 @dataclass
diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py
index c1fe916..51e5b5d 100644
--- a/cropharvest/eo/eo.py
+++ b/cropharvest/eo/eo.py
@@ -23,7 +23,7 @@
 
 from .utils import make_combine_bands_function
 from cropharvest.bands import DYNAMIC_BANDS
-from cropharvest.utils import DATAFOLDER_PATH, memoized
+from cropharvest.utils import memoized
 from cropharvest.countries import BBox
 from cropharvest.config import (
     EXPORT_END_DAY,
@@ -32,6 +32,9 @@
     DEFAULT_NUM_TIMESTEPS,
     LABELS_FILENAME,
     TEST_REGIONS,
+    DATAFOLDER_PATH,
+    EO_FILEPATH,
+    TEST_EO_FILEPATH,
 )
 from cropharvest.columns import RequiredColumns
 
@@ -95,8 +98,8 @@ class EarthEngineExporter:
     :param dest_bucket: The bucket to export to, google-cloud-storage must be installed.
     """
 
-    output_folder_name = "eo_data"
-    test_output_folder_name = "test_eo_data"
+    output_folder_name = EO_FILEPATH.name
+    test_output_folder_name = TEST_EO_FILEPATH.name
 
     def __init__(
         self,
diff --git a/cropharvest/utils.py b/cropharvest/utils.py
index 48a8e2f..fef5770 100644
--- a/cropharvest/utils.py
+++ b/cropharvest/utils.py
@@ -20,8 +20,6 @@
 except ImportError:
     TORCH_INSTALLED = False
 
-DATAFOLDER_PATH = Path(__file__).parent.parent / "data"
-
 
 def set_seed(seed: int = 42) -> None:
     np.random.seed(seed)

From a8042148a53c9fabc458460fa973a3dafeee6208 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Mon, 25 Apr 2022 16:34:58 -0400
Subject: [PATCH 19/38] Use the default identifier for the default labels

---
 cropharvest/eo/eo.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cropharvest/eo/eo.py b/cropharvest/eo/eo.py
index 51e5b5d..2d925c5 100644
--- a/cropharvest/eo/eo.py
+++ b/cropharvest/eo/eo.py
@@ -145,11 +145,6 @@ def load_default_labels(
             start_date=lambda x: x["end_date"]
             - timedelta(days=DAYS_PER_TIMESTEP * DEFAULT_NUM_TIMESTEPS)
         )
-        labels = labels.assign(
-            export_identifier=labels[RequiredColumns.INDEX].map(str)
-            + "-"
-            + labels[RequiredColumns.DATASET]
-        )
         if dataset:
             labels = labels[labels.dataset == dataset]
 

From 270e3afed518cdacef074609e1d0909ae95c1838 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Mon, 25 Apr 2022 17:12:38 -0400
Subject: [PATCH 20/38] Mypy fixes

---
 benchmarks/config.py               | 2 +-
 benchmarks/deep_learning.py        | 2 +-
 benchmarks/random_forest.py        | 2 +-
 cropharvest/engineer.py            | 5 ++---
 process_labels/datasets.py         | 3 +--
 scripts/20220418_renaming.py       | 2 +-
 scripts/20220420_check_renaming.py | 2 +-
 7 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/benchmarks/config.py b/benchmarks/config.py
index f97c656..4667d81 100644
--- a/benchmarks/config.py
+++ b/benchmarks/config.py
@@ -11,7 +11,7 @@
 
 
 # Model names
-RANDOM_FOREST = "RF"
+RANDOM_FOREST = "RF_GRID_SEARCH"
 DL_RANDOM = "DL_RANDOM"
 DL_PRETRAINED = "DL_PRETRAINED"
 DL_MAML = "DL_MAML"
diff --git a/benchmarks/deep_learning.py b/benchmarks/deep_learning.py
index c6b5422..4763520 100644
--- a/benchmarks/deep_learning.py
+++ b/benchmarks/deep_learning.py
@@ -4,7 +4,7 @@
 import json
 
 from cropharvest.datasets import CropHarvest
-from cropharvest.utils import DATAFOLDER_PATH
+from cropharvest.config import DATAFOLDER_PATH
 from cropharvest.engineer import TestInstance
 
 from config import (
diff --git a/benchmarks/random_forest.py b/benchmarks/random_forest.py
index c8a2360..d9a4cf0 100644
--- a/benchmarks/random_forest.py
+++ b/benchmarks/random_forest.py
@@ -3,7 +3,7 @@
 from sklearn.ensemble import RandomForestClassifier
 
 from cropharvest.datasets import CropHarvest
-from cropharvest.utils import DATAFOLDER_PATH
+from cropharvest.config import DATAFOLDER_PATH
 from cropharvest.engineer import TestInstance
 
 from config import SHUFFLE_SEEDS, DATASET_TO_SIZES, RANDOM_FOREST
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 7aa1506..d350102 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta
 import geopandas
 from dataclasses import dataclass
 import numpy as np
@@ -10,7 +10,6 @@
 from tqdm import tqdm
 import warnings
 import h5py
-import re
 
 from sklearn.metrics import roc_auc_score, f1_score
 
@@ -584,7 +583,7 @@ def create_h5_test_instances(
     @staticmethod
     def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]:
         return {
-            p: BBox.from_path(p)
+            p: BBox.from_eo_tif_file(p)
             for p in tqdm(filepath.glob("**/*.tif"), desc="Generating BoundingBoxes from paths")
         }
 
diff --git a/process_labels/datasets.py b/process_labels/datasets.py
index 6ae3ebd..c785179 100644
--- a/process_labels/datasets.py
+++ b/process_labels/datasets.py
@@ -6,8 +6,7 @@
 from .utils import add_is_test_column
 
 from cropharvest.columns import NullableColumns, RequiredColumns
-from cropharvest.utils import DATAFOLDER_PATH
-from cropharvest.config import LABELS_FILENAME
+from cropharvest.config import LABELS_FILENAME, DATAFOLDER_PATH
 
 from typing import cast, Callable, List, Optional
 
diff --git a/scripts/20220418_renaming.py b/scripts/20220418_renaming.py
index 6a965f9..11c2ec5 100644
--- a/scripts/20220418_renaming.py
+++ b/scripts/20220418_renaming.py
@@ -12,7 +12,7 @@
 from cropharvest.columns import RequiredColumns
 from cropharvest.eo.eo import EarthEngineExporter
 from cropharvest.eo.ee_boundingbox import EEBoundingBox
-from cropharvest.utils import DATAFOLDER_PATH
+from cropharvest.config import DATAFOLDER_PATH
 
 
 SURROUNDING_METRES = 80
diff --git a/scripts/20220420_check_renaming.py b/scripts/20220420_check_renaming.py
index 471d2cb..09c1fd0 100644
--- a/scripts/20220420_check_renaming.py
+++ b/scripts/20220420_check_renaming.py
@@ -17,7 +17,7 @@
 from tqdm import tqdm
 
 from cropharvest.countries import BBox
-from cropharvest.utils import DATAFOLDER_PATH
+from cropharvest.config import DATAFOLDER_PATH
 
 
 def _bbox_from_filepath(p: Path) -> BBox:

From b02a2bfe4c47791980509b42114ebc04392cad67 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 26 Apr 2022 07:23:57 -0400
Subject: [PATCH 21/38] Update the processing of test files to work with the
 new approach too

---
 benchmarks/dl/maml.py   | 19 ++++++-------------
 cropharvest/config.py   | 18 ++++++++++++++++++
 cropharvest/datasets.py | 15 +++------------
 cropharvest/engineer.py | 23 +++++++++--------------
 cropharvest/utils.py    | 13 +++++++++++++
 5 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/benchmarks/dl/maml.py b/benchmarks/dl/maml.py
index c1f824b..d8ab58f 100644
--- a/benchmarks/dl/maml.py
+++ b/benchmarks/dl/maml.py
@@ -19,7 +19,7 @@
 
 from cropharvest.datasets import CropHarvest, CropHarvestLabels, Task
 from cropharvest import countries
-from cropharvest.config import TEST_DATASETS, TEST_REGIONS
+from cropharvest.config import TEST_DATASETS, TEST_REGIONS, TEST_COUNTRIES_TO_CROPS
 from cropharvest.utils import NoDataForBoundingBoxError
 
 from typing import Dict, Tuple, Optional, List, DefaultDict
@@ -363,18 +363,11 @@ def _make_tasks(
     ) -> Tuple[Dict[str, CropHarvest], Dict[str, CropHarvest]]:
         labels = CropHarvestLabels(self.root)
 
-        # remove any test regions, and collect the countries / crops
-        test_countries_to_crops: DefaultDict[str, List[str]] = defaultdict(list)
-
-        # reshuffle the test_regions dict so its a little easier to
-        # manipulate in this function
-        for identifier, _ in TEST_REGIONS.items():
-            country, crop, _, _ = identifier.split("_")
-            test_countries_to_crops[country].append(crop)
-
         label_to_task: Dict[str, CropHarvest] = {}
 
-        countries_to_ignore = [country for country, _ in TEST_DATASETS.items() if crop is None]
+        countries_to_ignore = [
+            country for country, crop in TEST_COUNTRIES_TO_CROPS.items() if crop is not None
+        ]
 
         for country in tqdm(countries.get_countries()):
             if country in countries_to_ignore:
@@ -394,8 +387,8 @@ def _make_tasks(
                     label_to_task[task.id] = task
 
                 for label in labels.classes_in_bbox(country_bbox):
-                    if country in test_countries_to_crops:
-                        if label in test_countries_to_crops[country]:
+                    if country in TEST_COUNTRIES_TO_CROPS:
+                        if label in TEST_COUNTRIES_TO_CROPS[country]:
                             continue
                     try:
                         task = CropHarvest(
diff --git a/cropharvest/config.py b/cropharvest/config.py
index 587616b..f2e0e93 100644
--- a/cropharvest/config.py
+++ b/cropharvest/config.py
@@ -57,3 +57,21 @@
 }
 
 TEST_DATASETS = {"Togo": "togo-eval"}
+
+
+def test_countries_to_crops():
+    output_dict = {}
+    for identifier, _ in TEST_REGIONS.items():
+        country, crop, _, _ = identifier.split("_")
+        if country in output_dict.keys():
+            assert output_dict[country] == crop
+        else:
+            output_dict[country].append(crop)
+
+    for country, _ in TEST_DATASETS.items():
+        output_dict[country].append(None)
+
+    return output_dict
+
+
+TEST_COUNTRIES_TO_CROPS = test_countries_to_crops()
diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py
index ca082a9..85204d6 100644
--- a/cropharvest/datasets.py
+++ b/cropharvest/datasets.py
@@ -13,6 +13,7 @@
     load_normalizing_dict,
     sample_with_memory,
     NoDataForBoundingBoxError,
+    filter_geojson,
 )
 from cropharvest.config import (
     FEATURES_DIR,
@@ -89,18 +90,8 @@ def __init__(self, root, download=False):
     def as_geojson(self) -> geopandas.GeoDataFrame:
         return self._labels
 
-    @staticmethod
-    def filter_geojson(gpdf: geopandas.GeoDataFrame, bounding_box: BBox) -> geopandas.GeoDataFrame:
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            # warning: invalid value encountered in ? (vectorized)
-            in_bounding_box = np.vectorize(bounding_box.contains)(
-                gpdf[RequiredColumns.LAT], gpdf[RequiredColumns.LON]
-            )
-        return gpdf[in_bounding_box]
-
     def classes_in_bbox(self, bounding_box: BBox) -> List[str]:
-        bbox_geojson = self.filter_geojson(self.as_geojson(), bounding_box)
+        bbox_geojson = filter_geojson(self.as_geojson(), bounding_box)
         unique_labels = [x for x in bbox_geojson.label.unique() if x is not None]
         return unique_labels
 
@@ -117,7 +108,7 @@ def construct_positive_and_negative_labels(
         if filter_test:
             gpdf = gpdf[gpdf[RequiredColumns.IS_TEST] == False]
         if task.bounding_box is not None:
-            gpdf = self.filter_geojson(gpdf, task.bounding_box)
+            gpdf = filter_geojson(gpdf, task.bounding_box)
 
         if len(gpdf) == 0:
             raise NoDataForBoundingBoxError
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index d350102..11d4bae 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -15,6 +15,7 @@
 
 from cropharvest.bands import STATIC_BANDS, DYNAMIC_BANDS
 from cropharvest.columns import RequiredColumns, NullableColumns, EngColumns
+from cropharvest.countries import get_country_bbox
 from cropharvest.boundingbox import BBox
 from cropharvest.config import (
     EXPORT_END_DAY,
@@ -31,7 +32,7 @@
     ARRAYS_FILEPATH,
     TEST_FEATURES_FILEPATH,
 )
-from cropharvest.utils import load_normalizing_dict
+from cropharvest.utils import load_normalizing_dict, filter_geojson
 
 from typing import cast, Optional, Dict, Union, Tuple, List, Sequence
 
@@ -549,24 +550,18 @@ def create_h5_test_instances(
                         hf.create_dataset(key, data=val)
                     hf.close()
 
-        for _, dataset in TEST_DATASETS.items():
+        for country, dataset in TEST_DATASETS.items():
             x: List[np.ndarray] = []
             y: List[int] = []
             lats: List[float] = []
             lons: List[float] = []
-            relevant_labels = self.labels[self.labels[RequiredColumns.DATASET] == dataset]
+            country_bboxes = get_country_bbox(country)
+            relevant_labels = pd.concat(
+                [filter_geojson(self.labels, box) for box in country_bboxes]
+            )
 
             for _, row in tqdm(relevant_labels.iterrows()):
-                tif_paths = list(
-                    self.eo_files.glob(
-                        f"{row[RequiredColumns.INDEX]}-{row[RequiredColumns.DATASET]}_*.tif"
-                    )
-                )
-                if len(tif_paths) == 0:
-                    continue
-                else:
-                    tif_path = tif_paths[0]
-                instance = self.process_single_file(tif_path, row)
+                instance = self.process_single_file(row)
                 if instance is not None:
                     x.append(instance.array)
                     y.append(instance.is_crop)
@@ -575,7 +570,7 @@ def create_h5_test_instances(
 
             # then, combine the instances into a test instance
             test_instance = TestInstance(np.stack(x), np.stack(y), np.stack(lats), np.stack(lons))
-            hf = h5py.File(self.test_savedir / f"{dataset}.h5", "w")
+            hf = h5py.File(TEST_FEATURES_FILEPATH / f"{dataset}.h5", "w")
             for key, val in test_instance.datasets.items():
                 hf.create_dataset(key, data=val)
             hf.close()
diff --git a/cropharvest/utils.py b/cropharvest/utils.py
index fef5770..673370b 100644
--- a/cropharvest/utils.py
+++ b/cropharvest/utils.py
@@ -8,10 +8,13 @@
 import collections
 import functools
 import tarfile
+import warnings
 
 from typing import Dict, List, Tuple, Optional
 
 from cropharvest.config import DATASET_URL
+from cropharvest.boundingbox import BBox
+from cropharvest.columns import RequiredColumns
 
 try:
     import torch
@@ -146,3 +149,13 @@ def read_geopandas(file_path) -> geopandas.GeoDataFrame:
 
 class NoDataForBoundingBoxError(Exception):
     pass
+
+
+def filter_geojson(gpdf: geopandas.GeoDataFrame, bounding_box: BBox) -> geopandas.GeoDataFrame:
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        # warning: invalid value encountered in ? (vectorized)
+        in_bounding_box = np.vectorize(bounding_box.contains)(
+            gpdf[RequiredColumns.LAT], gpdf[RequiredColumns.LON]
+        )
+    return gpdf[in_bounding_box]

From d16905ac969efb6f5397b66df1b0c3db81d46492 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 26 Apr 2022 07:32:40 -0400
Subject: [PATCH 22/38] Make output_dict a defaultdict

---
 cropharvest/config.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cropharvest/config.py b/cropharvest/config.py
index f2e0e93..67654e7 100644
--- a/cropharvest/config.py
+++ b/cropharvest/config.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from collections import defaultdict
 
 from .boundingbox import BBox
 
@@ -60,13 +61,10 @@
 
 
 def test_countries_to_crops():
-    output_dict = {}
+    output_dict = defaultdict(list)
     for identifier, _ in TEST_REGIONS.items():
         country, crop, _, _ = identifier.split("_")
-        if country in output_dict.keys():
-            assert output_dict[country] == crop
-        else:
-            output_dict[country].append(crop)
+        output_dict[country].append(crop)
 
     for country, _ in TEST_DATASETS.items():
         output_dict[country].append(None)

From e05e3053da173e0eaf4bf8813526663aac6f3499 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 26 Apr 2022 08:44:55 -0400
Subject: [PATCH 23/38] Use fnmatch instead of (incorrect) string matching

---
 cropharvest/engineer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 11d4bae..907b7f3 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -2,6 +2,7 @@
 from datetime import datetime, timedelta
 import geopandas
 from dataclasses import dataclass
+from fnmatch import fnmatch
 import numpy as np
 import pandas as pd
 import xarray as xr
@@ -199,7 +200,7 @@ def load_labels() -> geopandas.GeoDataFrame:
             + labels[RequiredColumns.EXPORT_END_DATE].astype(str)
         )
         labels[EngColumns.FEATURES_PATH] = (
-            str(ARRAYS_FILEPATH) + labels[EngColumns.FEATURES_FILENAME]
+            str(ARRAYS_FILEPATH) + "/" + labels[EngColumns.FEATURES_FILENAME]
         )
         labels[EngColumns.EXISTS] = np.vectorize(lambda p: Path(p).exists())(
             labels[EngColumns.FEATURES_PATH]
@@ -586,7 +587,7 @@ def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]:
     def get_tif_paths(path_to_bbox, lat, lon, end_date, pbar):
         candidate_paths = []
         for p, bbox in path_to_bbox.items():
-            if bbox.contains(lat, lon) and f"dates=*_{end_date}" in p.stem:
+            if bbox.contains(lat, lon) and fnmatch(p.stem, f"dates=*_{end_date}*"):
                 candidate_paths.append(p)
         pbar.update(1)
         return candidate_paths

From 30a55d5f844a469c2a1887e1ff5e31a3fef2b6f8 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 26 Apr 2022 10:52:19 -0400
Subject: [PATCH 24/38] Fix incorrect pattern in fnmatch

---
 cropharvest/engineer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 907b7f3..01b4e65 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -587,7 +587,7 @@ def generate_bbox_from_paths(filepath: Path) -> Dict[Path, BBox]:
     def get_tif_paths(path_to_bbox, lat, lon, end_date, pbar):
         candidate_paths = []
         for p, bbox in path_to_bbox.items():
-            if bbox.contains(lat, lon) and fnmatch(p.stem, f"dates=*_{end_date}*"):
+            if bbox.contains(lat, lon) and fnmatch(p.stem, f"*dates=*_{end_date}*"):
                 candidate_paths.append(p)
         pbar.update(1)
         return candidate_paths

From b391d3b8b3c792ce06f6c79f7a15e6b3b28cd229 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 27 Apr 2022 10:19:33 -0400
Subject: [PATCH 25/38] Find the correct tif_filepath per label

---
 cropharvest/columns.py  |  2 +-
 cropharvest/engineer.py | 70 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/cropharvest/columns.py b/cropharvest/columns.py
index 2c4488e..7675138 100644
--- a/cropharvest/columns.py
+++ b/cropharvest/columns.py
@@ -53,4 +53,4 @@ class EngColumns:
     FEATURES_FILENAME = "features_filename"
     FEATURES_PATH = "features_path"
     EXISTS = "feature_exists"
-    TIF_FILEPATH = "tif_path"
+    TIF_FILEPATHS = "tif_path"
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 01b4e65..971afe3 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -53,6 +53,7 @@ class DataInstance:
     array: np.ndarray
     is_crop: int
     year: int
+    source_tif_file: str
     label: Optional[str] = None
 
     @property
@@ -208,10 +209,30 @@ def load_labels() -> geopandas.GeoDataFrame:
         return labels
 
     @staticmethod
-    def find_nearest(array, value: float) -> float:
+    def distance_from_degrees(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
+        """
+        haversine formula, inspired by:
+        https://stackoverflow.com/questions/41336756/find-the-closest-latitude-and-longitude/41337005
+        """
+        p = 0.017453292519943295
+        a = (
+            0.5
+            - np.cos((lat2 - lat1) * p) / 2
+            + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
+        )
+        return 12742 * np.arcsin(np.sqrt(a))
+
+    @staticmethod
+    def distance_point_from_center(lat_idx: int, lon_idx: int, tif) -> int:
+        x_dist = np.abs((len(tif.x) - 1) / 2 - lon_idx)
+        y_dist = np.abs((len(tif.y) - 1) / 2 - lat_idx)
+        return x_dist + y_dist
+
+    @staticmethod
+    def find_nearest(array, value: float) -> Tuple[float, int]:
         array = np.asarray(array)
         idx = (np.abs(array - value)).argmin()
-        return array[idx]
+        return array[idx], idx
 
     @staticmethod
     def process_filename(filename: str) -> Tuple[int, str]:
@@ -504,11 +525,43 @@ def process_single_file(
         start_date = row[RequiredColumns.EXPORT_END_DATE] - timedelta(
             days=num_timesteps * DAYS_PER_TIMESTEP
         )
-        da, average_slope = self.load_tif(row[EngColumns.TIF_FILEPATH], start_date=start_date)
-        closest_lon = self.find_nearest(da.x, row[RequiredColumns.LON])
-        closest_lat = self.find_nearest(da.y, row[RequiredColumns.LAT])
 
-        labelled_np = da.sel(x=closest_lon).sel(y=closest_lat).values
+        tif_slope_tuples = [
+            self.load_tif(filepath, start_date=start_date)
+            for filepath in row[EngColumns.TIF_FILEPATHS]
+        ]
+        if len(tif_slope_tuples) == 1:
+            tif, average_slope = tif_slope_tuples[0]
+
+            closest_lon, _ = self.find_nearest(tif.x, row[RequiredColumns.LON])
+            closest_lat, _ = self.find_nearest(tif.y, row[RequiredColumns.LAT])
+
+            labelled_np = tif.sel(x=closest_lon).sel(y=closest_lat).values
+            tif_file = row[EngColumns.TIF_FILEPATHS].iloc[0].name
+
+        else:
+            min_distance_from_point = np.inf
+            min_distance_from_center = np.inf
+            for i, tif_slope_tuple in enumerate(tif_slope_tuples):
+                tif, slope = tif_slope_tuple
+                lon, lon_idx = self.find_nearest(tif.x, row[RequiredColumns.LON])
+                lat, lat_idx = self.find_nearest(tif.y, row[RequiredColumns.LAT])
+                distance_from_point = self.distance_from_degrees(
+                    row[RequiredColumns.LAT], row[RequiredColumns.LON], lat, lon
+                )
+                distance_from_center = self.distance_point_from_center(lat_idx, lon_idx, tif)
+                if (distance_from_point < min_distance_from_point) or (
+                    distance_from_point == min_distance_from_point
+                    and distance_from_center < min_distance_from_center
+                ):
+                    closest_lon = lon
+                    closest_lat = lat
+                    min_distance_from_center = distance_from_center
+                    min_distance_from_point = distance_from_point
+
+                    labelled_np = tif.sel(x=lon).sel(y=lat).values
+                    average_slope = slope
+                    tif_file = row[EngColumns.TIF_FILEPATHS].iloc[i].name
 
         labelled_np = self.calculate_ndvi(labelled_np)
         labelled_np = self.remove_bands(labelled_np)
@@ -530,6 +583,7 @@ def process_single_file(
             label=row[NullableColumns.LABEL],
             dataset=row[RequiredColumns.DATASET],
             year=start_date.year,
+            source_tif_file=tif_file,
         )
 
     def create_h5_test_instances(
@@ -629,10 +683,10 @@ def create_h5_dataset(self) -> None:
             old_normalizing_dict = (num_existing_files, old_nd)
 
         labels_with_no_features = self.labels[~self.labels[EngColumns.EXISTS]].copy()
-        labels_with_no_features[EngColumns.TIF_FILEPATH] = self.match_labels_to_tifs(
+        labels_with_no_features[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs(
             labels_with_no_features
         )
-        tifs_found = labels_with_no_features[EngColumns.TIF_FILEPATH].str.len() > 0
+        tifs_found = labels_with_no_features[EngColumns.TIF_FILEPATHS].str.len() > 0
         labels_with_tifs_but_no_features = labels_with_no_features.loc[tifs_found]
 
         skipped_files: int = 0

From 127947a67ee43bee12b3bed466a97062d1b0447c Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 27 Apr 2022 10:26:08 -0400
Subject: [PATCH 26/38] Don't use iloc to index a list

---
 cropharvest/engineer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 971afe3..e402266 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -537,7 +537,7 @@ def process_single_file(
             closest_lat, _ = self.find_nearest(tif.y, row[RequiredColumns.LAT])
 
             labelled_np = tif.sel(x=closest_lon).sel(y=closest_lat).values
-            tif_file = row[EngColumns.TIF_FILEPATHS].iloc[0].name
+            tif_file = row[EngColumns.TIF_FILEPATHS][0].name
 
         else:
             min_distance_from_point = np.inf
@@ -561,7 +561,7 @@ def process_single_file(
 
                     labelled_np = tif.sel(x=lon).sel(y=lat).values
                     average_slope = slope
-                    tif_file = row[EngColumns.TIF_FILEPATHS].iloc[i].name
+                    tif_file = row[EngColumns.TIF_FILEPATHS][i].name
 
         labelled_np = self.calculate_ndvi(labelled_np)
         labelled_np = self.remove_bands(labelled_np)

From f6a2c06a56f2005715e637602ce7dd158c172ecd Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 27 Apr 2022 13:19:35 -0400
Subject: [PATCH 27/38] Fix failing engineer tests

---
 test/cropharvest/engineer/test_engineer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cropharvest/engineer/test_engineer.py b/test/cropharvest/engineer/test_engineer.py
index a6b16da..8f3622e 100644
--- a/test/cropharvest/engineer/test_engineer.py
+++ b/test/cropharvest/engineer/test_engineer.py
@@ -82,7 +82,7 @@ def test_find_nearest():
 
     target = 1.1
 
-    assert Engineer.find_nearest(array, target) == 1
+    assert Engineer.find_nearest(array, target) == (1, 0)
 
 
 def test_filename_correctly_processed():

From f5438ccea60d3451c7c489d985d9db3361094f3f Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 27 Apr 2022 13:23:29 -0400
Subject: [PATCH 28/38] Remove unused function

---
 cropharvest/engineer.py                    | 13 +------------
 test/cropharvest/engineer/test_engineer.py |  8 --------
 2 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index e402266..25f9761 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -234,17 +234,6 @@ def find_nearest(array, value: float) -> Tuple[float, int]:
         idx = (np.abs(array - value)).argmin()
         return array[idx], idx
 
-    @staticmethod
-    def process_filename(filename: str) -> Tuple[int, str]:
-        r"""
-        Given an exported sentinel file, process it to get the dataset
-        it came from, and the index of that dataset
-        """
-        parts = filename.split("_")[0].split("-")
-        index = parts[0]
-        dataset = "-".join(parts[1:])
-        return int(index), dataset
-
     @staticmethod
     def load_tif(
         filepath: Path, start_date: datetime, num_timesteps: Optional[int] = DEFAULT_NUM_TIMESTEPS
@@ -444,7 +433,7 @@ def remove_bands(array: np.ndarray) -> np.ndarray:
     def process_test_file(
         path_to_file: Path, start_date: datetime
     ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-        da, slope = Engineer.load_tif(xr.open_rasterio(path_to_file), start_date=start_date)
+        da, slope = Engineer.load_tif(path_to_file, start_date=start_date)
 
         # Process remote sensing data
         x_np = da.values
diff --git a/test/cropharvest/engineer/test_engineer.py b/test/cropharvest/engineer/test_engineer.py
index 8f3622e..2a12d44 100644
--- a/test/cropharvest/engineer/test_engineer.py
+++ b/test/cropharvest/engineer/test_engineer.py
@@ -92,14 +92,6 @@ def test_filename_correctly_processed():
     assert dataset == "togo"
 
 
-def test_filename_correctly_processed_2():
-
-    filename = "98-geowiki-landcover-2017_2019-02-06_2020-02-01.tif"
-    idx, dataset = Engineer.process_filename(filename)
-    assert idx == 98
-    assert dataset == "geowiki-landcover-2017"
-
-
 def test_process_test_file():
     x_np, flat_lat, flat_lon = Engineer.process_test_file(
         TIF_FILE, start_date=datetime(2019, 2, 6, 0, 0)

From 3bffd7096a6622b9aac70be503a7686f335c5ecb Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Mon, 2 May 2022 15:00:40 -0400
Subject: [PATCH 29/38] Remove unused imports

---
 benchmarks/dl/maml.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dl/maml.py b/benchmarks/dl/maml.py
index d8ab58f..113edb6 100644
--- a/benchmarks/dl/maml.py
+++ b/benchmarks/dl/maml.py
@@ -3,7 +3,6 @@
 import dill
 import warnings
 from random import shuffle, random
-from collections import defaultdict
 
 import torch
 from torch import nn
@@ -19,10 +18,10 @@
 
 from cropharvest.datasets import CropHarvest, CropHarvestLabels, Task
 from cropharvest import countries
-from cropharvest.config import TEST_DATASETS, TEST_REGIONS, TEST_COUNTRIES_TO_CROPS
+from cropharvest.config import TEST_COUNTRIES_TO_CROPS
 from cropharvest.utils import NoDataForBoundingBoxError
 
-from typing import Dict, Tuple, Optional, List, DefaultDict
+from typing import Dict, Tuple, Optional, List
 
 
 class TrainDataLoader:

From 765aec0a7daac398e515c09fc20dd84a8f4ea603 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 3 May 2022 07:43:08 -0400
Subject: [PATCH 30/38] For the labels in a test area, associate the labels to
 the right tif files

---
 cropharvest/engineer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 25f9761..0cbbe86 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -604,7 +604,11 @@ def create_h5_test_instances(
                 [filter_geojson(self.labels, box) for box in country_bboxes]
             )
 
-            for _, row in tqdm(relevant_labels.iterrows()):
+            relevant_labels[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs(relevant_labels)
+            tifs_found = relevant_labels[EngColumns.TIF_FILEPATHS].str.len() > 0
+            labels_with_tifs = labels_with_tifs.loc[tifs_found]
+
+            for _, row in tqdm(labels_with_tifs.iterrows()):
                 instance = self.process_single_file(row)
                 if instance is not None:
                     x.append(instance.array)

From cee6c20242a345885753ce0338931439428ba9fa Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 3 May 2022 08:03:09 -0400
Subject: [PATCH 31/38] Fix variable naming when creating test instances

---
 cropharvest/engineer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 0cbbe86..1f36285 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -606,7 +606,7 @@ def create_h5_test_instances(
 
             relevant_labels[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs(relevant_labels)
             tifs_found = relevant_labels[EngColumns.TIF_FILEPATHS].str.len() > 0
-            labels_with_tifs = labels_with_tifs.loc[tifs_found]
+            labels_with_tifs = relevant_labels.loc[tifs_found]
 
             for _, row in tqdm(labels_with_tifs.iterrows()):
                 instance = self.process_single_file(row)

From 3698c15339576f63c34d64b3c770e4bce6cbe23a Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 3 May 2022 09:00:40 -0400
Subject: [PATCH 32/38] Only use the test data for Togo

---
 cropharvest/engineer.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 1f36285..28c397f 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -594,15 +594,12 @@ def create_h5_test_instances(
                         hf.create_dataset(key, data=val)
                     hf.close()
 
-        for country, dataset in TEST_DATASETS.items():
+        for _, dataset in TEST_DATASETS.items():
             x: List[np.ndarray] = []
             y: List[int] = []
             lats: List[float] = []
             lons: List[float] = []
-            country_bboxes = get_country_bbox(country)
-            relevant_labels = pd.concat(
-                [filter_geojson(self.labels, box) for box in country_bboxes]
-            )
+            relevant_labels = self.labels[self.labels[RequiredColumns.DATASET] == dataset]
 
             relevant_labels[EngColumns.TIF_FILEPATHS] = self.match_labels_to_tifs(relevant_labels)
             tifs_found = relevant_labels[EngColumns.TIF_FILEPATHS].str.len() > 0

From ac39be3d33e6d90021bd22458215485280a47c36 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 3 May 2022 09:25:50 -0400
Subject: [PATCH 33/38] Use the new .h5 filenames in the datasets

---
 cropharvest/datasets.py | 34 ++++++++++++++++------------------
 cropharvest/engineer.py |  4 ++--
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py
index 85204d6..2bfa76a 100644
--- a/cropharvest/datasets.py
+++ b/cropharvest/datasets.py
@@ -23,8 +23,8 @@
     TEST_REGIONS,
     TEST_DATASETS,
 )
-from cropharvest.columns import NullableColumns, RequiredColumns
-from cropharvest.engineer import TestInstance
+from cropharvest.columns import NullableColumns, RequiredColumns, EngColumns
+from cropharvest.engineer import TestInstance, Engineer
 from cropharvest import countries
 
 from typing import cast, List, Optional, Tuple, Generator
@@ -84,8 +84,13 @@ def __init__(self, root, download=False):
         super().__init__(root, download, filenames=(LABELS_FILENAME,))
 
         # self._labels will always contain the original dataframe;
-        # the CropHarvestLabels class should not modify it
-        self._labels = read_geopandas(self.root / LABELS_FILENAME)
+        # the CropHarvestLabels class should not modify it.
+        # We use the Engineer load_labels since this also loads the .h5
+        # filenames for each row
+        # TODO: The load_labels doesn't actually allow the root to be
+        # modified. We should probably do this at a package level, not
+        # at a class level
+        self._labels = Engineer.load_labels(root=root)
 
     def as_geojson(self) -> geopandas.GeoDataFrame:
         return self._labels
@@ -128,15 +133,17 @@ def construct_positive_and_negative_labels(
                     # then we can just collect all classes which either
                     # 1) are crop, or 2) are a different non crop class (e.g. forest)
                     negative_labels = gpdf[((is_null & is_crop) | (~is_null & ~is_target))]
-                    negative_paths = self._dataframe_to_paths(negative_labels)
+                    negative_paths = negative_labels[EngColumns.FEATURES_PATH].tolist()
                 else:
                     # otherwise, the target label is a crop. If balance_negative_crops is
                     # true, then we want an equal number of (other) crops and non crops in
                     # the negative labels
                     negative_non_crop_labels = gpdf[~is_crop]
                     negative_other_crop_labels = gpdf[(is_crop & ~is_null & ~is_target)]
-                    negative_non_crop_paths = self._dataframe_to_paths(negative_non_crop_labels)
-                    negative_paths = self._dataframe_to_paths(negative_other_crop_labels)
+                    negative_paths = negative_other_crop_labels[EngColumns.FEATURES_PATH].tolist()
+                    negative_non_crop_paths = negative_non_crop_labels[
+                        EngColumns.FEATURES_PATH
+                    ].tolist()
 
                     if task.balance_negative_crops:
                         negative_paths.extend(
@@ -150,26 +157,17 @@ def construct_positive_and_negative_labels(
                 # otherwise, we will just filter by crop and non crop
                 positive_labels = gpdf[is_crop]
                 negative_labels = gpdf[~is_crop]
-                negative_paths = self._dataframe_to_paths(negative_labels)
+                negative_paths = negative_labels[EngColumns.FEATURES_PATH].tolist()
         except IndexError:
             raise NoDataForBoundingBoxError
 
-        positive_paths = self._dataframe_to_paths(positive_labels)
+        positive_paths = positive_paths[EngColumns.FEATURES_PATH].tolist()
 
         if (len(positive_paths) == 0) or (len(negative_paths) == 0):
             raise NoDataForBoundingBoxError
 
         return [x for x in positive_paths if x.exists()], [x for x in negative_paths if x.exists()]
 
-    def _path_from_row(self, row: geopandas.GeoSeries) -> Path:
-        return (
-            self.root
-            / f"features/arrays/{row[RequiredColumns.INDEX]}_{row[RequiredColumns.DATASET]}.h5"
-        )
-
-    def _dataframe_to_paths(self, df: geopandas.GeoDataFrame) -> List[Path]:
-        return [self._path_from_row(row) for _, row in df.iterrows()]
-
 
 class CropHarvestTifs(BaseDataset):
     def __init__(self, root, download=False):
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 28c397f..83d9d9a 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -187,8 +187,8 @@ def __init__(self) -> None:
         self.norm_interim: Dict[str, Union[np.ndarray, int]] = {"n": 0}
 
     @staticmethod
-    def load_labels() -> geopandas.GeoDataFrame:
-        labels = geopandas.read_file(DATAFOLDER_PATH / LABELS_FILENAME)
+    def load_labels(root=DATAFOLDER_PATH) -> geopandas.GeoDataFrame:
+        labels = geopandas.read_file(root / LABELS_FILENAME)
         labels[RequiredColumns.EXPORT_END_DATE] = pd.to_datetime(
             labels[RequiredColumns.EXPORT_END_DATE]
         ).dt.date

From 26aacf562637b71c05b699dd6418ccd84706164b Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 3 May 2022 10:29:38 -0400
Subject: [PATCH 34/38] Fix variable names in the dataset

---
 cropharvest/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py
index 2bfa76a..c82b88b 100644
--- a/cropharvest/datasets.py
+++ b/cropharvest/datasets.py
@@ -161,7 +161,7 @@ def construct_positive_and_negative_labels(
         except IndexError:
             raise NoDataForBoundingBoxError
 
-        positive_paths = positive_paths[EngColumns.FEATURES_PATH].tolist()
+        positive_paths = positive_labels[EngColumns.FEATURES_PATH].tolist()
 
         if (len(positive_paths) == 0) or (len(negative_paths) == 0):
             raise NoDataForBoundingBoxError

From 84aa0221f9767800750205228513b308a5a4df0e Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 3 May 2022 10:42:58 -0400
Subject: [PATCH 35/38] Remove unused imports

---
 cropharvest/datasets.py | 1 -
 cropharvest/engineer.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py
index c82b88b..34f214a 100644
--- a/cropharvest/datasets.py
+++ b/cropharvest/datasets.py
@@ -9,7 +9,6 @@
 from cropharvest.utils import (
     download_and_extract_archive,
     deterministic_shuffle,
-    read_geopandas,
     load_normalizing_dict,
     sample_with_memory,
     NoDataForBoundingBoxError,
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 83d9d9a..cb202bd 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -16,7 +16,6 @@
 
 from cropharvest.bands import STATIC_BANDS, DYNAMIC_BANDS
 from cropharvest.columns import RequiredColumns, NullableColumns, EngColumns
-from cropharvest.countries import get_country_bbox
 from cropharvest.boundingbox import BBox
 from cropharvest.config import (
     EXPORT_END_DAY,

From 96625b47ac281323e0e926258900829bbbee278c Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Tue, 3 May 2022 12:56:00 -0400
Subject: [PATCH 36/38] Don't hardcode the seed in the CropHarvest dataset
 class

---
 cropharvest/datasets.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py
index 34f214a..87659e0 100644
--- a/cropharvest/datasets.py
+++ b/cropharvest/datasets.py
@@ -21,6 +21,7 @@
     DEFAULT_SEED,
     TEST_REGIONS,
     TEST_DATASETS,
+    DEFAULT_SEED,
 )
 from cropharvest.columns import NullableColumns, RequiredColumns, EngColumns
 from cropharvest.engineer import TestInstance, Engineer
@@ -206,8 +207,8 @@ def __init__(
         if val_ratio > 0.0:
             # the fixed seed is to ensure the validation set is always
             # different from the training set
-            positive_paths = deterministic_shuffle(positive_paths, seed=42)
-            negative_paths = deterministic_shuffle(negative_paths, seed=42)
+            positive_paths = deterministic_shuffle(positive_paths, seed=DEFAULT_SEED)
+            negative_paths = deterministic_shuffle(negative_paths, seed=DEFAULT_SEED)
             if is_val:
                 positive_paths = positive_paths[: int(len(positive_paths) * val_ratio)]
                 negative_paths = negative_paths[: int(len(negative_paths) * val_ratio)]

From eaa7d187868c4d0b24d66b5c485f457aa242f4fb Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 4 May 2022 08:18:47 -0400
Subject: [PATCH 37/38] Fix random forest name in the config

---
 benchmarks/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/config.py b/benchmarks/config.py
index 4667d81..f97c656 100644
--- a/benchmarks/config.py
+++ b/benchmarks/config.py
@@ -11,7 +11,7 @@
 
 
 # Model names
-RANDOM_FOREST = "RF_GRID_SEARCH"
+RANDOM_FOREST = "RF"
 DL_RANDOM = "DL_RANDOM"
 DL_PRETRAINED = "DL_PRETRAINED"
 DL_MAML = "DL_MAML"

From ea3acfd18773739e3e1af60044387f0bbd635af0 Mon Sep 17 00:00:00 2001
From: Gabriel Tseng <gabriel.tseng@mail.mcgill.ca>
Date: Wed, 13 Jul 2022 15:29:55 +0100
Subject: [PATCH 38/38] Fix imports

---
 cropharvest/datasets.py | 1 -
 cropharvest/engineer.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cropharvest/datasets.py b/cropharvest/datasets.py
index 87659e0..3050897 100644
--- a/cropharvest/datasets.py
+++ b/cropharvest/datasets.py
@@ -18,7 +18,6 @@
     FEATURES_DIR,
     TEST_FEATURES_DIR,
     LABELS_FILENAME,
-    DEFAULT_SEED,
     TEST_REGIONS,
     TEST_DATASETS,
     DEFAULT_SEED,
diff --git a/cropharvest/engineer.py b/cropharvest/engineer.py
index 12a06aa..e49e8cb 100644
--- a/cropharvest/engineer.py
+++ b/cropharvest/engineer.py
@@ -32,7 +32,7 @@
     ARRAYS_FILEPATH,
     TEST_FEATURES_FILEPATH,
 )
-from cropharvest.utils import load_normalizing_dict, filter_geojson
+from cropharvest.utils import load_normalizing_dict
 
 from typing import cast, Optional, Dict, Union, Tuple, List, Sequence