Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Crop Mask Integration #85

Open
wants to merge 40 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
818c364
Script to update the naming convention of eo tif files
gabrieltseng Apr 19, 2022
e09fc93
Make the output folder
gabrieltseng Apr 19, 2022
d8ce1a2
Track progress using tqdm
gabrieltseng Apr 19, 2022
78f326b
Add the right arguments to load_default_labels
gabrieltseng Apr 19, 2022
f27e94d
Remove conflicting column name
gabrieltseng Apr 19, 2022
674944c
Replace 'index' string with RequiredColumns.INDEX
gabrieltseng Apr 19, 2022
3f529e1
Replace 'index' string with RequiredColumns.INDEX
gabrieltseng Apr 19, 2022
ddaab9c
Fix assignment of export_identifier when loading default labels
gabrieltseng Apr 19, 2022
475bb30
Merge branch 'main' into crop-mask-integration
gabrieltseng Apr 19, 2022
171bb73
Add a script to test the renaming has happened correctly
gabrieltseng Apr 20, 2022
50bf35e
Correct argument ordering in the renaming script
gabrieltseng Apr 20, 2022
52b0cc3
Add information to the assert statement
gabrieltseng Apr 20, 2022
eff8786
Keep track of failures
gabrieltseng Apr 20, 2022
7eff7d4
Final addition to check_renaming docstring
gabrieltseng Apr 20, 2022
d2f737e
Correctly calculate the export end date for the default labels
gabrieltseng Apr 20, 2022
db91487
Keep the end_date as a datetime, not a string
gabrieltseng Apr 20, 2022
429baf6
Store it as a date, not a datetime
gabrieltseng Apr 20, 2022
dd9a3d2
[WIP] match tifs to labels based on lat/lon instead of dataset/idx
gabrieltseng Apr 20, 2022
30c7051
Update Engineer for labelled tifs
gabrieltseng Apr 20, 2022
a804214
Use the default identifier for the default labels
gabrieltseng Apr 25, 2022
270e3af
Mypy fixes
gabrieltseng Apr 25, 2022
b02a2bf
Update the processing of test files to work with the new approach too
gabrieltseng Apr 26, 2022
d16905a
Make output_dict a defaultdict
gabrieltseng Apr 26, 2022
e05e305
Use fnmatch instead of (incorrect) string matching
gabrieltseng Apr 26, 2022
30a55d5
Fix incorrect pattern in fnmatch
gabrieltseng Apr 26, 2022
b391d3b
Correctly find the correct tif_filepath per label
gabrieltseng Apr 27, 2022
127947a
Don't use iloc to index a list
gabrieltseng Apr 27, 2022
f6a2c06
Fix failing engineer tests
gabrieltseng Apr 27, 2022
f5438cc
Remove unused function
gabrieltseng Apr 27, 2022
3bffd70
Remove unused imports
gabrieltseng May 2, 2022
765aec0
For the labels in a test area, associate the labels to the right tif …
gabrieltseng May 3, 2022
cee6c20
Fix variable naming when creating test instances
gabrieltseng May 3, 2022
3698c15
Only use the test data for Togo
gabrieltseng May 3, 2022
ac39be3
Use the new .h5 filenames in the datasets
gabrieltseng May 3, 2022
26aacf5
Fix variable names in the dataset
gabrieltseng May 3, 2022
84aa022
Remove unused imports
gabrieltseng May 3, 2022
96625b4
Don't hardcode the seed in the CropHarvest dataset class
gabrieltseng May 3, 2022
eaa7d18
Fix random forest name in the config
gabrieltseng May 4, 2022
82ade08
Fix merge conflicts in the engineer
gabrieltseng Jul 13, 2022
ea3acfd
Fix imports
gabrieltseng Jul 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmarks/deep_learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json

from cropharvest.datasets import CropHarvest
from cropharvest.utils import DATAFOLDER_PATH
from cropharvest.config import DATAFOLDER_PATH
from cropharvest.engineer import TestInstance

from config import (
Expand Down
22 changes: 7 additions & 15 deletions benchmarks/dl/maml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import dill
import warnings
from random import shuffle, random
from collections import defaultdict

import torch
from torch import nn
Expand All @@ -19,10 +18,10 @@

from cropharvest.datasets import CropHarvest, CropHarvestLabels, Task
from cropharvest import countries
from cropharvest.config import TEST_DATASETS, TEST_REGIONS
from cropharvest.config import TEST_COUNTRIES_TO_CROPS
from cropharvest.utils import NoDataForBoundingBoxError

from typing import Dict, Tuple, Optional, List, DefaultDict
from typing import Dict, Tuple, Optional, List


class TrainDataLoader:
Expand Down Expand Up @@ -363,18 +362,11 @@ def _make_tasks(
) -> Tuple[Dict[str, CropHarvest], Dict[str, CropHarvest]]:
labels = CropHarvestLabels(self.root)

# remove any test regions, and collect the countries / crops
test_countries_to_crops: DefaultDict[str, List[str]] = defaultdict(list)

# reshuffle the test_regions dict so its a little easier to
# manipulate in this function
for identifier, _ in TEST_REGIONS.items():
country, crop, _, _ = identifier.split("_")
test_countries_to_crops[country].append(crop)

label_to_task: Dict[str, CropHarvest] = {}

countries_to_ignore = [country for country, _ in TEST_DATASETS.items() if crop is None]
countries_to_ignore = [
country for country, crop in TEST_COUNTRIES_TO_CROPS.items() if crop is not None
]

for country in tqdm(countries.get_countries()):
if country in countries_to_ignore:
Expand All @@ -394,8 +386,8 @@ def _make_tasks(
label_to_task[task.id] = task

for label in labels.classes_in_bbox(country_bbox):
if country in test_countries_to_crops:
if label in test_countries_to_crops[country]:
if country in TEST_COUNTRIES_TO_CROPS:
if label in TEST_COUNTRIES_TO_CROPS[country]:
continue
try:
task = CropHarvest(
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from sklearn.ensemble import RandomForestClassifier

from cropharvest.datasets import CropHarvest
from cropharvest.utils import DATAFOLDER_PATH
from cropharvest.config import DATAFOLDER_PATH
from cropharvest.engineer import TestInstance

from config import SHUFFLE_SEEDS, DATASET_TO_SIZES, RANDOM_FOREST
Expand Down
77 changes: 77 additions & 0 deletions cropharvest/boundingbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from dataclasses import dataclass
from pathlib import Path
from shapely.geometry import Polygon
from math import sin, cos, radians
from typing import List, Tuple
import re

from typing import Optional


@dataclass
class BBox:
    """
    A latitude / longitude bounding box.

    Attributes:
        min_lat, max_lat: the latitude extent of the box.
        min_lon, max_lon: the longitude extent of the box.
        name: an optional label for the box.
        url: a bboxfinder.com link for the box, set in ``__post_init__``.

    Raises:
        ValueError: if a maximum is smaller than the matching minimum.
    """

    min_lat: float
    max_lat: float
    min_lon: float
    max_lon: float

    name: Optional[str] = None

    def __post_init__(self):
        if self.max_lon < self.min_lon:
            raise ValueError("max_lon should be larger than min_lon")
        if self.max_lat < self.min_lat:
            raise ValueError("max_lat should be larger than min_lat")

        self.url = (
            f"http://bboxfinder.com/#{self.min_lat},{self.min_lon},{self.max_lat},{self.max_lon}"
        )

    def contains(self, lat: float, lon: float) -> bool:
        """Return whether the point (lat, lon) lies inside the box (edges included)."""
        # `&` (rather than `and`) is kept on purpose: it is what the
        # comparisons were combined with originally, so any caller passing
        # objects that overload `&` keeps working.
        return (
            (lat >= self.min_lat)
            & (lat <= self.max_lat)
            & (lon >= self.min_lon)
            & (lon <= self.max_lon)
        )

    def contains_bbox(self, bbox: "BBox") -> bool:
        """Return whether `bbox` lies entirely inside this box (edges included)."""
        return (
            (bbox.min_lat >= self.min_lat)
            & (bbox.max_lat <= self.max_lat)
            & (bbox.min_lon >= self.min_lon)
            & (bbox.max_lon <= self.max_lon)
        )

    @property
    def three_dimensional_points(self) -> List[float]:
        r"""
        If we are passing the central latitude and longitude to
        an ML model, we want it to know the extremes are close together.
        Mapping them to 3d space allows us to do that
        """
        lat, lon = self.get_centre(in_radians=True)
        return [cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)]

    def get_centre(self, in_radians: bool = True) -> Tuple[float, float]:
        """
        Return the (lat, lon) midpoint of the box, in radians if
        `in_radians` is True and in the box's own units otherwise.
        """
        # roughly calculate the centres
        lat = self.min_lat + ((self.max_lat - self.min_lat) / 2)
        lon = self.min_lon + ((self.max_lon - self.min_lon) / 2)
        if in_radians:
            return radians(lat), radians(lon)
        return lat, lon

    @classmethod
    def polygon_to_bbox(cls, polygon: "Polygon", name: Optional[str] = None) -> "BBox":
        """Build a box from `polygon.bounds`, optionally labelled with `name`."""
        # bounds is unpacked as (min_lon, min_lat, max_lon, max_lat);
        # the constructor takes latitudes first
        (min_lon, min_lat, max_lon, max_lat) = polygon.bounds
        return cls(min_lat, max_lat, min_lon, max_lon, name)

    @classmethod
    def from_eo_tif_file(cls, path: Path) -> "BBox":
        """
        Build a box from the first four ``=<number>`` values in `path.stem`,
        read in the order min_lat, min_lon, max_lat, max_lon.

        Raises:
            ValueError: if the stem holds fewer than four such values.
        """
        # The exponent part matters: very small floats are written in
        # scientific notation (e.g. "1e-05"), and stopping at the "e" would
        # silently yield the wrong coordinate. At least one digit is required
        # so that a bare "=" is skipped instead of failing in float("").
        coord_strings = re.findall(r"=(-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)", path.stem)
        if len(coord_strings) < 4:
            raise ValueError(
                f"Expected at least 4 coordinates in {path.stem!r}, found {len(coord_strings)}"
            )
        min_lat, min_lon, max_lat, max_lon = (float(c) for c in coord_strings[:4])
        return cls(min_lat=min_lat, min_lon=min_lon, max_lat=max_lat, max_lon=max_lon)
14 changes: 13 additions & 1 deletion cropharvest/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def date_columns(cls) -> List[str]:

class RequiredColumns(Columns):

INDEX = "index"
INDEX = "dataset_index"
IS_CROP = "is_crop"
LAT = "lat"
LON = "lon"
Expand All @@ -42,3 +42,15 @@ class NullableColumns(Columns):
@classmethod
def date_columns(cls) -> List[str]:
return [cls.HARVEST_DATE, cls.PLANTING_DATE]


class EngColumns:
    """
    Names of the extra columns that exist only on the labels
    once the Engineer has loaded them.
    """

    # column holding the features filename
    FEATURES_FILENAME = "features_filename"
    # column holding the features path
    FEATURES_PATH = "features_path"
    # column flagging whether the feature exists
    EXISTS = "feature_exists"
    # column holding the tif path(s)
    TIF_FILEPATHS = "tif_path"
28 changes: 27 additions & 1 deletion cropharvest/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from .countries import BBox
from pathlib import Path
from collections import defaultdict

from .boundingbox import BBox

from typing import Dict

Expand All @@ -22,6 +25,14 @@
FEATURES_DIR = "features"
TEST_FEATURES_DIR = "test_features"

# These values describe the structure of the data folder
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ivanzvonkov, this locks in the folder structure, but I think that's fine.

We could potentially have a way of overriding this datafolder path at a package level, but otherwise I'd be for removing folder manipulation for the user entirely and controlling it here.

DATAFOLDER_PATH = Path(__file__).parent.parent / "data"
EO_FILEPATH = DATAFOLDER_PATH / "eo_data"
TEST_EO_FILEPATH = DATAFOLDER_PATH / "test_eo_data"
FEATURES_FILEPATH = DATAFOLDER_PATH / FEATURES_DIR
ARRAYS_FILEPATH = FEATURES_FILEPATH / "arrays"
TEST_FEATURES_FILEPATH = DATAFOLDER_PATH / TEST_FEATURES_DIR

# the default seed is useful because it also seeds the deterministic
# shuffling algorithm we use (in cropharvest.utils.deterministic_shuffle)
# so fixing this ensures the evaluation sets consist of the same data no matter
Expand All @@ -47,3 +58,18 @@
}

TEST_DATASETS = {"Togo": "togo-eval"}


def test_countries_to_crops():
    """
    Group the test crops by country.

    Every key of TEST_REGIONS is split on "_" into four parts, of which
    the first two are used as the country and the crop. Every key of
    TEST_DATASETS additionally gets a None entry appended to its list.

    Returns a defaultdict mapping country -> list of crops (or None).
    """
    crops_by_country = defaultdict(list)

    for region_id in TEST_REGIONS:
        country_name, crop_name, _, _ = region_id.split("_")
        crops_by_country[country_name].append(crop_name)

    for dataset_country in TEST_DATASETS:
        crops_by_country[dataset_country].append(None)

    return crops_by_country


TEST_COUNTRIES_TO_CROPS = test_countries_to_crops()
69 changes: 3 additions & 66 deletions cropharvest/countries.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,12 @@
from dataclasses import dataclass
import geopandas
from shapely.geometry import Polygon, MultiPolygon
from math import sin, cos, radians
from typing import List, Tuple
from typing import List
from pathlib import Path

from typing import Optional
from cropharvest.boundingbox import BBox
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you!


COUNTRY_SHAPEFILE = geopandas.read_file(str(Path(__file__).parent / "country_shapefile"))


@dataclass
class BBox:

min_lat: float
max_lat: float
min_lon: float
max_lon: float

name: Optional[str] = None

def __post_init__(self):
if self.max_lon < self.min_lon:
raise ValueError("max_lon should be larger than min_lon")
if self.max_lat < self.min_lat:
raise ValueError("max_lat should be larger than min_lat")

self.url = (
f"http://bboxfinder.com/#{self.min_lat},{self.min_lon},{self.max_lat},{self.max_lon}"
)

def contains(self, lat: float, lon: float) -> bool:
return (
(lat >= self.min_lat)
& (lat <= self.max_lat)
& (lon >= self.min_lon)
& (lon <= self.max_lon)
)

def contains_bbox(self, bbox) -> bool:
return (
(bbox.min_lat >= self.min_lat)
& (bbox.max_lat <= self.max_lat)
& (bbox.min_lon >= self.min_lon)
& (bbox.max_lon <= self.max_lon)
)

@property
def three_dimensional_points(self) -> List[float]:
r"""
If we are passing the central latitude and longitude to
an ML model, we want it to know the extremes are close together.
Mapping them to 3d space allows us to do that
"""
lat, lon = self.get_centre(in_radians=True)
return [cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat)]

def get_centre(self, in_radians: bool = True) -> Tuple[float, float]:

# roughly calculate the centres
lat = self.min_lat + ((self.max_lat - self.min_lat) / 2)
lon = self.min_lon + ((self.max_lon - self.min_lon) / 2)
if in_radians:
return radians(lat), radians(lon)
else:
return lat, lon

@classmethod
def polygon_to_bbox(cls, polygon: Polygon, name: Optional[str] = None):
(min_lon, min_lat, max_lon, max_lat) = polygon.bounds
return cls(min_lat, max_lat, min_lon, max_lon, name)
COUNTRY_SHAPEFILE = geopandas.read_file(str(Path(__file__).parent / "country_shapefile"))


def get_country_bbox(country_name: str) -> List[BBox]:
Expand Down
Loading