diff --git a/scripts/export.py b/scripts/export.py
index 601b0f8da..17c359097 100644
--- a/scripts/export.py
+++ b/scripts/export.py
@@ -1,11 +1,13 @@
 from pathlib import Path
+import numpy as np
 import sys
 sys.path.append('..')
 
 from src.exporters import (ERA5Exporter, VHIExporter,
                            CHIRPSExporter, ERA5ExporterPOS,
                            GLEAMExporter, ESACCIExporter,
-                           S5Exporter, SRTMExporter, KenyaAdminExporter)
+                           S5Exporter, SRTMExporter, KenyaAdminExporter,
+                           NDVIExporter)
 
 
 def export_era5():
@@ -170,12 +172,23 @@ def export_kenya_boundaries():
     exporter.export()
 
 
+def export_ndvi():
+    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
+        data_path = Path('data')
+    else:
+        data_path = Path('../data')
+
+    exporter = NDVIExporter(data_path)
+    exporter.export()
+
+
 if __name__ == '__main__':
-    export_era5()
-    export_vhi()
-    export_chirps()
-    export_era5POS()
-    export_gleam()
-    export_esa()
-    export_s5()
-    export_kenya_boundaries()
+    # export_era5()
+    # export_vhi()
+    # export_chirps()
+    # export_era5POS()
+    # export_gleam()
+    # export_esa()
+    # export_s5()
+    # export_kenya_boundaries()
+    export_ndvi()
\ No newline at end of file
diff --git a/src/exporters/__init__.py b/src/exporters/__init__.py
index c9b06d532..a69393392 100644
--- a/src/exporters/__init__.py
+++ b/src/exporters/__init__.py
@@ -6,9 +6,12 @@
 from .gleam import GLEAMExporter
 from .srtm import SRTMExporter
 from .esa_cci import ESACCIExporter
+from .ndvi import NDVIExporter
 from .admin_boundaries import KenyaAdminExporter
 
+
 __all__ = [
     'ERA5Exporter', 'VHIExporter', 'ERA5ExporterPOS',
     'CHIRPSExporter', 'S5Exporter', 'GLEAMExporter',
-    'SRTMExporter', 'ESACCIExporter', 'KenyaAdminExporter']
+    'NDVIExporter', 'SRTMExporter', 'ESACCIExporter',
+    'KenyaAdminExporter']
diff --git a/src/exporters/ndvi.py b/src/exporters/ndvi.py
new file mode 100644
index 000000000..a69c8776b
--- /dev/null
+++ b/src/exporters/ndvi.py
@@ -0,0 +1,125 @@
+from pathlib import Path
+from typing import List, Optional
+import urllib.request
+import numpy as np
+import os
+import multiprocessing
+
+import re
+
+from .base import BaseExporter
+BeautifulSoup = None
+
+
+class NDVIExporter(BaseExporter):
+    """Exports Normalised Difference Vegetation Index from NOAA
+
+    https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/
+    """
+
+    def __init__(self, data_folder: Path = Path('data')) -> None:
+        global BeautifulSoup
+        if BeautifulSoup is None:
+            from bs4 import BeautifulSoup
+
+        super().__init__(data_folder)
+
+        self.ndvi_folder = self.raw_folder / "ndvi"
+        if not self.ndvi_folder.exists():
+            self.ndvi_folder.mkdir()
+
+        self.base_url = 'https://www.ncei.noaa.gov/data/' \
+            'avhrr-land-normalized-difference-vegetation-index/' \
+            'access'.replace(' ', '')
+
+    @staticmethod
+    def beautiful_soup_url(url: str) -> BeautifulSoup:  # type: ignore
+        # use urllib.request to read the page source
+        req = urllib.request.Request(url)
+        response = urllib.request.urlopen(req)
+        the_page = response.read()
+
+        # use BeautifulSoup to parse the html source
+        soup = BeautifulSoup(the_page, features="lxml")  # type: ignore
+
+        return soup
+
+    def get_ndvi_url_paths(self,
+                           selected_years: Optional[List[int]] = None,
+                           ) -> List[str]:
+        # use BeautifulSoup to parse the html source
+        soup = self.beautiful_soup_url(self.base_url)
+        # find all links (to the years)
+        years = [
+            yrs.string.replace('/', '')
+            for yrs in soup.find_all('a')  # type: ignore
+            if re.match(r'[0-9]{4}', yrs.string)
+        ]
+
+        # filter for selected_years
+        if selected_years is not None:
+            years = [y for y in years if int(y) in selected_years]
+
+        # build the year urls
+        year_urls = [
+            f'{self.base_url}/{y}'
+            for y in years
+        ]
+
+        # get the urls for the .nc files
+        all_urls = []
+        for url in year_urls:
+            links = self.beautiful_soup_url(url).find_all('a')  # type: ignore
+            nc_links = [
+                f'{url}/{l.string}'
+                for l in links
+                if '.nc' in l.string
+            ]
+            all_urls.extend(nc_links)
+
+        return all_urls
+
+    def wget_file(self, url) -> None:
+        # create year subdirectories
+        year = url.split('/')[-2]
+        out_folder = self.ndvi_folder / year
+        if not out_folder.exists():
+            out_folder.mkdir(parents=True, exist_ok=True)
+
+        # check if file already exists
+        fname = url.split('/')[-1]
+        if (out_folder / fname).exists():
+            print(f'{fname} for {year} already downloaded!')
+            return
+
+        os.system(f'wget -np -nH {url} -P {out_folder.as_posix()}')
+        print(f'{fname} for {year} downloaded!')
+
+    def export(self, years: Optional[List[int]] = None,
+               parallel_processes: int = 1) -> None:
+        """Export functionality for the NDVI product from AVHRR (NOAA)
+        1981 - 2019 (daily).
+
+        Arguments
+        ----------
+        years: Optional list of ints, default = None
+            The years of data to download. If None, all data will be downloaded
+        parallel_processes: int, default = 1
+            number of processes to parallelize the downloading of data
+        """
+        if years is not None:
+            valid_years = np.arange(1981, 2020)
+            assert np.isin(years, valid_years).all(), \
+                'Expected `years` argument to be in range 1981-2019'
+
+        urls = self.get_ndvi_url_paths(selected_years=years)
+
+        if parallel_processes <= 1:  # sequential
+            for url in urls:
+                self.wget_file(url)
+        else:  # parallel
+            pool = multiprocessing.Pool(processes=parallel_processes)
+            pool.map(self.wget_file, urls)
diff --git a/tests/exporters/test_ndvi.py b/tests/exporters/test_ndvi.py
new file mode 100644
index 000000000..98616c7de
--- /dev/null
+++ b/tests/exporters/test_ndvi.py
@@ -0,0 +1,46 @@
+from unittest.mock import patch
+
+from src.exporters import NDVIExporter
+
+
+class TestNDVIExporter:
+    def test_init(self, tmp_path):
+        e = NDVIExporter(tmp_path)
+
+        assert e.ndvi_folder.name == 'ndvi'
+        assert (tmp_path / 'raw' / 'ndvi').exists()
+
+    @patch('os.system', autospec=True)
+    def test_checkpointing(self, mock_system, tmp_path, capsys):
+        # checks we don't redownload files
+        exporter = NDVIExporter(tmp_path)
+
+        # setup the already downloaded file
+        test_filename = '1981/testy_test.nc'
+        (tmp_path / 'raw/ndvi/1981').mkdir(parents=True, exist_ok=True)
+        (tmp_path / f'raw/ndvi/{test_filename}').touch()
+
+        exporter.wget_file(test_filename)
+        captured = capsys.readouterr()
+
+        expected_stdout = 'testy_test.nc for 1981 already downloaded!\n'
+        assert captured.out == expected_stdout, \
+            f'Expected stdout to be {expected_stdout}, got {captured.out}'
+        mock_system.assert_not_called()  # os.system should have been skipped
+
+    @patch('os.system')
+    def test_beautiful_soup_regex_parse(self, mock_system, tmp_path):
+        exporter = NDVIExporter(tmp_path)
+        files = exporter.get_ndvi_url_paths(selected_years=[1981])
+
+        # check that all files are netcdf files
+        assert all([f[-3:] == '.nc' for f in files])
+
+        # check base of string
+        base_url_str = 'https://www.ncei.noaa.gov/data/' \
+            'avhrr-land-normalized-difference-vegetation-index/access/1981/'
+        assert all([f.split('AVHRR')[0] == base_url_str for f in files])
+
+        # check got 31 December
+        timestamp = '19811231'
+        assert files[-1].split('_')[-2] == timestamp
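
A brief usage sketch of the new exporter follows. It is not part of the patch; the data directory, year selection, and process count are illustrative assumptions, chosen to mirror how scripts/export.py drives the class.

# Usage sketch (illustrative, not part of the patch): download one year of
# AVHRR NDVI into data/raw/ndvi/<year>/ using four parallel wget processes.
from pathlib import Path

from src.exporters import NDVIExporter

exporter = NDVIExporter(Path('data'))
exporter.export(years=[1981], parallel_processes=4)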