Exporters/ndvi #73

Open · wants to merge 10 commits into base: master
31 changes: 22 additions & 9 deletions scripts/export.py
@@ -1,11 +1,13 @@
from pathlib import Path
import numpy as np

import sys
sys.path.append('..')
from src.exporters import (ERA5Exporter, VHIExporter,
                           CHIRPSExporter, ERA5ExporterPOS,
                           GLEAMExporter, ESACCIExporter,
                           S5Exporter, SRTMExporter, KenyaAdminExporter,
                           NDVIExporter)


def export_era5():
@@ -170,12 +172,23 @@ def export_kenya_boundaries():
    exporter.export()


def export_ndvi():
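    # resolve the data/ directory whether the script is run from the repo
    # root or from scripts/ (assumes the checkout is named ml_drought)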
    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
        data_path = Path('data')
    else:
        data_path = Path('../data')

    exporter = NDVIExporter(data_path)
    exporter.export()


if __name__ == '__main__':
    # export_era5()
    # export_vhi()
    # export_chirps()
    # export_era5POS()
    # export_gleam()
    # export_esa()
    # export_s5()
    # export_kenya_boundaries()
    export_ndvi()
5 changes: 4 additions & 1 deletion src/exporters/__init__.py
@@ -6,9 +6,12 @@
from .gleam import GLEAMExporter
from .srtm import SRTMExporter
from .esa_cci import ESACCIExporter
from .ndvi import NDVIExporter
from .admin_boundaries import KenyaAdminExporter


__all__ = [
    'ERA5Exporter', 'VHIExporter', 'ERA5ExporterPOS',
    'CHIRPSExporter', 'S5Exporter', 'GLEAMExporter',
    'NDVIExporter', 'SRTMExporter', 'ESACCIExporter',
    'KenyaAdminExporter']
125 changes: 125 additions & 0 deletions src/exporters/ndvi.py
@@ -0,0 +1,125 @@
from pathlib import Path
from typing import List, Optional
import urllib.request
import numpy as np
import os
import multiprocessing

import re

from .base import BaseExporter
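# lazy-import placeholder: bs4 is only imported when an NDVIExporter is constructed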
BeautifulSoup = None


class NDVIExporter(BaseExporter):
    """Exports the Normalised Difference Vegetation Index (NDVI) from NOAA

    https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/
    """

    def __init__(self, data_folder: Path = Path('data')) -> None:
        global BeautifulSoup
        if BeautifulSoup is None:
            from bs4 import BeautifulSoup

        super().__init__(data_folder)

        self.ndvi_folder = self.raw_folder / "ndvi"
        if not self.ndvi_folder.exists():
            self.ndvi_folder.mkdir()

        self.base_url = 'https://www.ncei.noaa.gov/data/' \
                        'avhrr-land-normalized-difference-vegetation-index/' \
                        'access'


    @staticmethod
    def beautiful_soup_url(url: str) -> BeautifulSoup:  # type: ignore
        # use urllib.request to read the page source
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
        the_page = response.read()

        # use BeautifulSoup to parse the html source
        soup = BeautifulSoup(the_page, features="lxml")  # type: ignore

        return soup

    def get_ndvi_url_paths(self,
                           selected_years: Optional[List[int]] = None,
                           ) -> List[str]:
        # use BeautifulSoup to parse the html source
        soup = self.beautiful_soup_url(self.base_url)
        # find all links (to the years)
        years = [
            yrs.string.replace('/', '')
            for yrs in soup.find_all('a')  # type: ignore
            if re.match(r'[0-9]{4}', yrs.string)
        ]

        # filter for selected_years
        if selected_years is not None:
            years = [y for y in years if int(y) in selected_years]

        # build the year urls
        year_urls = [
            f'{self.base_url}/{y}'
            for y in years
        ]

        # get the urls for the .nc files
        all_urls = []
        for url in year_urls:
            links = self.beautiful_soup_url(url).find_all('a')  # type: ignore
            nc_links = [
                f'{url}/{link.string}'
                for link in links
                if '.nc' in link.string
            ]
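            # each entry has the form (illustrative filename only):
            #   {base_url}/1981/AVHRR-Land_..._19811231_....nc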
            all_urls.extend(nc_links)

        return all_urls

    def wget_file(self, url: str) -> None:
        # create year subdirectories
        year = url.split('/')[-2]
        out_folder = self.ndvi_folder / year
        if not out_folder.exists():
            out_folder.mkdir(parents=True, exist_ok=True)

        # check if the file already exists
        fname = url.split('/')[-1]
        if (out_folder / fname).exists():
            print(f'{fname} for {year} already downloaded!')
            return

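        # wget flags: -np (don't ascend to the parent directory),
        # -nH (no host-prefixed directories), -P (download prefix directory)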
        os.system(f'wget -np -nH {url} -P {out_folder.as_posix()}')
        print(f'{fname} for {year} downloaded!')

    def export(self, years: Optional[List[int]] = None,
               parallel_processes: int = 1) -> None:
        """Export functionality for the NDVI product from AVHRR (NOAA),
        1981 - 2019 (daily).

        Arguments
        ---------
        years: Optional[List[int]], default = None
            The years of data to download. If None, all data will be downloaded.
        parallel_processes: int, default = 1
            The number of processes used to parallelize the downloads.
        """
        if years is not None:
            valid_years = np.arange(1981, 2020)
            assert np.isin(years, valid_years).all(), \
                'Expected `years` argument to be in the range 1981-2019'

        urls = self.get_ndvi_url_paths(selected_years=years)

        if parallel_processes <= 1:  # sequential
            for url in urls:
                self.wget_file(url)
        else:  # parallel
            with multiprocessing.Pool(processes=parallel_processes) as pool:
                pool.map(self.wget_file, urls)


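For orientation, a minimal usage sketch of the new exporter (the year
selection and process count below are illustrative, not defaults):

from pathlib import Path
from src.exporters import NDVIExporter

exporter = NDVIExporter(Path('data'))  # files land in data/raw/ndvi/<year>/
exporter.export(years=[1981], parallel_processes=2)  # fetch 1981, two at a time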
46 changes: 46 additions & 0 deletions tests/exporters/test_ndvi.py
@@ -0,0 +1,46 @@
from unittest.mock import patch

from src.exporters import NDVIExporter


class TestNDVIExporter:
    def test_init(self, tmp_path):
        e = NDVIExporter(tmp_path)

        assert e.ndvi_folder.name == 'ndvi'
        assert (tmp_path / 'raw' / 'ndvi').exists()

    @patch('os.system', autospec=True)
    def test_checkpointing(self, mock_system, tmp_path, capsys):
        # checks we don't redownload files
        exporter = NDVIExporter(tmp_path)

        # set up the already-downloaded file
        test_filename = '1981/testy_test.nc'
        (tmp_path / 'raw/ndvi/1981').mkdir(parents=True, exist_ok=True)
        (tmp_path / f'raw/ndvi/{test_filename}').touch()

        exporter.wget_file(test_filename)
        captured = capsys.readouterr()

        expected_stdout = 'testy_test.nc for 1981 already downloaded!\n'
        assert captured.out == expected_stdout, \
            f'Expected stdout to be {expected_stdout}, got {captured.out}'
        mock_system.assert_not_called()  # the download should have been skipped

    @patch('os.system')
    def test_beautiful_soup_regex_parse(self, mock_system, tmp_path):
        exporter = NDVIExporter(tmp_path)
        files = exporter.get_ndvi_url_paths(selected_years=[1981])

        # check that every file is a netCDF file
        assert all(f.endswith('.nc') for f in files)

        # check the base of each URL
        base_url_str = 'https://www.ncei.noaa.gov/data/' \
                       'avhrr-land-normalized-difference-vegetation-index/access/1981/'
        assert all(f.split('AVHRR')[0] == base_url_str for f in files)

        # check we got 31 December
        timestamp = '19811231'
        assert files[-1].split('_')[-2] == timestamp
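
Note: test_beautiful_soup_regex_parse above parses the live NOAA directory
listing. A possible offline variant (a sketch only; the canned HTML and the
AVHRR filename below are illustrative) would stub beautiful_soup_url:

    @patch('os.system')
    def test_regex_parse_offline(self, mock_system, tmp_path):
        from bs4 import BeautifulSoup

        def fake_soup(url):
            # canned listings: one year link at the top level, one .nc file within
            if url.endswith('/access'):
                html = '<a href="1981/">1981/</a>'
            else:
                html = '<a>AVHRR-Land_v005_AVH13C1_NOAA-07_19811231_c20170610.nc</a>'
            return BeautifulSoup(html, features='lxml')

        exporter = NDVIExporter(tmp_path)
        with patch.object(NDVIExporter, 'beautiful_soup_url', side_effect=fake_soup):
            files = exporter.get_ndvi_url_paths(selected_years=[1981])

        assert files[-1].split('_')[-2] == '19811231'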