Exporters/ndvi #73

Open · wants to merge 10 commits into master

Changes from 5 commits
49 changes: 47 additions & 2 deletions scripts/export.py
@@ -4,8 +4,9 @@
sys.path.append('..')
from src.exporters import (ERA5Exporter, VHIExporter,
                           CHIRPSExporter, ERA5ExporterPOS,
                           GLEAMExporter, NDVIExporter,
                           S5Exporter)
import numpy as np

def export_era5():
    # if the working directory is already ml_drought we don't need ../data
@@ -63,9 +64,53 @@ def export_gleam():
    exporter.export(['E', 'SMroot', 'SMsurf'], 'monthly')


def export_ndvi():
    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
        data_path = Path('data')
    else:
        data_path = Path('../data')

    exporter = NDVIExporter(data_path)
    exporter.export()


def export_s5(
        granularity='hourly', pressure_level=False,
        variable='total_precipitation', min_year=1993,
        max_year=1994, min_month=1, max_month=12,
):
    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
        data_path = Path('data')
    else:
        data_path = Path('../data')

    exporter = S5Exporter(
        data_folder=data_path,
        granularity=granularity,
        pressure_level=pressure_level,
    )
    max_leadtime = None
    pressure_levels = [200, 500, 925]
    selection_request = None
    n_parallel_requests = 1

    exporter.export(
        variable=variable,
        min_year=min_year,
        max_year=max_year,
        min_month=min_month,
        max_month=max_month,
        max_leadtime=max_leadtime,
        pressure_levels=pressure_levels,
        n_parallel_requests=n_parallel_requests,
    )

if __name__ == '__main__':
    export_era5()
    export_vhi()
    export_chirps()
    export_era5POS()
    export_gleam()
    export_ndvi()
    # export_s5()
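
For reviewers trying the change locally, a minimal sketch of exercising only the new exporter from the repository root (the two-year subset and the `data` path are assumptions for illustration; a full download is much larger):

    from pathlib import Path
    from src.exporters import NDVIExporter

    # files land in data/raw/ndvi/<year>/
    exporter = NDVIExporter(Path('data'))
    exporter.export(years=[1981, 1982], parallel_processes=1)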
42 changes: 0 additions & 42 deletions scripts/export_s5.py

This file was deleted. (Its export_s5 functionality now lives in scripts/export.py, above.)

4 changes: 3 additions & 1 deletion src/exporters/__init__.py
@@ -4,8 +4,10 @@
from .planetOS import ERA5ExporterPOS
from .seas5.s5 import S5Exporter
from .gleam import GLEAMExporter
from .ndvi import NDVIExporter

__all__ = [
    'ERA5Exporter', 'VHIExporter', 'ERA5ExporterPOS',
    'CHIRPSExporter', 'S5Exporter', 'GLEAMExporter',
    'NDVIExporter',
]
120 changes: 120 additions & 0 deletions src/exporters/ndvi.py
@@ -0,0 +1,120 @@
from pathlib import Path
from typing import List, Optional
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
import os
import multiprocessing
import re

from .base import BaseExporter


class NDVIExporter(BaseExporter):
    """Exports Normalised Difference Vegetation Index from NOAA

    https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/
    """

    def __init__(self, data_folder: Path = Path('data')) -> None:
        super().__init__(data_folder)

        self.ndvi_folder = self.raw_folder / "ndvi"
        if not self.ndvi_folder.exists():
            self.ndvi_folder.mkdir()

        self.base_url = 'https://www.ncei.noaa.gov/data/' \
            'avhrr-land-normalized-difference-vegetation-index/' \
            'access'

    @staticmethod
    def beautiful_soup_url(url: str) -> BeautifulSoup:
        # use urllib.request to read the page source
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
        the_page = response.read()

        # use BeautifulSoup to parse the html source
        soup = BeautifulSoup(the_page, features="lxml")

        return soup

    def get_ndvi_url_paths(self,
                           selected_years: Optional[List[int]] = None,
                           ) -> List[str]:
        # use BeautifulSoup to parse the html source
        soup = self.beautiful_soup_url(self.base_url)
        # find all links (to the years)
        years = [
            yrs.string.replace('/', '')
            for yrs in soup.find_all('a')
            if re.match(r'[0-9]{4}', yrs.string)
        ]

        # filter for selected_years
        if selected_years is not None:
            years = [y for y in years if int(y) in selected_years]

        # build the year urls
        year_urls = [
            f'{self.base_url}/{y}'
            for y in years
        ]

        # get the urls for the .nc files
        all_urls = []
        for url in year_urls:
            links = self.beautiful_soup_url(url).find_all('a')
            nc_links = [
                f'{url}/{link.string}'
                for link in links
                if '.nc' in link.string
            ]
            all_urls.extend(nc_links)

        return all_urls

    def wget_file(self, url: str) -> None:
        # create year subdirectories
        year = url.split('/')[-2]
        out_folder = self.ndvi_folder / year
        if not out_folder.exists():
            out_folder.mkdir(parents=True, exist_ok=True)

        # check if file already exists
        fname = url.split('/')[-1]
        if (out_folder / fname).exists():
            print(f'{fname} for {year} already downloaded!')
            return

        os.system(f'wget -np -nH {url} -P {out_folder.as_posix()}')
        print(f'{fname} for {year} downloaded!')

    def export(self, years: Optional[List[int]] = None,
               parallel_processes: int = 1) -> None:
        """Export functionality for the NDVI product from AVHRR (NOAA),
        1981-2019.

        Arguments
        ---------
        years: Optional[List[int]], default = None
            The years of data to download. If None, all data will be
            downloaded.
        parallel_processes: int, default = 1
            The number of processes used to parallelize the downloads.
        """
        if years is not None:
            valid_years = np.arange(1981, 2020)
            assert np.isin(years, valid_years).all(), \
                'Expected `years` argument to be in range 1981-2019'

        urls = self.get_ndvi_url_paths(selected_years=years)

        if parallel_processes <= 1:  # sequential
            for url in urls:
                self.wget_file(url)
        else:  # parallel
            with multiprocessing.Pool(processes=parallel_processes) as pool:
                pool.map(self.wget_file, urls)
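
Because listing and downloading are separate methods, the exporter is easy to poke at interactively. A hedged sketch, assuming the NOAA index pages are reachable and `wget` is on the PATH:

    from pathlib import Path
    from src.exporters import NDVIExporter

    exporter = NDVIExporter(Path('data'))
    # list the .nc file urls for one year without downloading anything
    urls = exporter.get_ndvi_url_paths(selected_years=[1981])
    for url in urls[:2]:
        exporter.wget_file(url)  # writes into data/raw/ndvi/1981/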
104 changes: 104 additions & 0 deletions tests/exporters/test_ndvi.py
@@ -0,0 +1,104 @@
from unittest.mock import patch, MagicMock
import urllib.request
import numpy as np

from src.exporters import NDVIExporter


class TestNDVIExporter:
    def test_init(self, tmp_path):
        e = NDVIExporter(tmp_path)

        assert e.ndvi_folder.name == 'ndvi'
        assert (tmp_path / 'raw' / 'ndvi').exists()

    @patch('os.system', autospec=True)
    def test_checkpointing(self, mock_system, tmp_path, capsys):
        # checks we don't redownload files
        exporter = NDVIExporter(tmp_path)

        # set up the already downloaded file
        test_filename = '1981/testy_test.nc'
        (tmp_path / 'raw/ndvi/1981').mkdir(parents=True, exist_ok=True)
        (tmp_path / f'raw/ndvi/{test_filename}').touch()

        exporter.wget_file(test_filename)
        captured = capsys.readouterr()

        expected_stdout = 'testy_test.nc for 1981 already downloaded!\n'
        assert captured.out == expected_stdout, \
            f'Expected stdout to be {expected_stdout}, got {captured.out}'
        mock_system.assert_not_called()  # os.system should have been skipped

    @patch('urllib.request.Request', autospec=True)
    def test_get_filenames(self, request_patch, monkeypatch, tmp_path):
        # the responses below are the first ~1000 characters of the
        # html returned by the server, pulled on July 23 2019
        request_patch.return_value = MagicMock()

        # EXPECTED response for the first page (all years)
        home_page_response = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD ' \
            'HTML 3.2 Final//EN">\n<html>\n <head>\n<title>Index of ' \
            '/data/avhrr-land-normalized-difference-vegetation-index/access</title>\n' \
            '</head>\n <body>\n<h1>Index of ' \
            '/data/avhrr-land-normalized-difference-vegetation-index/access</h1>' \
            '\n<table><tr><th>&nbsp;</th><th><a ' \
            'href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a ' \
            'href="?C=S;O=A">Size</a></th><th><a ' \
            'href="?C=D;O=A">Description</a></th></tr><tr><th ' \
            'colspan="5"><hr></th></tr>\n<tr><td valign="top">&nbsp;</td><td><a ' \
            'href="/data/avhrr-land-normalized-difference-vegetation-index/">Parent ' \
            'Directory</a></td><td>&nbsp;</td><td align="right"> -' \
            '</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a ' \
            'href="1981/">1981/</a></td><td align="right">14-Jul-2019 16:09'

        # EXPECTED response for the second page (1981, all .nc files)
        year_page_response = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">\n<html>\n' \
            '<head>\n <title>Index of ' \
            '/data/avhrr-land-normalized-difference-vegetation-index/access/' \
            '1981</title>\n</head>\n <body>\n<h1>Index of ' \
            '/data/avhrr-land-normalized-difference-vegetation-index/access/1981' \
            '</h1>\n<table><tr><th>&nbsp;</th><th><a ' \
            'href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last ' \
            'modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a ' \
            'href="?C=D;O=A">Description</a></th></tr><tr><th ' \
            'colspan="5"><hr></th></tr>\n<tr><td valign="top">&nbsp;</td><td><a ' \
            'href="/data/avhrr-land-normalized-difference-vegetation-index/access/">Parent ' \
            'Directory</a></td><td>&nbsp;</td><td align="right"> -' \
            '</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a ' \
            'href="AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc">' \
            'AVHRR-Land_v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc</a></td><td ' \
            'align="right">12-Jul-2019 10:37 </td><td align="right">' \
            '51M</td><td>&nbsp;</td></tr>\n<tr><td valign="top">&nbsp;</td><td><a ' \
            'href="AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc">' \
            'AVHRR-Land_v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc</a></td><td ' \
            'align="right">12-Jul-2019 10:37 </td><td align="right">' \
            '59M</td><td>&nbsp;</td></tr>'

        expected_urls = [
            'https://www.ncei.noaa.gov/data/avhrr-land-normalized-'
            'difference-vegetation-index/access/1981/AVHRR-Land_'
            'v005_AVH13C1_NOAA-07_19810624_c20170610041337.nc',
            'https://www.ncei.noaa.gov/data/avhrr-land-normalized-'
            'difference-vegetation-index/access/1981/AVHRR-Land_'
            'v005_AVH13C1_NOAA-07_19810625_c20170610042839.nc',
        ]

        # mock urllib.request.urlopen (used inside beautiful_soup_url) so it
        # serves the canned pages above: first the year listing, then 1981
        responses = iter([home_page_response, year_page_response])

        def mockreturn(request):
            class OpenURL:
                def read(self):
                    return next(responses)
            return OpenURL()

        monkeypatch.setattr(urllib.request, 'urlopen', mockreturn)

        exporter = NDVIExporter(tmp_path)
        filenames = exporter.get_ndvi_url_paths(selected_years=np.arange(1981, 1985))

        assert filenames == expected_urls, \
            f'Expected {expected_urls}, got {filenames}'
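
As a quick sanity check, this module can be run on its own (assuming pytest and lxml are installed in the environment):

    # run only this test module, quietly, from the repository root
    import pytest

    pytest.main(['-q', 'tests/exporters/test_ndvi.py'])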