forked from svenvanc/BSc-AutoML4SeaIce
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
106 lines (78 loc) · 3.62 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Function to preprocess ASIP2 scenes."""
# -- File info -- #
__author__ = 'Andreas R. Stokholm'
__contributors__ = 'Andrzej S. Kucik'
__copyright__ = ['Technical University of Denmark', 'European Space Agency']
__contact__ = ['[email protected]', '[email protected]']
__version__ = '0.2.0'
__date__ = '2022-02-08'
# -- Built-in modules -- #
import glob
import os
# -- Third-party modules -- #
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
# -- Proprietary modules -- #
from functions.preprocessing_functions import PreprocessParallel
from functions.utils import colour_str, SCENE_VARIABLES
from init import OPTIONS
# Parameters
# Value passed as `num_workers` to the DataLoader that drives the preprocessing in main().
NUM_WORKERS = 4
def collate_function(x):
    """Return the given sample untouched.

    Passed as ``collate_fn`` to the DataLoader in ``main`` so that whatever the
    dataset yields is handed to the loop as-is instead of going through the
    DataLoader's default collation.

    :param x: object produced by the dataset.
    :return: the very same object ``x``.
    """
    return x
def _report_value_issues(scene):
    """Print a warning for every training variable of `scene` with suspicious values.

    For each of the first five entries of ``SCENE_VARIABLES`` the minimum and maximum of the
    variable are compared against ``OPTIONS['normalize_range']`` and the data is checked for NaNs.
    The three checks are independent, so a single variable can trigger several messages.

    :param scene: processed scene; must support ``scene[variable].values`` and ``scene.attrs['scene_id']``
        (presumably an xarray Dataset -- TODO confirm against PreprocessParallel).
    """
    scene_id = scene.attrs['scene_id']
    for variable in SCENE_VARIABLES[:5]:
        values = scene[variable].values
        scene_min = np.min(values)
        scene_max = np.max(values)

        if scene_min < OPTIONS['normalize_range'][0]:
            print('Minimum value of:', colour_str(scene_min, 'blue'), 'in scene',
                  colour_str(scene_id, 'purple'), 'for variable ', colour_str(variable, 'green'))
        # Separate `if` (not `elif`): an upper-bound violation must be reported even when the lower bound
        # is violated too, and the reported number has to be the maximum, not the minimum.
        if scene_max > OPTIONS['normalize_range'][1]:
            print('Maximum value of:', colour_str(scene_max, 'blue'), 'in scene',
                  colour_str(scene_id, 'purple'), 'for variable ', colour_str(variable, 'green'))
        # A NaN anywhere in the data makes the sum NaN.
        if np.isnan(np.sum(values)):
            print('NaN in', colour_str(scene_id, 'purple'))


def main():
    """Run preprocessing routine.

    Steps:
      1. Collect the ``*.nc`` files in ``OPTIONS['path_to_data']``.
      2. Drop every file for which both the processed scene (``<first 15 chars>_pro.nc`` in
         ``OPTIONS['path_to_processed_data']``) and its bins (``<first 15 chars>_bins.npy`` in
         ``misc/scene_pro_bins``) already exist.
      3. Preprocess the remaining files through ``PreprocessParallel`` wrapped in a DataLoader
         with ``NUM_WORKERS`` workers.
      4. Report values outside ``OPTIONS['normalize_range']`` or NaNs, then save each scene
         and its bins.

    Both output directories are created when missing. Returns without starting any workers when
    there is nothing left to process.
    """
    print(colour_str('\n# -- Preprocessing Scenes -- #\n', 'cyan'))

    processed_dir = OPTIONS['path_to_processed_data']
    bins_dir = 'misc/scene_pro_bins'

    # Make sure both output locations exist before they are listed or written to.
    os.makedirs(processed_dir, exist_ok=True)
    os.makedirs(bins_dir, exist_ok=True)

    # - Ignore files which were already processed
    # List each output directory once; sets give O(1) membership tests per candidate file.
    processed_names = set(os.listdir(processed_dir))
    bins_names = set(os.listdir(bins_dir))

    dirlist = []
    for file in glob.glob(os.path.join(OPTIONS['path_to_data'], '*.nc')):
        prefix = os.path.basename(file)[:15]
        # A scene is (re)processed unless BOTH of its outputs are already present.
        if prefix + '_pro.nc' not in processed_names or prefix + '_bins.npy' not in bins_names:
            dirlist.append(file)

    if not dirlist:
        print('No scenes to process.')
        return

    preprocess_fast = PreprocessParallel(files=dirlist,
                                         train_variables=SCENE_VARIABLES[:5],
                                         train_fill_value=OPTIONS['train_fill_value'],
                                         class_fill_values=OPTIONS['class_fill_values'],
                                         pixel_spacing=OPTIONS['pixel_spacing'],
                                         n_classes=OPTIONS['n_classes'])

    # batch_size=None disables automatic batching, so every item is one (scene, bins) pair.
    run_preprocess = DataLoader(preprocess_fast,
                                batch_size=None,
                                num_workers=NUM_WORKERS,
                                shuffle=False,
                                collate_fn=collate_function)

    # Process the scenes
    for scene, bins in tqdm(iterable=run_preprocess, total=len(dirlist), colour='green'):
        # - Check if the scenes' extreme values lie within the desired normalized range
        _report_value_issues(scene)

        # - Save the processed scene and the bins
        scene_id = scene.attrs['scene_id']
        scene.to_netcdf(os.path.join(processed_dir, scene_id))
        # np.save appends the '.npy' extension, giving '<prefix>_bins.npy'.
        np.save(os.path.join(bins_dir, scene_id[:15] + '_bins'), bins, allow_pickle=True)
# Run the preprocessing routine only when this file is executed as a script.
if __name__ == '__main__':
    main()