New YSI Exo sonde loading code finally works, even if it's a bit hackish
nickrsan committed Jul 23, 2019
1 parent 148075c commit eb28869
Showing 6 changed files with 96 additions and 30 deletions.
2 changes: 1 addition & 1 deletion arcproject/__init__.py
@@ -3,4 +3,4 @@

__all__ = ['waterquality', 'scripts']

__version__ = "2019.5.20"
__version__ = "2019.07.23"
2 changes: 1 addition & 1 deletion arcproject/scripts/load_data_bulk/load_2019_data.py
@@ -23,7 +23,7 @@ def load_2019():
print("Loading 2019 Data")
path = data
s = slurp.Slurper(instrument=wqt_timestamp_match.ysi)
- rename_files(s)
+ #rename_files(s)
s.add_new_sites = True
s.dst = True
s.skipext = [".xlsx", ".xls", ".dbf", ".prj", ".shp", ".shx", ".lyr"]
4 changes: 2 additions & 2 deletions arcproject/scripts/tests/test_wqt_unittest.py
@@ -349,8 +349,8 @@ def setUp(self):

def test_coordinate_cleaning(self):
ysi = wqt_timestamp_match.ysi
- ysi.handle(self.test_file)
- ysi.handle(self.test_file_compiled, skip_rows=0)
+ ysi.wq_from_file(self.test_file) # this is the raw file, and it will have a few things different from the compiled file, particularly encoding
+ ysi.wq_from_file(self.test_file_compiled, skip_rows=0) # compiled file, in UTF-8, from Excel


if __name__ == '__main__':
106 changes: 89 additions & 17 deletions arcproject/scripts/wqt_timestamp_match.py
@@ -22,7 +22,7 @@

# define constants
source_field = "WQ_SOURCE"

+ DEBUG = False

class Instrument(object):
"""
@@ -46,6 +46,13 @@ def handle_gps(self, wq, transect_gps, dst_adjustment):
"""
return wq

+ def wq_from_file(self, water_quality_raw_data):
+ """
+ Processes a water quality file into a pandas data frame. Different for each instrument.
+ :param water_quality_raw_data:
+ :return:
+ """

@property
def load_fields(self):
return self.water_quality_header_map.keys() # was originally a list comprehension, but changed criteria and simplified it
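Aside: as committed, the wq_from_file hook added above silently returns None if a subclass forgets to override it. A common alternative idiom (not what this commit does) would be to raise instead, e.g.:

    def wq_from_file(self, water_quality_raw_data):
        # defensive base-class stub: fail loudly rather than return None
        raise NotImplementedError("each instrument subclass must parse its own file format")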
@@ -163,10 +170,11 @@ def __init__(self):

self.new_latitude_field = "Lat_DD"
self.new_longitude_field = "Lon_DD"
+ self.new_datetime_field = "dt_correction"

- self.datetime_format = '%Y-%m-%dt%H:%M:%S' # actual off instrument is '%m/%d/%Yt%H:%M:%S', but gets changed (by pandas?)
+ self.datetime_format = '%m/%d/%Yt%H:%M:%S' # actual off instrument is '%m/%d/%Yt%H:%M:%S', but gets changed (by pandas?)

- def wq_from_file(self, water_quality_raw_data, skip_rows=17):
+ def wq_from_file(self, water_quality_raw_data, skip_rows=0):
"""
The new YSI Sonde has a different format - the file needs a number of changes before it can be loaded:
1. The first 17 rows need to be stripped off - they're not useful to us - check the header first to see if
@@ -196,6 +204,7 @@ def wq_from_file(self, water_quality_raw_data, skip_rows=17):

# wq = convert_file_encoding(wq) # the source data is in UCS-2 LE BOM, which Python sees as null bytes. Let's make it unicode instead
self.wq = water_quality_raw_data
+ self.source_filename = os.path.split(water_quality_raw_data)[1]
self.detect_file_encoding() # figure out what the file encoding is so we can open it correctly

# basic input cleaning
@@ -205,14 +214,12 @@ def wq_from_file(self, water_quality_raw_data, skip_rows=17):
# some corrections of our own.

self.correct_coordinates()
+ #self.dump_schema_ini()

reprojected_features = self.make_spatial_and_calculate_fields()

pandas_version = wqtshp2pd(reprojected_features, date_field="lDate", time_field="lTime", instrument=self)

- # add column with source filename
- addsourcefield(pandas_version, source_field, water_quality_raw_data)

return pandas_version

def detect_file_encoding(self):
@@ -224,8 +231,9 @@ def detect_file_encoding(self):
as that encoding.
:return:
"""
- file_data = open(self.wq, 'r').read() # read in the file data
- self.detected_encoding = chardet.detect(file_data)['encoding']
+ with open(self.wq, 'r') as filehandle: # read in the file data
+ file_data = filehandle.read()
+ self.detected_encoding = chardet.detect(file_data)['encoding']

if self.detected_encoding == "UTF-16":
self.detected_encoding = "utf_16_le" # we'll use this encoding in this case - if it detects UTF-16 off the YSI
@@ -245,32 +253,87 @@ def make_spatial_and_calculate_fields(self):
try:
# we don't strictly need to do this, but it gives us an on-disk representation that we can calculate fields on
# too. Otherwise, if we skip it, reprojection will happen in wqtshp2pd
- reprojected_features = reproject_features(xy_event_layer, projection_spatial_reference)
+ self.reprojected_features = reproject_features(xy_event_layer, projection_spatial_reference)
+ print("spatial data at {}".format(self.reprojected_features))
finally:
arcpy.Delete_management(xy_event_layer) # clean up the layer in memory if something happens
finally:
arcpy.Delete_management("xy_table_view")

- arcpy.AddField_management(reprojected_features, field_name="turbidity", field_type="DOUBLE")
+ arcpy.AddField_management(self.reprojected_features, field_name="turbidity", field_type="DOUBLE")
# change proprietary FNU to NTU based on scaling factor from Chris Jasper
- arcpy.CalculateField_management(reprojected_features, field="turbidity", expression="!FNU! * 1.24",
+ arcpy.CalculateField_management(self.reprojected_features, field="turbidity", expression="!FNU! * 1.24",
expression_type="PYTHON")

self.make_date_time_strings("lDate_original", "lDate", index=0)
self.make_date_time_strings("lTime_original", "lTime", index=1)

return self.reprojected_features

+ def dump_schema_ini(self):
+ """
+ Sometimes when loading the data into ArcGIS, it interprets
+ times as dates and then the whole pipeline blows up. Here we
+ add entries to a schema.ini file in order to force it to
+ read date and time fields as text, not as dates. If this
+ doesn't work, we'll need to load the CSV to pandas and process
+ times there prior to doing any reprojection.
+ :return:
+ """
+ schema_folder, csv_file_name = os.path.split(self.cleaned_csv_file)
+ schema_path = os.path.join(schema_folder, "schema.ini")
+
+ with open(schema_path, 'a') as schema_file:
+ schema_file.write("\n[{}]\n".format(csv_file_name))
+ lines = ["Format=CSVDelimited\n",
+ "ColNameHeader=True\n",
+ "MaxScanRows=0\n",
+ "lTime_original Text\n",
+ "lDate_original Text\n",
+ "lTime Text\n",
+ "lDate Text\n",
+ ]
+ schema_file.writelines(lines)

+ def make_date_time_strings(self, field, new_field, index):
+ """
+ ArcGIS loads the date fields as type Date and also the *time* fields as type Date,
+ which gums up Pandas later on.
+ Force them to strings in new fields by splitting our previously combined datetime field
+ on the letter "t" - this is a hacky workaround because we're short on time right now
+ and can't rework the whole datetime processing pipeline for the new
+ instrument.
+ A different option would be to skip loading into Arc and instead just do all
+ field calculations in Pandas.
+ :param field:
+ :param new_field:
+ :param index: the index to be used when splitting the datetime field back out
+ :return:
+ """
+
+ arcpy.AddField_management(self.reprojected_features, new_field, "TEXT", field_length=20)
+ arcpy.CalculateField_management(self.reprojected_features,
+ field=new_field,
+ expression="!{}!.split('t')[{}]".format(self.new_datetime_field, index),
+ expression_type="PYTHON")
- return reprojected_features
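The calculated expression above is plain Python string splitting. For a combined dt_correction value like "07/23/2019t10:15:00" (a made-up sample matching the instrument's datetime format), index 0 recovers the date and index 1 the time:

    combined = "07/23/2019t10:15:00"
    l_date = combined.split("t")[0]  # "07/23/2019"
    l_time = combined.split("t")[1]  # "10:15:00"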

def fix_csv_header(self, skip_rows):
with open(self.wq, 'r') as wq_data:
wq_data_converted = codecs.EncodedFile(wq_data, data_encoding='utf_8', file_encoding=self.detected_encoding)
wq_rows = wq_data_converted.readlines()
cleaned_data = wq_rows[skip_rows:] # strip off the first skip_rows rows because they mess things up

if "Date" not in cleaned_data[0]:
raise ValueError("Header on file {} is malformed or missing!".format(self.source_filename))

# then make the header characters legal
cleaned_data[0] = cleaned_data[0].replace("°C", "degrees_C")
cleaned_data[0] = cleaned_data[0].replace(" ", "_")
cleaned_data[0] = cleaned_data[0].replace("-", "_")
cleaned_data[0] = cleaned_data[0].replace("/", "_")
cleaned_data[0] = cleaned_data[0].replace("%", "pct")
cleaned_data[0] = cleaned_data[0].replace("Date", "lDate")
cleaned_data[0] = cleaned_data[0].replace("Time", "lTime")
cleaned_data[0] = cleaned_data[0].replace("Date", "lDate_original")
cleaned_data[0] = cleaned_data[0].replace("Time", "lTime_original")

output_temp = tempfile.mkstemp(prefix="arcproject_wq_", suffix=".csv")
self.cleaned_csv_file = output_temp[1]
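One caveat with tempfile.mkstemp: it returns an (os_handle, path) tuple, and the low-level handle at index 0 stays open unless explicitly closed. The usual pattern looks roughly like:

    import os
    import tempfile

    handle, cleaned_csv_path = tempfile.mkstemp(prefix="arcproject_wq_", suffix=".csv")
    os.close(handle)  # release the OS-level descriptor; reopen by path when writing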
@@ -290,9 +353,16 @@ def correct_coordinates(self):
fieldnames = cleaned_rows.fieldnames
fieldnames.append(self.new_latitude_field)
fieldnames.append(self.new_longitude_field)
+ fieldnames.append(self.new_datetime_field)
+ fieldnames.append(source_field)
for record in cleaned_rows:
if record["Lat"] in (None, "") or record["Lon"] in (None, ""): # skip records with no locations
#print("skipping record with lat/long of {}, {}".format(record["Lat"], record["Lon"]))
continue
record[self.new_latitude_field] = dms_to_dd(record["Lat"], force_negative=False) # convert the latitude
record[self.new_longitude_field] = dms_to_dd(record["Lon"], force_negative=True) # convert the longitude, force it to western hemisphere
+ record[self.new_datetime_field] = record["lDate_original"] + "t" + record["lTime_original"]
+ record[source_field] = self.source_filename
output_records.append(record)

with open(self.cleaned_csv_file, 'wb') as cleaned_data_csv: # wb probably won't work under Python 3 and we'd want to just be explicit about line endings instead.
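The dms_to_dd helper is defined elsewhere in this module and isn't shown in the diff. Assuming the sonde writes NMEA-style degrees-plus-decimal-minutes (e.g. 3814.2760 for 38° 14.2760') - an assumption, since the raw coordinate format isn't visible here - a converter of that shape might look like:

    def dms_to_dd(value, force_negative=False):
        # hypothetical sketch - not the module's actual implementation
        value = float(value)
        degrees = int(value / 100)       # digits left of the decimal minutes
        minutes = value - degrees * 100  # decimal-minutes remainder
        decimal_degrees = degrees + minutes / 60.0
        if force_negative:
            decimal_degrees = -abs(decimal_degrees)  # force western hemisphere
        return decimal_degrees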
@@ -507,7 +577,7 @@ def TimestampFromDateTime(date, time, format_string='%Y-%m-%dt%I:%M:%S%p'):
"""
Returns python datetime object
:param date: a date, by default, in format of %Y-%m-%d
- :param time: a time, by default, in format of %I:%M:%S%p
+ :param time: a time, by default, in format of %I:%M:%S%p for hydrolab
:param format_string: the string to use to parse the time and the date. They will be concatenated with a "t"
in the middle
:return: datetime object
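The function body is collapsed in this view; going by the docstring, it concatenates the date and time with a "t" and parses the result, which would reduce to something like this (a sketch under that assumption, not the verified implementation):

    from datetime import datetime

    def TimestampFromDateTime(date, time, format_string='%Y-%m-%dt%I:%M:%S%p'):
        # presumed behavior per the docstring: join with "t", then parse
        return datetime.strptime("{}t{}".format(date, time), format_string)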
@@ -576,7 +646,8 @@ def wqtshp2pd(feature_class, date_field="GPS_Date", time_field="GPS_Time", instr
addsourcefield(df, "GPS_SOURCE", feature_class)

# cast Date field to str instead of timestamp
- df[date_field] = df[date_field].dt.date.astype(str) # ArcGIS adds some artificial times
+ if df[date_field].dtype is pd.Timestamp: # only happens with Hydrolab data though, so only cast it to str if it's a timestamp now
+ df[date_field] = df[date_field].dt.date.astype(str) # ArcGIS adds some artificial times

# combine GPS date and GPS time fields into a single column
df['Date_Time'] = df.apply(lambda row: TimestampFromDateTime(row[date_field], row[time_field], format_string=instrument.datetime_format), axis=1)
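A note on the new dtype guard above: a Series dtype is never literally pd.Timestamp, so "df[date_field].dtype is pd.Timestamp" will not fire on a datetime64 column. If the intent is "cast only when the column is datetime-like", the idiomatic pandas check would be something like the following (date_column_to_str is a hypothetical helper for illustration):

    import pandas as pd

    def date_column_to_str(df, date_field):
        # cast only when the column is actually datetime-like
        if pd.api.types.is_datetime64_any_dtype(df[date_field]):
            df[date_field] = df[date_field].dt.date.astype(str)
        return df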
@@ -659,14 +730,15 @@ def JoinMatchPercent(original, joined):
return percent_match


- def wq_append_fromlist(list_of_wq_files, raise_exc=True, instrument=hydrolab):
+ def wq_append_fromlist(list_of_wq_files, raise_exc=DEBUG, instrument=hydrolab):
"""
Takes a list of water quality files and appends them to a single dataframe
:param list_of_wq_files: list of raw water quality file paths
:return: single dataframe with all the inputs
"""
master_wq_df = pd.DataFrame()
for wq in list_of_wq_files:
print("Processing {}".format(wq))
try:
pwq = instrument.wq_from_file(wq)
# append to master wq
6 changes: 3 additions & 3 deletions setup.py
@@ -1,11 +1,11 @@
from __future__ import print_function

__author__ = "ambell, nickrsan"
__author__ = "nickrsan, ambell"

try:
from arcproject import __version__ as version
except ImportError:
- version = '2019.5.20'
+ version = '2019.07.23'

from setuptools import setup

@@ -23,6 +23,6 @@
## "pandas >= 0.16.1", "matplotlib", and "numpy >= 1.9.2" also needed, but cause issues on ArcGIS 10.4 install where it tries to upgrade numpy
author=__author__,
author_email="[email protected]",
- url='https://github.com/ucd-cws/amaptor',
+ url='https://github.com/ucd-cws/arcproject-wq-processing',
include_package_data=True,
)
6 changes: 0 additions & 6 deletions user_setup.py
@@ -7,12 +7,6 @@
r_dependencies = ["RSQLite", "plyr", "gplots", "devtools", "ggplot2"]
r_github_dependencies = ["ucd-cws/wq-heatplot"]

- devtools::install_github("r-lib/remotes", ref = "e56a41e1d0cad55cbe7d60b274b99ab7b7a76b5c")
- try:
- import winreg
- except ImportError:
- import _winreg as winreg


def set_up_r_dependencies():
import launchR # imported here because it will be installed before this is called, but won't be installed at load time in all cases
