New YSI Exo sonde loading code finally works, even if it's a bit hackish
nickrsan committed Jul 23, 2019
1 parent 148075c commit eb28869
Showing 6 changed files with 96 additions and 30 deletions.
2 changes: 1 addition & 1 deletion arcproject/__init__.py
@@ -3,4 +3,4 @@

__all__ = ['waterquality', 'scripts']

__version__ = "2019.5.20"
__version__ = "2019.07.23"
2 changes: 1 addition & 1 deletion arcproject/scripts/load_data_bulk/load_2019_data.py
@@ -23,7 +23,7 @@ def load_2019():
print("Loading 2019 Data")
path = data
s = slurp.Slurper(instrument=wqt_timestamp_match.ysi)
- rename_files(s)
+ #rename_files(s)
s.add_new_sites = True
s.dst = True
s.skipext = [".xlsx", ".xls", ".dbf", ".prj", ".shp", ".shx", ".lyr"]
4 changes: 2 additions & 2 deletions arcproject/scripts/tests/test_wqt_unittest.py
@@ -349,8 +349,8 @@ def setUp(self):

def test_coordinate_cleaning(self):
ysi = wqt_timestamp_match.ysi
- ysi.handle(self.test_file)
- ysi.handle(self.test_file_compiled, skip_rows=0)
+ ysi.wq_from_file(self.test_file) # this is the raw file, and it will have a few things different from the compiled file, particularly encoding
+ ysi.wq_from_file(self.test_file_compiled, skip_rows=0) # compiled file, in UTF-8, from Excel


if __name__ == '__main__':
106 changes: 89 additions & 17 deletions arcproject/scripts/wqt_timestamp_match.py
@@ -22,7 +22,7 @@

# define constants
source_field = "WQ_SOURCE"

+ DEBUG = False

class Instrument(object):
"""
@@ -46,6 +46,13 @@ def handle_gps(self, wq, transect_gps, dst_adjustment):
"""
return wq

+ def wq_from_file(self, water_quality_raw_data):
+ """
+ Processes a water quality file into a pandas data frame. Different for each instrument.
+ :param water_quality_raw_data:
+ :return:
+ """

@property
def load_fields(self):
return self.water_quality_header_map.keys() # was originally a list comprehension, but changed criteria and simplified it
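Aside: as committed, the wq_from_file hook added above silently returns None if a subclass forgets to override it. A common alternative idiom (not what this commit does) would be to raise instead, e.g.:

    def wq_from_file(self, water_quality_raw_data):
        # defensive base-class stub: fail loudly rather than return None
        raise NotImplementedError("each instrument subclass must parse its own file format")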
@@ -163,10 +170,11 @@ def __init__(self):

self.new_latitude_field = "Lat_DD"
self.new_longitude_field = "Lon_DD"
+ self.new_datetime_field = "dt_correction"

- self.datetime_format = '%Y-%m-%dt%H:%M:%S' # actual off instrument is '%m/%d/%Yt%H:%M:%S', but gets changed (by pandas?)
+ self.datetime_format = '%m/%d/%Yt%H:%M:%S' # actual off instrument is '%m/%d/%Yt%H:%M:%S', but gets changed (by pandas?)

- def wq_from_file(self, water_quality_raw_data, skip_rows=17):
+ def wq_from_file(self, water_quality_raw_data, skip_rows=0):
"""
The new YSI Sonde has a different format - the file needs a number of changes before it can be loaded:
1. The first 17 rows need to be stripped off - they're not useful to us - check the header first to see if
@@ -196,6 +204,7 @@ def wq_from_file(self, water_quality_raw_data, skip_rows=17):

# wq = convert_file_encoding(wq) # the source data is in UCS-2 LE BOM, which Python sees as null bytes. Let's make it unicode instead
self.wq = water_quality_raw_data
+ self.source_filename = os.path.split(water_quality_raw_data)[1]
self.detect_file_encoding() # figure out what the file encoding is so we can open it correctly

# basic input cleaning
@@ -205,14 +214,12 @@ def wq_from_file(self, water_quality_raw_data, skip_rows=17):
# some corrections of our own.

self.correct_coordinates()
+ #self.dump_schema_ini()

reprojected_features = self.make_spatial_and_calculate_fields()

pandas_version = wqtshp2pd(reprojected_features, date_field="lDate", time_field="lTime", instrument=self)

- # add column with source filename
- addsourcefield(pandas_version, source_field, water_quality_raw_data)

return pandas_version

def detect_file_encoding(self):
@@ -224,8 +231,9 @@ def detect_file_encoding(self):
as that encoding.
:return:
"""
- file_data = open(self.wq, 'r').read() # read in the file data
- self.detected_encoding = chardet.detect(file_data)['encoding']
+ with open(self.wq, 'r') as filehandle: # read in the file data
+ file_data = filehandle.read()
+ self.detected_encoding = chardet.detect(file_data)['encoding']

if self.detected_encoding == "UTF-16":
self.detected_encoding = "utf_16_le" # we'll use this encoding in this case - if it detects UTF-16 off the YSI
@@ -245,32 +253,87 @@ def make_spatial_and_calculate_fields(self):
try:
# we don't strictly need to do this, but it gives us an on-disk representation that we can calculate fields on
# too. Otherwise, if we skip it, reprojection will happen in wqtshp2pd
- reprojected_features = reproject_features(xy_event_layer, projection_spatial_reference)
+ self.reprojected_features = reproject_features(xy_event_layer, projection_spatial_reference)
+ print("spatial data at {}".format(self.reprojected_features))
finally:
arcpy.Delete_management(xy_event_layer) # clean up the layer in memory if something happens
finally:
arcpy.Delete_management("xy_table_view")

- arcpy.AddField_management(reprojected_features, field_name="turbidity", field_type="DOUBLE")
+ arcpy.AddField_management(self.reprojected_features, field_name="turbidity", field_type="DOUBLE")
# change proprietary FNU to NTU based on scaling factor from Chris Jasper
- arcpy.CalculateField_management(reprojected_features, field="turbidity", expression="!FNU! * 1.24",
+ arcpy.CalculateField_management(self.reprojected_features, field="turbidity", expression="!FNU! * 1.24",
expression_type="PYTHON")

self.make_date_time_strings("lDate_original", "lDate", index=0)
self.make_date_time_strings("lTime_original", "lTime", index=1)

return self.reprojected_features

+ def dump_schema_ini(self):
+ """
+ Sometimes when loading the data into ArcGIS, it interprets
+ times as dates and then the whole pipeline blows up. Here we
+ add entries to a schema.ini file in order to force it to
+ read date and time fields as text, not as dates. If this
+ doesn't work, we'll need to load the CSV to pandas and process
+ times there prior to doing any reprojection.
+ :return:
+ """
+ schema_folder, csv_file_name = os.path.split(self.cleaned_csv_file)
+ schema_path = os.path.join(schema_folder, "schema.ini")
+
+ with open(schema_path, 'a') as schema_file:
+ schema_file.write("\n[{}]\n".format(csv_file_name))
+ lines = ["Format=CSVDelimited\n",
+ "ColNameHeader=True\n",
+ "MaxScanRows=0\n",
+ "lTime_original Text\n",
+ "lDate_original Text\n",
+ "lTime Text\n",
+ "lDate Text\n",
+ ]
+ schema_file.writelines(lines)

+ def make_date_time_strings(self, field, new_field, index):
+ """
+ ArcGIS loads the date fields as type Date and also the *time* fields as type Date,
+ which gums up Pandas later on.
+ Force them to strings in new fields by splitting our previously combined datetime field
+ on the letter "t" - this is a hacky workaround because we're short on time right now
+ and can't rework the whole datetime processing pipeline for the new
+ instrument.
+ A different option would be to skip loading into Arc and instead just do all
+ field calculations in Pandas.
+ :param field:
+ :param new_field:
+ :param index: the index to be used when splitting the datetime field back out
+ :return:
+ """
+
+ arcpy.AddField_management(self.reprojected_features, new_field, "TEXT", field_length=20)
+ arcpy.CalculateField_management(self.reprojected_features,
+ field=new_field,
+ expression="!{}!.split('t')[{}]".format(self.new_datetime_field, index),
+ expression_type="PYTHON")
- return reprojected_features
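The calculated expression above is plain Python string splitting. For a combined dt_correction value like "07/23/2019t10:15:00" (a made-up sample matching the instrument's datetime format), index 0 recovers the date and index 1 the time:

    combined = "07/23/2019t10:15:00"
    l_date = combined.split("t")[0]  # "07/23/2019"
    l_time = combined.split("t")[1]  # "10:15:00"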

def fix_csv_header(self, skip_rows):
with open(self.wq, 'r') as wq_data:
wq_data_converted = codecs.EncodedFile(wq_data, data_encoding='utf_8', file_encoding=self.detected_encoding)
wq_rows = wq_data_converted.readlines()
cleaned_data = wq_rows[skip_rows:] # strip off the first skip_rows rows because they mess things up

if "Date" not in cleaned_data[0]:
raise ValueError("Header on file {} is malformed or missing!".format(self.source_filename))

# then make the header characters legal
cleaned_data[0] = cleaned_data[0].replace("°C", "degrees_C")
cleaned_data[0] = cleaned_data[0].replace(" ", "_")
cleaned_data[0] = cleaned_data[0].replace("-", "_")
cleaned_data[0] = cleaned_data[0].replace("/", "_")
cleaned_data[0] = cleaned_data[0].replace("%", "pct")
cleaned_data[0] = cleaned_data[0].replace("Date", "lDate")
cleaned_data[0] = cleaned_data[0].replace("Time", "lTime")
cleaned_data[0] = cleaned_data[0].replace("Date", "lDate_original")
cleaned_data[0] = cleaned_data[0].replace("Time", "lTime_original")

output_temp = tempfile.mkstemp(prefix="arcproject_wq_", suffix=".csv")
self.cleaned_csv_file = output_temp[1]
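One caveat with tempfile.mkstemp: it returns an (os_handle, path) tuple, and the low-level handle at index 0 stays open unless explicitly closed. The usual pattern looks roughly like:

    import os
    import tempfile

    handle, cleaned_csv_path = tempfile.mkstemp(prefix="arcproject_wq_", suffix=".csv")
    os.close(handle)  # release the OS-level descriptor; reopen by path when writing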
@@ -290,9 +353,16 @@ def correct_coordinates(self):
fieldnames = cleaned_rows.fieldnames
fieldnames.append(self.new_latitude_field)
fieldnames.append(self.new_longitude_field)
+ fieldnames.append(self.new_datetime_field)
+ fieldnames.append(source_field)
for record in cleaned_rows:
if record["Lat"] in (None, "") or record["Lon"] in (None, ""): # skip records with no locations
#print("skipping record with lat/long of {}, {}".format(record["Lat"], record["Lon"]))
continue
record[self.new_latitude_field] = dms_to_dd(record["Lat"], force_negative=False) # convert the latitude
record[self.new_longitude_field] = dms_to_dd(record["Lon"], force_negative=True) # convert the longitude, force it to western hemisphere
+ record[self.new_datetime_field] = record["lDate_original"] + "t" + record["lTime_original"]
+ record[source_field] = self.source_filename
output_records.append(record)

with open(self.cleaned_csv_file, 'wb') as cleaned_data_csv: # wb probably won't work under Python 3 and we'd want to just be explicit about line endings instead.
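The dms_to_dd helper is defined elsewhere in this module and isn't shown in the diff. Assuming the sonde writes NMEA-style degrees-plus-decimal-minutes (e.g. 3814.2760 for 38° 14.2760') - an assumption, since the raw coordinate format isn't visible here - a converter of that shape might look like:

    def dms_to_dd(value, force_negative=False):
        # hypothetical sketch - not the module's actual implementation
        value = float(value)
        degrees = int(value / 100)       # digits left of the decimal minutes
        minutes = value - degrees * 100  # decimal-minutes remainder
        decimal_degrees = degrees + minutes / 60.0
        if force_negative:
            decimal_degrees = -abs(decimal_degrees)  # force western hemisphere
        return decimal_degrees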
@@ -507,7 +577,7 @@ def TimestampFromDateTime(date, time, format_string='%Y-%m-%dt%I:%M:%S%p'):
"""
Returns python datetime object
:param date: a date, by default, in format of %Y-%m-%d
- :param time: a time, by default, in format of %I:%M:%S%p
+ :param time: a time, by default, in format of %I:%M:%S%p for hydrolab
:param format_string: the string to use to parse the time and the date. They will be concatenated with a "t"
in the middle
:return: datetime object
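The function body is collapsed in this view; going by the docstring, it concatenates the date and time with a "t" and parses the result, which would reduce to something like this (a sketch under that assumption, not the verified implementation):

    from datetime import datetime

    def TimestampFromDateTime(date, time, format_string='%Y-%m-%dt%I:%M:%S%p'):
        # presumed behavior per the docstring: join with "t", then parse
        return datetime.strptime("{}t{}".format(date, time), format_string)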
@@ -576,7 +646,8 @@ def wqtshp2pd(feature_class, date_field="GPS_Date", time_field="GPS_Time", instr
addsourcefield(df, "GPS_SOURCE", feature_class)

# cast Date field to str instead of timestamp
- df[date_field] = df[date_field].dt.date.astype(str) # ArcGIS adds some artificial times
+ if df[date_field].dtype is pd.Timestamp: # only happens with Hydrolab data though, so only cast it to str if it's a timestamp now
+ df[date_field] = df[date_field].dt.date.astype(str) # ArcGIS adds some artificial times

# combine GPS date and GPS time fields into a single column
df['Date_Time'] = df.apply(lambda row: TimestampFromDateTime(row[date_field], row[time_field], format_string=instrument.datetime_format), axis=1)
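A note on the new dtype guard above: a Series dtype is never literally pd.Timestamp, so "df[date_field].dtype is pd.Timestamp" will not fire on a datetime64 column. If the intent is "cast only when the column is datetime-like", the idiomatic pandas check would be something like the following (date_column_to_str is a hypothetical helper for illustration):

    import pandas as pd

    def date_column_to_str(df, date_field):
        # cast only when the column is actually datetime-like
        if pd.api.types.is_datetime64_any_dtype(df[date_field]):
            df[date_field] = df[date_field].dt.date.astype(str)
        return df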
@@ -659,14 +730,15 @@ def JoinMatchPercent(original, joined):
return percent_match


- def wq_append_fromlist(list_of_wq_files, raise_exc=True, instrument=hydrolab):
+ def wq_append_fromlist(list_of_wq_files, raise_exc=DEBUG, instrument=hydrolab):
"""
Takes a list of water quality files and appends them to a single dataframe
:param list_of_wq_files: list of raw water quality file paths
:return: single dataframe with all the inputs
"""
master_wq_df = pd.DataFrame()
for wq in list_of_wq_files:
print("Processing {}".format(wq))
try:
pwq = instrument.wq_from_file(wq)
# append to master wq
6 changes: 3 additions & 3 deletions setup.py
@@ -1,11 +1,11 @@
from __future__ import print_function

__author__ = "ambell, nickrsan"
__author__ = "nickrsan, ambell"

try:
from arcproject import __version__ as version
except ImportError:
- version = '2019.5.20'
+ version = '2019.07.23'

from setuptools import setup

@@ -23,6 +23,6 @@
## "pandas >= 0.16.1", "matplotlib", and "numpy >= 1.9.2" also needed, but cause issues on ArcGIS 10.4 install where it tries to upgrade numpy
author=__author__,
author_email="[email protected]",
- url='https://github.com/ucd-cws/amaptor',
+ url='https://github.com/ucd-cws/arcproject-wq-processing',
include_package_data=True,
)
6 changes: 0 additions & 6 deletions user_setup.py
@@ -7,12 +7,6 @@
r_dependencies = ["RSQLite", "plyr", "gplots", "devtools", "ggplot2"]
r_github_dependencies = ["ucd-cws/wq-heatplot"]

- devtools::install_github("r-lib/remotes", ref = "e56a41e1d0cad55cbe7d60b274b99ab7b7a76b5c")
- try:
- import winreg
- except ImportError:
- import _winreg as winreg


def set_up_r_dependencies():
import launchR # imported here because it will be installed before this is called, but won't be installed at load time in all cases
