Showing 59 changed files with 18,585 additions and 11,507 deletions.
@@ -0,0 +1,176 @@
########################################################
# Sample enterprises blueprint of endpoints
# Remove this file if you are not using it in your project
########################################################

from flask import Blueprint, jsonify
from backend.db_connection import db

enterprises = Blueprint('enterprises', __name__)


# Get all of the enterprise tags from the database
@enterprises.route('/tags', methods=['GET'])
def get_tags():
    # get a cursor object from the database
    cursor = db.get_db().cursor()

    cursor.execute('''
        SELECT description
        FROM EmissionTags
        WHERE EmissionTags.id IN (
            SELECT tag_id
            FROM EntTags
            WHERE EntTags.enterprise_id = 1
        );
    ''')

    # grab the column headers from the returned data
    column_headers = [x[0] for x in cursor.description]

    # create an empty list to pair column headers with row data
    json_data = []

    # fetch all the data from the cursor
    theData = cursor.fetchall()

    # for each of the rows, zip the data elements together with
    # the column headers
    for row in theData:
        json_data.append(dict(zip(column_headers, row)))

    return jsonify(json_data)


# Get all of the matching NGOs based on shared emission tags
@enterprises.route('/NGOMatch', methods=['GET'])
def get_matches():
    # get a cursor object from the database
    cursor = db.get_db().cursor()

    cursor.execute('''
        SELECT NGO.name, EmissionTags.description
        FROM NGO
        JOIN NGOTags ON NGO.id = NGOTags.ngo_id
        JOIN EmissionTags ON NGOTags.tag_id = EmissionTags.id
        WHERE EmissionTags.id IN (
            SELECT tag_id
            FROM EntTags
            WHERE EntTags.enterprise_id = 1
        );
    ''')

    # grab the column headers from the returned data
    column_headers = [x[0] for x in cursor.description]

    # create an empty list to pair column headers with row data
    json_data = []

    # fetch all the data from the cursor
    theData = cursor.fetchall()

    # for each of the rows, zip the data elements together with
    # the column headers
    for row in theData:
        json_data.append(dict(zip(column_headers, row)))

    return jsonify(json_data)


# Get this enterprise's emissions alongside the average emissions
# of companies in the same country
@enterprises.route('/EntCompare', methods=['GET'])
def get_comparison():
    # get a cursor object from the database
    cursor = db.get_db().cursor()

    # the subqueries pin both the comparison country and the
    # 'Your Emissions' column to enterprise 1
    cursor.execute('''
        SELECT AVG(Enterprises.emission_result) AS 'Average Emission (by Country)',
               Country.name AS 'Country',
               (SELECT e2.emission_result
                FROM Enterprises e2
                WHERE e2.id = 1) AS 'Your Emissions'
        FROM Enterprises
        JOIN Country ON Enterprises.country_id = Country.id
        WHERE Country.name =
            (SELECT Country.name
             FROM Enterprises
             JOIN Country ON Enterprises.country_id = Country.id
             WHERE Enterprises.id = 1
             LIMIT 1)
        GROUP BY Country.name;
    ''')

    # grab the column headers from the returned data
    column_headers = [x[0] for x in cursor.description]

    # create an empty list to pair column headers with row data
    json_data = []

    # fetch all the data from the cursor
    theData = cursor.fetchall()

    # for each of the rows, zip the data elements together with
    # the column headers
    for row in theData:
        json_data.append(dict(zip(column_headers, row)))

    return jsonify(json_data)


# Get all the supply chain history for this enterprise
@enterprises.route('/EntSupplyChain', methods=['GET'])
def get_supplychain():
    cursor = db.get_db().cursor()

    cursor.execute('SELECT * FROM SupplyChain WHERE SupplyChain.enterprise_id = 1')

    column_headers = [x[0] for x in cursor.description]

    json_data = []

    theData = cursor.fetchall()

    for row in theData:
        json_data.append(dict(zip(column_headers, row)))

    return jsonify(json_data)


# Get all the operating cost history for this enterprise
@enterprises.route('/EntCosts', methods=['GET'])
def get_costs():
    cursor = db.get_db().cursor()

    cursor.execute('SELECT * FROM operatingEmission WHERE operatingEmission.enterprise_id = 1')

    column_headers = [x[0] for x in cursor.description]

    json_data = []

    theData = cursor.fetchall()

    for row in theData:
        json_data.append(dict(zip(column_headers, row)))

    return jsonify(json_data)


# Get all the flights history for this enterprise
@enterprises.route('/EntFlights', methods=['GET'])
def get_flights():
    cursor = db.get_db().cursor()

    cursor.execute('SELECT * FROM Flight WHERE Flight.enterprise_id = 1')

    column_headers = [x[0] for x in cursor.description]

    json_data = []

    theData = cursor.fetchall()

    for row in theData:
        json_data.append(dict(zip(column_headers, row)))

    return jsonify(json_data)
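
Every route above repeats the same cursor-to-JSON steps. A minimal refactoring sketch, not part of the commit: the helper name rows_to_json is hypothetical, and each route body would shrink to an execute() call plus return jsonify(rows_to_json(cursor)).

# hypothetical helper, not in the commit: folds the repeated
# fetch-headers-and-zip pattern used by every route into one call
def rows_to_json(cursor):
    # column names come from the cursor's result metadata
    column_headers = [x[0] for x in cursor.description]
    # zip each fetched row with the headers to build a list of dicts
    return [dict(zip(column_headers, row)) for row in cursor.fetchall()]
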
File renamed without changes.
@@ -0,0 +1,212 @@
""" | ||
The Train, Test, and Predict functions for the CO2 Emission Linear Regression | ||
ML Model | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import pandasdmx as sdmx | ||
from sklearn.metrics import r2_score | ||
from functools import reduce | ||
import pandasdmx as sdmx | ||
|
||


def train() -> np.array:
    """
    Calculates the slopes for the CO2 emissions regression model.
    :returns: An array with the slopes in shape (3,)
    """
    # total greenhouse gas emissions per country (thousand tonnes)
    estat = sdmx.Request("ESTAT")
    resp = estat.data(
        "ENV_AIR_GGE",
        key={
            "unit": "THS_T",
            "freq": "A",
            "src_crf": "TOTX4_MEMONIA",
            "airpol": "GHG"
        }
    )
    emission_df = (resp
                   .to_pandas(datetime={'dim': 'TIME_PERIOD'})
                   .droplevel(level=['unit', 'freq', 'src_crf', 'airpol'], axis=1))
    melted_emissions_df = melt_smdx_dataframe(emission_df)

    # household energy consumption per country (TJ)
    resp = estat.data(
        "NRG_D_HHQ",
        key={
            "siec": "TOTAL",
            "unit": "TJ",
            "nrg_bal": "FC_OTH_HH_E",
            "freq": "A",
        }
    )
    household_energy_df = (resp
                           .to_pandas(datetime={'dim': 'TIME_PERIOD', 'freq': 'freq'})
                           .droplevel(level=["siec", "unit", "nrg_bal"], axis=1))
    melted_household_energy_df = melt_smdx_dataframe(household_energy_df)

    # road-transport motor gasoline consumption per country (ktoe)
    resp = estat.data(
        "TEN00127",
        key={
            "unit": "KTOE",
            "freq": "A",
            "siec": "O4652XR5210B",
            "nrg_bal": "FC_TRA_ROAD_E"
        }
    )
    gas_df = (resp
              .to_pandas(datetime={'dim': 'TIME_PERIOD'})
              .droplevel(level=['unit', 'freq', 'siec', "nrg_bal"], axis=1))
    melted_gas_df = melt_smdx_dataframe(gas_df)

    merged_df = merge_dataframes([melted_emissions_df,
                                  melted_household_energy_df,
                                  melted_gas_df])
    merged_df.columns = ["year", "geo", "emissions", "energy", "gas"]
    # drop the EU-wide aggregate rows so only individual countries remain
    merged_df = merged_df.drop(merged_df[(merged_df.geo == "EU27_2020") |
                                         (merged_df.geo == "EU20")].index)
    merged_df = merged_df.drop("year", axis=1)
    standard_df = standardize(merged_df)

    df_dummies = pd.get_dummies(standard_df, dtype=int, columns=["geo"])
    df_dummies = df_dummies.fillna(0)

    # X = np.pad(df_dummies.iloc[:, 1:].to_numpy(dtype=np.float64),
    #            ((0, 0), (1, 0)), mode="constant", constant_values=1)
    X = np.pad(standard_df.iloc[:, 1:3].to_numpy(dtype=np.float64),
               ((0, 0), (1, 0)), mode="constant", constant_values=1)
    y = np.array(df_dummies["emissions"], dtype=np.float64)

    m = np.matmul(np.linalg.inv(np.matmul(X.T, X)), np.matmul(X.T, y))

    return m
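
The last line of train() is a closed-form ordinary least squares fit: with X padded by a leading column of ones for the intercept, it solves the normal equations

    \hat{m} = (X^{\top} X)^{-1} X^{\top} y

which gives the exact least-squares slopes whenever X^T X is invertible. (np.linalg.lstsq would be the numerically more robust alternative; it is mentioned here only as an aside, not used in the commit.)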


def test(X: np.array, y: np.array) -> any:
    """
    Tests the CO2 emissions regression model.
    :param X: The padded X features
    :param y: The y features
    :returns: The R2 value of the model w/ LOO-CV
    """
    # drop row i from an array; fit OLS slopes via the normal equations
    np_remove = lambda a, i: np.concatenate([a[:i, ], a[i + 1:, ]])
    lin_reg = lambda X, Y: np.matmul(np.linalg.inv(np.matmul(X.T, X)),
                                     np.matmul(X.T, Y))

    y_pred = []
    for i in range(len(X)):
        holdout_X = X[i]

        # refit the model with observation i held out
        loo_X = np_remove(X, i)
        loo_y = np_remove(y, i)
        loo_b = lin_reg(loo_X, loo_y)

        # predict the held-out observation
        y_hat = np.matmul(holdout_X, loo_b)
        y_pred.append(y_hat)

    r2 = r2_score(y, y_pred)

    return r2
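
A self-contained sanity check for test(); the toy arrays below are made up for illustration, not drawn from the source data:

# toy data, illustration only: y is an exact linear function of the index,
# so every leave-one-out prediction is exact and the R2 score is 1.0
X_toy = np.pad(np.arange(10, dtype=np.float64).reshape(-1, 1),
               ((0, 0), (1, 0)), mode="constant", constant_values=1)
y_toy = 2.0 + 3.0 * np.arange(10, dtype=np.float64)
print(test(X_toy, y_toy))  # ~1.0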


def predict(feats: list[float], beta: list[float]) -> float:
    """
    Predicts the greenhouse gas emissions for an individual user in ktonnes.
    :param feats: The unpadded input features from the user:
                  - Motor gasoline in ktoe
                  - Household energy in TJ
    :param beta: The slopes (and intercept) for the trained model of shape (3,)
    :returns: The predicted greenhouse gas emission in CO2 equivalents,
              measured in ktonnes
    """
    # prepend a 1 so the intercept term in beta is applied
    x = np.concatenate([[1], np.array(feats, dtype=np.float64)])
    beta = np.array(beta, dtype=np.float64)
    y_hat = np.matmul(x, beta)

    return y_hat
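
A minimal end-to-end usage sketch. The feature values are illustrative placeholders, train() needs live access to the Eurostat SDMX API, and note that train() fits on standardized features, so the raw values here only illustrate the call shape:

# fit the slopes, then predict for a hypothetical user:
# 120.0 ktoe of motor gasoline, 3500.0 TJ of household energy
beta = train()
y_hat = predict([120.0, 3500.0], beta)
print(f"Predicted emissions: {y_hat:.1f} ktonnes CO2e")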


def melt_smdx_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Given an ESTAT SDMX dataframe, convert the datetimes to years and melt.
    :param df: The raw SDMX parsed dataframe from ESTAT
    :returns: A melted dataframe with the columns of:
              `year` - the year of the observation
              `geo` - the country of the observation
              `value` - the value of the observation
    """
    df = df.reset_index()
    df["year"] = df["TIME_PERIOD"].dt.year
    df = df.drop("TIME_PERIOD", axis=1)
    return pd.melt(df, id_vars="year")


def merge_dataframes(dataframes: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Inner-joins a list of melted dataframes on their shared `year` and `geo`
    columns, leaving one value column (named by index) per input dataframe.
    :param dataframes: The melted dataframes to merge
    :returns: A single dataframe keyed on `year` and `geo`
    """
    for i, df in enumerate(dataframes):
        df.columns = ["geo", "year", i]

    merged_df = reduce(lambda l, r: pd.merge(l, r, left_on=["year", "geo"],
                                             right_on=["year", "geo"]),
                       dataframes)
    return merged_df


def fill_holes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Imputes missing (NaN or zero) values in each value column by fitting a
    per-country linear regression over the row index and predicting the gaps.
    :param df: A dataframe with a `geo` column followed by value columns
    :returns: A dataframe of the same shape with the gaps filled in
    """
    lin_reg = lambda X, Y: np.matmul(np.linalg.inv(np.matmul(X.T, X)),
                                     np.matmul(X.T, Y))

    dfs = []

    for name, group in df.groupby('geo'):
        cols = [[name for _ in range(len(group.index))]]
        for i in range(1, len(group.columns)):
            d = group.iloc[:, i:i+1].to_numpy()

            # treat NaNs and zeros as holes to be imputed
            missing_mask = np.isnan(d) | (d == 0)
            present_mask = ~missing_mask

            missing_mask = missing_mask.reshape(1, -1)[0]
            present_mask = present_mask.reshape(1, -1)[0]

            # nothing missing: keep the column as-is
            if not np.any(missing_mask):
                d = d.reshape(1, -1)[0]
                cols.append(d)
                continue

            # nothing present: there is no data to regress on
            if not np.any(present_mask):
                d = d.reshape(1, -1)[0]
                cols.append(d)
                continue

            # fit a line through the observed points (index -> value)
            x_present = np.pad(np.arange(len(d))[present_mask].reshape(-1, 1),
                               ((0, 0), (1, 0)), mode="constant", constant_values=1)
            y_present = d[present_mask]

            w = lin_reg(x_present, y_present)

            # predict the missing points from their positions on that line
            x_missing = np.pad(np.arange(len(d))[missing_mask].reshape(-1, 1),
                               ((0, 0), (1, 0)), mode="constant", constant_values=1)
            y_missing_pred = np.matmul(x_missing, w)

            d[missing_mask] = y_missing_pred
            d = d.reshape(1, -1)[0]

            cols.append(d)

        dfs.append(pd.DataFrame(cols).T)

    df_unswissed = pd.concat(dfs, axis=0)
    df_unswissed.columns = df.columns
    return df_unswissed
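
A self-contained check of fill_holes(); the toy frame is made up for illustration:

# toy frame, illustration only: one country with a zero "hole" in an
# otherwise linear series; the regression imputes the gap to ~3.0
toy = pd.DataFrame({"geo": ["AA"] * 5,
                    "vals": [1.0, 2.0, 0.0, 4.0, 5.0]})
print(fill_holes(toy))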


def standardize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Z-score standardizes every column except `geo`.
    :param df: The dataframe to standardize
    :returns: A new dataframe with standardized value columns and `geo` intact
    """
    df_standard = pd.DataFrame()
    for feat in df.columns:
        if feat == "geo":
            continue
        df_standard[f'{feat}'] = (df[feat] - df[feat].mean()) / df[feat].std()
    df_standard["geo"] = df["geo"]

    return df_standard