
Fix #1048 - The accuracy module testing, fixes and readme #1057

Closed
wants to merge 4 commits
103 changes: 61 additions & 42 deletions pdr_backend/accuracy/app.py
@@ -1,12 +1,12 @@
import logging
import threading
import json
from datetime import datetime, timedelta
from typing import Any, Callable, Dict, List, Optional, Tuple

from enforce_typing import enforce_types
from flask import Flask, jsonify

from pdr_backend.lake.etl import ETL
from pdr_backend.lake.gql_data_factory import GQLDataFactory
from pdr_backend.lake.persistent_data_store import PersistentDataStore
from pdr_backend.lake.slot import Slot
@@ -23,6 +23,14 @@
SECONDS_IN_A_DAY = 86400
logger = logging.getLogger("accuracy_app")

# Take the ppss file from the cli run command

accuracy_ppss = PPSS(
yaml_filename="./ppss.yaml",
network="sapphire-mainnet",
nested_override_args=None,
)


@enforce_types
def calculate_prediction_result(
@@ -248,18 +256,57 @@ def calculate_timeframe_timestamps(


@enforce_types
def calculate_statistics_from_DuckDB_tables():
four_weeks_ago = datetime.utcnow() - timedelta(weeks=4)
start_ts = UnixTimeS(int(four_weeks_ago.timestamp()))

db_conn = PersistentDataStore(accuracy_ppss.lake_ss.lake_dir)
slots_table_name = Slot.get_lake_table_name()

query = f"SELECT * FROM {slots_table_name} WHERE slot > {start_ts}"
logger.debug("slots query: %s", query)
slots_table = db_conn.query_data(query)
db_conn.duckdb_conn.close()

slots_table = slots_table.group_by("ID").first()
slots_table = slots_table.sort("slot")

all_slots: List[Slot] = []

# Iterate over rows and create objects
for row in slots_table.rows(named=True):
slot = Slot(
row["ID"],
row["timestamp"],
row["slot"],
row["truevalue"],
row["roundSumStakesUp"],
row["roundSumStakes"],
)
all_slots.append(slot)

data = transform_slots_to_statistics(all_slots)
json_data = json.dumps(data)
# write the statistics to the JSON output file
with open(JSON_FILE_PATH, "w") as f:
f.write(json_data)


@enforce_types
def fetch_statistics_using_ETL():
gql_data_factory = GQLDataFactory(accuracy_ppss)
while True:
gql_data_factory.get_gql_tables()
calculate_statistics_from_DuckDB_tables()
threading.Event().wait(300) # Wait for 5 minutes (300 seconds)


@@ -339,7 +386,7 @@ def transform_slots_to_statistics(all_slots: List[Slot]):

@enforce_types
@app.route("/statistics", methods=["GET"])
def serve_statistics():
"""
Serves statistical data from a JSON file via a GET request.

@@ -349,38 +396,10 @@

If the file cannot be read or another error occurs, it returns a 500 Internal Server Error.
"""

try:
db_conn = PersistentDataStore("./lake_data", read_only=True)
slots_table_name = Slot.get_lake_table_name()
slots_table = db_conn.query_data(
f"""
SELECT * FROM {slots_table_name} WHERE SLOT > {start_ts} AND SLOT < {end_ts}
"""
)
db_conn.duckdb_conn.close()

slots_table = slots_table.group_by("ID").first()
slots_table = slots_table.sort("slot")

all_slots: List[Slot] = []

# Iterate over rows and create objects
for row in slots_table.rows(named=True):
slot = Slot(
row["ID"],
row["timestamp"],
row["slot"],
row["truevalue"],
row["roundSumStakesUp"],
row["roundSumStakes"],
)
all_slots.append(slot)

data = transform_slots_to_statistics(all_slots)
with open(JSON_FILE_PATH, "r") as f:
data = json.load(f) # Load JSON data from file

response = jsonify(data)
response.headers.add("Access-Control-Allow-Origin", "*") # Allow any origin
return response
80 changes: 80 additions & 0 deletions pdr_backend/accuracy/readme.md
@@ -0,0 +1,80 @@
# pdr_backend/accuracy

This document provides instructions on how to deploy and run the `app.py` script in the `pdr_backend/accuracy` directory.

## Requirements

- Python 3.x
- Required Python packages (listed in `requirements.txt`)

## Deployment

```bash
git clone <repository_url>
cd pdr_backend

pip install -r requirements.txt
```

## Usage

The `app.py` script calculates the accuracy and other statistics of the predictions, based on the predicted and true values.
Member: 'to calculate the accuracy' -> 'to calculate the accuracy and other statistics'


```bash
python pdr_backend/accuracy/app.py
```


The script uses the `GQLDataFactory` to fetch the predictions. Be sure to provide a correct value for `st_timestr` in the `ppss.yaml` file to get the required data for the calculations; it needs to be at least 28 days before the current date (see the sketch below).
Member: Change 'to get correct predictions' to 'to get the required data for the calculations'

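For example, here is a minimal sketch (not part of `app.py`) for computing such a date, assuming `st_timestr` accepts a plain `YYYY-MM-DD` string as used elsewhere in the configs:

```python
# Helper sketch: print an st_timestr value 28 days in the past for ppss.yaml.
# Illustrative only; adjust the format to whatever your ppss.yaml expects.
from datetime import datetime, timedelta

st_timestr = (datetime.utcnow() - timedelta(days=28)).strftime("%Y-%m-%d")
print(f"st_timestr: {st_timestr}")  # e.g. 2024-01-15
```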

The script writes the computed statistics for each predictoor contract to a file named `pdr_backend/accuracy/output/accuracy.json`.

The data includes the pair name, the average accuracy of the model, the total staked amount yesterday, and the total staked amount today.

Example Output:
```json

[
{
"alias": "5m",
"statistics": {
"0xb1c55346023dee4d8b0d7b10049f0c8854823766": {
"token_name": "LTC/USDT",
"average_accuracy": 53.67847411444142,
"total_staked_yesterday": 217456.43999999997,
"total_staked_today": 203650.03999999992
},
........
}
},
{
"alias": "1h",
"statistics": {
"0xb1c55346023dee4d8b0d7b10049f0c8854823766": {
"token_name": "LTC/USDT",
"average_accuracy": 53.67847411444142,
"total_staked_yesterday": 217456.43999999997,
"total_staked_today": 203650.03999999992
},
........

}
]
```

## Flask API

The `app.py` script exposes a Flask endpoint at the `/statistics` route that serves the computed statistics.

```bash
curl http://localhost:5000/statistics
```

The endpoint returns the statistics (token name, average accuracy, total staked yesterday and today) in JSON format, in the same structure as the example above; see the consumer sketch below.
Member: We could provide a JSON example of the response with all the fields, and also mention that the API returns multiple values, not just accuracy (token_name, total_staked_yesterday, total_staked_today), hence the statistics naming.

Contributor (author): OK, I will add an example JSON and mention it.

Contributor (author): I put the information under the "Usage" heading, below the JSON file info.

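For illustration, a hedged sketch of consuming the endpoint from Python; it assumes the `requests` package is installed, the app is running locally on port 5000, and the field names follow the example output above:

```python
# Sketch: query the /statistics endpoint and print a few fields per timeframe.
import requests

resp = requests.get("http://localhost:5000/statistics", timeout=10)
resp.raise_for_status()

for timeframe in resp.json():  # one entry per alias, e.g. "5m" and "1h"
    for contract_addr, stats in timeframe["statistics"].items():
        print(
            timeframe["alias"],
            contract_addr,
            stats["token_name"],
            stats["average_accuracy"],
            stats["total_staked_today"],
        )
```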


## Warning about multithreading

The `app.py` script uses multithreading to handle multiple tasks simultaneously. Please ensure that the script is run in a thread-safe environment to avoid any issues.

The main thread handles the Flask API requests, while a worker thread runs the GQL data fetch and recalculates the statistics every 5 minutes.
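For reference, a minimal sketch of that layout; the names follow `app.py`, but this is an illustration under those assumptions, not the script's actual entry point:

```python
# Sketch of the thread layout: a daemon worker recomputes statistics every
# 5 minutes while the main thread serves the Flask API.
import threading

from pdr_backend.accuracy.app import app, fetch_statistics_using_ETL

worker = threading.Thread(target=fetch_statistics_using_ETL, daemon=True)
worker.start()

app.run(host="0.0.0.0", port=5000)  # main thread handles /statistics requests
```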
94 changes: 94 additions & 0 deletions pdr_backend/accuracy/test/test_app.py
@@ -1,5 +1,8 @@
from typing import List
from datetime import datetime, timedelta
from unittest.mock import patch

import polars as pl
from enforce_typing import enforce_types

from pdr_backend.subgraph.subgraph_predictions import ContractIdAndSPE
@@ -8,9 +11,12 @@
process_single_slot,
aggregate_statistics,
calculate_statistics_for_all_assets,
calculate_statistics_from_DuckDB_tables,
)
from pdr_backend.lake.slot import Slot
from pdr_backend.util.time_types import UnixTimeS
from pdr_backend.lake.persistent_data_store import PersistentDataStore
from pdr_backend.ppss.ppss import mock_ppss

# Sample data for tests
SAMPLE_PREDICT_SLOT = Slot(
@@ -87,3 +93,91 @@ def test_calculate_statistics_for_all_assets():
print("test_calculate_statistics_for_all_assets", statistics)
# Verify
assert statistics["0xAsset"]["average_accuracy"] == 100.0


@enforce_types
def test_calculate_statistics_from_DuckDB_tables(tmpdir):
ppss = mock_ppss(
[{"predict": "binance BTC/USDT c 5m", "train_on": "binance BTC/USDT c 5m"}],
"sapphire-mainnet",
str(tmpdir),
st_timestr="2023-12-20",
fin_timestr="now",
)
# start from a clean slots table
PersistentDataStore(ppss.lake_ss.lake_dir).execute_sql(
"DROP TABLE IF EXISTS slots;"
)

two_weeks_ago = datetime.utcnow() - timedelta(weeks=2)
slot_timestamp = UnixTimeS(int(two_weeks_ago.timestamp()))

# Generate 100 slot timestamps with 5 minute intervals
slot_timestamps = [slot_timestamp + i * 300 for i in range(100)]

# generate IDS with 0x18f54cc21b7a2fdd011bea06bba7801b280e3151-slot_timestamp
generated_ids = [
f"0x18f54cc21b7a2fdd011bea06bba7801b280e3151-{slot}" for slot in slot_timestamps
]
# slots dataframe
slots_df = pl.DataFrame(
{
"ID": generated_ids,
"timestamp": slot_timestamps,
"slot": slot_timestamps,
"truevalue": [True] * 100,
"roundSumStakesUp": [150.0] * 100,
"roundSumStakes": [100.0] * 100,
}
)

# Main work
PersistentDataStore(ppss.lake_ss.lake_dir).insert_to_table(slots_df, "pdr_slots")

test_json_file_path = str(tmpdir.join("test.json"))
with patch("pdr_backend.accuracy.app.JSON_FILE_PATH", test_json_file_path):
with patch("pdr_backend.accuracy.app.accuracy_ppss", ppss):
calculate_statistics_from_DuckDB_tables()

# Verify
expected_result = """[{"alias": "5m", "statistics": {"0x18f54cc21b7a2fdd011bea06bba7801b280e3151": {"token_name": "ADA/USDT", "average_accuracy": 100.0, "total_staked_yesterday": 0.0, "total_staked_today": 0.0}}}, {"alias": "1h", "statistics": {}}]""" # pylint: disable=line-too-long

with open(test_json_file_path, "r") as f:
result = f.read()
assert result == expected_result

# Test with false values
false_start_timestamp = slot_timestamps[-1] + 300

# Generate 100 slot timestamps with 5 minute intervals
false_slot_timestamps = [false_start_timestamp + i * 300 for i in range(100)]

# generate IDS with 0x18f54cc21b7a2fdd011bea06bba7801b280e3151-slot_timestamp
generated_ids = [
f"0x18f54cc21b7a2fdd011bea06bba7801b280e3151-{slot}"
for slot in false_slot_timestamps
]
# slots dataframe
false_slots = pl.DataFrame(
{
"ID": generated_ids,
"timestamp": slot_timestamps,
"slot": slot_timestamps,
"truevalue": [False] * 100,
"roundSumStakesUp": [150.0] * 100,
"roundSumStakes": [100.0] * 100,
}
)

PersistentDataStore(ppss.lake_ss.lake_dir).insert_to_table(false_slots, "pdr_slots")

test_json_file_path = str(tmpdir.join("test.json"))
with patch("pdr_backend.accuracy.app.JSON_FILE_PATH", test_json_file_path):
with patch("pdr_backend.accuracy.app.accuracy_ppss", ppss):
calculate_statistics_from_DuckDB_tables()

expected_result = """[{"alias": "5m", "statistics": {"0x18f54cc21b7a2fdd011bea06bba7801b280e3151": {"token_name": "ADA/USDT", "average_accuracy": 50.0, "total_staked_yesterday": 0.0, "total_staked_today": 0.0}}}, {"alias": "1h", "statistics": {}}]""" # pylint: disable=line-too-long

with open(test_json_file_path, "r") as f:
result = f.read()
assert result == expected_result