
Fix #1048 - The accuracy module testing, fixes and readme #1057

Closed
wants to merge 4 commits
103 changes: 61 additions & 42 deletions pdr_backend/accuracy/app.py
@@ -1,12 +1,12 @@
import logging
import threading
import json
from datetime import datetime, timedelta
from typing import Any, Callable, Dict, List, Optional, Tuple

from enforce_typing import enforce_types
from flask import Flask, jsonify

from pdr_backend.lake.etl import ETL
from pdr_backend.lake.gql_data_factory import GQLDataFactory
from pdr_backend.lake.persistent_data_store import PersistentDataStore
from pdr_backend.lake.slot import Slot
@@ -23,6 +23,14 @@
SECONDS_IN_A_DAY = 86400
logger = logging.getLogger("accuracy_app")

# Take the ppss file from the cli run command

accuracy_ppss = PPSS(
yaml_filename="./ppss.yaml",
network="sapphire-mainnet",
nested_override_args=None,
)


@enforce_types
def calculate_prediction_result(
@@ -248,18 +256,57 @@ def calculate_timeframe_timestamps(


@enforce_types
def calculate_statistics_from_DuckDB_tables():
four_weeks_ago = datetime.utcnow() - timedelta(weeks=4)
start_ts = UnixTimeS(int(four_weeks_ago.timestamp()))

db_conn = PersistentDataStore(accuracy_ppss.lake_ss.lake_dir)
slots_table_name = Slot.get_lake_table_name()

query = f"SELECT * FROM {slots_table_name} WHERE slot > {start_ts}"
logger.debug("slots query: %s", query)
slots_table = db_conn.query_data(query)
db_conn.duckdb_conn.close()

slots_table = slots_table.group_by("ID").first()
slots_table = slots_table.sort("slot")

all_slots: List[Slot] = []

# Iterate over rows and create objects
for row in slots_table.rows(named=True):
slot = Slot(
row["ID"],
row["timestamp"],
row["slot"],
row["truevalue"],
row["roundSumStakesUp"],
row["roundSumStakes"],
)
all_slots.append(slot)

data = transform_slots_to_statistics(all_slots)
json_data = json.dumps(data)
# write the statistics to the JSON output file
with open(JSON_FILE_PATH, "w") as f:
f.write(json_data)


@enforce_types
def fetch_statistics_using_ETL():
gql_data_factory = GQLDataFactory(accuracy_ppss)
while True:
gql_data_factory.get_gql_tables()
calculate_statistics_from_DuckDB_tables()
threading.Event().wait(300) # Wait for 5 minutes (300 seconds)


@@ -339,7 +386,7 @@ def transform_slots_to_statistics(all_slots: List[Slot]):

@enforce_types
@app.route("/statistics", methods=["GET"])
def serve_statistics():
"""
Serves statistical data from a JSON file via a GET request.

@@ -349,38 +396,10 @@

If the file cannot be read or another error occurs, it returns a 500 Internal Server Error.
"""

try:
db_conn = PersistentDataStore("./lake_data", read_only=True)
slots_table_name = Slot.get_lake_table_name()
slots_table = db_conn.query_data(
f"""
SELECT * FROM {slots_table_name} WHERE SLOT > {start_ts} AND SLOT < {end_ts}
"""
)
db_conn.duckdb_conn.close()

slots_table = slots_table.group_by("ID").first()
slots_table = slots_table.sort("slot")

all_slots: List[Slot] = []

# Iterate over rows and create objects
for row in slots_table.rows(named=True):
slot = Slot(
row["ID"],
row["timestamp"],
row["slot"],
row["truevalue"],
row["roundSumStakesUp"],
row["roundSumStakes"],
)
all_slots.append(slot)

data = transform_slots_to_statistics(all_slots)
with open(JSON_FILE_PATH, "r") as f:
data = json.load(f) # Load JSON data from file

response = jsonify(data)
response.headers.add("Access-Control-Allow-Origin", "*") # Allow any origin
return response
80 changes: 80 additions & 0 deletions pdr_backend/accuracy/readme.md
@@ -0,0 +1,80 @@
# pdr_backend/accuracy

This document provides instructions on how to deploy and run the `app.py` script in the `pdr_backend/accuracy` directory.

## Requirements

- Python 3.x
- Required Python packages (listed in `requirements.txt`)

## Deployment

```bash
git clone <repository_url>
cd pdr_backend

pip install -r requirements.txt
```

## Usage

The `app.py` script calculates the accuracy and other statistics of the predictions, based on the predicted and true values.
Member: 'to calculate the accuracy' -> 'to calculate the accuracy and other statistics'


```bash
python pdr_backend/accuracy/app.py
```


The script uses the `GQLDataFactory` to fetch the predictions. Be sure to provide a correct value for `st_timestr` in the `ppss.yaml` file to get the required data for the calculations; it needs to be at least 28 days before the current date (see the sketch below).
Member: Change 'to get correct predictions' to 'to get the required data for the calculations'

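For example, here is a minimal sketch (not part of `app.py`) for computing such a date, assuming `st_timestr` accepts a plain `YYYY-MM-DD` string as used elsewhere in the configs:

```python
# Helper sketch: print an st_timestr value 28 days in the past for ppss.yaml.
# Illustrative only; adjust the format to whatever your ppss.yaml expects.
from datetime import datetime, timedelta

st_timestr = (datetime.utcnow() - timedelta(days=28)).strftime("%Y-%m-%d")
print(f"st_timestr: {st_timestr}")  # e.g. 2024-01-15
```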

The script writes the computed statistics for each predictoor contract to a file named `pdr_backend/accuracy/output/accuracy.json`.

The data includes the pair name, the average accuracy of the model, the total staked amount yesterday, and the total staked amount today.

Example Output:
```json

[
{
"alias": "5m",
"statistics": {
"0xb1c55346023dee4d8b0d7b10049f0c8854823766": {
"token_name": "LTC/USDT",
"average_accuracy": 53.67847411444142,
"total_staked_yesterday": 217456.43999999997,
"total_staked_today": 203650.03999999992
},
........
}
},
{
"alias": "1h",
"statistics": {
"0xb1c55346023dee4d8b0d7b10049f0c8854823766": {
"token_name": "LTC/USDT",
"average_accuracy": 53.67847411444142,
"total_staked_yesterday": 217456.43999999997,
"total_staked_today": 203650.03999999992
},
........

}
]
```

## Flask API

The `app.py` script exposes a Flask endpoint at the `/statistics` route that serves the computed statistics.

```bash
curl http://localhost:5000/statistics
```

The endpoint returns the statistics (token name, average accuracy, total staked yesterday and today) in JSON format, in the same structure as the example above; see the consumer sketch below.
Member: We could provide a JSON example of the response with all the fields, and also mention that the API returns multiple values, not just accuracy (token_name, total_staked_yesterday, total_staked_today), hence the statistics naming.

Contributor (author): OK, I will add an example JSON and mention it.

Contributor (author): I put the information under the "Usage" heading, below the JSON file info.

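For illustration, a hedged sketch of consuming the endpoint from Python; it assumes the `requests` package is installed, the app is running locally on port 5000, and the field names follow the example output above:

```python
# Sketch: query the /statistics endpoint and print a few fields per timeframe.
import requests

resp = requests.get("http://localhost:5000/statistics", timeout=10)
resp.raise_for_status()

for timeframe in resp.json():  # one entry per alias, e.g. "5m" and "1h"
    for contract_addr, stats in timeframe["statistics"].items():
        print(
            timeframe["alias"],
            contract_addr,
            stats["token_name"],
            stats["average_accuracy"],
            stats["total_staked_today"],
        )
```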


## Warning about multithreading

The `app.py` script uses multithreading to handle multiple tasks simultaneously. Please ensure that the script is run in a thread-safe environment to avoid any issues.

The main thread handles the Flask API requests, while a worker thread runs the GQL data fetch and recalculates the statistics every 5 minutes.
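For reference, a minimal sketch of that layout; the names follow `app.py`, but this is an illustration under those assumptions, not the script's actual entry point:

```python
# Sketch of the thread layout: a daemon worker recomputes statistics every
# 5 minutes while the main thread serves the Flask API.
import threading

from pdr_backend.accuracy.app import app, fetch_statistics_using_ETL

worker = threading.Thread(target=fetch_statistics_using_ETL, daemon=True)
worker.start()

app.run(host="0.0.0.0", port=5000)  # main thread handles /statistics requests
```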
94 changes: 94 additions & 0 deletions pdr_backend/accuracy/test/test_app.py
@@ -1,5 +1,8 @@
from typing import List
from datetime import datetime, timedelta
from unittest.mock import patch

import polars as pl
from enforce_typing import enforce_types

from pdr_backend.subgraph.subgraph_predictions import ContractIdAndSPE
@@ -8,9 +11,12 @@
process_single_slot,
aggregate_statistics,
calculate_statistics_for_all_assets,
calculate_statistics_from_DuckDB_tables,
)
from pdr_backend.lake.slot import Slot
from pdr_backend.util.time_types import UnixTimeS
from pdr_backend.lake.persistent_data_store import PersistentDataStore
from pdr_backend.ppss.ppss import mock_ppss

# Sample data for tests
SAMPLE_PREDICT_SLOT = Slot(
@@ -87,3 +93,91 @@ def test_calculate_statistics_for_all_assets():
print("test_calculate_statistics_for_all_assets", statistics)
# Verify
assert statistics["0xAsset"]["average_accuracy"] == 100.0


@enforce_types
def test_calculate_statistics_from_DuckDB_tables(tmpdir):
ppss = mock_ppss(
[{"predict": "binance BTC/USDT c 5m", "train_on": "binance BTC/USDT c 5m"}],
"sapphire-mainnet",
str(tmpdir),
st_timestr="2023-12-20",
fin_timestr="now",
)
# start from a clean slots table
PersistentDataStore(ppss.lake_ss.lake_dir).execute_sql(
"DROP TABLE IF EXISTS slots;"
)

two_weeks_ago = datetime.utcnow() - timedelta(weeks=2)
slot_timestamp = UnixTimeS(int(two_weeks_ago.timestamp()))

# Generate 100 slot timestamps with 5 minute intervals
slot_timestamps = [slot_timestamp + i * 300 for i in range(100)]

# generate IDS with 0x18f54cc21b7a2fdd011bea06bba7801b280e3151-slot_timestamp
generated_ids = [
f"0x18f54cc21b7a2fdd011bea06bba7801b280e3151-{slot}" for slot in slot_timestamps
]
# slots dataframe
slots_df = pl.DataFrame(
{
"ID": generated_ids,
"timestamp": slot_timestamps,
"slot": slot_timestamps,
"truevalue": [True] * 100,
"roundSumStakesUp": [150.0] * 100,
"roundSumStakes": [100.0] * 100,
}
)

# Main work
PersistentDataStore(ppss.lake_ss.lake_dir).insert_to_table(slots_df, "pdr_slots")

test_json_file_path = str(tmpdir.join("test.json"))
with patch("pdr_backend.accuracy.app.JSON_FILE_PATH", test_json_file_path):
with patch("pdr_backend.accuracy.app.accuracy_ppss", ppss):
calculate_statistics_from_DuckDB_tables()

# Verify
expected_result = """[{"alias": "5m", "statistics": {"0x18f54cc21b7a2fdd011bea06bba7801b280e3151": {"token_name": "ADA/USDT", "average_accuracy": 100.0, "total_staked_yesterday": 0.0, "total_staked_today": 0.0}}}, {"alias": "1h", "statistics": {}}]""" # pylint: disable=line-too-long

with open(test_json_file_path, "r") as f:
result = f.read()
assert result == expected_result

# Test with false values
false_start_timestamp = slot_timestamps[-1] + 300

# Generate 100 slot timestamps with 5 minute intervals
false_slot_timestamps = [false_start_timestamp + i * 300 for i in range(100)]

# generate IDS with 0x18f54cc21b7a2fdd011bea06bba7801b280e3151-slot_timestamp
generated_ids = [
f"0x18f54cc21b7a2fdd011bea06bba7801b280e3151-{slot}"
for slot in false_slot_timestamps
]
# slots dataframe
false_slots = pl.DataFrame(
{
"ID": generated_ids,
"timestamp": slot_timestamps,
"slot": slot_timestamps,
"truevalue": [False] * 100,
"roundSumStakesUp": [150.0] * 100,
"roundSumStakes": [100.0] * 100,
}
)

PersistentDataStore(ppss.lake_ss.lake_dir).insert_to_table(false_slots, "pdr_slots")

test_json_file_path = str(tmpdir.join("test.json"))
with patch("pdr_backend.accuracy.app.JSON_FILE_PATH", test_json_file_path):
with patch("pdr_backend.accuracy.app.accuracy_ppss", ppss):
calculate_statistics_from_DuckDB_tables()

expected_result = """[{"alias": "5m", "statistics": {"0x18f54cc21b7a2fdd011bea06bba7801b280e3151": {"token_name": "ADA/USDT", "average_accuracy": 50.0, "total_staked_yesterday": 0.0, "total_staked_today": 0.0}}}, {"alias": "1h", "statistics": {}}]""" # pylint: disable=line-too-long

with open(test_json_file_path, "r") as f:
result = f.read()
assert result == expected_result