diff --git a/.gitignore b/.gitignore index e92e892..7712c4e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ # do not archive notebook checkpoints .ipynb_checkpoints __pycache__/ +.DS_Store diff --git a/end-to-end-rest/docker-compose.yml b/end-to-end-rest/docker-compose.yml index ebf90cb..fc3a468 100644 --- a/end-to-end-rest/docker-compose.yml +++ b/end-to-end-rest/docker-compose.yml @@ -34,7 +34,7 @@ services: - "8000:8000" # Change port mapping appropriately before deploying. # open browser to http://127.0.0.1:8000/docs environment: - - "SKEMA_RS_ADDESS=http://skema-rs:8080" + - "SKEMA_RS_ADDRESS=http://skema-rs:8080" - "SKEMA_GRAPH_DB_PROTO=bolt://" - "SKEMA_GRAPH_DB_PORT=7687" - "SKEMA_GRAPH_DB_HOST=graphdb" @@ -57,6 +57,7 @@ services: - "SKEMA_RS_PORT=8080" - "SKEMA_GRAPH_DB_HOST=graphdb" - "SKEMA_GRAPH_DB_PORT=7687" + - "SKEMA_GRAPH_DB_PROTO=bolt://" # We currently use Memgraph (in-memory graph database). graphdb: diff --git a/end-to-end-rest/notebooks/M11-Q6/data/code/chime_trimmed.zip b/end-to-end-rest/notebooks/M11-Q6/data/code/chime_trimmed.zip new file mode 100644 index 0000000..a80426d Binary files /dev/null and b/end-to-end-rest/notebooks/M11-Q6/data/code/chime_trimmed.zip differ diff --git a/end-to-end-rest/notebooks/M11-Q6/data/code/code1/README.md b/end-to-end-rest/notebooks/M11-Q6/data/code/code1/README.md new file mode 100644 index 0000000..501d05c --- /dev/null +++ b/end-to-end-rest/notebooks/M11-Q6/data/code/code1/README.md @@ -0,0 +1 @@ +This is a readme for the SIR model in the base test scenario. This contains no code. 
\ No newline at end of file diff --git a/end-to-end-rest/notebooks/M11-Q6/data/code/code1/code.py b/end-to-end-rest/notebooks/M11-Q6/data/code/code1/code.py new file mode 100644 index 0000000..3f6911b --- /dev/null +++ b/end-to-end-rest/notebooks/M11-Q6/data/code/code1/code.py @@ -0,0 +1,20 @@ +# SIR model dynamics definition +def sir( + s: float, i: float, r: float, beta: float, gamma: float, n: float +) -> tuple[float, float, float]: + """The SIR model, one time step.""" + s_n = (-beta * s * i) + s + i_n = (beta * s * i - gamma * i) + i + r_n = gamma * i + r + scale = n / (s_n + i_n + r_n) + return s_n * scale, i_n * scale, r_n * scale + + +if __name__ == "__main__": + # run sir model sample + result = sir(0.99, 0.01, 0, 0.2, 0.1, 1) + + print(result) + + +# 7e475a4c-cf38-44f3-b349-2badeff967e8 diff --git a/end-to-end-rest/notebooks/M11-Q6/data/code/code_sir.zip b/end-to-end-rest/notebooks/M11-Q6/data/code/code_sir.zip new file mode 100644 index 0000000..a64fefd Binary files /dev/null and b/end-to-end-rest/notebooks/M11-Q6/data/code/code_sir.zip differ diff --git a/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/complex_amr.json b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/complex_amr.json new file mode 100644 index 0000000..9fefe0c --- /dev/null +++ b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/complex_amr.json @@ -0,0 +1,41 @@ +{ + "semantics": { + "ode": { + "parameters": [ + {"name": "a"}, + {"name": "b"}, + {"name": "c"}, + {"name": "int_add"}, + {"name": "float_add"}, + {"name": "str_concat"}, + {"name": "list_concat"}, + {"name": "int_sub"}, + {"name": "float_sub"}, + {"name": "int_mult"}, + {"name": "float_mult"}, + {"name": "str_repeat"}, + {"name": "list_repeat"}, + {"name": "float_div"}, + {"name": "int_floor_div"}, + {"name": "int_mod"}, + {"name": "int_pow"}, + {"name": "float_pow"}, + {"name": "int_lshift"}, + {"name": "int_rshift"}, + {"name": "bit_or"}, + {"name": "bit_and"}, + {"name": "bit_xor"}, + {"name": "eq_int"}, + 
{"name": "eq_str"}, + {"name": "eq_list"}, + {"name": "neq_int"}, + {"name": "neq_str"}, + {"name": "neq_list"}, + {"name": "gt_int"}, + {"name": "gte_int"}, + {"name": "lt_int"}, + {"name": "lte_int"} + ] + } + } +} diff --git a/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/complex_source.py b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/complex_source.py new file mode 100644 index 0000000..93f28ba --- /dev/null +++ b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/complex_source.py @@ -0,0 +1,71 @@ +# Assignment types +a = 10 +b = a +c = a + b + +# Primitive addition +int_add = 5 + 3 +float_add = 3.14 + 2.71 +str_concat = "Hello" + "World" +list_concat = [1, 2, 3] + [4, 5, 6] + +# Primitive subtraction +int_sub = 10 - 3 +float_sub = 5.5 - 2.2 + +# Primitive multiplication +int_mult = 4 * 7 +float_mult = 2.5 * 3.5 +str_repeat = "abc" * 3 +list_repeat = [1, 2, 3] * 2 + +# Primitive division +float_div = 7.5 / 2.5 + +# Primitive floor division +int_floor_div = 11 // 3 + +# Primitive modulo +int_mod = 17 % 5 + +# Primitive exponentiation +int_pow = 2 ** 3 +float_pow = 2.0 ** 3 + +# Primitive left shift +int_lshift = 8 << 2 + +# Primitive right shift +int_rshift = 32 >> 2 + +# Primitive bitwise or +bit_or = 5 | 3 + +# Primitive bitwise and +bit_and = 5 & 3 + +# Primitive bitwise xor +bit_xor = 5 ^ 3 + +# Primitive equality +eq_int = 5 == 5 +eq_str = "Hello" == "World" +eq_list = [1, 2, 3] == [1, 2, 3] + +# Primitive inequality +neq_int = 5 != 3 +neq_str = "Hello" != "Hello" +neq_list = [1, 2, 3] != [4, 5, 6] + +# Primitive greater than +gt_int = 5 > 3 + +# Primitive greater than and equal to +gte_int = 5 >= 5 + +# Primitive less than +lt_int = 3 < 5 + +# Primitive less than and equal to +lte_int = 5 <= 5 + diff --git a/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/epi_model_amr.json b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/epi_model_amr.json new file mode 100644 index 0000000..a32375e --- /dev/null +++ 
b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/epi_model_amr.json @@ -0,0 +1,148 @@ +{ + "header": { + "name": "mathml model", + "schema": "https://github.com/DARPA-ASKEM/Model-Representations/blob/main/petrinet/petrinet_schema.json", + "schema_name": "PetriNet", + "description": "This is a model from equations", + "model_version": "0.1" + }, + "model": { + "states": [ + { + "id": "D", + "name": "D" + }, + { + "id": "E", + "name": "E" + }, + { + "id": "I", + "name": "I" + }, + { + "id": "R", + "name": "R" + }, + { + "id": "S", + "name": "S" + } + ], + "transitions": [ + { + "id": "t0", + "input": [ + "I", + "S" + ], + "output": [ + "E", + "I" + ] + }, + { + "id": "t1", + "input": [ + "E" + ], + "output": [ + "I" + ] + }, + { + "id": "t2", + "input": [ + "I" + ], + "output": [ + "R" + ] + }, + { + "id": "t3", + "input": [ + "I" + ], + "output": [ + "D" + ] + } + ] + }, + "semantics": { + "ode": { + "rates": [ + { + "target": "t0", + "expression": "N*beta*I*S", + "expression_mathml": "IbetaSN" + }, + { + "target": "t1", + "expression": "sigma*E", + "expression_mathml": "sigmaE" + }, + { + "target": "t2", + "expression": "gamma*I", + "expression_mathml": "gammaI" + }, + { + "target": "t3", + "expression": "delta*I", + "expression_mathml": "deltaI" + } + ], + "initials": [ + { + "target": "S", + "expression": "", + "expression_mathml": "" + }, + { + "target": "E", + "expression": "", + "expression_mathml": "" + }, + { + "target": "I", + "expression": "", + "expression_mathml": "" + }, + { + "target": "R", + "expression": "", + "expression_mathml": "" + }, + { + "target": "D", + "expression": "", + "expression_mathml": "" + } + ], + "parameters": [ + { + "id": "N", + "name": "N" + }, + { + "id": "beta", + "name": "beta" + }, + { + "id": "delta", + "name": "delta" + }, + { + "id": "gamma", + "name": "gamma" + }, + { + "id": "sigma", + "name": "sigma" + } + ] + } + } +} \ No newline at end of file diff --git 
a/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/epi_model_source.py b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/epi_model_source.py new file mode 100644 index 0000000..8e412ce --- /dev/null +++ b/end-to-end-rest/notebooks/M11-Q6/data/execution_engine/epi_model_source.py @@ -0,0 +1,36 @@ +import numpy as np +from scipy.integrate import odeint +import matplotlib.pyplot as plt + +def seird_model(y, t, N, beta, sigma, gamma, delta): + S, E, I, R, D = y + dSdt = -beta * S * I / N + dEdt = beta * S * I / N - sigma * E + dIdt = sigma * E - gamma * I - delta * I + dRdt = gamma * I + dDdt = delta * I + return dSdt, dEdt, dIdt, dRdt, dDdt + +N = 1000 +beta = 0.2 +sigma = 0.1 +gamma = 0.05 +delta = 0.01 +y0 = N-1, 1, 0, 0, 0 + +t = np.linspace(0, 100, 1000) + +result = odeint(seird_model, y0, t, args=(N, beta, sigma, gamma, delta)) +S, E, I, R, D = result.T + +plt.figure(figsize=(10,6)) +plt.plot(t, S, 'b', label='Susceptible') +plt.plot(t, E, 'y', label='Exposed') +plt.plot(t, I, 'r', label='Infected') +plt.plot(t, R, 'g', label='Recovered') +plt.plot(t, D, 'k', label='Dead') +plt.xlabel('Time') +plt.ylabel('Population') +plt.legend() +plt.grid(True) +plt.show() \ No newline at end of file diff --git a/end-to-end-rest/notebooks/M11-Q6/morae_endpoint_demo.ipynb b/end-to-end-rest/notebooks/M11-Q6/morae_endpoint_demo.ipynb new file mode 100644 index 0000000..9b41966 --- /dev/null +++ b/end-to-end-rest/notebooks/M11-Q6/morae_endpoint_demo.ipynb @@ -0,0 +1,294 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# This notebook demo's the following:\n", + "1. Codebase -> Dynamics Linespan\n", + "2. LLM-assisted-Codebase -> Petrinet AMR\n", + "3. 
AMR enrichment with Parameter extraction\n", + "\n", + "Date created: 11/22/23" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import json\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "\n", + "SKEMA_ADDRESS = os.environ.get(\"SKEMA_ADDRESS\", \"https://api.askem.lum.ai\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Codebase -> Dynamics Linespan\n", + "- Overview: \n", + " - This endpoint takes in a zip file containing code with dynamics somewhere in it (ideally), and returns a linespan entry for each file in it. \n", + "- Current Details:\n", + " - This endpoint prompts an LLM; as a result it can be quite slow for large repos (and occasionally at other times)\n", + " - As of right now, each Python file in the repo will get a linespan entry. If the model does not suspect there are dynamics in that file it will output a linespan of [L0-L0] and have a description of \"Failed to parse dynamics\".\n", + " - Each linespan entry has a name which corresponds to the file it refers to. \n", + "- Future Work:\n", + " - Adding support beyond only Python, to match the same coverage as our code2fn functionality\n", + " - We will be developing our own model for this functionality as well; it will likely be run in parallel to this one unless it is significantly superior. The goal of our own model will be easier inference over an entire codebase instead of file by file like this initial support does. It should also be faster. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zipfile_path = \"./data/code/code_sir.zip\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(open(\"./data/code/code1/code.py\").read())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "URL = f\"{SKEMA_ADDRESS}/morae/linespan-given-filepaths-zip\"\n", + "response_zip = requests.post(URL, files={\"zip_file\": open(zipfile_path, \"rb\")},)\n", + "response_zip.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next example contains 4 code files, each a different version of CHIME. One is just the dynamics, one is the complete CHIME model code, and the other two are partial modifications of the code. Note how we get 4 responses back and dynamics were found for each file. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CHIME_SIR_URL = (\n", + " \"https://artifacts.askem.lum.ai/askem/data/models/zip-archives/CHIME-SIR-model.zip\"\n", + ")\n", + "response = requests.get(CHIME_SIR_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "URL = f\"{SKEMA_ADDRESS}/morae/linespan-given-filepaths-zip\"\n", + "response_zip = requests.post(URL, files={\"zip_file\": response.content},)\n", + "response_zip.json()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LLM-assisted-codebase -> Petrinet AMR\n", + "- Overview: \n", + " - This endpoint takes in a zip file containing code with dynamics somewhere in it (ideally), and returns a Petrinet AMR. \n", + "- Current Details:\n", + " - This endpoint has the same input and output as our /workflows/code/codebase-to-pn-amr endpoint. 
This is to make it easier to integrate.\n", + " - This endpoint takes in the codebase and uses the linespan functionality described above to slice out the relevant code, which is then sent to our code-snippets endpoint. This reduces the chance for errors from our code ingestion pipeline and simplifies our extraction process as only a subset of the code is ingested, with the goal of greatly increasing the robustness of this workflow. \n", + " - For the case where there could be multiple files with dynamics we currently only return one AMR to match the input and output of the original un-assisted endpoint. To do this we return the AMR with the most \"states\", using it as a proxy for completeness. \n", + "- Future Work:\n", + " - Multiple AMR outputs could be an option\n", + " - Expanded coverage of coding idioms\n", + " - Once we have our own developed linespan model up, we will replace the original endpoint with that model assisting it and run these two endpoints in parallel, unless one is significantly better than the other. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# simple sir\n", + "URL = f\"{SKEMA_ADDRESS}/workflows/code/llm-assisted-codebase-to-pn-amr\"\n", + "response_zip = requests.post(URL, files={\"zip_file\": open(zipfile_path, \"rb\")},)\n", + "print(json.dumps(response_zip.json(), indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 4 CHIME models at once\n", + "URL = f\"{SKEMA_ADDRESS}/workflows/code/llm-assisted-codebase-to-pn-amr\"\n", + "response_zip = requests.post(URL, files={\"zip_file\": response.content},)\n", + "print(json.dumps(response_zip.json(), indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# penn chime repo, trimmed to only the python files ~49 files. 
\n", + "# NOTE: Takes 3-5 minutes\n", + "zipfile_path = \"./data/code/chime_trimmed.zip\"\n", + "response_zip = requests.post(URL, files={\"zip_file\": open(zipfile_path, \"rb\")},)\n", + "print(json.dumps(response_zip.json(), indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### AMR enrichment with parameter extraction\n", + "- Overview: \n", + " - This endpoint takes an AMR and the code it was derived from and enriches the AMR with the relevant parameters in the code. \n", + "- Current Details:\n", + " - This feature checks the AMR for parameters and finds their entries in the code. It then creates a dataflow trace of their assignment and executes it to extract their value. \n", + " - This execution framework also gives us access to all the values a parameter can take on. However this is not output into the AMR as of now. \n", + "- Future Work:\n", + " - Expanded coverage of types of assignments/executions to be extracted\n", + " - Handling for cases when the parameter names in the AMR do not match the variable names in the code, but they should be matched.\n", + " - Support for outputting parameter ranges, based on parameters that take on multiple values during execution. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we show enriching a simple epidemiology model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "amr_path = Path(\"./data/execution_engine/epi_model_amr.json\")\n", + "source_path = Path(\"./data/execution_engine/epi_model_source.py\")\n", + "\n", + "print(json.dumps(json.loads(amr_path.read_text()), indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "request = {\n", + " \"amr\": json.loads(amr_path.read_text()),\n", + " \"source\": source_path.read_text(),\n", + " \"filename\": \"epi_model_source.py\"\n", + "}\n", + "URL = f\"{SKEMA_ADDRESS}/execution-engine/amr-enrichment\"\n", + "response = requests.post(URL, json=request)\n", + "enriched_amr = response.json()\n", + "print(json.dumps(enriched_amr, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we show a test file showing some of the coverage of parameter extraction capabilities." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "amr_path = Path(\"./data/execution_engine/complex_amr.json\")\n", + "source_path = Path(\"./data/execution_engine/complex_source.py\")\n", + "\n", + "print(source_path.read_text())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "request = {\n", + " \"amr\": json.loads(amr_path.read_text()),\n", + " \"source\": source_path.read_text(),\n", + " \"filename\": \"complex_source.py\"\n", + "}\n", + "URL = f\"{SKEMA_ADDRESS}/execution-engine/amr-enrichment\"\n", + "response = requests.post(URL, json=request)\n", + "enriched_amr = response.json()\n", + "print(json.dumps(enriched_amr, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell below computes the percent coverage for the given list of coding idioms we currently support" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": {}, + "outputs": [], + "source": [ + "total_params = 0\n", + "enriched_params = 0\n", + "for entry in enriched_amr['semantics']['ode']['parameters']:\n", + " total_params+=1\n", + " if \"value\" in entry:\n", + " enriched_params+=1\n", + "\n", + "percent_coverage = (enriched_params/total_params) * 100\n", + "print(f\"Percent Coverage: {percent_coverage:.2f}%\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}