From 8546b931ae3527ca5711603b96944cf312ffb724 Mon Sep 17 00:00:00 2001
From: Justin
Date: Fri, 27 Oct 2023 12:25:04 -0700
Subject: [PATCH 1/8] wip

---
 pyproject.toml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 90fb5aec325..c88329901e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,8 +82,11 @@ core = ["skema[img2mml]", "skema[isa]", "skema[tr]", "skema[metal]"]
 # see skema/img2mml/render_mml/mathpix_annotator
 annotations = ["matplotlib", "notebook"]
 
+# for LLM use in skema
+llms = ["langchain==0.0.325"]
+
 # all extras
-all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotations]"]
+all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotations]", "skema[llms]"]
 
 [tool.setuptools.package-dir]
 "skema.gromet" = "skema/gromet"

From 9bb6c8b598f0bfd40af359e6e3e1a4e7aab2f69a Mon Sep 17 00:00:00 2001
From: Justin
Date: Fri, 27 Oct 2023 13:49:09 -0700
Subject: [PATCH 2/8] added endpoint for LLM dynamics extraction

---
 skema/rest/api.py       |   7 ++
 skema/rest/llm_proxy.py | 138 ++++++++++++++++++++++++++++++++++++++++
 skema/rest/proxies.py   |   3 +-
 3 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 skema/rest/llm_proxy.py

diff --git a/skema/rest/api.py b/skema/rest/api.py
index c90080417a3..435bcc56fec 100644
--- a/skema/rest/api.py
+++ b/skema/rest/api.py
@@ -7,6 +7,7 @@
     integrated_text_reading_proxy,
     morae_proxy,
     metal_proxy,
+    llm_proxy,
 )
 from skema.img2mml import eqn2mml
 from skema.skema_py import server as code2fn
@@ -110,6 +111,12 @@
     tags=["morae", "skema-rs"],
 )
 
+app.include_router(
+    llm_proxy.router,
+    prefix="/morae",
+    tags=["morae"],
+)
+
 app.include_router(
     integrated_text_reading_proxy.router,
     prefix="/text-reading",
diff --git a/skema/rest/llm_proxy.py b/skema/rest/llm_proxy.py
new file mode 100644
index 00000000000..c3538e7d4c5
--- /dev/null
+++ b/skema/rest/llm_proxy.py
@@ -0,0 +1,138 @@
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import (
+    ChatPromptTemplate,
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+)
+from langchain.output_parsers import (
+    StructuredOutputParser,
+    ResponseSchema
+)
+from fastapi import APIRouter, FastAPI, File, UploadFile
+from io import BytesIO
+from zipfile import ZipFile
+import requests
+from pathlib import Path
+import json
+from skema.rest.proxies import SKEMA_OPENAI_KEY
+
+router = APIRouter()
+
+class LineSpan:
+    def __init__(self, line_begin, line_end):
+        self.line_begin = line_begin
+        self.line_end = line_end
+
+
+@router.post(
+    "/linespan-given-filepaths-zip",
+    summary=(
+        "Send a zip file containing a code file,"
+        " get a line span of the dynamics back."
+    ),
+)
+async def get_lines_of_model(zip_file: UploadFile = File()) -> LineSpan:
+    """
+    Endpoint for generating a line span containing the dynamics from a zip archive. Currently
+    it expects exactly one Python file in the zip; there can be other files, such as a
+    README.md, but only one .py. Future versions will generalize support to arbitrary zip contents.
+
+    ### Python example
+    ```
+    import requests
+
+    files = {
+        "zip_file": open(zip_path, "rb"),
+    }
+
+    response = requests.post(f"{ENDPOINT}/morae/linespan-given-filepaths-zip", files=files)
+    linespan_json = response.json()
+    ```
+    """
+    files=[]
+    blobs=[]
+    with ZipFile(BytesIO(zip_file.file.read()), "r") as zip:
+        for file in zip.namelist():
+            file_obj = Path(file)
+            if file_obj.suffix in [".py"]:
+                files.append(file)
+                blobs.append(zip.open(file).read())
+
+    # read in the code, for the prompt
+    code = blobs[0].decode("utf-8")  # needs to be a regular string, not a byte string
+    file = files[0]
+    # JSON payload for the FN construction
+    single_snippet_payload = {
+        "files": [file],
+        "blobs": [code],
+    }
+
+    # these are the formatting instructions
+    response_schemas = [
+        ResponseSchema(name="model_function", description="The name of the function that contains the model dynamics")
+    ]
+
+    # for structured output parsing, converts schema to langchain object
+    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
+
+    # for structured output parsing, makes the instructions to be passed as a variable to the prompt template
+    format_instructions = output_parser.get_format_instructions()
+
+    # low temp, as the task is not generative
+    temperature = 0.1
+
+    # initialize the model
+    openai = ChatOpenAI(
+        temperature=temperature,
+        model_name='gpt-3.5-turbo',
+        openai_api_key=SKEMA_OPENAI_KEY
+    )
+
+    # construct the prompts
+    template="You are an assistant that answers questions about code."
+    system_message_prompt = SystemMessagePromptTemplate.from_template(template)
+    human_template="Find the function that contains the model dynamics in {code} \n{format_instructions}"
+    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
+
+    # combining the templates into a chat template
+    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
+
+    # formatting the prompt with input variables
+    formatted_prompt = chat_prompt.format_prompt(code=code, format_instructions = format_instructions).to_messages()
+
+    # running the model
+    output = openai(formatted_prompt)
+
+    # parsing the output
+    try:
+        parsed_output = output_parser.parse(output.content)
+
+        function_name = parsed_output['model_function']
+
+        # Get the FN from it
+        url = "https://api.askem.lum.ai/code2fn/fn-given-filepaths"
+        response_zip = requests.post(url, json=single_snippet_payload)
+
+        # get the metadata entry for the function
+        for entry in response_zip.json()['modules'][0]['fn_array']:
+            try:
+                if entry['b'][0]['name'][0:len(function_name)] == function_name:
+                    metadata_idx = entry['b'][0]['metadata']
+            except:
+                pass  # entry has no named outer box; skip it
+
+        # get the line span using the metadata
+        for (i,metadata) in enumerate(response_zip.json()['modules'][0]['metadata_collection']):
+            if i == (metadata_idx - 1):
+                line_begin = metadata[0]['line_begin']
+                line_end = metadata[0]['line_end']
+    except:
+        print("Failed to parse dynamics")
+        line_begin = 0
+        line_end = 0
+
+    output = LineSpan(line_begin,line_end)
+    return output
+
+
+app = FastAPI()
+app.include_router(router)
\ No newline at end of file
diff --git a/skema/rest/proxies.py b/skema/rest/proxies.py
index 502a355b6c9..2a61fcefd25 100644
--- a/skema/rest/proxies.py
+++ b/skema/rest/proxies.py
@@ -7,7 +7,7 @@
 
 # MORAE etc
 SKEMA_RS_ADDESS = os.environ.get("SKEMA_RS_ADDRESS", "https://skema-rs.askem.lum.ai")
-
+SKEMA_OPENAI_KEY = os.environ.get("SKEMA_OPENAI_KEY", "YOU_FORGOT_TO_SET_SKEMA_OPENAI_KEY")
 # MathJAX service
 SKEMA_MATHJAX_PROTOCOL = os.environ.get("SKEMA_MATHJAX_PROTOCOL", "http://")
@@ -24,4 +24,3 @@
 SKEMA_TR_ADDRESS = os.environ.get("SKEMA_TR_ADDRESS", "http://hopper.sista.arizona.edu")
 OPENAI_KEY = os.environ.get("OPENAI_KEY", "YOU_FORGOT_TO_SET_OPENAI_KEY")
 COSMOS_ADDRESS = os.environ.get("COSMOS_ADDRESS", "https://xdd.wisc.edu/cosmos_service")
-

From 82c1d8219b328344fd6e9ee492d3bb8892988d62 Mon Sep 17 00:00:00 2001
From: Justin
Date: Fri, 27 Oct 2023 14:24:27 -0700
Subject: [PATCH 3/8] made data class pydantic BaseModel

---
 skema/rest/llm_proxy.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/skema/rest/llm_proxy.py b/skema/rest/llm_proxy.py
index c3538e7d4c5..4d1424de7ce 100644
--- a/skema/rest/llm_proxy.py
+++ b/skema/rest/llm_proxy.py
@@ -13,15 +13,14 @@
 from zipfile import ZipFile
 import requests
 from pathlib import Path
-import json
+from pydantic import BaseModel
 from skema.rest.proxies import SKEMA_OPENAI_KEY
 
 router = APIRouter()
 
-class LineSpan:
-    def __init__(self, line_begin, line_end):
-        self.line_begin = line_begin
-        self.line_end = line_end
+class LineSpan(BaseModel):
+    line_begin: int
+    line_end: int
 
 
 @router.post(
     "/linespan-given-filepaths-zip",
@@ -130,7 +129,7 @@ async def get_lines_of_model(zip_file: UploadFile = File()) -> LineSpan:
         line_begin = 0
         line_end = 0
 
-    output = LineSpan(line_begin,line_end)
+    output = LineSpan(line_begin=line_begin,line_end=line_end)
     return output
 

From 477fa9f9e2f77f9fad003bb13b8ec01611ae261f Mon Sep 17 00:00:00 2001
From: Justin
Date: Mon, 30 Oct 2023 15:35:46 -0400
Subject: [PATCH 4/8] updated output class, based on HMI recommendations

---
 skema/rest/llm_proxy.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/skema/rest/llm_proxy.py b/skema/rest/llm_proxy.py
index 4d1424de7ce..8a3b8d015c8 100644
--- a/skema/rest/llm_proxy.py
+++ b/skema/rest/llm_proxy.py
@@ -13,15 +13,22 @@
 from zipfile import ZipFile
 import requests
 from pathlib import Path
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
+from typing import List, Optional
 from skema.rest.proxies import SKEMA_OPENAI_KEY
 
 router = APIRouter()
 
-class LineSpan(BaseModel):
-    line_begin: int
-    line_end: int
+class Dynamics(BaseModel):
+    """
+    Dynamics Data Model for capturing dynamics within a CodeFile.
+    """
+    name: Optional[str] = Field(description="Name of the dynamics section.")
+    description: Optional[str] = Field(description="Description of the dynamics.")
+    block: List[str] = Field(
+        description="A list containing strings indicating the line numbers in the file that contain the dynamics, e.g., ['L205-L213', 'L225-L230']."
+    )
 
 @router.post(
     "/linespan-given-filepaths-zip",
@@ -49,6 +56,7 @@ async def get_lines_of_model(zip_file: UploadFile = File()) -> LineSpan:
     """
     files=[]
     blobs=[]
+    block=[]
     with ZipFile(BytesIO(zip_file.file.read()), "r") as zip:
         for file in zip.namelist():
             file_obj = Path(file)
@@ -129,7 +137,9 @@ async def get_lines_of_model(zip_file: UploadFile = File()) -> LineSpan:
         line_begin = 0
         line_end = 0
 
-    output = LineSpan(line_begin=line_begin,line_end=line_end)
+    block.append(f"L{line_begin}-L{line_end}")
+
+    output = Dynamics(name=None, description=None, block=block)
     return output
 

From b5f7b109d23ced9af8f1f5cabc0d35635c45c6c0 Mon Sep 17 00:00:00 2001
From: Justin
Date: Mon, 30 Oct 2023 16:36:59 -0400
Subject: [PATCH 5/8] added test for LLM prompt

---
 skema/rest/tests/test_llms.py | 57 +++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 skema/rest/tests/test_llms.py

diff --git a/skema/rest/tests/test_llms.py b/skema/rest/tests/test_llms.py
new file mode 100644
index 00000000000..12582500128
--- /dev/null
+++ b/skema/rest/tests/test_llms.py
@@ -0,0 +1,57 @@
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import (
+    ChatPromptTemplate,
+    SystemMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+)
+from langchain.output_parsers import (
+    StructuredOutputParser,
+    ResponseSchema
+)
+from skema.rest.proxies import SKEMA_OPENAI_KEY
+
+def test_prompt_construction():
+    """Tests prompt template instantiation"""
+
+    code = "def sir(\n    s: float, i: float, r: float, beta: float, gamma: float, n: float\n) -> Tuple[float, float, float]:\n    \"\"\"The SIR model, one time step.\"\"\"\n    s_n = (-beta * s * i) + s\n    i_n = (beta * s * i - gamma * i) + i\n    r_n = gamma * i + r\n    scale = n / (s_n + i_n + r_n)\n    return s_n * scale, i_n * scale, r_n * scale"
+
+    # these are the formatting instructions
+    response_schemas = [
+        ResponseSchema(name="model_function", description="The name of the function that contains the model dynamics")
+    ]
+
+    # for structured output parsing, converts schema to langchain object
+    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
+
+    # for structured output parsing, makes the instructions to be passed as a variable to the prompt template
+    format_instructions = output_parser.get_format_instructions()
+
+    # low temp, as the task is not generative
+    temperature = 0.0
+
+    # initialize the model
+    openai = ChatOpenAI(
+        temperature=temperature,
+        model_name='gpt-3.5-turbo',
+        openai_api_key=SKEMA_OPENAI_KEY
+    )
+
+    # construct the prompts
+    template="You are an assistant that answers questions about code."
+    system_message_prompt = SystemMessagePromptTemplate.from_template(template)
+    human_template="Find the function that contains the model dynamics in {code} \n{format_instructions}"
+    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
+
+    # combining the templates into a chat template
+    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
+
+    # formatting the prompt with input variables
+    formatted_prompt = chat_prompt.format_prompt(code=code, format_instructions = format_instructions).to_messages()
+
+    # running the model
+    output = openai(formatted_prompt)
+
+    parsed_output = output_parser.parse(output.content)
+
+    assert isinstance(parsed_output['model_function'], str)
\ No newline at end of file

From d0c81466a82244eb64b91474ea403b57260d5541 Mon Sep 17 00:00:00 2001
From: Justin
Date: Mon, 30 Oct 2023 17:01:38 -0400
Subject: [PATCH 6/8] simplified the test

---
 skema/rest/tests/test_llms.py | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/skema/rest/tests/test_llms.py b/skema/rest/tests/test_llms.py
index 12582500128..4a7c7cd38c1 100644
--- a/skema/rest/tests/test_llms.py
+++ b/skema/rest/tests/test_llms.py
@@ -8,6 +8,7 @@
     StructuredOutputParser,
     ResponseSchema
 )
+import langchain.schema
 from skema.rest.proxies import SKEMA_OPENAI_KEY
 
 def test_prompt_construction():
@@ -27,31 +28,17 @@ def test_prompt_construction():
     # for structured output parsing, makes the instructions to be passed as a variable to the prompt template
     format_instructions = output_parser.get_format_instructions()
 
-    # low temp, as the task is not generative
-    temperature = 0.0
-
-    # initialize the model
-    openai = ChatOpenAI(
-        temperature=temperature,
-        model_name='gpt-3.5-turbo',
-        openai_api_key=SKEMA_OPENAI_KEY
-    )
-
     # construct the prompts
     template="You are an assistant that answers questions about code."
     system_message_prompt = SystemMessagePromptTemplate.from_template(template)
     human_template="Find the function that contains the model dynamics in {code} \n{format_instructions}"
     human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
 
     # combining the templates into a chat template
     chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
 
     # formatting the prompt with input variables
     formatted_prompt = chat_prompt.format_prompt(code=code, format_instructions = format_instructions).to_messages()
 
-    # running the model
-    output = openai(formatted_prompt)
-
-    parsed_output = output_parser.parse(output.content)
-
-    assert isinstance(parsed_output['model_function'], str)
\ No newline at end of file
+    assert isinstance(formatted_prompt[0], langchain.schema.messages.SystemMessage)
+    assert isinstance(formatted_prompt[1], langchain.schema.messages.HumanMessage)
\ No newline at end of file

From 74bc5599b0d41aa5a8d0108b11c3b51042421331 Mon Sep 17 00:00:00 2001
From: Justin
Date: Mon, 30 Oct 2023 17:08:59 -0400
Subject: [PATCH 7/8] mocked model output to test parser

---
 skema/rest/tests/test_llms.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/skema/rest/tests/test_llms.py b/skema/rest/tests/test_llms.py
index 4a7c7cd38c1..ae23823813b 100644
--- a/skema/rest/tests/test_llms.py
+++ b/skema/rest/tests/test_llms.py
@@ -40,5 +40,12 @@ def test_prompt_construction():
     # formatting the prompt with input variables
     formatted_prompt = chat_prompt.format_prompt(code=code, format_instructions = format_instructions).to_messages()
 
+    # mocks the output from the model
+    output_mock = langchain.schema.messages.AIMessage(content='```json\n{\n\t"model_function": "sir"\n}\n```', additional_kwargs={}, example=False)
+
+    parsed_output = output_parser.parse(output_mock.content)
+
+    assert isinstance(parsed_output['model_fuction'], str)
     assert isinstance(formatted_prompt[0], langchain.schema.messages.SystemMessage)
-    assert isinstance(formatted_prompt[1], langchain.schema.messages.HumanMessage)
\ No newline at end of file
+    assert isinstance(formatted_prompt[1], langchain.schema.messages.HumanMessage)
+

From 0fbc0c2f93b6cec5419cb9ebda663d1d707f8620 Mon Sep 17 00:00:00 2001
From: Justin
Date: Mon, 30 Oct 2023 18:45:23 -0400
Subject: [PATCH 8/8] fixed misspelled function name

---
 skema/rest/tests/test_llms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skema/rest/tests/test_llms.py b/skema/rest/tests/test_llms.py
index ae23823813b..b95691db217 100644
--- a/skema/rest/tests/test_llms.py
+++ b/skema/rest/tests/test_llms.py
@@ -45,7 +45,7 @@ def test_prompt_construction():
 
     parsed_output = output_parser.parse(output_mock.content)
 
-    assert isinstance(parsed_output['model_fuction'], str)
+    assert isinstance(parsed_output['model_function'], str)
     assert isinstance(formatted_prompt[0], langchain.schema.messages.SystemMessage)
     assert isinstance(formatted_prompt[1], langchain.schema.messages.HumanMessage)
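
A minimal sketch of how a client might exercise the new `/morae/linespan-given-filepaths-zip` endpoint (PATCH 2/8) and consume the `Dynamics` response (PATCH 4/8). The service URL, archive name, source filename, and the 1-based inclusive span convention are illustrative assumptions, not part of the patches.

```
import requests

# Assumed local deployment of the skema REST API; adjust to your instance.
ENDPOINT = "http://localhost:8000"

# Assumed archive containing exactly one .py file (other files, e.g. README.md, are allowed).
with open("model.zip", "rb") as fh:
    response = requests.post(
        f"{ENDPOINT}/morae/linespan-given-filepaths-zip",
        files={"zip_file": fh},
    )
response.raise_for_status()

# e.g. {"name": None, "description": None, "block": ["L10-L21"]}
dynamics = response.json()

# Parse the first "L<begin>-L<end>" span and slice the source file with it.
begin, end = (int(part.lstrip("L")) for part in dynamics["block"][0].split("-"))
with open("model.py") as src:  # assumed name of the single .py inside the zip
    lines = src.readlines()
print("".join(lines[begin - 1 : end]))  # assuming 1-based, inclusive line spans
```

Note that a failed extraction is reported as the span `"L0-L0"`, so a client may want to treat that value as "no dynamics found" rather than a real location.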