reworkd · asim-shrestha · Nov 13, 2023 · Nov 13, 2023
diff --git a/bananalyzer/data/schemas.py b/bananalyzer/data/schemas.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Literal, Optional, Union
 
+import pytest
 from pydantic import BaseModel, Field, model_validator
 
 from bananalyzer.data.fetch_schemas import fetch_schemas
@@ -37,8 +38,18 @@ def eval_action(self, _: str) -> bool:
         # We don't care about action level evaluations
         return True
 
-    def eval_results(self, result: Dict[str, Any]) -> bool:
-        return result == self.expected
+    def eval_results(self, result: Dict[str, Any]) -> None:
+        """Fail the running test unless ``result`` equals ``self.expected``.
+
+        A match returns ``None``; a mismatch calls ``pytest.fail`` with a
+        message that shows both the expected and the actual value.
+        """
+        if result != self.expected:
+            pytest.fail(
+                "JSONEval mismatch:\n"
+                f"Expected: {self.expected}\n"
+                f"Actual: {result}"
+            )
 
 
 class ActionEval(BaseModel):

diff --git a/bananalyzer/runner/runner.py b/bananalyzer/runner/runner.py
@@ -29,7 +29,7 @@ async def test_{example.id.replace("-", "_")}() -> None:
         # The agent is imported into the global context prior to this call 
         result = await agent.run(context, example)
         for curr_eval in example.evals:
-            assert curr_eval.eval_results(result)
+            curr_eval.eval_results(result)
 """,
         example=example,
     )

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "bananalyzer"
-version = "0.2.1"
+version = "0.2.2"
 description = "Open source AI Agent evaluation framework for web tasks 🐒🍌"
 authors = ["asim-shrestha <[email protected]>"]
 readme = "README.md"

diff --git a/tests/data/test_example.py b/tests/data/test_example.py
@@ -12,9 +12,12 @@ def test_json_eval() -> None:
 
     eval = JSONEval(expected=json)
 
-    assert eval.eval_results(json)
-    assert eval.eval_results({"two": "two", "one": "one"})
-    assert not eval.eval_results({"test": "test"})
+    eval.eval_results(json)
+    eval.eval_results({"two": "two", "one": "one"})
+    # pytest.fail.Exception is the public handle for the exception raised by
+    # pytest.fail(), so nothing has to be imported from the private _pytest package.
+    with pytest.raises(pytest.fail.Exception):
+        eval.eval_results({"test": "test"})
 
 
 def create_default_example(