ansari-project · waleedkadous · Jan 20, 2025 · Jan 20, 2025
diff --git a/src/ansari/agents/ansari_workflow.py b/src/ansari/agents/ansari_workflow.py
@@ -53,7 +53,7 @@ class AnsariWorkflow:
 
     """
 
-    def __init__(self, settings, message_logger=None, json_format=False):
+    def __init__(self, settings, message_logger=None, json_format=False, system_prompt_file=None):
         self.settings = settings
         sq = SearchQuran(settings.KALEMAT_API_KEY.get_secret_value())
         sh = SearchHadith(settings.KALEMAT_API_KEY.get_secret_value())
@@ -81,7 +81,8 @@ def __init__(self, settings, message_logger=None, json_format=False):
         }
         self.model = settings.MODEL
         self.pm = PromptMgr()
-        self.sys_msg = self.pm.bind(settings.SYSTEM_PROMPT_FILE_NAME).render()
+        prompt_file = system_prompt_file or settings.SYSTEM_PROMPT_FILE_NAME
+        self.sys_msg = self.pm.bind(prompt_file).render()
         self.tools = [x.get_tool_description() for x in self.tool_name_to_instance.values()]
         self.json_format = json_format
         self.message_logger = message_logger
@@ -149,16 +150,22 @@ def _execute_gen_answer_step(self, step_params, prev_outputs):
             search_results = "\n---\n".join(
                 [prev_outputs[i] for i in step_params["search_results_indices"]],
             )
-        prompt = f"""Using {search_results}, compose a response that:
+        prompt = f""" Consider the following question: '{step_params["input"]}'
+
+            Using the excerpts from tafsirs below, compose a response that:
             1. Directly answers the query of the user
             2. Matches user's language/tone
-            3. Adheres to your system instructions as Ansari."""
+            3. Adheres to your system instructions as Ansari.
+
+            {search_results}
+
+            Reminder: the key question is: '{step_params["input"]}'. 
+            """
         model_response = litellm.completion(
             model=self.model,
             messages=[
                 {"role": "system", "content": self.sys_msg},
-                {"role": "system", "content": prompt},
-                {"role": "user", "content": step_params["input"]},
+                {"role": "user", "content": prompt},
             ],
             stream=False,
             timeout=30.0,

diff --git a/src/ansari/app/main_api.py b/src/ansari/app/main_api.py
@@ -705,6 +705,7 @@ class AyahQuestionRequest(BaseModel):
     ayah: int
     question: str
     augment_question: bool | None = False
+    use_cache: bool | None = True
     apikey: str
 
 
@@ -722,16 +723,20 @@ async def answer_ayah_question(
         raise HTTPException(status_code=401, detail="Unauthorized")
 
     try:
-        # Create AnsariWorkflow instance
-        logging.debug("Creating AnsariWorkflow instance for {req.surah}:{req.ayah}")
-        ansari_workflow = AnsariWorkflow(settings)
+        # Create AnsariWorkflow instance with ayah-specific system prompt
+        logging.debug(f"Creating Ansari Workflow instance for {req.surah}:{req.ayah}")
+        ansari_workflow = AnsariWorkflow(
+            settings,
+            system_prompt_file=settings.AYAH_SYSTEM_PROMPT_FILE_NAME
+        )
 
         ayah_id = req.surah * 1000 + req.ayah
 
         # Check if the answer is already stored in the database
-        stored_answer = db.get_quran_answer(req.surah, req.ayah, req.question)
-        if stored_answer:
-            return {"response": stored_answer}
+        if req.use_cache:
+            stored_answer = db.get_quran_answer(req.surah, req.ayah, req.question)
+            if stored_answer:
+                return {"response": stored_answer}
 
         # Define the workflow steps
         workflow_steps = [
@@ -746,6 +751,8 @@ async def answer_ayah_question(
             ("gen_query", {"input": req.question, "target_corpus": "tafsir"}),
             ("gen_answer", {"input": req.question, "search_results_indices": [0]}),
         ]
+        # If augment_question is False, skip the query generation step to use 
+        # the original question directly
         if not req.augment_question:
             workflow_steps.pop(1)
 

diff --git a/src/ansari/app/main_file.py b/src/ansari/app/main_file.py
@@ -1,9 +1,71 @@
 import typer
+import logging
+from typing import Optional
 
 from ansari.agents import Ansari
 from ansari.config import get_settings
 from ansari.presenters.file_presenter import FilePresenter
+from ansari.presenters.ayah_file_presenter import AyahFilePresenter
+
+logging.basicConfig(
+    level=logging.DEBUG,  # requests DEBUG on the root logger when this module is imported
+   )
+
+def main(
+    input_file: str,
+    output_file: str,
+    ayah_mode: bool = typer.Option(
+        False,
+        "--ayah-mode",
+        "-a",
+        help="Process input as ayah questions (CSV format: surah:ayah,question)",
+    ),
+    use_query_generation: bool = typer.Option(
+        True,
+        "--use-query-generation/--no-use-query-generation",
+        "-q/-Q",
+        help="Use query generation step in ayah mode",
+    ),
+    answer_column: str = typer.Option(
+        "answer",
+        "--answer-column",
+        "-c",
+        help="Name of the column to store answers in the output CSV (ayah mode only)",
+    ),
+    system_message: Optional[str] = typer.Option(
+        None,
+        "--system-message",
+        "-s",
+        help="Name of the system prompt to use. If not provided, uses default.",
+    ),
+):
+    """Process input file and generate answers
+
+    Args:
+        input_file: Path to input file
+        output_file: Path to output file
+        ayah_mode: Whether to process in ayah mode
+        use_query_generation: Whether to use query generation (ayah mode only)
+        answer_column: Name of column to store answers (ayah mode only)
+        system_message: Name of the system prompt to use instead of the mode's default.
+    """
+    settings = get_settings()
+
+    if ayah_mode:
+        if system_message:
+            settings.AYAH_SYSTEM_PROMPT_FILE_NAME = system_message
+        presenter = AyahFilePresenter(
+            settings=settings,
+            use_query_generation=use_query_generation,
+            answer_column=answer_column,
+        )
+    else:
+        if system_message:
+            # NOTE(review): assumes Ansari reads SYSTEM_PROMPT_FILE_NAME - confirm.
+            settings.SYSTEM_PROMPT_FILE_NAME = system_message
+        presenter = FilePresenter(Ansari(settings))
+
+    presenter.present(input_file, output_file)
 
 if __name__ == "__main__":
-    ansari = Ansari(get_settings())
-    typer.run(FilePresenter(ansari).present)
+    typer.run(main)
diff --git a/src/ansari/config.py b/src/ansari/config.py
@@ -135,6 +135,7 @@ def get_resource_path(filename):
     MAX_TOOL_TRIES: int = Field(default=3)
     MAX_FAILURES: int = Field(default=1)
     SYSTEM_PROMPT_FILE_NAME: str = Field(default="system_msg_tool")
+    AYAH_SYSTEM_PROMPT_FILE_NAME: str = Field(default="system_msg_ayah")
     PROMPT_PATH: str = Field(default=str(get_resource_path("prompts")))
 
     LOGGING_LEVEL: str = Field(default="INFO")

diff --git a/src/ansari/presenters/ayah_file_presenter.py b/src/ansari/presenters/ayah_file_presenter.py
@@ -0,0 +1,136 @@
+import csv
+import logging
+import os
+from typing import List, Dict, Tuple
+
+
+from ansari.agents.ansari_workflow import AnsariWorkflow
+from ansari.config import get_settings
+
+
+class AyahFilePresenter:
+    """Answer ayah-specific questions read from a CSV file and write them to a CSV.
+
+    The first non-blank row is the header. In every later row the first column
+    is a ``surah:ayah`` reference and the second column is the question.
+    """
+
+    def __init__(self, settings, use_query_generation: bool = False, answer_column: str = "answer"):
+        """Store the settings, the query-generation switch and the answer column name."""
+        self.settings = settings
+        self.use_query_generation = use_query_generation
+        self.answer_column = answer_column
+
+    def _parse_ayah_reference(self, ayah_ref: str) -> Tuple[int, int]:
+        """Parse a surah:ayah reference into separate numbers.
+
+        Args:
+            ayah_ref: String in format "surah:ayah"
+
+        Returns:
+            Tuple of (surah_num, ayah_num)
+
+        Raises:
+            ValueError: If format is invalid or empty
+        """
+        if not ayah_ref or not ayah_ref.strip():
+            raise ValueError("Empty ayah reference")
+
+        try:
+            surah_str, ayah_str = ayah_ref.strip().split(":")
+            return int(surah_str), int(ayah_str)
+        except ValueError as err:
+            raise ValueError(f"Invalid ayah reference format: {ayah_ref}. Expected format: surah:ayah (e.g. 1:1)") from err
+
+    def _answer_question(self, surah: int, ayah: int, question: str):
+        """Run the search/answer workflow for one question and return its last output."""
+        # Create a new workflow instance for each question
+        workflow = AnsariWorkflow(
+            self.settings,
+            system_prompt_file=self.settings.AYAH_SYSTEM_PROMPT_FILE_NAME,
+        )
+        ayah_id = surah * 1000 + ayah
+        workflow_steps = [
+            (
+                "search",
+                {
+                    "query": question,
+                    "tool_name": "search_tafsir",
+                    "metadata_filter": f"part.from_ayah_int<={ayah_id} AND part.to_ayah_int>={ayah_id}",
+                },
+            ),
+        ]
+        if self.use_query_generation:
+            workflow_steps.append(("gen_query", {"input": question, "target_corpus": "tafsir"}))
+        workflow_steps.append(("gen_answer", {"input": question, "search_results_indices": [0]}))
+        # The answer is the last item in the workflow output
+        return workflow.execute_workflow(workflow_steps)[-1]
+
+    def _with_answer(self, row: List[str], header: List[str], value: str) -> List[str]:
+        """Return ``row`` aligned to ``header`` with ``value`` in the answer column."""
+        # Pad short rows so the answer cannot land under the wrong header.
+        aligned = row + [""] * (len(header) - len(row))
+        if self.answer_column in header:
+            aligned[header.index(self.answer_column)] = value
+        else:
+            aligned.append(value)
+        return aligned
+
+    def present(self, input_file_path: str, output_file_path: str):
+        """Answer every question in the input CSV and write each row plus its answer.
+
+        Args:
+            input_file_path: CSV with a header row of at least two columns.
+            output_file_path: CSV to write; the answer column is added to the
+                header when it is not already there.
+
+        Errors are logged, not raised. A row that fails gets ``ERROR: <message>``
+        as its answer.
+        """
+        try:
+            # utf-8-sig reads UTF-8 with or without a BOM, whatever the locale default is.
+            with open(input_file_path, newline="", encoding="utf-8-sig") as input_file:
+                reader = csv.reader(input_file)
+                # Take the header off the reader so it is not answered as a question.
+                header = next((r for r in reader if any(cell.strip() for cell in r)), None)
+                if header is None:
+                    logging.error("Empty input file")
+                    return
+                if len(header) < 2:
+                    logging.error("Input CSV must contain at least two columns")
+                    return
+
+                # Create fieldnames, preserving original names
+                fieldnames = header
+                if self.answer_column not in fieldnames:
+                    fieldnames = fieldnames + [self.answer_column]
+
+                with open(output_file_path, "w", newline="", encoding="utf-8") as output_file:
+                    writer = csv.writer(output_file)
+                    writer.writerow(fieldnames)
+
+                    for row in reader:
+                        # Skip empty lines
+                        if not any(row):
+                            continue
+
+                        try:
+                            # Validate required fields
+                            if len(row) < 2 or not row[0] or not row[1]:
+                                raise ValueError("Missing required fields in first or second column")
+                            surah, ayah = self._parse_ayah_reference(row[0])
+                            question = row[1].strip()
+                            print(f"Processing surah {surah}, ayah {ayah}, question: {question}")
+                            answer = self._answer_question(surah, ayah, question)
+                        except Exception as e:
+                            # Best effort: record the failure as the answer and keep going.
+                            logging.error(f"Error processing row: {e}")
+                            answer = f"ERROR: {str(e)}"
+
+                        writer.writerow(self._with_answer(row, header, answer))
+                        output_file.flush()
+
+            print(f"Results saved to {os.path.abspath(output_file_path)}")
+
+        except Exception as e:
+            logging.error(f"Error processing file: {e}")
diff --git a/src/ansari/resources/prompts/system_msg_ayah.txt b/src/ansari/resources/prompts/system_msg_ayah.txt
@@ -0,0 +1,30 @@
+You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer 
+Quran-related questions with accuracy and depth. 
+
+Fluent in languages such as Arabic (including transliteration), 
+Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, 
+craft precise, evidence-based responses exclusively from the Sunni tradition. 
+
+Here's how you work: You receive an ayah and a question along with the
+desired response language and search results from any tafsirs available. 
+Currently that includes Ibn Kathir. 
+
+If you attribute a statement or opinion to a scholar, you will include EXACTLY
+the sentence in which the mufassir says so. 
+
+If you say there is a hadith that says something, you will include the hadith 
+EXACTLY as it was in the source text.  
+
+Quoting from the source material is highly recommended when attributing 
+statements or opinions to scholars or hadith, especially when the source text is 
+weak or unverified.
+
+Crucially, only attribute specific statements or opinions to these scholars if you 
+have specific referenceable evidence to support that attribution. When referencing 
+the Quran, you, Ansari, include the ayah number, Arabic text, and translation 
+(if the user's language is different from Arabic). 
+
+The person reading your answer is a well-informed scholar. You may use terms
+that an informed scholar would use. Make sure you target your answer at an
+informed scholar.
+
diff --git a/src/ansari/resources/prompts/system_msg_ayah_lay.txt b/src/ansari/resources/prompts/system_msg_ayah_lay.txt
@@ -0,0 +1,30 @@
+You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer 
+Quran-related questions with accuracy and depth. 
+
+Fluent in languages such as Arabic (including transliteration), 
+Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, 
+craft precise, evidence-based responses exclusively from the Sunni tradition. 
+
+Here's how you work: You receive an ayah and a question along with the
+desired response language and search results from any tafsirs available. 
+Currently that includes Ibn Kathir. 
+
+If you attribute a statement or opinion to a scholar, you will include EXACTLY
+the sentence in which the mufassir says so. 
+
+If you say there is a hadith that says something, you will include the hadith 
+EXACTLY as it was in the source text.  
+
+Quoting from the source material is highly recommended when attributing 
+statements or opinions to scholars or hadith, especially when the source text is 
+weak or unverified.
+
+Crucially, only attribute specific statements or opinions to these scholars if you 
+have specific referenceable evidence to support that attribution. When referencing 
+the Quran, you, Ansari, include the ayah number, Arabic text, and translation 
+(if the user's language is different from Arabic). 
+
+The person reading your answer is a general member of the public who 
+may or may not be a Muslim. Assume the reader only has a basic knowledge of
+Islam. 
+