diff --git a/src/ansari/agents/ansari.py b/src/ansari/agents/ansari.py index 395bcb0..ebe3aae 100644 --- a/src/ansari/agents/ansari.py +++ b/src/ansari/agents/ansari.py @@ -112,7 +112,10 @@ def _debug_log_truncated_message_history(self, message_history, count: int, fail trunc_msg_hist[0]["content"] = sys_p[:15] + "..." logger.info( - f"Process attempt #{count+failures+1} of this message history:\n" + "-" * 60 + f"\n{trunc_msg_hist}\n" + "-" * 60, + f"Process attempt #{count + failures + 1} of this message history:\n" + + "-" * 60 + + f"\n{trunc_msg_hist}\n" + + "-" * 60, ) @observe(capture_input=False, capture_output=False) diff --git a/src/ansari/agents/ansari_workflow.py b/src/ansari/agents/ansari_workflow.py index 755660b..7061749 100644 --- a/src/ansari/agents/ansari_workflow.py +++ b/src/ansari/agents/ansari_workflow.py @@ -53,7 +53,7 @@ class AnsariWorkflow: """ - def __init__(self, settings, message_logger=None, json_format=False): + def __init__(self, settings, message_logger=None, json_format=False, system_prompt_file=None): self.settings = settings sq = SearchQuran(settings.KALEMAT_API_KEY.get_secret_value()) sh = SearchHadith(settings.KALEMAT_API_KEY.get_secret_value()) @@ -81,7 +81,8 @@ def __init__(self, settings, message_logger=None, json_format=False): } self.model = settings.MODEL self.pm = PromptMgr() - self.sys_msg = self.pm.bind(settings.SYSTEM_PROMPT_FILE_NAME).render() + prompt_file = system_prompt_file or settings.SYSTEM_PROMPT_FILE_NAME + self.sys_msg = self.pm.bind(prompt_file).render() self.tools = [x.get_tool_description() for x in self.tool_name_to_instance.values()] self.json_format = json_format self.message_logger = message_logger @@ -149,16 +150,22 @@ def _execute_gen_answer_step(self, step_params, prev_outputs): search_results = "\n---\n".join( [prev_outputs[i] for i in step_params["search_results_indices"]], ) - prompt = f"""Using {search_results}, compose a response that: + prompt = f""" Consider the following question: 
'{step_params["input"]}' + + Using the excerpts from tafsirs below, compose a response that: 1. Directly answers the query of the user 2. Matches user's language/tone - 3. Adheres to your system instructions as Ansari.""" + 3. Adheres to your system instructions as Ansari. + + {search_results} + + Reminder: the key question is: '{step_params["input"]}'. + """ model_response = litellm.completion( model=self.model, messages=[ {"role": "system", "content": self.sys_msg}, - {"role": "system", "content": prompt}, - {"role": "user", "content": step_params["input"]}, + {"role": "user", "content": prompt}, ], stream=False, timeout=30.0, diff --git a/src/ansari/app/main_api.py b/src/ansari/app/main_api.py index 46ca5b4..08d66f9 100644 --- a/src/ansari/app/main_api.py +++ b/src/ansari/app/main_api.py @@ -705,6 +705,7 @@ class AyahQuestionRequest(BaseModel): ayah: int question: str augment_question: bool | None = False + use_cache: bool | None = True apikey: str @@ -722,16 +723,17 @@ async def answer_ayah_question( raise HTTPException(status_code=401, detail="Unauthorized") try: - # Create AnsariWorkflow instance - logging.debug("Creating AnsariWorkflow instance for {req.surah}:{req.ayah}") - ansari_workflow = AnsariWorkflow(settings) + # Create AnsariWorkflow instance with ayah-specific system prompt + logging.debug(f"Creating Ansari Workflow instance for {req.surah}:{req.ayah}") + ansari_workflow = AnsariWorkflow(settings, system_prompt_file=settings.AYAH_SYSTEM_PROMPT_FILE_NAME) ayah_id = req.surah * 1000 + req.ayah # Check if the answer is already stored in the database - stored_answer = db.get_quran_answer(req.surah, req.ayah, req.question) - if stored_answer: - return {"response": stored_answer} + if req.use_cache: + stored_answer = db.get_quran_answer(req.surah, req.ayah, req.question) + if stored_answer: + return {"response": stored_answer} # Define the workflow steps workflow_steps = [ @@ -746,6 +748,8 @@ async def answer_ayah_question( ("gen_query", {"input": 
req.question, "target_corpus": "tafsir"}), ("gen_answer", {"input": req.question, "search_results_indices": [0]}), ] + # If augment_question is False, skip the query generation step to use + # the original question directly if not req.augment_question: workflow_steps.pop(1) diff --git a/src/ansari/app/main_file.py b/src/ansari/app/main_file.py index 6008630..4736107 100644 --- a/src/ansari/app/main_file.py +++ b/src/ansari/app/main_file.py @@ -1,9 +1,81 @@ import typer +import logging +from typing import Optional from ansari.agents import Ansari from ansari.config import get_settings from ansari.presenters.file_presenter import FilePresenter +from ansari.presenters.ayah_file_presenter import AyahFilePresenter + +logging.basicConfig( + level=logging.DEBUG, +) + + +def main( + input_file: str, + output_file: str, + ayah_mode: bool = typer.Option( + False, + "--ayah-mode", + "-a", + help="Process input as ayah questions (CSV format: surah:ayah,question)", + ), + use_query_generation: bool = typer.Option( + True, + "--use-query-generation", + "-q", + help="Use query generation step in ayah mode", + ), + answer_column: str = typer.Option( + "answer", + "--answer-column", + "-c", + help="Name of the column to store answers in the output CSV (ayah mode only)", + ), + system_message: Optional[str] = typer.Option( + None, + "--system-message", + "-s", + help="The name of the system message file. If not provided, uses default.", + ), + model: str = typer.Option( + "gpt-4", + "--model", + "-m", + help="The LLM model to use (e.g., gpt-4, gpt-3.5-turbo)", + ), +): + """ + Process input file and generate answers + + Args: + input_file: Path to input file + output_file: Path to output file + ayah_mode: Whether to process in ayah mode + use_query_generation: Whether to use query generation + answer_column: Name of column to store answers + system_message: The name of the system message file. If not provided, uses default. 
+ model: The LLM model to use for generating answers + """ + settings = get_settings() + + if system_message: + settings.AYAH_SYSTEM_PROMPT_FILE_NAME = system_message + + # Set the model in settings + settings.MODEL = model + + if ayah_mode: + presenter = AyahFilePresenter( + settings=settings, use_query_generation=use_query_generation, answer_column=answer_column + ) + else: + ansari = Ansari(settings) + presenter = FilePresenter(ansari) + + presenter.present(input_file, output_file) + if __name__ == "__main__": - ansari = Ansari(get_settings()) - typer.run(FilePresenter(ansari).present) + typer.run(main) diff --git a/src/ansari/config.py b/src/ansari/config.py index 41e1e01..b60f9da 100644 --- a/src/ansari/config.py +++ b/src/ansari/config.py @@ -108,7 +108,7 @@ def get_resource_path(filename): { "name": "query", "type": "string", - "description": "The topic to search for in Tafsir Ibn Kathir. " "You will translate this query into English.", + "description": "The topic to search for in Tafsir Ibn Kathir. 
You will translate this query into English.", }, ], ) @@ -135,6 +135,7 @@ def get_resource_path(filename): MAX_TOOL_TRIES: int = Field(default=3) MAX_FAILURES: int = Field(default=1) SYSTEM_PROMPT_FILE_NAME: str = Field(default="system_msg_tool") + AYAH_SYSTEM_PROMPT_FILE_NAME: str = Field(default="system_msg_ayah") PROMPT_PATH: str = Field(default=str(get_resource_path("prompts"))) LOGGING_LEVEL: str = Field(default="INFO") diff --git a/src/ansari/presenters/ayah_file_presenter.py b/src/ansari/presenters/ayah_file_presenter.py new file mode 100644 index 0000000..8b14d7c --- /dev/null +++ b/src/ansari/presenters/ayah_file_presenter.py @@ -0,0 +1,129 @@ +import csv +import logging +import os +from typing import Tuple + +from ansari.agents.ansari_workflow import AnsariWorkflow + + +class AyahFilePresenter: + def __init__(self, settings, use_query_generation: bool = False, answer_column: str = "answer"): + self.settings = settings + self.use_query_generation = use_query_generation + self.answer_column = answer_column + + def _parse_ayah_reference(self, ayah_ref: str) -> Tuple[int, int]: + """Parse a surah:ayah reference into separate numbers. + + Args: + ayah_ref: String in format "surah:ayah" + + Returns: + Tuple of (surah_num, ayah_num) + + Raises: + ValueError: If format is invalid or empty + """ + if not ayah_ref or not ayah_ref.strip(): + raise ValueError("Empty ayah reference") + + try: + surah_str, ayah_str = ayah_ref.strip().split(":") + return int(surah_str), int(ayah_str) + except ValueError: + raise ValueError(f"Invalid ayah reference format: {ayah_ref}. Expected format: surah:ayah (e.g. 
1:1)") + + def present(self, input_file_path: str, output_file_path: str): + try: + # First pass: read header to get all field names + with open(input_file_path, newline="") as input_file: + # Skip empty lines and get header + for line in input_file: + if line.strip(): # First non-empty line is header + reader = csv.reader([line]) + header = next(reader) + if len(header) < 2: + logging.error("Input CSV must contain at least two columns") + return + break + else: + logging.error("Empty input file") + return + + # Create fieldnames, preserving original names + fieldnames = header + if self.answer_column not in fieldnames: + fieldnames = fieldnames + [self.answer_column] + + # Second pass: process all rows + with open(input_file_path, newline="") as input_file: + reader = csv.reader(input_file) + + # Open output file and write + with open(output_file_path, "w", newline="") as output_file: + writer = csv.writer(output_file) + writer.writerow(fieldnames) + + for row in reader: + # Skip empty lines + if not any(row): + continue + + try: + # Get values from first and second columns using column positions + ayah_ref = row[0] + question = row[1] + + # Validate required fields + if not ayah_ref or not question: + raise ValueError("Missing required fields in first or second column") + + surah, ayah = self._parse_ayah_reference(ayah_ref) + question = question.strip() + + print(f"Processing surah {surah}, ayah {ayah}, question: {question}") + + # Create a new workflow instance for each question + workflow = AnsariWorkflow( + self.settings, system_prompt_file=self.settings.AYAH_SYSTEM_PROMPT_FILE_NAME + ) + + ayah_id = surah * 1000 + ayah + workflow_steps = [ + ( + "search", + { + "query": question, + "tool_name": "search_tafsir", + "metadata_filter": f"part.from_ayah_int<={ayah_id} AND part.to_ayah_int>={ayah_id}", + }, + ), + ] + + if self.use_query_generation: + workflow_steps.append(("gen_query", {"input": question, "target_corpus": "tafsir"})) + + 
workflow_steps.append(("gen_answer", {"input": question, "search_results_indices": [0]})) + + # Execute the workflow + workflow_output = workflow.execute_workflow(workflow_steps) + # The answer is the last item in the workflow output + answer = workflow_output[-1] + + # Add answer to row and write + row.append(answer) + writer.writerow(row) + output_file.flush() + + except Exception as e: + logging.error(f"Error processing row: {e}") + row.append(f"ERROR: {str(e)}") + writer.writerow(row) + output_file.flush() + continue + + print(f"Results saved to {os.path.abspath(output_file_path)}") + + except Exception as e: + logging.error(f"Error processing file: {e}") + return diff --git a/src/ansari/resources/prompts/system_msg_ayah.txt b/src/ansari/resources/prompts/system_msg_ayah.txt new file mode 100644 index 0000000..ec82f45 --- /dev/null +++ b/src/ansari/resources/prompts/system_msg_ayah.txt @@ -0,0 +1,35 @@ +You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer +Quran-related questions with accuracy and depth. + +Fluent in languages such as Arabic (including transliteration), +Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, +craft precise, evidence-based responses exclusively from the Sunni tradition. + +Here's how you work: You receive an ayah and a question along with the +desired response language and search results from any tafsirs available. +Currently that includes Ibn Kathir. + +If you attribute a statement or opinion to a scholar, you will include EXACTLY +the sentence in which the mufassir says so. + +If you say there is a hadith that says something, you will include the hadith +EXACTLY as it was in the source text. + +Quoting from the source material is highly recommended when attributing +statements or opinions to scholars or hadith, especially when the source text is +weak or unverified.
+ +Crucially, only attribute specific statements or opinions to these scholars if you +have specific referenceable evidence to support that attribution. When referencing +the Quran, you, Ansari, include the ayah number, Arabic text, and translation +(if the user's language is different from Arabic). + +If you provide a translation, include the name of the translation (e.g. Saheeh +International). Generally Ibn Kathir uses Saheeh International. + +The person reading your answer is a well informed scholar. You may use terms +that an informed scholar would use. You should use more citations and references +than a general member of the public would. + + + diff --git a/src/ansari/resources/prompts/system_msg_ayah_lay.txt b/src/ansari/resources/prompts/system_msg_ayah_lay.txt new file mode 100644 index 0000000..a244d79 --- /dev/null +++ b/src/ansari/resources/prompts/system_msg_ayah_lay.txt @@ -0,0 +1,32 @@ +You are Ansari Qur'an, a multilingual Islamic digital assistant designed to answer +Quran-related questions with accuracy and depth. + +Fluent in languages such as Arabic (including transliteration), +Bahasa, Bosnian, French, Turkish, Urdu, and more, you, Ansari, +craft precise, evidence-based responses exclusively from the Sunni tradition. + +Here's how you work: You receive an ayah and a question along with the +desired response language and search results from any tafsirs available. +Currently that includes Ibn Kathir. + +If you attribute a statement or opinion to a scholar, you will include EXACTLY +the sentence in which the mufassir says so. + +If you say there is a hadith that says something, you will include the hadith +EXACTLY as it was in the source text. + +Quoting from the source material is highly recommended when attributing +statements or opinions to scholars or hadith, especially when the source text is +weak or unverified.
+ +Crucially, only attribute specific statements or opinions to these scholars if you +have specific referenceable evidence to support that attribution. When referencing +the Quran, you, Ansari, include the ayah number, Arabic text, and translation +(if the user's language is different from Arabic). + +If you provide a translation, include the name of the translation (e.g. Saheeh +International). Generally Ibn Kathir uses Saheeh International. +The person reading your answer is a general member of the public who +may or may not be a Muslim. Assume the reader only has a basic knowledge of +Islam. + diff --git a/tests/test_answer_quality.py b/tests/test_answer_quality.py index 794e90e..2c25139 100644 --- a/tests/test_answer_quality.py +++ b/tests/test_answer_quality.py @@ -21,11 +21,11 @@ def data(): def answer_question(question, q_temp, cache): - logger.info(f'Answering question: {question["question"]}') + logger.info(f"Answering question: {question['question']}") options = [o.strip() for o in question["options"].split(",")] prompt = q_temp.render(question=question["question"], options=options) if prompt in cache.keys(): - logger.info(f'Found {question["question"]} in cache') + logger.info(f"Found {question['question']} in cache") return cache[prompt] ansari = Ansari(get_settings()) result = "".join(filter(lambda x: x is not None, ansari.process_input(prompt))) diff --git a/tests/test_main_api.py b/tests/test_main_api.py index fac41ee..859ee99 100644 --- a/tests/test_main_api.py +++ b/tests/test_main_api.py @@ -501,7 +501,13 @@ async def test_cors(): email = f"{base}+{uuid.uuid4()}@{domain}" response = client.post( "/api/v2/users/register", - headers={"origin": disallowed_origin}, + headers={ + "origin": disallowed_origin, + # Testserver bypasses CORS check + # Hence we need to explicitly set host + # to not-testserver + "host": "not-testserver", + }, json={ "email": email, "password": valid_password,