Commit

MAIN-2806: Enable-AI-Output-Customization (#252)

nherment authored Jan 28, 2025
1 parent 567d528 commit c58da9b
Showing 207 changed files with 3,365 additions and 1,522 deletions.
4 changes: 2 additions & 2 deletions holmes/core/investigation.py
@@ -8,7 +8,6 @@
from holmes.core.supabase_dal import SupabaseDal
from holmes.utils.robusta import load_robusta_api_key


def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config):
load_robusta_api_key(dal=dal, config=config)
context = dal.get_issue_data(
@@ -37,12 +36,13 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
issue,
prompt=investigate_request.prompt_template,
post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT,
sections=investigate_request.sections,
instructions=resource_instructions,
global_instructions=global_instructions
)

return InvestigationResult(
analysis=investigation.result,
sections=investigation.sections,
tool_calls=investigation.tool_calls or [],
instructions=investigation.instructions,
)
45 changes: 45 additions & 0 deletions holmes/core/investigation_structured_output.py
@@ -0,0 +1,45 @@
from typing import Any, Dict

DEFAULT_SECTIONS = {
"Alert Explanation": "1-2 sentences explaining the alert itself - note don't say \"The alert indicates a warning event related to a Kubernetes pod doing blah\" rather just say \"The pod XYZ did blah\" because that is what the user actually cares about",
"Investigation": "what you checked and found",
"Conclusions and Possible Root causes": "what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains",
"Next Steps": "what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)",
"Related logs": "Truncate and share the most relevant logs, especially if these explain the root cause. For example: \nLogs from pod robusta-holmes:\n```\n<logs>```\n. Always embed the surrounding +/- 5 log lines around any relevant logs. ",
"App or Infra?": "Explain whether the issue is more likely an infrastructure or an application level issue and why you think that.",
"External links": "Provide links to external sources. Where to look when investigating this issue. For example provide links to relevant runbooks, etc. Add a short sentence describing each link."
}

def get_output_format_for_investigation(sections: Dict[str, str]) -> Dict[str, Any]:

properties = {}
required_fields = []

for title, description in sections.items():
properties[title] = {
"type": ["string", "null"],
"description": description
}
required_fields.append(title)

schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"required": required_fields,
"properties": properties,
"additionalProperties": False
}

output_format = { "type": "json_schema", "json_schema": { "name": "InvestigationResult", "schema": schema, "strict": False} }

return output_format

def combine_sections(sections: Any) -> str:
if isinstance(sections, dict):
content = ''
for section_title, section_content in sections.items():
if section_content:
# content = content + f'\n# {" ".join(section_title.split("_")).title()}\n{section_content}'
content = content + f'\n# {section_title}\n{section_content}\n'
return content
return f"{sections}"
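
A minimal sketch of how these new helpers can be exercised (assuming the holmes package is importable; the custom section titles below are illustrative, not part of this commit):

```python
from holmes.core.investigation_structured_output import (
    DEFAULT_SECTIONS,
    get_output_format_for_investigation,
)

# Hypothetical caller-supplied sections (title -> description of expected content).
custom_sections = {
    "Summary": "one or two sentences describing what happened",
    "Remediation": "concrete commands or steps to fix the issue",
}

output_format = get_output_format_for_investigation(custom_sections)
schema = output_format["json_schema"]["schema"]

# Every section becomes a nullable string property and is listed as required,
# so the model must address each section (possibly with null).
assert schema["required"] == ["Summary", "Remediation"]
assert schema["properties"]["Summary"]["type"] == ["string", "null"]

# When no sections are supplied, investigate() falls back to DEFAULT_SECTIONS.
print(list(DEFAULT_SECTIONS))
```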
4 changes: 3 additions & 1 deletion holmes/core/models.py
@@ -1,11 +1,12 @@
from holmes.core.tool_calling_llm import ToolCallResult
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, model_validator
from enum import Enum


class InvestigationResult(BaseModel):
analysis: Optional[str] = None
sections: Optional[Dict[str, Union[str, None]]] = None
tool_calls: List[ToolCallResult] = []
instructions: List[str] = []

@@ -20,6 +21,7 @@ class InvestigateRequest(BaseModel):
include_tool_calls: bool = False
include_tool_call_results: bool = False
prompt_template: str = "builtin://generic_investigation.jinja2"
sections: Optional[Dict[str, str]] = None
# TODO in the future
# response_handler: ...

43 changes: 29 additions & 14 deletions holmes/core/tool_calling_llm.py
@@ -2,13 +2,12 @@
import json
import logging
import textwrap
from typing import List, Optional, Dict
from typing import List, Optional, Dict, Type, Union
from holmes.core.investigation_structured_output import DEFAULT_SECTIONS, get_output_format_for_investigation, combine_sections
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.plugins.prompts import load_and_render_prompt
from typing import List, Optional
from holmes.core.llm import LLM
from holmes.plugins.prompts import load_and_render_prompt
from openai import BadRequestError
from openai._types import NOT_GIVEN
from openai.types.chat.chat_completion_message_tool_call import (
@@ -31,6 +30,7 @@ class ToolCallResult(BaseModel):

class LLMResult(BaseModel):
tool_calls: Optional[List[ToolCallResult]] = None
sections: Optional[Dict[str, Union[str, None]]] = None
result: Optional[str] = None
unprocessed_result: Optional[str] = None
instructions: List[str] = []
@@ -57,12 +57,10 @@ class ResourceInstructionDocument(BaseModel):
class Instructions(BaseModel):
instructions: List[str] = []


class ResourceInstructions(BaseModel):
instructions: List[str] = []
documents: List[ResourceInstructionDocument] = []


class ToolCallingLLM:

llm: LLM
@@ -77,7 +75,7 @@ def prompt_call(
system_prompt: str,
user_prompt: str,
post_process_prompt: Optional[str] = None,
response_format: Optional[dict] = None,
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
) -> LLMResult:
messages = [
{"role": "system", "content": system_prompt},
@@ -91,7 +89,7 @@ def messages_call(
self,
messages: List[Dict[str, str]],
post_process_prompt: Optional[str] = None,
response_format: Optional[dict] = None,
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
) -> LLMResult:

return self.call(messages, post_process_prompt, response_format)
@@ -100,7 +98,7 @@ def call(
self,
messages: List[Dict[str, str]],
post_process_prompt: Optional[str] = None,
response_format: dict = None,
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
user_prompt: Optional[str] = None,
) -> LLMResult:
perf_timing = PerformanceTiming("tool_calling_llm.call")
@@ -160,13 +158,24 @@ def call(
)

tools_to_call = getattr(response_message, "tool_calls", None)
text_response = response_message.content
sections:Optional[Dict[str, str]] = None
if isinstance(text_response, str):
try:
parsed_json = json.loads(text_response)
text_response = parsed_json
except json.JSONDecodeError:
pass
if not isinstance(text_response, str):
sections = text_response
text_response = combine_sections(sections)

if not tools_to_call:
# For chatty models post process and summarize the result
# this only works for calls where user prompt is explicitly passed through
if post_process_prompt and user_prompt:
logging.info(f"Running post processing on investigation.")
raw_response = response_message.content
raw_response = text_response
post_processed_response = self._post_processing_call(
prompt=user_prompt,
investigation=raw_response,
@@ -176,6 +185,7 @@ def call(
perf_timing.end()
return LLMResult(
result=post_processed_response,
sections=sections,
unprocessed_result=raw_response,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
Expand All @@ -184,7 +194,8 @@ def call(

perf_timing.end()
return LLMResult(
result=response_message.content,
result=text_response,
sections=sections,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
messages=messages,
@@ -280,7 +291,7 @@ def _post_processing_call(
full_response = self.llm.completion(messages=messages, temperature=0)
logging.debug(f"Post processing response {full_response}")
return full_response.choices[0].message.content
except Exception as error:
except Exception:
logging.exception("Failed to run post processing", exc_info=True)
return investigation

@@ -347,10 +358,14 @@ def investigate(
console: Optional[Console] = None,
global_instructions: Optional[Instructions] = None,
post_processing_prompt: Optional[str] = None,
sections: Optional[Dict[str, str]] = None
) -> LLMResult:
runbooks = self.runbook_manager.get_instructions_for_issue(issue)

if instructions != None and instructions.instructions:
if not sections or len(sections) == 0:
sections = DEFAULT_SECTIONS

if instructions is not None and instructions.instructions:
runbooks.extend(instructions.instructions)

if console and runbooks:
@@ -361,7 +376,7 @@ def investigate(
console.print(
"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]"
)
system_prompt = load_and_render_prompt(prompt, {"issue": issue})
system_prompt = load_and_render_prompt(prompt, {"issue": issue, "sections": sections})

if instructions != None and len(instructions.documents) > 0:
docPrompts = []
@@ -388,6 +403,6 @@ def investigate(
)
logging.debug("Rendered user prompt:\n%s", textwrap.indent(user_prompt, " "))

res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt)
res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt, response_format=get_output_format_for_investigation(sections))
res.instructions = runbooks
return res
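
The parsing added to call() degrades gracefully: if the model returns JSON matching the schema, the per-section dict is kept on LLMResult.sections and flattened into markdown via combine_sections; if it returns plain text, sections stays None and the text is used as-is. A small sketch of that behaviour (the reply strings are made up):

```python
import json

from holmes.core.investigation_structured_output import combine_sections

# Structured reply: parsed JSON becomes the sections dict; empty/null sections are skipped
# and each remaining one is rendered under a "# <title>" markdown heading.
structured_reply = '{"Alert Explanation": "Pod api-7d9f was OOM-killed.", "Related logs": null}'
sections = json.loads(structured_reply)
print(combine_sections(sections))
# "\n# Alert Explanation\nPod api-7d9f was OOM-killed.\n"

# Plain-text reply: json.loads() fails, so the text is kept unchanged and no sections are set.
plain_reply = "The pod was OOM-killed; raise its memory limit."
try:
    parsed = json.loads(plain_reply)
except json.JSONDecodeError:
    parsed = plain_reply
print(parsed if isinstance(parsed, str) else combine_sections(parsed))
```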
6 changes: 4 additions & 2 deletions holmes/plugins/prompts/_general_instructions.jinja2
@@ -13,6 +13,7 @@ In general:
* if you don't know, say that the analysis was inconclusive.
* if there are multiple possible causes list them in a numbered list.
* there will often be errors in the data that are not relevant or that do not have an impact - ignore them in your conclusion if you were not able to tie them to an actual error.
* Always check a pod's logs when checking if it is healthy. Don't assume that because the pod is running and reports healthy, it is running without issues.

If investigating Kubernetes problems:
* run as many kubectl commands as you need to gather more information, then respond.
@@ -21,6 +22,7 @@ If investigating Kubernetes problems:
* if the user wants to find a specific term in a pod's logs, use kubectl_logs_grep
* use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream
* when investigating a pod that crashed or application errors, always run kubectl_describe and fetch the logs
* if a pod has multiple containers, make sure you fetch the logs for either all containers or the relevant ones, using one of the container log functions like kubectl_logs_all_containers, kubectl_logs_all_containers_grep or any other.
* do not give an answer like "The pod is pending" as that doesn't state why the pod is pending and how to fix it.
* do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
* if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
@@ -44,7 +46,7 @@ If Helm tools are unavailable, skip this step.
- Check for a cluster role with <RELEASE_NAME>-holmes-cluster-role in its name and a service account with <RELEASE_NAME>-holmes-service-account in its name to troubleshoot missing permissions where release name is the name you found earlier if helm tools are available (If the exact cluster role or service account isn't found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster.)
- Focus on identifying absent permissions that align with the error message.
4. **Update the Configuration**
If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
```
holmes:
customClusterRoleRules:
@@ -64,7 +66,7 @@ Reminder:
* Strive for thoroughness and precision, ensuring the issue is fully addressed.

Special cases and how to reply:
* if you are unable to investigate something properly because you do not have tools that let you access the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* if you lack tools to access the right data or don't have access to a system, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
* as a special case of that, If a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.
* that is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. in that case, you explain say that permission errors are the cause of the problem and give details
4 changes: 1 addition & 3 deletions holmes/plugins/prompts/generic_ask_conversation.jinja2
@@ -6,10 +6,8 @@ If you have a good and concrete suggestion for how the user can fix something, t

Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.


{% include '_general_instructions.jinja2' %}


Style guide:
* Reply with terse output.
* Be painfully concise.
@@ -29,4 +27,4 @@ Relevant logs:
2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body
```

Validation error led to unhandled Java exception causing a crash.
Validation error led to unhandled Java exception causing a crash.
19 changes: 5 additions & 14 deletions holmes/plugins/prompts/generic_investigation.jinja2
@@ -32,18 +32,9 @@ Style Guide:
* But only quote relevant numbers or metrics that are available. Do not guess.
* Remove unnecessary words

Give your answer in the following format:
Give your answer in a JSON format with the following sections. You can set a null value to a section if it's not relevant to the investigation. The content of each section should be formatted with markdown:

# Alert Explanation
<1-2 sentences explaining the alert itself - note don't say "The alert indicates a warning event related to a Kubernetes pod doing blah" rather just say "The pod XYZ did blah" because that is what the user actually cares about>
# Investigation
<what you checked and found>
# Conclusions and Possible Root causes
<what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains>
# Next Steps
<what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)>
<DO NOT list tools used and DO NOT add a `# Tools` section>
{% for title, description in sections.items() %}
- {{ title }}: {{ description }}
{% endfor %}
- <DO NOT list tools used and DO NOT add a `# Tools` section>
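
For reference, a rough sketch of what the new loop produces when rendered with plain Jinja2 (load_and_render_prompt presumably wraps something equivalent; the section values here are shortened):

```python
from jinja2 import Template

# Only the changed fragment of generic_investigation.jinja2 is rendered here.
fragment = (
    "{% for title, description in sections.items() %}\n"
    "- {{ title }}: {{ description }}\n"
    "{% endfor %}"
)

sections = {
    "Alert Explanation": "1-2 sentences explaining the alert itself",
    "Next Steps": "what you would do next to troubleshoot this issue",
}

# Produces one "- <title>: <description>" bullet per configured section
# (plus Jinja's default surrounding whitespace).
print(Template(fragment).render(sections=sections))
```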
10 changes: 7 additions & 3 deletions holmes/plugins/toolsets/kubernetes_logs.yaml
@@ -11,7 +11,7 @@ toolsets:
tools:
- name: "kubectl_previous_logs"
description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash. Never give a deployment name or a resource that is not a pod."
command: "kubectl logs {{ name}} -n {{ namespace }} --previous"
command: "kubectl logs {{pod_name}} -n {{ namespace }} --previous"

- name: "kubectl_previous_logs_all_containers"
description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash."
@@ -23,7 +23,7 @@

- name: "kubectl_logs"
description: "Run `kubectl logs` on a single Kubernetes pod. Never give a deployment name or a resource that is not a pod."
command: "kubectl logs {{name}} -n {{ namespace }}"
command: "kubectl logs {{pod_name}} -n {{ namespace }}"

- name: "kubectl_logs_all_containers"
description: "Run `kubectl logs` on all containers within a single Kubernetes pod."
@@ -35,4 +35,8 @@

- name: "kubectl_logs_grep"
description: "Search for a specific term in the logs of a single Kubernetes pod. Only provide a pod name, not a deployment or other resource."
command: "kubectl logs {{ name }} -n {{ namespace }} | grep {{ search_term }}"
command: "kubectl logs {{ pod_name }} -n {{ namespace }} | grep {{ search_term }}"

- name: "kubectl_logs_all_containers_grep"
description: "Search for a specific term in the logs of a single Kubernetes pod across all of its containers. Only provide a pod name, not a deployment or other resource."
command: "kubectl logs {{pod_name}} -n {{ namespace }} --all-containers | grep {{ search_term }}"
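
The {{ ... }} placeholders in these commands look like Jinja-style template variables; if so, the new kubectl_logs_all_containers_grep tool would expand roughly as below (the pod name, namespace, and search term are made up):

```python
from jinja2 import Template

command = "kubectl logs {{pod_name}} -n {{ namespace }} --all-containers | grep {{ search_term }}"

# Hypothetical parameter values for illustration only.
print(Template(command).render(
    pod_name="robusta-holmes-7d9f",
    namespace="default",
    search_term="error",
))
# kubectl logs robusta-holmes-7d9f -n default --all-containers | grep error
```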