Commit

MAIN-2806: Enable-AI-Output-Customization (#252)

nherment authored Jan 28, 2025
1 parent 567d528 commit c58da9b
Showing 207 changed files with 3,365 additions and 1,522 deletions.
4 changes: 2 additions & 2 deletions holmes/core/investigation.py
@@ -8,7 +8,6 @@
from holmes.core.supabase_dal import SupabaseDal
from holmes.utils.robusta import load_robusta_api_key


def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config):
load_robusta_api_key(dal=dal, config=config)
context = dal.get_issue_data(
@@ -37,12 +36,13 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
issue,
prompt=investigate_request.prompt_template,
post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT,
sections=investigate_request.sections,
instructions=resource_instructions,
global_instructions=global_instructions
)

return InvestigationResult(
analysis=investigation.result,
sections=investigation.sections,
tool_calls=investigation.tool_calls or [],
instructions=investigation.instructions,
)
45 changes: 45 additions & 0 deletions holmes/core/investigation_structured_output.py
@@ -0,0 +1,45 @@
from typing import Any, Dict

DEFAULT_SECTIONS = {
"Alert Explanation": "1-2 sentences explaining the alert itself - note don't say \"The alert indicates a warning event related to a Kubernetes pod doing blah\" rather just say \"The pod XYZ did blah\" because that is what the user actually cares about",
"Investigation": "what you checked and found",
"Conclusions and Possible Root causes": "what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains",
"Next Steps": "what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)",
"Related logs": "Truncate and share the most relevant logs, especially if these explain the root cause. For example: \nLogs from pod robusta-holmes:\n```\n<logs>```\n. Always embed the surrounding +/- 5 log lines around any relevant logs. ",
"App or Infra?": "Explain whether the issue is more likely an infrastructure or an application level issue and why you think that.",
"External links": "Provide links to external sources. Where to look when investigating this issue. For example provide links to relevant runbooks, etc. Add a short sentence describing each link."
}

def get_output_format_for_investigation(sections: Dict[str, str]) -> Dict[str, Any]:

properties = {}
required_fields = []

for title, description in sections.items():
properties[title] = {
"type": ["string", "null"],
"description": description
}
required_fields.append(title)

schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"required": required_fields,
"properties": properties,
"additionalProperties": False
}

output_format = { "type": "json_schema", "json_schema": { "name": "InvestigationResult", "schema": schema, "strict": False} }

return output_format

def combine_sections(sections: Any) -> str:
if isinstance(sections, dict):
content = ''
for section_title, section_content in sections.items():
if section_content:
# content = content + f'\n# {" ".join(section_title.split("_")).title()}\n{section_content}'
content = content + f'\n# {section_title}\n{section_content}\n'
return content
return f"{sections}"
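
A minimal sketch of how these new helpers can be exercised (assuming the holmes package is importable; the custom section titles below are illustrative, not part of this commit):

```python
from holmes.core.investigation_structured_output import (
    DEFAULT_SECTIONS,
    get_output_format_for_investigation,
)

# Hypothetical caller-supplied sections (title -> description of expected content).
custom_sections = {
    "Summary": "one or two sentences describing what happened",
    "Remediation": "concrete commands or steps to fix the issue",
}

output_format = get_output_format_for_investigation(custom_sections)
schema = output_format["json_schema"]["schema"]

# Every section becomes a nullable string property and is listed as required,
# so the model must address each section (possibly with null).
assert schema["required"] == ["Summary", "Remediation"]
assert schema["properties"]["Summary"]["type"] == ["string", "null"]

# When no sections are supplied, investigate() falls back to DEFAULT_SECTIONS.
print(list(DEFAULT_SECTIONS))
```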
4 changes: 3 additions & 1 deletion holmes/core/models.py
@@ -1,11 +1,12 @@
from holmes.core.tool_calling_llm import ToolCallResult
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, model_validator
from enum import Enum


class InvestigationResult(BaseModel):
analysis: Optional[str] = None
sections: Optional[Dict[str, Union[str, None]]] = None
tool_calls: List[ToolCallResult] = []
instructions: List[str] = []

@@ -20,6 +21,7 @@ class InvestigateRequest(BaseModel):
include_tool_calls: bool = False
include_tool_call_results: bool = False
prompt_template: str = "builtin://generic_investigation.jinja2"
sections: Optional[Dict[str, str]] = None
# TODO in the future
# response_handler: ...

43 changes: 29 additions & 14 deletions holmes/core/tool_calling_llm.py
@@ -2,13 +2,12 @@
import json
import logging
import textwrap
from typing import List, Optional, Dict
from typing import List, Optional, Dict, Type, Union
from holmes.core.investigation_structured_output import DEFAULT_SECTIONS, get_output_format_for_investigation, combine_sections
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.plugins.prompts import load_and_render_prompt
from typing import List, Optional
from holmes.core.llm import LLM
from holmes.plugins.prompts import load_and_render_prompt
from openai import BadRequestError
from openai._types import NOT_GIVEN
from openai.types.chat.chat_completion_message_tool_call import (
@@ -31,6 +30,7 @@ class ToolCallResult(BaseModel):

class LLMResult(BaseModel):
tool_calls: Optional[List[ToolCallResult]] = None
sections: Optional[Dict[str, Union[str, None]]] = None
result: Optional[str] = None
unprocessed_result: Optional[str] = None
instructions: List[str] = []
@@ -57,12 +57,10 @@ class ResourceInstructionDocument(BaseModel):
class Instructions(BaseModel):
instructions: List[str] = []


class ResourceInstructions(BaseModel):
instructions: List[str] = []
documents: List[ResourceInstructionDocument] = []


class ToolCallingLLM:

llm: LLM
@@ -77,7 +75,7 @@ def prompt_call(
system_prompt: str,
user_prompt: str,
post_process_prompt: Optional[str] = None,
response_format: Optional[dict] = None,
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
) -> LLMResult:
messages = [
{"role": "system", "content": system_prompt},
@@ -91,7 +89,7 @@ def messages_call(
self,
messages: List[Dict[str, str]],
post_process_prompt: Optional[str] = None,
response_format: Optional[dict] = None,
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
) -> LLMResult:

return self.call(messages, post_process_prompt, response_format)
@@ -100,7 +98,7 @@ def call(
self,
messages: List[Dict[str, str]],
post_process_prompt: Optional[str] = None,
response_format: dict = None,
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
user_prompt: Optional[str] = None,
) -> LLMResult:
perf_timing = PerformanceTiming("tool_calling_llm.call")
@@ -160,13 +158,24 @@ def call(
)

tools_to_call = getattr(response_message, "tool_calls", None)
text_response = response_message.content
sections:Optional[Dict[str, str]] = None
if isinstance(text_response, str):
try:
parsed_json = json.loads(text_response)
text_response = parsed_json
except json.JSONDecodeError:
pass
if not isinstance(text_response, str):
sections = text_response
text_response = combine_sections(sections)

if not tools_to_call:
# For chatty models post process and summarize the result
# this only works for calls where user prompt is explicitly passed through
if post_process_prompt and user_prompt:
logging.info(f"Running post processing on investigation.")
raw_response = response_message.content
raw_response = text_response
post_processed_response = self._post_processing_call(
prompt=user_prompt,
investigation=raw_response,
@@ -176,6 +185,7 @@ def call(
perf_timing.end()
return LLMResult(
result=post_processed_response,
sections=sections,
unprocessed_result=raw_response,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
Expand All @@ -184,7 +194,8 @@ def call(

perf_timing.end()
return LLMResult(
result=response_message.content,
result=text_response,
sections=sections,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
messages=messages,
@@ -280,7 +291,7 @@ def _post_processing_call(
full_response = self.llm.completion(messages=messages, temperature=0)
logging.debug(f"Post processing response {full_response}")
return full_response.choices[0].message.content
except Exception as error:
except Exception:
logging.exception("Failed to run post processing", exc_info=True)
return investigation

@@ -347,10 +358,14 @@ def investigate(
console: Optional[Console] = None,
global_instructions: Optional[Instructions] = None,
post_processing_prompt: Optional[str] = None,
sections: Optional[Dict[str, str]] = None
) -> LLMResult:
runbooks = self.runbook_manager.get_instructions_for_issue(issue)

if instructions != None and instructions.instructions:
if not sections or len(sections) == 0:
sections = DEFAULT_SECTIONS

if instructions is not None and instructions.instructions:
runbooks.extend(instructions.instructions)

if console and runbooks:
@@ -361,7 +376,7 @@ def investigate(
console.print(
"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]"
)
system_prompt = load_and_render_prompt(prompt, {"issue": issue})
system_prompt = load_and_render_prompt(prompt, {"issue": issue, "sections": sections})

if instructions != None and len(instructions.documents) > 0:
docPrompts = []
@@ -388,6 +403,6 @@ def investigate(
)
logging.debug("Rendered user prompt:\n%s", textwrap.indent(user_prompt, " "))

res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt)
res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt, response_format=get_output_format_for_investigation(sections))
res.instructions = runbooks
return res
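
The parsing added to call() degrades gracefully: if the model returns JSON matching the schema, the per-section dict is kept on LLMResult.sections and flattened into markdown via combine_sections; if it returns plain text, sections stays None and the text is used as-is. A small sketch of that behaviour (the reply strings are made up):

```python
import json

from holmes.core.investigation_structured_output import combine_sections

# Structured reply: parsed JSON becomes the sections dict; empty/null sections are skipped
# and each remaining one is rendered under a "# <title>" markdown heading.
structured_reply = '{"Alert Explanation": "Pod api-7d9f was OOM-killed.", "Related logs": null}'
sections = json.loads(structured_reply)
print(combine_sections(sections))
# "\n# Alert Explanation\nPod api-7d9f was OOM-killed.\n"

# Plain-text reply: json.loads() fails, so the text is kept unchanged and no sections are set.
plain_reply = "The pod was OOM-killed; raise its memory limit."
try:
    parsed = json.loads(plain_reply)
except json.JSONDecodeError:
    parsed = plain_reply
print(parsed if isinstance(parsed, str) else combine_sections(parsed))
```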
6 changes: 4 additions & 2 deletions holmes/plugins/prompts/_general_instructions.jinja2
@@ -13,6 +13,7 @@ In general:
* if you don't know, say that the analysis was inconclusive.
* if there are multiple possible causes list them in a numbered list.
* there will often be errors in the data that are not relevant or that do not have an impact - ignore them in your conclusion if you were not able to tie them to an actual error.
* Always check a pod's logs when checking if it is healthy. Don't assume that because the pod is running and reports healthy, it is running without issues.

If investigating Kubernetes problems:
* run as many kubectl commands as you need to gather more information, then respond.
@@ -21,6 +22,7 @@ If investigating Kubernetes problems:
* if the user wants to find a specific term in a pod's logs, use kubectl_logs_grep
* use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream
* when investigating a pod that crashed or application errors, always run kubectl_describe and fetch the logs
* if a pod has multiple containers, make sure you fetch the logs for either all containers or the relevant ones, using one of the container log functions like kubectl_logs_all_containers, kubectl_logs_all_containers_grep or any other.
* do not give an answer like "The pod is pending" as that doesn't state why the pod is pending and how to fix it.
* do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
* if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
@@ -44,7 +46,7 @@ If Helm tools are unavailable, skip this step.
- Check for a cluster role with <RELEASE_NAME>-holmes-cluster-role in its name and a service account with <RELEASE_NAME>-holmes-service-account in its name to troubleshoot missing permissions where release name is the name you found earlier if helm tools are available (If the exact cluster role or service account isn't found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster.)
- Focus on identifying absent permissions that align with the error message.
4. **Update the Configuration**
If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
```
holmes:
customClusterRoleRules:
@@ -64,7 +66,7 @@ Reminder:
* Strive for thoroughness and precision, ensuring the issue is fully addressed.

Special cases and how to reply:
* if you are unable to investigate something properly because you do not have tools that let you access the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* if you lack tools to access the right data or don't have access to a system, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
* as a special case of that, If a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.
* that is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. in that case, you explain say that permission errors are the cause of the problem and give details
4 changes: 1 addition & 3 deletions holmes/plugins/prompts/generic_ask_conversation.jinja2
@@ -6,10 +6,8 @@ If you have a good and concrete suggestion for how the user can fix something, t

Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.


{% include '_general_instructions.jinja2' %}


Style guide:
* Reply with terse output.
* Be painfully concise.
@@ -29,4 +27,4 @@ Relevant logs:
2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body
```

Validation error led to unhandled Java exception causing a crash.
Validation error led to unhandled Java exception causing a crash.
19 changes: 5 additions & 14 deletions holmes/plugins/prompts/generic_investigation.jinja2
@@ -32,18 +32,9 @@ Style Guide:
* But only quote relevant numbers or metrics that are available. Do not guess.
* Remove unnecessary words

Give your answer in the following format:
Give your answer in a JSON format with the following sections. You can set a null value to a section if it's not relevant to the investigation. The content of each section should be formatted with markdown:

# Alert Explanation
<1-2 sentences explaining the alert itself - note don't say "The alert indicates a warning event related to a Kubernetes pod doing blah" rather just say "The pod XYZ did blah" because that is what the user actually cares about>
# Investigation
<what you checked and found>
# Conclusions and Possible Root causes
<what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains>
# Next Steps
<what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)>
<DO NOT list tools used and DO NOT add a `# Tools` section>
{% for title, description in sections.items() %}
- {{ title }}: {{ description }}
{% endfor %}
- <DO NOT list tools used and DO NOT add a `# Tools` section>
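
For reference, a rough sketch of what the new loop produces when rendered with plain Jinja2 (load_and_render_prompt presumably wraps something equivalent; the section values here are shortened):

```python
from jinja2 import Template

# Only the changed fragment of generic_investigation.jinja2 is rendered here.
fragment = (
    "{% for title, description in sections.items() %}\n"
    "- {{ title }}: {{ description }}\n"
    "{% endfor %}"
)

sections = {
    "Alert Explanation": "1-2 sentences explaining the alert itself",
    "Next Steps": "what you would do next to troubleshoot this issue",
}

# Produces one "- <title>: <description>" bullet per configured section
# (plus Jinja's default surrounding whitespace).
print(Template(fragment).render(sections=sections))
```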
10 changes: 7 additions & 3 deletions holmes/plugins/toolsets/kubernetes_logs.yaml
@@ -11,7 +11,7 @@ toolsets:
tools:
- name: "kubectl_previous_logs"
description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash. Never give a deployment name or a resource that is not a pod."
command: "kubectl logs {{ name}} -n {{ namespace }} --previous"
command: "kubectl logs {{pod_name}} -n {{ namespace }} --previous"

- name: "kubectl_previous_logs_all_containers"
description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash."
@@ -23,7 +23,7 @@

- name: "kubectl_logs"
description: "Run `kubectl logs` on a single Kubernetes pod. Never give a deployment name or a resource that is not a pod."
command: "kubectl logs {{name}} -n {{ namespace }}"
command: "kubectl logs {{pod_name}} -n {{ namespace }}"

- name: "kubectl_logs_all_containers"
description: "Run `kubectl logs` on all containers within a single Kubernetes pod."
@@ -35,4 +35,8 @@

- name: "kubectl_logs_grep"
description: "Search for a specific term in the logs of a single Kubernetes pod. Only provide a pod name, not a deployment or other resource."
command: "kubectl logs {{ name }} -n {{ namespace }} | grep {{ search_term }}"
command: "kubectl logs {{ pod_name }} -n {{ namespace }} | grep {{ search_term }}"

- name: "kubectl_logs_all_containers_grep"
description: "Search for a specific term in the logs of a single Kubernetes pod across all of its containers. Only provide a pod name, not a deployment or other resource."
command: "kubectl logs {{pod_name}} -n {{ namespace }} --all-containers | grep {{ search_term }}"
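
The {{ ... }} placeholders in these commands look like Jinja-style template variables; if so, the new kubectl_logs_all_containers_grep tool would expand roughly as below (the pod name, namespace, and search term are made up):

```python
from jinja2 import Template

command = "kubectl logs {{pod_name}} -n {{ namespace }} --all-containers | grep {{ search_term }}"

# Hypothetical parameter values for illustration only.
print(Template(command).render(
    pod_name="robusta-holmes-7d9f",
    namespace="default",
    search_term="error",
))
# kubectl logs robusta-holmes-7d9f -n default --all-containers | grep error
```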