feat: improve logging and add optional perf logging
nherment committed Jan 14, 2025
1 parent bfafbde commit f99f574
Showing 9 changed files with 144 additions and 61 deletions.
2 changes: 2 additions & 0 deletions holmes/common/env_vars.py
@@ -20,3 +20,5 @@ def load_bool(env_var, default: bool):
HOLMES_POST_PROCESSING_PROMPT = os.environ.get("HOLMES_POST_PROCESSING_PROMPT", "")
ROBUSTA_AI = load_bool("ROBUSTA_AI", False)
ROBUSTA_API_ENDPOINT = os.environ.get("ROBUSTA_API_ENDPOINT", "https://api.robusta.dev")

LOG_PERFORMANCE = os.environ.get("LOG_PERFORMANCE", None)
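
Because the new flag is read with os.environ.get and only checked for truthiness, any non-empty value enables the optional performance logging. A minimal sketch of turning it on from code (an assumption of this sketch: the variable is set before holmes.common.env_vars is first imported, since the value is captured at import time):

import os

# Any non-empty string enables the optional performance logging; unset (the default) disables it.
os.environ["LOG_PERFORMANCE"] = "true"

from holmes.common.env_vars import LOG_PERFORMANCE  # now truthy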
47 changes: 21 additions & 26 deletions holmes/config.py
@@ -8,12 +8,11 @@

from pydantic import FilePath, SecretStr, Field
from pydash.arrays import concat
from rich.console import Console


from holmes.core.runbooks import RunbookManager
from holmes.core.supabase_dal import SupabaseDal
from holmes.core.tool_calling_llm import (IssueInvestigator,
from holmes.core.tool_calling_llm import (IssueInvestigator,
ToolCallingLLM,
ToolExecutor)
from holmes.core.tools import ToolsetPattern, get_matching_toolsets, ToolsetStatusEnum, ToolsetTag
@@ -29,9 +28,7 @@
load_toolsets_from_file)
from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
from holmes.utils.definitions import CUSTOM_TOOLSET_LOCATION
from pydantic import ValidationError
from holmes.utils.holmes_sync_toolsets import load_custom_toolsets_config, merge_and_override_bultin_toolsets_with_toolsets_config
from holmes.core.tools import YAMLToolset
from holmes.common.env_vars import ROBUSTA_CONFIG_PATH
from holmes.utils.definitions import RobustaConfig

@@ -77,7 +74,7 @@ class Config(RobustaBaseConfig):

custom_runbooks: List[FilePath] = []
custom_toolsets: List[FilePath] = []

enabled_toolsets_names: List[str] = Field(default_factory=list)

@classmethod
@@ -109,7 +106,7 @@ def load_from_env(cls):
kwargs[field_name] = val
kwargs["cluster_name"] = Config.__get_cluster_name()
return cls(**kwargs)

@staticmethod
def __get_cluster_name() -> Optional[str]:
config_file_path = ROBUSTA_CONFIG_PATH
@@ -130,20 +127,20 @@ def __get_cluster_name() -> Optional[str]:
return None

def create_console_tool_executor(
self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal]
self, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal]
) -> ToolExecutor:
"""
Creates ToolExecutor for the cli
Creates ToolExecutor for the cli
"""
default_toolsets = [toolset for toolset in load_builtin_toolsets(dal) if any(tag in (ToolsetTag.CORE, ToolsetTag.CLI) for tag in toolset.tags)]

if allowed_toolsets == "*":
matching_toolsets = default_toolsets
else:
matching_toolsets = get_matching_toolsets(
default_toolsets, allowed_toolsets.split(",")
)
)

# Enable all matching toolsets that have CORE or CLI tag
for toolset in matching_toolsets:
toolset.enabled = True
@@ -155,7 +152,7 @@ def create_console_tool_executor(
toolsets_loaded_from_config,
matched_default_toolsets_by_name,
)

for toolset in filtered_toolsets_by_name.values():
if toolset.enabled:
toolset.check_prerequisites()
@@ -169,22 +166,22 @@ def create_console_tool_executor(
logging.info(f"Disabled toolset: {ts.name} from {ts.get_path()})")
elif ts.get_status() == ToolsetStatusEnum.FAILED:
logging.info(f"Failed loading toolset {ts.name} from {ts.get_path()}: ({ts.get_error()})")

for ts in default_toolsets:
if ts.name not in filtered_toolsets_by_name.keys():
logging.debug(f"Toolset {ts.name} from {ts.get_path()} was filtered out due to allowed_toolsets value")

enabled_tools = concat(*[ts.tools for ts in enabled_toolsets])
logging.debug(
f"Starting AI session with tools: {[t.name for t in enabled_tools]}"
)
return ToolExecutor(enabled_toolsets)

def create_tool_executor(
self, console: Console, dal:Optional[SupabaseDal]
self, dal:Optional[SupabaseDal]
) -> ToolExecutor:
"""
Creates ToolExecutor for the server endpoints
Creates ToolExecutor for the server endpoints
"""

all_toolsets = load_builtin_toolsets(dal=dal)
@@ -201,21 +198,21 @@ def create_tool_executor(
f"Starting AI session with tools: {[t.name for t in enabled_tools]}"
)
return ToolExecutor(enabled_toolsets)

def create_console_toolcalling_llm(
self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] = None
self, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] = None
) -> ToolCallingLLM:
tool_executor = self.create_console_tool_executor(console, allowed_toolsets, dal)
tool_executor = self.create_console_tool_executor(allowed_toolsets, dal)
return ToolCallingLLM(
tool_executor,
self.max_steps,
self._get_llm()
)

def create_toolcalling_llm(
self, console: Console, dal:Optional[SupabaseDal] = None
self, dal:Optional[SupabaseDal] = None
) -> ToolCallingLLM:
tool_executor = self.create_tool_executor(console, dal)
tool_executor = self.create_tool_executor(dal)
return ToolCallingLLM(
tool_executor,
self.max_steps,
@@ -224,25 +221,23 @@ def create_toolcalling_llm(

def create_issue_investigator(
self,
console: Console,
dal: Optional[SupabaseDal] = None
) -> IssueInvestigator:
all_runbooks = load_builtin_runbooks()
for runbook_path in self.custom_runbooks:
all_runbooks.extend(load_runbooks_from_file(runbook_path))

runbook_manager = RunbookManager(all_runbooks)
tool_executor = self.create_tool_executor(console, dal)
tool_executor = self.create_tool_executor(dal)
return IssueInvestigator(
tool_executor,
runbook_manager,
self.max_steps,
self._get_llm()
)

def create_console_issue_investigator(
self,
console: Console,
allowed_toolsets: ToolsetPattern,
dal: Optional[SupabaseDal] = None
) -> IssueInvestigator:
@@ -251,7 +246,7 @@ def create_console_issue_investigator(
all_runbooks.extend(load_runbooks_from_file(runbook_path))

runbook_manager = RunbookManager(all_runbooks)
tool_executor = self.create_console_tool_executor(console, allowed_toolsets, dal)
tool_executor = self.create_console_tool_executor(allowed_toolsets, dal)
return IssueInvestigator(
tool_executor,
runbook_manager,
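
With the rich Console argument dropped from all of these factory methods, a CLI-side call now looks roughly like this (a sketch only; the argument values are illustrative):

from holmes.config import Config

config = Config.load_from_env()
# No Console is passed anymore; only the toolset pattern and an optional SupabaseDal.
investigator = config.create_console_issue_investigator(allowed_toolsets="*", dal=None)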
8 changes: 3 additions & 5 deletions holmes/core/investigation.py
@@ -1,4 +1,5 @@

from typing import Optional
from rich.console import Console
from holmes.common.env_vars import HOLMES_POST_PROCESSING_PROMPT
from holmes.config import Config
@@ -8,7 +9,7 @@
from holmes.utils.robusta import load_robusta_api_key


def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config, console:Console):
def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config):
load_robusta_api_key(dal=dal, config=config)
context = dal.get_issue_data(
investigate_request.context.get("robusta_issue_id")
@@ -23,9 +24,7 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
if context:
raw_data["extra_context"] = context

ai = config.create_issue_investigator(
console, dal=dal
)
ai = config.create_issue_investigator(dal=dal)
issue = Issue(
id=context["id"] if context else "",
name=investigate_request.title,
@@ -37,7 +36,6 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
investigation = ai.investigate(
issue,
prompt=investigate_request.prompt_template,
console=console,
post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT,
instructions=resource_instructions,
global_instructions=global_instructions
4 changes: 2 additions & 2 deletions holmes/core/llm.py
@@ -100,12 +100,12 @@ def check_llm(self, model:str, api_key:Optional[str]):
"https://docs.litellm.ai/docs/providers/watsonx#usage---models-in-deployment-spaces"
)
else:
#
#
api_key_env_var = f"{provider.upper()}_API_KEY"
if api_key:
os.environ[api_key_env_var] = api_key
model_requirements = litellm.validate_environment(model=model)

if not model_requirements["keys_in_environment"]:
raise Exception(f"model {model} requires the following environment variables: {model_requirements['missing_keys']}")

57 changes: 57 additions & 0 deletions holmes/core/performance_timing.py
@@ -0,0 +1,57 @@
import time
import logging

from functools import wraps

from holmes.common.env_vars import (
LOG_PERFORMANCE,
)

class PerformanceTiming:
def __init__(self, name):
if not LOG_PERFORMANCE:
return
self.ended = False

self.name = name
self.start_time = time.time()
self.last_measure_time = self.start_time
self.last_measure_label = "Start"
self.timings = []

def measure(self, label):
if not LOG_PERFORMANCE:
return
if self.ended:
raise Exception("cannot measure a perf timing that is already ended")
current_time = time.time()

time_since_start = int((current_time - self.start_time) * 1000)
time_since_last = int((current_time - self.last_measure_time) * 1000)

self.timings.append((label, time_since_last, time_since_start))

self.last_measure_time = current_time
self.last_measure_label = label

def end(self):
if not LOG_PERFORMANCE:
return
self.ended = True
current_time = time.time()
time_since_start = int((current_time - self.start_time) * 1000)
message = f'{self.name}(TOTAL) {time_since_start}ms'
logging.info(message)
for label, time_since_last, time_since_start in self.timings:
logging.info(f'\t{self.name}({label}) +{time_since_last}ms {time_since_start}ms')

def log_function_timing(func):
@wraps(func)
def function_timing_wrapper(*args, **kwargs):
start_time = time.perf_counter()
result = func(*args, **kwargs)
end_time = time.perf_counter()
total_time = int((end_time - start_time) * 1000)
logging.info(f'Function "{func.__name__}()" took {total_time}ms')
return result
return function_timing_wrapper
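
A minimal usage sketch for the two new helpers (the function name and measurement label below are illustrative, not part of this commit):

from holmes.core.performance_timing import PerformanceTiming, log_function_timing

@log_function_timing            # logs 'Function "fetch_data()" took <n>ms' on every call
def fetch_data():
    ...

perf = PerformanceTiming("example")   # becomes a no-op when LOG_PERFORMANCE is unset
fetch_data()
perf.measure("fetch_data")            # records the delta since the previous measurement
perf.end()
# With LOG_PERFORMANCE set, end() logs roughly:
#   example(TOTAL) 12ms
#       example(fetch_data) +12ms 12ms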
26 changes: 19 additions & 7 deletions holmes/core/tool_calling_llm.py
@@ -3,6 +3,7 @@
import logging
import textwrap
from typing import List, Optional, Dict
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.plugins.prompts import load_and_render_prompt
from typing import List, Optional
@@ -102,10 +103,12 @@ def call(
response_format: dict = None,
user_prompt: Optional[str] = None,
) -> LLMResult:

perf_timing = PerformanceTiming("tool_calling_llm.call")
tool_calls = []
tools = self.tool_executor.get_all_tools_openai_format()
perf_timing.measure("get_all_tools_openai_format")
for i in range(self.max_steps):
perf_timing.measure(f"start iteration {i}")
logging.debug(f"running iteration {i}")
# on the last step we don't allow tools - we want to force a reply, not a request to run another tool
tools = NOT_GIVEN if i == self.max_steps - 1 else tools
@@ -114,12 +117,15 @@
total_tokens = self.llm.count_tokens_for_message(messages)
max_context_size = self.llm.get_context_window_size()
maximum_output_token = self.llm.get_maximum_output_token()
perf_timing.measure("count tokens")

if (total_tokens + maximum_output_token) > max_context_size:
logging.warning("Token limit exceeded. Truncating tool responses.")
messages = self.truncate_messages_to_fit_context(
messages, max_context_size, maximum_output_token
)
perf_timing.measure("truncate_messages_to_fit_context")


logging.debug(f"sending messages={messages}\n\ntools={tools}")
try:
@@ -132,6 +138,8 @@ def call(
drop_params=True,
)
logging.debug(f"got response {full_response.to_json()}")

perf_timing.measure("llm.completion")
# catch a known error that occurs with Azure and replace the error message with something more obvious to the user
except BadRequestError as e:
if (
@@ -165,6 +173,7 @@ def call(
user_prompt=post_process_prompt,
)

perf_timing.end()
return LLMResult(
result=post_processed_response,
unprocessed_result=raw_response,
@@ -173,13 +182,15 @@ def call(
messages=messages,
)

perf_timing.end()
return LLMResult(
result=response_message.content,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
messages=messages,
)

perf_timing.measure("pre-tool-calls")
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
futures = []
for t in tools_to_call:
@@ -196,6 +207,7 @@ def call(
"content": tool_call_result.result,
}
)
perf_timing.measure(f"tool completed {tool_call_result.tool_name}")

def _invoke_tool(
self, tool_to_call: ChatCompletionMessageToolCall
@@ -331,8 +343,8 @@ def investigate(
self,
issue: Issue,
prompt: str,
console: Console,
instructions: Optional[ResourceInstructions],
console: Optional[Console] = None,
global_instructions: Optional[Instructions] = None,
post_processing_prompt: Optional[str] = None,
) -> LLMResult:
@@ -341,13 +353,13 @@ def investigate(
if instructions != None and instructions.instructions:
runbooks.extend(instructions.instructions)

if runbooks:
if console and runbooks:
console.print(
f"[bold]Analyzing with {len(runbooks)} runbooks: {runbooks}[/bold]"
)
else:
elif console:
console.print(
f"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]"
"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]"
)
system_prompt = load_and_render_prompt(prompt, {"issue": issue})

@@ -365,10 +377,10 @@ def investigate(
user_prompt += f"* {runbook_str}\n"

user_prompt = f'My instructions to check \n"""{user_prompt}"""'

if global_instructions and global_instructions.instructions and len(global_instructions.instructions[0]) > 0:
user_prompt += f"\n\nGlobal Instructions (use only if relevant): {global_instructions.instructions[0]}\n"

user_prompt = f"{user_prompt}\n This is context from the issue {issue.raw}"

logging.debug(
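
Combined with the output format of PerformanceTiming.end() above, one instrumented call() that runs a single tool over two iterations would log something along these lines when LOG_PERFORMANCE is set (the label names come from the measure() calls in this diff; the timings and the tool name are invented for illustration):

# Illustrative output only; numbers and the tool name are made up.
# tool_calling_llm.call(TOTAL) 6074ms
#     tool_calling_llm.call(get_all_tools_openai_format) +3ms 3ms
#     tool_calling_llm.call(start iteration 0) +0ms 3ms
#     tool_calling_llm.call(count tokens) +41ms 44ms
#     tool_calling_llm.call(llm.completion) +2105ms 2149ms
#     tool_calling_llm.call(pre-tool-calls) +1ms 2150ms
#     tool_calling_llm.call(tool completed kubectl_describe) +1980ms 4130ms
#     tool_calling_llm.call(start iteration 1) +1ms 4131ms
#     tool_calling_llm.call(count tokens) +52ms 4183ms
#     tool_calling_llm.call(llm.completion) +1890ms 6073ms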