feat: improve logging and add optional perf logging #248

Merged (2 commits) on Jan 17, 2025

2 changes: 2 additions & 0 deletions holmes/common/env_vars.py
@@ -20,3 +20,5 @@ def load_bool(env_var, default: bool):
HOLMES_POST_PROCESSING_PROMPT = os.environ.get("HOLMES_POST_PROCESSING_PROMPT", "")
ROBUSTA_AI = load_bool("ROBUSTA_AI", False)
ROBUSTA_API_ENDPOINT = os.environ.get("ROBUSTA_API_ENDPOINT", "https://api.robusta.dev")

LOG_PERFORMANCE = os.environ.get("LOG_PERFORMANCE", None)
47 changes: 21 additions & 26 deletions holmes/config.py
@@ -8,12 +8,11 @@

from pydantic import FilePath, SecretStr, Field
from pydash.arrays import concat
from rich.console import Console


from holmes.core.runbooks import RunbookManager
from holmes.core.supabase_dal import SupabaseDal
from holmes.core.tool_calling_llm import (IssueInvestigator,
from holmes.core.tool_calling_llm import (IssueInvestigator,
ToolCallingLLM,
ToolExecutor)
from holmes.core.tools import ToolsetPattern, get_matching_toolsets, ToolsetStatusEnum, ToolsetTag
@@ -29,9 +28,7 @@
load_toolsets_from_file)
from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
from holmes.utils.definitions import CUSTOM_TOOLSET_LOCATION
from pydantic import ValidationError
from holmes.utils.holmes_sync_toolsets import load_custom_toolsets_config, merge_and_override_bultin_toolsets_with_toolsets_config
from holmes.core.tools import YAMLToolset
from holmes.common.env_vars import ROBUSTA_CONFIG_PATH
from holmes.utils.definitions import RobustaConfig

@@ -77,7 +74,7 @@ class Config(RobustaBaseConfig):

custom_runbooks: List[FilePath] = []
custom_toolsets: List[FilePath] = []

enabled_toolsets_names: List[str] = Field(default_factory=list)

@classmethod
@@ -109,7 +106,7 @@ def load_from_env(cls):
kwargs[field_name] = val
kwargs["cluster_name"] = Config.__get_cluster_name()
return cls(**kwargs)

@staticmethod
def __get_cluster_name() -> Optional[str]:
config_file_path = ROBUSTA_CONFIG_PATH
@@ -130,20 +127,20 @@ def __get_cluster_name() -> Optional[str]:
return None

def create_console_tool_executor(
self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal]
self, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal]
) -> ToolExecutor:
"""
Creates ToolExecutor for the cli
Creates ToolExecutor for the cli
"""
default_toolsets = [toolset for toolset in load_builtin_toolsets(dal) if any(tag in (ToolsetTag.CORE, ToolsetTag.CLI) for tag in toolset.tags)]

if allowed_toolsets == "*":
matching_toolsets = default_toolsets
else:
matching_toolsets = get_matching_toolsets(
default_toolsets, allowed_toolsets.split(",")
)
)

# Enable all matching toolsets that have CORE or CLI tag
for toolset in matching_toolsets:
toolset.enabled = True
@@ -155,7 +152,7 @@ def create_console_tool_executor(
toolsets_loaded_from_config,
matched_default_toolsets_by_name,
)

for toolset in filtered_toolsets_by_name.values():
if toolset.enabled:
toolset.check_prerequisites()
@@ -169,22 +166,22 @@ def create_console_tool_executor(
logging.info(f"Disabled toolset: {ts.name} from {ts.get_path()})")
elif ts.get_status() == ToolsetStatusEnum.FAILED:
logging.info(f"Failed loading toolset {ts.name} from {ts.get_path()}: ({ts.get_error()})")

for ts in default_toolsets:
if ts.name not in filtered_toolsets_by_name.keys():
logging.debug(f"Toolset {ts.name} from {ts.get_path()} was filtered out due to allowed_toolsets value")

enabled_tools = concat(*[ts.tools for ts in enabled_toolsets])
logging.debug(
f"Starting AI session with tools: {[t.name for t in enabled_tools]}"
)
return ToolExecutor(enabled_toolsets)

def create_tool_executor(
self, console: Console, dal:Optional[SupabaseDal]
self, dal:Optional[SupabaseDal]
) -> ToolExecutor:
"""
Creates ToolExecutor for the server endpoints
Creates ToolExecutor for the server endpoints
"""

all_toolsets = load_builtin_toolsets(dal=dal)
@@ -201,21 +198,21 @@ def create_tool_executor(
f"Starting AI session with tools: {[t.name for t in enabled_tools]}"
)
return ToolExecutor(enabled_toolsets)

def create_console_toolcalling_llm(
self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] = None
self, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] = None
) -> ToolCallingLLM:
tool_executor = self.create_console_tool_executor(console, allowed_toolsets, dal)
tool_executor = self.create_console_tool_executor(allowed_toolsets, dal)
return ToolCallingLLM(
tool_executor,
self.max_steps,
self._get_llm()
)

def create_toolcalling_llm(
self, console: Console, dal:Optional[SupabaseDal] = None
self, dal:Optional[SupabaseDal] = None
) -> ToolCallingLLM:
tool_executor = self.create_tool_executor(console, dal)
tool_executor = self.create_tool_executor(dal)
return ToolCallingLLM(
tool_executor,
self.max_steps,
@@ -224,25 +221,23 @@ def create_toolcalling_llm(

def create_issue_investigator(
self,
console: Console,
dal: Optional[SupabaseDal] = None
) -> IssueInvestigator:
all_runbooks = load_builtin_runbooks()
for runbook_path in self.custom_runbooks:
all_runbooks.extend(load_runbooks_from_file(runbook_path))

runbook_manager = RunbookManager(all_runbooks)
tool_executor = self.create_tool_executor(console, dal)
tool_executor = self.create_tool_executor(dal)
return IssueInvestigator(
tool_executor,
runbook_manager,
self.max_steps,
self._get_llm()
)

def create_console_issue_investigator(
self,
console: Console,
allowed_toolsets: ToolsetPattern,
dal: Optional[SupabaseDal] = None
) -> IssueInvestigator:
@@ -251,7 +246,7 @@ def create_console_issue_investigator(
all_runbooks.extend(load_runbooks_from_file(runbook_path))

runbook_manager = RunbookManager(all_runbooks)
tool_executor = self.create_console_tool_executor(console, allowed_toolsets, dal)
tool_executor = self.create_console_tool_executor(allowed_toolsets, dal)
return IssueInvestigator(
tool_executor,
runbook_manager,
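
Reviewer note (not part of the diff): with the Console parameter removed from the Config factory methods, call sites simply drop the console argument. A minimal, assumed call-site sketch:

from holmes.config import Config

# Load settings from environment variables (cluster name, API keys, etc.).
config = Config.load_from_env()

# The rich Console is no longer threaded through the factory methods.
# "*" keeps the existing behaviour of enabling every CORE/CLI toolset.
investigator = config.create_console_issue_investigator(
    allowed_toolsets="*",
    dal=None,  # SupabaseDal is optional for local/CLI use
)
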
8 changes: 3 additions & 5 deletions holmes/core/investigation.py
@@ -1,4 +1,5 @@

from typing import Optional
from rich.console import Console
from holmes.common.env_vars import HOLMES_POST_PROCESSING_PROMPT
from holmes.config import Config
@@ -8,7 +9,7 @@
from holmes.utils.robusta import load_robusta_api_key


def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config, console:Console):
def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config):
load_robusta_api_key(dal=dal, config=config)
context = dal.get_issue_data(
investigate_request.context.get("robusta_issue_id")
@@ -23,9 +24,7 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
if context:
raw_data["extra_context"] = context

ai = config.create_issue_investigator(
console, dal=dal
)
ai = config.create_issue_investigator(dal=dal)
issue = Issue(
id=context["id"] if context else "",
name=investigate_request.title,
@@ -37,7 +36,6 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
investigation = ai.investigate(
issue,
prompt=investigate_request.prompt_template,
console=console,
post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT,
instructions=resource_instructions,
global_instructions=global_instructions
4 changes: 2 additions & 2 deletions holmes/core/llm.py
@@ -100,12 +100,12 @@ def check_llm(self, model:str, api_key:Optional[str]):
"https://docs.litellm.ai/docs/providers/watsonx#usage---models-in-deployment-spaces"
)
else:
#
#
api_key_env_var = f"{provider.upper()}_API_KEY"
if api_key:
os.environ[api_key_env_var] = api_key
model_requirements = litellm.validate_environment(model=model)

if not model_requirements["keys_in_environment"]:
raise Exception(f"model {model} requires the following environment variables: {model_requirements['missing_keys']}")

57 changes: 57 additions & 0 deletions holmes/core/performance_timing.py
@@ -0,0 +1,57 @@
import time
import logging

from functools import wraps

from holmes.common.env_vars import (
LOG_PERFORMANCE,
)

class PerformanceTiming:
def __init__(self, name):
if not LOG_PERFORMANCE:
return
self.ended = False

self.name = name
self.start_time = time.time()
self.last_measure_time = self.start_time
self.last_measure_label = "Start"
self.timings = []

def measure(self, label):
if not LOG_PERFORMANCE:
return
if self.ended:
raise Exception("cannot measure a perf timing that is already ended")
current_time = time.time()

time_since_start = int((current_time - self.start_time) * 1000)
time_since_last = int((current_time - self.last_measure_time) * 1000)

self.timings.append((label, time_since_last, time_since_start))

self.last_measure_time = current_time
self.last_measure_label = label

def end(self):
if not LOG_PERFORMANCE:
return
self.ended = True
current_time = time.time()
time_since_start = int((current_time - self.start_time) * 1000)
message = f'{self.name}(TOTAL) {time_since_start}ms'
logging.info(message)
for label, time_since_last, time_since_start in self.timings:
logging.info(f'\t{self.name}({label}) +{time_since_last}ms {time_since_start}ms')

def log_function_timing(func):
@wraps(func)
def function_timing_wrapper(*args, **kwargs):
start_time = time.perf_counter()
result = func(*args, **kwargs)
end_time = time.perf_counter()
total_time = int((end_time - start_time) * 1000)
logging.info(f'Function "{func.__name__}()" took {total_time}ms')
return result
return function_timing_wrapper
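
Reviewer note (not part of the diff): a minimal usage sketch of the new helpers. It assumes LOG_PERFORMANCE is set to any non-empty value before holmes modules are imported, since the flag is read once at import time; the step functions are hypothetical stand-ins.

import logging
import os

# Any non-empty value enables perf logging; unset or empty leaves it off.
os.environ["LOG_PERFORMANCE"] = "true"

from holmes.core.performance_timing import PerformanceTiming, log_function_timing

logging.basicConfig(level=logging.INFO)

def step_one():  # hypothetical stand-in for real work
    pass

def step_two():  # hypothetical stand-in for real work
    pass

# Block-style timing: label intermediate steps, then end() logs the breakdown.
timing = PerformanceTiming("example.job")
step_one()
timing.measure("step_one")
step_two()
timing.measure("step_two")
timing.end()  # logs "example.job(TOTAL) ...ms" plus one indented line per label

# Decorator-style timing: logs 'Function "slow_task()" took ...ms' on return.
@log_function_timing
def slow_task():
    pass

slow_task()
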
26 changes: 19 additions & 7 deletions holmes/core/tool_calling_llm.py
@@ -3,6 +3,7 @@
import logging
import textwrap
from typing import List, Optional, Dict
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.plugins.prompts import load_and_render_prompt
from typing import List, Optional
@@ -102,10 +103,12 @@ def call(
response_format: dict = None,
user_prompt: Optional[str] = None,
) -> LLMResult:

perf_timing = PerformanceTiming("tool_calling_llm.call")
tool_calls = []
tools = self.tool_executor.get_all_tools_openai_format()
perf_timing.measure("get_all_tools_openai_format")
for i in range(self.max_steps):
perf_timing.measure(f"start iteration {i}")
logging.debug(f"running iteration {i}")
# on the last step we don't allow tools - we want to force a reply, not a request to run another tool
tools = NOT_GIVEN if i == self.max_steps - 1 else tools
@@ -114,12 +117,15 @@
total_tokens = self.llm.count_tokens_for_message(messages)
max_context_size = self.llm.get_context_window_size()
maximum_output_token = self.llm.get_maximum_output_token()
perf_timing.measure("count tokens")

if (total_tokens + maximum_output_token) > max_context_size:
logging.warning("Token limit exceeded. Truncating tool responses.")
messages = self.truncate_messages_to_fit_context(
messages, max_context_size, maximum_output_token
)
perf_timing.measure("truncate_messages_to_fit_context")


logging.debug(f"sending messages={messages}\n\ntools={tools}")
try:
@@ -132,6 +138,8 @@
drop_params=True,
)
logging.debug(f"got response {full_response.to_json()}")

perf_timing.measure("llm.completion")
# catch a known error that occurs with Azure and replace the error message with something more obvious to the user
except BadRequestError as e:
if (
@@ -165,6 +173,7 @@ def call(
user_prompt=post_process_prompt,
)

perf_timing.end()
return LLMResult(
result=post_processed_response,
unprocessed_result=raw_response,
@@ -173,13 +182,15 @@
messages=messages,
)

perf_timing.end()
return LLMResult(
result=response_message.content,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
messages=messages,
)

perf_timing.measure("pre-tool-calls")
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
futures = []
for t in tools_to_call:
@@ -196,6 +207,7 @@ def call(
"content": tool_call_result.result,
}
)
perf_timing.measure(f"tool completed {tool_call_result.tool_name}")

def _invoke_tool(
self, tool_to_call: ChatCompletionMessageToolCall
@@ -331,8 +343,8 @@ def investigate(
self,
issue: Issue,
prompt: str,
console: Console,
instructions: Optional[ResourceInstructions],
console: Optional[Console] = None,
global_instructions: Optional[Instructions] = None,
post_processing_prompt: Optional[str] = None,
) -> LLMResult:
@@ -341,13 +353,13 @@
if instructions != None and instructions.instructions:
runbooks.extend(instructions.instructions)

if runbooks:
if console and runbooks:
console.print(
f"[bold]Analyzing with {len(runbooks)} runbooks: {runbooks}[/bold]"
)
else:
elif console:
console.print(
f"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]"
"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]"
)
system_prompt = load_and_render_prompt(prompt, {"issue": issue})

@@ -365,10 +377,10 @@
user_prompt += f"* {runbook_str}\n"

user_prompt = f'My instructions to check \n"""{user_prompt}"""'

if global_instructions and global_instructions.instructions and len(global_instructions.instructions[0]) > 0:
user_prompt += f"\n\nGlobal Instructions (use only if relevant): {global_instructions.instructions[0]}\n"

user_prompt = f"{user_prompt}\n This is context from the issue {issue.raw}"

logging.debug(
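
Reviewer note (not part of the diff): with LOG_PERFORMANCE enabled, PerformanceTiming.end() emits one aggregate line plus one indented line per measured label. An illustrative excerpt based on the format strings in performance_timing.py (all timings invented for the example, tool name a placeholder):

tool_calling_llm.call(TOTAL) 9120ms
    tool_calling_llm.call(get_all_tools_openai_format) +15ms 15ms
    tool_calling_llm.call(start iteration 0) +0ms 15ms
    tool_calling_llm.call(count tokens) +42ms 57ms
    tool_calling_llm.call(llm.completion) +3890ms 3947ms
    tool_calling_llm.call(pre-tool-calls) +3ms 3950ms
    tool_calling_llm.call(tool completed <tool name>) +2104ms 6054ms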