Add grafana loki tool (#227)

Adds a toolset to query Loki logs by proxying through Grafana.

Environment variables configure access to Grafana (URL, username, and API key).
The fields used to fetch pod and node logs are configurable through the
config file.

---------

Co-authored-by: Avi-Robusta <[email protected]>
Co-authored-by: Mohse Morad <[email protected]>
3 people authored Jan 29, 2025
1 parent bfab843 commit 79820ce
Showing 16 changed files with 1,748 additions and 646 deletions.
52 changes: 49 additions & 3 deletions README.md
@@ -598,9 +598,9 @@ If your llm provider url uses a certificate from a custom CA, in order to trust
<summary>Confluence</summary>
HolmesGPT can read runbooks from Confluence. To give it access, set the following environment variables:

* CONFLUENCE_BASE_URL - e.g. https://robusta-dev-test.atlassian.net
* CONFLUENCE_USER - e.g. [email protected]
* CONFLUENCE_API_KEY - [refer to Atlassian docs on generating API keys](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
* `CONFLUENCE_BASE_URL` - e.g. https://robusta-dev-test.atlassian.net
* `CONFLUENCE_USER` - e.g. [email protected]
* `CONFLUENCE_API_KEY` - [refer to Atlassian docs on generating API keys](https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/)
</details>

<details>
@@ -624,13 +624,59 @@ This is done through a HTTP GET and the resulting HTML is then cleaned and parse
Any Javascript that is on the webpage is ignored.
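
For illustration, a fetch-and-clean step of this kind could be sketched as follows. This is a minimal sketch, not HolmesGPT's actual implementation; the function name and the choice of `requests` plus `BeautifulSoup` are assumptions:

```python
import requests
from bs4 import BeautifulSoup


def fetch_page_text(url: str) -> str:
    # Plain HTTP GET; any JavaScript on the page is never executed
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Strip script/style tags and keep only the readable text
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)
```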
</details>

<details>
<summary>
Using Grafana Loki
</summary>

HolmesGPT can fetch logs from [Loki](https://grafana.com/oss/loki/) by proxying through a [Grafana](https://grafana.com/oss/grafana/) instance.

Configuring the Grafana Loki toolset involves two parts: access/authentication and search keys.

For access and authentication, set the following environment variables:

* `GRAFANA_URL` - e.g. https://my-org.grafana.net
* `GRAFANA_API_KEY` - e.g. glsa_bsm6ZS_sdfs25f
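
The toolset validates these settings on startup by calling Grafana's health endpoint (see `base_grafana_toolset.py` below). A minimal sketch of such a check, assuming the standard `/api/health` route, could look like:

```python
import requests


def grafana_is_healthy(url: str, api_key: str) -> bool:
    # GET /api/health returns HTTP 200 when the Grafana instance is reachable
    response = requests.get(
        f"{url}/api/health",
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=10,
    )
    return response.status_code == 200
```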

You can optionally tweak the search keys used by the toolset.
This is done by appending the following to your Holmes configuration file:

```yaml
grafana:
  url: https://my-org.grafana.net
  api_key: glsa_bsm6ZS_sdfs25f
  loki:
    pod_name_search_key: "pod"
    namespace_search_key: "namespace"
    node_name_search_key: "node"
```
> You only need to tweak the configuration file if your Loki log labels for pod, namespace, and node differ from the above defaults.

The Loki toolset is configured using the same Grafana settings as the Grafana Tempo toolset.
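
For illustration, here is a minimal sketch (not part of HolmesGPT) of how search keys like these can be combined into a LogQL stream selector; the pod and namespace values are hypothetical:

```python
def build_logql_selector(labels: dict) -> str:
    # {"pod": "checkout-7d4f9", "namespace": "prod"} -> {pod="checkout-7d4f9", namespace="prod"}
    selector = ", ".join(f'{key}="{value}"' for key, value in labels.items())
    return "{" + selector + "}"


print(build_logql_selector({"pod": "checkout-7d4f9", "namespace": "prod"}))
# -> {pod="checkout-7d4f9", namespace="prod"}
```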
</details>
<details>
<summary>
Using Grafana Tempo
</summary>

HolmesGPT can fetch trace information from Grafana Tempo to debug performance related issues.

Tempo is configured using the same Grafana settings as the Grafana Loki toolset.

```yaml
grafana:
  url: https://my-org.grafana.net
  api_key: glsa_bsm6ZS_sdfs25f
```
</details>
<details>
<summary>
ArgoCD
</summary>
Holmes can use the `argocd` CLI to get details about the ArgoCD setup, such as the configuration and status of apps, clusters, and projects within ArgoCD.
To enable ArgoCD, set the `ARGOCD_AUTH_TOKEN` environment variable as described in the [argocd documentation](https://argo-cd.readthedocs.io/en/latest/user-guide/commands/argocd_account_generate-token/).

</details>

## More Use Cases
10 changes: 5 additions & 5 deletions holmes/config.py
@@ -31,7 +31,8 @@
from holmes.plugins.sources.opsgenie import OpsGenieSource
from holmes.plugins.sources.pagerduty import PagerDutySource
from holmes.plugins.sources.prometheus.plugin import AlertManagerSource
from holmes.plugins.toolsets import load_builtin_toolsets, load_toolsets_from_file

from holmes.plugins.toolsets import load_builtin_toolsets
from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
from holmes.utils.definitions import CUSTOM_TOOLSET_LOCATION
from pydantic import ValidationError
@@ -190,9 +191,7 @@ def create_console_tool_executor(
for toolset in matching_toolsets:
toolset.enabled = True

toolsets_by_name = {
toolset.name: toolset for toolset in matching_toolsets
}
toolsets_by_name = {toolset.name: toolset for toolset in matching_toolsets}

toolsets_loaded_from_config = self.load_custom_toolsets_config()
if toolsets_loaded_from_config:
@@ -245,9 +244,10 @@ def create_tool_executor(self, dal: Optional[SupabaseDal]) -> ToolExecutor:
"""
Creates ToolExecutor for the server endpoints
"""

if self._server_tool_executor:
return self._server_tool_executor

logging.info("Creating server tool executor")
all_toolsets = load_builtin_toolsets(dal=dal)

24 changes: 13 additions & 11 deletions holmes/core/tools.py
@@ -57,6 +57,7 @@ class ToolsetStatusEnum(str, Enum):
DISABLED = "disabled"
FAILED = "failed"


class ToolsetTag(str, Enum):
CORE = "core"
CLUSTER = "cluster"
@@ -103,7 +104,7 @@ def get_openai_format(self):
},
},
}

# gemini doesnt have parameters object if it is without params
if tool_properties is None:
result["function"].pop("parameters")
@@ -139,7 +140,7 @@ def __infer_parameters(self):
# if param not in self.parameters:
# self.parameters[param] = ToolParameter()
for param in inferred_params:
if param not in self.parameters:
if param not in self.parameters:
self.parameters[param] = ToolParameter()

def get_parameterized_one_liner(self, params) -> str:
@@ -263,17 +264,19 @@ class Toolset(BaseModel):
StaticPrerequisite,
ToolsetCommandPrerequisite,
ToolsetEnvironmentPrerequisite,
CallablePrerequisite
CallablePrerequisite,
]
] = []
tools: List[Tool]
tags: List[ToolsetTag] = Field(default_factory=lambda: [ToolsetTag.CORE],)
tags: List[ToolsetTag] = Field(
default_factory=lambda: [ToolsetTag.CORE],
)
config: Optional[Any] = None
is_default: bool = False

_path: Optional[str] = PrivateAttr(None)
_status: ToolsetStatusEnum = PrivateAttr(ToolsetStatusEnum.DISABLED)
_error: Optional[str] = PrivateAttr(None)
_path: Optional[str] = PrivateAttr(None)
_status: ToolsetStatusEnum = PrivateAttr(ToolsetStatusEnum.DISABLED)
_error: Optional[str] = PrivateAttr(None)

def override_with(self, override: "ToolsetYamlFromConfig") -> None:
"""
@@ -351,9 +354,7 @@ def check_prerequisites(self):
logging.debug(
f"Toolset {self.name} : Failed to run prereq command {prereq}; {str(e)}"
)
self._error = (
f"Prerequisites check failed with errorcode {e.returncode}: {str(e)}"
)
self._error = f"Prerequisites check failed with errorcode {e.returncode}: {str(e)}"
return

elif isinstance(prereq, ToolsetEnvironmentPrerequisite):
@@ -367,7 +368,7 @@
if not prereq.enabled:
self._status = ToolsetStatusEnum.DISABLED
return

elif isinstance(prereq, CallablePrerequisite):
res = prereq.callable(self.config)
if not res:
@@ -420,6 +421,7 @@ def get_tool_by_name(self, name: str) -> Optional[YAMLTool]:
def get_all_tools_openai_format(self):
return [tool.get_openai_format() for tool in self.tools_by_name.values()]


class ToolsetYamlFromConfig(Toolset):
name: str
enabled: bool = True
31 changes: 21 additions & 10 deletions holmes/plugins/toolsets/__init__.py
@@ -5,6 +5,9 @@

from holmes.core.supabase_dal import SupabaseDal
from holmes.plugins.toolsets.findings import FindingsToolset
from holmes.plugins.toolsets.grafana.common import GrafanaConfig
from holmes.plugins.toolsets.grafana.toolset_grafana_loki import GrafanaLokiToolset
from holmes.plugins.toolsets.grafana.toolset_grafana_tempo import GrafanaTempoToolset
from holmes.plugins.toolsets.internet import InternetToolset

from holmes.core.tools import Toolset, YAMLToolset
@@ -16,7 +19,9 @@
THIS_DIR = os.path.abspath(os.path.dirname(__file__))


def load_toolsets_from_file(path: str, silent_fail: bool = False, is_default: bool = False) -> List[YAMLToolset]:
def load_toolsets_from_file(
path: str, silent_fail: bool = False, is_default: bool = False
) -> List[YAMLToolset]:
file_toolsets = []
with open(path) as file:
parsed_yaml = yaml.safe_load(file)
@@ -26,25 +31,31 @@ def load_toolsets_from_file(path: str, silent_fail: bool = False, is_default: bo
toolset = YAMLToolset(**config, name=name, is_default=is_default)
toolset.set_path(path)
file_toolsets.append(YAMLToolset(**config, name=name))
except Exception as e:
except Exception:
if not silent_fail:
logging.error(f"Error happened while loading {name} toolset from {path}",
exc_info=True)
logging.error(
f"Error happened while loading {name} toolset from {path}",
exc_info=True,
)

return file_toolsets


def load_python_toolsets(dal:Optional[SupabaseDal]) -> List[Toolset]:
def load_python_toolsets(dal: Optional[SupabaseDal]) -> List[Toolset]:
logging.debug("loading python toolsets")
toolsets: list[Toolset] = [InternetToolset(), FindingsToolset(dal)]
toolsets: list[Toolset] = [
InternetToolset(),
FindingsToolset(dal),
OpenSearchToolset(),
GrafanaLokiToolset(),
GrafanaTempoToolset(),
]

opensearch = OpenSearchToolset()
toolsets.append(opensearch)
return toolsets


def load_builtin_toolsets(dal:Optional[SupabaseDal] = None) -> List[Toolset]:
all_toolsets: list[Toolset] = []
def load_builtin_toolsets(dal: Optional[SupabaseDal] = None) -> List[Toolset]:
all_toolsets = []
logging.debug(f"loading toolsets from {THIS_DIR}")
for filename in os.listdir(THIS_DIR):
if not filename.endswith(".yaml"):
Empty file.
39 changes: 39 additions & 0 deletions holmes/plugins/toolsets/grafana/base_grafana_toolset.py
@@ -0,0 +1,39 @@
import logging
from typing import Any
from holmes.core.tools import (
Tool,
Toolset,
ToolsetTag,
CallablePrerequisite,
)
from holmes.plugins.toolsets.grafana.common import GrafanaConfig
from holmes.plugins.toolsets.grafana.grafana_api import get_health


class BaseGrafanaToolset(Toolset):
def __init__(self, name: str, description: str, icon_url: str, tools: list[Tool]):
super().__init__(
name=name,
description=description,
icon_url=icon_url,
prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
tools=tools,
tags=[
ToolsetTag.CORE,
],
enabled=False
)

def prerequisites_callable(self, config: dict[str, Any]) -> bool:
if not config:
logging.warning("Grafana config not provided")
return False

try:
self._grafana_config = GrafanaConfig(**config)
is_healthy = get_health(self._grafana_config.url, self._grafana_config.api_key)
return is_healthy

except Exception:
logging.exception("Failed to set up grafana toolset")
return False
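
For context, concrete Grafana toolsets subclass this base and register their own tools. A hypothetical sketch follows; the names and values are assumptions for illustration only, and the real `GrafanaLokiToolset` and `GrafanaTempoToolset` live in separate files not fully shown in this excerpt:

```python
# Illustrative subclass sketch; names and values here are assumptions, not the real toolsets.
class ExampleGrafanaToolset(BaseGrafanaToolset):
    def __init__(self):
        super().__init__(
            name="grafana/example",
            description="Example toolset proxied through Grafana",
            icon_url="https://example.com/icon.png",
            tools=[],  # a real toolset registers its Tool instances here
        )
```
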
65 changes: 65 additions & 0 deletions holmes/plugins/toolsets/grafana/common.py
@@ -0,0 +1,65 @@
from typing import Dict, Optional, Union
import uuid
import time
import os
from pydantic import BaseModel


GRAFANA_URL_ENV_NAME = "GRAFANA_URL"
GRAFANA_API_KEY_ENV_NAME = "GRAFANA_API_KEY"
ONE_HOUR_IN_SECONDS = 3600


class GrafanaLokiConfig(BaseModel):
pod_name_search_key: str = "pod"
namespace_search_key: str = "namespace"
node_name_search_key: str = "node"


class GrafanaConfig(BaseModel):
loki: GrafanaLokiConfig = GrafanaLokiConfig()
api_key: str
url: str


def headers(api_key: str):
return {
"Authorization": f"Bearer {api_key}",
"Accept": "application/json",
"Content-Type": "application/json",
}


def process_timestamps(
start_timestamp: Optional[Union[int, str]], end_timestamp: Optional[Union[int, str]]
):
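    # end_timestamp defaults to now and start_timestamp to one hour before it;
    # a negative start_timestamp is treated as an offset relative to end_timestamp.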
if start_timestamp and isinstance(start_timestamp, str):
start_timestamp = int(start_timestamp)
if end_timestamp and isinstance(end_timestamp, str):
end_timestamp = int(end_timestamp)

if not end_timestamp:
end_timestamp = int(time.time())
if not start_timestamp:
start_timestamp = end_timestamp - ONE_HOUR_IN_SECONDS
if start_timestamp < 0:
start_timestamp = end_timestamp + start_timestamp
return (start_timestamp, end_timestamp)


def get_param_or_raise(dict: Dict, param: str) -> str:
value = dict.get(param)
if not value:
raise Exception(f'Missing param "{param}"')
return value


def get_datasource_id(dict: Dict, param: str) -> str:
datasource_id = get_param_or_raise(dict, param)
try:
if uuid.UUID(datasource_id, version=4):
return f"uid/{datasource_id}"
except:
pass

return datasource_id