Merge branch 'main' into fix/release-workflow
Showing 87 changed files with 3,140 additions and 625 deletions.
@@ -10,6 +10,9 @@ __pycache__/
# C extensions
*.so

# .csv files
*.csv

# Distribution / packaging
.Python
build/
@@ -0,0 +1,58 @@

# UNDER CONSTRUCTION

# First, create a kubernetes cluster


# Install Keep
gcloud config set project keep-dev-429814
gcloud container clusters get-credentials keep-stress --zone us-central1-c --project keep-dev-429814
helm repo add keephq https://keephq.github.io/helm-charts
helm pull keephq/keep
# create the namespace
kubectl create namespace keep
# install keep
helm install keep keephq/keep --namespace keep
# or install from a local checkout
helm install keep ./charts/keep --namespace keep

# check the pods
kubectl -n keep describe pod keep-backend-697f6b946f-v2jxp
kubectl -n keep logs keep-frontend-577fdf5497-r8ht9
# Import alerts

# uninstall
helm uninstall keep --namespace keep

# exec into the backend pod
kubectl -n keep exec -it keep-backend-64c4d7ddb7-7p5q5 -- /bin/bash
# copy the db
kubectl -n keep exec -it keep-database-86dd6b6775-92sz4 -- /bin/bash
kubectl -n keep cp ./keep.sql keep-database-659c69689-vxhkz:/tmp/keep.sql
kubectl -n keep exec -it keep-database-659c69689-vxhkz -- bash -c "mysql -u root keep < /tmp/keep.sql"
# exec into the database pod
kubectl -n keep exec -it keep-database-86dd6b6775-92sz4 -- /bin/bash
# import
kubectl -n keep exec -it keep-database-659c69689-vxhkz -- bash -c "mysql -u root keep < /tmp/keep.sql"

# No Load
## 500k alerts - 1Gi/250m cpu: get_last_alerts 2 minutes and 30 seconds
- Keep backend workers time out after one minute (HTTP 500s on the preset and alert endpoints)
## 500k alerts - 2Gi/500m cpu:
- default mysql: get_last_alerts 1 minute and 30 seconds
- innodb_buffer_pool_size = 4294967296: 25 seconds, 3 seconds after cache
## 500k alerts - 4Gi/1 cpu: get_last_alerts 2 minutes and 30 seconds
-
## 500k alerts - 8Gi/1 cpu: get_last_alerts 2 minutes and 30 seconds
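
# Assumption (not from the original notes): the innodb_buffer_pool_size result above can be reproduced by raising
# the buffer pool on the running MySQL pod, e.g. as below - adjust the pod name; SET GLOBAL does not survive a restart
kubectl -n keep exec -it keep-database-659c69689-vxhkz -- mysql -u root -e "SET GLOBAL innodb_buffer_pool_size = 4294967296;"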

# Load 10 alerts per minute

# Load 100 alerts per minute

# Load 1000 alerts per minute


## 1M alerts
# Load 10 alerts per minute

# Load 100 alerts per minute

# Load 1000 alerts per minute
@@ -0,0 +1,112 @@
---
title: ""
sidebarTitle: "Specifications"
---

# Specifications and Stress Testing of Keep
<Tip>If you are using Keep and have performance issues, we will be more than happy to help you. Just join our [Slack](https://slack.keephq.dev) and shoot a message in the **#help** channel.</Tip>

## Overview

Spec and stress testing are crucial to ensuring the robust performance and scalability of Keep.
This documentation outlines the key areas of focus for testing Keep under different load conditions, considering both the simplicity of setup for smaller environments and the scalability mechanisms for larger deployments.

Keep was initially designed to be user-friendly for setups handling fewer than 10,000 alerts. However, as alert volumes increase, users can leverage advanced features such as Elasticsearch for document storage and Redis + ARQ for queue-based alert ingestion. While these advanced configurations are not fully documented here, they are supported and can be discussed further in our Slack community.

## How To Reproduce

To reproduce the stress testing scenarios described in this document, please refer to the [STRESS.md](https://github.com/keephq/keep/blob/main/STRESS.md) file in Keep's repository. It provides step-by-step instructions on how to set up, run, and measure the performance of Keep under different load conditions.

## Performance Testing

### Factors Affecting Specifications

The primary parameters that affect the specification requirements for Keep are:
1. **Alerts Volume**: The rate at which alerts are ingested into the system.
2. **Total Alerts**: The cumulative number of alerts stored in the system.
3. **Number of Workflows**: How many automations run as a result of each alert.

### Main Components:
- **Keep Backend** - API and business logic. A container that serves FastAPI on top of Gunicorn.
- **Keep Frontend** - Web app. A container that serves the React app.
- **Database** - Stores the alerts and any other operational data.
- **Elasticsearch** (disabled by default) - Stores alerts as documents for better search performance.
- **Redis** (disabled by default) - Used, together with ARQ, as an alerts queue.
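
If Keep is deployed with the Helm chart referenced in STRESS.md, these components map roughly to separate Kubernetes workloads. A quick way to see what is actually running (the `keep` namespace is an assumption carried over from that setup):

```bash
# List the Keep workloads and services in the namespace used by the Helm install.
kubectl -n keep get deployments,statefulsets,services
```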

### Testing Scenarios:

1. **Low Volume (< 10,000 total alerts, 100's of alerts per day)**:
   - **Setup**: Use a standard relational database (e.g., MySQL, PostgreSQL) with default configurations.
   - **Expectations**: Keep should handle queries and alert ingestion with minimal resource usage.

2. **Medium Volume (10,000 - 100,000 total alerts, 1000's of alerts per day)**:
   - **Setup**: Scale the database to larger instances or clusters and apply database best practices (e.g., increasing `innodb_buffer_pool_size`).
   - **Expectations**: CPU and RAM usage should increase proportionally but remain within acceptable limits.

3. **High Volume (100,000 - 1,000,000 total alerts, 5000's of alerts per day)**:
   - **Setup**: Deploy Keep with Elasticsearch for storing alerts as documents.
   - **Expectations**: The system should maintain performance levels despite the large alert volume, with increased resource usage managed through scaling strategies.

4. **Very High Volume (> 1,000,000 total alerts, 10k's of alerts per day)**:
   - **Setup**: Deploy Keep with Elasticsearch for storing alerts as documents.
   - **Setup #2**: Deploy Keep with Redis and ARQ to use Redis as an ingestion queue.
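
A minimal sketch of standing up Redis next to Keep on the same cluster, assuming the Bitnami chart and the `keep` namespace from STRESS.md. How Keep is pointed at this Redis instance depends on your deployment configuration, so treat this as a starting point rather than the documented procedure:

```bash
# Install a standalone Redis instance in the same namespace as Keep.
helm repo add bitnami https://charts.bitnami.com/bitnami
helm install redis bitnami/redis --namespace keep --set architecture=standalone
```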

## Recommended Specifications by Alert Volume

| **Number of Alerts**   | **Keep Backend**   | **Keep Database**                              | **Redis**        | **Elasticsearch**            |
|------------------------|--------------------|------------------------------------------------|------------------|------------------------------|
| **< 10,000**           | 1 vCPU, 2GB RAM    | 2 vCPUs, 8GB RAM                               | Not required     | Not required                 |
| **10,000 - 100,000**   | 4 vCPUs, 8GB RAM   | 8 vCPUs, 32GB RAM, optimized indexing          | Not required     | Not required                 |
| **100,000 - 500,000**  | 8 vCPUs, 16GB RAM  | 8 vCPUs, 32GB RAM, advanced indexing           | 4 vCPUs, 8GB RAM | 8 vCPUs, 32GB RAM, 2-3 nodes |
| **> 500,000**          | 8 vCPUs, 16GB RAM  | 8 vCPUs, 32GB RAM, advanced indexing, sharding | 4 vCPUs, 8GB RAM | 8 vCPUs, 32GB RAM, 2-3 nodes |
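
As a rough illustration of applying the table above to a Kubernetes install, the requests below target the 10,000 - 100,000 tier. The deployment names follow the Helm-chart naming seen in STRESS.md and are an assumption, so adjust them to your own install:

```bash
# Request resources for the backend and database per the 10,000 - 100,000 alerts row.
kubectl -n keep set resources deployment keep-backend --requests=cpu=4,memory=8Gi
kubectl -n keep set resources deployment keep-database --requests=cpu=8,memory=32Gi
```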

## Performance by Operation Type, Load, and Specification

| **Operation Type** | **Load**                 | **Specification**         | **Execution Time** |
|--------------------|--------------------------|---------------------------|--------------------|
| Digest Alert       | 100 alerts per minute    | 4 vCPUs, 8GB RAM          | ~0.5 seconds       |
| Digest Alert       | 500 alerts per minute    | 8 vCPUs, 16GB RAM         | ~1 second          |
| Digest Alert       | 1,000 alerts per minute  | 16 vCPUs, 32GB RAM        | ~1.5 seconds       |
| Run Workflow       | 10 workflows per minute  | 4 vCPUs, 8GB RAM          | ~1 second          |
| Run Workflow       | 50 workflows per minute  | 8 vCPUs, 16GB RAM         | ~2 seconds         |
| Run Workflow       | 100 workflows per minute | 16 vCPUs, 32GB RAM        | ~3 seconds         |
| Ingest via Queue   | 100 alerts per minute    | 4 vCPUs, 8GB RAM, Redis   | ~0.3 seconds       |
| Ingest via Queue   | 500 alerts per minute    | 8 vCPUs, 16GB RAM, Redis  | ~0.8 seconds       |
| Ingest via Queue   | 1,000 alerts per minute  | 16 vCPUs, 32GB RAM, Redis | ~1.2 seconds       |

### Table Explanation:
- **Operation Type**: The specific operation being tested (e.g., digesting alerts, running workflows).
- **Load**: The number of operations per minute being processed (e.g., number of alerts per minute).
- **Specification**: The CPU, RAM, and additional services used for the operation.
- **Execution Time**: Approximate time taken to complete the operation under the given load and specification.

## Fine Tuning

Since every deployment has its own characteristics, such as the balance between ingestion volume and total alert count, or between volume and number of workflows, Keep can be fine-tuned with the following parameters:

1. **Number of Workers**: Adjust the number of Gunicorn workers to handle API requests more efficiently (see the sketch after this list). You can also start additional API servers to distribute the load.
2. **Distinguish Between API Server Workers and Alert-Digesting Workers**: Separate the workers dedicated to handling API requests from those responsible for digesting alerts, ensuring that each set of tasks is optimized according to its specific needs.
3. **Add More RAM to the Database**: Increasing the RAM allocated to your database can help manage larger datasets and improve query performance, particularly when dealing with high volumes of alerts.
4. **Optimize Database Configuration**: Keep was mainly tested on MySQL and PostgreSQL. Other databases may offer different fine-tuning mechanisms.
5. **Horizontal Scaling**: Consider deploying additional instances of the API and database services to distribute the load more effectively.
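
A minimal sketch of the first lever, scaling Gunicorn workers. The application module path and port below are assumptions (they depend on how your Keep image starts the API), so adapt them to your deployment:

```bash
# Hypothetical invocation: run the FastAPI app under Gunicorn with 4 Uvicorn workers.
gunicorn keep.api.api:app \
  --workers 4 \
  --worker-class uvicorn.workers.UvicornWorker \
  --bind 0.0.0.0:8080
```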

## FAQ

### 1. How do I estimate the spec I need for Keep?
To estimate the specifications required for Keep, consider both the number of alerts per minute and the total number of alerts you expect to handle. Refer to the **Recommended Specifications by Alert Volume** table above to match your expected load with the appropriate resources.

### 2. How do I know if I need Elasticsearch?
Elasticsearch is typically needed when you are dealing with more than 50,000 total alerts or if you require advanced search and query capabilities that are not efficiently handled by a traditional relational database. If your system's performance degrades significantly as alert volume increases, it may be time to consider Elasticsearch.

### 3. How do I know if I need Redis?
Redis is recommended when your alert ingestion rate exceeds 1,000 alerts per minute or when you notice that the API is becoming a bottleneck due to high ingestion rates. Redis, combined with ARQ (Asynchronous Redis Queue), can help manage and distribute the load more effectively.

### 4. What should I do if Keep's performance is still inadequate?
If you have scaled according to the recommendations and are still facing performance issues, consider:
- **Optimizing your database configuration**: Indexing, sharding, and query optimization can make a significant difference.
- **Horizontal scaling**: Distribute the load across multiple instances of the API and database services.
- **Reaching out to our Slack community**: For personalized support, message us on Slack and we'll help you troubleshoot and optimize your Keep deployment.

For any additional questions or tailored advice, feel free to join our Slack community, where our team and other users are available to assist you.
@@ -0,0 +1,70 @@
---
title: 'Coralogix'
sidebarTitle: 'Coralogix Provider'
description: 'Coralogix provider allows you to send alerts from Coralogix to Keep using webhooks.'
---

## Overview

Coralogix is a modern observability platform that delivers comprehensive visibility into all your logs, metrics, traces, and security events with end-to-end monitoring.

## Connecting Coralogix to Keep

To connect Coralogix to Keep, you need to configure a webhook in Coralogix. Follow the steps below to set up the integration:

1. From the Coralogix toolbar, navigate to Data Flow > Outbound Webhooks.

<Frame
    width="100"
    height="200">
    <img height="10" src="/images/coralogix-provider_1.png" />
</Frame>

2. In the Outbound Webhooks section, click Generic Webhook.

<Frame
    width="100"
    height="200">
    <img height="10" src="/images/coralogix-provider_2.png" />
</Frame>

3. Click Add New.

<Frame
    width="100"
    height="200">
    <img height="10" src="/images/coralogix-provider_3.png" />
</Frame>

4. Enter a webhook name and set the URL to `https://api.keephq.dev/alerts/event/coralogix`.
5. Select the HTTP method (POST).

<Frame
    width="100"
    height="200">
    <img height="10" src="/images/coralogix-provider_4.png" />
</Frame>

6. Generate an API key with the webhook role from the [Keep settings](https://platform.keephq.dev/settings?selectedTab=api-key). Copy the API key; you will paste it into the request header in the next step.

<Frame
    width="100"
    height="200">
    <img height="10" src="/images/coralogix-provider_5.png" />
</Frame>

7. In the Coralogix webhook configuration, add a request header with the key `x-api-key` and the API key as its value.

<Frame
    width="100"
    height="200">
    <img height="10" src="/images/coralogix-provider_6.png" />
</Frame>

8. Optionally, edit the body of the messages that will be sent when the webhook is triggered.
9. Save the configuration.
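
To sanity-check the integration end to end, you can send a test event directly to the same endpoint with the API key from step 6. This is a sketch rather than part of the official setup: the payload fields below are illustrative and should mirror whatever body you configured in step 8.

```bash
# Replace YOUR_API_KEY with the webhook-role API key generated in step 6.
# The JSON fields are placeholders; use the same body shape as your Coralogix webhook.
curl -X POST "https://api.keephq.dev/alerts/event/coralogix" \
  -H "Content-Type: application/json" \
  -H "x-api-key: YOUR_API_KEY" \
  -d '{"alert_name": "test-alert", "severity": "critical", "description": "connectivity check"}'
```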

## Useful Links

- [Coralogix Website](https://coralogix.com/)