From 7210ae26c29ae1ccec5313131f3b3da3b4ed70b8 Mon Sep 17 00:00:00 2001
From: cdxker
Date: Fri, 3 Jan 2025 11:21:10 -0800
Subject: [PATCH] ci: add CI action to reindex site into Trieve

---
 .../workflows/index-trieve-search-index.yaml | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .github/workflows/index-trieve-search-index.yaml

diff --git a/.github/workflows/index-trieve-search-index.yaml b/.github/workflows/index-trieve-search-index.yaml
new file mode 100644
index 000000000..3291912dc
--- /dev/null
+++ b/.github/workflows/index-trieve-search-index.yaml
@@ -0,0 +1,106 @@
+name: "Index Trieve Search Component"
+
+on:
+  # NOTE(review): this fires on every push to every branch; consider
+  # restricting to the default branch (e.g. branches: [main]) — confirm intent.
+  push:
+  workflow_dispatch:
+
+jobs:
+  create-search-index:
+    runs-on: ubuntu-latest
+    steps:
+      - name: update-curl
+        shell: bash
+        run: |
+          # -y keeps apt-get non-interactive; CI has no TTY to answer prompts
+          sudo apt-get update && sudo apt-get install -y curl
+
+      - name: "Ingest Trieve Search Index"
+        shell: bash
+        env:
+          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
+          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
+          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
+        run: |
+          # Target API; dataset/org/key come from the repo secrets above.
+          TRIEVE_URL="https://api.trieve.ai"
+
+          # Print the configuration this step needs. A quoted heredoc avoids
+          # the broken nested-quote escaping the previous version used.
+          all_envs() {
+            cat <<'EOF'
+          These github repo secrets must be set:
+            TRIEVE_ORGANIZATION_ID
+            TRIEVE_DATASET_ID
+            TRIEVE_API_KEY
+
+          alternatively you can modify the actions env's as such:
+            ORGANIZATION_ID
+            DATASET_ID
+            API_KEY
+          EOF
+          }
+
+          # Fail fast when any required env var is missing or empty.
+          [ -z "${ORGANIZATION_ID:-}" ] && echo "ORGANIZATION_ID is not found" && all_envs && exit 1
+          [ -z "${DATASET_ID:-}" ] && echo "DATASET_ID is not found" && all_envs && exit 1
+          [ -z "${API_KEY:-}" ] && echo "API_KEY is not found" && all_envs && exit 1
+
+          PAYLOAD='{
+            "crawl_options": {
+              "allow_external_links": null,
+              "boost_titles": true,
+              "exclude_paths": [],
+              "exclude_tags": [
+                "nav",
+                "img",
+                "pre",
+                ".PageFeedback_feedbackContainer___tGjJ",
+                ".doc-sidenav",
+                ".doc-toc",
+                "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md",
+                ".related-articles",
+                "aside",
+                "footer"
+              ],
+              "include_paths": [
+                "faqs",
+                "docs"
+              ],
+              "include_tags": [
+                ".doc-content",
+                "main"
+              ],
+              "interval": "daily",
+              "limit": 10000,
+              "site_url": "https://signoz.io/docs/",
+              "scrape_options": null
+            },
+            "dataset_id": "'${DATASET_ID}'"
+          }'
+
+          echo "Making call to ${TRIEVE_URL}/api/dataset to scrape dataset ${DATASET_ID} org ${ORGANIZATION_ID}"
+
+          uname -a
+          curl --version
+
+          # PUT the crawl options. The response body goes to a file and the
+          # HTTP status to stdout so success can be checked explicitly
+          # (previously the status was printed but never checked, so the job
+          # always passed; TR-Dataset also used the misspelled DATAST_ID).
+          http_code=$(curl -s -X PUT "${TRIEVE_URL}/api/dataset" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: ${API_KEY}" \
+            -H "TR-Organization: ${ORGANIZATION_ID}" \
+            -H "TR-Dataset: ${DATASET_ID}" \
+            -d "${PAYLOAD}" \
+            -o response_body.txt \
+            -w "%{http_code}")
+
+          if [ "$http_code" -eq 200 ]; then
+            echo "Crawling finished Successfully"
+          else
+            echo "Error: Received HTTP status code $http_code"
+            echo "Response:"
+            cat response_body.txt
+            exit 1
+          fi