From 146ba9ba7d53536ce44963f8f2b5d6ec0c8582c3 Mon Sep 17 00:00:00 2001
From: cdxker
Date: Fri, 3 Jan 2025 11:21:10 -0800
Subject: [PATCH] ci: add CI action to reindex site into Trieve

---
 .../workflows/index-trieve-search-index.yaml | 19 ++++++++
 .github/workflows/index.sh                   | 48 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 .github/workflows/index-trieve-search-index.yaml
 create mode 100755 .github/workflows/index.sh

diff --git a/.github/workflows/index-trieve-search-index.yaml b/.github/workflows/index-trieve-search-index.yaml
new file mode 100644
index 000000000..fabfc4bcd
--- /dev/null
+++ b/.github/workflows/index-trieve-search-index.yaml
@@ -0,0 +1,19 @@
+name: "Index Trieve Search Component"
+
+on:
+  push:
+  workflow_dispatch:
+
+jobs:
+  create-search-index:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: "Ingest Trieve Search Index"
+        shell: bash
+        env:
+          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
+          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
+          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
+        run: |
+          bash .github/workflows/index.sh
diff --git a/.github/workflows/index.sh b/.github/workflows/index.sh
new file mode 100755
index 000000000..6a1bab5c7
--- /dev/null
+++ b/.github/workflows/index.sh
@@ -0,0 +1,48 @@
+# Trieve API endpoint; credentials are read from the environment
+TRIEVE_URL="https://api.trieve.ai"
+
+all_envs() {
+echo "
+These GitHub repo secrets must be set:
+TRIEVE_ORGANIZATION_ID=\"\"
+TRIEVE_DATASET_ID=\"\"
+TRIEVE_API_KEY=\"\"
+
+Alternatively, you can set the action's env vars directly:
+ORGANIZATION_ID=\"\"
+DATASET_ID=\"\"
+API_KEY=\"\"
+"
+}
+
+[ -z "$ORGANIZATION_ID" ] && echo "ORGANIZATION_ID is not set" && all_envs && exit 1
+[ -z "$DATASET_ID" ] && echo "DATASET_ID is not set" && all_envs && exit 1
+[ -z "$API_KEY" ] && echo "API_KEY is not set" && all_envs && exit 1
+
+PAYLOAD='{ "crawl_options": { "allow_external_links": null, "boost_titles": true, "exclude_paths": [], "exclude_tags": [ "nav", "img", "pre", ".PageFeedback_feedbackContainer___tGjJ", ".doc-sidenav", ".doc-toc", "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md", ".related-articles", "aside", "footer" ], "include_paths": [ "faqs", "docs" ], "include_tags": [ ".doc-content", "main" ], "interval": "daily", "limit": 10000, "site_url": "https://signoz.io/docs/", "scrape_options": null }, "dataset_id": "'${DATASET_ID}'" }'
+
+echo "Payload: $PAYLOAD"
+
+# Make the API call using curl and capture the response code
+response=$(curl -s -X PUT "${TRIEVE_URL}/api/dataset" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: ${API_KEY}" \
+  -H "TR-Organization: ${ORGANIZATION_ID}" \
+  -H "TR-Dataset: ${DATASET_ID}" \
+  -d "${PAYLOAD}" \
+  -w "\n%{http_code}")
+
+# Extract the response code from the last line
+http_code=$(echo "$response" | tail -n1)
+# Extract the response body (everything except the last line)
+response_body=$(echo "$response" | sed '$d')
+
+# Check if the response code is 200
+if [ "$http_code" -eq 200 ]; then
+  echo "Crawling started successfully"
+  exit 0
+else
+  echo "Error: Received HTTP status code $http_code"
+  echo "Response: $response_body"
+  exit 1
+fi
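
For reference, a minimal sketch of how the new script could be exercised locally before relying on the workflow. The placeholder values are illustrative only (in CI the real values come from the TRIEVE_* repository secrets), and it assumes the command is run from the repository root:

# Hypothetical local dry run of .github/workflows/index.sh (placeholder credentials)
export ORGANIZATION_ID="<trieve-organization-id>"
export DATASET_ID="<trieve-dataset-id>"
export API_KEY="<trieve-api-key>"
bash .github/workflows/index.sh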