Skip to content

Commit

Permalink
ci: add CI action to reindex site into Trieve
Browse files Browse the repository at this point in the history
  • Loading branch information
cdxker committed Jan 6, 2025
1 parent 14c4750 commit 0b4c43f
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 0 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/index-trieve-search-index.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Re-indexes the site into Trieve by submitting crawl options for the dataset.
# Two jobs perform the same request: one through the Bash script and one
# through the Nushell script in this directory.
#
# Configuration is read from repository secrets first (which is what the
# scripts' help text asks for, and keeps the API key masked in logs), falling
# back to repository variables of the same name for existing setups.
name: "Index Trieve Search Component"

on:
  push:
  workflow_dispatch:

jobs:
  create-search-index:
    # ubuntu-20.04 hosted runners have been retired; use the same image as the
    # Nushell job below.
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
      - name: "Ingest Trieve Search Index"
        shell: bash
        env:
          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID || vars.TRIEVE_ORGANIZATION_ID }}
          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID || vars.TRIEVE_DATASET_ID }}
          API_KEY: ${{ secrets.TRIEVE_API_KEY || vars.TRIEVE_API_KEY }}
        run: |
          bash .github/workflows/index.sh
  create-search-index-nushell:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4

      - name: "Install nushell"
        uses: hustcer/setup-nu@main
        with:
          version: "*"

      - name: "Ingest Trieve Search Index"
        shell: nu {0}
        env:
          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID || vars.TRIEVE_ORGANIZATION_ID }}
          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID || vars.TRIEVE_DATASET_ID }}
          API_KEY: ${{ secrets.TRIEVE_API_KEY || vars.TRIEVE_API_KEY }}
        run: |
          nu .github/workflows/index.nu
77 changes: 77 additions & 0 deletions .github/workflows/index.nu
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Base URL of the Trieve API; the endpoint path is appended where the request
# is made.
let TRIEVE_URL = "https://api.trieve.ai"

# Returns the help text listing the settings this script needs.
# The text is returned as a string, not printed, so a caller that wants it
# shown has to output the result itself.
def all_envs [] {
    "
These github repo secrets must be set:
TRIEVE_ORGANIZATION_ID=<your-trieve-organization-id>
TRIEVE_DATASET_ID=<your-trieve-dataset-id>
TRIEVE_API_KEY=<your-trieve-api-key>
alternatively you can modify the actions env's as such
ORGANIZATION_ID=<your-trieve-organization-id>
DATASET_ID=<your-trieve-dataset-id>
API_KEY=<your-trieve-api-key>
"
}

# Check environment variables before contacting the API.
#
# `$env.NAME?` evaluates to null when the variable is not defined, so an
# unset variable and an empty one both reach the friendly message instead of
# raising a "column not found" error.
#
# `print` is used on purpose: inside a block only the value of the last
# statement is kept, so `echo`-ing a message before `exit 1` would never be
# displayed.
if ($env.ORGANIZATION_ID? | is-empty) {
    print "ORGANIZATION_ID is not found"
    print (all_envs)
    exit 1
}
if ($env.DATASET_ID? | is-empty) {
    print "DATASET_ID is not found"
    print (all_envs)
    exit 1
}
if ($env.API_KEY? | is-empty) {
    print "API_KEY is not found"
    print (all_envs)
    exit 1
}

# Entries sent as `exclude_tags` in the crawl options.
let excluded_selectors = [
    "nav"
    "img"
    "pre"
    ".PageFeedback_feedbackContainer___tGjJ"
    ".doc-sidenav"
    ".doc-toc"
    "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md"
    ".related-articles"
    "aside"
    "footer"
]

# Crawl configuration for https://signoz.io/docs/. Field order is kept so the
# serialized JSON matches what was sent before.
let crawl_settings = {
    allow_external_links: null
    boost_titles: true
    exclude_paths: []
    exclude_tags: $excluded_selectors
    include_paths: ["faqs" "docs"]
    include_tags: [".doc-content" "main"]
    interval: "daily"
    limit: 10000
    site_url: "https://signoz.io/docs/"
    scrape_options: null
}

# Request body: the crawl configuration plus the dataset it applies to.
let PAYLOAD = {
    crawl_options: $crawl_settings
    dataset_id: $env.DATASET_ID
}

# Show the exact body in the job log.
echo $"Payload ($PAYLOAD | to json)"

# Headers sent with the request: organization, dataset and API key.
let headers = {
    "TR-Organization": $env.ORGANIZATION_ID
    "TR-Dataset": $env.DATASET_ID
    "Authorization": $env.API_KEY
}

# PUT the crawl configuration to the dataset endpoint.
# --full returns a record with `status`, `headers` and `body`;
# --allow-errors stops a non-2xx answer from raising, so the status and body
# can be reported below before failing the job.
let response = (
    http put --full --allow-errors --content-type application/json --headers $headers $"($TRIEVE_URL)/api/dataset" $PAYLOAD
)

print $"Response: ($response.body | to json)"

# Fail the job unless the API answered with a 2xx status; previously an error
# response was printed but the script still exited successfully.
if ($response.status >= 200) and ($response.status < 300) {
    print "Crawling started Successfully"
} else {
    print $"Error: Received HTTP status code ($response.status)"
    exit 1
}
49 changes: 49 additions & 0 deletions .github/workflows/index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Base URL of the Trieve API; the endpoint path is appended where the request
# is made.
TRIEVE_URL="https://api.trieve.ai"

# Print the list of settings this script requires to stdout.
# A quoted heredoc is used so the angle brackets need no escaping and nothing
# inside the text is expanded by the shell.
all_envs() {
  cat <<'EOF'

These github repo secrets must be set:
TRIEVE_ORGANIZATION_ID=<your-trieve-organization-id>
TRIEVE_DATASET_ID=<your-trieve-dataset-id>
TRIEVE_API_KEY=<your-trieve-api-key>
alternatively you can modify the actions env's as such
ORGANIZATION_ID=<your-trieve-organization-id>
DATASET_ID=<your-trieve-dataset-id>
API_KEY=<your-trieve-api-key>

EOF
}

# Abort with a help message when any required variable is unset or empty.
# The expansion is quoted so a value containing whitespace cannot break the
# test, and an empty value is tested as an empty string rather than vanishing
# from the command line. ${!name} is bash indirect expansion: the value of the
# variable whose name is stored in `name`.
for required_var in ORGANIZATION_ID DATASET_ID API_KEY; do
  if [ -z "${!required_var}" ]; then
    echo "${required_var} is not found"
    all_envs
    exit 1
  fi
done

# Entries of the "exclude_tags" array in the crawl options.
EXCLUDE_TAGS='"nav", "img", "pre", ".PageFeedback_feedbackContainer___tGjJ", ".doc-sidenav", ".doc-toc", "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md", ".related-articles", "aside", "footer"'

# Crawl configuration object for https://signoz.io/docs/.
CRAWL_OPTIONS='{ "allow_external_links": null, "boost_titles": true, "exclude_paths": [], "exclude_tags": [ '"${EXCLUDE_TAGS}"' ], "include_paths": [ "faqs", "docs" ], "include_tags": [ ".doc-content", "main" ], "interval": "daily", "limit": 10000, "site_url": "https://signoz.io/docs/", "scrape_options": null }'

# Request body: the crawl configuration plus the dataset it applies to.
# The pieces are joined so the resulting JSON text is identical to the
# single-line literal it replaces.
PAYLOAD='{ "crawl_options": '"${CRAWL_OPTIONS}"', "dataset_id": "'"${DATASET_ID}"'" }'

# Show the exact body in the job log.
printf 'Payload %s\n' "$PAYLOAD"

# Make the API call using curl and capture the body plus the status code.
# -w appends the HTTP status on its own final line; -sS hides the progress
# meter (the output is captured) while still reporting transport errors.
response=$(curl -sS -X PUT "${TRIEVE_URL}/api/dataset" \
  -H "Content-Type: application/json" \
  -H "Authorization: ${API_KEY}" \
  -H "TR-Organization: ${ORGANIZATION_ID}" \
  -H "TR-Dataset: ${DATASET_ID}" \
  -d "${PAYLOAD}" \
  -w "\n%{http_code}") || {
  # curl itself failed (DNS, TLS, connection refused, ...).
  echo "Error: request to ${TRIEVE_URL}/api/dataset could not be completed"
  exit 1
}

# Extract the response code from the last line
http_code=$(printf '%s\n' "$response" | tail -n1)
# Extract the response body (everything except the last line)
response_body=$(printf '%s\n' "$response" | sed '$d')

# Fail the job unless the API answered with a 2xx status; previously an error
# response was printed but the script still exited 0.
if [ "$http_code" -ge 200 ] && [ "$http_code" -lt 300 ]; then
  echo "Crawling started Successfully"
  echo "Response: $response_body"
  exit 0
else
  echo "Error: Received HTTP status code $http_code"
  echo "Response: $response_body"
  exit 1
fi

0 comments on commit 0b4c43f

Please sign in to comment.