From 146ba9ba7d53536ce44963f8f2b5d6ec0c8582c3 Mon Sep 17 00:00:00 2001
From: cdxker
Date: Fri, 3 Jan 2025 11:21:10 -0800
Subject: [PATCH] ci: add CI action to reindex site into Trieve

---
 .../workflows/index-trieve-search-index.yaml | 19 ++++++++
 .github/workflows/index.sh                   | 48 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 .github/workflows/index-trieve-search-index.yaml
 create mode 100755 .github/workflows/index.sh

diff --git a/.github/workflows/index-trieve-search-index.yaml b/.github/workflows/index-trieve-search-index.yaml
new file mode 100644
index 000000000..fabfc4bcd
--- /dev/null
+++ b/.github/workflows/index-trieve-search-index.yaml
@@ -0,0 +1,19 @@
+name: "Index Trieve Search Component"
+
+on:
+  push:
+  workflow_dispatch:
+
+jobs:
+  create-search-index:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: "Ingest Trieve Search Index"
+        shell: bash
+        env:
+          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
+          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
+          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
+        run: |
+          bash .github/workflows/index.sh
diff --git a/.github/workflows/index.sh b/.github/workflows/index.sh
new file mode 100755
index 000000000..6a1bab5c7
--- /dev/null
+++ b/.github/workflows/index.sh
@@ -0,0 +1,48 @@
+# Trieve API endpoint; credentials are read from the environment
+TRIEVE_URL="https://api.trieve.ai"
+
+all_envs() {
+echo "
+These GitHub repo secrets must be set:
+TRIEVE_ORGANIZATION_ID=\"\"
+TRIEVE_DATASET_ID=\"\"
+TRIEVE_API_KEY=\"\"
+
+Alternatively, you can set the action's env vars directly:
+ORGANIZATION_ID=\"\"
+DATASET_ID=\"\"
+API_KEY=\"\"
+"
+}
+
+[ -z "$ORGANIZATION_ID" ] && echo "ORGANIZATION_ID is not set" && all_envs && exit 1
+[ -z "$DATASET_ID" ] && echo "DATASET_ID is not set" && all_envs && exit 1
+[ -z "$API_KEY" ] && echo "API_KEY is not set" && all_envs && exit 1
+
+PAYLOAD='{ "crawl_options": { "allow_external_links": null, "boost_titles": true, "exclude_paths": [], "exclude_tags": [ "nav", "img", "pre", ".PageFeedback_feedbackContainer___tGjJ", ".doc-sidenav", ".doc-toc", "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md", ".related-articles", "aside", "footer" ], "include_paths": [ "faqs", "docs" ], "include_tags": [ ".doc-content", "main" ], "interval": "daily", "limit": 10000, "site_url": "https://signoz.io/docs/", "scrape_options": null }, "dataset_id": "'${DATASET_ID}'" }'
+
+echo "Payload: $PAYLOAD"
+
+# Make the API call using curl and capture the response code
+response=$(curl -s -X PUT "${TRIEVE_URL}/api/dataset" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: ${API_KEY}" \
+  -H "TR-Organization: ${ORGANIZATION_ID}" \
+  -H "TR-Dataset: ${DATASET_ID}" \
+  -d "${PAYLOAD}" \
+  -w "\n%{http_code}")
+
+# Extract the response code from the last line
+http_code=$(echo "$response" | tail -n1)
+# Extract the response body (everything except the last line)
+response_body=$(echo "$response" | sed '$d')
+
+# Check if the response code is 200
+if [ "$http_code" -eq 200 ]; then
+  echo "Crawling started successfully"
+  exit 0
+else
+  echo "Error: Received HTTP status code $http_code"
+  echo "Response: $response_body"
+  exit 1
+fi
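
For reference, a minimal sketch of how the new script could be exercised locally before relying on the workflow. The placeholder values are illustrative only (in CI the real values come from the TRIEVE_* repository secrets), and it assumes the command is run from the repository root:

# Hypothetical local dry run of .github/workflows/index.sh (placeholder credentials)
export ORGANIZATION_ID="<trieve-organization-id>"
export DATASET_ID="<trieve-dataset-id>"
export API_KEY="<trieve-api-key>"
bash .github/workflows/index.sh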