From 7210ae26c29ae1ccec5313131f3b3da3b4ed70b8 Mon Sep 17 00:00:00 2001
From: cdxker
Date: Fri, 3 Jan 2025 11:21:10 -0800
Subject: [PATCH] ci: add CI action to reindex site into Trieve

---
 .../workflows/index-trieve-search-index.yaml | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .github/workflows/index-trieve-search-index.yaml

diff --git a/.github/workflows/index-trieve-search-index.yaml b/.github/workflows/index-trieve-search-index.yaml
new file mode 100644
index 000000000..3291912dc
--- /dev/null
+++ b/.github/workflows/index-trieve-search-index.yaml
@@ -0,0 +1,106 @@
+name: "Index Trieve Search Component"
+
+on:
+  # NOTE(review): this fires on every push to every branch; consider
+  # restricting to the default branch (e.g. branches: [main]) — confirm intent.
+  push:
+  workflow_dispatch:
+
+jobs:
+  create-search-index:
+    runs-on: ubuntu-latest
+    steps:
+      - name: update-curl
+        shell: bash
+        run: |
+          # -y keeps apt-get non-interactive; CI has no TTY to answer prompts
+          sudo apt-get update && sudo apt-get install -y curl
+
+      - name: "Ingest Trieve Search Index"
+        shell: bash
+        env:
+          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
+          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
+          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
+        run: |
+          # Target API; dataset/org/key come from the repo secrets above.
+          TRIEVE_URL="https://api.trieve.ai"
+
+          # Print the configuration this step needs. A quoted heredoc avoids
+          # the broken nested-quote escaping the previous version used.
+          all_envs() {
+            cat <<'EOF'
+          These github repo secrets must be set:
+            TRIEVE_ORGANIZATION_ID
+            TRIEVE_DATASET_ID
+            TRIEVE_API_KEY
+
+          alternatively you can modify the actions env's as such:
+            ORGANIZATION_ID
+            DATASET_ID
+            API_KEY
+          EOF
+          }
+
+          # Fail fast when any required env var is missing or empty.
+          [ -z "${ORGANIZATION_ID:-}" ] && echo "ORGANIZATION_ID is not found" && all_envs && exit 1
+          [ -z "${DATASET_ID:-}" ] && echo "DATASET_ID is not found" && all_envs && exit 1
+          [ -z "${API_KEY:-}" ] && echo "API_KEY is not found" && all_envs && exit 1
+
+          PAYLOAD='{
+            "crawl_options": {
+              "allow_external_links": null,
+              "boost_titles": true,
+              "exclude_paths": [],
+              "exclude_tags": [
+                "nav",
+                "img",
+                "pre",
+                ".PageFeedback_feedbackContainer___tGjJ",
+                ".doc-sidenav",
+                ".doc-toc",
+                "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md",
+                ".related-articles",
+                "aside",
+                "footer"
+              ],
+              "include_paths": [
+                "faqs",
+                "docs"
+              ],
+              "include_tags": [
+                ".doc-content",
+                "main"
+              ],
+              "interval": "daily",
+              "limit": 10000,
+              "site_url": "https://signoz.io/docs/",
+              "scrape_options": null
+            },
+            "dataset_id": "'${DATASET_ID}'"
+          }'
+
+          echo "Making call to ${TRIEVE_URL}/api/dataset to scrape dataset ${DATASET_ID} org ${ORGANIZATION_ID}"
+
+          uname -a
+          curl --version
+
+          # PUT the crawl options. The response body goes to a file and the
+          # HTTP status to stdout so success can be checked explicitly
+          # (previously the status was printed but never checked, so the job
+          # always passed; TR-Dataset also used the misspelled DATAST_ID).
+          http_code=$(curl -s -X PUT "${TRIEVE_URL}/api/dataset" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: ${API_KEY}" \
+            -H "TR-Organization: ${ORGANIZATION_ID}" \
+            -H "TR-Dataset: ${DATASET_ID}" \
+            -d "${PAYLOAD}" \
+            -o response_body.txt \
+            -w "%{http_code}")
+
+          if [ "$http_code" -eq 200 ]; then
+            echo "Crawling finished Successfully"
+          else
+            echo "Error: Received HTTP status code $http_code"
+            echo "Response:"
+            cat response_body.txt
+            exit 1
+          fi