From 0b4c43fe7c5ea6d1edc748d6f28398923825ccb2 Mon Sep 17 00:00:00 2001
From: cdxker
Date: Fri, 3 Jan 2025 11:21:10 -0800
Subject: [PATCH] ci: add CI action to reindex site into Trieve

---
 .../workflows/index-trieve-search-index.yaml  | 43 ++++++++++
 .github/workflows/index.nu                    | 83 ++++++++++++++++++++
 .github/workflows/index.sh                    | 58 ++++++++++++++
 3 files changed, 184 insertions(+)
 create mode 100644 .github/workflows/index-trieve-search-index.yaml
 create mode 100644 .github/workflows/index.nu
 create mode 100755 .github/workflows/index.sh

diff --git a/.github/workflows/index-trieve-search-index.yaml b/.github/workflows/index-trieve-search-index.yaml
new file mode 100644
index 000000000..68b02c824
--- /dev/null
+++ b/.github/workflows/index-trieve-search-index.yaml
@@ -0,0 +1,43 @@
+name: "Index Trieve Search Component"
+
+# Sends the crawl configuration for https://signoz.io/docs/ to Trieve.
+# The bash job and the nushell job run equivalent scripts with the same env.
+on:
+  push:
+  workflow_dispatch:
+
+jobs:
+  create-search-index:
+    # The ubuntu-20.04 hosted image has been retired; match the nushell job.
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v4
+      - name: "Ingest Trieve Search Index"
+        shell: bash
+        env:
+          ORGANIZATION_ID: ${{ vars.TRIEVE_ORGANIZATION_ID }}
+          DATASET_ID: ${{ vars.TRIEVE_DATASET_ID }}
+          # Prefer an encrypted secret for the key; fall back to the variable.
+          API_KEY: ${{ secrets.TRIEVE_API_KEY || vars.TRIEVE_API_KEY }}
+        run: |
+          bash .github/workflows/index.sh
+
+  create-search-index-nushell:
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: "Install nushell"
+        uses: hustcer/setup-nu@main
+        with:
+          version: "*"
+
+      - name: "Ingest Trieve Search Index"
+        shell: nu {0}
+        env:
+          ORGANIZATION_ID: ${{ vars.TRIEVE_ORGANIZATION_ID }}
+          DATASET_ID: ${{ vars.TRIEVE_DATASET_ID }}
+          # Prefer an encrypted secret for the key; fall back to the variable.
+          API_KEY: ${{ secrets.TRIEVE_API_KEY || vars.TRIEVE_API_KEY }}
+        run: |
+          nu .github/workflows/index.nu
diff --git a/.github/workflows/index.nu b/.github/workflows/index.nu
new file mode 100644
index 000000000..7cae713af
--- /dev/null
+++ b/.github/workflows/index.nu
@@ -0,0 +1,83 @@
+# Sends the docs-site crawl configuration to the Trieve dataset API.
+# Reads ORGANIZATION_ID, DATASET_ID and API_KEY from the environment.
+let TRIEVE_URL = "https://api.trieve.ai"
+
+# Print the list of settings this script needs.
+def all_envs [] {
+print "
+These github repo secrets must be set:
+TRIEVE_ORGANIZATION_ID=
+TRIEVE_DATASET_ID=
+TRIEVE_API_KEY=
+
+alternatively you can modify the actions env's as such
+ORGANIZATION_ID=
+DATASET_ID=
+API_KEY=
+"
+}
+
+# Check environment variables. `print` is used instead of `echo` because a
+# value returned by `echo` in the middle of a block is discarded, and `?`
+# makes the lookup yield null instead of erroring when a variable is unset.
+if ($env.ORGANIZATION_ID? | is-empty) {
+print "ORGANIZATION_ID is not found"
+all_envs
+exit 1
+}
+if ($env.DATASET_ID? | is-empty) {
+print "DATASET_ID is not found"
+all_envs
+exit 1
+}
+if ($env.API_KEY? | is-empty) {
+print "API_KEY is not found"
+all_envs
+exit 1
+}
+
+let PAYLOAD = {
+  crawl_options: {
+    allow_external_links: null,
+    boost_titles: true,
+    exclude_paths: [],
+    exclude_tags: [
+      "nav", "img", "pre",
+      ".PageFeedback_feedbackContainer___tGjJ",
+      ".doc-sidenav", ".doc-toc",
+      "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md",
+      ".related-articles", "aside", "footer"
+    ],
+    include_paths: ["faqs", "docs"],
+    include_tags: [".doc-content", "main"],
+    interval: "daily",
+    limit: 10000,
+    site_url: "https://signoz.io/docs/",
+    scrape_options: null
+  },
+  dataset_id: $env.DATASET_ID
+}
+
+print $"Payload ($PAYLOAD | to json)"
+
+let headers = {
+  "TR-Organization": $env.ORGANIZATION_ID
+  "TR-Dataset": $env.DATASET_ID
+  "Authorization": $env.API_KEY
+}
+# NOTE(review): -e (--allow-errors) means an HTTP error status does not fail this script - confirm intended.
+http put --content-type application/json $"($TRIEVE_URL)/api/dataset" $PAYLOAD --headers $headers -e
+# -d ($PAYLOAD | to json)
+# --write-out "\n%{http_code}")
+#
+# let http_code = ($response | split row "\n" | last)
+# let response_body = ($response | split row "\n" | drop -1 | str join "\n")
+#
+# if $http_code == "200" {
+#   echo "Crawling finished Successfully"
+#   exit 0
+# } else {
+#   echo $"Error: Received HTTP status code ($http_code)"
+#   echo $"Response: ($response_body)"
+#   exit 1
+# }
diff --git a/.github/workflows/index.sh b/.github/workflows/index.sh
new file mode 100755
index 000000000..8c5d1017d
--- /dev/null
+++ b/.github/workflows/index.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Sends the docs-site crawl configuration to the Trieve dataset API.
+# Reads ORGANIZATION_ID, DATASET_ID and API_KEY from the environment.
+TRIEVE_URL="https://api.trieve.ai"
+
+# Print the list of settings this script needs.
+all_envs() {
+# Quoted heredoc: the text is printed verbatim, so the quotes and the
+# apostrophe inside it cannot terminate the string early.
+cat <<'EOF'
+
+These github repo secrets must be set:
+TRIEVE_ORGANIZATION_ID=""
+TRIEVE_DATASET_ID=""
+TRIEVE_API_KEY=""
+
+alternatively you can modify the actions env's as such
+ORGANIZATION_ID=""
+DATASET_ID=""
+API_KEY=""
+
+EOF
+}
+
+# Quote the expansions so the emptiness test is reliable for any value.
+[ -z "${ORGANIZATION_ID:-}" ] && echo "ORGANIZATION_ID is not found" && all_envs && exit 1
+[ -z "${DATASET_ID:-}" ] && echo "DATASET_ID is not found" && all_envs && exit 1
+[ -z "${API_KEY:-}" ] && echo "API_KEY is not found" && all_envs && exit 1
+
+PAYLOAD='{ "crawl_options": { "allow_external_links": null, "boost_titles": true, "exclude_paths": [], "exclude_tags": [ "nav", "img", "pre", ".PageFeedback_feedbackContainer___tGjJ", ".doc-sidenav", ".doc-toc", "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md", ".related-articles", "aside", "footer" ], "include_paths": [ "faqs", "docs" ], "include_tags": [ ".doc-content", "main" ], "interval": "daily", "limit": 10000, "site_url": "https://signoz.io/docs/", "scrape_options": null }, "dataset_id": "'${DATASET_ID}'" }'
+
+echo "Payload $PAYLOAD"
+
+# Make the API call using curl; -w appends the HTTP status code to the output.
+# NOTE(review): without --fail an HTTP error status does not fail this step - confirm intended.
+curl -X PUT "${TRIEVE_URL}/api/dataset" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: ${API_KEY}" \
+  -H "TR-Organization: ${ORGANIZATION_ID}" \
+  -H "TR-Dataset: ${DATASET_ID}" \
+  -d "${PAYLOAD}" \
+  -w "\n%{http_code}"
+#
+# # Extract the response code from the last line
+# http_code=$(echo "$response" | tail -n1)
+# # Extract the response body (everything except the last line)
+# response_body=$(echo "$response" | sed \$d)
+#
+# # Check if the response code is 200
+# if [ "$http_code" -eq 200 ]; then
+#   echo "Crawling started Successfully"
+#   exit 0
+# else
+#   echo "Error: Received HTTP status code $http_code"
+#   echo "Response: $response_body"
+#   echo "Full response $response"
+#   exit 1
+# fi