forked from SigNoz/signoz-web
-
Notifications
You must be signed in to change notification settings - Fork 0
162 lines (141 loc) · 5.99 KB
/
index-trieve-search-index.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Workflow that asks Trieve to (re)crawl https://signoz.io/docs/ for the search index.
# Runs on every push and can also be started manually (workflow_dispatch).
name: "Index Trieve Search Component"
on:
  push:
  workflow_dispatch:
jobs:
  create-search-index:
    runs-on: ubuntu-24.04
    steps:
      - name: "Ingest Trieve Search Index"
        shell: bash
        env:
          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
        run: |
          # Base URL of the Trieve API.
          TRIEVE_URL="https://api.trieve.ai"

          # Print the repository secrets / env vars this step requires.
          all_envs() {
            echo "
            These github repo secrets must to be set:
            TRIEVE_ORGANIZATION_ID=<your-trieve-organization-id>
            TRIEVE_DATASET_ID=<your-trieve-dataset-id>
            TRIEVE_API_KEY=<your-trieve-api-key>
            alternatively you can modify the actions env's as such
            ORGANIZATION_ID=<your-trieve-organization-id>
            DATASET_ID=<your-trieve-dataset-id>
            API_KEY=<your-trieve-api-key>
            "
          }

          # Fail fast when any required variable is empty. ${!var} is bash
          # indirect expansion; the quotes keep the test well-formed even if a
          # value contains whitespace or glob characters.
          for var in ORGANIZATION_ID DATASET_ID API_KEY; do
            if [ -z "${!var}" ]; then
              echo "${var} is not found"
              all_envs
              exit 1
            fi
          done

          # Request body: crawl options plus the target dataset id.
          # The here-doc terminator must stay at the script's left margin.
          PAYLOAD=$(cat <<EOF
          {
            "crawl_options": {
              "allow_external_links": null,
              "boost_titles": true,
              "exclude_paths": [],
              "exclude_tags": [
                "nav",
                "img",
                "pre",
                ".PageFeedback_feedbackContainer___tGjJ",
                ".doc-sidenav",
                ".doc-toc",
                "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md",
                ".related-articles",
                "aside",
                "footer"
              ],
              "include_paths": ["faqs", "docs"],
              "include_tags": [".doc-content", "main"],
              "interval": "daily",
              "limit": 10000,
              "site_url": "https://signoz.io/docs/",
              "scrape_options": null
            },
            "dataset_id": "${DATASET_ID}"
          }
          EOF
          )
          echo "Payload $PAYLOAD"

          # PUT the payload; -w appends the HTTP status code on its own last line.
          # The explicit failure branch reports transport errors instead of letting
          # the step abort silently on curl's non-zero exit status.
          if ! response=$(curl -s -X PUT "${TRIEVE_URL}/api/dataset" \
            -H "Content-Type: application/json" \
            -H "Authorization: ${API_KEY}" \
            -H "TR-Organization: ${ORGANIZATION_ID}" \
            -H "TR-Dataset: ${DATASET_ID}" \
            -d "${PAYLOAD}" \
            -w "\n%{http_code}"); then
            echo "Error: curl could not complete the request to ${TRIEVE_URL}"
            exit 1
          fi

          # Last line is the status code; everything before it is the body.
          http_code=$(echo "$response" | tail -n1)
          response_body=$(echo "$response" | sed '$d')

          # Only HTTP 200 counts as success.
          if [ "$http_code" = "200" ]; then
            echo "Crawling finished Successfully"
            exit 0
          else
            echo "Error: Received HTTP status code $http_code"
            echo "Response: $response_body"
            exit 1
          fi
# Nushell implementation of the Trieve ingest: installs Nushell, then PUTs the crawl options.
  create-search-index-nushell:
    runs-on: ubuntu-24.04
    steps:
      - name: "Install Nushell"
        # NOTE(review): assumes a `nushell` package is available from the
        # runner's configured apt sources — TODO confirm.
        run: |
          sudo apt-get update
          sudo apt-get install -y nushell
      - name: "Ingest Trieve Search Index"
        # `nu` is not a built-in shell keyword, so it is given as a custom
        # shell command template; {0} is replaced with the script file path.
        shell: nu {0}
        env:
          ORGANIZATION_ID: ${{ secrets.TRIEVE_ORGANIZATION_ID }}
          DATASET_ID: ${{ secrets.TRIEVE_DATASET_ID }}
          API_KEY: ${{ secrets.TRIEVE_API_KEY }}
        run: |
          # Base URL of the Trieve API.
          let TRIEVE_URL = "https://api.trieve.ai"

          # Print the repository secrets / env vars this step requires.
          # `print` writes to stdout; `echo` would only return the string.
          def all_envs [] {
            print "
            These github repo secrets must to be set:
            TRIEVE_ORGANIZATION_ID=<your-trieve-organization-id>
            TRIEVE_DATASET_ID=<your-trieve-dataset-id>
            TRIEVE_API_KEY=<your-trieve-api-key>
            alternatively you can modify the actions env's as such
            ORGANIZATION_ID=<your-trieve-organization-id>
            DATASET_ID=<your-trieve-dataset-id>
            API_KEY=<your-trieve-api-key>
            "
          }

          # Check environment variables. The `?` makes the access optional so a
          # missing variable is treated like an empty one instead of raising.
          if ($env.ORGANIZATION_ID? | is-empty) {
            print "ORGANIZATION_ID is not found"
            all_envs
            exit 1
          }
          if ($env.DATASET_ID? | is-empty) {
            print "DATASET_ID is not found"
            all_envs
            exit 1
          }
          if ($env.API_KEY? | is-empty) {
            print "API_KEY is not found"
            all_envs
            exit 1
          }

          # Request body: crawl options plus the target dataset id.
          let PAYLOAD = {
            crawl_options: {
              allow_external_links: null,
              boost_titles: true,
              exclude_paths: [],
              exclude_tags: [
                "nav", "img", "pre",
                ".PageFeedback_feedbackContainer___tGjJ",
                ".doc-sidenav", ".doc-toc",
                "div.z-[10].flex.flex-col.justify-center.border-t.border-solid.border-gray-900.bg-signoz_ink-500.bg-opacity-70.backdrop-blur-md",
                ".related-articles", "aside", "footer"
              ],
              include_paths: ["faqs", "docs"],
              include_tags: [".doc-content", "main"],
              interval: "daily",
              limit: 10000,
              site_url: "https://signoz.io/docs/",
              scrape_options: null
            },
            dataset_id: $env.DATASET_ID
          }
          print $"Payload ($PAYLOAD | to json)"

          # PUT the payload; --write-out appends the HTTP status code on its own last line.
          let response = (curl
            --silent
            -X PUT
            $"($TRIEVE_URL)/api/dataset"
            -H "Content-Type: application/json"
            -H $"Authorization: ($env.API_KEY)"
            -H $"TR-Organization: ($env.ORGANIZATION_ID)"
            -H $"TR-Dataset: ($env.DATASET_ID)"
            -d ($PAYLOAD | to json)
            --write-out "\n%{http_code}")

          # Last line is the status code; everything before it is the body.
          # `drop 1` removes the final row (the row count must not be negative).
          let response_lines = ($response | split row "\n")
          let http_code = ($response_lines | last)
          let response_body = ($response_lines | drop 1 | str join "\n")

          # Only HTTP 200 counts as success.
          if $http_code == "200" {
            print "Crawling finished Successfully"
            exit 0
          } else {
            print $"Error: Received HTTP status code ($http_code)"
            print $"Response: ($response_body)"
            exit 1
          }