Skip to content

Commit

Permalink
etc: add heal_dataset script
Browse files Browse the repository at this point in the history
  • Loading branch information
palkan committed Jan 3, 2025
1 parent f0d009e commit 9ce7c1f
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 7 deletions.
34 changes: 34 additions & 0 deletions etc/heal_dataset.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# A script to remove broken chunks from the dataset, i.e., chunks whose link
# contains the given text.
#
# Usage: heal_dataset.rb <link-substring>
#
# The argument is escaped before being compiled into a Regexp, so it is matched
# literally (as a substring of the chunk link), not as a regular expression.
#
# The client is built without arguments and therefore relies on the defaults of
# Uptriever::Client#initialize for the API key and the dataset.

require "set" # Set is only autoloaded on Ruby 3.2+
require "uptriever"
require "ruby-progressbar"

link_part = ARGV[0]

# Refuse to run without a non-empty argument: an empty pattern matches every
# link, which would delete every chunk in the dataset.
if link_part.nil? || link_part.empty?
  abort "Usage: #{$PROGRAM_NAME} <link-substring>"
end

client = Uptriever::Client.new

pattern = Regexp.new(Regexp.escape(link_part))

matching_chunks = Set.new

usage = client.usage

puts "Total chunks: #{usage["chunk_count"]}"

progressbar = ProgressBar.create(title: "Scroll chunks", total: usage["chunk_count"])

# First pass: only collect IDs, so nothing is deleted while still scrolling.
client.scroll_chunks do |chunk|
  progressbar.increment
  # match? returns false for a nil link and skips building MatchData.
  matching_chunks << chunk["id"] if pattern.match?(chunk["link"])
end

puts "Found #{matching_chunks.size} chunks to delete.\nDeleting...\n"
progressbar = ProgressBar.create(title: "Chunks", total: matching_chunks.size)

# Second pass: delete the collected chunks one by one.
matching_chunks.each do |chunk_id|
  client.delete_chunk(chunk_id)
  progressbar.increment
end
46 changes: 39 additions & 7 deletions lib/uptriever/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ class Client
BASE_URL = "https://api.trieve.ai/api"

attr_reader :headers
private attr_reader :dry_run
private attr_reader :dry_run, :dataset_id

def initialize(api_key, dataset, dry_run: false)
def initialize(api_key = ENV["TRIEVE_API_KEY"], dataset = ENV["TRIEVE_DATASET"], dry_run: false)
@dataset_id = dataset
@dry_run = dry_run
@headers = {
"Authorization" => api_key,
Expand All @@ -28,19 +29,50 @@ def push_chunk(chunk, upsert: true)
perform_request("/chunk", chunk.to_json)
end

# Iterates over all chunks via the "/chunks/scroll" endpoint, page by page.
#
# Each page is requested with the ID of the last chunk already yielded as
# `offset_chunk_id`; a chunk with that ID is dropped from the next page so it
# is not yielded twice. Iteration stops when a page has no new chunks.
#
# @param per_page [Integer] value sent as `page_size` with every request
# @yield [chunk] each chunk Hash from the response's "chunks" array
# @return [Enumerator, nil] an Enumerator when no block is given, nil otherwise
# @raise [KeyError] if a response has no "chunks" key
def scroll_chunks(per_page: 100)
  return enum_for(__method__, per_page: per_page) unless block_given?

  # The request payload is kept in its own variable: it must not be replaced
  # by the response, otherwise later requests lose `filters`/`page_size` and
  # send the previous page back to the server.
  payload = {
    filters: {must: nil},
    page_size: per_page
  }

  offset_id = nil

  loop do
    payload[:offset_chunk_id] = offset_id if offset_id
    response = perform_request("/chunks/scroll", payload.to_json)

    chunks = response.fetch("chunks")
    # Drop the offset chunk itself in case the page starts with it.
    chunks = chunks.reject { |chunk| chunk["id"] == offset_id } if offset_id

    break if chunks.empty?

    chunks.each { |chunk| yield chunk }

    offset_id = chunks.last["id"]
  end
end

# Deletes a single chunk.
#
# Sends a DELETE request to "/chunk/<id>" and expects a 204 response code.
#
# @param id [#to_s] identifier of the chunk, interpolated into the path
# @return whatever perform_request returns for the call
def delete_chunk(id)
  chunk_path = "/chunk/#{id}"
  perform_request(chunk_path, method: :delete, expected_code: 204)
end

# Requests usage information for the dataset this client was created for.
#
# Sends a GET request to "/dataset/usage/<dataset_id>".
#
# @return whatever perform_request returns for the call
def usage
  usage_path = "/dataset/usage/#{dataset_id}"
  perform_request(usage_path, method: :get)
end

private

def perform_request(path, data)
def perform_request(path, data = nil, method: :post, expected_code: 200)
uri = URI.parse(BASE_URL + path)

http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true if uri.scheme == "https"

request = Net::HTTP::Post.new(
request = Net::HTTP.const_get(method.to_s.capitalize).new(
uri.request_uri,
headers.merge("Content-Type" => "application/json")
)
request.body = data
request.body = data if data

if dry_run
puts "[DRY RUN] Perform POST #{path}: #{data}"
Expand All @@ -49,11 +81,11 @@ def perform_request(path, data)

response = http.request(request)

if response.code.to_i != 200
if response.code.to_i != expected_code
raise "Invalid response code: #{response.code} (#{response.body[100...]})"
end

JSON.parse(response.body)
JSON.parse(response.body) if response.body
end
end
end

0 comments on commit 9ce7c1f

Please sign in to comment.