From c5e64cb128ca3ef9a154b947d6de9852b1469716 Mon Sep 17 00:00:00 2001 From: dev Date: Fri, 8 Nov 2024 14:24:13 +0530 Subject: [PATCH] Updated SPARQL transformations --- src/lib/headless_browser.rb | 9 +-------- src/lib/rdf_processor.rb | 10 +--------- src/main.rb | 12 ++++++++++-- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/lib/headless_browser.rb b/src/lib/headless_browser.rb index b9fa4fa..c2f29c8 100644 --- a/src/lib/headless_browser.rb +++ b/src/lib/headless_browser.rb @@ -3,7 +3,7 @@ require 'linkeddata' module HeadlessBrowser - def self.fetch_json_ld_objects(entity_urls, base_url, headers) + def self.fetch_json_ld_objects(entity_urls, base_url, headers, sparql_paths) puts "Loading browser..." browser = Ferrum::Browser.new(browser_path: "/usr/bin/google-chrome-stable", headless: true, pending_connection_errors: false, process_timeout: 60, xvfb: true, browser_options: { 'no-sandbox': nil }) browser.headers.set(headers) @@ -30,13 +30,6 @@ def self.fetch_json_ld_objects(entity_urls, base_url, headers) puts "Error processing #{entity_url} in headless mode: #{e.message}" end end - sparql_paths = [ - "./sparql/replace_blank_nodes.sparql", - "./sparql/fix_entity_type_capital.sparql", - "./sparql/fix_date_timezone.sparql", - "./sparql/fix_address_country_name.sparql", - "./sparql/remove_objects.sparql" - ] SparqlProcessor.perform_sparql_transformations(graph, sparql_paths, base_url) graph diff --git a/src/lib/rdf_processor.rb b/src/lib/rdf_processor.rb index 930ca64..1df8975 100644 --- a/src/lib/rdf_processor.rb +++ b/src/lib/rdf_processor.rb @@ -2,7 +2,7 @@ require 'sparql' require_relative 'sparql_processor' module RDFProcessor - def self.process_rdf(entity_urls, base_url, headers) + def self.process_rdf(entity_urls, base_url, headers, sparql_paths) graph = RDF::Graph.new add_url_sparql_file = File.read('./sparql/add_derived_from.sparql') @@ -20,14 +20,6 @@ def self.process_rdf(entity_urls, base_url, headers) end end - sparql_paths = [ - "./sparql/replace_blank_nodes.sparql", - "./sparql/fix_entity_type_capital.sparql", - "./sparql/fix_date_timezone.sparql", - "./sparql/fix_address_country_name.sparql", - "./sparql/remove_objects.sparql" - ] - SparqlProcessor.perform_sparql_transformations(graph, sparql_paths, base_url) end diff --git a/src/main.rb b/src/main.rb index 73a6e40..fa996b6 100644 --- a/src/main.rb +++ b/src/main.rb @@ -15,13 +15,21 @@ entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_urls_headlessly, headers) base_url = page_url.split('/')[0..2].join('/') +sparql_paths = [ + "./sparql/remove_objects.sparql", + "./sparql/replace_blank_nodes.sparql", + "./sparql/fix_entity_type_capital.sparql", + "./sparql/fix_date_timezone.sparql", + "./sparql/fix_address_country_name.sparql" +] + if headless == 'true' - graph = HeadlessBrowser.fetch_json_ld_objects(entity_urls, base_url, headers) + graph = HeadlessBrowser.fetch_json_ld_objects(entity_urls, base_url, headers, sparql_paths) File.open(file_name, 'w') do |file| file.puts(graph.dump(:jsonld)) end else - graph = RDFProcessor.process_rdf(entity_urls, base_url, headers) + graph = RDFProcessor.process_rdf(entity_urls, base_url, headers, sparql_paths) File.open(file_name, 'w') do |file| file.puts(graph.dump(:jsonld)) end