Skip to content

Commit

Permalink
major refactor still missing tests
Browse files Browse the repository at this point in the history
  • Loading branch information
saumier committed Nov 20, 2024
1 parent e68e39d commit 5d31ee5
Show file tree
Hide file tree
Showing 13 changed files with 254 additions and 235 deletions.
4 changes: 2 additions & 2 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
source "https://rubygems.org"

gem 'linkeddata'
gem 'linkeddata', '~> 3.3'
gem 'minitest'
gem 'nokogiri'
gem 'open-uri'
gem 'rake'
gem 'ferrum'
gem 'mocha'

gem 'jsonlint', '~> 0.4.0'

87 changes: 42 additions & 45 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
GEM
remote: https://rubygems.org/
specs:
addressable (2.8.6)
public_suffix (>= 2.0.2, < 6.0)
addressable (2.8.7)
public_suffix (>= 2.0.2, < 7.0)
bcp47_spec (0.2.1)
builder (3.2.4)
concurrent-ruby (1.2.3)
bigdecimal (3.1.8)
builder (3.3.0)
concurrent-ruby (1.3.4)
connection_pool (2.4.1)
date (3.3.4)
date (3.4.0)
ebnf (2.4.0)
htmlentities (~> 4.3)
rdf (~> 3.3)
Expand All @@ -27,16 +28,20 @@ GEM
concurrent-ruby (~> 1.0)
htmlentities (4.3.4)
json-canonicalization (1.0.0)
json-ld (3.3.1)
json-ld (3.3.2)
htmlentities (~> 4.3)
json-canonicalization (~> 1.0)
link_header (~> 0.0, >= 0.0.8)
multi_json (~> 1.15)
rack (>= 2.2, < 4)
rdf (~> 3.3)
json-ld-preloaded (3.3.0)
rexml (~> 3.2)
json-ld-preloaded (3.3.1)
json-ld (~> 3.3)
rdf (~> 3.3)
jsonlint (0.4.0)
oj (~> 3)
optimist (~> 3)
ld-patch (3.3.0)
ebnf (~> 2.4)
rdf (~> 3.3)
Expand Down Expand Up @@ -72,38 +77,34 @@ GEM
sparql (~> 3.3)
sparql-client (~> 3.3)
yaml-ld (~> 0.0)
logger (1.6.0)
logger (1.6.1)
matrix (0.4.2)
minitest (5.21.2)
minitest (5.25.1)
mocha (2.5.0)
ruby2_keywords (>= 0.0.5)
multi_json (1.15.0)
net-http-persistent (4.0.2)
net-http-persistent (4.0.4)
connection_pool (~> 2.2)
nokogiri (1.16.2-aarch64-linux)
racc (~> 1.4)
nokogiri (1.16.2-arm-linux)
racc (~> 1.4)
nokogiri (1.16.2-arm64-darwin)
racc (~> 1.4)
nokogiri (1.16.2-x86-linux)
racc (~> 1.4)
nokogiri (1.16.2-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.16.2-x86_64-linux)
nokogiri (1.16.7-x86_64-darwin)
racc (~> 1.4)
open-uri (0.4.1)
oj (3.16.7)
bigdecimal (>= 3.0)
ostruct (>= 0.2)
open-uri (0.5.0)
stringio
time
uri
psych (5.1.2)
optimist (3.2.0)
ostruct (0.6.1)
psych (5.2.0)
stringio
public_suffix (5.0.4)
racc (1.7.3)
rack (3.0.9)
rake (13.1.0)
rdf (3.3.1)
public_suffix (6.0.1)
racc (1.8.1)
rack (3.1.8)
rake (13.2.1)
rdf (3.3.2)
bcp47_spec (~> 0.2)
bigdecimal (~> 3.1, >= 3.1.5)
link_header (~> 0.0, >= 0.0.8)
rdf-aggregate-repo (3.3.0)
rdf (~> 3.3)
Expand Down Expand Up @@ -161,12 +162,12 @@ GEM
rdf-turtle (3.3.0)
ebnf (~> 2.4)
rdf (~> 3.3)
rdf-vocab (3.3.0)
rdf-vocab (3.3.2)
rdf (~> 3.3)
rdf-xsd (3.3.0)
rdf (~> 3.3)
rexml (~> 3.2)
rexml (3.2.6)
rexml (3.3.9)
ruby2_keywords (0.0.5)
scanf (1.0.0)
shacl (0.4.1)
Expand Down Expand Up @@ -195,18 +196,18 @@ GEM
sparql-client (3.3.0)
net-http-persistent (~> 4.0, >= 4.0.2)
rdf (~> 3.3)
stringio (3.1.0)
stringio (3.1.2)
sxp (1.3.0)
matrix (~> 0.4)
rdf (~> 3.3)
temple (0.10.3)
thor (1.3.0)
tilt (2.3.0)
time (0.3.0)
thor (1.3.2)
tilt (2.4.0)
time (0.4.1)
date
unicode-types (1.9.0)
uri (0.13.0)
webrick (1.8.1)
unicode-types (1.10.0)
uri (1.0.2)
webrick (1.9.0)
websocket-driver (0.7.6)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
Expand All @@ -217,21 +218,17 @@ GEM
rdf-xsd (~> 3.3)

PLATFORMS
aarch64-linux
arm-linux
arm64-darwin
x86-linux
x86_64-darwin
x86_64-linux
x86_64-darwin-23

DEPENDENCIES
ferrum
linkeddata
jsonlint (~> 0.4.0)
linkeddata (~> 3.3)
minitest
mocha
nokogiri
open-uri
rake

BUNDLED WITH
2.5.3
2.3.22
1 change: 1 addition & 0 deletions sparql/remove_objects.sparql
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ WHERE {
schema:WPHeader
schema:WPFooter
schema:BreadcrumbList
schema:ListItem
}
}
UNION
Expand Down
3 changes: 2 additions & 1 deletion src/lib/entity_fetcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_enti
puts "Fetching entity urls from #{url}..."
if fetch_entity_urls_headlessly == 'true'
puts "Entity url fetch mode - Headless"
main_doc = Nokogiri::HTML(HeadlessBrowser.fetch_entity_urls_headless(url, headers))
html = HeadlessBrowser.new(headers).fetch_entity_urls_headless(url)
main_doc = Nokogiri::HTML(html)
else
puts "Entity url fetch mode - Headful"
main_doc = Nokogiri::HTML(self.fetch_entity_urls_headful(url, headers))
Expand Down
34 changes: 34 additions & 0 deletions src/lib/graph_fetcher.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
require_relative 'headless_browser'
require_relative 'rdf_processor'
require_relative 'sparql_processor'

# Fetch the data at each entity url to build the graph, then run the SPARQL
# clean-up queries over the result.
class GraphFetcher
  # Parameters:
  # - entity_urls: an array of entity URLs (nil is treated as an empty list)
  # - base_url: forwarded to RDFProcessor.process_rdf; also used as the SPARQL
  #   base URL when there is no entity URL to derive one from (may be nil)
  # - headers: request headers; defaults to an artsdata-crawler User-Agent
  # - headless: when truthy, pages are loaded through HeadlessBrowser instead
  #   of RDFProcessor
  # Outputs: whatever SparqlProcessor#perform_sparql_transformations returns
  # for the fetched graph
  def self.load(entity_urls: [], base_url: nil, headers: nil, headless: false)
    entity_urls = Array(entity_urls)
    headers ||= { "User-Agent" => "artsdata-crawler" }

    # These stay as class-level instance variables, as before, so the last
    # call's inputs and result remain inspectable on the class.
    @entity_urls = entity_urls
    @base_url = base_url
    @headers = headers
    @graph = if headless
               HeadlessBrowser.new(headers).fetch_json_ld_objects(entity_urls)
             else
               RDFProcessor.process_rdf(entity_urls, base_url, headers)
             end

    sparql_paths = [
      "./sparql/remove_objects.sparql",
      "./sparql/fix_entity_type_capital.sparql",
      "./sparql/fix_date_timezone.sparql",
      "./sparql/fix_address_country_name.sparql",
      "./sparql/fix_malformed_urls.sparql",
      "./sparql/replace_blank_nodes.sparql",
    ]

    # The SPARQL base URL is the "scheme://host" prefix of the first entity
    # URL. With no entity URLs there is nothing to split, so fall back to the
    # caller's base_url instead of calling split on nil.
    first_url = entity_urls.first
    sparql_base_url = first_url ? first_url.split('/')[0..2].join('/') : base_url

    sparql_processor = SparqlProcessor.new(sparql_paths, sparql_base_url)
    @graph = sparql_processor.perform_sparql_transformations(@graph)
    @graph
  end
end
98 changes: 51 additions & 47 deletions src/lib/headless_browser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,72 +3,76 @@
require 'linkeddata'
require 'rbconfig'

module HeadlessBrowser
def self.fetch_json_ld_objects(entity_urls, base_url, headers, sparql_paths, browser: nil, graph: nil)
browser ||= create_browser(headers)
graph ||= RDF::Graph.new
add_url_sparql_file = File.read('./sparql/add_derived_from.sparql')

entity_urls.each do |entity_url|
process_entity_url(entity_url, browser, graph, add_url_sparql_file)
end
class HeadlessBrowser
# Creates the browser used by this instance and reads the SPARQL update
# template from ./sparql/add_derived_from.sparql (a relative path, so it is
# resolved against the process's working directory).
# Parameters:
# - headers: optional headers forwarded to create_browser
def initialize(headers = nil)
@browser = create_browser(headers)
@add_url_sparql_file = File.read('./sparql/add_derived_from.sparql')
end

SparqlProcessor.perform_sparql_transformations(graph, sparql_paths, base_url)
graph
# Main method to return html for a single url in headless mode
# Navigates the browser to the url, pauses for a fixed 15 seconds
# (presumably so client-side scripts can finish rendering - confirm),
# then reads the page body.
# Parameters:
# - url: the address passed to the browser's go_to
# Outputs: the value of @browser.body (the page html, not an RDF::Graph)
def fetch_entity_urls_headless(url)
@browser.go_to(url)
sleep 15
@browser.body
end

def self.create_browser(headers = nil)
browser_path = if running_on_macos?
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
else
"/usr/bin/google-chrome-stable"
end
browser = Ferrum::Browser.new(browser_path: browser_path, headless: true, pending_connection_errors: false, process_timeout: 60, xvfb: true, browser_options: { 'no-sandbox': nil })
browser.headers.set(headers) if headers
browser
# Builds a single RDF::Graph from every URL in the list.
# A fresh graph is stored in @graph, each URL is handed to
# process_entity_url in order, and the graph is returned.
# Parameters:
# - entity_urls: collection of entity URLs to visit
# Outputs: RDF::Graph
def fetch_json_ld_objects(entity_urls)
  @graph = RDF::Graph.new
  entity_urls.each { |url| process_entity_url(url) }
  @graph
end

def self.process_entity_url(entity_url, browser, graph, add_url_sparql_file = nil)
def process_entity_url(entity_url)
puts "Processing #{entity_url} in headless mode"
browser.go_to(entity_url)
sleep 15
browser.stop
json_ld_scripts = browser.css("script[type='application/ld+json']")
@browser.go_to(entity_url)
sleep 5
@browser.stop

# Process the HTML content and extract JSON-LD
json_ld_scripts = @browser.css("script[type='application/ld+json']") #TODO: Check if Nokogiri::HTML works better
entity_graph = RDF::Graph.new
options = {unique_bnodes: true}
json_ld_scripts.each do |script|
process_json_ld_script(script, entity_url, graph, add_url_sparql_file)
json_ld = string_to_json(script.text)
JSON::LD::API.toRdf(json_ld, **options) do |statement|
entity_graph << statement
end
end

# Add the derivedFrom triple to the graph
sparql_file_with_url = @add_url_sparql_file.gsub("subject_url", entity_url)
entity_graph.query(SPARQL.parse(sparql_file_with_url, update: true))

@graph << entity_graph
rescue StandardError => e
puts "Error processing #{entity_url} in headless mode: #{e.message}"
end

def self.process_json_ld_script(script, entity_url, graph, add_url_sparql_file = nil)
# Parse the JSON-LD string into a JSON object
json_ld = string_to_json(script.text)
# Convert the JSON-LD object to an RDF graph
loaded_graph = RDF::Graph.new << JSON::LD::API.toRdf(json_ld)
if add_url_sparql_file
sparql_file_with_url = add_url_sparql_file.gsub("subject_url", entity_url)
loaded_graph.query(SPARQL.parse(sparql_file_with_url, update: true))
end
graph << loaded_graph
rescue JSON::ParserError => e
puts "Error parsing JSON-LD: #{e.message}"
end

def self.string_to_json(crawled_str)
# Parses crawled JSON text into Ruby data.
# Linefeeds are removed from a copy of the text before parsing, so the
# caller's string is never modified and frozen strings are accepted.
# Parameters:
# - crawled_str: String holding the raw JSON text
# Outputs: the parsed JSON value (typically a Hash or an Array)
# Raises: JSON::ParserError when the text is not valid JSON
def string_to_json(crawled_str)
  # String#delete returns a new string; the previous gsub! edited the
  # argument in place and raised FrozenError on frozen input.
  JSON.parse(crawled_str.delete("\n"))
end

def self.fetch_entity_urls_headless(url, headers, browser: nil)
browser ||= create_browser(headers)
browser.go_to(url)
sleep 15
browser.body
private

# Starts a headless Chrome through Ferrum and returns the browser object.
# The Chrome binary path depends on running_on_macos?; when headers are
# given they are set on the browser before it is returned.
# Parameters:
# - headers: optional headers passed to browser.headers.set
# Outputs: Ferrum::Browser
def create_browser(headers = nil)
  chrome_binary =
    if running_on_macos?
      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
    else
      "/usr/bin/google-chrome-stable"
    end

  launch_options = {
    browser_path: chrome_binary,
    headless: true,
    pending_connection_errors: false,
    process_timeout: 60,
    xvfb: true,
    browser_options: { 'no-sandbox': nil }
  }

  Ferrum::Browser.new(**launch_options).tap do |instance|
    instance.headers.set(headers) if headers
  end
end

def self.running_on_macos?
# Reports whether the host OS recorded in RbConfig looks like macOS.
# Outputs: true when host_os contains "darwin" or "mac os", false otherwise
def running_on_macos?
  # match? gives a real boolean (=~ returned a match index or nil) and skips
  # building match data that is never used.
  RbConfig::CONFIG['host_os'].match?(/darwin|mac os/)
end
end
Loading

0 comments on commit 5d31ee5

Please sign in to comment.