Skip to content

Commit

Permalink
Merge pull request #4 from culturecreates/enhancement/issue-81
Browse files Browse the repository at this point in the history
Added support to fetch URLs headlessly
  • Loading branch information
dev-aravind authored Nov 7, 2024
2 parents 05bf438 + 4cd35f6 commit bade0b3
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 32 deletions.
6 changes: 5 additions & 1 deletion action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ inputs:
description: 'URL to send back the data validation report asynchronously using POST "Content-Type: application/json"'
shacl:
description: 'URL to the SHACL file'
fetch-urls-headlessly:
description: 'Set as true to fetch the entity URLs headlessly'

runs:
using: 'composite'
Expand Down Expand Up @@ -95,14 +97,16 @@ runs:
run: |
isPaginated=${{ inputs.is-paginated || 'false' }}
headless=${{ inputs.headless || 'false' }}
fetchUrlsHeadlessly=${{ inputs.fetch-urls-headlessly || 'false' }}
docker pull ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:main
docker run --shm-size=1g -v $(pwd)/output:/usr/src/app/output ghcr.io/culturecreates/artsdata-pipeline-action/artsdata-rdf-fetcher:main \
"${{ inputs.page-url }}" \
"${{ inputs.entity-identifier }}" \
"output/${{ inputs.downloadFile }}" \
"$isPaginated" \
"$headless"
"$headless" \
"$fetchUrlsHeadlessly"
shell: bash

Expand Down
42 changes: 22 additions & 20 deletions src/lib/entity_fetcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
require 'open-uri'

module EntityFetcher
def self.fetch_entity_urls(page_url, entity_identifier, is_paginated)
def self.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_entity_urls_headlessly, headers)
base_url = page_url.split('/')[0..2].join('/')
entity_urls = []

Expand All @@ -13,28 +13,15 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated)
else
page_number = is_paginated.to_i
end

max_retries, retry_count = 3, 0

loop do
url = "#{page_url}#{page_number}"
puts "Fetching entity urls from #{url}..."
begin
linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
headers = {"User-Agent" => "artsdata-crawler/#{linkeddata_version}"}
main_page_html_text = URI.open(url, headers).read
rescue StandardError => e
retry_count += 1
if retry_count < max_retries
retry
else
puts "Max retries reached. Unable to fetch the content for page #{page_number}."
puts e.message
break
end
if fetch_entity_urls_headlessly == 'true'
main_doc = Nokogiri::HTML(HeadlessBrowser.fetch_entity_urls_headless(url, headers))
else
main_doc = Nokogiri::HTML(self.fetch_entity_urls_headful(url, headers))
end

main_doc = Nokogiri::HTML(main_page_html_text)
entities_data = main_doc.css(entity_identifier)
number_of_entities = entity_urls.length
entities_data.each do |entity|
Expand All @@ -45,9 +32,24 @@ def self.fetch_entity_urls(page_url, entity_identifier, is_paginated)
break if entity_urls.length == number_of_entities || page_number.nil?

page_number += 1
retry_count = 0
end

entity_urls.uniq
end

# Downloads the raw HTML of +url+ over plain HTTP (no browser) via open-uri.
#
# The request is attempted up to three times in total. When every attempt
# fails, the error is reported on stdout and +nil+ is returned rather than
# raising, so the caller can decide how to proceed with an empty page.
#
# @param url [String] fully qualified URL of the listing page to download
# @param headers [Hash] request headers handed to open-uri (e.g. "User-Agent")
# @return [String, nil] the response body, or nil when all attempts failed
def self.fetch_entity_urls_headful(url, headers)
  max_retries = 3
  retry_count = 0
  begin
    # Block form makes open-uri close the underlying IO/tempfile once read.
    URI.open(url, headers, &:read)
  rescue StandardError => e
    retry_count += 1
    retry if retry_count < max_retries

    # Report the URL: the caller's page counter is not in scope here, and
    # referencing it used to raise NameError instead of logging the failure.
    puts "Max retries reached. Unable to fetch the content for #{url}."
    puts e.message
    nil
  end
end
end
14 changes: 11 additions & 3 deletions src/lib/headless_browser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
require 'linkeddata'

module HeadlessBrowser
def self.fetch_json_ld_objects(entity_urls, base_url)
def self.fetch_json_ld_objects(entity_urls, base_url, headers)
puts "Loading browser..."
browser = Ferrum::Browser.new(browser_path: "/usr/bin/google-chrome-stable", headless: true, pending_connection_errors: false, process_timeout: 60, xvfb: true, browser_options: { 'no-sandbox': nil })
linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
browser.headers.set({"User-Agent" => "artsdata-crawler/#{linkeddata_version}"})
browser.headers.set(headers)
graph = RDF::Graph.new
add_url_sparql_file = File.read('./sparql/add_derived_from.sparql')
entity_urls.each do |entity_url|
Expand Down Expand Up @@ -42,4 +41,13 @@ def self.fetch_json_ld_objects(entity_urls, base_url)
SparqlProcessor.perform_sparql_transformations(graph, sparql_paths, base_url)
graph
end

# Loads +url+ in a headless Chrome session driven by Ferrum and returns the
# page's HTML after a fixed wait.
#
# A new browser is launched on every call and is always shut down before
# returning (even when navigation raises), so repeated calls do not leave
# Chrome processes behind.
#
# @param url [String] URL of the page to render
# @param headers [Hash] HTTP headers applied to the browser session (e.g. "User-Agent")
# @return [String] the HTML reported by the browser for the loaded page
def self.fetch_entity_urls_headless(url, headers)
  puts "Loading browser..."
  browser = Ferrum::Browser.new(browser_path: "/usr/bin/google-chrome-stable", headless: true, pending_connection_errors: false, process_timeout: 60, xvfb: true, browser_options: { 'no-sandbox': nil })
  begin
    browser.headers.set(headers)
    browser.go_to(url)
    # Fixed pause before reading the DOM — presumably to let client-side
    # scripts finish rendering the listing; TODO confirm whether a network-idle
    # wait could replace it.
    sleep 15
    browser.body
  ensure
    # Without this, every call leaks a running Chrome process.
    browser.quit
  end
end
end
5 changes: 2 additions & 3 deletions src/lib/rdf_processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@
require 'sparql'
require_relative 'sparql_processor'
module RDFProcessor
def self.process_rdf(entity_urls, base_url)
def self.process_rdf(entity_urls, base_url, headers)
graph = RDF::Graph.new
add_url_sparql_file = File.read('./sparql/add_derived_from.sparql')

entity_urls.each do |entity_url|
begin
puts "Processing #{entity_url} in non-headless mode"
entity_url = entity_url.gsub(' ', '+')
linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
options = { headers: { 'User-Agent' => "artsdata-crawler/#{linkeddata_version}" } }
options = { headers: headers }
loaded_graph = RDF::Graph.load(entity_url, **options)
sparql_file_with_url = add_url_sparql_file.gsub("subject_url", entity_url)
loaded_graph.query(SPARQL.parse(sparql_file_with_url, update: true))
Expand Down
13 changes: 8 additions & 5 deletions src/main.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,25 @@
require_relative 'lib/headless_browser'

if ARGV.length < 4
puts "Usage: ruby script_name.rb <page_url> <entity_identifier> <file_name> <is_paginated> <headless>"
puts "Usage: ruby script_name.rb <page_url> <entity_identifier> <file_name> <is_paginated> <headless> <fetch_urls_headlessly>"
exit
end

page_url, entity_identifier, file_name, is_paginated, headless = ARGV[0..4]
page_url, entity_identifier, file_name, is_paginated, headless, fetch_urls_headlessly = ARGV[0..5]

entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated)
linkeddata_version = Gem::Specification.find_by_name('linkeddata').version.to_s
headers = {"User-Agent" => "artsdata-crawler/#{linkeddata_version}"}

entity_urls = EntityFetcher.fetch_entity_urls(page_url, entity_identifier, is_paginated, fetch_urls_headlessly, headers)
base_url = page_url.split('/')[0..2].join('/')

if headless == 'true'
graph = HeadlessBrowser.fetch_json_ld_objects(entity_urls, base_url)
graph = HeadlessBrowser.fetch_json_ld_objects(entity_urls, base_url, headers)
File.open(file_name, 'w') do |file|
file.puts(graph.dump(:jsonld))
end
else
graph = RDFProcessor.process_rdf(entity_urls, base_url)
graph = RDFProcessor.process_rdf(entity_urls, base_url, headers)
File.open(file_name, 'w') do |file|
file.puts(graph.dump(:jsonld))
end
Expand Down

0 comments on commit bade0b3

Please sign in to comment.