From a0a6de150ad55071b732bc8871f9e70f9e56c9b4 Mon Sep 17 00:00:00 2001 From: Robert Glaser Date: Wed, 9 Feb 2011 17:32:14 +0100 Subject: [PATCH] Parser should ignore anchor tags without an href attribute. --- lib/rawler/crawler.rb | 2 +- spec/lib/rawler/crawler_spec.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rawler/crawler.rb b/lib/rawler/crawler.rb index 155656b..bdb4c97 100644 --- a/lib/rawler/crawler.rb +++ b/lib/rawler/crawler.rb @@ -16,7 +16,7 @@ def links response = Rawler::Request.get(url) doc = Nokogiri::HTML(response.body) - doc.css('a').map { |a| a['href'] }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) } + doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) } rescue Errno::ECONNREFUSED write("Couldn't connect to #{url}") [] diff --git a/spec/lib/rawler/crawler_spec.rb b/spec/lib/rawler/crawler_spec.rb index 7668e4a..53d1069 100644 --- a/spec/lib/rawler/crawler_spec.rb +++ b/spec/lib/rawler/crawler_spec.rb @@ -85,7 +85,7 @@ let(:url) { 'http://example.com/path' } let(:crawler) { Rawler::Crawler.new(url) } let(:js_url) { "javascript:fn('nbjmup;jhfs.esf{fio/dpn');" } - let(:content) { "foo" } + let(:content) { "foo" } before(:each) do register(url, content)