Commit 911e235
0.0.2
oscardelben committed Jan 10, 2011
1 parent 5f285f0 commit 911e235
Showing 10 changed files with 88 additions and 10 deletions.
8 changes: 7 additions & 1 deletion History.txt
@@ -1,4 +1,10 @@
-=== 1.0.0 / 2011-01-10
+=== 0.0.2 / 2011-01-10
 
 * 1 major enhancement
 
+  * Handle relative urls
+
+=== 0.0.1 / 2011-01-10
+
+* 1 major enhancement
+
1 change: 0 additions & 1 deletion README.txt
@@ -16,7 +16,6 @@ gem install rawler
 
 == TODO
 
-* Handle relative urls!
 * Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
 * Export to html
 * Handle multiple urls at once
2 changes: 1 addition & 1 deletion lib/rawler.rb
@@ -6,7 +6,7 @@
 $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
 
 module Rawler
-  VERSION = '0.0.1'
+  VERSION = '0.0.2'
 
   autoload :Base, "rawler/base"
   autoload :Crawler, "rawler/crawler"
8 changes: 7 additions & 1 deletion lib/rawler/crawler.rb
@@ -12,11 +12,17 @@ def links
       content = Net::HTTP.get(URI.parse(url))
 
       doc = Nokogiri::HTML(content)
-      doc.css('a').map { |a| a['href'] }
+      doc.css('a').map { |a| absolute_url(a['href']) }
     rescue Errno::ECONNREFUSED
       $output.puts "Couldn't connect to #{url}"
       []
     end
 
+    private
+
+    def absolute_url(path)
+      URI.parse(url).merge(path.to_s).to_s
+    end
+
   end
 
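Note on the change above (not part of the commit): the new absolute_url helper leans on Ruby's standard URI#merge, which performs RFC 3986 reference resolution. A quick sketch of the behavior rawler now relies on; the example URLs are illustrative only:

  require 'uri'

  base = URI.parse('http://example.com/path/page.html')

  base.merge('/foo').to_s                     # => "http://example.com/foo" (root-relative)
  base.merge('other.html').to_s               # => "http://example.com/path/other.html" (document-relative)
  base.merge('http://external.com/bar').to_s  # => "http://external.com/bar" (absolute links pass through)
  base.merge('').to_s                         # => "http://example.com/path/page.html" (empty reference resolves to the base)

Because absolute_url calls path.to_s before merging, an anchor with no href attribute falls into the last case and resolves to the page's own URL instead of raising.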
8 changes: 7 additions & 1 deletion pkg/rawler-0.0.1/History.txt
@@ -1,4 +1,10 @@
-=== 1.0.0 / 2011-01-10
+=== 0.0.2 / 2011-01-10
 
 * 1 major enhancement
 
+  * Handle relative urls
+
+=== 0.0.1 / 2011-01-10
+
+* 1 major enhancement
+
1 change: 0 additions & 1 deletion pkg/rawler-0.0.1/README.txt
@@ -16,7 +16,6 @@ gem install rawler
 
 == TODO
 
-* Handle relative urls!
 * Handle https (now returns 400). See http://stackoverflow.com/questions/1719809/ruby-on-rails-https-post-bad-request
 * Export to html
 * Handle multiple urls at once
2 changes: 1 addition & 1 deletion pkg/rawler-0.0.1/lib/rawler.rb
@@ -6,7 +6,7 @@
 $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
 
 module Rawler
-  VERSION = '0.0.1'
+  VERSION = '0.0.2'
 
   autoload :Base, "rawler/base"
   autoload :Crawler, "rawler/crawler"
8 changes: 7 additions & 1 deletion pkg/rawler-0.0.1/lib/rawler/crawler.rb
@@ -12,11 +12,17 @@ def links
       content = Net::HTTP.get(URI.parse(url))
 
       doc = Nokogiri::HTML(content)
-      doc.css('a').map { |a| a['href'] }
+      doc.css('a').map { |a| absolute_url(a['href']) }
     rescue Errno::ECONNREFUSED
       $output.puts "Couldn't connect to #{url}"
       []
     end
 
+    private
+
+    def absolute_url(path)
+      URI.parse(url).merge(path.to_s).to_s
+    end
+
   end
 
30 changes: 29 additions & 1 deletion pkg/rawler-0.0.1/spec/unit/crawler_spec.rb
@@ -3,12 +3,40 @@
 describe Rawler::Crawler do
 
   it "should parse all links" do
-    url = 'http://example.com'
+    url = 'http://example.com/'
     register(url, site)
 
     Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
   end
 
+  it "should return an empty array when raising Errno::ECONNREFUSED" do
+    url = 'http://example.com'
+    register(url, site)
+
+    Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+
+    crawler = Rawler::Crawler.new(url).links.should == []
+  end
+
+  it "should parse relative links" do
+    url = 'http://example.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+  end
+
+  # it "should print a message when raising Errno::ECONNREFUSED" do
+  #   pending "refactor output. Don't use a global variable"
+  #   url = 'http://example.com'
+  #   register(url, site)
+  #
+  #   Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+  #
+  #   $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
+  #
+  #   Rawler::Crawler.new(url).links
+  # end
+
   private
 
   def site
30 changes: 29 additions & 1 deletion spec/unit/crawler_spec.rb
@@ -3,12 +3,40 @@
 describe Rawler::Crawler do
 
   it "should parse all links" do
-    url = 'http://example.com'
+    url = 'http://example.com/'
     register(url, site)
 
     Rawler::Crawler.new(url).links.should == ['http://example.com/foo', 'http://external.com/bar']
   end
 
+  it "should return an empty array when raising Errno::ECONNREFUSED" do
+    url = 'http://example.com'
+    register(url, site)
+
+    Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+
+    crawler = Rawler::Crawler.new(url).links.should == []
+  end
+
+  it "should parse relative links" do
+    url = 'http://example.com/path'
+    register(url, '<a href="/foo">foo</a>')
+
+    Rawler::Crawler.new(url).links.should == ['http://example.com/foo']
+  end
+
+  # it "should print a message when raising Errno::ECONNREFUSED" do
+  #   pending "refactor output. Don't use a global variable"
+  #   url = 'http://example.com'
+  #   register(url, site)
+  #
+  #   Net::HTTP.should_receive(:get).and_raise Errno::ECONNREFUSED
+  #
+  #   $stdout.should_receive(:puts).with("Couldn't connect to #{url}")
+  #
+  #   Rawler::Crawler.new(url).links
+  # end
+
   private
 
   def site
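A possible follow-up spec, not in this commit, purely a sketch in the same style (it assumes the suite's register helper stubs the body served for a URL, as the existing specs suggest): because absolute_url calls path.to_s, an anchor tag without an href resolves to the page's own URL rather than raising.

  it "should not raise on anchors without an href" do
    url = 'http://example.com/path'
    register(url, '<a name="top">no href here</a>')

    # nil href -> "" -> URI#merge returns the base url itself
    Rawler::Crawler.new(url).links.should == ['http://example.com/path']
  end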
