-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpicturepages.rb
80 lines (64 loc) · 3.16 KB
/
picturepages.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# === Author ===
# * Eric Larson
# * UW-Madison Libraries
# === License ===
# * No license
# === REQUIREMENTS ===
# * ruby
# * imagemagick
# - brew install imagemagick
# 1) Set up our directory structure
#    - british_flora
#    - british_flora/working
#    - british_flora/images
# FileUtils.mkdir_p creates any missing parent directory and does nothing for
# directories that already exist, so re-running the script no longer produces
# "File exists" noise from a shelled-out `mkdir`, and a genuine failure
# (e.g. permissions) raises instead of being silently ignored.
require 'fileutils'

FileUtils.mkdir_p(%w[british_flora/working british_flora/images])
# 2) Download a book from the Internet Archive
#    - Example (Color): The British flora medica, or, History of the medicinal
#      plants of Great Britain - (34 images)
#    - http://openlibrary.org/books/OL13997282M
#
# The page range is written in curl's "[first-last]" URL-globbing syntax, and
# "#1" in the -o argument is replaced by curl with the current value of that
# range, so one invocation writes one JPEG per page into british_flora/.
archive_item = 'britishfloramedi01bartuoft'
page_range = '[0000-0482]'
source_url = 'http://ia600309.us.archive.org/BookReader/BookReaderImages.php' \
             "?zip=/33/items/#{archive_item}/#{archive_item}_jp2.zip" \
             "&file=#{archive_item}_jp2/#{archive_item}_#{page_range}.jp2&scale=2&rotate=0"
`curl '#{source_url}' -o "british_flora/file_#1.jpg"`
# 3) Walk the book page images, detect images within images
require 'fileutils'

# Decides whether a page profile contains a long unbroken run of black rows.
#
# lines     - any object responding to #each that yields the lines of an
#             ImageMagick "TXT:" pixel listing; only the last
#             whitespace-separated token of each line (the colour name) is used.
# threshold - value the repeat counter must exceed (default 200). The counter
#             starts at 0 on the first row of a run, so with the default a run
#             of 202 consecutive "black" rows is required. This is the
#             original script's counting rule, kept unchanged.
#
# Returns true as soon as such a run is found, false otherwise. Blank lines
# are treated as an empty colour name instead of raising.
#
# NOTE(review): matching relies on the literal colour name "black" in the TXT
# output; other ImageMagick versions may print e.g. "gray(0)" - verify against
# the installed version.
def picture_page?(lines, threshold = 200)
  current_color = nil
  repeats = 0
  lines.each do |line|
    line_color = line.split(' ').last.to_s.strip
    if line_color == current_color
      repeats += 1
      return true if repeats > threshold && current_color == 'black'
    else
      current_color = line_color
      repeats = 0
    end
  end
  false
end

Dir.glob('british_flora/*.jpg').each do |file|
  filename = File.basename(file, '.jpg')
  profile_path = "british_flora/working/#{filename}.txt"

  # 1) Desaturate the image
  #    `convert #{file} -colorspace Gray british_flora/working/#{filename}G.jpg`
  # 2) Contrast x 8!
  #    `convert british_flora/working/#{filename}G.jpg -contrast -contrast -contrast -contrast -contrast -contrast -contrast -contrast british_flora/working/#{filename}C.jpg`
  # 3) Convert image to 1px x height
  #    `convert british_flora/working/#{filename}C.jpg -resize 1x1500! british_flora/working/#{filename}V.jpg`
  # 4) Sharpen the image
  #    `convert british_flora/working/#{filename}V.jpg -sharpen 0x5 british_flora/working/#{filename}S.jpg`
  # 5) Heavy-handed grayscale conversion
  #    `convert british_flora/working/#{filename}S.jpg -negate -threshold 0 -negate british_flora/working/#{filename}N.jpg`
  # 6) Color list
  #    `convert british_flora/working/#{filename}N.jpg TXT:british_flora/working/#{filename}.txt`
  # *) Much faster version of steps 1-6, calling convert just twice
  contrast = (['-contrast'] * 8).join(' ')
  `convert #{file} -colorspace Gray #{contrast} -resize 1X1500! -sharpen 0x5 miff:- | \
   convert - -negate -threshold 0 -negate TXT:#{profile_path}`

  # 7) More than 200 black pixels in a row is an IMAGE
  begin
    is_picture = File.open(profile_path, 'r') { |profile| picture_page?(profile.each_line) }
    if is_picture
      puts "IMAGE - #{filename}"
      FileUtils.cp(file, "british_flora/images/#{filename}.jpg")
    end
  rescue StandardError => e
    # Stay best-effort (one bad page must not abort the whole run), but report
    # the failure instead of swallowing it.
    warn "SKIPPED - #{filename}: #{e.class}: #{e.message}"
  end
end

# The summary runs once, after every page has been examined. It previously sat
# inside the per-page rescue clause and so was printed only when a page failed.
# Globbing for *.jpg yields 0 when the directory is missing and ignores
# '.', '..' and any stray non-image entries.
img_count = Dir.glob('british_flora/images/*.jpg').size
puts "\nComplete - Found #{img_count} images / Expected 34 images"