-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetlc.go
141 lines (120 loc) · 3.04 KB
/
getlc.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// This is code to get all the MARC records LC has. It is not good example
// code for Go, it's just what I crammed together quickly to get the job done.
// Don't use this to learn Go.
package main
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"path"
"time"
)
// Endpoints for the Library of Congress "Chronicling America" API.
const (
	// Server is the base URL; all request paths below are appended to it.
	Server = "https://chroniclingamerica.loc.gov/"
	// TitlesPattern is the paginated title-search endpoint (%d = 1-based page number).
	TitlesPattern = "search/titles/results/?format=json&page=%d"
	// MARCPattern is the per-title MARC XML endpoint (%s = the title's LCCN).
	MARCPattern = "lccn/%s/marc.xml"
)
// Title is a single item from the title-search JSON results. Only the
// LCCN (Library of Congress Control Number) is decoded, since that is all
// that's needed to build the MARC record URL in getMARC.
type Title struct {
	LCCN string `json:"lccn"`
}
// SearchResult is one page of the title-search JSON response. EndIndex and
// TotalItems drive the pagination loop in main; Titles holds the page's items.
type SearchResult struct {
	EndIndex   int     `json:"endIndex"`
	TotalItems int     `json:"totalItems"`
	Titles     []Title `json:"items"`
}
func setLastPageRead(p int) {
var f, err = os.Create(".lastpage")
if err != nil {
log.Fatalf("Unable to write to .lastpage: %s", err)
}
defer f.Close()
err = binary.Write(f, binary.LittleEndian, int32(p))
if err != nil {
log.Fatalf("Unable to serialize to .lastpage: %s", err)
}
}
func getLastPageRead() int {
var f, err = os.Open(".lastpage")
if err != nil {
log.Printf("Unable to read .lastpage; defaulting to page 0 (%s)", err)
return 0
}
defer f.Close()
var p int32
err = binary.Read(f, binary.LittleEndian, &p)
if err != nil {
log.Printf("Unable to read bytes from .lastpage; defaulting to page 0 (%s)", err)
return 0
}
return int(p)
}
// getSearchPage fetches one page of the title-search results (p is 0-based
// here; the server's page parameter is 1-based, hence p+1) and decodes the
// JSON body. Every failure is fatal: the crawl resumes from the .lastpage
// checkpoint rather than silently skipping pages.
func getSearchPage(p int) SearchResult {
	var url = Server + fmt.Sprintf(TitlesPattern, p+1)
	log.Printf("GET %s", url)
	var response, err = http.Get(url)
	if err != nil {
		log.Fatalf("Error searching page %d: %s", p, err)
	}
	defer response.Body.Close()
	// A non-200 response carries an HTML error page, not JSON; fail with a
	// clear message instead of a confusing unmarshal error downstream.
	if response.StatusCode != http.StatusOK {
		log.Fatalf("Error searching page %d: unexpected status %s", p, response.Status)
	}
	// Checkpoint before processing so a crash re-fetches this same page.
	setLastPageRead(p)
	var buf = &bytes.Buffer{}
	_, err = buf.ReadFrom(response.Body)
	if err != nil {
		log.Fatalf("Error reading from chroniclingamerica.loc.gov: %s", err)
	}
	var r SearchResult
	// Best-effort debug dump of the raw response; a failure here shouldn't
	// stop the crawl, but it shouldn't be silent either.
	if werr := ioutil.WriteFile(".searchdebug", buf.Bytes(), 0644); werr != nil {
		log.Printf("WARN - unable to write .searchdebug: %s", werr)
	}
	err = json.Unmarshal(buf.Bytes(), &r)
	if err != nil {
		log.Fatalf("Error unmarshaling JSON: %s", err)
	}
	return r
}
// getMARC downloads the MARC XML record for one newspaper title (by LCCN)
// and stores it at marc/<lccn>/marc.xml. Failures are logged as warnings
// and skipped so one bad title doesn't abort the whole crawl.
func getMARC(lccn string) {
	log.Printf("INFO - LCCN: %#v", lccn)
	var response, err = http.Get(Server + fmt.Sprintf(MARCPattern, lccn))
	if err != nil {
		log.Printf("WARN - Couldn't fetch newspaper: %s", err)
		return
	}
	// The original never closed the body — a leak that also prevents the
	// HTTP transport from reusing the connection.
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		log.Printf("WARN - Unexpected status fetching MARC for %s: %s", lccn, response.Status)
		return
	}
	var xml = &bytes.Buffer{}
	_, err = xml.ReadFrom(response.Body)
	if err != nil {
		log.Printf("WARN - Error reading MARC XML: %s", err)
		return
	}
	var dir = path.Join("marc", lccn)
	err = os.MkdirAll(dir, 0755)
	if err != nil {
		log.Printf("WARN - Error creating directory: %s", err)
		return
	}
	var f *os.File
	f, err = os.Create(path.Join(dir, "marc.xml"))
	if err != nil {
		log.Printf("WARN - Error opening file for writing marc.xml: %s", err)
		return
	}
	// Check both the write and the close: either can fail, and the original
	// discarded both errors, leaving truncated files undetected.
	if _, err = f.Write(xml.Bytes()); err != nil {
		f.Close()
		log.Printf("WARN - Error writing marc.xml: %s", err)
		return
	}
	if err = f.Close(); err != nil {
		log.Printf("WARN - Error closing marc.xml: %s", err)
	}
}
// main walks every page of the title search, downloading the MARC record
// for each title, resuming from the last checkpointed page on restart.
func main() {
	// Start searching titles from last page pulled or page 1
	var p = getLastPageRead()
	var sr = getSearchPage(p)
	log.Printf("INFO - processing %d titles", sr.TotalItems)
	for {
		for _, title := range sr.Titles {
			// Let's not DOS Chronicling America
			time.Sleep(time.Millisecond * 500)
			getMARC(title.LCCN)
		}
		// BUG FIX: the original tested `TotalItems > EndIndex` *before*
		// processing, so the final page's titles (where EndIndex equals
		// TotalItems) were fetched but never passed to getMARC — and a
		// single-page result set was skipped entirely. Process first,
		// then decide whether another page remains.
		if sr.EndIndex >= sr.TotalItems {
			break
		}
		p++
		sr = getSearchPage(p)
	}
}