#!/bin/bash
# this script updates the article-json stored in lax by performing
# an INGEST of content only; *no* PUBLISH events are sent whatsoever.
# talking to lax via the adaptor.py script for many thousands of articles is
# extremely slow, so this script bypasses all that: it bulk-generates articles,
# bulk-validates them, then tells lax to do a bulk ingest.
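#
# usage (a sketch; the optional argument names an existing run directory to reuse):
#   ./backfill.sh
#   ./backfill.sh run-2017-01-31-23-59-59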
set -euo pipefail # strict mode
#set -xv # debugging
# housekeeping
trap ctrl_c INT
function ctrl_c() {
    echo "caught ctrl-c"
    exit 1
}

mustexist() {
    if [ ! -e "$1" ]; then errcho "$1 does not exist. quitting."; exit 1; fi
}

errcho() { >&2 echo "$@"; }
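# usage, e.g.:
#   mustexist "/srv/lax"   # exits with a message on stderr if the path is missing
#   errcho "oops"          # echoes to stderr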
#
#
#
prjdir=$(pwd) # bot-lax project, where this script lives
tmpdir=/tmp # where we do our work
if [ -e /ext/tmp ]; then
    # an external store has been mounted. do our work there.
    tmpdir=/ext/tmp
fi
mustexist "$tmpdir"
# where articles will be linked to/downloaded for backfill
runpath="run-$(date +'%Y-%m-%d-%H-%M-%S')" # ll: run-2017-01-31-23-59-59
if [ $# -gt 0 ]; then
    # an arg was provided to backfill.sh, use it as the run dir name
    runpath=$1
    # remove it for downstream scripts (download-api-raml.sh)
    shift
fi
# ll: /tmp/run-2017-01-31-23-59-59
# or: /ext/tmp/run-2017-01-31-23-59-59
runpath="$tmpdir/$runpath"
# where to find lax
laxdir="/srv/lax"
mustexist "$laxdir"
# where to find xml on the fs
xmlrepodir="$prjdir/article-xml/articles"
# where to download unpublished xml
unpubxmldir="$tmpdir/unpub-article-xml"
# where generated article-json will be stored
ajsondir="$runpath/ajson"
# where the results of validation will be stored
validdir="$ajsondir/valid"
invaliddir="$ajsondir/invalid"
# confirm
echo "backfill.sh
this script will:
1. pull latest article-xml from elifesciences/elife-article-xml (large repo)
2. download any missing/unpublished articles after consulting Lax (needs /srv/lax, s3 auth)
3. create a 'backfill-run' directory with symbolic links to the xml to be processed
4. generate article-json from ALL xml in the ./article-xml/articles/ directory (long process)
5. validate all generated article-json, failing if any are invalid
6. force an INGEST into Lax for all valid articles (needs /srv/lax)"
read -p "any key to continue (ctrl-c to quit) "
# begin
# create our dirs
mkdir -p "$unpubxmldir" "$runpath" "$ajsondir"
# because we can choose an existing directory for the run
# ensure the results of any previous run are empty
rm -rf "$validdir" "$invaliddir"
mkdir "$validdir" "$invaliddir"
# wrangle our xml
# we ingest from the latest on the master branch
(
    . download-elife-xml.sh
    cd "$xmlrepodir"
    git reset --hard
    git checkout master
    git pull
)
# activate venv
# virtualenv script has unset vars we can't control
set +o nounset; . install.sh; set -o nounset;
# create a list of articles to backfill in this run
(
    # switch to the run dir (/tmp/run-something)
    cd "$runpath"
    # iterate over the response from lax about the articles it knows about
    # (should be *all* articles, lax is now the publishing authority)
    # https://www.cyberciti.biz/faq/unix-linux-bash-read-comma-separated-cvsfile/
    OLDIFS=$IFS
    IFS=,
    errcho "fetching articles from lax"
    "$laxdir/manage.sh" --skip-install report all-article-versions-as-csv | while read msid version remotepath
    do
        # ll: elife-00003-v1.xml
        fname="elife-$msid-v$version.xml"
        # ll: /home/user/bot-lax/article-xml/articles/elife-00003-v1.xml
        xmlpath="$xmlrepodir/$fname"
        # ll: /tmp/unpub-article-xml/elife-00003-v1.xml
        xmlunpubpath="$unpubxmldir/$fname"

        # we look in both places for the xml; if it's in neither, we download it
        if [ ! -f "$xmlpath" ] && [ ! -f "$xmlunpubpath" ]; then
            # lax doesn't know the remote location and it's not on the fs
            if [ "$remotepath" = "no-location-stored" ]; then
                errcho "$fname not found, skipping"
                continue
            fi

            # edge case: a previous backfill stored a path to the unpub dir for this article.
            # if it has made it this far, it's *still* not present in the xml repo and its
            # original remote path has been overwritten.
            if [ "$remotepath" = "$xmlunpubpath" ]; then
                errcho "$fname still not published since previous backfill, remote path unknown, skipping"
                continue
            fi

            # xml absent, download it.
            # download.py reuses code in the adaptor and makes authenticated requests to s3
            python "$prjdir/src/download.py" "$remotepath" "$xmlunpubpath"
        fi

        #errcho "linking $fname"
        # link it into the run dir
        if [ -f "$xmlpath" ]; then
            ln -sfT "$xmlpath" "$fname"
        else
            ln -sfT "$xmlunpubpath" "$fname"
        fi
    done
    IFS=$OLDIFS
)
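# at this point the run dir contains one symlink per article version lax knows
# about, ll (a sketch):
#   elife-00003-v1.xml -> /home/user/bot-lax/article-xml/articles/elife-00003-v1.xml
#   elife-00003-v2.xml -> /tmp/unpub-article-xml/elife-00003-v2.xml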
# generate article-json
# generated files are stored in $ajsondir
echo > "$prjdir/scrape.log"
time python "$prjdir/src/generate_article_json.py" "$runpath" "$ajsondir"

# validate all generated article-json
echo > "$prjdir/validate.log"
time python "$prjdir/src/validate_article_json.py" "$ajsondir"
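# after validation, $ajsondir ll (a sketch):
#   ajson/          - all generated article-json
#   ajson/valid/    - article-json that passed validation
#   ajson/invalid/  - article-json that failed validation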
# call the lax 'ingest' command with a directory of valid article json
LAX_MULTIPROCESSING=1 time "$laxdir/manage.sh" --skip-install ingest --ingest --force --dir "$validdir"
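# note: with the LAX_MULTIPROCESSING=1 assignment prefix, 'time' above is the
# external time binary, not the bash keyword (the keyword is only recognised
# at the start of a pipeline).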
# clean up
# rm -rf "$runpath/"