forked from pkiraly/qa-catalogue
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon-script
executable file
·674 lines (572 loc) · 25.4 KB
/
common-script
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
#!/usr/bin/env bash
# Strict mode: stop on unset variables (-u), on failing commands (-e) and on
# a failure in any stage of a pipeline (pipefail).
set -ueo pipefail

# PS4 is what bash prints in front of every traced command (set -x); here it
# carries a timestamp.
# colored <code> <text> prints <text> without a trailing newline; the colour
# escape sequences are only emitted when stdout is a terminal.
if [[ -t 1 ]]; then
  PS4='\033[0D\033[1;37m$(date +"%F %T")>\033[0m '
  colored() {
    echo -en "\033[${1}m${2}\033[0m"
  }
else
  PS4='$(date +"%F %T")> '
  colored() {
    echo -n "$2"
  }
fi
# Write a message to stdout, preceded by the current date and time.
#   $1 - the message text
log() {
  local stamp
  stamp="$(date +'%F %T')> "
  colored "1;37" "${stamp}"
  echo "$1"
}
# Define the 'untrace' command, which switches trace mode (set -x) off.
# All previously defined aliases are removed first, then alias expansion is
# switched on so that the alias can be used inside this script.
unalias -a
shopt -s expand_aliases
# Grouping 'set +x' and sending the group's stderr to /dev/null keeps the
# 'set +x' command itself out of the trace output.
alias untrace='{ set +x; } 2> /dev/null'
# Open a named processing step: print "[<name>]" followed by a newline, then
# switch trace mode on so that the following commands are echoed.
#   $1 - name of the step
run() {
  # tracing is switched off while the step header is printed
  untrace
  colored "1;1" "[${1}]"
  echo
  set -x
}
# ---- processing steps ----
# Run ./validate on the input files.
# The tool is asked for CSV output named issue-details.csv and
# issue-summary.csv in OUTPUT_DIR; its stderr goes to LOG_DIR/validate.log.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  GENERAL_PARAMS, OUTPUT_PARAMS, PARAMS
do_validate() {
  # the listed index-related option names are removed from TYPE_PARAMS
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
  OUTPUT_PARAMS="--outputDir ${OUTPUT_DIR} --detailsFileName issue-details.csv --summaryFileName issue-summary.csv"
  GENERAL_PARAMS="--details --trimId --summary --format csv --defaultRecordType BOOKS"

  run validate
  # the parameter variables are left unquoted on purpose: each holds several
  # words, and MASK is a file name pattern
  ./validate \
    ${GENERAL_PARAMS} \
    ${OUTPUT_PARAMS} \
    ${PARAMS} \
    ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/validate.log
}
# Call ./prepare-solr with NAME as its argument.
# The tool's stderr starts a fresh LOG_DIR/solr.log (the file is overwritten).
do_prepare_solr() {
  run prepare-solr
  ./prepare-solr ${NAME} \
    2> ${LOG_DIR}/solr.log
}
# Run ./index on the input files.
# Without --onlyIndex in TYPE_PARAMS the target core is "<NAME>_dev"; with it
# the option is removed from the parameters and the core is "<NAME>".
# The tool's stderr is appended to LOG_DIR/solr.log.
# Reads: TYPE_PARAMS, OUTPUT_DIR, NAME, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS, ONLY_INDEX, CORE
do_index() {
  run index
  untrace

  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+//g')
  # HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
  # if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
  #   PARAMS="${PARAMS} --solrForScoresUrl ${NAME}_validation"
  # fi
  PARAMS="${PARAMS} --outputDir ${OUTPUT_DIR}"

  # grep -c prints 0 and exits non-zero when nothing matches; '|| true' keeps
  # that from aborting the script
  ONLY_INDEX=$(echo ${PARAMS} | grep -c -P -e '--onlyIndex' || true)
  case "${ONLY_INDEX}" in
    0)
      CORE=${NAME}_dev
      ;;
    *)
      PARAMS=$(echo ${PARAMS} | sed -r 's/\s*--onlyIndex//')
      CORE=${NAME}
      ;;
  esac

  ./index \
    --core ${CORE} \
    --file-path ${MARC_DIR} \
    --file-mask ${MASK} \
    ${PARAMS} \
    --trimId \
    2>> ${LOG_DIR}/solr.log
}
# Call ./postprocess-solr with NAME as its argument; the tool's stderr is
# appended to LOG_DIR/solr.log.
do_postprocess_solr() {
  run postprocess-solr
  ./postprocess-solr ${NAME} \
    2>> ${LOG_DIR}/solr.log
}
# Run ./completeness on the input files; output goes to OUTPUT_DIR, the
# tool's stderr to LOG_DIR/completeness.log.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_completeness() {
  # drop the options listed in the pattern before passing TYPE_PARAMS on
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run completeness
  ./completeness \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/completeness.log
}
# Run scripts/sqlite/completeness.sqlite.sh with OUTPUT_DIR and the group
# indicator as arguments.
# Reads: TYPE_PARAMS, OUTPUT_DIR
# Sets:  HAS_GROUP_PARAM
do_completeness_sqlite() {
  run completeness_sqlite
  untrace

  # number of lines (0 or 1) of TYPE_PARAMS that contain "--groupBy <value>";
  # '|| true' absorbs grep's non-zero exit when the count is 0
  HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)

  bash scripts/sqlite/completeness.sqlite.sh \
    ${OUTPUT_DIR} \
    ${HAS_GROUP_PARAM}
}
# Run ./classifications on the input files, then the R script
# scripts/classifications/classifications-type.R on OUTPUT_DIR.
# The tool's stderr goes to LOG_DIR/classifications.log.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_classifications() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run classifications
  ./classifications \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/classifications.log

  Rscript scripts/classifications/classifications-type.R ${OUTPUT_DIR}
}
# Run ./authorities on the input files; output goes to OUTPUT_DIR, the tool's
# stderr to LOG_DIR/authorities.log.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_authorities() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run authorities
  ./authorities \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/authorities.log
}
# Run ./tt-completeness on the input files, then the histogram R script on
# OUTPUT_DIR. The tool's stderr starts LOG_DIR/tt-completeness.log; all
# output of the R script is appended to the same file.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_tt_completeness() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run tt-completeness
  ./tt-completeness \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    --trimId \
    ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/tt-completeness.log

  Rscript scripts/tt-histogram/tt-histogram.R ${OUTPUT_DIR} \
    &>> ${LOG_DIR}/tt-completeness.log

  # for large files
  # php scripts/tt-histogram/tt-histogram.php ${OUTPUT_DIR} &>> ${LOG_DIR}/tt-completeness.log
}
# Run ./shelf-ready-completeness on the input files, then the histogram R
# script on OUTPUT_DIR. The tool's stderr starts
# LOG_DIR/shelf-ready-completeness.log; all output of the R script is
# appended to the same file.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_shelf_ready_completeness() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run shelf-ready-completeness
  ./shelf-ready-completeness --defaultRecordType BOOKS ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ --trimId ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/shelf-ready-completeness.log

  Rscript scripts/shelf-ready/shelf-ready-histogram.R ${OUTPUT_DIR} \
    &>> ${LOG_DIR}/shelf-ready-completeness.log

  # for large files
  # php scripts/shelf-ready-histogram.php ${OUTPUT_DIR} &>> ${LOG_DIR}/shelf-ready-completeness.log
}
# Run ./bl-classification on the input files; output goes to OUTPUT_DIR, the
# tool's stderr to LOG_DIR/bl-classification.log.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_bl_classification() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  # the step label now matches the command name (it used to read
  # "bk-classification")
  run bl-classification
  ./bl-classification \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    --trimId ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/bl-classification.log
}
# Run ./serial-score on the input files, then the histogram R script on
# OUTPUT_DIR. The tool's stderr starts LOG_DIR/serial-score.log; all output
# of the R script is appended to the same file.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_serial_score() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run serial-score
  ./serial-score --defaultRecordType BOOKS ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ --trimId ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/serial-score.log

  Rscript scripts/serial-score/serial-score-histogram.R ${OUTPUT_DIR} \
    &>> ${LOG_DIR}/serial-score.log
}
# Run ./formatter on the input files with the default record type BOOKS.
# Nothing is redirected: the tool writes to the script's own stdout/stderr.
do_format() {
  run format
  ./formatter \
    --defaultRecordType BOOKS \
    ${MARC_DIR}/${MASK}
}
# Run ./functional-analysis on the input files; output goes to OUTPUT_DIR,
# the tool's stderr to LOG_DIR/functional-analysis.log.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_functional_analysis() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run functional-analysis
  ./functional-analysis \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    ${MARC_DIR}/${MASK} \
    2> ${LOG_DIR}/functional-analysis.log
}
# Run the network analysis: extract the network, transform it with R, run
# the pairing action, aggregate the pairs, zip the result files and upload
# the archive with scp.
# The working directory is the same after the call as before it.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_network_analysis() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
  run network-analysis
  ./network-analysis --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/network-analysis.log

  # network.csv (concept, id) ->
  #   network-by-concepts.csv (concept, count, ids)
  #   network-by-record.csv (id, count, concepts)
  #   network-statistics.csv (type, total, single, multi)
  Rscript scripts/network-transform.R ${OUTPUT_DIR} &>> ${LOG_DIR}/network-analysis.log

  # network-by-concepts (concept, count, ids) ->
  #   network-pairs.csv (id1 id2)
  #   network-nodes.csv (id, id)
  ./network-analysis --outputDir ${OUTPUT_DIR} \
    --action pairing \
    &>> ${LOG_DIR}/network-analysis.log

  untrace

  # NOTE(review): these two commands read and write in the current working
  # directory, not in OUTPUT_DIR - confirm that this is where
  # network-pairs.csv is expected.
  sort network-pairs.csv | uniq -c | sort -nr > network-pairs-uniq-with-count.csv
  awk '{print $2 " " $3}' network-pairs-uniq-with-count.csv > network-pairs-all.csv

  log "ziping output"
  # The archive is built inside OUTPUT_DIR. Doing the directory change in a
  # subshell leaves the caller's working directory untouched; the previous
  # code called the non-existent command 'pdw' and stored the result in PWD,
  # which the shell itself overwrites on every 'cd', so it never got back.
  (
    cd "${OUTPUT_DIR}" \
      && zip network-input network-nodes.csv network-nodes-???.csv network-pairs-???.csv network-by-concepts-tags.csv
  )

  log "upload output"
  # NOTE(review): the upload target is hard-coded and its address looks
  # masked in this copy of the file - confirm the real destination.
  scp ${OUTPUT_DIR}/network-input.zip [email protected]:/roedel/pkiraly/network/input

  # spark-shell -I scripts/network.scala --conf spark.driver.metadata.qa.dir="${OUTPUT_DIR}"
  # ./network-export.sh ${OUTPUT_DIR}
}
# Run the R script scripts/pareto/frequency-range.R on OUTPUT_DIR (all of its
# output goes to LOG_DIR/pareto.log). If WEB_DIR is non-empty after sourcing
# ./common-variables, link OUTPUT_DIR/img as WEB_DIR/images/NAME.
# Reads: OUTPUT_DIR, LOG_DIR, NAME, WEB_DIR (optional)
do_pareto() {
  run pareto
  Rscript scripts/pareto/frequency-range.R ${OUTPUT_DIR} \
    &> ${LOG_DIR}/pareto.log
  untrace

  source ./common-variables
  if [[ -n "${WEB_DIR:-}" ]]; then
    mkdir -p ${WEB_DIR}/images
    # NOTE(review): 'ln -s' does not replace an existing link, so a second
    # run for the same NAME may not behave as on the first - confirm.
    ln -s ${OUTPUT_DIR}/img ${WEB_DIR}/images/${NAME}
  fi
}
# Extract the selected values with ./formatter into
# OUTPUT_DIR/marc-history.csv, count identical lines into
# OUTPUT_DIR/marc-history-grouped.csv, then run the R script
# scripts/marc-history/marc-history-grouped.R on OUTPUT_DIR.
# Reads: SCHEMA, TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  SELECTOR, PARAMS
do_marc_history() {
  # the selector depends on the metadata schema
  case "${SCHEMA}" in
    PICA)
      SELECTOR='011@$a;001A$0|extractPicaDate'
      ;;
    *)
      SELECTOR="008~7-10;008~0-5"
      ;;
  esac
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run marc-history
  ./formatter \
    --selector "$SELECTOR" \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --separator "," \
    --outputDir ${OUTPUT_DIR}/ \
    --fileName "marc-history.csv" \
    ${MARC_DIR}/${MASK} \
    &> ${LOG_DIR}/marc-history.log
  # | grep -v '008~7-10,008~0-5' \

  untrace
  # skip the first line, count identical lines, put a comma between the count
  # and the value, then strip every remaining space
  tail -n +2 ${OUTPUT_DIR}/marc-history.csv \
    | sort \
    | uniq -c \
    | sed -r 's/([0-9]) ([0-9uc xwticrka])/\1,\2/' \
    | sed 's, ,,g' \
    > ${OUTPUT_DIR}/marc-history-grouped.csv

  set -x
  Rscript scripts/marc-history/marc-history-grouped.R ${OUTPUT_DIR} \
    &>> ${LOG_DIR}/marc-history.log
}
# Run the R script scripts/record-patterns/top-fields.R and ./record-patterns,
# then build OUTPUT_DIR/record-patterns-grouped.csv: the first line of
# record-patterns.csv prefixed with "count,", followed by the distinct lines
# that contain no '$', each prefixed with its number of occurrences, most
# frequent first.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_record_patterns() {
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')

  run record-patterns
  Rscript scripts/record-patterns/top-fields.R ${OUTPUT_DIR} \
    &>> ${LOG_DIR}/top-fields.log
  ./record-patterns \
    --defaultRecordType BOOKS \
    ${PARAMS} \
    --outputDir ${OUTPUT_DIR}/ \
    ${MARC_DIR}/${MASK} \
    &> ${LOG_DIR}/record-patterns.log

  # header line of the grouped file
  head -1 ${OUTPUT_DIR}/record-patterns.csv \
    | sed -e 's/^/count,/' \
    > ${OUTPUT_DIR}/record-patterns-grouped.csv

  # data lines: "<count>,<pattern>"
  grep -v "\\$" ${OUTPUT_DIR}/record-patterns.csv \
    | sort \
    | uniq -c \
    | sort -n -r \
    | sed -r 's/^ *([0-9]+) /\1,/' \
    >> ${OUTPUT_DIR}/record-patterns-grouped.csv
}
# When VERSION is set, (re)create the symbolic link BASE_OUTPUT_DIR/NAME that
# points to OUTPUT_DIR. Does nothing apart from printing the step header when
# VERSION is empty or unset.
# Reads: VERSION (optional), BASE_OUTPUT_DIR, NAME, OUTPUT_DIR
# Sets:  OUTPUT_LINK
do_version_link() {
  run version-link
  untrace

  if [[ "${VERSION:-}" != "" ]]; then
    OUTPUT_LINK=${BASE_OUTPUT_DIR}/${NAME}

    # -e follows symbolic links and is therefore false for a dangling link;
    # -L catches that case, so that 'ln -s' below does not fail with
    # "File exists".
    if [[ -e ${OUTPUT_LINK} || -L ${OUTPUT_LINK} ]]; then
      rm ${OUTPUT_LINK}
    fi

    # A relative link target is resolved relative to the directory of the
    # link, not to the current directory, so a relative OUTPUT_DIR (the
    # default is ./output/...) would produce a dangling link. Use the
    # absolute path whenever the directory exists.
    local target=${OUTPUT_DIR}
    if [[ -d ${OUTPUT_DIR} ]]; then
      target=$(cd -- "${OUTPUT_DIR}" > /dev/null && pwd)
    fi

    ln -s "${target}" ${OUTPUT_LINK}
  fi
}
# Import the result of the validation into OUTPUT_DIR/qa_catalogue.sqlite:
# normalize the issue details with a PHP script, create the tables, import
# the CSV files (without their header lines), delete the temporary files and
# finally run the indexing scripts.
# Reads: TYPE_PARAMS, OUTPUT_DIR, LOG_DIR, NAME
# Sets:  HAS_GROUP_PARAM, SOLR_FOR_SCORES_URL, ONLY_INDEX
do_validate_sqlite() {
  run "validate sqlite"
  php scripts/sqlite/normalize-issue-details.php ${OUTPUT_DIR} &> ${LOG_DIR}/sqlite.log
  untrace

  # log "delete DB"
  # if [[ -e ${OUTPUT_DIR}/qa_catalogue.sqlite ]]; then
  #   rm ${OUTPUT_DIR}/qa_catalogue.sqlite
  # fi

  # 1 if TYPE_PARAMS contains "--groupBy <value>", otherwise 0
  HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
  if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
    log "create DB structure"
    sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/qa_catalogue.sqlite.sql
  else
    log "create DB structure (grouped)"
    sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/qa_catalogue.grouped.sqlite.sql
  fi

  log "create importable files"
  # the import below expects files without a header line
  tail -n +2 ${OUTPUT_DIR}/issue-details-normalized.csv > ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
  tail -n +2 ${OUTPUT_DIR}/issue-summary.csv > ${OUTPUT_DIR}/issue-summary_noheader.csv
  if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
    tail -n +2 ${OUTPUT_DIR}/id-groupid.csv > ${OUTPUT_DIR}/id-groupid_noheader.csv
  fi

  log "import issue details"
  sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/issue-details-normalized_noheader.csv issue_details
EOF

  log "import issue summary"
  sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/issue-summary_noheader.csv issue_summary
EOF

  if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
    log "import id_groupid"
    sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/id-groupid_noheader.csv id_groupid
EOF
  fi

  log "delete importable files"
  rm ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
  rm ${OUTPUT_DIR}/issue-details-normalized.csv
  rm ${OUTPUT_DIR}/issue-summary_noheader.csv
  if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
    rm ${OUTPUT_DIR}/id-groupid_noheader.csv
  fi

  SOLR_FOR_SCORES_URL=$(echo $TYPE_PARAMS | grep -P -o --regexp='--solrForScoresUrl \K([^ ]+)' || true)
  # Read the option from TYPE_PARAMS: PARAMS is not set by this function, so
  # it was either a leftover of an earlier step or unset (when the command
  # 'validate-sqlite' is run on its own).
  ONLY_INDEX=$(echo ${TYPE_PARAMS} | grep -c -P -e '--onlyIndex' || true)

  if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
    log "index"
    sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/modify-tables.sql &>> ${LOG_DIR}/sqlite.log

    if [[ "${SOLR_FOR_SCORES_URL}" != "" ]]; then
      echo "index at ${SOLR_FOR_SCORES_URL}"
      # index id-groupid.csv and issue-details.csv
      scripts/sqlite/index-issue-details.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${ONLY_INDEX} ${SOLR_FOR_SCORES_URL}
    fi
  else
    log "index (grouped)"
    scripts/sqlite/index-issue-details.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${ONLY_INDEX} ${SOLR_FOR_SCORES_URL}
    scripts/sqlite/calculate-aggregated-numbers.grouped.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${SOLR_FOR_SCORES_URL}
  fi
}
# Import the result of the validation into the MySQL database called NAME:
# normalize the issue details with a PHP script, drop and re-create the
# tables, copy the CSV files into the server's secure_file_priv directory
# (with sudo) and load them with LOAD DATA INFILE.
# The MySQL client always reads its settings from mysql.client.cnf.
# Reads: TYPE_PARAMS, OUTPUT_DIR, LOG_DIR, NAME
# Sets:  HAS_GROUP_PARAM, MYSQL_SEC_DIR
do_mysql() {
  run mysql
  php scripts/sqlite/normalize-issue-details.php ${OUTPUT_DIR} &> ${LOG_DIR}/mysql.log
  untrace

  # log "delete DB"
  # if [[ -e ${OUTPUT_DIR}/qa_catalogue.sqlite ]]; then
  #   rm ${OUTPUT_DIR}/qa_catalogue.sqlite
  # fi

  # the client invocation shared by every statement below
  local -a mysql_client=(mysql --defaults-extra-file=mysql.client.cnf ${NAME})
  local table schema_sql

  for table in issue_details issue_summary issue_groups id_groupid; do
    "${mysql_client[@]}" -e "DROP TABLE IF EXISTS ${table}"
  done

  # 1 if TYPE_PARAMS contains "--groupBy <value>", otherwise 0
  HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
  if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
    log "create DB structure"
    schema_sql=scripts/sqlite/qa_catalogue.mysql.sql
  else
    log "create DB structure (grouped)"
    schema_sql=scripts/sqlite/qa_catalogue.grouped.mysql.sql
  fi
  "${mysql_client[@]}" < ${schema_sql}

  log "create importable files"
  # tail -n +2 ${OUTPUT_DIR}/issue-details-normalized.csv > ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
  # tail -n +2 ${OUTPUT_DIR}/issue-summary.csv > ${OUTPUT_DIR}/issue-summary_noheader.csv
  # if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
  #   tail -n +2 ${OUTPUT_DIR}/id-groupid.csv > ${OUTPUT_DIR}/id-groupid_noheader.csv
  # fi

  log "import issue details"
  # ask the server for the value of its secure_file_priv variable
  MYSQL_SEC_DIR=$("${mysql_client[@]}" -e 'SHOW VARIABLES LIKE "secure_file_priv"\G' | grep 'Value:' | sed 's,\s*Value: ,,')
  sudo cp ${OUTPUT_DIR}/issue-details-normalized.csv ${MYSQL_SEC_DIR}
  "${mysql_client[@]}" -f << EOF
LOAD DATA INFILE '${MYSQL_SEC_DIR}/issue-details-normalized.csv'
INTO TABLE issue_details
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
EOF
  # sudo rm ${MYSQL_SEC_DIR}/issue-details-normalized.csv

  log "import issue summary"
  sudo cp ${OUTPUT_DIR}/issue-summary.csv ${MYSQL_SEC_DIR}
  "${mysql_client[@]}" -f << EOF
LOAD DATA INFILE '${MYSQL_SEC_DIR}/issue-summary.csv'
INTO TABLE issue_summary
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
EOF
  # sudo rm ${MYSQL_SEC_DIR}/issue-summary.csv

  if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
    log "import id_groupid"
    sudo cp ${OUTPUT_DIR}/id-groupid.csv ${MYSQL_SEC_DIR}
    # NOTE(review): unlike the imports above, this client call runs under
    # sudo - confirm that this is intended.
    sudo "${mysql_client[@]}" -f << EOF
LOAD DATA INFILE '${MYSQL_SEC_DIR}/id-groupid.csv'
INTO TABLE id_groupid
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
EOF
    # sudo rm ${MYSQL_SEC_DIR}/id-groupid.csv
  fi

  log "delete importable files"
  # rm ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
  # rm ${OUTPUT_DIR}/issue-summary_noheader.csv
  # if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
  #   rm ${OUTPUT_DIR}/id-groupid_noheader.csv
  # fi

  if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
    log "index"
    # NOTE(review): this branch modifies the SQLite database although the
    # function otherwise works on MySQL - looks like a leftover, confirm.
    sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/modify-tables.sql &>> ${LOG_DIR}/mysql.log
  else
    log "index (grouped)"
    "${mysql_client[@]}" < scripts/sqlite/modify-tables.grouped.mysql.sql &>> ${LOG_DIR}/mysql.log
  fi
}
# Export three variants of the schema with ./export-schema, each formatted by
# jq, into the directory avram-schemas (created if missing).
do_export_schema_files() {
  mkdir -p avram-schemas
  run avram
  untrace

  ./export-schema --withSubfieldCodelists \
    | jq . \
    > avram-schemas/marc-schema.json

  ./export-schema --withSubfieldCodelists --solrFieldType mixed --withSelfDescriptiveCode \
    | jq . \
    > avram-schemas/marc-schema-with-solr.json

  ./export-schema --withSubfieldCodelists --solrFieldType mixed --withSelfDescriptiveCode --withLocallyDefinedFields \
    | jq . \
    > avram-schemas/marc-schema-with-solr-and-extensions.json

  log "files generated at 'avram-schemas' directory: marc-schema.json, marc-schema-with-solr.json, marc-schema-with-solr-and-extensions.json"
}
# Run the Java class CliParameterDefinitionsExporter and store its output,
# formatted by jq, in cli-parameter-definitions.json.
# JAR is used as the class path; presumably it is defined by
# ./common-variables, which is sourced first - verify.
do_export_cli_parameters() {
  source ./common-variables

  java -cp ${JAR} de.gwdg.metadataqa.marc.cli.CliParameterDefinitionsExporter \
    | jq . \
    > cli-parameter-definitions.json

  log "CLI parameters has been exported into cli-parameter-definitions.json"
}
# Run ./shacl4bib on the input files and the R script
# scripts/shacl4bib/shacl4bib.R on OUTPUT_DIR, then import
# OUTPUT_DIR/shacl4bib.csv into the table 'shacl' of
# OUTPUT_DIR/qa_catalogue.sqlite. The table definition is generated from the
# header line of the CSV file.
# Reads: TYPE_PARAMS, OUTPUT_DIR, MARC_DIR, MASK, LOG_DIR
# Sets:  PARAMS
do_shacl4bib() {
  # note: SHACL specific parameters are missing here --shaclConfigurationFile, --shaclOutputType, --shaclOutputFile
  PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
  run shacl4bib

  echo " ./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/shacl4bib.log"
  ./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/shacl4bib.log

  Rscript scripts/shacl4bib/shacl4bib.R ${OUTPUT_DIR}

  log "import issue details"

  # IF EXISTS: on the first run there is no 'shacl' table yet, and a plain
  # DROP TABLE would be reported as an error by sqlite3.
  echo "DROP TABLE IF EXISTS shacl;" > scripts/shacl4bib/shacl4bib.sql
  # Turn the CSV header into a CREATE TABLE statement: every column becomes
  # VARCHAR(2), except the (first) id column, which becomes VARCHAR(100).
  head -1 ${OUTPUT_DIR}/shacl4bib.csv \
    | sed 's/,/" VARCHAR(2), "/g' \
    | sed 's/^/CREATE TABLE shacl( "/' \
    | sed 's/id" VARCHAR(2)/id" VARCHAR(100)/' \
    | sed 's/$/" VARCHAR(2)\n);/' \
    | sed 's/,/,\n /g' >> scripts/shacl4bib/shacl4bib.sql
  sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/shacl4bib/shacl4bib.sql
  rm scripts/shacl4bib/shacl4bib.sql

  # the import expects a file without the header line
  tail -n +2 ${OUTPUT_DIR}/shacl4bib.csv > ${OUTPUT_DIR}/shacl4bib-noheader.csv
  sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/shacl4bib-noheader.csv shacl
EOF
}
# Run every analysis listed in ANALYSES (comma separated), in the given
# order. A name "x" is mapped to the function "do_x"; hyphens are accepted as
# well as underscores ("tt-completeness" and "tt_completeness" both run
# do_tt_completeness). All names are checked before the first task starts;
# an unknown name ends the script via fatal.
# After each task its exit status is printed.
# Reads: ANALYSES
do_all_analyses() {
  local analysis_tasks task status

  # split on commas and normalize the command-style spelling with hyphens
  analysis_tasks=$(echo "${ANALYSES}" | tr ',-' ' _')

  for task in $analysis_tasks; do
    declare -F "do_$task" > /dev/null || fatal "unknown analysis task: $task"
  done

  for task in $analysis_tasks; do
    "do_$task"
    # must be read right away: any later command overwrites $?
    status=$?
    echo "##"
    echo "## THE RESULT IS ${status}"
    echo "##"
  done
}
# Run the three Solr steps one after the other: prepare, index, postprocess.
do_all_solr() {
  local step
  for step in do_prepare_solr do_index do_postprocess_solr; do
    "${step}"
  done
  # if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
  #   php scripts/sqlite/solr-copy-ids-from-validation.php ${NAME}_validation ${NAME} 2>> ${LOG_DIR}/solr.log
  # fi
}
# ---- usage and execution of processing steps ----
# Print the usage message: the available commands and the variables that
# control the script. The script name shown is taken from $0.
# Sets: ME
help() {
  ME=$0
  cat <<END
Run QA catalogue analyses
${ME} [VARIABLES] <COMMAND[,COMMAND...]>
Commands:
validate record validation
validate-sqlite import result of validation to SQLite
completeness completeness analysis
completeness-sqlite import grouped output of completeness to SQLite
classifications classification analysis
authorities authority analysis
tt-completeness Thompson-Traill completeness analysis
shelf-ready-completeness shelf-ready completeness analysis
bl-classification British Library's quality classification
serial-score serial score analysis
format search and format records
functional-analysis FRBR functional requirement analysis
network-analysis network analysis
pareto pareto analysis
marc-history generating cataloguing history chart
record-patterns record patterns
mysql import result of validation to MySQL
prepare-solr prepare indexing
index indexing with Solr
postprocess-solr postprocess indexing (swap NAME and NAME_dev indexes)
export-schema-files export schema files
export-cli-parameters export the parameters of the CLI components into a JSON file
shacl4bib run SHACL-like validation
version-link create the symbolic link from BASE_OUTPUT_DIR/NAME to OUTPUT_DIR (if VERSION is set)
all-analyses run all analytical tasks (or those set via ANALYSES)
all-solr run all indexing tasks
all run all tasks (analyses and indexing)
config show configuration
help show this help message
Variable of the setdir.sh script:
BASE_INPUT_DIR the base input directory (default: -./input)
BASE_OUTPUT_DIR the base output directory (default: -./output)
Environmental variables:
NAME the machine readable name of the dataset. Usually same as the MARC_DIR variable
MARC_DIR the name of the subdirectory where bibliographical data stored. If not set it will
be set to BASE_INPUT_DIR/NAME.
MASK the mask of the bibliographical data files (e.g. marc*.mrc.gz)
OUTPUT_DIR the name of subdirectory where the output of the analysis will be stored. If not
set it will be set to BASE_OUTPUT_DIR/NAME or if VERSION is set then it will be
set to BASE_OUTPUT_DIR/NAME-VERSION.
TYPE_PARAMS all parameters of analyses which are described in the documentation
ANALYSES a comma separated list of analyses (commands) to execute
SCHEMA the metadata schema (MARC21 (default) or PICA)
WEB_DIR the directory of the qa-catalogue-web
LOG_DIR the directory where log files are written (default: BASE_LOG_DIR/NAME)
UPDATE the date time string (in YYYY-mm-dd H:M:s format) of the last data update.
It will be stored into OUTPUT_DIR/last-update.csv
VERSION a version number for the source data (e.g. the date of the update). If set, the actual
OUTPUT_DIR path will contain it, and there will symbolic link created to the latest one
(from BASE_OUTPUT_DIR/NAME to BASE_OUTPUT_DIR/NAME-VERSION).
more info: https://github.com/pkiraly/qa-catalogue
END
}
# Print the current configuration as NAME=value lines, followed by the
# content of ./common-variables. Variables that are not set are printed with
# an empty value.
config() {
  # "${VAR:-}" instead of 'set +u': the shell option would stay switched off
  # for the rest of the script, because it was never switched back on.
  echo "NAME=${NAME:-}"
  echo "MARC_DIR=${MARC_DIR:-}"
  echo "MASK=${MASK:-}"
  echo "OUTPUT_DIR=${OUTPUT_DIR:-}"
  echo "TYPE_PARAMS=${TYPE_PARAMS:-}"
  echo "ANALYSES=${ANALYSES:-}"
  echo "SCHEMA=${SCHEMA:-}"
  echo "WEB_DIR=${WEB_DIR:-}"
  echo "LOG_DIR=${LOG_DIR:-}"
  echo "UPDATE=${UPDATE:-}"
  cat ./common-variables
}
# Print an error message via colored (colour code "1;31") and end the script
# with exit status 1.
#   $1 - the message; no newline is added
fatal() {
  colored "1;31" "${1}"
  exit 1
}
# initialize environment, set default values
# TODO: remove this because it's already done in ./qa-catalogue
NAME=${NAME:-$(basename "$0" .sh)}
BASE_INPUT_DIR=${BASE_INPUT_DIR:-./input}
BASE_OUTPUT_DIR=${BASE_OUTPUT_DIR:-./output}
BASE_LOG_DIR=${BASE_LOG_DIR:-./logs}
MARC_DIR=${MARC_DIR:-$BASE_INPUT_DIR/$NAME}
SCHEMA=${SCHEMA:-MARC21}
# TYPE_PARAMS is optional; without a default every later use would abort the
# script under 'set -u' with "unbound variable"
TYPE_PARAMS=${TYPE_PARAMS:-}

# NOTE(review): LOG_DIR and OUTPUT_DIR are always derived from the base
# directories here, so values passed in from the environment are ignored,
# although the help text describes them as settable - confirm the intention.
LOG_DIR=${BASE_LOG_DIR}/${NAME}
if [[ "${VERSION:-}" != "" ]]; then
  OUTPUT_DIR=${BASE_OUTPUT_DIR}/${NAME}-${VERSION}
else
  OUTPUT_DIR=${BASE_OUTPUT_DIR}/${NAME}
fi

# which tasks to run on `all-analyses`
if [[ "${SCHEMA}" == "PICA" ]]; then
  ALL_ANALYSES=validate,validate_sqlite,completeness,completeness_sqlite,classifications,authorities,marc_history

  # Add the PICA specific options unless they are already given. The option
  # has to start at the beginning of the string or after whitespace;
  # otherwise the short forms would also match inside other options (e.g.
  # "-f" inside "--fixAlma").
  marc_format_re='(^|[[:space:]])(--marcFormat|-f)'
  schema_type_re='(^|[[:space:]])(--schemaType|-w)'
  [[ "$TYPE_PARAMS" =~ $marc_format_re ]] || TYPE_PARAMS="--marcFormat PICA_NORMALIZED $TYPE_PARAMS"
  [[ "$TYPE_PARAMS" =~ $schema_type_re ]] || TYPE_PARAMS="--schemaType PICA $TYPE_PARAMS"
else
  ALL_ANALYSES=validate,validate_sqlite,completeness,completeness_sqlite,classifications,authorities,tt_completeness,shelf_ready_completeness,serial_score,functional_analysis,pareto,marc_history
fi
ANALYSES=${ANALYSES:-$ALL_ANALYSES}
# The first argument is a comma separated list of commands; without it the
# help is shown.
tasks="${1:-help}"

# Find out whether data is going to be processed: this is the case as soon as
# one command is something other than the four listed below.
datatask=
for task in ${tasks//,/ }; do
  case "$task" in
    help | config | export-schema-files | export-cli-parameters) ;;
    *) datatask=true ;;
  esac
done

# For processing commands: create the log and output directories, report the
# locations, and stop if no input file matches MARC_DIR/MASK.
if [[ "$datatask" = true ]]; then
  mkdir -p $LOG_DIR
  mkdir -p $OUTPUT_DIR

  log "input: $MARC_DIR/$MASK"
  log "output: $OUTPUT_DIR"
  log "logs: $LOG_DIR"

  ls ${MARC_DIR}/${MASK} &> /dev/null \
    || fatal "Missing input files: ${MARC_DIR}/${MASK}!\n"

  # record the time of the last data update, if given
  if [[ -n "${UPDATE:-}" ]]; then
    log "update: $UPDATE"
    echo "${UPDATE}" > "${OUTPUT_DIR}/last-update.csv"
  fi
fi
# Execute the requested commands one after the other, in the given order.
# 'validate' and 'completeness' also run their SQLite import step.
for task in ${tasks//,/ }; do
  case $task in
    validate) do_validate ; do_validate_sqlite ;;
    validate-sqlite) do_validate_sqlite ;;
    prepare-solr) do_prepare_solr ;;
    index) do_index ;;
    postprocess-solr) do_postprocess_solr ;;
    completeness) do_completeness ; do_completeness_sqlite ;;
    completeness-sqlite) do_completeness_sqlite ;;
    classifications) do_classifications ;;
    authorities) do_authorities ;;
    tt-completeness) do_tt_completeness ;;
    shelf-ready-completeness) do_shelf_ready_completeness ;;
    bl-classification) do_bl_classification ;;
    serial-score) do_serial_score ;;
    format) do_format ;;
    functional-analysis) do_functional_analysis ;;
    network-analysis) do_network_analysis ;;
    pareto) do_pareto ;;
    marc-history) do_marc_history ;;
    record-patterns) do_record_patterns ;;
    mysql) do_mysql ;;
    export-schema-files) do_export_schema_files ;;
    export-cli-parameters) do_export_cli_parameters ;;
    shacl4bib) do_shacl4bib ;;
    all-analyses) do_all_analyses ;;
    all-solr) do_all_solr ;;
    all) do_all_analyses ; do_all_solr ;;
    version-link) do_version_link ;;
    config) config ;;
    help) help ;;
    # report the offending command itself; $1 is the complete list
    *) fatal "unknown command: $task" ;;
  esac
done
# Switch tracing off and, if data was processed, report the run time.
# SECONDS is maintained by bash: the number of seconds since the shell
# started.
untrace
if [[ "$datatask" = true ]]; then
  sec=$SECONDS
  # format as HH:MM:SS
  printf -v elapsed '%02d:%02d:%02d' $((sec / 3600)) $((sec % 3600 / 60)) $((sec % 60))
  log "DONE in ${elapsed}"
fi