#!/bin/sh
# tex2xml.sh
#
# Usage: tex2xml.sh TEX_BASENAME BIB_BASENAME [MATH]
#
# The shebang has to be the very first line of the file to take effect, and
# the script name is kept as a comment so that it is not run as a command.
#
# $1: .tex file name, without extension
# $2: .bib file name, without extension
# $3: math mode flag, handed on as-is to cleanjats.py.
#     Defaults to "false" when omitted or empty.
math=${3:-false}
# Clean some stuff in the tex file before pandoc.
# All edits are applied to a copy ($1_copy.tex); the original .tex is only read.
# Stop here if the copy cannot be created: every later step works on it.
cp -- "$1.tex" "$1_copy.tex" || exit 1
# One sed pass applying, in order:
#   figure*                 -> figure
#   \makeseistitle{         -> \makeseistitle, a line break, then {%
#                              (relies on sed turning \n in the replacement
#                              into a newline, as GNU sed does)
#   seistable               -> tabular
#   \begin{acknowledgements} gets the literal text "Acknowledgements" appended
sed -i.bak \
  -e 's/figure\*/figure/g' \
  -e 's/\\makeseistitle{/\\makeseistitle\n{%/g' \
  -e 's/seistable/tabular/g' \
  -e 's/\\begin{acknowledgements}/\\begin{acknowledgements}Acknowledgements/g' \
  "$1_copy.tex"
# Clean inline code: replace '\code{...}' with '{...}' in the .tex file.
# The match is case-insensitive and, since '.' does not match a newline here,
# only covers \code{...} groups that open and close on the same line.
perl -i.bak -00pe 's/\\code\{(.*?)\}/\{$1\}/ig' "$1_copy.tex"
# Clean spaces in \ref, \label: every space between "ref{" / "label{" and the
# next "}" becomes an underscore.
# The here-document delimiter is quoted and the file name is passed as an
# argument, so the shell never interpolates anything into the Python source
# (a file name containing quotes, backslashes or '$' is therefore harmless).
python3 - "$1_copy.tex" << 'END'
import re
import sys

path = sys.argv[1]

# Read and write with the same explicit encoding instead of reading with
# whatever the current locale happens to select.
with open(path, encoding="utf-8") as fi:
    tex = fi.read()


def underscore_spaces(match):
    """Return the whole matched text with spaces replaced by underscores."""
    return match[0].replace(" ", "_")


# The patterns are unanchored, so any command name ending in "ref" or "label"
# is covered as well.
tex = re.sub(r"(ref{)(.*?)(})", underscore_spaces, tex)
tex = re.sub(r"(label{)(.*?)(})", underscore_spaces, tex)

with open(path, "w", encoding="utf-8") as fi:
    fi.write(tex)
END
# Sort the bib file alphabetically by first author.
# NOTE(review): sort_bib.py is expected to write $2_sort.bib, which pandoc
# reads below - confirm against sort_bib.py.
python3 sort_bib.py "$2"
# Convert tex file to jats xml file
pandoc "$1_copy.tex" \
  -f latex \
  -t jats+element_citations \
  --citeproc \
  --bibliography="$2_sort.bib" \
  --mathjax \
  --metadata link-citations=true \
  --natbib \
  --csl apa.csl \
  -s \
  -o "$1.xml"
pandoc_status=$?
# Remove the working copy of the .tex file and the backup left by sed/perl
rm -f -- "$1_copy.tex" "$1_copy.tex.bak"
# Everything after this point edits $1.xml; if pandoc did not produce it there
# is nothing valid to post-process, so stop with pandoc's own exit status.
if [ "$pandoc_status" -ne 0 ]; then
  printf '%s\n' "tex2xml.sh: pandoc failed with exit status $pandoc_status" >&2
  exit "$pandoc_status"
fi
# Clean stuff
# Execute the Python script to clean the generated .xml file.
# Both arguments are quoted so that each reaches the script as a single word.
python3 cleanjats.py "$1" "$math"
# Extract and replace the bibliography section in the generated .xml file with contents from 'bib.xml'
# NOTE(review): bib.xml is not created anywhere in this script; presumably
# cleanjats.py writes it - confirm.
# Step 1: reduce bib.xml to its <back>...</back> lines only.
sed -i.bak -n '/<back>/,/<\/back>/p' bib.xml
# Step 2: in $1.xml, lines outside <back>...</back> pass through untouched
# ('!b'); inside that range every line is deleted, and when the closing
# </back> line is reached the contents of bib.xml are emitted in its place.
sed -i.bak -e '/<back>/,/<\/back>/!b' -e '/<\/back>/!d;r bib.xml' -e 'd' "$1.xml"
# Clean metadata and replace in the xml file.
# Replace '\&' with '&amp;' in the metadata file. In a sed replacement a bare
# '&' stands for the matched text, so it must be written '\&' to produce a
# literal ampersand.
sed -i.bak 's/\\&/\&amp;/g' "$1_metadata.xml"
# Swap the <front>...</front> section of $1.xml for the contents of
# $1_metadata.xml and write the result to $1_galley.xml: lines outside the
# range pass through, lines inside it are dropped, and the metadata file is
# emitted when the closing </front> line is reached.
sed -e '/<front>/,/<\/front>/!b' -e "/<\/front>/!d;r $1_metadata.xml" -e 'd' "$1.xml" > "$1_galley.xml"
# Replace '\&' with '&amp;' in the 'credits.xml' file and add it as a section in the 'galley.xml' file
sed -i.bak 's/\\&/\&amp;/g' "$1_credits.xml"
# Print the credits file just before the first line containing </body>.
# The file name reaches perl through the environment rather than being
# spliced into the program text, so characters such as '"', '$' or '@' in the
# name cannot alter the perl code. An unreadable credits file is reported on
# stderr and the galley is left without a credits section.
CREDITS_FILE="$1_credits.xml" perl -i.bak -pe '
  if (/<\/body>/ && !$inserted) {
    if (open(my $fh, "<", $ENV{CREDITS_FILE})) {
      print <$fh>;
      close($fh);
    } else {
      warn "tex2xml.sh: cannot read $ENV{CREDITS_FILE}: $!\n";
    }
    $inserted = 1;
  }
' "$1_galley.xml"
# Clean and format multiple abstracts in the 'galley.xml' file.
# perl runs in paragraph mode (-00), so each substitution sees one
# blank-line-delimited chunk at a time and can match across the line breaks
# inside that chunk.
# Collapse an opening <boxed-text> directly followed by another one ...
perl -i.bak -00pe 's/<boxed-text>\n\s*<boxed-text>/<boxed-text>/g' "$1_galley.xml"
# ... and likewise for two consecutive closing tags.
perl -i.bak -00pe 's/<\/boxed-text>\n\s*<\/boxed-text>/<\/boxed-text>/g' "$1_galley.xml"
# Make the first word of the abstract bold: the leading word (5 to 25
# non-space characters) or the phrase "Non-technical summary" is wrapped in
# <bold>, and a full stop is added after it.
perl -i.bak -00pe 's/<boxed-text>[\n\t ]*?<p>(Non-technical summary|\S{5,25}) ([\S\d\n\t ]*?)<\/p>[\n\t ]*?<\/boxed-text>/<boxed-text><p>\n <bold>$1.<\/bold> $2\n<\/p><\/boxed-text>/gmi' "$1_galley.xml"
# Clean all sed/perl backup files.
# Every path is quoted and '--' ends option parsing, so a base name with
# spaces or a leading '-' can never make rm touch anything else. These are
# all regular files, so no recursive delete is needed.
rm -f -- \
  "$1_metadata.xml.bak" \
  "$1_credits.xml.bak" \
  "$1_galley.xml.bak" \
  "$1.xml.bak" \
  bib.xml.bak
# Clean all working files; only $1_galley.xml is left behind as the result.
rm -f -- \
  "$1_metadata.xml" \
  "$1_credits.xml" \
  "$1.xml" \
  bib.xml
#eof