-
Notifications
You must be signed in to change notification settings - Fork 3
/
opus.sh
71 lines (58 loc) · 2.25 KB
/
opus.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/bin/bash
# Usage: ./opus.sh <lang_pair>
#   lang_pair is a language pair such as es-fr, de-en, fr-ru, en-ru, de-fr
#   or es-it.
# The existence checks in this script are there so that it can be re-run to
# resume where it left off after a sudden stop.
set -e

# Input language pair. Refuse to continue without one: an empty value would
# make every path below resolve to $PARA_PATH itself.
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "usage: $0 <lang_pair>   (e.g. de-en)" >&2
  exit 2
fi
pair=$1

# Folder in which the parallel data is stored.
PARA_PATH=/content/data/para
mkdir -p "$PARA_PATH"

# OPUS data source (MultiUN or OpenSubtitles2018 or
# MultiUN,OpenSubtitles2018): customize as needed.
SRC=OpenSubtitles2018,MultiUN

# Per-pair working directory; kept if it is already there from an earlier run.
if [[ ! -d "$PARA_PATH/$pair" ]]; then
  mkdir "$PARA_PATH/$pair"
else
  echo "dir $PARA_PATH/${pair} already exists"
fi

echo -e "\n"
echo "***Download data and unzip it in $PARA_PATH/$pair ***"
#######################################
# Download one OPUS corpus archive for the current language pair and
# extract it next to the downloaded file.
# Globals:   pair (read) - language pair, e.g. de-en
# Arguments: $1 - root data folder (PARA_PATH); files land in $1/$pair
#            $2 - OPUS corpus name, e.g. MultiUN
# Returns:   2 if an argument is missing, otherwise the status of the
#            failing wget/unzip call (0 on success)
#######################################
download_and_unzip_data() {
  if (( $# < 2 )); then
    echo "download_and_unzip_data: expected <para_path> <source>" >&2
    return 2
  fi

  local dest_dir="$1/${pair:?language pair is not set}"
  # wget stores the file under the last URL component, query string included.
  local archive="download.php?f=$2%2F${pair}.txt.zip"

  # Everything is quoted: the name contains '?', a glob character, and the
  # data folder may contain whitespace.
  wget -c "http://opus.nlpl.eu/${archive}" -P "$dest_dir" || return
  unzip -u "$dest_dir/${archive}" -d "$dest_dir"
}
# Pick the corpora to fetch from the configured SRC and the language pair,
# then download and extract each one through download_and_unzip_data.
case "$SRC" in
  "MultiUN,OpenSubtitles2018" | "OpenSubtitles2018,MultiUN")
    if [[ "$pair" != "es-it" ]]; then
      # es-fr, de-en, fr-ru, en-ru, de-fr...: every corpus listed in SRC.
      IFS=',' read -r -a corpora <<< "$SRC"
    else
      # es-it is fetched from its own fixed list of corpora instead.
      corpora=(OpenSubtitles GlobalVoices EUbookshop)
    fi
    for src in "${corpora[@]}"; do
      download_and_unzip_data "$PARA_PATH" "$src"
    done
    ;;
  "MultiUN" | "OpenSubtitles2018")
    if [[ "$pair" != "es-it" ]]; then
      # es-fr, de-en, fr-ru, en-ru, de-fr...
      download_and_unzip_data "$PARA_PATH" "$SRC"
    else
      # es-it always uses OpenSubtitles2018, whichever single source is set.
      download_and_unzip_data "$PARA_PATH" OpenSubtitles2018
    fi
    ;;
  *)
    # Report on stderr and exit non-zero: a bare `exit` would return the
    # status of the echo (0) and make the failure look like success.
    echo "source error : $SRC is not valid source, choose between MultiUN and OpenSubtitles2018" >&2
    exit 1
    ;;
esac
echo -e "\n"
echo "*** Convert to txt***"
# For each language of the pair, concatenate every extracted
# "<corpus>.<pair>.<lang>" file into a single "<pair>.<lang>.txt".
IFS='-' read -r -a langs <<< "$pair"
for lg in "${langs[@]}"; do
  merged="$PARA_PATH/$pair/$pair.$lg.txt"
  if [[ ! -f "$merged" ]]; then
    # Build the file under a temporary name and rename it only on success.
    # Redirecting straight into the final name would leave an empty or
    # partial file behind if cat fails or the script is interrupted, and the
    # existence check above would then skip it on the next run.
    if cat "$PARA_PATH/$pair"/*."$pair.$lg" > "$merged.tmp"; then
      mv -- "$merged.tmp" "$merged"
    else
      rm -f -- "$merged.tmp"
      echo "error: could not build $merged" >&2
      exit 1
    fi
  else
    echo "file $PARA_PATH/${pair}/$pair.$lg.txt already exists"
  fi
done