Spaces:
Build error
Build error
# Prepare a German monolingual news-crawl sample for back-translation:
#   1. download the WMT news-crawl dumps into $orig/
#   2. draw a random sample of $SUBSAMPLE_SIZE lines, normalize punctuation,
#      strip non-printing characters and tokenize (Moses scripts)
#   3. apply the BPE codes in $BPE_CODE
#   4. remove duplicate lines (deduplicate_lines.py)
#   5. split the result into shards of 1,000,000 lines under $OUTDIR/
#
# Requires bash (arrays). All paths are relative to the current directory,
# which must contain mosesdecoder/, subword-nmt/, wmt18_en_de/code and
# deduplicate_lines.py.
#
# Every step is skipped when its output already exists, so the script can be
# re-run after a failure. To keep that safe, steps 1-4 write to "<name>.part"
# and rename only on success; a failed or interrupted step therefore never
# leaves behind a file that a later run would mistake for finished output.

# Abort on the first failing command, on unset variables, and when any stage
# of a pipeline fails (otherwise a broken gzip/perl stage would go unnoticed).
set -euo pipefail

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt/subword_nmt
BPE_CODE=wmt18_en_de/code
SUBSAMPLE_SIZE=25000000

# Deliberately not named LANG: LANG is the locale environment variable and is
# normally exported, so assigning "de" to it would hand an invalid locale to
# every child process (perl, python, ...).
lang=de

OUTDIR=wmt18_${lang}_mono
orig=orig
tmp=$OUTDIR/tmp

mkdir -p "$OUTDIR" "$tmp" "$orig"

URLS=(
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2007.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2008.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2009.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2010.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2011.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.de.shuffled.gz"
  "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.de.shuffled.gz"
  "http://www.statmt.org/wmt15/training-monolingual-news-crawl-v2/news.2014.de.shuffled.v2.gz"
  "http://data.statmt.org/wmt16/translation-task/news.2015.de.shuffled.gz"
  "http://data.statmt.org/wmt17/translation-task/news.2016.de.shuffled.gz"
  "http://data.statmt.org/wmt18/translation-task/news.2017.de.shuffled.deduped.gz"
)

# --- 1. download -----------------------------------------------------------
# The local file name is the last path component of the URL. The paths are
# collected in an array so they can be passed to gzip without word splitting.
inputs=()
for url in "${URLS[@]}"; do
  file=${url##*/}
  if [[ -f "$orig/$file" ]]; then
    echo "$file already exists, skipping download"
  else
    wget -O "$orig/$file.part" "$url"
    mv -- "$orig/$file.part" "$orig/$file"
  fi
  inputs+=("$orig/$file")
done

# --- 2. sample, normalize, tokenize ----------------------------------------
sample=$tmp/monolingual.${SUBSAMPLE_SIZE}.${lang}
if [[ -f "$sample" ]]; then
  echo "found monolingual sample, skipping shuffle/sample/tokenize"
else
  # gzip -c writes to stdout and leaves the input archives in place.
  gzip -c -d "${inputs[@]}" \
    | shuf -n "$SUBSAMPLE_SIZE" \
    | perl "$NORM_PUNC" "$lang" \
    | perl "$REM_NON_PRINT_CHAR" \
    | perl "$TOKENIZER" -threads 8 -a -l "$lang" \
    > "$sample.part"
  mv -- "$sample.part" "$sample"
fi

# --- 3. BPE ----------------------------------------------------------------
bpe=$tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${lang}
if [[ -f "$bpe" ]]; then
  echo "found BPE monolingual sample, skipping BPE step"
else
  python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" \
    < "$sample" \
    > "$bpe.part"
  mv -- "$bpe.part" "$bpe"
fi

# --- 4. deduplicate --------------------------------------------------------
dedup=$tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${lang}
if [[ -f "$dedup" ]]; then
  echo "found deduplicated monolingual sample, skipping deduplication step"
else
  python deduplicate_lines.py "$bpe" \
    > "$dedup.part"
  mv -- "$dedup.part" "$dedup"
fi

# --- 5. shard --------------------------------------------------------------
# split names the shards <prefix>00.<lang>, <prefix>01.<lang>, ...; the
# presence of shard 00 is used as the "already sharded" marker.
shard_prefix=$OUTDIR/bpe.monolingual.dedup.
if [[ -f "${shard_prefix}00.${lang}" ]]; then
  echo "found sharded data, skipping sharding step"
else
  split --lines 1000000 --numeric-suffixes \
    --additional-suffix ".${lang}" \
    "$dedup" \
    "$shard_prefix"
fi