# full_pipeline.sh
# Step-by-step commands for the whole pipeline, from downloading the Wikipedia
# dump to producing the figures.
# 0 - Initial setup
# Create the working directories (no-op for the ones that already exist).
for dir in corpora data/working data/external results logs; do
  mkdir -p "$dir"
done
# Make everything under src/ executable so the scripts can be invoked directly.
chmod -R +x src
# Optionally refresh the package index first:
# sudo apt-get update
# pandoc is needed to execute RMarkdown.
sudo apt-get install pandoc
# 1 - Download wiki dump
WIKI_URL=https://archive.org/download/enwiki-20210401
WIKI_FILE=enwiki-20210401-pages-articles.xml.bz2
# "-c": continue getting a partially-downloaded file
# "-b": go to background after startup. Output is redirected to wget-log.
# "-P": directory prefix under which the file is saved.
# NOTE: because of "-b" this command returns immediately; make sure the
# download has finished before running step 2 on the .bz2 file.
wget -c -b -P corpora/ "${WIKI_URL}/${WIKI_FILE}"
# 2 - Extract corpus into a raw .txt file
src/data/extract_wiki_dump.sh \
  'corpora/enwiki-20210401-pages-articles.xml.bz2'
# 3 - Build a text file with one sentence per line, removing paragraphs of
# less than 50 words ("-u" keeps Python's output unbuffered)
python3 -u src/data/tokenize_and_reduce_corpus.py \
  'corpora/enwiki-20210401-pages-articles.txt'
# 4 - Clean corpus
# The cleaning script writes the cleaned text to stdout; it is redirected
# into CORPUS_OUT here.
CORPUS_IN=corpora/enwiki-20210401-pages-articles_sentences.txt
CORPUS_OUT=corpora/wiki2021.txt
src/data/clean_corpus.sh "$CORPUS_IN" > "$CORPUS_OUT"
# check number of lines,words,characters with:
wc corpora/wiki2021.txt
# 78051838 1749313740 10453079770 corpora/wiki2021.txt
# 6 - Shuffle corpus. Set seed in `src/data/shuffle_corpus_multiple.sh`.
# The new corpus is named as `corpora/{CORPUS_ID}s<seed>.txt`.
CORPUS_ID=wiki2021
src/data/shuffle_corpus_multiple.sh "$CORPUS_ID"
# 7 - Get vocabulary using GloVe module
# For every corpus id in IDS, corpora/<id>.txt is passed to corpus2vocab.sh
# together with the output directory and the minimum-count setting.
OUT_DIR=data/working
VOCAB_MINCOUNT=100 # presumably the minimum word frequency kept -- confirm in corpus2vocab.sh
IDS=(wiki2021)
for id in "${IDS[@]}"; do
  corpus="corpora/${id}.txt"
  src/corpus2vocab.sh "$corpus" "$OUT_DIR" "$VOCAB_MINCOUNT"
done
# 8 - Resample corpus to achieve target frequencies in word B and create vocab. of new corpora
# Each call receives a word pair; the chain stops at the first failing call.
# NOTE(review): unlike the other steps, these calls have no `src/` prefix, so
# resample_corpus.sh must be reachable through PATH -- confirm.
resample_corpus.sh "she" "he" \
  && resample_corpus.sh "he" "she" \
  && resample_corpus.sh "african" "european" \
  && resample_corpus.sh "rich" "poor"
# 9 - Train word embeddings in all corpora
# One background job per model; stdout and stderr of each job are written to
# logs/nohup_<model>.out. nohup keeps the jobs alive after the shell exits.
for model in sgns fasttext glove; do
  nohup "src/corpus2${model}_multiple.sh" > "logs/nohup_${model}.out" 2>&1 &
done
# 10 - Run PCA on normalized vectors of unshuffled and shuffled corpus
# Uses python3 explicitly, like step 3: a bare `python` may be missing or
# point to Python 2 on Debian/Ubuntu systems.
python3 src/matrices2pca.py --normalize
# 11 - Run hyperparameter trials
# Runs in the background, detached from the terminal; stdout and stderr are
# both written to the log file.
nohup src/embeddings_hyperparams.sh > logs/nohup_hyperparams.out 2>&1 &
# 12 - Compute bias wrt context words (define the files inside the .sh script)
# Each line sets the word pair A/B and passes it to biasdf_multiple.sh.
A="SHE" && B="HE" && src/biasdf_multiple.sh "$A" "$B"
A="HE" && B="SHE" && src/biasdf_multiple.sh "$A" "$B"
A="AFRICAN" && B="EUROPEAN" && src/biasdf_multiple.sh "$A" "$B"
A="RICH" && B="POOR" && src/biasdf_multiple.sh "$A" "$B"
# 13 - Figures of cosine similarities heatmaps
# Execute each similarity notebook and export the result as HTML.
for embedding in sgns fasttext glove; do
  jupyter nbconvert --to html --execute "notebooks/similarities_${embedding}.ipynb"
done
# 14 - Figures of bias (resampling experiment), hyperparameters and PCA
# Download glasgow norms:
wget -O data/external/GlasgowNorms.csv \
  'https://static-content.springer.com/esm/art%3A10.3758%2Fs13428-018-1099-3/MediaObjects/13428_2018_1099_MOESM2_ESM.csv'

# Render notebooks/<name>.Rmd to an HTML document with rmarkdown.
# Arguments: $1 - notebook name without directory or extension
render_rmd() {
  R -e "rmarkdown::render(\"notebooks/$1.Rmd\", \"html_document\")"
}

# Render the notebooks in order; the chain stops at the first failure.
render_rmd bias_resampling_gender \
  && render_rmd bias_resampling_gender_she \
  && render_rmd bias_resampling_ethnicity \
  && render_rmd bias_resampling_affluence \
  && render_rmd metrics_hyperparams \
  && render_rmd pca_normalized
# 15 - Make grids of figures for paper
# Uses python3 explicitly, like step 3: a bare `python` may be missing or
# point to Python 2 on Debian/Ubuntu systems. The R script only runs if the
# Python script succeeds.
python3 src/make_plots_grids.py &&
  Rscript src/bias_resampling_grid.R