forked from puria-radmard/RFL-SBDALNER
-
Notifications
You must be signed in to change notification settings - Fork 0
/
word2vec.py
72 lines (53 loc) · 2.19 KB
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import json
import re
import sys
import numpy as np
from gensim.models.word2vec import LineSentence, Word2Vec
def func(fin, fout):
    """Write the lower-cased ``sentText`` of every JSON line in *fin* to *fout*.

    Each non-blank line of *fin* is parsed as a JSON object. Its ``"sentText"``
    value is stripped of surrounding whitespace and double quotes, lower-cased,
    and written to *fout* as exactly one line.

    Args:
        fin: Iterable of text lines, each holding one JSON object.
        fout: Writable text stream receiving one sentence per line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"sentText"`` key.
    """
    for raw_line in fin:
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        record = json.loads(raw_line)
        sentence = record["sentText"].strip().strip('"').lower()
        # A JSON string may contain escaped line breaks; collapse all internal
        # whitespace so one record never spills over several corpus lines
        # (the corpus is consumed one sentence per line, split on whitespace).
        sentence = " ".join(sentence.split())
        fout.write(sentence + "\n")
def make_corpus(root_dir=None):
    """Build ``corpus.txt`` from ``train.json`` and ``test.json``.

    ``corpus.txt`` is (re)created inside *root_dir* and filled by passing
    ``train.json`` and then ``test.json`` (both read from *root_dir*) through
    ``func``.

    Args:
        root_dir: Directory holding the input files and receiving the corpus.
            When omitted, the module-level ``root_dir`` set by the script
            entry point is used, so the legacy ``make_corpus()`` call still
            works.

    Raises:
        NameError: If *root_dir* is omitted and no module-level ``root_dir``
            exists (e.g. the module was imported rather than run).
        FileNotFoundError: If one of the input files is missing.
    """
    if root_dir is None:
        # Backward compatibility with the original implicit-global contract.
        try:
            root_dir = globals()["root_dir"]
        except KeyError:
            raise NameError(
                "name 'root_dir' is not defined; pass root_dir to make_corpus()"
            ) from None

    corpus_path = os.path.join(root_dir, "corpus.txt")
    with open(corpus_path, "wt", encoding="utf-8") as fout:
        # Order matters: training sentences first, then test sentences.
        for split_name in ("train.json", "test.json"):
            split_path = os.path.join(root_dir, split_name)
            with open(split_path, "rt", encoding="utf-8") as fin:
                func(fin, fout)
def save_numpy_array(root_dir, wv):
    """
    Save word vectors in a .npy array in order they appear in vocab.txt

    Each line of ``<root_dir>/vocab.txt`` is read as a word optionally followed
    by whitespace and a trailing integer, which is removed. Words that are not
    present in ``wv.vocab`` are skipped, so the saved matrix can have fewer
    rows than vocab.txt has lines; the counts are printed for reference.

    Args:
        root_dir: Directory containing ``vocab.txt``; the output file
            ``word2vec.vectors.npy`` is written to the same directory.
        wv: Word-vector object exposing a ``vocab`` mapping and ``wv[word]``
            lookup returning the vector for ``word``.

    Raises:
        FileNotFoundError: If ``vocab.txt`` does not exist in *root_dir*.
        ValueError: If no word from vocab.txt has a vector in *wv*.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # Compiled once here instead of once per vocabulary line.
    count_suffix = re.compile(r"\s[0-9]{1,}$")

    # Explicit utf-8, consistent with the other text files this script handles.
    with open(os.path.join(root_dir, "vocab.txt"), encoding="utf-8") as vocab_file:
        # Iterating the file keeps the final entry even when the file does not
        # end with a newline (slicing off the last split piece would drop it).
        lines = [line.rstrip("\n") for line in vocab_file]
    vocab = [count_suffix.split(line)[0] for line in lines]

    known_words = wv.vocab
    print(f"{len(vocab)} words in vocab.txt || {len(known_words)} words in W2V vocab")
    print(f"i.e. {len(vocab) - len(known_words)} missing from W2V")

    vectors = [wv[word] for word in vocab if word in known_words]
    if not vectors:
        # np.stack would fail with an unhelpful message on an empty list.
        raise ValueError("no word in vocab.txt has a vector in the given word vectors")
    word_matrix = np.stack(vectors, axis=0)

    with open(os.path.join(root_dir, "word2vec.vectors.npy"), "wb") as npy_file:
        np.save(npy_file, word_matrix)
if __name__ == "__main__":
    # Script entry point: build the corpus if needed, train skip-gram word2vec
    # vectors on it, and save them in several formats inside root_dir.
    if len(sys.argv) < 2:
        sys.exit("usage: python word2vec.py <root_dir>   (e.g. data/NYT_CoType)")

    # Kept as a module-level name because make_corpus() may read it as a global.
    root_dir = sys.argv[1]
    # e.g. "data/NYT_CoType"

    corpus_path = os.path.join(root_dir, "corpus.txt")
    if not os.path.exists(corpus_path):
        make_corpus()
    print("Made corpus")

    sentences = LineSentence(corpus_path)
    print("Made sentences")

    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
    # renamed them (vector_size / epochs) - confirm the pinned gensim version.
    model = Word2Vec(sentences, sg=1, size=300, workers=4, iter=8, negative=8, min_count=1)
    print("Made model")

    word_vectors = model.wv
    print("Made WVs")

    word_vectors.save(os.path.join(root_dir, "word2vec"))
    # Write word2vec.txt and vocab.txt BEFORE building the .npy matrix:
    # save_numpy_array() reads vocab.txt, so it must exist and describe the
    # vectors trained in this run rather than a stale or missing file.
    word_vectors.save_word2vec_format(
        os.path.join(root_dir, "word2vec.txt"), fvocab=os.path.join(root_dir, "vocab.txt")
    )
    save_numpy_array(root_dir, wv=word_vectors)
    print("Saved WVs")

    print("Complete!")