-
Notifications
You must be signed in to change notification settings - Fork 4
/
cluster_train.py
70 lines (53 loc) · 1.94 KB
/
cluster_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import time
import pickle
import os
import logging
import datetime
import multiprocessing
import numpy as np
from gensim.models import Word2Vec as w2v
from sklearn.cluster import KMeans, MiniBatchKMeans
# Fix the NumPy RNG seed so K-Means centroid initialization (and any other
# NumPy-based randomness) is reproducible across runs.
seed = 7
np.random.seed(seed)
def doClustering(clusters=500):
    """Cluster pretrained Word2Vec vectors with K-Means and pickle the result.

    Loads the binary Word2Vec model from ``W2V Models/``, fits K-Means with
    ``clusters`` centers on all word vectors, builds a dict mapping each
    vocabulary word to its cluster index, and saves it under
    ``K-Means Models/full_<clusters>C.pk``.

    Parameters
    ----------
    clusters : int, optional
        Number of K-Means cluster centers (default 500; the original
        experiments used 250/500/1000/2000/4000).
    """
    # Clear the console portably ('cls' is Windows-only; 'clear' on POSIX).
    os.system('cls' if os.name == 'nt' else 'clear')

    # Load Word2Vec model
    print("LOADING WORD2VEC MODEL \n\n")
    model = w2v.load_word2vec_format('W2V Models/w2v_reddit_unigram_300d.bin', binary=True)

    # Get the word vectors and the words (old-gensim attribute names;
    # NOTE(review): newer gensim exposes these as .vectors / .index_to_key).
    print("GETTING WORD VECTORS AND WORDS \n\n")
    word_vectors = model.syn0
    words = model.index2word

    # Drop the model so its memory can be reclaimed before K-Means
    # allocates its own working set; brief sleep lets the OS settle.
    print("DELETING WORD2VEC MODEL AND SLEEPING \n\n")
    del model
    time.sleep(10)

    # Initialize K-Means; n_jobs=-2 uses all CPU cores but one.
    k_means = KMeans(n_clusters=clusters, n_jobs=-2, precompute_distances=True)

    # Report wall-clock start of training.
    start_stamp = datetime.datetime.now()
    print("STARTING AT: %i/%i/%i %i:%i \n" % (start_stamp.month, start_stamp.day, start_stamp.year, start_stamp.hour, start_stamp.minute))

    # Fit the model, get the cluster index per word, and time the fit.
    print("TRAINING K-MEANS WITH %i CLUSTERS \n\n" % (clusters))
    start = time.time()
    idx = k_means.fit_predict(word_vectors)
    end_time = time.time()

    # Report wall-clock end of training and elapsed seconds.
    end_stamp = datetime.datetime.now()
    print("ENDING AT: %i/%i/%i %i:%i" % (end_stamp.month, end_stamp.day, end_stamp.year, end_stamp.hour, end_stamp.minute))
    print("TIME TAKEN: ", end_time - start)

    # Create a Word / Index dictionary: each vocabulary word is matched
    # to its cluster center.
    # Motivation: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors
    word_centroid_map = dict(zip(words, idx))

    # Save the dictionary. Ensure the output directory exists (fresh
    # checkouts lack it) and close the file deterministically — the
    # original passed an anonymous open() to pickle.dump and leaked it.
    print("\n\nSAVING MODEL")
    os.makedirs("K-Means Models", exist_ok=True)
    FILE = "K-Means Models/full_" + str(clusters) + "C.pk"
    with open(FILE, "wb") as out_file:
        pickle.dump(word_centroid_map, out_file)
# Script entry point: run the full clustering pipeline when executed directly.
if __name__ == '__main__':
    doClustering()