-
Notifications
You must be signed in to change notification settings - Fork 0
/
mycluster.py
169 lines (139 loc) · 5.85 KB
/
mycluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import sys
import json
import jieba
import sklearn
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from gensim import corpora, models
if sys.getdefaultencoding() != 'utf-8':
reload(sys)
sys.setdefaultencoding('utf-8')
def union_dict(*dicts):
ks = set(sum([d.keys() for d in dicts], []))
total = {}
for k in ks:
total[k] = sum([d.get(k, 0) for d in dicts])
return total
# Term Frequecy matrix
def get_tf_matrix(tdm_words):
try:
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(tdm_words)
words = vectorizer.get_feature_names()
return tf_matrix, words
except Exception as e:
print "From:get_tf_matrix:\n\tUnexpect Error: {}".format(e)
def get_tfidf_matrix(tf_matrix):
try:
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(tf_matrix)
return tfidf_matrix.toarray()
except Exception as e:
print "From:get_tfidf_matrix:\n\tUnexpect Error: {}".format(e)
# get first n words of every article according the descend tfidf weight sorting.
def get_first_n_words(tfidf_matrix, n, dictionary):
try:
docs_keywords_string = []
docs_words_weight = [] # the sub element should be a dict
# np.argsort(-ndarray, axis = 1) # Descendding Sort, axis = 1 represent the column.
descend_sort_index = np.argsort(-tfidf_matrix, axis = 1)
for idx in range(len(tfidf_matrix)):
words_list = []
words_weight = {}
for i in range(n):
# get the i-th index # sort according the tfidf weight
point = descend_sort_index[idx][i]
if tfidf_matrix[idx][point] > 0:
# get the index correspond word and add to words_list
words_list.append(dictionary[point])
# get the first_n_words weight for the idx doc
words_weight[dictionary[point]] = tfidf_matrix[idx][point]
docs_keywords_string.append(' '.join(words_list))
docs_words_weight.append(words_weight)
return docs_keywords_string, docs_words_weight
except Exception as e:
print "From:get_first_n_words:\n\tUnexpect Error: {}".format(e)
def get_label_list(n_clusters, weight):
try:
kmeans = KMeans(n_clusters = n_clusters, init = "k-means++")
# kmeans = KMeans(n_clusters = n_clusters, init = "random")
km_model = kmeans.fit(weight)
return km_model.labels_
except Exception as e:
print "From:get_label_list:\n\tUnexpect Error: {}".format(e)
def get_classes(n_clusters, data, docs_words_weight, docs_keywords_string):
try:
sub_weight = [[] for i in range(n_clusters)]
sub_cluster = [[] for i in range(n_clusters)]
for i in range(n_clusters):
class_weight = []
number_list = list(pd.DataFrame(data.loc[data['cluster'] == i]).index) # get the art number of every class
for j in number_list:
sub_cluster[i].append(docs_keywords_string[j])
class_weight.append(docs_words_weight[j])
sub_weight[i] = (union_dict(*class_weight))
return sub_cluster, sub_weight
except Exception as e:
print "From:get_classes:\n\tUnexpect Error: {}".format(e)
def get_class_keywords(n_clusters, count, sub_cluster, sub_weight):
try:
classes_keywords = []
for i in range(n_clusters):
class_keywords = []
tf_m_weight = []
sub_tf_matrix, sub_words = get_tf_matrix(sub_cluster[i])
m = sub_tf_matrix.toarray() # m is two dimension
# get the keywords frequency of articles in the same cluster
# tf_sum = np.sum(m, axis = 0)
tf_sum = sum(m, axis = 0)
print "TTTTTTTTT--------------------------TTTTTTT"
# /usr/local/lib/python2.7/dist-packages/gensim/models/ldamodel.py:826: DeprecationWarning: Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.from_iter(generator)) or the python sum builtin instead.
# NOTE
# get trem frequency
for wtf in range(len(tf_sum)):
tf_m_weight.append(tf_sum[wtf] * sub_weight[i][sub_words[wtf]])
tf_m_weight = np.array(tf_m_weight) # get tf * weight
sort_idx = np.argsort(-tf_m_weight, axis = 0)
# sort according the tf * tfidf weight
for j in range(count):
class_keywords.append(sub_words[sort_idx[j]])
classes_keywords.append(' '.join(class_keywords))
return classes_keywords
except Exception as e:
print "From:get_class_keywords:\n\tUnexpect Error: {}".format(e)
# Throught the col parameter can get the cluster information.
def get_clusters_detail(n_clusters, data, col):
try:
clusters_list = []
hot_sort = data['cluster'].value_counts()
print list(hot_sort.index)
for i in list(hot_sort.index):
art_list = list(pd.DataFrame(data.loc[data['cluster'] == i])[col])
clusters_list.append(art_list)
# print len(clusters_art_list)
# for i in clusters_art_list:
# print len(i)
return clusters_list
except Exception as e:
print "From:get_cluster_art:\n\tUnexpect Error: {}".format(e)
def all_output(n_clusters, classes_keywords, clusters_topic_dict, \
sub_cluster, data, d_stit_prop_dict, s_stit_prop_dict):
try:
hot_sort = data['cluster'].value_counts()
print list(hot_sort.index)
for i in list(hot_sort.index):
print "# The count of article: %d"%hot_sort[i]
print "# %d class --- keywords: %s"%(i, classes_keywords[i])
print "# %d class --- topic: %s"%(i, clusters_topic_dict[i])
print "# Sentiment: doc2vec---pos: %f , neg: %f"%(d_stit_prop_dict[i]*100, 100-d_stit_prop_dict[i]*100)
print "# Sentiment: snownlp---pos: %f , neg: %f"%(s_stit_prop_dict[i]*100, 100-s_stit_prop_dict[i]*100)
number_list = list(pd.DataFrame(data.loc[data['cluster'] == i]).index)
id_list = list(pd.DataFrame(data.loc[data['cluster'] == i])['id'])
url_list = list(pd.DataFrame(data.loc[data['cluster'] == i])['url'])
for j, number in enumerate(number_list):
print number, sub_cluster[i][j], id_list[j]
except Exception as e:
print "From:all_output:\n\tUnexpect Error: {}".format(e)