TX_vec_Analysis.py
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


def main():
    df = pd.read_csv('./TX_vec.csv')
    # Row i of TX_vec (i.e. TX_vec[i]) is the transformed vector representing abstract i.
    TX_vec = [df.values[i][1:] for i in range(df.shape[0])]
    # Dump the vectors to a tab-separated file.
    with open('TX_vec.tsv', 'w') as f:
        for i in range(len(TX_vec)):
            for j in range(len(TX_vec[i])):
                f.write(str(TX_vec[i][j]) + '\t')
            f.write('\n')
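    # Illustrative alternative (an addition, not from the original script): the same
    # TSV could likely be written with NumPy directly; the exact trailing-whitespace
    # layout may differ slightly from the manual loop above.
    # np.savetxt('TX_vec.tsv', np.array(TX_vec, dtype=float), delimiter='\t')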
    # SVD
    # https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
    # full_matrices: bool, optional
    #   If True (default), u and v have the shapes (M, M) and (N, N), respectively.
    #   Otherwise, the shapes are (M, K) and (K, N), respectively, where K = min(M, N).
    U, s, V = np.linalg.svd(np.array(TX_vec, dtype=float),
                            full_matrices=True, compute_uv=True)
    S = np.zeros((U.shape[1], V.shape[0]), dtype=float)
    S[:s.shape[0], :s.shape[0]] = np.diag(s)  # Diagonal matrix of the ordered singular values
    print('SVD shapes' + '\n U shape = ' + str(U.shape) + '\n V shape = ' +
          str(V.shape) + '\n s shape = ' + str(s.shape) + '\n S shape = ' +
          str(S.shape))
    # Reconstruction: U @ S @ V should recover the original matrix.
    USV = np.dot(U, np.dot(S, V))
    # print(USV)
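    # Illustrative check (an addition, not part of the original script): the full
    # SVD reconstruction should match the input matrix up to floating-point error.
    # Default np.allclose tolerances are assumed to be adequate here.
    print('Reconstruction matches TX_vec:',
          np.allclose(np.array(TX_vec, dtype=float), USV))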
    ####################################################################################################################
    # USELESS
    # DBSCAN clustering, adapted from:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py
    # eps = 5
    # eps = 25
    X = np.array(TX_vec, dtype=float)
    X = StandardScaler().fit_transform(X)
    db = DBSCAN(eps=15, min_samples=3).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    if n_clusters_ > 1:
        print("Silhouette Coefficient: %0.3f" %
              metrics.silhouette_score(X, labels))
    # Black removed and is used for noise instead.
    # Note: only the first two standardized dimensions of X are plotted.
    unique_labels = set(labels)
    colors = [
        plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))
    ]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]
        class_member_mask = (labels == k)
        # Core samples of cluster k (large markers).
        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=tuple(col),
                 markeredgecolor='k',
                 markersize=14)
        # Non-core samples of cluster k (small markers).
        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=tuple(col),
                 markeredgecolor='k',
                 markersize=6)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
########################################################################################################################
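# Illustrative sketch (an addition, not part of the original analysis): the
# commented-out eps = 5 / eps = 25 values above suggest eps was tuned by hand.
# One way to do that is to sweep candidate eps values and compare the resulting
# cluster counts and silhouette scores; the candidate list below is an assumption.
def sweep_eps(X, eps_values=(5, 10, 15, 20, 25), min_samples=3):
    # Example usage (from inside main, after standardizing): sweep_eps(X)
    for eps in eps_values:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(X).labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        if n_clusters > 1:
            score = metrics.silhouette_score(X, labels)
            print('eps=%g -> %d clusters, silhouette=%.3f' % (eps, n_clusters, score))
        else:
            print('eps=%g -> %d cluster(s), silhouette undefined' % (eps, n_clusters))

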
if __name__ == '__main__':
    main()