-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_v2.py
313 lines (291 loc) · 11.3 KB
/
preprocess_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
import os
import re
from clause_generator import *
import heapq
import random
'''
此脚本主要做以下的事情
1. 生成预处理命令并执行,预处理做的工作就是 create 所有的节点,还有一半的边
2. 将预处理命令生成的图 save 进指定目录
3. 生成一个实验命令,这个命令一开始会 load 上面保存的图数据,然后运行一句continuously match语句,然后逐条 create 剩下的一半边
'''
'''
要导入的图数据
node : node id + node label
edge : src node id + edge label + dst node id
'''
dir = '/data/share/users/legend/project/query-generator/WukongSLarge/'
base_edges = 'base_edges.txt'
nodes = 'node.txt'
stream_edges = 'stream_edges.txt'
query = 'query.txt'
#id -> label
pattern = re.compile(r'\d+')
dict = {}
#生成要load的图,需要的一些预处理的命令
base_command_file = 'command/base_command.txt'
#不断插入新边的命令
stream_command_file = 'command/stream_command.txt'
#前半个图的存放地点
data_to_load = 'base_graph'
#graphflow-shell 路径
shell_path = '~/graphdb/graphflow/build/install/graphflow/bin/graphflow-cli'
#数据文件的分割符
separator = ' '
clause_gen = Clause_generator()
var_gen = Variable_generator()
'''
(:organisation)-[:isLocatedIn]->(:place), (:place)-[isPartOf]->(:place);
'''
match_clause = 'Continuously match (m:organisation)-[:isLocatedIn]->(n:place), (n:place)-[:isPartOf]->(r:place) file \"result.txt\";'
save_clause = 'save to dir \"base_graph\";'
load_clase = 'load from dir \"base_graph\";'
def alter_type(type):
return '_' + type
'''
convert a part of the vertex files to a single cypher commands file
'''
def generate_create_vertex_commands_v1(node_file, save_file, bz = 100, rate = 1/10000):
with open(node_file, 'r', encoding='utf-8') as f:
line = f.readline()
count = 0
while line:
id, label = list(pattern.findall(line))
seed = random.random()
if seed <= rate:
label = alter_type(label)
clause_gen.add_vertex(id, label)
count += 1
if count >= bz:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
count = 0
dict[id] = label
line = f.readline()
if count > 0:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
f.close()
def generate_create_vertex_commands_v2(node_file, save_file, bz = 100, rate = 1/100000):
with open(node_file, 'r', encoding='utf-8') as f:
line = f.readline()
count = 0
while line:
count += 1
line = f.readline()
f.close()
size = int(rate * count)
heap = []
with open(node_file, 'r', encoding='utf-8') as f:
line = f.readline()
count = 0
while line:
id = pattern.findall(line)[0]
id = int(id)
if count >= size:
if id > heapq.nsmallest(1, heap)[0]:
heapq.heapreplace(heap, id)
else:
heapq.heappush(heap, id)
count += 1
line = f.readline()
f.close()
print("node num : ", size)
barrier = heapq.nsmallest(1, heap)[0]
with open(node_file, 'r', encoding='utf-8') as f:
line = f.readline()
count = 0
while line:
id, label = list(pattern.findall(line))
if int(id) >= barrier:
label = alter_type(label)
clause_gen.add_vertex(id, label)
count += 1
if count >= bz:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
count = 0
dict[id] = label
line = f.readline()
if count > 0:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
f.close()
print("max id is : ", barrier)
return barrier
def generate_create_vertex_commands_v3(node_file, save_file, bz = 100, rate = 1/100000):
with open(node_file, 'r', encoding='utf-8') as f:
line = f.readline()
count = 0
while line:
count += 1
line = f.readline()
f.close()
size = int(rate * count)
maxheap = []
minheap = []
with open(node_file, 'r', encoding='utf-8') as f:
line = f.readline()
count = 0
while line:
id = pattern.findall(line)[0]
id = int(id)
if count >= size:
if id > heapq.nsmallest(1, maxheap)[0]:
heapq.heapreplace(maxheap, id)
if -id > -heapq.nsmallest(1, minheap)[0]:
heapq.heapreplace(minheap, -id)
else:
heapq.heappush(maxheap, id)
heapq.heappush(minheap, -id)
count += 1
line = f.readline()
f.close()
print("node num : ", size)
lower_barrier = heapq.nsmallest(1, maxheap)[0]
upper_barrier = -heapq.nsmallest(1, minheap)[0]
with open(node_file, 'r', encoding='utf-8') as f:
line = f.readline()
count = 0
while line:
id, label = list(pattern.findall(line))
if int(id) >= lower_barrier or int(id) <= upper_barrier:
label = alter_type(label)
clause_gen.add_vertex(id, label)
count += 1
if count >= bz:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
count = 0
dict[id] = label
line = f.readline()
if count > 0:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
f.close()
return lower_barrier, upper_barrier
def generate_create_edge_commands_v1(edge_file, save_file, bz = 100):
with open(edge_file, 'r', encoding='utf-8') as f:
line = f.readline()
total_edge = 0
count = 0
while line:
from_id, edge_type, to_id = list(pattern.findall(line))
if from_id in dict.keys() and to_id in dict.keys():
from_type = dict[from_id]
to_type = dict[to_id]
edge_type = alter_type(edge_type)
clause_gen.add_edge(from_id, from_type, edge_type, to_id, to_type)
count += 1
if count >= bz:
clause = clause_gen.create_edge()
save_file.write(clause + '\n')
count = 0
total_edge += 1
line = f.readline()
if count > 0:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
print("total edge is : ", total_edge)
f.close()
def generate_create_edge_commands_v2(edge_file, save_file, barrier, bz = 100):
with open(edge_file, 'r', encoding='utf-8') as f:
line = f.readline()
total_edge = 0
count = 0
while line:
from_id, edge_type, to_id = list(pattern.findall(line))
if int(from_id) >= barrier and int(to_id) >= barrier:
from_type = dict[from_id]
to_type = dict[to_id]
edge_type = alter_type(edge_type)
clause_gen.add_edge(from_id, from_type, edge_type, to_id, to_type)
count += 1
if count >= bz:
clause = clause_gen.create_edge()
save_file.write(clause + '\n')
count = 0
total_edge += 1
line = f.readline()
if count > 0:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
print("total edge is : ", total_edge)
f.close()
def generate_create_edge_commands_v3(edge_file, save_file, lower_barrier, upper_barrier, bz = 100):
with open(edge_file, 'r', encoding='utf-8') as f:
line = f.readline()
total_edge = 0
count = 0
while line:
from_id, edge_type, to_id = list(pattern.findall(line))
if (int(from_id) <= upper_barrier or int(from_id) >= lower_barrier) and (int(to_id) >= lower_barrier or int(to_id) <= upper_barrier):
from_type = dict[from_id]
to_type = dict[to_id]
edge_type = alter_type(edge_type)
clause_gen.add_edge(from_id, from_type, edge_type, to_id, to_type)
count += 1
if count >= bz:
clause = clause_gen.create_edge()
save_file.write(clause + '\n')
count = 0
total_edge += 1
line = f.readline()
if count > 0:
clause = clause_gen.create_vertex()
save_file.write(clause + '\n')
print("total edge is : ", total_edge)
f.close()
def generate_match_command(query_file, save_file):
with open(query_file, 'r', encoding='utf-8') as f:
line = f.readline()
query_num = int(pattern.findall(line)[0])
for i in range(0, query_num):
line = f.readline()
node_num, edge_num = list(map(int, pattern.findall(line)))
node_types = []
for j in range(0, node_num):
line = f.readline()
node_types.append(pattern.findall(line)[0])
for j in range(0, edge_num):
line = f.readline()
from_id, to_id, edge_type = list(pattern.findall(line))
from_type = alter_type(node_types[int(from_id)])
to_type = alter_type(node_types[int(to_id)])
from_id = var_gen.get_variable()
to_id = var_gen.get_variable()
edge_type = alter_type(edge_type)
clause_gen.add_match_edge(from_id, from_type, edge_type, to_id, to_type)
save_file.write(clause_gen.create_continuous_edge("result.txt")+'\n')
f.close()
if __name__ == '__main__' :
'''
base_file = open(base_command_file, 'w', encoding='utf-8')
barrier = generate_create_vertex_commands_v2(dir+nodes, base_file, bz=200)
print('finish nodes ! \n')
generate_create_edge_commands_v2(dir+base_edges, base_file, barrier, bz=200)
print('finish base edge ! \n')
base_file.write(save_clause + '\n')
base_file.close()
stream_file = open(stream_command_file, 'w', encoding='utf-8')
stream_file.write(load_clase + '\n')
generate_match_command(dir+query, stream_file)
print('finish continuously match clauses ! \n')
generate_create_edge_commands_v2(dir+stream_edges, stream_file, barrier, bz=1)
print('finish stream edge ! \n')
stream_file.close()
'''
base_file = open(base_command_file, 'w', encoding='utf-8')
lower_barrier, upper_barrier = generate_create_vertex_commands_v3(dir+nodes, base_file, bz=100)
print('finish nodes ! \n')
generate_create_edge_commands_v3(dir + base_edges, base_file, lower_barrier, upper_barrier, bz=100)
print('finish base edge ! \n')
base_file.write(save_clause + '\n')
base_file.close()
stream_file = open(stream_command_file, 'w', encoding='utf-8')
stream_file.write(load_clase + '\n')
generate_match_command(dir + query, stream_file)
print('finish continuously match clauses ! \n')
generate_create_edge_commands_v1(dir + stream_edges, stream_file, bz=1)
print('finish stream edge ! \n')
stream_file.close()