5
5
from operator import or_
6
6
from functools import reduce
7
7
8
- import py2neo
9
- import py2neo .packages .httpstream
10
8
import pandas
11
9
from tqdm import tqdm
12
10
13
11
import hetio .hetnet
14
12
15
- # Get py2neo version
16
- PY2NEO_VER = int (py2neo .__version__ [0 ])
17
13
18
- # Avoid SocketError
19
- py2neo .packages .httpstream .http .socket_timeout = 1e8
14
def import_py2neo():
    """
    Import the py2neo library lazily and return the module.

    The major version is read from ``py2neo.__version__``. For major versions
    below 4, ``py2neo.packages.httpstream`` is also imported and its
    ``http.socket_timeout`` is raised to ``1e8`` to avoid SocketError.

    Returns
    -------
    module
        the imported ``py2neo`` module
    """
    import py2neo

    # Parse the whole leading version component, not just its first character,
    # so that a multi-digit major version (e.g. a calendar-style '2021.2.3')
    # is not mistaken for major version 2.
    major_version = int(py2neo.__version__.split('.')[0])
    if major_version < 4:
        import py2neo.packages.httpstream

        # Avoid SocketError
        py2neo.packages.httpstream.http.socket_timeout = 1e8
    return py2neo
26
+
20
27
21
28
def export_neo4j (graph , uri , node_queue = 200 , edge_queue = 5 , show_progress = False ):
22
29
"""Export hetnet to neo4j"""
30
+ py2neo = import_py2neo ()
23
31
24
32
if isinstance (uri , py2neo .Graph ):
25
33
db_graph = uri
@@ -89,6 +97,10 @@ def append(self, x):
89
97
self .create ()
90
98
91
99
def create (self ):
100
+ import py2neo
101
+
102
+ PY2NEO_VER = int (py2neo .__version__ [0 ])
103
+
92
104
if not self :
93
105
return
94
106
@@ -162,43 +174,16 @@ def cypher_path(metarels):
162
174
q += '{dir0}[:{rel_type}]{dir1}(n{i}{target_label})' .format (** kwargs )
163
175
return q
164
176
165
- def construct_dwpc_query (metarels , property = 'name' , join_hint = 'midpoint' , index_hint = False , unique_nodes = True ):
177
+ def construct_degree_clause (metarels ):
166
178
"""
167
- Create a cypher query for computing the *DWPC* for a type of path.
179
+ Create a Cypher query clause that calculates the degree of each node
168
180
169
181
Parameters
170
182
----------
171
183
metarels : a metarels or MetaPath object
172
- the metapath (path type) to create a query for
173
- property : str
174
- which property to use for soure and target node lookup
175
- join_hint : 'midpoint', bool, or int
176
- whether to add a join hint to tell neo4j to traverse form both ends of
177
- the path and join at a specific index. `'midpoint'` or `True` specifies
178
- joining at the middle node in the path (rounded down if an even number
179
- of nodes). `False` specifies not to add a join hint. An int specifies
180
- the node to join on.
181
- index_hint : bool
182
- whether to add index hints which specifies the properties of the source
183
- and target nodes to use for lookup. Enabling both `index_hint` and
184
- `join_hint` can cause the query to fail.
185
- unique_nodes : bool or str
186
- whether to exclude paths with duplicate nodes. To not enforce node
187
- uniqueness, use `False`. Methods for enforcing node uniqueness are:
188
- `nested` the path-length independent query (`ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)`)
189
- `expanded` for the combinatorial and path-length dependent form (`NOT (n0=n1 OR n0=n2 OR n0=n3 OR n1=n2 OR n1=n3 OR n2=n3)`).
190
- `labeled` to perform an intelligent version of `expanded` where only
191
- nodes with the same label are checked for duplicity. Specifying `True`,
192
- which is the default, uses the `labeled` method.
184
+ the metapath to create the clause for
193
185
"""
194
- # Convert metapath to metarels
195
- if isinstance (metarels , hetio .hetnet .MetaPath ):
196
- metarels = metapath_to_metarels (metarels )
197
-
198
- # create cypher path query
199
- metapath_query = cypher_path (metarels )
200
186
201
- # create cypher path degree query
202
187
degree_strs = list ()
203
188
for i , (source_label , target_label , rel_type , direction ) in enumerate (metarels ):
204
189
kwargs = {
@@ -217,6 +202,26 @@ def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_
217
202
).format (** kwargs ))
218
203
degree_query = ',\n ' .join (degree_strs )
219
204
205
+ return degree_query
206
+
207
+ def construct_using_clause (metarels , join_hint , index_hint ):
208
+ """
209
+ Create a Cypher query clause that gives the planner hints to speed up the query
210
+ Parameters
211
+ ----------
212
+ metarels : a metarels or MetaPath object
213
+ the metapath to create the clause for
214
+ join_hint : 'midpoint', bool, or int
215
+ whether to add a join hint to tell neo4j to traverse from both ends of
216
+ the path and join at a specific index. `'midpoint'` or `True` specifies
217
+ joining at the middle node in the path (rounded down if an even number
218
+ of nodes). `False` specifies not to add a join hint. An int specifies
219
+ the node to join on.
220
+ index_hint : bool
221
+ whether to add index hints which specifies the properties of the source
222
+ and target nodes to use for lookup. Enabling both `index_hint` and
223
+ `join_hint` can cause the query to fail.
224
+ """
220
225
using_query = ''
221
226
# Specify index hint for node lookup
222
227
if index_hint :
@@ -239,7 +244,24 @@ def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_
239
244
assert join_hint <= len (metarels )
240
245
using_query += "\n USING JOIN ON n{}" .format (join_hint )
241
246
242
- # Unique node constraint (pevent paths with duplicate nodes)
247
+ return using_query
248
+
249
+ def construct_unique_nodes_clause (metarels , unique_nodes ):
250
+ """
251
+ Create a Cypher query clause that excludes paths with duplicate nodes
252
+ Parameters
253
+ ----------
254
+ metarels : a metarels or MetaPath object
255
+ the metapath to create the clause for
256
+ unique_nodes : bool or str
257
+ whether to exclude paths with duplicate nodes. To not enforce node
258
+ uniqueness, use `False`. Methods for enforcing node uniqueness are:
259
+ `nested` the path-length independent query (`ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)`)
260
+ `expanded` for the combinatorial and path-length dependent form (`NOT (n0=n1 OR n0=n2 OR n0=n3 OR n1=n2 OR n1=n3 OR n2=n3)`).
261
+ `labeled` to perform an intelligent version of `expanded` where only
262
+ nodes with the same label are checked for duplication. Specifying `True`,
263
+ which is the default, uses the `labeled` method.
264
+ """
243
265
if unique_nodes == 'nested' :
244
266
unique_nodes_query = '\n AND ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)'
245
267
elif unique_nodes == 'expanded' :
@@ -259,6 +281,52 @@ def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_
259
281
assert unique_nodes is False
260
282
unique_nodes_query = ''
261
283
284
+ return unique_nodes_query
285
+
286
+ def construct_dwpc_query (metarels , property = 'name' , join_hint = 'midpoint' , index_hint = False , unique_nodes = True ):
287
+ """
288
+ Create a cypher query for computing the *DWPC* for a type of path.
289
+
290
+ Parameters
291
+ ----------
292
+ metarels : a metarels or MetaPath object
293
+ the metapath (path type) to create a query for
294
+ property : str
295
+ which property to use for source and target node lookup
296
+ join_hint : 'midpoint', bool, or int
297
+ whether to add a join hint to tell neo4j to traverse from both ends of
298
+ the path and join at a specific index. `'midpoint'` or `True` specifies
299
+ joining at the middle node in the path (rounded down if an even number
300
+ of nodes). `False` specifies not to add a join hint. An int specifies
301
+ the node to join on.
302
+ index_hint : bool
303
+ whether to add index hints which specifies the properties of the source
304
+ and target nodes to use for lookup. Enabling both `index_hint` and
305
+ `join_hint` can cause the query to fail.
306
+ unique_nodes : bool or str
307
+ whether to exclude paths with duplicate nodes. To not enforce node
308
+ uniqueness, use `False`. Methods for enforcing node uniqueness are:
309
+ `nested` the path-length independent query (`ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)`)
310
+ `expanded` for the combinatorial and path-length dependent form (`NOT (n0=n1 OR n0=n2 OR n0=n3 OR n1=n2 OR n1=n3 OR n2=n3)`).
311
+ `labeled` to perform an intelligent version of `expanded` where only
312
+ nodes with the same label are checked for duplication. Specifying `True`,
313
+ which is the default, uses the `labeled` method.
314
+ """
315
+ # Convert metapath to metarels
316
+ if isinstance (metarels , hetio .hetnet .MetaPath ):
317
+ metarels = metapath_to_metarels (metarels )
318
+
319
+ # create cypher path query
320
+ metapath_query = cypher_path (metarels )
321
+
322
+ # create cypher path degree query
323
+ degree_query = construct_degree_clause (metarels )
324
+
325
+ using_query = construct_using_clause (metarels , join_hint , index_hint )
326
+
327
+ # Unique node constraint (prevent paths with duplicate nodes)
328
+ unique_nodes_query = construct_unique_nodes_clause (metarels , unique_nodes )
329
+
262
330
# combine cypher fragments into a single query and add DWPC logic
263
331
query = textwrap .dedent ('''\
264
332
MATCH path = {metapath_query}{using_query}
@@ -323,65 +391,12 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
323
391
metapath_query = cypher_path (metarels )
324
392
325
393
# create cypher path degree query
326
- degree_strs = list ()
327
- for i , (source_label , target_label , rel_type , direction ) in enumerate (metarels ):
328
- kwargs = {
329
- 'i0' : i ,
330
- 'i1' : i + 1 ,
331
- 'source_label' : source_label ,
332
- 'target_label' : target_label ,
333
- 'rel_type' : rel_type ,
334
- 'dir0' : '<-' if direction == 'backward' else '-' ,
335
- 'dir1' : '->' if direction == 'forward' else '-' ,
336
- }
337
- degree_strs .append (textwrap .dedent (
338
- '''\
339
- size((n{i0}){dir0}[:{rel_type}]{dir1}()),
340
- size((){dir0}[:{rel_type}]{dir1}(n{i1}))'''
341
- ).format (** kwargs ))
342
- degree_query = ',\n ' .join (degree_strs )
394
+ degree_query = construct_degree_clause (metarels )
343
395
344
- using_query = ''
345
- # Specify index hint for node lookup
346
- if index_hint :
347
- using_query = '\n ' + textwrap .dedent ('''\
348
- USING INDEX n0:{source_label}({property})
349
- USING INDEX n{length}:{target_label}({property})
350
- ''' ).rstrip ().format (
351
- property = property ,
352
- source_label = metarels [0 ][0 ],
353
- target_label = metarels [- 1 ][1 ],
354
- length = len (metarels )
355
- )
356
-
357
- # Specify join hint with node to join on
358
- if join_hint is not False :
359
- if join_hint is True or join_hint == 'midpoint' :
360
- join_hint = len (metarels ) // 2
361
- join_hint = int (join_hint )
362
- assert join_hint >= 0
363
- assert join_hint <= len (metarels )
364
- using_query += "\n USING JOIN ON n{}" .format (join_hint )
396
+ using_query = construct_using_clause (metarels , join_hint , index_hint )
365
397
366
398
# Unique node constraint (prevent paths with duplicate nodes)
367
- if unique_nodes == 'nested' :
368
- unique_nodes_query = '\n AND ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)'
369
- elif unique_nodes == 'expanded' :
370
- pairs = itertools .combinations (range (len (metarels ) + 1 ), 2 )
371
- unique_nodes_query = format_expanded_clause (pairs )
372
- elif unique_nodes == 'labeled' or unique_nodes is True :
373
- labels = [metarel [0 ] for metarel in metarels ]
374
- labels .append (metarels [- 1 ][1 ])
375
- label_to_nodes = dict ()
376
- for i , label in enumerate (labels ):
377
- label_to_nodes .setdefault (label , list ()).append (i )
378
- pairs = list ()
379
- for nodes in label_to_nodes .values ():
380
- pairs .extend (itertools .combinations (nodes , 2 ))
381
- unique_nodes_query = format_expanded_clause (pairs )
382
- else :
383
- assert unique_nodes is False
384
- unique_nodes_query = ''
399
+ unique_nodes_query = construct_unique_nodes_clause (metarels , unique_nodes )
385
400
386
401
# combine cypher fragments into a single query and add PDP logic
387
402
query = ''
@@ -394,9 +409,9 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
394
409
[
395
410
{degree_query}
396
411
] AS degrees, path
397
- WITH path, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) as PDP
412
+ WITH path, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) AS PDP
398
413
RETURN
399
- path,
414
+ substring(reduce(s = '', node IN nodes(path)| s + '–' + node.name), 1) AS path,
400
415
PDP,
401
416
100 * (PDP / {dwpc}) AS PERCENT_OF_DWPC
402
417
ORDER BY PERCENT_OF_DWPC DESC
@@ -408,9 +423,8 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
408
423
length = len (metarels ),
409
424
property = property ,
410
425
dwpc = dwpc )
411
- # If the dwpc isn't provided, we'll have to calculate it before the PDP.
412
- # Doing so roughly doubles the query execution time, as it effectively
413
- # runs the query twice returning different degrees of aggregation.
426
+
427
+ # https://stackoverflow.com/questions/54245415/
414
428
else :
415
429
query = textwrap .dedent ('''\
416
430
MATCH path = {metapath_query}{using_query}
@@ -420,20 +434,16 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
420
434
[
421
435
{degree_query}
422
436
] AS degrees, path
423
- WITH sum(reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }})) as DWPC
424
-
425
- MATCH path = {metapath_query}{using_query}
426
- WHERE n0.{property} = {{ source }}
427
- AND n{length}.{property} = {{ target }}{unique_nodes_query}
428
- WITH
429
- [
430
- {degree_query}
431
- ] AS degrees, path, DWPC
432
- WITH path, DWPC, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) as PDP
437
+ WITH path, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) AS PDP
438
+ WITH path, collect(PDP) AS pdps, PDP
439
+ WITH collect({{path: path, pdps: pdps}}) AS allData, sum(PDP) AS DWPC
440
+ UNWIND allData AS data
441
+ UNWIND data.pdps AS PDP
442
+ WITH data.path AS path, PDP, DWPC
433
443
RETURN
434
- path,
435
- PDP,
436
- 100 * (PDP / DWPC) AS PERCENT_OF_DWPC
444
+ substring(reduce(s = '', node IN nodes(path)| s + '–' + node.name), 1) AS path,
445
+ PDP,
446
+ 100 * (PDP / DWPC) AS PERCENT_OF_DWPC
437
447
ORDER BY PERCENT_OF_DWPC DESC
438
448
''' ).rstrip ().format (
439
449
metapath_query = metapath_query ,
@@ -443,7 +453,6 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
443
453
length = len (metarels ),
444
454
property = property )
445
455
446
-
447
456
return query
448
457
449
458
def format_expanded_clause (pairs ):
@@ -487,6 +496,7 @@ def permute_rel_type(uri, rel_type, nswap=None, max_tries=None, nswap_mult=10, m
487
496
Randomization Techniques for Graphs. SIAM International Conference on
488
497
Data Mining. https://doi.org/10.1137/1.9781611972795.67
489
498
"""
499
+ py2neo = import_py2neo ()
490
500
491
501
neo = py2neo .Graph (uri )
492
502
0 commit comments