Skip to content

Commit 1d1135b

Browse files
committed
Added a version check before calling http.socket_timeout to make neo4j.py compatible with newer versions of py2neo
Modified travis config to test both py2neo 3 and 4. Moved the py2neo import logic into a function called when needed, to keep the other functions from having an unnecessary dependency. Modified construct_pdp_query to run more efficiently and return a string representation of the paths. Modified tests to handle the new output. Added tests to check whether the queries produced by construct_pdp_query are formatted correctly. Removed multiple py2neo versions from .travis.yml.
1 parent e793abf commit 1d1135b

File tree

2 files changed

+210
-118
lines changed

2 files changed

+210
-118
lines changed

hetio/neo4j.py

+122-112
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,29 @@
55
from operator import or_
66
from functools import reduce
77

8-
import py2neo
9-
import py2neo.packages.httpstream
108
import pandas
119
from tqdm import tqdm
1210

1311
import hetio.hetnet
1412

15-
# Get py2neo version
16-
PY2NEO_VER = int(py2neo.__version__[0])
1713

18-
# Avoid SocketError
19-
py2neo.packages.httpstream.http.socket_timeout = 1e8
14+
def import_py2neo():
15+
"""
16+
Imports the py2neo library, checks its version, and sets the socket timeout if necessary
17+
"""
18+
import py2neo
19+
# Get py2neo version
20+
PY2NEO_VER = int(py2neo.__version__[0])
21+
if PY2NEO_VER < 4:
22+
import py2neo.packages.httpstream
23+
# Avoid SocketError
24+
py2neo.packages.httpstream.http.socket_timeout = 1e8
25+
return py2neo
26+
2027

2128
def export_neo4j(graph, uri, node_queue=200, edge_queue=5, show_progress=False):
2229
"""Export hetnet to neo4j"""
30+
py2neo = import_py2neo()
2331

2432
if isinstance(uri, py2neo.Graph):
2533
db_graph = uri
@@ -89,6 +97,10 @@ def append(self, x):
8997
self.create()
9098

9199
def create(self):
100+
import py2neo
101+
102+
PY2NEO_VER = int(py2neo.__version__[0])
103+
92104
if not self:
93105
return
94106

@@ -162,43 +174,16 @@ def cypher_path(metarels):
162174
q += '{dir0}[:{rel_type}]{dir1}(n{i}{target_label})'.format(**kwargs)
163175
return q
164176

165-
def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_hint=False, unique_nodes=True):
177+
def construct_degree_clause(metarels):
166178
"""
167-
Create a cypher query for computing the *DWPC* for a type of path.
179+
Create a Cypher query clause that calculates the degree of each node
168180
169181
Parameters
170182
----------
171183
metarels : a metarels or MetaPath object
172-
the metapath (path type) to create a query for
173-
property : str
174-
which property to use for soure and target node lookup
175-
join_hint : 'midpoint', bool, or int
176-
whether to add a join hint to tell neo4j to traverse form both ends of
177-
the path and join at a specific index. `'midpoint'` or `True` specifies
178-
joining at the middle node in the path (rounded down if an even number
179-
of nodes). `False` specifies not to add a join hint. An int specifies
180-
the node to join on.
181-
index_hint : bool
182-
whether to add index hints which specifies the properties of the source
183-
and target nodes to use for lookup. Enabling both `index_hint` and
184-
`join_hint` can cause the query to fail.
185-
unique_nodes : bool or str
186-
whether to exclude paths with duplicate nodes. To not enforce node
187-
uniqueness, use `False`. Methods for enforcing node uniqueness are:
188-
`nested` the path-length independent query (`ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)`)
189-
`expanded` for the combinatorial and path-length dependent form (`NOT (n0=n1 OR n0=n2 OR n0=n3 OR n1=n2 OR n1=n3 OR n2=n3)`).
190-
`labeled` to perform an intelligent version of `expanded` where only
191-
nodes with the same label are checked for duplicity. Specifying `True`,
192-
which is the default, uses the `labeled` method.
184+
the metapath to create the clause for
193185
"""
194-
# Convert metapath to metarels
195-
if isinstance(metarels, hetio.hetnet.MetaPath):
196-
metarels = metapath_to_metarels(metarels)
197-
198-
# create cypher path query
199-
metapath_query = cypher_path(metarels)
200186

201-
# create cypher path degree query
202187
degree_strs = list()
203188
for i, (source_label, target_label, rel_type, direction) in enumerate(metarels):
204189
kwargs = {
@@ -217,6 +202,26 @@ def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_
217202
).format(**kwargs))
218203
degree_query = ',\n'.join(degree_strs)
219204

205+
return degree_query
206+
207+
def construct_using_clause(metarels, join_hint, index_hint):
208+
"""
209+
Create a Cypher query clause that gives the planner hints to speed up the query
210+
Parameters
211+
----------
212+
metarels : a metarels or MetaPath object
213+
the metapath to create the clause for
214+
join_hint : 'midpoint', bool, or int
215+
whether to add a join hint to tell neo4j to traverse from both ends of
216+
the path and join at a specific index. `'midpoint'` or `True` specifies
217+
joining at the middle node in the path (rounded down if an even number
218+
of nodes). `False` specifies not to add a join hint. An int specifies
219+
the node to join on.
220+
index_hint : bool
221+
whether to add index hints which specifies the properties of the source
222+
and target nodes to use for lookup. Enabling both `index_hint` and
223+
`join_hint` can cause the query to fail.
224+
"""
220225
using_query = ''
221226
# Specify index hint for node lookup
222227
if index_hint:
@@ -239,7 +244,24 @@ def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_
239244
assert join_hint <= len(metarels)
240245
using_query += "\nUSING JOIN ON n{}".format(join_hint)
241246

242-
# Unique node constraint (pevent paths with duplicate nodes)
247+
return using_query
248+
249+
def construct_unique_nodes_clause(metarels, unique_nodes):
250+
"""
251+
Create a Cypher query clause that excludes paths containing duplicate nodes
252+
Parameters
253+
----------
254+
metarels : a metarels or MetaPath object
255+
the metapath to create the clause for
256+
unique_nodes : bool or str
257+
whether to exclude paths with duplicate nodes. To not enforce node
258+
uniqueness, use `False`. Methods for enforcing node uniqueness are:
259+
`nested` the path-length independent query (`ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)`)
260+
`expanded` for the combinatorial and path-length dependent form (`NOT (n0=n1 OR n0=n2 OR n0=n3 OR n1=n2 OR n1=n3 OR n2=n3)`).
261+
`labeled` to perform an intelligent version of `expanded` where only
262+
nodes with the same label are checked for duplication. Specifying `True`,
263+
which is the default, uses the `labeled` method.
264+
"""
243265
if unique_nodes == 'nested':
244266
unique_nodes_query = '\nAND ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)'
245267
elif unique_nodes == 'expanded':
@@ -259,6 +281,52 @@ def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_
259281
assert unique_nodes is False
260282
unique_nodes_query = ''
261283

284+
return unique_nodes_query
285+
286+
def construct_dwpc_query(metarels, property='name', join_hint='midpoint', index_hint=False, unique_nodes=True):
287+
"""
288+
Create a cypher query for computing the *DWPC* for a type of path.
289+
290+
Parameters
291+
----------
292+
metarels : a metarels or MetaPath object
293+
the metapath (path type) to create a query for
294+
property : str
295+
which property to use for source and target node lookup
296+
join_hint : 'midpoint', bool, or int
297+
whether to add a join hint to tell neo4j to traverse from both ends of
298+
the path and join at a specific index. `'midpoint'` or `True` specifies
299+
joining at the middle node in the path (rounded down if an even number
300+
of nodes). `False` specifies not to add a join hint. An int specifies
301+
the node to join on.
302+
index_hint : bool
303+
whether to add index hints which specifies the properties of the source
304+
and target nodes to use for lookup. Enabling both `index_hint` and
305+
`join_hint` can cause the query to fail.
306+
unique_nodes : bool or str
307+
whether to exclude paths with duplicate nodes. To not enforce node
308+
uniqueness, use `False`. Methods for enforcing node uniqueness are:
309+
`nested` the path-length independent query (`ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)`)
310+
`expanded` for the combinatorial and path-length dependent form (`NOT (n0=n1 OR n0=n2 OR n0=n3 OR n1=n2 OR n1=n3 OR n2=n3)`).
311+
`labeled` to perform an intelligent version of `expanded` where only
312+
nodes with the same label are checked for duplication. Specifying `True`,
313+
which is the default, uses the `labeled` method.
314+
"""
315+
# Convert metapath to metarels
316+
if isinstance(metarels, hetio.hetnet.MetaPath):
317+
metarels = metapath_to_metarels(metarels)
318+
319+
# create cypher path query
320+
metapath_query = cypher_path(metarels)
321+
322+
# create cypher path degree query
323+
degree_query = construct_degree_clause(metarels)
324+
325+
using_query = construct_using_clause(metarels, join_hint, index_hint)
326+
327+
# Unique node constraint (prevent paths with duplicate nodes)
328+
unique_nodes_query = construct_unique_nodes_clause(metarels, unique_nodes)
329+
262330
# combine cypher fragments into a single query and add DWPC logic
263331
query = textwrap.dedent('''\
264332
MATCH path = {metapath_query}{using_query}
@@ -323,65 +391,12 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
323391
metapath_query = cypher_path(metarels)
324392

325393
# create cypher path degree query
326-
degree_strs = list()
327-
for i, (source_label, target_label, rel_type, direction) in enumerate(metarels):
328-
kwargs = {
329-
'i0': i,
330-
'i1': i + 1,
331-
'source_label': source_label,
332-
'target_label': target_label,
333-
'rel_type': rel_type,
334-
'dir0': '<-' if direction == 'backward' else '-',
335-
'dir1': '->' if direction == 'forward' else '-',
336-
}
337-
degree_strs.append(textwrap.dedent(
338-
'''\
339-
size((n{i0}){dir0}[:{rel_type}]{dir1}()),
340-
size((){dir0}[:{rel_type}]{dir1}(n{i1}))'''
341-
).format(**kwargs))
342-
degree_query = ',\n'.join(degree_strs)
394+
degree_query = construct_degree_clause(metarels)
343395

344-
using_query = ''
345-
# Specify index hint for node lookup
346-
if index_hint:
347-
using_query = '\n' + textwrap.dedent('''\
348-
USING INDEX n0:{source_label}({property})
349-
USING INDEX n{length}:{target_label}({property})
350-
''').rstrip().format(
351-
property = property,
352-
source_label = metarels[0][0],
353-
target_label = metarels[-1][1],
354-
length = len(metarels)
355-
)
356-
357-
# Specify join hint with node to join on
358-
if join_hint is not False:
359-
if join_hint is True or join_hint == 'midpoint':
360-
join_hint = len(metarels) // 2
361-
join_hint = int(join_hint)
362-
assert join_hint >= 0
363-
assert join_hint <= len(metarels)
364-
using_query += "\nUSING JOIN ON n{}".format(join_hint)
396+
using_query = construct_using_clause(metarels, join_hint, index_hint)
365397

366398
# Unique node constraint (prevent paths with duplicate nodes)
367-
if unique_nodes == 'nested':
368-
unique_nodes_query = '\nAND ALL (x IN nodes(path) WHERE size(filter(z IN nodes(path) WHERE z = x)) = 1)'
369-
elif unique_nodes == 'expanded':
370-
pairs = itertools.combinations(range(len(metarels) + 1), 2)
371-
unique_nodes_query = format_expanded_clause(pairs)
372-
elif unique_nodes == 'labeled' or unique_nodes is True:
373-
labels = [metarel[0] for metarel in metarels]
374-
labels.append(metarels[-1][1])
375-
label_to_nodes = dict()
376-
for i, label in enumerate(labels):
377-
label_to_nodes.setdefault(label, list()).append(i)
378-
pairs = list()
379-
for nodes in label_to_nodes.values():
380-
pairs.extend(itertools.combinations(nodes, 2))
381-
unique_nodes_query = format_expanded_clause(pairs)
382-
else:
383-
assert unique_nodes is False
384-
unique_nodes_query = ''
399+
unique_nodes_query = construct_unique_nodes_clause(metarels, unique_nodes)
385400

386401
# combine cypher fragments into a single query and add PDP logic
387402
query = ''
@@ -394,9 +409,9 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
394409
[
395410
{degree_query}
396411
] AS degrees, path
397-
WITH path, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) as PDP
412+
WITH path, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) AS PDP
398413
RETURN
399-
path,
414+
substring(reduce(s = '', node IN nodes(path)| s + '–' + node.name), 1) AS path,
400415
PDP,
401416
100 * (PDP / {dwpc}) AS PERCENT_OF_DWPC
402417
ORDER BY PERCENT_OF_DWPC DESC
@@ -408,9 +423,8 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
408423
length=len(metarels),
409424
property=property,
410425
dwpc = dwpc)
411-
# If the dwpc isn't provided, we'll have to calculate it before the PDP.
412-
# Doing so roughly doubles the query execution time, as it effectively
413-
# runs the query twice returning different degrees of aggregation.
426+
427+
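# If the dwpc isn't provided, compute it in the same query by collecting the per-path PDPs, summing them, and unwinding the collected paths (see https://stackoverflow.com/questions/54245415/)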
414428
else:
415429
query = textwrap.dedent('''\
416430
MATCH path = {metapath_query}{using_query}
@@ -420,20 +434,16 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
420434
[
421435
{degree_query}
422436
] AS degrees, path
423-
WITH sum(reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }})) as DWPC
424-
425-
MATCH path = {metapath_query}{using_query}
426-
WHERE n0.{property} = {{ source }}
427-
AND n{length}.{property} = {{ target }}{unique_nodes_query}
428-
WITH
429-
[
430-
{degree_query}
431-
] AS degrees, path, DWPC
432-
WITH path, DWPC, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) as PDP
437+
WITH path, reduce(pdp = 1.0, d in degrees| pdp * d ^ -{{ w }}) AS PDP
438+
WITH path, collect(PDP) AS pdps, PDP
439+
WITH collect({{path: path, pdps: pdps}}) AS allData, sum(PDP) AS DWPC
440+
UNWIND allData AS data
441+
UNWIND data.pdps AS PDP
442+
WITH data.path AS path, PDP, DWPC
433443
RETURN
434-
path,
435-
PDP,
436-
100 * (PDP / DWPC) AS PERCENT_OF_DWPC
444+
substring(reduce(s = '', node IN nodes(path)| s + '–' + node.name), 1) AS path,
445+
PDP,
446+
100 * (PDP / DWPC) AS PERCENT_OF_DWPC
437447
ORDER BY PERCENT_OF_DWPC DESC
438448
''').rstrip().format(
439449
metapath_query = metapath_query,
@@ -443,7 +453,6 @@ def construct_pdp_query(metarels, dwpc=None, property='name', join_hint='midpoin
443453
length=len(metarels),
444454
property=property)
445455

446-
447456
return query
448457

449458
def format_expanded_clause(pairs):
@@ -487,6 +496,7 @@ def permute_rel_type(uri, rel_type, nswap=None, max_tries=None, nswap_mult=10, m
487496
Randomization Techniques for Graphs. SIAM International Conference on
488497
Data Mining. https://doi.org/10.1137/1.9781611972795.67
489498
"""
499+
py2neo = import_py2neo()
490500

491501
neo = py2neo.Graph(uri)
492502

0 commit comments

Comments
 (0)