2
2
use std:: ascii:: escape_default;
3
3
use std:: collections:: BTreeMap ;
4
4
use std:: collections:: BTreeSet ;
5
+ use std:: collections:: HashMap ;
5
6
use std:: fs:: File ;
7
+ use std:: hash:: Hash ;
6
8
use std:: io:: BufWriter ;
7
9
use std:: io:: BufReader ;
8
10
use std:: io:: Write ;
@@ -11,6 +13,7 @@ use std::io::BufRead;
11
13
use std:: io:: StdoutLock ;
12
14
use std:: mem:: transmute;
13
15
use sha1:: { Sha1 , Digest } ;
16
+ use serde_json:: json;
14
17
15
18
use clap:: Parser ;
16
19
use flate2:: write:: GzEncoder ;
@@ -45,10 +48,28 @@ struct Args {
45
48
#[ arg( long) ]
46
49
out_edges_jsonl : String ,
47
50
51
+ #[ arg( long) ]
52
+ out_edge_summary_json : String ,
53
+
48
54
#[ arg( long) ]
49
55
exclude : String
50
56
}
51
57
58
+
59
+ type EdgeSummaryTable = HashMap <
60
+ String , /* src node type signature */
61
+ HashMap <
62
+ String /* edge type */ ,
63
+ HashMap <
64
+ String , /* dest node type signature */
65
+ HashMap <
66
+ String , /* set of datasources */
67
+ u64 /* count */
68
+ >
69
+ >
70
+ >
71
+ > ;
72
+
52
73
fn main ( ) -> std:: io:: Result < ( ) > {
53
74
54
75
let args = Args :: parse ( ) ;
@@ -59,7 +80,6 @@ fn main() -> std::io::Result<()> {
59
80
60
81
let exclude: BTreeSet < Vec < u8 > > = args. exclude . split ( "," ) . map ( |s| s. to_string ( ) . as_bytes ( ) . to_vec ( ) ) . collect ( ) ;
61
82
62
-
63
83
let stdin = io:: stdin ( ) . lock ( ) ;
64
84
let mut reader = BufReader :: new ( stdin) ;
65
85
@@ -69,6 +89,11 @@ fn main() -> std::io::Result<()> {
69
89
let stdout = io:: stdout ( ) . lock ( ) ;
70
90
let mut nodes_writer = BufWriter :: new ( stdout) ;
71
91
92
+ let edge_summary_file = File :: create ( args. out_edge_summary_json ) . unwrap ( ) ;
93
+ let mut edge_summary_writer = BufWriter :: new ( edge_summary_file) ;
94
+
95
+ let mut edge_summary: EdgeSummaryTable = HashMap :: new ( ) ;
96
+
72
97
let mut n_nodes: i64 = 0 ;
73
98
74
99
loop {
@@ -91,7 +116,7 @@ fn main() -> std::io::Result<()> {
91
116
92
117
sliced. props . iter ( ) . for_each ( |prop| {
93
118
for val in & prop. values {
94
- maybe_write_edge ( sliced. id , prop, & val, & mut edges_writer, & exclude, & node_metadata, & val. datasources , sliced. subgraph ) ;
119
+ maybe_write_edge ( sliced. id , prop, & val, & mut edges_writer, & exclude, & node_metadata, & val. datasources , sliced. subgraph , & mut edge_summary ) ;
95
120
}
96
121
} ) ;
97
122
@@ -120,10 +145,16 @@ fn main() -> std::io::Result<()> {
120
145
121
146
eprintln ! ( "materialise took {} seconds" , start_time. elapsed( ) . as_secs( ) ) ;
122
147
148
+ edge_summary_writer. write_all ( serde_json:: to_string_pretty ( & json ! ( {
149
+ "edges" : edge_summary
150
+ } ) ) . unwrap ( ) . as_bytes ( ) ) . unwrap ( ) ;
151
+
152
+ edge_summary_writer. flush ( ) . unwrap ( ) ;
153
+
123
154
Ok ( ( ) )
124
155
}
125
156
126
- fn maybe_write_edge ( from_id : & [ u8 ] , prop : & SlicedProperty , val : & SlicedPropertyValue , edges_writer : & mut BufWriter < File > , exclude : & BTreeSet < Vec < u8 > > , node_metadata : & BTreeMap < Vec < u8 > , Metadata > , datasources : & Vec < & [ u8 ] > , subgraph : & [ u8 ] ) {
157
+ fn maybe_write_edge ( from_id : & [ u8 ] , prop : & SlicedProperty , val : & SlicedPropertyValue , edges_writer : & mut BufWriter < File > , exclude : & BTreeSet < Vec < u8 > > , node_metadata : & BTreeMap < Vec < u8 > , Metadata > , datasources : & Vec < & [ u8 ] > , subgraph : & [ u8 ] , edge_summary : & mut EdgeSummaryTable ) {
127
158
128
159
if prop. key . eq ( b"id" ) || prop. key . starts_with ( b"grebi:" ) || exclude. contains ( prop. key ) {
129
160
return ;
@@ -140,7 +171,7 @@ fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyVal
140
171
let str = JsonParser :: parse ( & buf) . string ( ) ;
141
172
let exists = node_metadata. contains_key ( str) ;
142
173
if exists {
143
- write_edge ( from_id, str, prop. key , Some ( & reified_u. props ) , edges_writer, node_metadata, & datasources, & subgraph) ;
174
+ write_edge ( from_id, str, prop. key , Some ( & reified_u. props ) , edges_writer, node_metadata, & datasources, & subgraph, edge_summary ) ;
144
175
}
145
176
} else {
146
177
// panic!("unexpected kind: {:?}", reified_u.value_kind);
@@ -154,7 +185,7 @@ fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyVal
154
185
let exists = node_metadata. contains_key ( str) ;
155
186
156
187
if exists {
157
- write_edge ( from_id, str, prop. key , None , edges_writer, node_metadata, & datasources, & subgraph) ;
188
+ write_edge ( from_id, str, prop. key , None , edges_writer, node_metadata, & datasources, & subgraph, edge_summary ) ;
158
189
}
159
190
160
191
} else if val. kind == JsonTokenType :: StartArray {
@@ -169,8 +200,8 @@ fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyVal
169
200
170
201
}
171
202
172
- fn write_edge ( from_id : & [ u8 ] , to_id : & [ u8 ] , edge : & [ u8 ] , edge_props : Option < & Vec < SlicedProperty > > , edges_writer : & mut BufWriter < File > , node_metadata : & BTreeMap < Vec < u8 > , Metadata > , datasources : & Vec < & [ u8 ] > , subgraph : & [ u8 ] ) {
173
-
203
+ fn write_edge ( from_id : & [ u8 ] , to_id : & [ u8 ] , edge : & [ u8 ] , edge_props : Option < & Vec < SlicedProperty > > , edges_writer : & mut BufWriter < File > , node_metadata : & BTreeMap < Vec < u8 > , Metadata > , datasources : & Vec < & [ u8 ] > , subgraph : & [ u8 ] , edge_summary : & mut EdgeSummaryTable ) {
204
+
174
205
let mut buf = Vec :: new ( ) ;
175
206
176
207
buf. extend ( b"\" grebi:type\" :\" " ) ;
@@ -206,7 +237,7 @@ fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<S
206
237
}
207
238
}
208
239
209
- let _refs = {
240
+ let _refs: Map < String , Value > = {
210
241
let mut res: Map < String , Value > = Map :: new ( ) ;
211
242
for ( start, end) in find_strings ( & buf) {
212
243
let maybe_id = & buf[ start..end] ;
@@ -221,6 +252,21 @@ fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<S
221
252
res
222
253
} ;
223
254
255
+ let from_type_signature: String = get_type_signature_from_metadata_json ( _refs. get ( & String :: from_utf8_lossy ( from_id) . to_string ( ) ) . unwrap ( ) ) ;
256
+ let to_type_signature: String = get_type_signature_from_metadata_json ( _refs. get ( & String :: from_utf8_lossy ( to_id) . to_string ( ) ) . unwrap ( ) ) ;
257
+ let datasources_signature: String = datasources. iter ( ) . map ( |ds| String :: from_utf8_lossy ( ds) . to_string ( ) ) . collect :: < Vec < String > > ( ) . join ( "," ) ;
258
+
259
+ let edge_summary_edges = edge_summary. entry ( from_type_signature) . or_insert ( HashMap :: new ( ) ) ;
260
+ let count: & mut u64 = edge_summary_edges
261
+ . entry ( String :: from_utf8_lossy ( edge) . to_string ( ) )
262
+ . or_insert ( HashMap :: new ( ) )
263
+ . entry ( to_type_signature)
264
+ . or_insert ( HashMap :: new ( ) )
265
+ . entry ( datasources_signature)
266
+ . or_insert ( 0 ) ;
267
+
268
+ * count = * count + 1 ;
269
+
224
270
// sha1 not for security, just as a simple way to assign a unique
225
271
// id to the edge that will be reproducible between dataloads
226
272
//
@@ -238,3 +284,15 @@ fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<S
238
284
}
239
285
240
286
287
+
288
+
289
+ fn get_type_signature_from_metadata_json ( json : & Value ) -> String {
290
+ let mut t: Vec < & str > = json. as_object ( ) . unwrap ( )
291
+ . get ( "grebi:type" ) . unwrap ( )
292
+ . as_array ( ) . unwrap ( )
293
+ . iter ( )
294
+ . map ( |val| val. as_str ( ) . unwrap ( ) )
295
+ . collect ( ) ;
296
+ t. sort ( ) ;
297
+ return t. join ( "," ) . to_string ( ) ;
298
+ }
0 commit comments