Skip to content

Commit d1a8042

Browse files
authored
add hra kg (#4)
1 parent 32ec824 commit d1a8042

File tree

25 files changed

+958
-739
lines changed

25 files changed

+958
-739
lines changed

01_ingest/grebi_ingest_ols/src/main.rs

+13-4
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n
266266
output_nodes.write_all(r#","#.as_bytes()).unwrap();
267267
}
268268
output_nodes.write_all(r#"""#.as_bytes()).unwrap();
269-
output_nodes.write_all(name.as_bytes()).unwrap();
269+
write_escaped_string(&name.as_bytes(), output_nodes);
270270
output_nodes.write_all(r#"""#.as_bytes()).unwrap();
271271
}
272272
output_nodes.write_all(r#"]"#.as_bytes()).unwrap();
@@ -434,8 +434,17 @@ fn reprefix_predicate(pred:&str) -> String {
434434
}
435435
}
436436

437-
438-
439-
437+
/// Writes `str` to `writer` escaped for use as the contents of a JSON string.
///
/// The surrounding double quotes are NOT written by this function.
///
/// Escaping rules:
/// * `"` and `\` are written as `\"` and `\\`
/// * newline, carriage return and tab are written as `\n`, `\r` and `\t`
/// * every other control byte (0x00..=0x1F) is written as `\u00XX`, because
///   raw control characters are not permitted inside a JSON string
/// * all other bytes (including multi-byte UTF-8 sequences) are copied through
///   unchanged
///
/// # Panics
///
/// Panics if writing to `writer` fails.
fn write_escaped_string(str:&[u8], writer:&mut impl Write) {
    const HEX:&[u8; 16] = b"0123456789abcdef";

    // Start of the current run of bytes that can be copied through verbatim.
    // Runs are flushed in a single write rather than one write per byte.
    let mut run_start = 0;

    for (i, &c) in str.iter().enumerate() {
        // Only '"', '\\' and control bytes below 0x20 need escaping.
        if c >= 0x20 && c != b'"' && c != b'\\' {
            continue;
        }

        writer.write_all(&str[run_start..i]).unwrap();

        match c {
            b'"' => { writer.write_all(b"\\\"").unwrap(); }
            b'\\' => { writer.write_all(b"\\\\").unwrap(); }
            b'\n' => { writer.write_all(b"\\n").unwrap(); }
            b'\r' => { writer.write_all(b"\\r").unwrap(); }
            b'\t' => { writer.write_all(b"\\t").unwrap(); }
            _ => {
                // Remaining control bytes have no short escape; use \u00XX.
                let esc = [
                    b'\\', b'u', b'0', b'0',
                    HEX[usize::from(c >> 4)],
                    HEX[usize::from(c & 0x0f)],
                ];
                writer.write_all(&esc).unwrap();
            }
        }

        run_start = i + 1;
    }

    // Flush whatever unescaped tail remains after the last escaped byte.
    writer.write_all(&str[run_start..]).unwrap();
}
440449

441450

01_ingest/grebi_ingest_rdf/src/main.rs

+33-15
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,10 @@ struct Args {
7474
nest_objects_of_predicate:Vec<String>,
7575

7676
#[arg(long)]
77-
exclude_objects_of_predicate:Vec<String> // if an object is used with this predicate, ignore the object
77+
exclude_objects_of_predicate:Vec<String>, // if an object is used with this predicate, ignore the object
78+
79+
#[arg(long, default_value_t = false)]
80+
rdf_types_are_grebi_types:bool
7881
}
7982

8083
fn main() -> std::io::Result<()> {
@@ -93,6 +96,7 @@ fn main() -> std::io::Result<()> {
9396

9497
let nest_preds:BTreeSet<String> = args.nest_objects_of_predicate.into_iter().collect();
9598
let ignore_preds:BTreeSet<String> = args.exclude_objects_of_predicate.into_iter().collect();
99+
let rdf_types_are_grebi_types = args.rdf_types_are_grebi_types;
96100

97101
let gr:CustomGraph = match args.rdf_type.as_str() {
98102
"rdf_triples_xml" => {
@@ -102,14 +106,11 @@ fn main() -> std::io::Result<()> {
102106
},
103107
"rdf_quads_nq" => {
104108

105-
if args.rdf_graph.len() == 0 {
106-
panic!("must specify at least one graph to load for nquads");
107-
}
108-
109109
let parser = NQuadsParser {};
110110

111111
let quad_source = parser.parse(reader);
112-
let mut filtered_quads = quad_source.filter_quads(|q| args.rdf_graph.contains(&q.g().unwrap().value().to_string()));
112+
let mut filtered_quads = quad_source.filter_quads(|q|
113+
args.rdf_graph.len() == 0 || args.rdf_graph.contains(&q.g().unwrap().value().to_string()));
113114

114115
let mut g:CustomGraph = CustomGraph::new();
115116

@@ -160,7 +161,7 @@ fn main() -> std::io::Result<()> {
160161

161162
eprintln!("Building reification index took {} seconds", start_time.elapsed().as_secs());
162163

163-
write_subjects(ds, &mut output_nodes, &nest_preds, &exclude_subjects, &exclude_subjects_at_toplevel, reifs);
164+
write_subjects(ds, &mut output_nodes, &nest_preds, &exclude_subjects, &exclude_subjects_at_toplevel, reifs, rdf_types_are_grebi_types);
164165

165166
eprintln!("Total time elapsed: {} seconds", start_time.elapsed().as_secs());
166167

@@ -189,7 +190,7 @@ fn populate_reifs(
189190
let annotated_predicate = ds.triples_matching(&s, &pred_prop, &ANY).next().unwrap().unwrap().o().clone();
190191
let annotated_object = ds.triples_matching(&s, &obj_prop, &ANY).next().unwrap().unwrap().o().clone();
191192

192-
let obj_json = term_to_json(&annotated_object, ds, nest_preds, None).to_string();
193+
let obj_json = term_to_json(&annotated_object, ds, nest_preds, None, false).to_string();
193194

194195
let lhs = ReifLhs {
195196
s: annotated_subject.clone(),
@@ -212,7 +213,14 @@ fn populate_reifs(
212213
}
213214

214215

215-
fn write_subjects(ds:&CustomGraph, nodes_writer:&mut BufWriter<StdoutLock>, nest_preds:&BTreeSet<String>, exclude_subjects:&HashSet<Term<Rc<str>>>, exclude_subjects_at_toplevel:&HashSet<Term<Rc<str>>>, reifs:HashMap<ReifLhs, BTreeMap<String, Term<Rc<str>>>>) {
216+
fn write_subjects(
217+
ds:&CustomGraph,
218+
nodes_writer:&mut BufWriter<StdoutLock>,
219+
nest_preds:&BTreeSet<String>,
220+
exclude_subjects:&HashSet<Term<Rc<str>>>,
221+
exclude_subjects_at_toplevel:&HashSet<Term<Rc<str>>>,
222+
reifs:HashMap<ReifLhs, BTreeMap<String, Term<Rc<str>>>>,
223+
rdf_types_are_grebi_types:bool) {
216224

217225
let start_time2 = std::time::Instant::now();
218226

@@ -229,7 +237,7 @@ fn write_subjects(ds:&CustomGraph, nodes_writer:&mut BufWriter<StdoutLock>, nest
229237
continue;
230238
}
231239

232-
let json = term_to_json(s, ds, nest_preds, Some(&reifs));
240+
let json = term_to_json(s, ds, nest_preds, Some(&reifs), rdf_types_are_grebi_types);
233241

234242
let json_obj = json.as_object().unwrap();
235243
let types = json_obj.get("http://www.w3.org/1999/02/22-rdf-syntax-ns#type");
@@ -252,7 +260,13 @@ fn write_subjects(ds:&CustomGraph, nodes_writer:&mut BufWriter<StdoutLock>, nest
252260
eprintln!("Writing JSONL took {} seconds", start_time2.elapsed().as_secs());
253261
}
254262

255-
fn term_to_json(term:&Term<Rc<str>>, ds:&CustomGraph, nest_preds:&BTreeSet<String>, reifs:Option<&HashMap<ReifLhs, BTreeMap<String, Term<Rc<str>>>>>) -> Value {
263+
fn term_to_json(
264+
term:&Term<Rc<str>>,
265+
ds:&CustomGraph,
266+
nest_preds:&BTreeSet<String>,
267+
reifs:Option<&HashMap<ReifLhs, BTreeMap<String, Term<Rc<str>>>>>,
268+
rdf_types_are_grebi_types:bool
269+
) -> Value {
256270

257271
let triples = ds.triples_matching(term, &ANY, &ANY);
258272

@@ -285,7 +299,7 @@ fn term_to_json(term:&Term<Rc<str>>, ds:&CustomGraph, nest_preds:&BTreeSet<Strin
285299
let reifs_for_this_sp = reifs_u.get(&ReifLhs { s: tu.s().clone(), p: tu.p().clone() });
286300
if reifs_for_this_sp.is_some() {
287301
let reifs_for_this_sp_u = reifs_for_this_sp.unwrap();
288-
let o_json = term_to_json(&o, ds, nest_preds, None).to_string();
302+
let o_json = term_to_json(&o, ds, nest_preds, None, false).to_string();
289303
let reif = reifs_for_this_sp_u.get(&o_json);
290304
if reif.is_some() {
291305
Some(reif.unwrap())
@@ -304,7 +318,7 @@ fn term_to_json(term:&Term<Rc<str>>, ds:&CustomGraph, nest_preds:&BTreeSet<Strin
304318
if nest_preds.contains(p) {
305319
match o.kind() {
306320
Iri|Literal|BlankNode => {
307-
let mut obj = term_to_json(o, ds, nest_preds, reifs);
321+
let mut obj = term_to_json(o, ds, nest_preds, reifs, false);
308322
let obj_o = obj.as_object_mut().unwrap();
309323
obj_o.remove_entry("id");
310324
obj
@@ -314,14 +328,14 @@ fn term_to_json(term:&Term<Rc<str>>, ds:&CustomGraph, nest_preds:&BTreeSet<Strin
314328
} else {
315329
match o.kind() {
316330
Iri|Literal => Value::String( o.value().to_string() ),
317-
BlankNode => term_to_json(o, ds, nest_preds, reifs),
331+
BlankNode => term_to_json(o, ds, nest_preds, reifs, false),
318332
Variable => todo!(),
319333
}
320334
}
321335
};
322336

323337
if reif_subj.is_some() {
324-
let mut reif_as_json = term_to_json(reif_subj.unwrap(), ds, nest_preds, None);
338+
let mut reif_as_json = term_to_json(reif_subj.unwrap(), ds, nest_preds, None, false);
325339
let reif_as_json_o = reif_as_json.as_object_mut().unwrap();
326340
reif_as_json_o.remove_entry("http://www.w3.org/1999/02/22-rdf-syntax-ns#type");
327341
reif_as_json_o.remove_entry("id");
@@ -340,6 +354,10 @@ fn term_to_json(term:&Term<Rc<str>>, ds:&CustomGraph, nest_preds:&BTreeSet<Strin
340354
}
341355
}
342356

357+
if rdf_types_are_grebi_types && json.contains_key("http://www.w3.org/1999/02/22-rdf-syntax-ns#type") {
358+
json.insert("grebi:type".to_string(), json.get("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap().clone());
359+
}
360+
343361
return Value::Object(json);
344362
}
345363

03_merge/grebi_merge/src/main.rs

+14-3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ struct Args {
2929
#[arg(long)]
3030
exclude_props: String,
3131

32+
#[arg(long)]
33+
annotate_subgraph_name: Option<String>,
34+
3235
#[arg(trailing_var_arg = true, allow_hyphen_values = true, required = true)]
3336
_files: Vec<String>,
3437
}
@@ -51,6 +54,8 @@ fn main() -> std::io::Result<()> {
5154
input_filenames.sort();
5255
input_filenames.dedup();
5356

57+
let subgraph_name:Option<String> = args.annotate_subgraph_name;
58+
5459
let mut inputs: Vec<Input> = input_filenames
5560
.iter()
5661
.map(|file| {
@@ -110,7 +115,7 @@ fn main() -> std::io::Result<()> {
110115
if !id.eq(&cur_id) {
111116
// this is a new subject; we have finished the old one (if present)
112117
if cur_id.len() > 0 {
113-
write_merged_entity(&lines_to_write, &mut writer, &inputs, &exclude_props);
118+
write_merged_entity(&lines_to_write, &mut writer, &inputs, &exclude_props, &subgraph_name);
114119
lines_to_write.clear();
115120
}
116121
cur_id = id.to_vec();
@@ -143,7 +148,7 @@ fn main() -> std::io::Result<()> {
143148
}
144149

145150
if cur_id.len() > 0 {
146-
write_merged_entity(&lines_to_write, &mut writer, &inputs, &exclude_props);
151+
write_merged_entity(&lines_to_write, &mut writer, &inputs, &exclude_props, &subgraph_name);
147152
lines_to_write.clear();
148153
}
149154

@@ -153,7 +158,7 @@ fn main() -> std::io::Result<()> {
153158
}
154159

155160
#[inline(always)]
156-
fn write_merged_entity(lines_to_write: &Vec<BufferedLine>, stdout: &mut BufWriter<std::io::StdoutLock>, inputs: &Vec<Input>, exclude_props:&BTreeSet<Vec<u8>>) {
161+
fn write_merged_entity(lines_to_write: &Vec<BufferedLine>, stdout: &mut BufWriter<std::io::StdoutLock>, inputs: &Vec<Input>, exclude_props:&BTreeSet<Vec<u8>>, subgraph_name:&Option<String>) {
157162

158163
if lines_to_write.len() == 0 {
159164
panic!();
@@ -220,6 +225,12 @@ fn write_merged_entity(lines_to_write: &Vec<BufferedLine>, stdout: &mut BufWrite
220225
}
221226
stdout.write_all(r#"]"#.as_bytes()).unwrap();
222227

228+
if subgraph_name.is_some() {
229+
stdout.write_all(r#","grebi:subgraph":""#.as_bytes()).unwrap();
230+
stdout.write_all(&subgraph_name.as_ref().unwrap().as_bytes());
231+
stdout.write_all(r#"""#.as_bytes()).unwrap();
232+
}
233+
223234
// sort by key, then value, then datasource
224235
merged_props.sort_by(|a, b| {
225236
match a.1.key.cmp(&b.1.key) {

04_index/grebi_index/src/main.rs

+4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ use serde_json::json;
2626
#[command(author, version, about, long_about = None)]
2727
struct Args {
2828

29+
#[arg(long)]
30+
subgraph_name: String,
31+
2932
#[arg(long)]
3033
out_summary_json_path: String,
3134

@@ -195,6 +198,7 @@ fn main() {
195198

196199
summary_writer.write_all(
197200
serde_json::to_string_pretty(&json!({
201+
"subgraph_name": args.subgraph_name,
198202
"entity_props": entity_props_to_count.iter().map(|(k,v)| {
199203
return (String::from_utf8(k.to_vec()).unwrap(), json!({
200204
"count": v

06_prepare_db_import/grebi_make_csv/src/main.rs

+18-6
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::io::BufWriter;
55
use std::io::BufReader;
66
use std::io::Write;
77
use std::io::BufRead;
8+
use std::collections::HashSet;
89
use clap::Parser;
910
use grebi_shared::json_lexer::JsonTokenType;
1011
use grebi_shared::slice_materialised_edge::SlicedEdge;
@@ -28,7 +29,7 @@ struct Args {
2829
in_edges_jsonl: String,
2930

3031
#[arg(long)]
31-
in_summary_json: String,
32+
in_summary_jsons: String,
3233

3334
#[arg(long)]
3435
out_nodes_csv_path: String,
@@ -43,10 +44,21 @@ fn main() -> std::io::Result<()> {
4344

4445
let start_time = std::time::Instant::now();
4546

46-
let summary:Value = serde_json::from_reader(File::open(args.in_summary_json).unwrap()).unwrap();
4747

48-
let all_entity_props: Vec<String> = summary["entity_props"].as_object().unwrap().keys().cloned().collect();
49-
let all_edge_props: Vec<String> = summary["edge_props"].as_object().unwrap().keys().cloned().collect();
48+
let mut all_entity_props: HashSet<String> = HashSet::new();
49+
let mut all_edge_props: HashSet<String> = HashSet::new();
50+
51+
52+
for f in args.in_summary_jsons.split(",") {
53+
let summary:Value = serde_json::from_reader(File::open(f).unwrap()).unwrap();
54+
for prop in summary["edge_props"].as_object().unwrap().keys() {
55+
all_edge_props.insert(prop.to_string());
56+
}
57+
for prop in summary["entity_props"].as_object().unwrap().keys() {
58+
all_entity_props.insert(prop.to_string());
59+
}
60+
}
61+
5062

5163

5264
let mut nodes_reader = BufReader::new(File::open(args.in_nodes_jsonl).unwrap());
@@ -140,7 +152,7 @@ fn main() -> std::io::Result<()> {
140152
Ok(())
141153
}
142154

143-
fn write_node(src_line:&[u8], entity:&SlicedEntity, all_node_props:&Vec<String>, nodes_writer:&mut BufWriter<&File>) {
155+
fn write_node(src_line:&[u8], entity:&SlicedEntity, all_node_props:&HashSet<String>, nodes_writer:&mut BufWriter<&File>) {
144156

145157
let refs:Map<String,Value> = serde_json::from_slice(entity._refs.unwrap()).unwrap();
146158

@@ -214,7 +226,7 @@ fn write_node(src_line:&[u8], entity:&SlicedEntity, all_node_props:&Vec<String>,
214226
nodes_writer.write_all(b"\n").unwrap();
215227
}
216228

217-
fn write_edge(src_line:&[u8], edge:SlicedEdge, all_edge_props:&Vec<String>, edges_writer: &mut BufWriter<&File>) {
229+
fn write_edge(src_line:&[u8], edge:SlicedEdge, all_edge_props:&HashSet<String>, edges_writer: &mut BufWriter<&File>) {
218230

219231
let refs:Map<String,Value> = serde_json::from_slice(edge._refs.unwrap()).unwrap();
220232

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
2+
import json
3+
import os
4+
import sys
5+
import shlex
6+
import time
7+
import glob
8+
import argparse
9+
from pathlib import Path
10+
from subprocess import Popen, PIPE, STDOUT
11+
12+
13+
def main():
    """Create the Solr autocomplete core configuration for one subgraph.

    Command-line arguments:
        --subgraph-name           name appended to the core directory and core name
        --in-template-config-dir  directory containing the ``grebi_autocomplete``
                                  core template plus ``solr.xml``, ``solrconfig.xml``
                                  and ``zoo.cfg``
        --out-config-dir          directory to create and populate

    The output directory is created, the ``grebi_autocomplete`` template is
    copied to ``grebi_autocomplete_<subgraph name>`` inside it, the three
    top-level config files are copied next to it, and a ``core.properties``
    naming the core is written into the copied core directory.

    Raises:
        FileExistsError: if the output directory already exists.
        OSError: if any template file or directory cannot be copied (the
            copies were previously shelled out to ``cp`` and failures were
            silently ignored).
    """
    # Imported locally because the module-level imports do not include shutil.
    import shutil

    parser = argparse.ArgumentParser(description='Create Solr autocomplete config')
    parser.add_argument('--subgraph-name', type=str, help='subgraph name', required=True)
    parser.add_argument('--in-template-config-dir', type=str, help='Path of config template', required=True)
    parser.add_argument('--out-config-dir', type=str, help='Path to write config', required=True)
    args = parser.parse_args()

    # No exist_ok: an already-existing output directory is an error.
    os.makedirs(args.out_config_dir)

    autocomplete_core_path = os.path.join(args.out_config_dir, f'grebi_autocomplete_{args.subgraph_name}')
    shutil.copytree(os.path.join(args.in_template_config_dir, "grebi_autocomplete"), autocomplete_core_path)

    # Top-level Solr config files are copied into the output directory itself.
    for config_file in ("solr.xml", "solrconfig.xml", "zoo.cfg"):
        shutil.copy(os.path.join(args.in_template_config_dir, config_file), args.out_config_dir)

    Path(f'{autocomplete_core_path}/core.properties').write_text(f"name=grebi_autocomplete_{args.subgraph_name}\n")


if __name__=="__main__":
    main()
33+
34+
35+

06_prepare_db_import/make_solr_config.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,39 @@
1212

1313
def main():
1414
parser = argparse.ArgumentParser(description='Create Solr config')
15+
parser.add_argument('--subgraph-name', type=str, help='subgraph name', required=True)
1516
parser.add_argument('--in-summary-json', type=str, help='summary.json', required=True)
1617
parser.add_argument('--in-template-config-dir', type=str, help='Path of config template', required=True)
1718
parser.add_argument('--out-config-dir', type=str, help='Path to write config', required=True)
1819
args = parser.parse_args()
1920

20-
os.system('cp -r ' + shlex.quote(args.in_template_config_dir) + ' ' + shlex.quote(args.out_config_dir))
21+
os.makedirs(args.out_config_dir)
22+
23+
nodes_core_path = os.path.join(args.out_config_dir, f'grebi_nodes_{args.subgraph_name}')
24+
edges_core_path = os.path.join(args.out_config_dir, f'grebi_edges_{args.subgraph_name}')
25+
os.system('cp -r ' + shlex.quote(os.path.join(args.in_template_config_dir, "grebi_nodes")) + ' ' + shlex.quote(nodes_core_path))
26+
os.system('cp -r ' + shlex.quote(os.path.join(args.in_template_config_dir, "grebi_edges")) + ' ' + shlex.quote(edges_core_path))
27+
28+
os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, "solr.xml")) + ' ' + shlex.quote(args.out_config_dir))
29+
os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, "solrconfig.xml")) + ' ' + shlex.quote(args.out_config_dir))
30+
os.system('cp ' + shlex.quote(os.path.join(args.in_template_config_dir, "zoo.cfg")) + ' ' + shlex.quote(args.out_config_dir))
2131

2232
summary = json.load(open(args.in_summary_json))
2333
node_props = map(lambda f: f.replace(':', '__').replace('&', '_'), summary['entity_props'].keys())
2434
edge_props = map(lambda f: f.replace(':', '__').replace('&', '_'), summary['edge_props'].keys())
2535

26-
nodes_schema = Path(os.path.join(args.out_config_dir, 'grebi_nodes/conf/schema.xml'))
36+
Path(f'{nodes_core_path}/core.properties').write_text(f"name=grebi_nodes_{args.subgraph_name}\n")
37+
Path(f'{edges_core_path}/core.properties').write_text(f"name=grebi_edges_{args.subgraph_name}\n")
38+
39+
nodes_schema = Path(f'{nodes_core_path}/conf/schema.xml')
2740
nodes_schema.write_text(nodes_schema.read_text().replace('[[GREBI_FIELDS]]', '\n'.join(list(map(
2841
lambda f: '\n'.join([
2942
f'<field name="{f}" type="string" indexed="true" stored="false" required="false" multiValued="true" />',
3043
f'<copyField source="{f}" dest="str_{f}"/>',
3144
f'<copyField source="{f}" dest="lowercase_{f}"/>'
3245
]), node_props)))))
3346

34-
edges_schema = Path(os.path.join(args.out_config_dir, 'grebi_edges/conf/schema.xml'))
47+
edges_schema = Path(f'{edges_core_path}/conf/schema.xml')
3548
edges_schema.write_text(edges_schema.read_text().replace('[[GREBI_FIELDS]]', '\n'.join(list(map(
3649
lambda f: '\n'.join([
3750
f'<field name="{f}" type="string" indexed="true" stored="false" required="false" multiValued="true" />',
+2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11

22
CREATE INDEX node_id FOR (n:GraphNode) ON n.`grebi:nodeId`
33
;
4+
CREATE INDEX subgraph FOR (n:GraphNode) ON n.`grebi:subgraph`
5+
;
46
CALL db.awaitIndexes(10800)
57
;

0 commit comments

Comments
 (0)