
Commit e2ec013

generate edge summary, add some types from ontology superclasses
1 parent bfb7ecd commit e2ec013

File tree: 7 files changed (+195 -39 lines)


01_ingest/grebi_ingest_ols/src/main.rs (+26 -9)

```diff
@@ -24,7 +24,10 @@ struct Args {
     ontologies:String,

     #[arg(long)]
-    defining_only:bool
+    defining_only:bool,
+
+    #[arg(long)]
+    superclass_is_type:Vec<String>,
 }

 fn main() {
@@ -46,6 +49,8 @@ fn main() {
         ontology_whitelist.insert(ontology.to_string());
     }

+    let mut type_superclasses:HashSet<String> = args.superclass_is_type.iter().map(|x| x.to_string()).collect();
+
     let mut json = JsonStreamReader::new(reader);

     json.begin_object().unwrap();
@@ -55,14 +60,14 @@ fn main() {
     }
     json.begin_array().unwrap();
     while json.has_next().unwrap() {
-        read_ontology(&mut json, &mut output_nodes, &datasource_name, &ontology_whitelist, args.defining_only);
+        read_ontology(&mut json, &mut output_nodes, &datasource_name, &ontology_whitelist, args.defining_only, &type_superclasses);
     }
     json.end_array().unwrap();
     json.end_object().unwrap();

 }

-fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource_name: &str, ontology_whitelist:&HashSet<String>, defining_only:bool) {
+fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource_name: &str, ontology_whitelist:&HashSet<String>, defining_only:bool, type_superclasses:&HashSet<String>) {

     json.begin_object().unwrap();
@@ -126,11 +131,11 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n

     loop {
         if key.eq("classes") {
-            read_entities(json, output_nodes, &datasource, "ols:Class", defining_only);
+            read_entities(json, output_nodes, &datasource, "ols:Class", defining_only, &type_superclasses);
         } else if key.eq("properties") {
-            read_entities(json, output_nodes, &datasource, "ols:Property", defining_only);
+            read_entities(json, output_nodes, &datasource, "ols:Property", defining_only, &type_superclasses);
         } else if key.eq("individuals") {
-            read_entities(json, output_nodes, &datasource, "ols:Individual", defining_only);
+            read_entities(json, output_nodes, &datasource, "ols:Individual", defining_only, &type_superclasses);
         } else {
             panic!();
         }
@@ -145,7 +150,7 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n

 }

-fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool) {
+fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool, type_superclasses:&HashSet<String>) {
     json.begin_array().unwrap();
     while json.has_next().unwrap() {
         let mut val:Value = read_value(json);
@@ -208,6 +213,14 @@ fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n
         output_nodes.write_all(datasource.as_bytes()).unwrap();
         output_nodes.write_all(r#"","grebi:type":[""#.as_bytes()).unwrap();
         output_nodes.write_all(grebitype.as_bytes()).unwrap();
+        if obj.contains_key("ols:directAncestor") {
+            for ancestor in obj.get("ols:directAncestor").unwrap().as_array().unwrap() {
+                if type_superclasses.contains(ancestor.as_str().unwrap()) {
+                    output_nodes.write_all(r#"",""#.as_bytes()).unwrap();
+                    output_nodes.write_all(ancestor.as_str().unwrap().as_bytes()).unwrap();
+                }
+            }
+        }
         output_nodes.write_all(r#""]"#.as_bytes()).unwrap();

         for k in obj.keys() {
@@ -328,8 +341,12 @@ fn write_value(v:&Value, output_nodes: &mut BufWriter<StdoutLock>) {
                 output_nodes.write_all(r#"}}"#.as_bytes()).unwrap();
             }
         } else {
-            let value = obj.get("ols:value").unwrap();
-            write_value(&value, output_nodes);
+            if obj.contains_key("ols:value") {
+                let value = obj.get("ols:value").unwrap();
+                write_value(&value, output_nodes);
+            } else {
+                panic!("Unknown value: {:?}", serde_json::to_string(obj));
+            }
         }
         return;
     } else {
```

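The effect of the new `--superclass-is-type` flag (which can be repeated): when one of the listed IRIs appears among a term's `ols:directAncestor` values, that IRI is written into the node's `grebi:type` array alongside the usual `ols:Class`/`ols:Property`/`ols:Individual`. A minimal sketch of the resulting node output, using an illustrative MONDO term with all other fields omitted:

```json
{
  "grebi:type": ["ols:Class", "http://purl.obolibrary.org/obo/MONDO_0000001"],
  "ols:directAncestor": ["http://purl.obolibrary.org/obo/MONDO_0000001"]
}
```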
05_materialise/grebi_materialise/src/main.rs (+66 -8)

```diff
@@ -2,7 +2,9 @@
 use std::ascii::escape_default;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
+use std::collections::HashMap;
 use std::fs::File;
+use std::hash::Hash;
 use std::io::BufWriter;
 use std::io::BufReader;
 use std::io::Write;
@@ -11,6 +13,7 @@ use std::io::BufRead;
 use std::io::StdoutLock;
 use std::mem::transmute;
 use sha1::{Sha1, Digest};
+use serde_json::json;

 use clap::Parser;
 use flate2::write::GzEncoder;
@@ -45,10 +48,28 @@ struct Args {
     #[arg(long)]
     out_edges_jsonl: String,

+    #[arg(long)]
+    out_edge_summary_json: String,
+
     #[arg(long)]
     exclude: String
 }

+
+type EdgeSummaryTable = HashMap<
+    String, /* src node type signature */
+    HashMap<
+        String, /* edge type */
+        HashMap<
+            String, /* dest node type signature */
+            HashMap<
+                String, /* set of datasources */
+                u64 /* count */
+            >
+        >
+    >
+>;
+
 fn main() -> std::io::Result<()> {

     let args = Args::parse();
@@ -59,7 +80,6 @@ fn main() -> std::io::Result<()> {

     let exclude:BTreeSet<Vec<u8>> = args.exclude.split(",").map(|s| s.to_string().as_bytes().to_vec()).collect();

-
     let stdin = io::stdin().lock();
     let mut reader = BufReader::new(stdin);

@@ -69,6 +89,11 @@ fn main() -> std::io::Result<()> {
     let stdout = io::stdout().lock();
     let mut nodes_writer = BufWriter::new(stdout);

+    let edge_summary_file = File::create(args.out_edge_summary_json).unwrap();
+    let mut edge_summary_writer = BufWriter::new(edge_summary_file);
+
+    let mut edge_summary:EdgeSummaryTable = HashMap::new();
+
     let mut n_nodes:i64 = 0;

     loop {
@@ -91,7 +116,7 @@ fn main() -> std::io::Result<()> {

         sliced.props.iter().for_each(|prop| {
             for val in &prop.values {
-                maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &node_metadata, &val.datasources, sliced.subgraph);
+                maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &node_metadata, &val.datasources, sliced.subgraph, &mut edge_summary);
             }
         });

@@ -120,10 +145,16 @@ fn main() -> std::io::Result<()> {

     eprintln!("materialise took {} seconds", start_time.elapsed().as_secs());

+    edge_summary_writer.write_all(serde_json::to_string_pretty(&json!({
+        "edges": edge_summary
+    })).unwrap().as_bytes()).unwrap();
+
+    edge_summary_writer.flush().unwrap();
+
     Ok(())
 }

-fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8]) {
+fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8], edge_summary: &mut EdgeSummaryTable) {

     if prop.key.eq(b"id") || prop.key.starts_with(b"grebi:") || exclude.contains(prop.key) {
         return;
@@ -140,7 +171,7 @@ fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyVal
             let str = JsonParser::parse(&buf).string();
             let exists = node_metadata.contains_key(str);
             if exists {
-                write_edge(from_id, str, prop.key, Some(&reified_u.props), edges_writer, node_metadata, &datasources, &subgraph);
+                write_edge(from_id, str, prop.key, Some(&reified_u.props), edges_writer, node_metadata, &datasources, &subgraph, edge_summary);
             }
         } else {
             // panic!("unexpected kind: {:?}", reified_u.value_kind);
@@ -154,7 +185,7 @@ fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyVal
         let exists = node_metadata.contains_key(str);

         if exists {
-            write_edge(from_id, str, prop.key, None, edges_writer, node_metadata, &datasources, &subgraph);
+            write_edge(from_id, str, prop.key, None, edges_writer, node_metadata, &datasources, &subgraph, edge_summary);
         }

     } else if val.kind == JsonTokenType::StartArray {
@@ -169,8 +200,8 @@ fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyVal

 }

-fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<SlicedProperty>>, edges_writer: &mut BufWriter<File>, node_metadata:&BTreeMap<Vec<u8>,Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8]) {
+fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<SlicedProperty>>, edges_writer: &mut BufWriter<File>, node_metadata:&BTreeMap<Vec<u8>,Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8], edge_summary:&mut EdgeSummaryTable) {

     let mut buf = Vec::new();

     buf.extend(b"\"grebi:type\":\"");
@@ -206,7 +237,7 @@ fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<S
         }
     }

-    let _refs = {
+    let _refs:Map<String,Value> = {
         let mut res:Map<String,Value> = Map::new();
         for (start,end) in find_strings(&buf) {
             let maybe_id = &buf[start..end];
@@ -221,6 +252,21 @@ fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<S
         res
     };

+    let from_type_signature:String = get_type_signature_from_metadata_json(_refs.get(&String::from_utf8_lossy(from_id).to_string()).unwrap());
+    let to_type_signature:String = get_type_signature_from_metadata_json(_refs.get(&String::from_utf8_lossy(to_id).to_string()).unwrap());
+    let datasources_signature:String = datasources.iter().map(|ds| String::from_utf8_lossy(ds).to_string()).collect::<Vec<String>>().join(",");
+
+    let edge_summary_edges = edge_summary.entry(from_type_signature).or_insert(HashMap::new());
+    let count:&mut u64 = edge_summary_edges
+        .entry(String::from_utf8_lossy(edge).to_string())
+        .or_insert(HashMap::new())
+        .entry(to_type_signature)
+        .or_insert(HashMap::new())
+        .entry(datasources_signature)
+        .or_insert(0);
+
+    *count = *count + 1;
+
     // sha1 not for security, just as a simple way to assign a unique
     // id to the edge that will be reproducible between dataloads
     //
@@ -238,3 +284,15 @@ fn write_edge(from_id: &[u8], to_id: &[u8], edge:&[u8], edge_props:Option<&Vec<S
 }


+
+
+fn get_type_signature_from_metadata_json(json:&Value) -> String {
+    let mut t:Vec<&str> = json.as_object().unwrap()
+        .get("grebi:type").unwrap()
+        .as_array().unwrap()
+        .iter()
+        .map(|val| val.as_str().unwrap())
+        .collect();
+    t.sort();
+    return t.join(",").to_string();
+}
```
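Putting the pieces above together: `main` creates the file named by the new `--out-edge-summary-json` argument, `write_edge` bumps a counter keyed by source type signature, then edge type, then destination type signature, then the comma-joined datasource list (type signatures being the sorted, comma-joined `grebi:type` arrays produced by `get_type_signature_from_metadata_json`), and the finished table is serialised under a top-level `edges` key at the end of the run. A sketch of the output shape, with invented signatures, datasource names, and counts purely for illustration:

```json
{
  "edges": {
    "ols:Class": {
      "http://purl.obolibrary.org/obo/BFO_0000050": {
        "ols:Class": {
          "OLS": 40321
        }
      }
    }
  }
}
```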

Cargo.lock (+27 -2)

Some generated files are not rendered by default.

README.md (+1 -1)

```diff
@@ -15,7 +15,7 @@ GrEBI also imports complementary datasets, so far:
 * [Ubergraph](https://github.com/INCATools/ubergraph)
 * [Human Reference Atlas KG](https://humanatlas.io/)

-The resulting graphs can be downloaded from https://ftp.ebi.ac.uk/pub/databases/spot/kg/
+The resulting graphs can be downloaded from https://ftp.ebi.ac.uk/pub/databases/spot/kg/ebi/

 ## Implementation
```

configs/datasource_configs/ols.json (+6 -2)

```diff
@@ -3,10 +3,14 @@
   "enabled": true,
   "ingests": [
     {
-      "ingest_files": ["/nfs/production/parkinso/spot/ols4/prod/slurm_pipeline/ontologies.json.gz"],
+      "ingest_files": ["/nfs/production/parkinso/spot/grebi/ontologies.json.gz"],
       "ingest_script": "./target/release/grebi_ingest_ols",
       "ingest_args": [
-        { "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl,so,eco,pr,ncbitaxon,oio,iao,biolink" }
+        { "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl,so,eco,pr,ncbitaxon,oio,iao,biolink" },
+        { "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/MONDO_0000001" },
+        { "name": "--superclass-is-type", "value": "http://www.ebi.ac.uk/efo/EFO_0000408" },
+        { "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/CHEBI_36080" },
+        { "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/CHEBI_24431" }
       ]
     }
   ]
```

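For context, the four IRIs passed to `--superclass-is-type` appear to be the upper-level classes MONDO_0000001 (disease or disorder), EFO_0000408 (disease), CHEBI_36080 (protein), and CHEBI_24431 (chemical entity), so any ontology term whose ancestors include one of these roots is additionally typed as that root in the merged graph.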