Skip to content

Commit 38a9c8c

Browse files
authored
Assign ids to everything not just equivalence groups (#7)
1 parent 484d549 commit 38a9c8c

File tree

24 files changed

+263
-306
lines changed

24 files changed

+263
-306
lines changed

01_ingest/grebi_ingest_gwas/src/write_associations.rs

-1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@ use std::io::{BufWriter, self, BufReader, StdinLock, StdoutLock, Write};
55
use std::ptr::eq;
66
use grebi_shared::prefix_map::PrefixMap;
77
use grebi_shared::prefix_map::PrefixMapBuilder;
8-
use grebi_shared::serialize_equivalence;
98
use serde_json::{json, Value};
109

1110
use crate::check_headers::check_headers;

01_ingest/grebi_ingest_gwas/src/write_studies.rs

+1-2
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@ use std::io::{BufWriter, self, BufReader, StdinLock, StdoutLock, Write};
55
use std::ptr::eq;
66
use grebi_shared::prefix_map::PrefixMap;
77
use grebi_shared::prefix_map::PrefixMapBuilder;
8-
use grebi_shared::serialize_equivalence;
98
use serde_json::json;
109

1110
use crate::check_headers::check_headers;
@@ -106,4 +105,4 @@ pub fn write_studies(csv_reader: &mut csv::Reader<BufReader<StdinLock>>,nodes_wr
106105
}
107106
}
108107

109-
}
108+
}

01_ingest/grebi_ingest_ols/src/main.rs

-11
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@ use std::ptr::eq;
66
use clap::Parser;
77
use grebi_shared::prefix_map::PrefixMap;
88
use grebi_shared::prefix_map::PrefixMapBuilder;
9-
use grebi_shared::serialize_equivalence;
109
use struson::reader::{JsonReader, JsonStreamReader, ValueType};
1110
use serde_json::Value;
1211
use serde_json::Map;
@@ -146,16 +145,6 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n
146145

147146
}
148147

149-
const EQUIV_PREDICATES :[&str;2]= [
150-
"owl:equivalentClass",
151-
"owl:equivalentProperty",
152-
// "owl:sameAs",
153-
// "skos:exactMatch",
154-
// "oboinowl:hasAlternativeId",
155-
// "uniprot:replaces",
156-
// "iao:0100001" // -> replacement term
157-
];
158-
159148
fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool) {
160149
json.begin_array().unwrap();
161150
while json.has_next().unwrap() {

01_ingest/grebi_ingest_reactome/src/main.rs

-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@ use std::env;
77
use clap::Parser;
88
use grebi_shared::prefix_map::PrefixMap;
99
use grebi_shared::prefix_map::PrefixMapBuilder;
10-
use grebi_shared::serialize_equivalence;
1110
use serde_json::json;
1211
use serde_json::Value;
1312

01_ingest/grebi_ingest_sssom/src/main.rs

-1
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@ use std::ptr::eq;
66
use clap::Parser;
77
use grebi_shared::prefix_map::PrefixMap;
88
use grebi_shared::prefix_map::PrefixMapBuilder;
9-
use grebi_shared::serialize_equivalence;
109
use serde_json::json;
1110
use serde_yaml;
1211

01_ingest/grebi_normalise_prefixes/src/main.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ use std::{env, io};
77
use std::io::{BufRead, BufReader };
88
use std::io::{Write, BufWriter};
99

10-
use grebi_shared::{get_subject, find_strings, serialize_equivalence, json_parser, json_lexer};
10+
use grebi_shared::{get_subject, find_strings, json_parser, json_lexer};
1111
use grebi_shared::prefix_map::PrefixMap;
1212
use grebi_shared::prefix_map::PrefixMapBuilder;
1313

02_equivalences/grebi_assign_ids/src/main.rs → 02_assign_ids/grebi_assign_ids/src/main.rs

+29-22
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ use grebi_shared::find_strings;
1818
struct Args {
1919

2020
#[arg(long)]
21-
add_prefix: String, // used to prepend the subgraph name like hra_kg:g:
21+
identifier_properties:String,
2222

2323
#[arg(long)]
2424
groups_txt: String,
@@ -34,9 +34,15 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
3434
fn main() {
3535

3636
let args = Args::parse();
37-
let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();
3837

39-
let add_prefix = args.add_prefix;
38+
39+
let mut id_props:HashSet<Vec<u8>> = HashSet::new();
40+
for prop in args.identifier_properties.split(",") {
41+
id_props.insert(prop.as_bytes().to_vec());
42+
}
43+
44+
45+
let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();
4046

4147
let id_to_group:HashMap<Vec<u8>, Vec<u8>> = {
4248

@@ -95,29 +101,32 @@ fn main() {
95101
while json.peek().kind != JsonTokenType::EndObject {
96102
let prop_key = json.name();
97103

98-
if prop_key == b"id" {
99-
id = Some(json.string());
104+
// any of the IDs will do, we only need one
105+
// as all identifiers map to the same group
106+
//
107+
if id_props.contains(prop_key) {
108+
// TODO handle the same cases as the id extraction does
109+
if json.peek().kind == JsonTokenType::StartArray {
110+
json.begin_array();
111+
id = Some(json.string());
112+
} else {
113+
id = Some(json.string());
114+
}
115+
break;
100116
} else {
101117
json.value(); // skip
102118
}
103119
}
104120

105121
let group = id_to_group.get(id.unwrap());
106-
if group.is_some() {
107-
108-
// the subject mapped to an equivalence group
109-
writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
110-
writer.write_all(add_prefix.as_bytes()).unwrap();
111-
writer.write_all(group.unwrap().as_slice()).unwrap();
112-
writer.write_all("\"".as_bytes()).unwrap();
113-
} else {
114-
// the subject did not map to an equivalence group
115-
writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
116-
writer.write_all(add_prefix.as_bytes()).unwrap();
117-
writer.write_all(id.unwrap()).unwrap();
118-
writer.write_all("\"".as_bytes()).unwrap();
122+
if !group.is_some() {
123+
panic!("could not find identifier group for id: {}", String::from_utf8(id.unwrap().to_vec()).unwrap());
119124
}
120125

126+
writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
127+
writer.write_all(group.unwrap().as_slice()).unwrap();
128+
writer.write_all("\"".as_bytes()).unwrap();
129+
121130
json.rewind();
122131
while json.peek().kind != JsonTokenType::EndObject {
123132

@@ -129,7 +138,6 @@ fn main() {
129138
} else {
130139
let name_group = id_to_group.get(name);
131140
if name_group.is_some() {
132-
writer.write_all(add_prefix.as_bytes()).unwrap();
133141
writer.write_all(name_group.unwrap()).unwrap();
134142
} else {
135143
writer.write_all(name).unwrap();
@@ -140,7 +148,7 @@ fn main() {
140148
if name.eq(b"id") || preserve_fields.contains(name) {
141149
writer.write_all(json.value()).unwrap();
142150
} else {
143-
write_value(&mut writer, json.value(), &id_to_group, &add_prefix);
151+
write_value(&mut writer, json.value(), &id_to_group);
144152
}
145153
}
146154

@@ -151,7 +159,7 @@ fn main() {
151159

152160
}
153161

154-
fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>, add_prefix:&str) {
162+
fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>) {
155163

156164
let string_locations = find_strings(&value);
157165

@@ -174,7 +182,6 @@ fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&
174182

175183
let pv_group = id_to_group.get(str);
176184
if pv_group.is_some() {
177-
writer.write_all(add_prefix.as_bytes()).unwrap();
178185
writer.write_all(pv_group.unwrap()).unwrap();
179186
} else {
180187
writer.write_all(str).unwrap();

02_equivalences/grebi_extract_equivalences/Cargo.toml → 02_assign_ids/grebi_extract_identifiers/Cargo.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
[package]
2-
name = "grebi_extract_equivalences"
2+
name = "grebi_extract_identifiers"
33
version = "0.1.0"
44
edition = "2021"
55

02_equivalences/grebi_extract_equivalences/src/main.rs → 02_assign_ids/grebi_extract_identifiers/src/main.rs

+22-30
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ use std::{env, io};
77
use std::io::{BufRead, BufReader };
88
use std::io::{Write, BufWriter};
99

10-
use grebi_shared::{get_subject, find_strings, serialize_equivalence, json_parser, json_lexer};
10+
use grebi_shared::{get_subject, find_strings, json_parser, json_lexer};
1111

1212
use clap::Parser;
1313

@@ -22,7 +22,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
2222
struct Args {
2323

2424
#[arg(long)]
25-
equivalence_properties:String
25+
identifier_properties:String
2626
}
2727

2828
fn main() {
@@ -36,13 +36,13 @@ fn main() {
3636
let stdout = io::stdout().lock();
3737
let mut writer = BufWriter::new(stdout);
3838

39-
let mut equiv_props:HashSet<Vec<u8>> = HashSet::new();
39+
let mut id_props:HashSet<Vec<u8>> = HashSet::new();
4040

4141
let mut n_total = 0;
4242

4343
let args = Args::parse();
44-
for prop in args.equivalence_properties.split(",") {
45-
equiv_props.insert(prop.as_bytes().to_vec());
44+
for prop in args.identifier_properties.split(",") {
45+
id_props.insert(prop.as_bytes().to_vec());
4646
}
4747

4848
loop {
@@ -54,32 +54,15 @@ fn main() {
5454
}
5555

5656
let mut json = JsonParser::parse(&line);
57-
58-
59-
let mut id:Option<&[u8]> = None;
6057
json.begin_object();
61-
json.mark();
6258

63-
while json.peek().kind != JsonTokenType::EndObject {
64-
let name = json.name();
65-
if name.eq("id".as_bytes()) {
66-
id = Some(json.string());
67-
break;
68-
} else {
69-
json.value(); // skip
70-
}
71-
}
72-
json.rewind();
73-
74-
if id.is_none() {
75-
panic!("Missing id field in JSON: {}", String::from_utf8(line).unwrap());
76-
}
59+
let mut wrote_any = false;
7760

7861
while json.peek().kind != JsonTokenType::EndObject {
7962

8063
let k = json.name();
8164

82-
if !equiv_props.contains(k) {
65+
if !id_props.contains(k) {
8366
json.value(); // skip
8467
continue;
8568
}
@@ -88,24 +71,33 @@ fn main() {
8871
json.begin_array();
8972
while json.peek().kind != JsonTokenType::EndArray {
9073
if json.peek().kind == JsonTokenType::StartString {
91-
let serialized = serialize_equivalence(id.unwrap(), json.string());
92-
if serialized.is_some() {
93-
writer.write_all(&serialized.unwrap()).unwrap();
74+
if wrote_any {
75+
writer.write_all(b"\t").unwrap();
76+
} else {
77+
wrote_any = true;
9478
}
79+
writer.write_all(&json.string()).unwrap();
9580
} else {
9681
json.value(); // skip
9782
}
9883
}
9984
json.end_array();
10085
} else if json.peek().kind == JsonTokenType::StartString {
101-
let serialized = serialize_equivalence(id.unwrap(), json.string());
102-
if serialized.is_some() {
103-
writer.write_all(&serialized.unwrap()).unwrap();
86+
if wrote_any {
87+
writer.write_all(b"\t").unwrap();
88+
} else {
89+
wrote_any = true;
10490
}
91+
writer.write_all(&json.string()).unwrap();
10592
} else {
10693
json.value(); // skip
10794
}
10895
}
96+
if !wrote_any {
97+
panic!("no identifiers found in object {}", String::from_utf8_lossy(&line));
98+
}
99+
100+
writer.write_all(b"\n").unwrap();
109101

110102
n_total = n_total + 1;
111103

02_equivalences/grebi_equivalences2groups/Cargo.toml → 02_assign_ids/grebi_identifiers2groups/Cargo.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
[package]
2-
name = "grebi_equivalences2groups"
2+
name = "grebi_identifiers2groups"
33
version = "0.1.0"
44
edition = "2021"
55

0 commit comments

Comments
 (0)