Skip to content

Commit

Permalink
add prop and type defs to summary json, update readme
Browse files: browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Sep 4, 2024
1 parent c07e25b commit ac872aa
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 9 deletions.
69 changes: 61 additions & 8 deletions 05_materialise/grebi_materialise/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ struct Args {
out_edges_jsonl: String,

#[arg(long)]
out_edge_summary_json: String,
out_summary_json: String,

#[arg(long)]
groups_txt: String,
Expand Down Expand Up @@ -120,11 +120,15 @@ fn main() -> std::io::Result<()> {
let stdout = io::stdout().lock();
let mut nodes_writer = BufWriter::new(stdout);

let edge_summary_file = File::create(args.out_edge_summary_json).unwrap();
let mut edge_summary_writer = BufWriter::new(edge_summary_file);
let summary_file = File::create(args.out_summary_json).unwrap();
let mut summary_writer = BufWriter::new(summary_file);

let mut edge_summary:EdgeSummaryTable = HashMap::new();

let mut all_entity_props:BTreeSet<Vec<u8>> = BTreeSet::new();
let mut all_edge_props:BTreeSet<Vec<u8>> = BTreeSet::new();
let mut all_types:BTreeSet<Vec<u8>> = BTreeSet::new();

let mut n_nodes:i64 = 0;

loop {
Expand All @@ -146,8 +150,24 @@ fn main() -> std::io::Result<()> {
}

sliced.props.iter().for_each(|prop| {

let prop_key = prop.key;

if prop_key.eq(b"grebi:type") {
for val in &prop.values {
if val.kind == JsonTokenType::StartString {
let buf = &val.value.to_vec();
let str = JsonParser::parse(&buf).string();
all_types.insert(str.to_vec());
}
}

}

all_entity_props.insert(prop_key.to_vec());

for val in &prop.values {
maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &exclude_self_ref, &node_metadata, &val.datasources, sliced.subgraph, &mut edge_summary);
maybe_write_edge(sliced.id, prop, &val, &mut edges_writer, &exclude, &exclude_self_ref, &node_metadata, &val.datasources, sliced.subgraph, &mut edge_summary, &mut all_edge_props);
}
});

Expand Down Expand Up @@ -176,16 +196,45 @@ fn main() -> std::io::Result<()> {

eprintln!("materialise took {} seconds", start_time.elapsed().as_secs());

edge_summary_writer.write_all(serde_json::to_string_pretty(&json!({
let mut entity_prop_defs:Map<String,Value> = Map::new();

for prop in all_entity_props {
let def = node_metadata.get(&prop);
if def.is_some() {
entity_prop_defs.insert(String::from_utf8_lossy(&prop).to_string(), serde_json::from_slice::<Value>(def.unwrap().json.as_slice()).unwrap());
}
}

let mut edge_prop_defs:Map<String,Value> = Map::new();

for prop in all_edge_props {
let def = node_metadata.get(&prop);
if def.is_some() {
edge_prop_defs.insert(String::from_utf8_lossy(&prop).to_string(), serde_json::from_slice::<Value>(def.unwrap().json.as_slice()).unwrap());
}
}
let mut type_defs:Map<String,Value> = Map::new();

for t in all_types {
let def = node_metadata.get(&t);
if def.is_some() {
type_defs.insert(String::from_utf8_lossy(&t).to_string(), serde_json::from_slice::<Value>(def.unwrap().json.as_slice()).unwrap());
}
}

summary_writer.write_all(serde_json::to_string_pretty(&json!({
"entity_prop_defs": entity_prop_defs,
"edge_prop_defs": edge_prop_defs,
"type_defs": type_defs,
"edges": edge_summary
})).unwrap().as_bytes()).unwrap();

edge_summary_writer.flush().unwrap();
summary_writer.flush().unwrap();

Ok(())
}

fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, exclude_self_ref:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8], edge_summary: &mut EdgeSummaryTable) {
fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyValue, edges_writer: &mut BufWriter<File>, exclude:&BTreeSet<Vec<u8>>, exclude_self_ref:&BTreeSet<Vec<u8>>, node_metadata:&BTreeMap<Vec<u8>, Metadata>, datasources:&Vec<&[u8]>, subgraph:&[u8], edge_summary: &mut EdgeSummaryTable, all_edge_props: &mut BTreeSet<Vec<u8>>) {

if prop.key.eq(b"id") || prop.key.starts_with(b"grebi:") || exclude.contains(prop.key) {
return;
Expand All @@ -197,6 +246,10 @@ fn maybe_write_edge(from_id:&[u8], prop: &SlicedProperty, val:&SlicedPropertyVal

if reified.is_some() {
let reified_u = reified.unwrap();
reified_u.props.iter().for_each(|prop| {
let prop_key = prop.key.to_vec();
all_edge_props.insert(prop_key);
});
if reified_u.value_kind == JsonTokenType::StartString {
let buf = &reified_u.value.to_vec();
let str = JsonParser::parse(&buf).string();
Expand Down Expand Up @@ -332,4 +385,4 @@ fn get_type_signature_from_metadata_json(json:&Value) -> String {
.collect();
t.sort();
return t.join(",").to_string();
}
}
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ EBI Codon HPC pipeline for building integrated knowledge graphs from [EMBL-EBI r
* [Reactome](https://reactome.org/)
* [OpenTargets](https://www.opentargets.org/)
* [Metabolights](https://www.ebi.ac.uk/metabolights)
* [ChEMBL](https://www.ebi.ac.uk/chembl/)

GrEBI also imports complementary datasets, so far:

* [MONARCH Initiative KG](https://monarch-initiative.github.io/monarch-ingest/Sources/)
* [Ubergraph](https://github.com/INCATools/ubergraph)
* [Human Reference Atlas KG](https://humanatlas.io/)
* [AOPWiki](https://aopwiki.org/) (via [AOPWikiRDF](https://github.com/marvinm2/AOPWikiRDF))

The resulting graphs can be downloaded from https://ftp.ebi.ac.uk/pub/databases/spot/kg/ebi/

Expand Down
2 changes: 1 addition & 1 deletion nextflow/01_create_subgraph.nf
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ process materialise {
--in-metadata-jsonl ${metadata_jsonl} \
--groups-txt ${groups_txt} \
--out-edges-jsonl materialised_edges_${task.index}.jsonl \
--out-edge-summary-json edge_summary_${task.index}.json \
--out-summary-json edge_summary_${task.index}.json \
--exclude ${exclude.iterator().join(",")} \
--exclude-self-referential ${exclude_self_referential.iterator().join(",")} \
> materialised_nodes_${task.index}.jsonl
Expand Down

0 comments on commit ac872aa

Please sign in to comment.