Skip to content

Commit

Permalink
Add ic_scores.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Oct 8, 2024
1 parent 7d229f5 commit b07f7a3
Showing 1 changed file with 211 additions and 0 deletions.
211 changes: 211 additions & 0 deletions notebooks/ic_scores.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys, os, io, json\n",
"from neo4j import GraphDatabase\n",
"from py2neo import Graph\n",
"from pathlib import Path\n",
"from pandas import DataFrame\n",
"import pandas as pd\n",
"import networkx as nx\n",
"\n",
"# Bolt endpoint of the Neo4j instance that both clients below connect to.\n",
"NEO4J_URI = \"bolt://localhost:7687\"\n",
"\n",
"# py2neo handle; the graph.run(...) queries in this notebook go through it.\n",
"graph = Graph(NEO4J_URI)\n",
"# Official neo4j driver for the same endpoint, created with auth=None.\n",
"driver = GraphDatabase.driver(NEO4J_URI, auth=None)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3475900\n"
]
}
],
"source": [
"# Annotate ontology classes with num_desc, their number of distinct descendants,\n",
"# and keep the largest count so ic scores can be normalised against it.\n",
"\n",
"num_desc_query = \"\"\"\n",
"MATCH (ancestor:`ols:Class`)<-[:`biolink:broad_match`*1..]-(subclass:`ols:Class`)\n",
"WITH ancestor, count(DISTINCT subclass) AS num_desc\n",
"SET ancestor.num_desc = num_desc\n",
"RETURN max(num_desc) AS max_num_desc\n",
"\"\"\"\n",
"\n",
"# The pattern needs at least one incoming broad_match hop, so only classes\n",
"# that have a descendant are matched and written.\n",
"num_desc_rows = graph.run(num_desc_query).data()\n",
"df = DataFrame(num_desc_rows)\n",
"\n",
"# The aggregate comes back as a single row; read its one value.\n",
"max_num_desc = df.loc[0, 'max_num_desc']\n",
"print(max_num_desc)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# For all onto classes set ic to a normalised value based on the number of descendants:\n",
"# ic = 1 - num_desc / max_num_desc, so 0 descendants = 1 ic and\n",
"# upper level classes like owl:Thing, bfo continuant will have very low ic scores (< 0.02)\n",
"#\n",
"# The query that writes num_desc only matches classes with at least one descendant,\n",
"# so classes without descendants have a null num_desc. coalesce() maps that null to 0;\n",
"# without it the arithmetic evaluates to null and SET leaves ic unset on those classes.\n",
"\n",
"df = DataFrame(graph.run(\"\"\"\n",
"MATCH (cl:`ols:Class`)\n",
"SET cl.ic = 1.0 - (coalesce(cl.num_desc, 0)/$max_num_desc)\n",
"\"\"\", {\n",
"    # float() so the division in Cypher is floating-point rather than integer division\n",
"    'max_num_desc': float(max_num_desc)\n",
"}).data())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create an index named ic on the ic property of GraphNode nodes.\n",
"# IF NOT EXISTS keeps the cell safe to re-run once the index is already there.\n",
"# NOTE(review): ic is written on `ols:Class` nodes; this assumes those nodes also\n",
"# carry the GraphNode label - confirm.\n",
"\n",
"df = DataFrame(graph.run(\"\"\"\n",
"CREATE INDEX ic IF NOT EXISTS FOR (n:GraphNode) ON (n.ic)\n",
"\"\"\").data())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Which nodes have an IC score of less than 0.5?"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| | name | ic |\n",
"|---:|:------------------------------|------------:|\n",
"| 0 | entity | 0 |\n",
"| 1 | Thing | 2.47418e-05 |\n",
"| 2 | entity | 2.56049e-05 |\n",
"| 3 | experimental factor | 0.00913087 |\n",
"| 4 | bfo:continuant | 0.0175767 |\n",
"| 5 | bfo:independent_continuant | 0.0793748 |\n",
"| 6 | material entity | 0.0793815 |\n",
"| 7 | object | 0.0867643 |\n",
"| 8 | biological entity | 0.0906039 |\n",
"| 9 | organismal entity | 0.251327 |\n",
"| 10 | obi:organism | 0.253235 |\n",
"| 11 | obo:ncbitaxon.owl | 0.253238 |\n",
"| 12 | cellular organisms or viruses | 0.259099 |\n",
"| 13 | NCBI_taxonomy:131567 | 0.33141 |\n",
"| 14 | Archaea or Eukaryota | 0.493794 |\n",
"| 15 | Eukaryota | 0.498043 |\n"
]
}
],
"source": [
"# Name (first `grebi:name` entry) and ic of every GraphNode whose ic is below 0.5.\n",
"low_ic_query = \"\"\"\n",
"MATCH (n:GraphNode) WHERE n.ic < 0.5\n",
"RETURN n.`grebi:name`[0] AS name, n.ic AS ic\n",
"\"\"\"\n",
"\n",
"low_ic_rows = graph.run(low_ic_query).data()\n",
"df = DataFrame(low_ic_rows)\n",
"\n",
"# Printed as a markdown table so the saved output stays readable as plain text.\n",
"print(df.to_markdown())"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| | name | ic |\n",
"|---:|:---------------------------------------------------|------------:|\n",
"| 0 | entity | 0 |\n",
"| 1 | Thing | 2.47418e-05 |\n",
"| 2 | entity | 2.56049e-05 |\n",
"| 3 | experimental factor | 0.00913087 |\n",
"| 4 | bfo:continuant | 0.0175767 |\n",
"| 5 | bfo:independent_continuant | 0.0793748 |\n",
"| 6 | material entity | 0.0793815 |\n",
"| 7 | object | 0.0867643 |\n",
"| 8 | biological entity | 0.0906039 |\n",
"| 9 | organismal entity | 0.251327 |\n",
"| 10 | obi:organism | 0.253235 |\n",
"| 11 | obo:ncbitaxon.owl | 0.253238 |\n",
"| 12 | cellular organisms or viruses | 0.259099 |\n",
"| 13 | NCBI_taxonomy:131567 | 0.33141 |\n",
"| 14 | Archaea or Eukaryota | 0.493794 |\n",
"| 15 | Eukaryota | 0.498043 |\n",
"| 16 | Unikonta | 0.591569 |\n",
"| 17 | Fungi/Metazoa group | 0.592667 |\n",
"| 18 | Metazoa | 0.650527 |\n",
"| 19 | Eumetazoa | 0.652455 |\n",
"| 20 | Bilateria | 0.657087 |\n",
"| 21 | NCBI_taxonomy:33317 | 0.695427 |\n",
"| 22 | Ecdysozoa | 0.72271 |\n",
"| 23 | NCBI_taxonomy:88770 | 0.728683 |\n",
"| 24 | Arthropoda | 0.729258 |\n",
"| 25 | Mandibulata | 0.752957 |\n",
"| 26 | NCBI_taxonomy:197562 | 0.754289 |\n",
"| 27 | Viridiplantae or Bacteria or Euglenozoa or Archaea | 0.755665 |\n",
"| 28 | Viridiplantae or Archaea or Bacteria | 0.756341 |\n",
"| 29 | Viridiplantae or Bacteria or Euglenozoa | 0.759915 |\n",
"| 30 | Viridiplantae or Bacteria | 0.76059 |\n",
"| 31 | Hexapoda | 0.76777 |\n",
"| 32 | Insecta | 0.773244 |\n",
"| 33 | Dicondylia | 0.77332 |\n",
"| 34 | NCBI_taxonomy:7496 | 0.773414 |\n",
"| 35 | Fungi or Bacteria or Archaea | 0.775881 |\n",
"| 36 | NCBI_taxonomy:33340 | 0.776196 |\n",
"| 37 | Fungi or Bacteria | 0.780131 |\n",
"| 38 | Endopterygota | 0.799735 |\n"
]
}
],
"source": [
"# Name (first `grebi:name` entry) and ic of every GraphNode whose ic is below 0.8.\n",
"low_ic_query = \"\"\"\n",
"MATCH (n:GraphNode) WHERE n.ic < 0.8\n",
"RETURN n.`grebi:name`[0] AS name, n.ic AS ic\n",
"\"\"\"\n",
"\n",
"low_ic_rows = graph.run(low_ic_query).data()\n",
"df = DataFrame(low_ic_rows)\n",
"\n",
"# Printed as a markdown table so the saved output stays readable as plain text.\n",
"print(df.to_markdown())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit b07f7a3

Please sign in to comment.