From b39d677365e8d7e73c4b40e920583faf6da3bcb6 Mon Sep 17 00:00:00 2001
From: souzadevinicius <souzadevinicius@gmail.com>
Date: Thu, 19 Sep 2024 15:10:59 +0100
Subject: [PATCH] add hpo queries notebook

---
 notebooks/hpo.ipynb | 136 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 notebooks/hpo.ipynb

diff --git a/notebooks/hpo.ipynb b/notebooks/hpo.ipynb
new file mode 100644
index 0000000..605e285
--- /dev/null
+++ b/notebooks/hpo.ipynb
@@ -0,0 +1,136 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "from py2neo import Graph\n",
+    "from pathlib import Path\n",
+    "from pandas import DataFrame\n",
+    "\n",
+    "graph = Graph(\"bolt://localhost:8687\")\n",
+    "\n",
+    "# df = DataFrame(graph.run(\"\"\"\n",
+    "# MATCH (id:Id { id:\"chebi:5063\"})<-[:id]-(n:GraphNode)\n",
+    "# RETURN n.`grebi:name`[0] as name\n",
+    "# \"\"\").data())\n",
+    "\n",
+    "# print(df)\n",
+    "\n",
+    "\n",
+    "#Returns all HP terms. Each term node definitely corresponds to an HP term, but may ALSO correspond to an MP term due to the mappings.\n",
+    "#This also means that the relationships may come from either HP or MP.\n",
+    "#\n",
+    "df = DataFrame(graph.run(\"\"\"\n",
+    "MATCH (id:Id { id:\"hp:0000001\"})<-[:id]-(hpo_root_term:GraphNode)\n",
+    "    <-[:`biolink:broad_match`]-(term:GraphNode)\n",
+    "    -[outgoing_edge]->(n:GraphNode)\n",
+    "RETURN term.`grebi:name`[0] AS from, type(outgoing_edge) AS edge, n.`grebi:name`[0] AS to\n",
+    "\"\"\").data())\n",
+    "\n",
+    "df.to_csv(\"all_hp_all_out.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# This version of the above query filters the relationships to those asserted by HP only (not MP)\n",
+    "df = DataFrame(graph.run(\"\"\"\n",
+    "MATCH (id:Id { id:\"hp:0000001\"})<-[:id]-(hpo_root_term:GraphNode)\n",
+    "    <-[:`biolink:broad_match`]-(term:GraphNode)\n",
+    "    -[outgoing_edge]->(n:GraphNode)\n",
+    "    WHERE \"OLS.hp\" IN outgoing_edge.`grebi:datasources`\n",
+    "RETURN term.`grebi:name`[0] AS from, type(outgoing_edge) AS edge, n.`grebi:name`[0] AS to\n",
+    "\"\"\").data())\n",
+    "\n",
+    "df.to_csv(\"all_hp_all_out_hp_only.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# This version of the above query filters the relationships to those asserted by HP only (not MP)\n",
+    "# Also adds identifiers in the results\n",
+    "\n",
+    "df = DataFrame(graph.run(\"\"\"\n",
+    "MATCH (id:Id { id:\"hp:0000001\"})<-[:id]-(hpo_root_term:GraphNode)\n",
+    "    <-[:`biolink:broad_match`]-(term:GraphNode)\n",
+    "    -[outgoing_edge]->(n:GraphNode)\n",
+    "    WHERE \"OLS.hp\" IN outgoing_edge.`grebi:datasources`\n",
+    "RETURN\n",
+    "    [id in term.id WHERE id =~ \"hp:[0-9]*\" | id][0] AS from_id,\n",
+    "    term.`grebi:name`[0] AS from_label,\n",
+    "    type(outgoing_edge) AS edge,\n",
+    "    n.id AS to_ids,\n",
+    "    n.`grebi:name`[0] AS to_label\n",
+    "\"\"\").data())\n",
+    "\n",
+    "df.to_csv(\"all_hp_all_out_hp_outgoing.csv\", index=False)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# This query returns all incoming edges to all HP terms\n",
+    "# Note that the incoming edges may target either the HP terms or their equivalent MP terms\n",
+    "# and there is currently no way to differentiate!\n",
+    "# We will prob have to make two different versions of the Neo4j, one with merged mappings and one without\n",
+    "\n",
+    "df = DataFrame(graph.run(\"\"\"\n",
+    "MATCH (id:Id { id:\"hp:0000001\"})<-[:id]-(hpo_root_term:GraphNode)\n",
+    "    <-[:`biolink:broad_match`]-(term:GraphNode)\n",
+    "    <-[incoming_edge]-(n:GraphNode)\n",
+    "RETURN\n",
+    "    [id in term.id WHERE id =~ \"hp:[0-9]*\" | id][0] AS to_id,\n",
+    "    term.`grebi:name`[0] AS to_label,\n",
+    "    type(incoming_edge) AS edge,\n",
+    "    n.id AS from_ids,\n",
+    "    n.`grebi:name`[0] AS from_label\n",
+    "\"\"\").data())\n",
+    "\n",
+    "df.to_csv(\"all_hp_all_in_hp.csv\", index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}