From 311a2247a61d6069963e61c221dafd76d7e27e87 Mon Sep 17 00:00:00 2001 From: linkmluser Date: Tue, 11 Jun 2024 15:11:27 -0700 Subject: [PATCH 1/4] Aligning type and name --- docs/about.rst | 8 + docs/how-to/Index-caDSR.ipynb | 1026 +++++++++++++++++ docs/how-to/Use-MongoDB.ipynb | 175 ++- src/linkml_store/api/client.py | 28 +- src/linkml_store/api/collection.py | 24 +- src/linkml_store/api/config.py | 8 +- src/linkml_store/api/database.py | 51 +- .../api/stores/duckdb/duckdb_collection.py | 2 + .../api/stores/duckdb/duckdb_database.py | 5 +- .../api/stores/filesystem/__init__.py | 16 + .../filesystem/filesystem_collection.py | 142 +++ .../stores/filesystem/filesystem_database.py | 48 + src/linkml_store/cli.py | 2 +- tests/test_api/test_api.py | 71 +- tests/test_cli.py | 6 +- 15 files changed, 1496 insertions(+), 116 deletions(-) create mode 100644 docs/how-to/Index-caDSR.ipynb create mode 100644 src/linkml_store/api/stores/filesystem/__init__.py create mode 100644 src/linkml_store/api/stores/filesystem/filesystem_collection.py create mode 100644 src/linkml_store/api/stores/filesystem/filesystem_database.py diff --git a/docs/about.rst b/docs/about.rst index 89b3f9a..363e484 100644 --- a/docs/about.rst +++ b/docs/about.rst @@ -6,6 +6,14 @@ About LinkML-Store is an early effort to provide a unifying storage layer over multiple different backends, unified via LinkML schemas. +The overall goals are to provide: + +* An easier way to work with data in different forms (tabular, JSON, columnar, RDF) +* Expressive validation at scale, including full referential integrity validation +* Ability to mix and match different backends (e.g. 
DuckDB, MongoDB, Solr, ChromaDB, HDF5) +* Composability of different search indexes, including LLM textual embeddings +* LAMP-like stack for LinkML + Installation ------------ diff --git a/docs/how-to/Index-caDSR.ipynb b/docs/how-to/Index-caDSR.ipynb new file mode 100644 index 0000000..3f03cef --- /dev/null +++ b/docs/how-to/Index-caDSR.ipynb @@ -0,0 +1,1026 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# How to index the caDSR metadata element registry with LinkML-Store\n", + "\n", + "\n", + "\n" + ], + "metadata": { + "collapsed": false + }, + "id": "fc4794dd116ed21" + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "\n", + "import os\n", + "import json\n", + "path = \"cadsr/cde-json\"\n", + "objs = []\n", + "for root, dirs, files in os.walk(path):\n", + " for file in files:\n", + " if file.endswith(\".json\"):\n", + " with open(os.path.join(root, file)) as stream:\n", + " obj = json.load(stream)\n", + " objs.append(obj)\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:40:02.156361Z", + "start_time": "2024-05-12T01:39:27.413147Z" + } + }, + "id": "142993c7e60551d1" + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "data": { + "text/plain": "74229" + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(objs)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:40:02.165963Z", + "start_time": "2024-05-12T01:40:02.155292Z" + } + }, + "id": "978bf035146309c2" + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataElement:\n", + " AlternateNames: []\n", + " ClassificationSchemes: []\n", + " DataElementConcept:\n", + " ConceptualDomain:\n", + " administrativeNotes: null\n", + " beginDate: '2006-09-28'\n", + " changeDescription: null\n", + " context: CCR\n", + " 
contextVersion: '1'\n", + " createdBy: REEVESD\n", + " dateCreated: '2006-09-28'\n", + " dateModified: '2008-11-19'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 1E838B40-6636-0A25-E044-0003BA3F9857\n", + " latestVersionIndicator: 'Yes'\n", + " longName: MEASURE/INSTRUMENT TESTING\n", + " modifiedBy: REEVESD\n", + " origin: CCR:Center for Cancer Research\n", + " preferredDefinition: Process and results associated with self-reported measures\n", + " and instruments, surveys, other tools\n", + " preferredName: Person Measure/Instrument Testing\n", + " publicId: '2524082'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " ObjectClass:\n", + " Concepts:\n", + " - conceptCode: C15747\n", + " definition: Supportive care is that which helps the patient and their family\n", + " to cope with cancer and treatment of it from pre-diagnosis, through the\n", + " process of diagnosis and treatment, to cure, continuing illness or death\n", + " and into bereavement. It helps the patient to maximize the benefits of treatment\n", + " and to live as well as possible with the effects of the disease. Supportive\n", + " therapy may provide a patient with friendship, encouragement, practical\n", + " advice such as access to community resources or how to develop a more active\n", + " social life, vocational counseling, suggestions for minimizing friction\n", + " with family members, and, above all, hope that the life of the patient may\n", + " be improved. 
In all situations, supportive therapy involves the teaching\n", + " of such life skills as managing medication, learning to socialize, handling\n", + " finances, and getting a job.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Supportive Care\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2021-01-27'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: MALUMK\n", + " dateCreated: '2021-01-27'\n", + " dateModified: '2021-01-27'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: B9E8B130-3E9E-5D7D-E053-4EBD850A6CF1\n", + " latestVersionIndicator: 'Yes'\n", + " longName: C15747\n", + " modifiedBy: ONEDATA\n", + " origin: NCI Thesaurus\n", + " preferredDefinition: Supportive care is that which helps the patient and their\n", + " family to cope with cancer and treatment of it from pre-diagnosis, through\n", + " the process of diagnosis and treatment, to cure, continuing illness or death\n", + " and into bereavement. It helps the patient to maximize the benefits of treatment\n", + " and to live as well as possible with the effects of the disease. 
Supportive\n", + " therapy may provide a patient with friendship, encouragement, practical advice\n", + " such as access to community resources or how to develop a more active social\n", + " life, vocational counseling, suggestions for minimizing friction with family\n", + " members, and, above all, hope that the life of the patient may be improved.\n", + " In all situations, supportive therapy involves the teaching of such life skills\n", + " as managing medication, learning to socialize, handling finances, and getting\n", + " a job.\n", + " preferredName: Supportive Care\n", + " publicId: '7559951'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " Property:\n", + " Concepts:\n", + " - conceptCode: C177230\n", + " definition: A question about how strongly an individual agrees that they would\n", + " feel a specified emotion when they hear the term \"palliative care\".\n", + " displayOrder: '3'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: What I Feel When I Hear Palliative Care\n", + " primaryIndicator: 'No'\n", + " - conceptCode: C77960\n", + " definition: A feeling of apprehension that one may be in danger.\n", + " displayOrder: '2'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Fear\n", + " primaryIndicator: 'No'\n", + " - conceptCode: C25369\n", + " definition: The act of agreeing to a plan, protocol, or arrangement. 
Also\n", + " used to describe a harmony of opinion.\n", + " displayOrder: '1'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Agreement\n", + " primaryIndicator: 'No'\n", + " - conceptCode: C25664\n", + " definition: An ordered reference standard used to measure incremental changes.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Scale\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2021-01-29'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: MALUMK\n", + " dateCreated: '2021-01-29'\n", + " dateModified: '2021-01-29'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: BA115178-92C1-5D90-E053-4EBD850A955D\n", + " latestVersionIndicator: 'Yes'\n", + " longName: C177230:C77960:C25369:C25664\n", + " modifiedBy: ONEDATA\n", + " origin: null\n", + " preferredDefinition: A question about how strongly an individual agrees that\n", + " they would feel a specified emotion when they hear the term \"palliative care\".:A\n", + " feeling of apprehension that one may be in danger.:The act of agreeing to\n", + " a plan, protocol, or arrangement. Also used to describe a harmony of opinion.:An\n", + " ordered reference standard used to measure incremental changes.\n", + " preferredName: What I Feel When I Hear Palliative Care Fear Agreement Scale\n", + " publicId: '7571386'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " administrativeNotes: null\n", + " beginDate: '2021-01-29'\n", + " changeDescription: Released. 
03/18/2021 KMM\n", + " context: NHLBI\n", + " contextVersion: '1'\n", + " createdBy: MALUMK\n", + " dateCreated: '2021-01-29'\n", + " dateModified: '2021-03-18'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: BA115178-92D2-5D90-E053-4EBD850A955D\n", + " latestVersionIndicator: 'Yes'\n", + " longName: 7559951v1.0:7571386v1.0\n", + " modifiedBy: MALUMK\n", + " origin: NMDP:National Marrow Donor Program\n", + " preferredDefinition: Supportive care is that which helps the patient and their\n", + " family to cope with cancer and treatment of it from pre-diagnosis, through the\n", + " process of diagnosis and treatment, to cure, continuing illness or death and\n", + " into bereavement. It helps the patient to maximize the benefits of treatment\n", + " and to live as well as possible with the effects of the disease. Supportive\n", + " therapy may provide a patient with friendship, encouragement, practical advice\n", + " such as access to community resources or how to develop a more active social\n", + " life, vocational counseling, suggestions for minimizing friction with family\n", + " members, and, above all, hope that the life of the patient may be improved.\n", + " In all situations, supportive therapy involves the teaching of such life skills\n", + " as managing medication, learning to socialize, handling finances, and getting\n", + " a job._A question about how strongly an individual agrees that they would feel\n", + " a specified emotion when they hear the term \"palliative care\"._A feeling of\n", + " apprehension that one may be in danger._The act of agreeing to a plan, protocol,\n", + " or arrangement. 
Also used to describe a harmony of opinion._An ordered reference\n", + " standard used to measure incremental changes.\n", + " preferredName: Supportive Care When I Hear The Term Palliative Care, I Feel Fear\n", + " Agreement Scale\n", + " publicId: '7571388'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " ReferenceDocuments:\n", + " - context: NHLBI\n", + " description: I feel fear when hearing the term palliative care.\n", + " name: I feel fear when hearing the\n", + " type: Preferred Question Text\n", + " url: null\n", + " - context: NHLBI\n", + " description: When I hear the term palliative care, I feel scared\n", + " name: When I hear the term palliative care, I feel scared\n", + " type: Alternate Question Text\n", + " url: null\n", + " - context: NHLBI\n", + " description: When I hear the term palliative care, I feel scared\n", + " name: When I hear the term palliative care, I feel scared\n", + " type: Application Standard Question Text\n", + " url: null\n", + " - context: NHLBI\n", + " description: When I hear the term palliative care, I feel scared.\n", + " name: When I hear the term palliative care, I feel scared.\n", + " type: Alternate Question Text\n", + " url: null\n", + " ValueDomain:\n", + " ConceptualDomain:\n", + " administrativeNotes: null\n", + " beginDate: '2006-09-28'\n", + " changeDescription: null\n", + " context: CCR\n", + " contextVersion: '1'\n", + " createdBy: REEVESD\n", + " dateCreated: '2006-09-28'\n", + " dateModified: '2008-11-19'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 1E838B40-6636-0A25-E044-0003BA3F9857\n", + " latestVersionIndicator: 'Yes'\n", + " longName: MEASURE/INSTRUMENT TESTING\n", + " modifiedBy: REEVESD\n", + " origin: CCR:Center for Cancer Research\n", + " preferredDefinition: Process and results associated with self-reported measures\n", + " and instruments, surveys, other tools\n", + " preferredName: Person 
Measure/Instrument Testing\n", + " publicId: '2524082'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " PermissibleValues:\n", + " - ValueMeaning:\n", + " Concepts:\n", + " - conceptCode: C104476\n", + " definition: Intensely at odds.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Strongly Disagree\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2013-02-07'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: COOPERM\n", + " dateCreated: '2013-02-07'\n", + " dateModified: '2018-01-19'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: D52DC623-4AA8-9139-E040-BB89AD4368ED\n", + " latestVersionIndicator: 'Yes'\n", + " longName: '3682710'\n", + " modifiedBy: SBR\n", + " origin: null\n", + " preferredDefinition: Intensely at odds.\n", + " preferredName: Strongly Disagree\n", + " publicId: '3682710'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " beginDate: '2013-02-07'\n", + " createdBy: TSESU\n", + " dateCreated: '2017-09-07'\n", + " dateModified: '2017-09-07'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 58A301BB-F2B8-325F-E053-F662850AD34F\n", + " modifiedBy: ONEDATA\n", + " origin: null\n", + " value: Strongly Disagree\n", + " valueDescription: Strongly Disagree\n", + " - ValueMeaning:\n", + " Concepts:\n", + " - conceptCode: C104479\n", + " definition: To be at odds.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Disagree\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2013-02-07'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: COOPERM\n", + " dateCreated: '2013-02-07'\n", + " dateModified: '2018-01-19'\n", + " 
deletedIndicator: 'No'\n", + " endDate: null\n", + " id: D52DC623-4ACB-9139-E040-BB89AD4368ED\n", + " latestVersionIndicator: 'Yes'\n", + " longName: '3682711'\n", + " modifiedBy: SBR\n", + " origin: null\n", + " preferredDefinition: To be at odds.\n", + " preferredName: Disagree\n", + " publicId: '3682711'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " beginDate: '2013-02-07'\n", + " createdBy: TSESU\n", + " dateCreated: '2017-09-07'\n", + " dateModified: '2017-09-07'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 58A301BB-F2C2-325F-E053-F662850AD34F\n", + " modifiedBy: ONEDATA\n", + " origin: null\n", + " value: Disagree\n", + " valueDescription: Disagree\n", + " - ValueMeaning:\n", + " Concepts:\n", + " - conceptCode: C103804\n", + " definition: Without concurrence or opposition.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Neither Agree or Disagree\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2013-02-07'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: COOPERM\n", + " dateCreated: '2013-02-07'\n", + " dateModified: '2023-12-21'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: D52DC623-4AF0-9139-E040-BB89AD4368ED\n", + " latestVersionIndicator: 'Yes'\n", + " longName: '3682712'\n", + " modifiedBy: MMADDINENI\n", + " origin: null\n", + " preferredDefinition: Without concurrence or opposition.\n", + " preferredName: Neither Agree or Disagree\n", + " publicId: '3682712'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " beginDate: '2013-02-07'\n", + " createdBy: TSESU\n", + " dateCreated: '2017-09-07'\n", + " dateModified: '2017-09-07'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 58A301BB-F2CC-325F-E053-F662850AD34F\n", + 
" modifiedBy: ONEDATA\n", + " origin: null\n", + " value: No Opinion\n", + " valueDescription: Neither Agree or Disagree\n", + " - ValueMeaning:\n", + " Concepts:\n", + " - conceptCode: C104478\n", + " definition: To concur.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Agree\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2013-02-07'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: COOPERM\n", + " dateCreated: '2013-02-07'\n", + " dateModified: '2018-01-19'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: D52DC623-4B13-9139-E040-BB89AD4368ED\n", + " latestVersionIndicator: 'Yes'\n", + " longName: '3682713'\n", + " modifiedBy: SBR\n", + " origin: null\n", + " preferredDefinition: To concur.\n", + " preferredName: Agree\n", + " publicId: '3682713'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " beginDate: '2013-02-07'\n", + " createdBy: TSESU\n", + " dateCreated: '2017-09-07'\n", + " dateModified: '2017-09-07'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 58A301BB-F2D6-325F-E053-F662850AD34F\n", + " modifiedBy: ONEDATA\n", + " origin: null\n", + " value: Agree\n", + " valueDescription: Agree\n", + " - ValueMeaning:\n", + " Concepts:\n", + " - conceptCode: C104475\n", + " definition: Intense concurrence.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Strongly Agree\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2013-02-07'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: COOPERM\n", + " dateCreated: '2013-02-07'\n", + " dateModified: '2018-01-19'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: D52DC623-4B36-9139-E040-BB89AD4368ED\n", + " latestVersionIndicator: 'Yes'\n", + " longName: 
'3682714'\n", + " modifiedBy: SBR\n", + " origin: null\n", + " preferredDefinition: Intense concurrence.\n", + " preferredName: Strongly Agree\n", + " publicId: '3682714'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " beginDate: '2013-02-07'\n", + " createdBy: TSESU\n", + " dateCreated: '2017-09-07'\n", + " dateModified: '2017-09-07'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 58A301BB-F2E0-325F-E053-F662850AD34F\n", + " modifiedBy: ONEDATA\n", + " origin: null\n", + " value: Strongly Agree\n", + " valueDescription: Strongly Agree\n", + " RepresentationTerm:\n", + " Concepts:\n", + " - conceptCode: C25664\n", + " definition: An ordered reference standard used to measure incremental changes.\n", + " displayOrder: '0'\n", + " evsSource: NCI_CONCEPT_CODE\n", + " longName: Scale\n", + " primaryIndicator: 'Yes'\n", + " administrativeNotes: null\n", + " beginDate: '2006-02-24'\n", + " changeDescription: null\n", + " context: NCIP\n", + " contextVersion: '1'\n", + " createdBy: CAMPBELB\n", + " dateCreated: '2006-02-24'\n", + " dateModified: '2006-02-24'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 0D51E8D8-69DD-387C-E044-0003BA3F9857\n", + " latestVersionIndicator: 'Yes'\n", + " longName: C25664\n", + " modifiedBy: ONEDATA\n", + " origin: NCI Thesaurus\n", + " preferredDefinition: an ordered reference standard.\n", + " preferredName: Scale\n", + " publicId: '2452834'\n", + " registrationStatus: Standard\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n", + " administrativeNotes: null\n", + " beginDate: '2017-09-21'\n", + " changeDescription: 'Created new version to meet BP requirements: smt 9/7/17'\n", + " context: NCI Standards\n", + " contextVersion: '1'\n", + " createdBy: TSESU\n", + " dataType: CHARACTER\n", + " dateCreated: '2017-09-07'\n", + " dateModified: '2017-09-21'\n", + " decimalPlace: null\n", + " 
deletedIndicator: 'No'\n", + " endDate: null\n", + " id: 58A301BB-F2A1-325F-E053-F662850AD34F\n", + " latestVersionIndicator: 'Yes'\n", + " longName: AGMT_5PT_SCL\n", + " maxLength: '20'\n", + " maxValue: null\n", + " minLength: null\n", + " minValue: null\n", + " modifiedBy: TSESU\n", + " origin: MDADI:M.D.Anderson Dysphagia Inventory, Amy Y.Chen, 2000\n", + " preferredDefinition: A number with no fractional part, including the negative\n", + " and positive numbers as well as zero.::5_The precise location of something;\n", + " a spatially limited location._an ordered reference standard.\n", + " preferredName: Agreement 5 Point Likert Scale\n", + " publicId: '3682709'\n", + " registrationStatus: Standard\n", + " type: Enumerated\n", + " unresolvedIssues: null\n", + " version: '2'\n", + " workflowStatus: RELEASED\n", + " administrativeNotes: null\n", + " beginDate: '2021-01-29'\n", + " changeDescription: Released. 03/18/2021 KMM; System generated def displayed as alt\n", + " def.\n", + " context: NHLBI\n", + " contextVersion: '1'\n", + " createdBy: MALUMK\n", + " dateCreated: '2021-01-29'\n", + " dateModified: '2021-03-18'\n", + " deletedIndicator: 'No'\n", + " endDate: null\n", + " id: BA1184FA-0CD9-5906-E053-4EBD850A7F9F\n", + " latestVersionIndicator: 'Yes'\n", + " longName: 7571388v1.0:3682709v2.0\n", + " modifiedBy: MALUMK\n", + " origin: NMDP:National Marrow Donor Program\n", + " preferredDefinition: A person's agreement with a statement related to feeling fear\n", + " when hearing the term palliative care using a five-point Likert scale.\n", + " preferredName: Supportive Care When I Hear the Term Palliative Care, I Feel Fear\n", + " Agreement 5 Point Likert Scale\n", + " publicId: '7571389'\n", + " registrationStatus: Application\n", + " unresolvedIssues: null\n", + " version: '1'\n", + " workflowStatus: RELEASED\n" + ] + } + ], + "source": [ + "import yaml\n", + "print(yaml.dump(objs[1]))" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + 
"end_time": "2024-05-12T01:41:17.946491Z", + "start_time": "2024-05-12T01:41:17.916594Z" + } + }, + "id": "2b173220f17a40bd" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "bb4cd8e2e5fccbe9" + }, + { + "cell_type": "markdown", + "source": [ + "## Creating a client and attaching to a database\n", + "\n", + "First we will create a client as normal:" + ], + "metadata": { + "collapsed": false + }, + "id": "493c7599d2f40c27" + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "from linkml_store import Client\n", + "\n", + "client = Client()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:53:21.953731Z", + "start_time": "2024-05-12T01:53:20.116792Z" + } + }, + "id": "initial_id" + }, + { + "cell_type": "markdown", + "source": [ + "Next we'll attach to a MongoDB instance. This assumes you have one running already." + ], + "metadata": { + "collapsed": false + }, + "id": "470f1cb70bf3641b" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "db = client.attach_database(\"mongodb://localhost:27017\", \"cadsr\", recreate_if_exists=True)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:53:30.201194Z", + "start_time": "2024-05-12T01:53:30.191163Z" + } + }, + "id": "cc164c0acbe4c39d" + }, + { + "cell_type": "markdown", + "source": [ + "## Creating a collection\n", + "\n", + "We'll create a simple test collection. 
The concept of a collection in linkml-store maps directly to MongoDB collections." + ], + "metadata": { + "collapsed": false + }, + "id": "334ea2ced79828f7" + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "collapsed": false + }, + "id": "a0a98c5a5c9f0072" + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [], + "source": [ + "collection = db.create_collection(\"cdes\", recreate_if_exists=True)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:53:38.324488Z", + "start_time": "2024-05-12T01:53:38.270331Z" + } + }, + "id": "c3a79013f9359a9" + }, + { + "cell_type": "markdown", + "source": [ + "## Loading" + ], + "metadata": { + "collapsed": false + }, + "id": "207f35ee61edc14d" + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [], + "source": [ + "collection.insert(objs)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:54:00.474166Z", + "start_time": "2024-05-12T01:53:42.606475Z" + } + }, + "id": "4a09a78fe3c8dc33" + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": "74229" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "collection.find({}, limit=1).num_rows" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:54:23.412643Z", + "start_time": "2024-05-12T01:54:00.473836Z" + } + }, + "id": "f505fdc8cc20196e" + }, + { + "cell_type": "markdown", + "source": [ + "Let's check with pandas just to make sure it looks as expected:" + ], + "metadata": { + "collapsed": false + }, + "id": "90e2e9793375431f" + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "data": { + "text/plain": " DataElement\n0 {'publicId': '2869761', 'version': '1', 'prefe...\n1 {'publicId': '7571389', 'version': '1', 'prefe...\n2 {'publicId': '2773112', 'version': '1', 'prefe...\n3 {'publicId': 
'2971930', 'version': '1', 'prefe...\n4 {'publicId': '7637945', 'version': '1', 'prefe...\n... ...\n74224 {'publicId': '4561278', 'version': '1', 'prefe...\n74225 {'publicId': '7787595', 'version': '1', 'prefe...\n74226 {'publicId': '6703581', 'version': '1', 'prefe...\n74227 {'publicId': '2220287', 'version': '1', 'prefe...\n74228 {'publicId': '7736664', 'version': '1', 'prefe...\n\n[74229 rows x 1 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DataElement
0{'publicId': '2869761', 'version': '1', 'prefe...
1{'publicId': '7571389', 'version': '1', 'prefe...
2{'publicId': '2773112', 'version': '1', 'prefe...
3{'publicId': '2971930', 'version': '1', 'prefe...
4{'publicId': '7637945', 'version': '1', 'prefe...
......
74224{'publicId': '4561278', 'version': '1', 'prefe...
74225{'publicId': '7787595', 'version': '1', 'prefe...
74226{'publicId': '6703581', 'version': '1', 'prefe...
74227{'publicId': '2220287', 'version': '1', 'prefe...
74228{'publicId': '7736664', 'version': '1', 'prefe...
\n

74229 rows × 1 columns

\n
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qr = collection.find({}, limit=3)\n", + "qr.rows_dataframe" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T01:55:50.464416Z", + "start_time": "2024-05-12T01:55:31.337058Z" + } + }, + "id": "e763fe6cd50022e2" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "qr.rows[1]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-12T02:47:51.405447Z", + "start_time": "2024-05-12T02:47:51.404058Z" + } + }, + "id": "4c688db0600f8d57" + }, + { + "cell_type": "markdown", + "source": [ + "## Semantic Search\n", + "\n", + "We will index phenopackets using a template that extracts the subject, phenotypic features and diseases." + ], + "metadata": { + "collapsed": false + }, + "id": "648f05e75f250221" + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [], + "source": [ + "template = \"\"\"\n", + "subject: {{subject}}\n", + "phenotypes: {% for p in phenotypicFeatures %}{{p.type.label}}{% endfor %}\n", + "diseases: {% for d in diseases %}{{d.term.label}}{% endfor %}\n", + "\"\"\"" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-10T16:05:35.047572Z", + "start_time": "2024-05-10T16:05:35.045279Z" + } + }, + "id": "976095541027ce9e" + }, + { + "cell_type": "code", + "execution_count": 12, + "outputs": [], + "source": [ + "from linkml_store.index.implementations.llm_indexer import LLMIndexer\n", + "\n", + "index = LLMIndexer(\n", + " name=\"ppkt\", \n", + " cached_embeddings_database=\"tmp/llm_pheno_cache.db\",\n", + " text_template=template,\n", + " text_template_syntax=\"jinja2\",\n", + ")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-10T16:05:35.053303Z", + "start_time": "2024-05-10T16:05:35.047842Z" + } + }, + "id": "e98f9d6eb4a5e385" + }, + { + "cell_type": "code", + 
"execution_count": 13, + "outputs": [ + { + "data": { + "text/plain": "\"\\nsubject: {'id': 'Higgins-Patient-1', 'timeAtLastEncounter': {'age': {'iso8601duration': 'P17Y'}}, 'sex': 'FEMALE'}\\nphenotypes: Ventricular hypertrophyHeart murmurHypertrophic cardiomyopathyShort statureHypertelorismLow-set earsPosteriorly rotated earsGlobal developmental delayCognitive impairmentCardiac arrest\\ndiseases: Noonan syndrome-11\"" + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.object_to_text(qr.rows[0])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-10T16:05:35.059305Z", + "start_time": "2024-05-10T16:05:35.055942Z" + } + }, + "id": "16dce837e31c88f6" + }, + { + "cell_type": "code", + "execution_count": 14, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/duckdb_engine/__init__.py:580: SAWarning: Did not recognize type 'list' of column 'embedding'\n", + " columns = self._get_columns_info(rows, domains, enums, schema) # type: ignore[attr-defined]\n", + "/Users/cjm/Library/Caches/pypoetry/virtualenvs/linkml-store-8ZYO4kTy-py3.10/lib/python3.10/site-packages/duckdb_engine/__init__.py:173: DuckDBEngineWarning: duckdb-engine doesn't yet support reflection on indices\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "collection.attach_indexer(index, auto_index=True)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-10T16:13:07.803423Z", + "start_time": "2024-05-10T16:05:35.059166Z" + } + }, + "id": "18a0bd86de7f1d81" + }, + { + "cell_type": "markdown", + "source": [ + "## Queries\n", + "\n", + "We can specify key-value constraints:" + ], + "metadata": { + "collapsed": false + }, + "id": "f49056b209918a9" + }, + { + "cell_type": "code", + "execution_count": 16, + "outputs": [ + { + "data": { + 
"text/plain": " score id \\\n0 0.794360 PMID_30658709_patient \n1 0.786465 PMID_37303127_6 \n2 0.785974 PMID_22508010_22508010_P1 \n3 0.785179 PMID_27536553_27536553_P3 \n4 0.781917 PMID_27536553_27536553_P2 \n5 0.778760 PMID_25129007_25129007_P1 \n6 0.776784 PMID_24894789_24894789_P1 \n7 0.776577 PMID_27536553_27536553_P1 \n8 0.776096 PMID_34023347_34023347_P1 \n9 0.775122 PMID_28209105_28209105_P1 \n\n subject \\\n0 {'id': 'patient', 'timeAtLastEncounter': {'age... \n1 {'id': '6', 'timeAtLastEncounter': {'age': {'i... \n2 {'id': '22508010_P1', 'timeAtLastEncounter': {... \n3 {'id': '27536553_P3', 'timeAtLastEncounter': {... \n4 {'id': '27536553_P2', 'timeAtLastEncounter': {... \n5 {'id': '25129007_P1', 'timeAtLastEncounter': {... \n6 {'id': '24894789_P1', 'timeAtLastEncounter': {... \n7 {'id': '27536553_P1', 'timeAtLastEncounter': {... \n8 {'id': '34023347_P1', 'timeAtLastEncounter': {... \n9 {'id': '28209105_P1', 'timeAtLastEncounter': {... \n\n phenotypicFeatures \\\n0 [{'type': {'id': 'HP:0031956', 'label': 'Eleva... \n1 [{'type': {'id': 'HP:0001397', 'label': 'Hepat... \n2 [{'type': {'id': 'HP:0006562', 'label': 'Viral... \n3 [{'type': {'id': 'HP:0001396', 'label': 'Chole... \n4 [{'type': {'id': 'HP:0001396', 'label': 'Chole... \n5 [{'type': {'id': 'HP:0000952', 'label': 'Jaund... \n6 [{'type': {'id': 'HP:0000952', 'label': 'Jaund... \n7 [{'type': {'id': 'HP:0001396', 'label': 'Chole... \n8 [{'type': {'id': 'HP:0006554', 'label': 'Acute... \n9 [{'type': {'id': 'HP:0001508', 'label': 'Failu... \n\n interpretations \\\n0 [{'id': 'patient', 'progressStatus': 'SOLVED',... \n1 [{'id': '6', 'progressStatus': 'SOLVED', 'diag... \n2 [{'id': '22508010_P1', 'progressStatus': 'SOLV... \n3 [{'id': '27536553_P3', 'progressStatus': 'SOLV... \n4 [{'id': '27536553_P2', 'progressStatus': 'SOLV... \n5 [{'id': '25129007_P1', 'progressStatus': 'SOLV... \n6 [{'id': '24894789_P1', 'progressStatus': 'SOLV... \n7 [{'id': '27536553_P1', 'progressStatus': 'SOLV... 
\n8 [{'id': '34023347_P1', 'progressStatus': 'SOLV... \n9 [{'id': '28209105_P1', 'progressStatus': 'SOLV... \n\n diseases \\\n0 [{'term': {'id': 'OMIM:615878', 'label': 'Chol... \n1 [{'term': {'id': 'OMIM:151660', 'label': 'Lipo... \n2 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n3 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n4 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n5 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n6 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n7 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n8 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n9 [{'term': {'id': 'OMIM:256810', 'label': 'Mito... \n\n metaData \n0 {'created': '2024-05-05T09:03:25.388371944Z', ... \n1 {'created': '2024-03-23T17:41:42.999521017Z', ... \n2 {'created': '2024-03-23T19:28:35.860860824Z', ... \n3 {'created': '2024-03-23T19:28:35.688389062Z', ... \n4 {'created': '2024-03-23T19:28:35.674263954Z', ... \n5 {'created': '2024-03-23T19:28:36.169033050Z', ... \n6 {'created': '2024-03-23T19:28:36.148879051Z', ... \n7 {'created': '2024-03-23T19:28:35.665367126Z', ... \n8 {'created': '2024-03-23T19:28:35.581506967Z', ... \n9 {'created': '2024-03-23T19:28:35.655704975Z', ... ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
scoreidsubjectphenotypicFeaturesinterpretationsdiseasesmetaData
00.794360PMID_30658709_patient{'id': 'patient', 'timeAtLastEncounter': {'age...[{'type': {'id': 'HP:0031956', 'label': 'Eleva...[{'id': 'patient', 'progressStatus': 'SOLVED',...[{'term': {'id': 'OMIM:615878', 'label': 'Chol...{'created': '2024-05-05T09:03:25.388371944Z', ...
10.786465PMID_37303127_6{'id': '6', 'timeAtLastEncounter': {'age': {'i...[{'type': {'id': 'HP:0001397', 'label': 'Hepat...[{'id': '6', 'progressStatus': 'SOLVED', 'diag...[{'term': {'id': 'OMIM:151660', 'label': 'Lipo...{'created': '2024-03-23T17:41:42.999521017Z', ...
20.785974PMID_22508010_22508010_P1{'id': '22508010_P1', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0006562', 'label': 'Viral...[{'id': '22508010_P1', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:35.860860824Z', ...
30.785179PMID_27536553_27536553_P3{'id': '27536553_P3', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0001396', 'label': 'Chole...[{'id': '27536553_P3', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:35.688389062Z', ...
40.781917PMID_27536553_27536553_P2{'id': '27536553_P2', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0001396', 'label': 'Chole...[{'id': '27536553_P2', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:35.674263954Z', ...
50.778760PMID_25129007_25129007_P1{'id': '25129007_P1', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0000952', 'label': 'Jaund...[{'id': '25129007_P1', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:36.169033050Z', ...
60.776784PMID_24894789_24894789_P1{'id': '24894789_P1', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0000952', 'label': 'Jaund...[{'id': '24894789_P1', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:36.148879051Z', ...
70.776577PMID_27536553_27536553_P1{'id': '27536553_P1', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0001396', 'label': 'Chole...[{'id': '27536553_P1', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:35.665367126Z', ...
80.776096PMID_34023347_34023347_P1{'id': '34023347_P1', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0006554', 'label': 'Acute...[{'id': '34023347_P1', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:35.581506967Z', ...
90.775122PMID_28209105_28209105_P1{'id': '28209105_P1', 'timeAtLastEncounter': {...[{'type': {'id': 'HP:0001508', 'label': 'Failu...[{'id': '28209105_P1', 'progressStatus': 'SOLV...[{'term': {'id': 'OMIM:256810', 'label': 'Mito...{'created': '2024-03-23T19:28:35.655704975Z', ...
\n
" + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qr = collection.search(\"older males with liver disease\")\n", + "qr.rows_dataframe[0:10]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-10T16:13:27.566268Z", + "start_time": "2024-05-10T16:13:26.277769Z" + } + }, + "id": "1ddd4ac75719342d" + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [ + { + "data": { + "text/plain": "(0.7943603537606876,\n {'id': 'PMID_30658709_patient',\n 'subject': {'id': 'patient',\n 'timeAtLastEncounter': {'age': {'iso8601duration': 'P1Y11M'}},\n 'sex': 'FEMALE'},\n 'phenotypicFeatures': [{'type': {'id': 'HP:0031956',\n 'label': 'Elevated circulating aspartate aminotransferase concentration'},\n 'onset': {'age': {'iso8601duration': 'P1Y11M'}}},\n {'type': {'id': 'HP:0031964',\n 'label': 'Elevated circulating alanine aminotransferase concentration'},\n 'onset': {'age': {'iso8601duration': 'P1Y11M'}}},\n {'type': {'id': 'HP:0003573', 'label': 'Increased total bilirubin'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0012202',\n 'label': 'Increased serum bile acid concentration'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0002908', 'label': 'Conjugated hyperbilirubinemia'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0001433', 'label': 'Hepatosplenomegaly'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0001510', 'label': 'Growth delay'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0000989', 'label': 'Pruritus'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0000952', 'label': 'Jaundice'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0100810', 'label': 'Pointed helix'},\n 'onset': {'age': {'iso8601duration': 'P6M'}}},\n {'type': {'id': 'HP:0002650', 'label': 'Scoliosis'}},\n {'type': {'id': 
'HP:0003112',\n 'label': 'Abnormal circulating amino acid concentration'},\n 'excluded': True},\n {'type': {'id': 'HP:0001928', 'label': 'Abnormality of coagulation'},\n 'excluded': True},\n {'type': {'id': 'HP:0010701', 'label': 'Abnormal immunoglobulin level'},\n 'excluded': True},\n {'type': {'id': 'HP:0001627', 'label': 'Abnormal heart morphology'},\n 'excluded': True}],\n 'interpretations': [{'id': 'patient',\n 'progressStatus': 'SOLVED',\n 'diagnosis': {'disease': {'id': 'OMIM:615878',\n 'label': 'Cholestasis, progressive familial intrahepatic 4'},\n 'genomicInterpretations': [{'subjectOrBiosampleId': 'patient',\n 'interpretationStatus': 'CAUSATIVE',\n 'variantInterpretation': {'variationDescriptor': {'id': 'var_kKNGnjOxGXMbcoWzDGEJKVPIB',\n 'geneContext': {'valueId': 'HGNC:11828', 'symbol': 'TJP2'},\n 'expressions': [{'syntax': 'hgvs.c',\n 'value': 'NM_004817.4:c.2355+1G>C'},\n {'syntax': 'hgvs.g', 'value': 'NC_000009.12:g.69238790G>C'}],\n 'vcfRecord': {'genomeAssembly': 'hg38',\n 'chrom': 'chr9',\n 'pos': '69238790',\n 'ref': 'G',\n 'alt': 'C'},\n 'moleculeContext': 'genomic',\n 'allelicState': {'id': 'GENO:0000136', 'label': 'homozygous'}}}}]}}],\n 'diseases': [{'term': {'id': 'OMIM:615878',\n 'label': 'Cholestasis, progressive familial intrahepatic 4'},\n 'onset': {'ontologyClass': {'id': 'HP:0003593',\n 'label': 'Infantile onset'}}}],\n 'metaData': {'created': '2024-05-05T09:03:25.388371944Z',\n 'createdBy': 'ORCID:0000-0002-0736-9199',\n 'resources': [{'id': 'geno',\n 'name': 'Genotype Ontology',\n 'url': 'http://purl.obolibrary.org/obo/geno.owl',\n 'version': '2022-03-05',\n 'namespacePrefix': 'GENO',\n 'iriPrefix': 'http://purl.obolibrary.org/obo/GENO_'},\n {'id': 'hgnc',\n 'name': 'HUGO Gene Nomenclature Committee',\n 'url': 'https://www.genenames.org',\n 'version': '06/01/23',\n 'namespacePrefix': 'HGNC',\n 'iriPrefix': 'https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/'},\n {'id': 'omim',\n 'name': 'An Online Catalog of Human Genes and 
Genetic Disorders',\n 'url': 'https://www.omim.org',\n 'version': 'January 4, 2023',\n 'namespacePrefix': 'OMIM',\n 'iriPrefix': 'https://www.omim.org/entry/'},\n {'id': 'so',\n 'name': 'Sequence types and features ontology',\n 'url': 'http://purl.obolibrary.org/obo/so.obo',\n 'version': '2021-11-22',\n 'namespacePrefix': 'SO',\n 'iriPrefix': 'http://purl.obolibrary.org/obo/SO_'},\n {'id': 'hp',\n 'name': 'human phenotype ontology',\n 'url': 'http://purl.obolibrary.org/obo/hp.owl',\n 'version': '2024-04-26',\n 'namespacePrefix': 'HP',\n 'iriPrefix': 'http://purl.obolibrary.org/obo/HP_'}],\n 'phenopacketSchemaVersion': '2.0',\n 'externalReferences': [{'id': 'PMID:30658709',\n 'reference': 'https://pubmed.ncbi.nlm.nih.gov/30658709',\n 'description': 'Novel compound heterozygote mutations of TJP2 in a Chinese child with progressive cholestatic liver disease'}]}})" + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qr.ranked_rows[0]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-10T16:43:05.712314Z", + "start_time": "2024-05-10T16:43:05.706782Z" + } + }, + "id": "5a4fd8fe217fdf6b" + }, + { + "cell_type": "markdown", + "source": [ + "## Validation\n", + "\n", + "__TODO__ " + ], + "metadata": { + "collapsed": false + }, + "id": "41a14e7976a923b3" + }, + { + "cell_type": "code", + "execution_count": 15, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-10T16:13:08.974163Z", + "start_time": "2024-05-10T16:13:08.970774Z" + } + }, + "id": "5294ee7927a372f1" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": 
"2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/how-to/Use-MongoDB.ipynb b/docs/how-to/Use-MongoDB.ipynb index d891e9d..eadaf0e 100644 --- a/docs/how-to/Use-MongoDB.ipynb +++ b/docs/how-to/Use-MongoDB.ipynb @@ -43,8 +43,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.138754Z", - "start_time": "2024-05-04T04:31:30.984678Z" + "end_time": "2024-06-11T22:10:41.592200Z", + "start_time": "2024-06-11T22:10:40.322078Z" } }, "id": "initial_id" @@ -69,12 +69,62 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.143001Z", - "start_time": "2024-05-04T04:31:32.139616Z" + "end_time": "2024-06-11T22:10:41.595610Z", + "start_time": "2024-06-11T22:10:41.592760Z" } }, "id": "cc164c0acbe4c39d" }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "data": { + "text/plain": "'mongodb://localhost:27017'" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.handle" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-06-11T22:10:41.599804Z", + "start_time": "2024-06-11T22:10:41.597005Z" + } + }, + "id": "cb437a6dd0f73ec8" + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": "'{\"handle\":\"mongodb://localhost:27017\",\"alias\":\"test\",\"schema_location\":null,\"schema_dict\":null,\"collections\":{},\"recreate_if_exists\":false,\"collection_type_slot\":null,\"searchable_slots\":null,\"ensure_referential_integrity\":false}'" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.metadata.model_dump_json()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-06-11T22:10:41.605260Z", + "start_time": "2024-06-11T22:10:41.601527Z" + } + }, + "id": "4390916095bf647a" + }, { "cell_type": "markdown", "source": [ @@ -89,7 +139,7 @@ }, { "cell_type": 
"code", - "execution_count": 3, + "execution_count": 5, "outputs": [], "source": [ "collection = db.create_collection(\"test\", recreate_if_exists=True)" @@ -97,8 +147,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.196922Z", - "start_time": "2024-05-04T04:31:32.143137Z" + "end_time": "2024-06-11T22:10:41.650477Z", + "start_time": "2024-06-11T22:10:41.603373Z" } }, "id": "c3a79013f9359a9" @@ -125,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "outputs": [], "source": [ "COUNTRIES = \"../../tests/input/countries/countries.jsonl\"" @@ -133,15 +183,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:45:34.555717Z", - "start_time": "2024-05-09T23:45:34.551294Z" + "end_time": "2024-06-11T22:10:41.650886Z", + "start_time": "2024-06-11T22:10:41.648075Z" } }, "id": "5286ef4e9dd0f316" }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "outputs": [], "source": [ "from linkml_store.utils.format_utils import load_objects\n", @@ -151,8 +201,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:45:35.253688Z", - "start_time": "2024-05-09T23:45:35.233162Z" + "end_time": "2024-06-11T22:10:41.654343Z", + "start_time": "2024-06-11T22:10:41.650251Z" } }, "id": "2e21988e4fc13f58" @@ -169,14 +219,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "outputs": [ { "data": { "text/plain": " name code capital continent \\\n0 United States US Washington, D.C. 
North America \n1 Canada CA Ottawa North America \n2 Mexico MX Mexico City North America \n3 Brazil BR Brasília South America \n4 Argentina AR Buenos Aires South America \n5 United Kingdom GB London Europe \n6 France FR Paris Europe \n7 Germany DE Berlin Europe \n8 Italy IT Rome Europe \n9 Spain ES Madrid Europe \n10 China CN Beijing Asia \n11 Japan JP Tokyo Asia \n12 India IN New Delhi Asia \n13 South Korea KR Seoul Asia \n14 Indonesia ID Jakarta Asia \n15 Australia AU Canberra Oceania \n16 New Zealand NZ Wellington Oceania \n17 Egypt EG Cairo Africa \n18 Nigeria NG Abuja Africa \n19 South Africa ZA Pretoria Africa \n\n languages \n0 [English] \n1 [English, French] \n2 [Spanish] \n3 [Portuguese] \n4 [Spanish] \n5 [English] \n6 [French] \n7 [German] \n8 [Italian] \n9 [Spanish] \n10 [Standard Chinese] \n11 [Japanese] \n12 [Hindi, English] \n13 [Korean] \n14 [Indonesian] \n15 [English] \n16 [English, Māori] \n17 [Arabic] \n18 [English] \n19 [Zulu, Xhosa, Afrikaans, English, Northern Sot... ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
namecodecapitalcontinentlanguages
0United StatesUSWashington, D.C.North America[English]
1CanadaCAOttawaNorth America[English, French]
2MexicoMXMexico CityNorth America[Spanish]
3BrazilBRBrasíliaSouth America[Portuguese]
4ArgentinaARBuenos AiresSouth America[Spanish]
5United KingdomGBLondonEurope[English]
6FranceFRParisEurope[French]
7GermanyDEBerlinEurope[German]
8ItalyITRomeEurope[Italian]
9SpainESMadridEurope[Spanish]
10ChinaCNBeijingAsia[Standard Chinese]
11JapanJPTokyoAsia[Japanese]
12IndiaINNew DelhiAsia[Hindi, English]
13South KoreaKRSeoulAsia[Korean]
14IndonesiaIDJakartaAsia[Indonesian]
15AustraliaAUCanberraOceania[English]
16New ZealandNZWellingtonOceania[English, Māori]
17EgyptEGCairoAfrica[Arabic]
18NigeriaNGAbujaAfrica[English]
19South AfricaZAPretoriaAfrica[Zulu, Xhosa, Afrikaans, English, Northern Sot...
\n
" }, - "execution_count": 17, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -188,8 +238,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:45:38.327537Z", - "start_time": "2024-05-09T23:45:38.324134Z" + "end_time": "2024-06-11T22:10:41.666716Z", + "start_time": "2024-06-11T22:10:41.654416Z" } }, "id": "e98f9d6eb4a5e385" @@ -208,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "outputs": [], "source": [ "collection.insert(objects)" @@ -216,8 +266,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.259318Z", - "start_time": "2024-05-04T04:31:32.214555Z" + "end_time": "2024-06-11T22:10:41.670028Z", + "start_time": "2024-06-11T22:10:41.665360Z" } }, "id": "668e59a8f28e7bfe" @@ -234,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "outputs": [], "source": [ "qr = collection.find()" @@ -242,22 +292,22 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.261486Z", - "start_time": "2024-05-04T04:31:32.239305Z" + "end_time": "2024-06-11T22:10:41.679764Z", + "start_time": "2024-06-11T22:10:41.670488Z" } }, "id": "9575e280dda32e41" }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "outputs": [ { "data": { "text/plain": " name code capital continent \\\n0 United States US Washington, D.C. 
North America \n1 Canada CA Ottawa North America \n2 Mexico MX Mexico City North America \n3 Brazil BR Brasília South America \n4 Argentina AR Buenos Aires South America \n5 United Kingdom GB London Europe \n6 France FR Paris Europe \n7 Germany DE Berlin Europe \n8 Italy IT Rome Europe \n9 Spain ES Madrid Europe \n10 China CN Beijing Asia \n11 Japan JP Tokyo Asia \n12 India IN New Delhi Asia \n13 South Korea KR Seoul Asia \n14 Indonesia ID Jakarta Asia \n15 Australia AU Canberra Oceania \n16 New Zealand NZ Wellington Oceania \n17 Egypt EG Cairo Africa \n18 Nigeria NG Abuja Africa \n19 South Africa ZA Pretoria Africa \n\n languages \n0 [English] \n1 [English, French] \n2 [Spanish] \n3 [Portuguese] \n4 [Spanish] \n5 [English] \n6 [French] \n7 [German] \n8 [Italian] \n9 [Spanish] \n10 [Standard Chinese] \n11 [Japanese] \n12 [Hindi, English] \n13 [Korean] \n14 [Indonesian] \n15 [English] \n16 [English, Māori] \n17 [Arabic] \n18 [English] \n19 [Zulu, Xhosa, Afrikaans, English, Northern Sot... ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
namecodecapitalcontinentlanguages
0United StatesUSWashington, D.C.North America[English]
1CanadaCAOttawaNorth America[English, French]
2MexicoMXMexico CityNorth America[Spanish]
3BrazilBRBrasíliaSouth America[Portuguese]
4ArgentinaARBuenos AiresSouth America[Spanish]
5United KingdomGBLondonEurope[English]
6FranceFRParisEurope[French]
7GermanyDEBerlinEurope[German]
8ItalyITRomeEurope[Italian]
9SpainESMadridEurope[Spanish]
10ChinaCNBeijingAsia[Standard Chinese]
11JapanJPTokyoAsia[Japanese]
12IndiaINNew DelhiAsia[Hindi, English]
13South KoreaKRSeoulAsia[Korean]
14IndonesiaIDJakartaAsia[Indonesian]
15AustraliaAUCanberraOceania[English]
16New ZealandNZWellingtonOceania[English, Māori]
17EgyptEGCairoAfrica[Arabic]
18NigeriaNGAbujaAfrica[English]
19South AfricaZAPretoriaAfrica[Zulu, Xhosa, Afrikaans, English, Northern Sot...
\n
" }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -268,8 +318,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.265254Z", - "start_time": "2024-05-04T04:31:32.261703Z" + "end_time": "2024-06-11T22:10:41.710549Z", + "start_time": "2024-06-11T22:10:41.677982Z" } }, "id": "9b65321ea7c7ee15" @@ -288,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "outputs": [], "source": [ "qr = collection.find({\"continent\": \"Europe\"})" @@ -296,22 +346,22 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.276843Z", - "start_time": "2024-05-04T04:31:32.265456Z" + "end_time": "2024-06-11T22:10:41.734378Z", + "start_time": "2024-06-11T22:10:41.680230Z" } }, "id": "244378540320adcf" }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "outputs": [ { "data": { "text/plain": " name code capital continent languages\n0 United Kingdom GB London Europe [English]\n1 France FR Paris Europe [French]\n2 Germany DE Berlin Europe [German]\n3 Italy IT Rome Europe [Italian]\n4 Spain ES Madrid Europe [Spanish]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
namecodecapitalcontinentlanguages
0United KingdomGBLondonEurope[English]
1FranceFRParisEurope[French]
2GermanyDEBerlinEurope[German]
3ItalyITRomeEurope[Italian]
4SpainESMadridEurope[Spanish]
\n
" }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -322,8 +372,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-04T04:31:32.283217Z", - "start_time": "2024-05-04T04:31:32.273826Z" + "end_time": "2024-06-11T22:10:41.741764Z", + "start_time": "2024-06-11T22:10:41.688583Z" } }, "id": "1ff46aef2d8abba1" @@ -344,7 +394,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "outputs": [], "source": [ "fc = collection.query_facets()" @@ -352,21 +402,21 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:49:28.838341Z", - "start_time": "2024-05-09T23:49:28.805375Z" + "end_time": "2024-06-11T22:10:41.759537Z", + "start_time": "2024-06-11T22:10:41.691693Z" } }, "id": "df3c2afcdb8153e8" }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "outputs": [ { "data": { - "text/plain": "[('Asia', 5),\n ('Europe', 5),\n ('North America', 3),\n ('Africa', 3),\n ('South America', 2),\n ('Oceania', 2)]" + "text/plain": "[('Europe', 5),\n ('Asia', 5),\n ('North America', 3),\n ('Africa', 3),\n ('South America', 2),\n ('Oceania', 2)]" }, - "execution_count": 19, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -377,8 +427,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:49:30.071922Z", - "start_time": "2024-05-09T23:49:30.068967Z" + "end_time": "2024-06-11T22:10:41.759951Z", + "start_time": "2024-06-11T22:10:41.703519Z" } }, "id": "61cd37ba4849ed22" @@ -397,7 +447,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "outputs": [], "source": [ "collection.attach_indexer(\"llm\")" @@ -405,8 +455,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:50:16.729756Z", - "start_time": "2024-05-09T23:50:12.200093Z" + "end_time": "2024-06-11T22:10:44.140941Z", + "start_time": "2024-06-11T22:10:41.707263Z" } }, 
"id": "674607f18f83360c" @@ -423,14 +473,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "outputs": [ { "data": { - "text/plain": " score name code capital continent \\\n0 0.771005 United Kingdom GB London Europe \n1 0.758345 Australia AU Canberra Oceania \n2 0.754094 South Korea KR Seoul Asia \n3 0.750437 United States US Washington, D.C. North America \n4 0.750217 New Zealand NZ Wellington Oceania \n5 0.748993 South Africa ZA Pretoria Africa \n6 0.748223 Canada CA Ottawa North America \n7 0.746446 France FR Paris Europe \n8 0.745456 Germany DE Berlin Europe \n9 0.743408 Spain ES Madrid Europe \n10 0.739835 China CN Beijing Asia \n11 0.739585 Nigeria NG Abuja Africa \n12 0.738728 Egypt EG Cairo Africa \n13 0.735895 Brazil BR Brasília South America \n14 0.735177 Mexico MX Mexico City North America \n15 0.734055 Japan JP Tokyo Asia \n16 0.731329 Argentina AR Buenos Aires South America \n17 0.728073 Indonesia ID Jakarta Asia \n18 0.724353 India IN New Delhi Asia \n19 0.723590 Italy IT Rome Europe \n\n languages \n0 [English] \n1 [English] \n2 [Korean] \n3 [English] \n4 [English, Māori] \n5 [Zulu, Xhosa, Afrikaans, English, Northern Sot... \n6 [English, French] \n7 [French] \n8 [German] \n9 [Spanish] \n10 [Standard Chinese] \n11 [English] \n12 [Arabic] \n13 [Portuguese] \n14 [Spanish] \n15 [Japanese] \n16 [Spanish] \n17 [Indonesian] \n18 [Hindi, English] \n19 [Italian] ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
scorenamecodecapitalcontinentlanguages
00.771005United KingdomGBLondonEurope[English]
10.758345AustraliaAUCanberraOceania[English]
20.754094South KoreaKRSeoulAsia[Korean]
30.750437United StatesUSWashington, D.C.North America[English]
40.750217New ZealandNZWellingtonOceania[English, Māori]
50.748993South AfricaZAPretoriaAfrica[Zulu, Xhosa, Afrikaans, English, Northern Sot...
60.748223CanadaCAOttawaNorth America[English, French]
70.746446FranceFRParisEurope[French]
80.745456GermanyDEBerlinEurope[German]
90.743408SpainESMadridEurope[Spanish]
100.739835ChinaCNBeijingAsia[Standard Chinese]
110.739585NigeriaNGAbujaAfrica[English]
120.738728EgyptEGCairoAfrica[Arabic]
130.735895BrazilBRBrasíliaSouth America[Portuguese]
140.735177MexicoMXMexico CityNorth America[Spanish]
150.734055JapanJPTokyoAsia[Japanese]
160.731329ArgentinaARBuenos AiresSouth America[Spanish]
170.728073IndonesiaIDJakartaAsia[Indonesian]
180.724353IndiaINNew DelhiAsia[Hindi, English]
190.723590ItalyITRomeEurope[Italian]
\n
" + "text/plain": " score name code capital continent \\\n0 0.770891 United Kingdom GB London Europe \n1 0.758388 Australia AU Canberra Oceania \n2 0.754203 South Korea KR Seoul Asia \n3 0.750652 New Zealand NZ Wellington Oceania \n4 0.750419 United States US Washington, D.C. North America \n5 0.748973 South Africa ZA Pretoria Africa \n6 0.748322 Canada CA Ottawa North America \n7 0.746444 France FR Paris Europe \n8 0.745408 Germany DE Berlin Europe \n9 0.743449 Spain ES Madrid Europe \n10 0.739856 China CN Beijing Asia \n11 0.739504 Nigeria NG Abuja Africa \n12 0.738601 Egypt EG Cairo Africa \n13 0.735424 Brazil BR Brasília South America \n14 0.735056 Mexico MX Mexico City North America \n15 0.733898 Japan JP Tokyo Asia \n16 0.731288 Argentina AR Buenos Aires South America \n17 0.728014 Indonesia ID Jakarta Asia \n18 0.724164 India IN New Delhi Asia \n19 0.723299 Italy IT Rome Europe \n\n languages \n0 [English] \n1 [English] \n2 [Korean] \n3 [English, Māori] \n4 [English] \n5 [Zulu, Xhosa, Afrikaans, English, Northern Sot... \n6 [English, French] \n7 [French] \n8 [German] \n9 [Spanish] \n10 [Standard Chinese] \n11 [English] \n12 [Arabic] \n13 [Portuguese] \n14 [Spanish] \n15 [Japanese] \n16 [Spanish] \n17 [Indonesian] \n18 [Hindi, English] \n19 [Italian] ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
scorenamecodecapitalcontinentlanguages
00.770891United KingdomGBLondonEurope[English]
10.758388AustraliaAUCanberraOceania[English]
20.754203South KoreaKRSeoulAsia[Korean]
30.750652New ZealandNZWellingtonOceania[English, Māori]
40.750419United StatesUSWashington, D.C.North America[English]
50.748973South AfricaZAPretoriaAfrica[Zulu, Xhosa, Afrikaans, English, Northern Sot...
60.748322CanadaCAOttawaNorth America[English, French]
70.746444FranceFRParisEurope[French]
80.745408GermanyDEBerlinEurope[German]
90.743449SpainESMadridEurope[Spanish]
100.739856ChinaCNBeijingAsia[Standard Chinese]
110.739504NigeriaNGAbujaAfrica[English]
120.738601EgyptEGCairoAfrica[Arabic]
130.735424BrazilBRBrasíliaSouth America[Portuguese]
140.735056MexicoMXMexico CityNorth America[Spanish]
150.733898JapanJPTokyoAsia[Japanese]
160.731288ArgentinaARBuenos AiresSouth America[Spanish]
170.728014IndonesiaIDJakartaAsia[Indonesian]
180.724164IndiaINNew DelhiAsia[Hindi, English]
190.723299ItalyITRomeEurope[Italian]
\n
" }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -442,8 +492,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:51:00.634942Z", - "start_time": "2024-05-09T23:51:00.389929Z" + "end_time": "2024-06-11T22:10:45.513951Z", + "start_time": "2024-06-11T22:10:44.141727Z" } }, "id": "1ddd4ac75719342d" @@ -460,13 +510,13 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "outputs": [ { "data": { "text/plain": "20" }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -477,21 +527,21 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:51:05.504631Z", - "start_time": "2024-05-09T23:51:05.484793Z" + "end_time": "2024-06-11T22:10:45.519240Z", + "start_time": "2024-06-11T22:10:45.515082Z" } }, "id": "fa1cdc4e62ad19a1" }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "outputs": [ { "data": { - "text/plain": "[(0.7710047523911066,\n {'name': 'United Kingdom',\n 'code': 'GB',\n 'capital': 'London',\n 'continent': 'Europe',\n 'languages': ['English']}),\n (0.7583452373538249,\n {'name': 'Australia',\n 'code': 'AU',\n 'capital': 'Canberra',\n 'continent': 'Oceania',\n 'languages': ['English']}),\n (0.7540936603499173,\n {'name': 'South Korea',\n 'code': 'KR',\n 'capital': 'Seoul',\n 'continent': 'Asia',\n 'languages': ['Korean']}),\n (0.7504374327081739,\n {'name': 'United States',\n 'code': 'US',\n 'capital': 'Washington, D.C.',\n 'continent': 'North America',\n 'languages': ['English']}),\n (0.7502171566966283,\n {'name': 'New Zealand',\n 'code': 'NZ',\n 'capital': 'Wellington',\n 'continent': 'Oceania',\n 'languages': ['English', 'Māori']}),\n (0.748992841890395,\n {'name': 'South Africa',\n 'code': 'ZA',\n 'capital': 'Pretoria',\n 'continent': 'Africa',\n 'languages': ['Zulu',\n 'Xhosa',\n 'Afrikaans',\n 'English',\n 'Northern Sotho',\n 'Tswana',\n 
'Southern Sotho',\n 'Tsonga',\n 'Swazi',\n 'Venda',\n 'Southern Ndebele']}),\n (0.74822283101589,\n {'name': 'Canada',\n 'code': 'CA',\n 'capital': 'Ottawa',\n 'continent': 'North America',\n 'languages': ['English', 'French']}),\n (0.7464458557589044,\n {'name': 'France',\n 'code': 'FR',\n 'capital': 'Paris',\n 'continent': 'Europe',\n 'languages': ['French']}),\n (0.7454555562192956,\n {'name': 'Germany',\n 'code': 'DE',\n 'capital': 'Berlin',\n 'continent': 'Europe',\n 'languages': ['German']}),\n (0.7434083913303983,\n {'name': 'Spain',\n 'code': 'ES',\n 'capital': 'Madrid',\n 'continent': 'Europe',\n 'languages': ['Spanish']}),\n (0.7398351135651812,\n {'name': 'China',\n 'code': 'CN',\n 'capital': 'Beijing',\n 'continent': 'Asia',\n 'languages': ['Standard Chinese']}),\n (0.7395849105341467,\n {'name': 'Nigeria',\n 'code': 'NG',\n 'capital': 'Abuja',\n 'continent': 'Africa',\n 'languages': ['English']}),\n (0.7387284972086676,\n {'name': 'Egypt',\n 'code': 'EG',\n 'capital': 'Cairo',\n 'continent': 'Africa',\n 'languages': ['Arabic']}),\n (0.7358951137401105,\n {'name': 'Brazil',\n 'code': 'BR',\n 'capital': 'Brasília',\n 'continent': 'South America',\n 'languages': ['Portuguese']}),\n (0.7351767098925348,\n {'name': 'Mexico',\n 'code': 'MX',\n 'capital': 'Mexico City',\n 'continent': 'North America',\n 'languages': ['Spanish']}),\n (0.7340548737475453,\n {'name': 'Japan',\n 'code': 'JP',\n 'capital': 'Tokyo',\n 'continent': 'Asia',\n 'languages': ['Japanese']}),\n (0.7313286603399471,\n {'name': 'Argentina',\n 'code': 'AR',\n 'capital': 'Buenos Aires',\n 'continent': 'South America',\n 'languages': ['Spanish']}),\n (0.7280731626046606,\n {'name': 'Indonesia',\n 'code': 'ID',\n 'capital': 'Jakarta',\n 'continent': 'Asia',\n 'languages': ['Indonesian']}),\n (0.7243527809360168,\n {'name': 'India',\n 'code': 'IN',\n 'capital': 'New Delhi',\n 'continent': 'Asia',\n 'languages': ['Hindi', 'English']}),\n (0.7235900790596334,\n {'name': 'Italy',\n 'code': 'IT',\n 
'capital': 'Rome',\n 'continent': 'Europe',\n 'languages': ['Italian']})]" + "text/plain": "[(0.7708908770614274,\n {'name': 'United Kingdom',\n 'code': 'GB',\n 'capital': 'London',\n 'continent': 'Europe',\n 'languages': ['English']}),\n (0.7583880255490492,\n {'name': 'Australia',\n 'code': 'AU',\n 'capital': 'Canberra',\n 'continent': 'Oceania',\n 'languages': ['English']}),\n (0.754202745445488,\n {'name': 'South Korea',\n 'code': 'KR',\n 'capital': 'Seoul',\n 'continent': 'Asia',\n 'languages': ['Korean']}),\n (0.7506523769140084,\n {'name': 'New Zealand',\n 'code': 'NZ',\n 'capital': 'Wellington',\n 'continent': 'Oceania',\n 'languages': ['English', 'Māori']}),\n (0.7504190890778679,\n {'name': 'United States',\n 'code': 'US',\n 'capital': 'Washington, D.C.',\n 'continent': 'North America',\n 'languages': ['English']}),\n (0.7489726600700292,\n {'name': 'South Africa',\n 'code': 'ZA',\n 'capital': 'Pretoria',\n 'continent': 'Africa',\n 'languages': ['Zulu',\n 'Xhosa',\n 'Afrikaans',\n 'English',\n 'Northern Sotho',\n 'Tswana',\n 'Southern Sotho',\n 'Tsonga',\n 'Swazi',\n 'Venda',\n 'Southern Ndebele']}),\n (0.7483222334041403,\n {'name': 'Canada',\n 'code': 'CA',\n 'capital': 'Ottawa',\n 'continent': 'North America',\n 'languages': ['English', 'French']}),\n (0.7464438929713734,\n {'name': 'France',\n 'code': 'FR',\n 'capital': 'Paris',\n 'continent': 'Europe',\n 'languages': ['French']}),\n (0.7454078196210195,\n {'name': 'Germany',\n 'code': 'DE',\n 'capital': 'Berlin',\n 'continent': 'Europe',\n 'languages': ['German']}),\n (0.7434487849009042,\n {'name': 'Spain',\n 'code': 'ES',\n 'capital': 'Madrid',\n 'continent': 'Europe',\n 'languages': ['Spanish']}),\n (0.739856220599693,\n {'name': 'China',\n 'code': 'CN',\n 'capital': 'Beijing',\n 'continent': 'Asia',\n 'languages': ['Standard Chinese']}),\n (0.7395038203235198,\n {'name': 'Nigeria',\n 'code': 'NG',\n 'capital': 'Abuja',\n 'continent': 'Africa',\n 'languages': ['English']}),\n 
(0.7386007424118528,\n {'name': 'Egypt',\n 'code': 'EG',\n 'capital': 'Cairo',\n 'continent': 'Africa',\n 'languages': ['Arabic']}),\n (0.7354238434740793,\n {'name': 'Brazil',\n 'code': 'BR',\n 'capital': 'Brasília',\n 'continent': 'South America',\n 'languages': ['Portuguese']}),\n (0.7350558425995254,\n {'name': 'Mexico',\n 'code': 'MX',\n 'capital': 'Mexico City',\n 'continent': 'North America',\n 'languages': ['Spanish']}),\n (0.733897746229655,\n {'name': 'Japan',\n 'code': 'JP',\n 'capital': 'Tokyo',\n 'continent': 'Asia',\n 'languages': ['Japanese']}),\n (0.7312880542513781,\n {'name': 'Argentina',\n 'code': 'AR',\n 'capital': 'Buenos Aires',\n 'continent': 'South America',\n 'languages': ['Spanish']}),\n (0.7280135748889252,\n {'name': 'Indonesia',\n 'code': 'ID',\n 'capital': 'Jakarta',\n 'continent': 'Asia',\n 'languages': ['Indonesian']}),\n (0.7241642577932456,\n {'name': 'India',\n 'code': 'IN',\n 'capital': 'New Delhi',\n 'continent': 'Asia',\n 'languages': ['Hindi', 'English']}),\n (0.7232991877572457,\n {'name': 'Italy',\n 'code': 'IT',\n 'capital': 'Rome',\n 'continent': 'Europe',\n 'languages': ['Italian']})]" }, - "execution_count": 23, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -502,21 +552,22 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-05-09T23:51:07.706122Z", - "start_time": "2024-05-09T23:51:07.698147Z" + "end_time": "2024-06-11T22:10:45.523666Z", + "start_time": "2024-06-11T22:10:45.521062Z" } }, "id": "9509d574fd222a72" }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "outputs": [], "source": [], "metadata": { "collapsed": false, "ExecuteTime": { - "start_time": "2024-05-04T04:31:35.914292Z" + "end_time": "2024-06-11T22:10:45.527045Z", + "start_time": "2024-06-11T22:10:45.524024Z" } }, "id": "cdd3d5bb25673310" diff --git a/src/linkml_store/api/client.py b/src/linkml_store/api/client.py index adb536d..494db2d 100644 --- 
a/src/linkml_store/api/client.py +++ b/src/linkml_store/api/client.py @@ -27,14 +27,27 @@ class Client: """ A client is the top-level object for interacting with databases. - A client has access to one or more :class:`Database` objects. + * A client has access to one or more :class:`.Database` objects. + * Each database consists of a number of :class:`.Collection` objects. - Each database consists of a number of :class:`.Collection` objects. - - Examples - -------- + Creating a client + ----------------- >>> client = Client() + + Attaching a database + -------------------- >>> db = client.attach_database("duckdb", alias="test") + + Note that normally a handle would be specified by a locator such as ``duckdb:///``, but + for convenience, an in-memory duckdb object can be specified without a full locator + + We can check the actual handle: + + >>> db.handle + 'duckdb:///:memory:' + + Creating a new collection + ------------------------- >>> collection = db.create_collection("Person") >>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}] >>> collection.insert(objs) @@ -171,6 +184,11 @@ def attach_database( self._databases = {} self._databases[alias] = db db.parent = self + if db.alias: + if db.alias != alias: + raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}") + else: + db.metadata.alias = alias return db def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database: diff --git a/src/linkml_store/api/collection.py b/src/linkml_store/api/collection.py index dbb2416..70c5c39 100644 --- a/src/linkml_store/api/collection.py +++ b/src/linkml_store/api/collection.py @@ -12,6 +12,7 @@ from pydantic import BaseModel from linkml_store.index import get_indexer +from linkml_store.utils.format_utils import load_objects from linkml_store.utils.object_utils import clean_empties try: @@ -69,8 +70,12 @@ def __init__( self.metadata = metadata else: self.metadata = 
CollectionConfig(name=name, **kwargs) - if name is not None and self.metadata.name is not None and name != self.metadata.name: - raise ValueError(f"Name mismatch: {name} != {self.metadata.name}") + if not self.metadata.alias: + self.metadata.alias = name + if not self.metadata.type: + self.metadata.type = name + #if name is not None and self.metadata.name is not None and name != self.metadata.name: + # raise ValueError(f"Name mismatch: {name} != {self.metadata.name}") @property def name(self) -> str: @@ -93,7 +98,7 @@ def hidden(self) -> bool: :return: True if the collection is hidden """ - return self.metadata.hidden + # return self.metadata.hidden @property def target_class_name(self): @@ -152,6 +157,7 @@ def alias(self): :return: """ # TODO: this is a shim layer until we can normalize on this + # TODO: this is a shim layer until we can normalize on this if self.metadata.alias: return self.metadata.alias return self.name @@ -444,9 +450,13 @@ def is_internal(self) -> bool: :return: """ - if not self.name: - raise ValueError(f"Collection has no name: {self} // {self.metadata}") - return self.name.startswith("internal__") + if not self.alias: + raise ValueError(f"Collection has no alias: {self} // {self.metadata}") + return self.alias.startswith("internal__") + + def load_from_source(self): + objects = load_objects(self.metadata.source_location) + self.insert(objects) def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs): """ @@ -599,6 +609,8 @@ def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_si :param max_sample_size: :return: """ + if not self.target_class_name: + raise ValueError(f"No target_class_name for {self.alias}") cd = ClassDefinition(self.target_class_name) keys = defaultdict(list) for obj in objs[0:max_sample_size]: diff --git a/src/linkml_store/api/config.py b/src/linkml_store/api/config.py index d45caf0..e79cde8 100644 --- a/src/linkml_store/api/config.py +++ 
b/src/linkml_store/api/config.py @@ -16,7 +16,7 @@ class CollectionConfig(BaseModel): default=None, description="The type of object in the collection. TODO; use this instead of name", ) - metadata: Optional[Dict] = Field( + additional_properties: Optional[Dict] = Field( default=None, description="Optional metadata for the collection", ) @@ -36,6 +36,10 @@ class CollectionConfig(BaseModel): default=False, description="Whether the collection is prepopulated", ) + source_location: Optional[str] = Field( + default=None, + description="Filesystem or remote URL that stores the data", + ) class DatabaseConfig(BaseModel): @@ -55,7 +59,7 @@ class DatabaseConfig(BaseModel): default=None, description="The LinkML schema as a dictionary", ) - collections: Dict[str, CollectionConfig] = Field( + collections: Optional[Dict[str, CollectionConfig]] = Field( default={}, description="A dictionary of collection configurations", ) diff --git a/src/linkml_store/api/database.py b/src/linkml_store/api/database.py index cef29e0..37ecfbc 100644 --- a/src/linkml_store/api/database.py +++ b/src/linkml_store/api/database.py @@ -29,13 +29,33 @@ class Database(ABC): """ A Database provides access to named collections of data. - Examples - -------- + A database object is owned by a :ref:`Client`. The database + object uses a :ref:`handle` to know what kind of external + database system to connect to (e.g. duckdb, mongodb). The handle + is a string ``:`` + + The + database object may also have an :ref:`alias` that is mapped + to the handle. 
+ + Attaching a database + -------------------- >>> from linkml_store.api.client import Client >>> client = Client() - >>> db = client.attach_database("duckdb", alias="test") + >>> db = client.attach_database("duckdb:///:memory:", alias="test") + + We can check the value of the handle: + >>> db.handle 'duckdb:///:memory:' + + The alias can be used to retrieve the database object from the client + + >>> assert db == client.get_database("test") + + Creating a collection + --------------------- + >>> collection = db.create_collection("Person") >>> len(db.list_collections()) 1 @@ -108,6 +128,8 @@ def from_config(self, db_config: DatabaseConfig, **kwargs): return self def _initialize_collections(self): + if not self.metadata.collections: + return for name, collection_config in self.metadata.collections.items(): alias = collection_config.alias typ = collection_config.type @@ -156,6 +178,10 @@ def handle(self) -> str: """ return self.metadata.handle + @property + def alias(self): + return self.metadata.alias + def store(self, obj: Dict[str, Any], **kwargs): """ Store an object in the database. 
@@ -193,9 +219,11 @@ def store(self, obj: Dict[str, Any], **kwargs): if not v: continue if slot: - collection = self.get_collection(slot.range, create_if_not_exists=True) + logger.debug(f"Aligning to existing slot: {slot.name} range={slot.range}") + collection = self.get_collection(slot.name, type=slot.range, create_if_not_exists=True) else: collection = self.get_collection(k, create_if_not_exists=True) + logger.debug(f"Replacing using {collection.alias} {collection.target_class_name}") collection.replace(v) def commit(self, **kwargs): @@ -260,6 +288,8 @@ def create_collection( raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}") collection_cls = self.collection_class collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata) + if metadata and metadata.source_location: + collection.load_from_source() if metadata and metadata.attributes: sv = self.schema_view schema = sv.schema @@ -318,7 +348,7 @@ def list_collection_names(self, **kwargs) -> Sequence[str]: """ return [c.name for c in self.list_collections(**kwargs)] - def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection": + def get_collection(self, name: str, type: Optional[str] = None, create_if_not_exists=True, **kwargs) -> "Collection": """ Get a named collection. 
@@ -340,10 +370,14 @@ def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Col """ if not self._collections: + logger.debug("Initializing collections") self.init_collections() if name not in self._collections.keys(): if create_if_not_exists: - self._collections[name] = self.create_collection(name) + if type is None: + type = name + logger.debug(f"Creating new collection: {name} kwargs: {kwargs}") + self._collections[name] = self.create_collection(type, alias=name, **kwargs) else: raise KeyError(f"Collection {name} does not exist") return self._collections[name] @@ -470,8 +504,7 @@ def set_schema_view(self, schema_view: Union[str, Path, SchemaView]): if inlined and slot.range: if slot.name in self._collections: coll = self._collections[slot.name] - if not coll.metadata.type: - coll.metadata.type = slot.range + coll.metadata.type = slot.range def load_schema_view(self, path: Union[str, Path]): """ @@ -538,7 +571,7 @@ def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]: >>> db = client.attach_database("duckdb", alias="test") >>> db.load_schema_view("tests/input/countries/countries.linkml.yaml") - Let's introspet the schema to see what slots are applicable for the class "Country": + Let's introspect the schema to see what slots are applicable for the class "Country": >>> sv = db.schema_view >>> for slot in sv.class_induced_slots("Country"): diff --git a/src/linkml_store/api/stores/duckdb/duckdb_collection.py b/src/linkml_store/api/stores/duckdb/duckdb_collection.py index 3f0dd64..a0e6ee8 100644 --- a/src/linkml_store/api/stores/duckdb/duckdb_collection.py +++ b/src/linkml_store/api/stores/duckdb/duckdb_collection.py @@ -19,12 +19,14 @@ class DuckDBCollection(Collection): _table_created: bool = None def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs): + logger.debug(f"Inserting {len(objs)}") if not isinstance(objs, list): objs = [objs] if not objs: return cd = self.class_definition() if not cd: + logger.debug(f"No 
class definition defined for {self.alias} {self.target_class_name}; will induce") cd = self.induce_class_definition_from_objects(objs) self._create_table(cd) table = self._sqla_table(cd) diff --git a/src/linkml_store/api/stores/duckdb/duckdb_database.py b/src/linkml_store/api/stores/duckdb/duckdb_database.py index 89188c9..da22909 100644 --- a/src/linkml_store/api/stores/duckdb/duckdb_database.py +++ b/src/linkml_store/api/stores/duckdb/duckdb_database.py @@ -116,7 +116,10 @@ def query(self, query: Query, **kwargs) -> QueryResult: def init_collections(self): # TODO: unify schema introspection - schema = introspect_schema(self.engine) + if not self.schema_view: + schema = introspect_schema(self.engine) + else: + schema = self.schema_view.schema table_names = schema.classes.keys() if self._collections is None: self._collections = {} diff --git a/src/linkml_store/api/stores/filesystem/__init__.py b/src/linkml_store/api/stores/filesystem/__init__.py new file mode 100644 index 0000000..24aacf4 --- /dev/null +++ b/src/linkml_store/api/stores/filesystem/__init__.py @@ -0,0 +1,16 @@ +""" +Adapter for DuckDB embedded database. 
+ +Handles have the form: + + - ``duckdb:///`` for a file-based database + - ``duckdb:///:memory:`` for an in-memory database +""" + +from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection +from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase + +__all__ = [ + "DuckDBCollection", + "DuckDBDatabase", +] diff --git a/src/linkml_store/api/stores/filesystem/filesystem_collection.py b/src/linkml_store/api/stores/filesystem/filesystem_collection.py new file mode 100644 index 0000000..ce762fa --- /dev/null +++ b/src/linkml_store/api/stores/filesystem/filesystem_collection.py @@ -0,0 +1,142 @@ +import logging +from typing import Any, Dict, List, Optional, Union + +import sqlalchemy as sqla +from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition +from sqlalchemy import Column, Table, delete, insert, inspect, text +from sqlalchemy.sql.ddl import CreateTable + +from linkml_store.api import Collection +from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT +from linkml_store.api.queries import Query +from linkml_store.api.stores.duckdb.mappings import TMAP +from linkml_store.utils.sql_utils import facet_count_sql + +logger = logging.getLogger(__name__) + + +class FileSystemCollection(Collection): + _table_created: bool = None + + def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs): + if not isinstance(objs, list): + objs = [objs] + if not objs: + return + cd = self.class_definition() + if not cd: + cd = self.induce_class_definition_from_objects(objs) + self._create_table(cd) + table = self._sqla_table(cd) + logger.info(f"Inserting into: {self.alias} // T={table.name}") + engine = self.parent.engine + col_names = [c.name for c in table.columns] + objs = [{k: obj.get(k, None) for k in col_names} for obj in objs] + with engine.connect() as conn: + with conn.begin(): + conn.execute(insert(table), objs) + conn.commit() + + def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> 
Optional[int]: + if not isinstance(objs, list): + objs = [objs] + cd = self.class_definition() + if not cd: + cd = self.induce_class_definition_from_objects(objs) + table = self._sqla_table(cd) + engine = self.parent.engine + with engine.connect() as conn: + for obj in objs: + conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes] + stmt = delete(table).where(*conditions) + stmt = stmt.compile(engine) + conn.execute(stmt) + conn.commit() + return + + def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]: + logger.info(f"Deleting from {self.target_class_name} where: {where}") + if where is None: + where = {} + cd = self.class_definition() + if not cd: + logger.info(f"No class definition found for {self.target_class_name}, assuming not prepopulated") + return 0 + table = self._sqla_table(cd) + engine = self.parent.engine + inspector = inspect(engine) + table_exists = table.name in inspector.get_table_names() + if not table_exists: + logger.info(f"Table {table.name} does not exist, assuming no data") + return 0 + with engine.connect() as conn: + conditions = [table.c[k] == v for k, v in where.items()] + stmt = delete(table).where(*conditions) + stmt = stmt.compile(engine) + result = conn.execute(stmt) + deleted_rows_count = result.rowcount + if deleted_rows_count == 0 and not missing_ok: + raise ValueError(f"No rows found for {where}") + conn.commit() + return deleted_rows_count if deleted_rows_count > -1 else None + + def query_facets( + self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs + ) -> Dict[str, Dict[str, int]]: + results = {} + cd = self.class_definition() + with self.parent.engine.connect() as conn: + if not facet_columns: + facet_columns = list(self.class_definition().attributes.keys()) + for col in facet_columns: + logger.debug(f"Faceting on {col}") + if isinstance(col, tuple): + sd = SlotDefinition(name="PLACEHOLDER") + else: + sd 
= cd.attributes[col] + facet_query = self._create_query(where_clause=where) + facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued) + logger.debug(f"Facet query: {facet_query_str}") + rows = list(conn.execute(text(facet_query_str))) + results[col] = rows + return results + + def _sqla_table(self, cd: ClassDefinition) -> Table: + schema_view = self.parent.schema_view + metadata_obj = sqla.MetaData() + cols = [] + for att in schema_view.class_induced_slots(cd.name): + typ = TMAP.get(att.range, sqla.String) + if att.inlined: + typ = sqla.JSON + if att.multivalued: + typ = sqla.ARRAY(typ, dimensions=1) + if att.array: + typ = sqla.ARRAY(typ, dimensions=1) + col = Column(att.name, typ) + cols.append(col) + t = Table(self.alias, metadata_obj, *cols) + return t + + def _create_table(self, cd: ClassDefinition): + if self._table_created or self.metadata.is_prepopulated: + logger.info(f"Already have table for: {cd.name}") + return + query = Query( + from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias} + ) + qr = self.parent.query(query) + if qr.num_rows > 0: + logger.info(f"Table already exists for {cd.name}") + self._table_created = True + self.metadata.is_prepopulated = True + return + logger.info(f"Creating table for {cd.name}") + t = self._sqla_table(cd) + ct = CreateTable(t) + ddl = str(ct.compile(self.parent.engine)) + with self.parent.engine.connect() as conn: + conn.execute(text(ddl)) + conn.commit() + self._table_created = True + self.metadata.is_prepopulated = True diff --git a/src/linkml_store/api/stores/filesystem/filesystem_database.py b/src/linkml_store/api/stores/filesystem/filesystem_database.py new file mode 100644 index 0000000..c39d70b --- /dev/null +++ b/src/linkml_store/api/stores/filesystem/filesystem_database.py @@ -0,0 +1,48 @@ +import json +import logging +from pathlib import Path +from typing import Optional + +import pandas as pd +import sqlalchemy +from linkml_runtime 
import SchemaView +from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition +from linkml_runtime.utils.schema_builder import SchemaBuilder +from sqlalchemy import NullPool, text + +from linkml_store.api import Database, Collection +from linkml_store.api.config import CollectionConfig +from linkml_store.api.queries import Query, QueryResult +from linkml_store.api.stores.duckdb import DuckDBDatabase +from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection +from linkml_store.utils.sql_utils import introspect_schema, query_to_sql + + +logger = logging.getLogger(__name__) + + +class FileSystemDatabase(Database): + collection_class = FileSystemCollection + wrapped_database: Database = None + + def __init__(self, handle: Optional[str] = None, recreate_if_exists: bool = False, **kwargs): + self.wrapped_database = DuckDBDatabase("duckdb:///:memory:") + super().__init__(handle=handle, **kwargs) + + def commit(self, **kwargs): + # TODO: sync + pass + + def close(self, **kwargs): + self.wrapped_database.close() + + def create_collection( + self, + name: str, + alias: Optional[str] = None, + metadata: Optional[CollectionConfig] = None, + recreate_if_exists=False, + **kwargs, + ) -> Collection: + wd = self.wrapped_database + wd.create_collection() diff --git a/src/linkml_store/cli.py b/src/linkml_store/cli.py index 3fca702..657738f 100644 --- a/src/linkml_store/cli.py +++ b/src/linkml_store/cli.py @@ -125,7 +125,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, ctx.obj["database"] = database ctx.obj["collection"] = collection if settings.database_name: - db = client.attach_database(database) + db = client.get_database(database) if set: for expr in set: if "=" not in expr: diff --git a/tests/test_api/test_api.py b/tests/test_api/test_api.py index a91a370..8c6a079 100644 --- a/tests/test_api/test_api.py +++ b/tests/test_api/test_api.py @@ -9,7 +9,7 @@ from linkml_runtime.linkml_model import 
SlotDefinition from linkml_runtime.utils.schema_builder import SchemaBuilder from linkml_store.api.client import Client -from linkml_store.api.config import ClientConfig +from linkml_store.api.config import ClientConfig, CollectionConfig from linkml_store.api.queries import Query from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase from linkml_store.api.stores.solr.solr_database import SolrDatabase @@ -271,9 +271,24 @@ def test_store_nested(handle): assert remove_none(qr.rows[0]) == obj["organizations"][0] +@pytest.mark.parametrize("handle", SCHEMES) +def test_load_from_source(handle): + """ + Test syncing with sources + + :param handle: + :return: + """ + pytest.skip("TODO - in progress") + client = create_client(handle) + database = client.get_database() + coll = database.create_collection("Country", metadata=CollectionConfig(source_location=str(COUNTRIES_DATA_JSONL))) + assert coll.find({}).num_rows > 0 + + @pytest.mark.parametrize("handle", SCHEMES) @pytest.mark.parametrize( - "name_alias", + "type_alias", [ ( "Person", @@ -282,23 +297,23 @@ def test_store_nested(handle): ("Person", "persons"), ], ) -def test_induced_schema(handle, name_alias): +def test_induced_schema(handle, type_alias): """ Test induced schema and collection creation :param handle: - :param name_alias: + :param type_alias: :return: """ - name, alias = name_alias + typ, alias = type_alias client = create_client(handle) assert len(client.databases) == 1 database = client.get_database() assert len(database.list_collections()) == 0, "fresh database should have no collections" if alias: - collection = database.create_collection(name, alias=alias) + collection = database.create_collection(typ, alias=alias) else: - collection = database.create_collection(name) + collection = database.create_collection(typ) assert len(database.list_collections()) == 1, "expected collection to be created" assert collection.class_definition() is None, "no explicit schema and no data to induce 
from" # check is empty @@ -311,11 +326,11 @@ def test_induced_schema(handle, name_alias): assert collection.parent.schema_view.schema is not None, "expected schema to be initialized from data" assert collection.parent.schema_view.schema.classes, "expected single class to be initialized from data" assert len(collection.parent.schema_view.schema.classes) == 1, "expected single class to be initialized from data" - assert collection.parent.schema_view.schema.classes[name], "name of class is collection name by default" + assert collection.parent.schema_view.schema.classes[typ], "name of class is collection name by default" assert ( - collection.parent.schema_view.schema.classes[name].name == collection.name + collection.parent.schema_view.schema.classes[typ].name == collection.target_class_name ), "name of class is collection name by default" - assert collection.parent.schema_view.get_class(name), "schema view should work" + assert collection.parent.schema_view.get_class(typ), "schema view should work" assert collection.class_definition() is not None, "expected class definition to be created" assert len(database.list_collections()) == 1, "collections should be unmodified" assert collection.find().num_rows == len(objs), "expected no change in data" @@ -332,9 +347,9 @@ def test_induced_schema(handle, name_alias): # else: # collection = database.get_collection(name, create_if_not_exists=True) sv = database.schema_view - cd = sv.get_class(name) + cd = sv.get_class(typ) assert cd is not None, "class should be named using name (even if alias is set)" - assert cd.name == name + assert cd.name == typ assert len(cd.attributes) == 3, "expected 3 attributes induced from data" assert cd.attributes["id"].range == "integer", "expected id to be induced as integer" assert cd.attributes["name"].range == "string", "expected name to be induced as string" @@ -382,7 +397,7 @@ def test_induced_multivalued(handle): assert collection.parent.schema_view.schema is not None assert 
collection.parent.schema_view.schema.classes assert collection.parent.schema_view.schema.classes["foo"] - assert collection.parent.schema_view.schema.classes["foo"].name == collection.name + assert collection.parent.schema_view.schema.classes["foo"].name == collection.target_class_name assert collection.parent.schema_view.get_class("foo") assert collection.class_definition() is not None collection.query(collection._create_query()) @@ -655,8 +670,8 @@ def test_from_config_file(name, inserts): """ Test creating a client from a configuration file - :param name: - :param inserts: + :param name: configuration name + :param inserts: list of (db_alias, collection_alias, rows) tuples :return: """ source_dir = INPUT_DIR / "configurations" / name @@ -670,14 +685,14 @@ def test_from_config_file(name, inserts): config = client.metadata index = SimpleIndexer(name="test") - for db_name in config.databases: - print(f"DB: {db_name}") - db = client.get_database(db_name) + for db_alias in config.databases: + print(f"DB: {db_alias}") + db = client.get_database(db_alias) sv = db.schema_view print(f"SV: {sv.schema.classes.keys()}") for coll in db.list_collections(): - print(f"Looking up coll: {coll.name} in {config.databases[db_name].collections.keys()}") - coll_config = config.databases[db_name].collections[coll.name] + print(f"Looking up coll: {coll.alias} in {config.databases[db_alias].collections.keys()}") + coll_config = config.databases[db_alias].collections[coll.alias] if coll_config.attributes: print(f"Checking CD; expected as schema has {sv.schema.classes.keys()}") cd = coll.class_definition() @@ -685,18 +700,18 @@ def test_from_config_file(name, inserts): assert cd.attributes.keys() == coll_config.attributes.keys() for insert in inserts: - db_name, coll_name, objs = insert - db = client.get_database(db_name) - collection = db.get_collection(coll_name) + db_alias, coll_alias, objs = insert + db = client.get_database(db_alias) + collection = db.get_collection(coll_alias) assert 
collection is not None - assert collection.name == coll_name + assert collection.alias == coll_alias collection.insert(objs) - print(f"Searching in {coll_name}; TC={collection.target_class_name}, ALIAS={collection.alias}") + print(f"Searching in {coll_alias}; TC={collection.target_class_name}, ALIAS={collection.alias}") qr = collection.find() - assert qr.num_rows == len(objs), f"expected {len(objs)} for n={coll_name} I= {insert}" + assert qr.num_rows == len(objs), f"expected {len(objs)} for n={coll_type} I= {insert}" - for db_name in config.databases: - db = client.get_database(db_name) + for db_alias in config.databases: + db = client.get_database(db_alias) for coll in db.list_collections(): coll.attach_indexer(index) _results = coll.search("e") diff --git a/tests/test_cli.py b/tests/test_cli.py index 6df1ba0..7ae0379 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -189,6 +189,7 @@ def test_store_explicit_schema(cli_runner, output_dir): database_handle = f"duckdb:///{db_path}" input_path = INPUT_DIR / "nested.yaml" input_schema_path = INPUT_DIR / "nested.schema.yaml" + # store the objects, using the schema result = cli_runner.invoke( cli, [ @@ -201,6 +202,7 @@ def test_store_explicit_schema(cli_runner, output_dir): ], ) assert result.exit_code == 0 + # now export the schema schema_output_path = os.path.join(output_dir, "schema_output.yaml") result = cli_runner.invoke( cli, @@ -215,6 +217,6 @@ def test_store_explicit_schema(cli_runner, output_dir): assert result.exit_code == 0 schema_dict = yaml.safe_load(Path(schema_output_path).read_text()) classes = schema_dict["classes"] - # note we have intentionally "lost" the original containerx + # note we have intentionally "lost" the original container assert len(classes) == 3 - assert set(classes.keys()) == {"About", "Person", "Organization"} + assert set(classes.keys()) == {"about", "persons", "organizations"} From d5ae6879eddaa84addb5d1df2b1083d5dd933f0d Mon Sep 17 00:00:00 2001 From: linkmluser Date: Fri, 21 
Jun 2024 19:53:18 -0700 Subject: [PATCH 2/4] fix-typo --- src/linkml_store/api/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linkml_store/api/client.py b/src/linkml_store/api/client.py index 494db2d..b659870 100644 --- a/src/linkml_store/api/client.py +++ b/src/linkml_store/api/client.py @@ -39,7 +39,7 @@ class Client: >>> db = client.attach_database("duckdb", alias="test") Note that normally a handle would be specified by a locator such as ``duckdb:///``, but - for convience, an in-memory duckdb object can be specified without a full locator + for convenience, an in-memory duckdb object can be specified without a full locator We can check the actual handle: From 9380e9e3df73ead14a20b5bf619cd0e205c574f5 Mon Sep 17 00:00:00 2001 From: linkmluser Date: Fri, 21 Jun 2024 19:57:13 -0700 Subject: [PATCH 3/4] lint --- src/linkml_store/api/collection.py | 2 +- src/linkml_store/api/database.py | 5 +++- .../stores/filesystem/filesystem_database.py | 26 +++++-------------- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/src/linkml_store/api/collection.py b/src/linkml_store/api/collection.py index 70c5c39..79b2356 100644 --- a/src/linkml_store/api/collection.py +++ b/src/linkml_store/api/collection.py @@ -74,7 +74,7 @@ def __init__( self.metadata.alias = name if not self.metadata.type: self.metadata.type = name - #if name is not None and self.metadata.name is not None and name != self.metadata.name: + # if name is not None and self.metadata.name is not None and name != self.metadata.name: # raise ValueError(f"Name mismatch: {name} != {self.metadata.name}") @property diff --git a/src/linkml_store/api/database.py b/src/linkml_store/api/database.py index 37ecfbc..5fe8cf9 100644 --- a/src/linkml_store/api/database.py +++ b/src/linkml_store/api/database.py @@ -348,7 +348,9 @@ def list_collection_names(self, **kwargs) -> Sequence[str]: """ return [c.name for c in self.list_collections(**kwargs)] - def get_collection(self, name: str, 
type: Optional[str] = None, create_if_not_exists=True, **kwargs) -> "Collection": + def get_collection( + self, name: str, type: Optional[str] = None, create_if_not_exists=True, **kwargs + ) -> "Collection": """ Get a named collection. @@ -366,6 +368,7 @@ def get_collection(self, name: str, type: Optional[str] = None, create_if_not_ex KeyError: 'Collection NonExistent does not exist' :param name: name of the collection + :param type: target class name :param create_if_not_exists: create the collection if it does not exist """ diff --git a/src/linkml_store/api/stores/filesystem/filesystem_database.py b/src/linkml_store/api/stores/filesystem/filesystem_database.py index c39d70b..560a162 100644 --- a/src/linkml_store/api/stores/filesystem/filesystem_database.py +++ b/src/linkml_store/api/stores/filesystem/filesystem_database.py @@ -1,22 +1,10 @@ -import json import logging -from pathlib import Path from typing import Optional -import pandas as pd -import sqlalchemy -from linkml_runtime import SchemaView -from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition -from linkml_runtime.utils.schema_builder import SchemaBuilder -from sqlalchemy import NullPool, text - -from linkml_store.api import Database, Collection +from linkml_store.api import Collection, Database from linkml_store.api.config import CollectionConfig -from linkml_store.api.queries import Query, QueryResult from linkml_store.api.stores.duckdb import DuckDBDatabase from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection -from linkml_store.utils.sql_utils import introspect_schema, query_to_sql - logger = logging.getLogger(__name__) @@ -37,12 +25,12 @@ def close(self, **kwargs): self.wrapped_database.close() def create_collection( - self, - name: str, - alias: Optional[str] = None, - metadata: Optional[CollectionConfig] = None, - recreate_if_exists=False, - **kwargs, + self, + name: str, + alias: Optional[str] = None, + metadata: Optional[CollectionConfig] = 
None, + recreate_if_exists=False, + **kwargs, ) -> Collection: wd = self.wrapped_database wd.create_collection() From 3e6929ea68e14832865f9ae919fa583a95c10a3c Mon Sep 17 00:00:00 2001 From: linkmluser Date: Fri, 21 Jun 2024 19:58:03 -0700 Subject: [PATCH 4/4] lint --- tests/test_api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_api/test_api.py b/tests/test_api/test_api.py index 8c6a079..16bb8ae 100644 --- a/tests/test_api/test_api.py +++ b/tests/test_api/test_api.py @@ -708,7 +708,7 @@ def test_from_config_file(name, inserts): collection.insert(objs) print(f"Searching in {coll_alias}; TC={collection.target_class_name}, ALIAS={collection.alias}") qr = collection.find() - assert qr.num_rows == len(objs), f"expected {len(objs)} for n={coll_type} I= {insert}" + assert qr.num_rows == len(objs), f"expected {len(objs)} for n={coll_alias} I= {insert}" for db_alias in config.databases: db = client.get_database(db_alias)