Skip to content

Commit

Permalink
Merge pull request #15 from linkml/pxf-nb
Browse files Browse the repository at this point in the history
Documentation
  • Loading branch information
cmungall authored Jun 22, 2024
2 parents c2e8617 + 3e6929e commit 0872233
Show file tree
Hide file tree
Showing 15 changed files with 1,487 additions and 116 deletions.
8 changes: 8 additions & 0 deletions docs/about.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ About
LinkML-Store is an early effort to provide a unifying storage layer
over multiple different backends, unified via LinkML schemas.

The overall goals are to provide:

* An easier way to work with data in different forms (tabular, JSON, columnar, RDF)
* Expressive validation at scale, including full referential integrity validation
* Ability to mix and match different backends (e.g. DuckDB, MongoDB, Solr, ChromaDB, HDF5)
* Composability of different search indexes, including LLM textual embeddings
* LAMP-like stack for LinkML

Installation
------------

Expand Down
1,026 changes: 1,026 additions & 0 deletions docs/how-to/Index-caDSR.ipynb

Large diffs are not rendered by default.

175 changes: 113 additions & 62 deletions docs/how-to/Use-MongoDB.ipynb

Large diffs are not rendered by default.

28 changes: 23 additions & 5 deletions src/linkml_store/api/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,27 @@ class Client:
"""
A client is the top-level object for interacting with databases.
A client has access to one or more :class:`Database` objects.
* A client has access to one or more :class:`.Database` objects.
* Each database consists of a number of :class:`.Collection` objects.
Each database consists of a number of :class:`.Collection` objects.
Examples
--------
Creating a client
-----------------
>>> client = Client()
Attaching a database
--------------------
>>> db = client.attach_database("duckdb", alias="test")
Note that normally a handle would be specified by a locator such as ``duckdb:///<PATH>``, but
for convenience, an in-memory duckdb object can be specified without a full locator.
We can check the actual handle:
>>> db.handle
'duckdb:///:memory:'
Creating a new collection
-------------------------
>>> collection = db.create_collection("Person")
>>> objs = [{"id": "P1", "name": "John", "age_in_years": 30}, {"id": "P2", "name": "Alice", "age_in_years": 25}]
>>> collection.insert(objs)
Expand Down Expand Up @@ -171,6 +184,11 @@ def attach_database(
self._databases = {}
self._databases[alias] = db
db.parent = self
if db.alias:
if db.alias != alias:
raise AssertionError(f"Inconsistent alias: {db.alias} != {alias}")
else:
db.metadata.alias = alias
return db

def get_database(self, name: Optional[str] = None, create_if_not_exists=True, **kwargs) -> Database:
Expand Down
24 changes: 18 additions & 6 deletions src/linkml_store/api/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pydantic import BaseModel

from linkml_store.index import get_indexer
from linkml_store.utils.format_utils import load_objects
from linkml_store.utils.object_utils import clean_empties

try:
Expand Down Expand Up @@ -69,8 +70,12 @@ def __init__(
self.metadata = metadata
else:
self.metadata = CollectionConfig(name=name, **kwargs)
if name is not None and self.metadata.name is not None and name != self.metadata.name:
raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")
if not self.metadata.alias:
self.metadata.alias = name
if not self.metadata.type:
self.metadata.type = name
# if name is not None and self.metadata.name is not None and name != self.metadata.name:
# raise ValueError(f"Name mismatch: {name} != {self.metadata.name}")

@property
def name(self) -> str:
Expand All @@ -93,7 +98,7 @@ def hidden(self) -> bool:
:return: True if the collection is hidden
"""
return self.metadata.hidden
# return self.metadata.hidden

@property
def target_class_name(self):
Expand Down Expand Up @@ -152,6 +157,7 @@ def alias(self):
:return:
"""
# TODO: this is a shim layer until we can normalize on this
# TODO: this is a shim layer until we can normalize on this
if self.metadata.alias:
return self.metadata.alias
return self.name
Expand Down Expand Up @@ -444,9 +450,13 @@ def is_internal(self) -> bool:
:return:
"""
if not self.name:
raise ValueError(f"Collection has no name: {self} // {self.metadata}")
return self.name.startswith("internal__")
if not self.alias:
raise ValueError(f"Collection has no alias: {self} // {self.metadata}")
return self.alias.startswith("internal__")

def load_from_source(self):
objects = load_objects(self.metadata.source_location)
self.insert(objects)

def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
"""
Expand Down Expand Up @@ -599,6 +609,8 @@ def induce_class_definition_from_objects(self, objs: List[OBJECT], max_sample_si
:param max_sample_size:
:return:
"""
if not self.target_class_name:
raise ValueError(f"No target_class_name for {self.alias}")
cd = ClassDefinition(self.target_class_name)
keys = defaultdict(list)
for obj in objs[0:max_sample_size]:
Expand Down
8 changes: 6 additions & 2 deletions src/linkml_store/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class CollectionConfig(BaseModel):
default=None,
description="The type of object in the collection. TODO; use this instead of name",
)
metadata: Optional[Dict] = Field(
additional_properties: Optional[Dict] = Field(
default=None,
description="Optional metadata for the collection",
)
Expand All @@ -36,6 +36,10 @@ class CollectionConfig(BaseModel):
default=False,
description="Whether the collection is prepopulated",
)
source_location: Optional[str] = Field(
default=None,
description="Filesystem or remote URL that stores the data",
)


class DatabaseConfig(BaseModel):
Expand All @@ -55,7 +59,7 @@ class DatabaseConfig(BaseModel):
default=None,
description="The LinkML schema as a dictionary",
)
collections: Dict[str, CollectionConfig] = Field(
collections: Optional[Dict[str, CollectionConfig]] = Field(
default={},
description="A dictionary of collection configurations",
)
Expand Down
54 changes: 45 additions & 9 deletions src/linkml_store/api/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,33 @@ class Database(ABC):
"""
A Database provides access to named collections of data.
Examples
--------
A database object is owned by a :ref:`Client`. The database
object uses a :ref:`handle` to know what kind of external
database system to connect to (e.g. duckdb, mongodb). The handle
is a string ``<DatabaseType>:<LocalLocator>``
The
database object may also have an :ref:`alias` that is mapped
to the handle.
Attaching a database
--------------------
>>> from linkml_store.api.client import Client
>>> client = Client()
>>> db = client.attach_database("duckdb", alias="test")
>>> db = client.attach_database("duckdb:///:memory:", alias="test")
We can check the value of the handle:
>>> db.handle
'duckdb:///:memory:'
The alias can be used to retrieve the database object from the client
>>> assert db == client.get_database("test")
Creating a collection
---------------------
>>> collection = db.create_collection("Person")
>>> len(db.list_collections())
1
Expand Down Expand Up @@ -108,6 +128,8 @@ def from_config(self, db_config: DatabaseConfig, **kwargs):
return self

def _initialize_collections(self):
if not self.metadata.collections:
return
for name, collection_config in self.metadata.collections.items():
alias = collection_config.alias
typ = collection_config.type
Expand Down Expand Up @@ -156,6 +178,10 @@ def handle(self) -> str:
"""
return self.metadata.handle

@property
def alias(self):
return self.metadata.alias

def store(self, obj: Dict[str, Any], **kwargs):
"""
Store an object in the database.
Expand Down Expand Up @@ -193,9 +219,11 @@ def store(self, obj: Dict[str, Any], **kwargs):
if not v:
continue
if slot:
collection = self.get_collection(slot.range, create_if_not_exists=True)
logger.debug(f"Aligning to existing slot: {slot.name} range={slot.range}")
collection = self.get_collection(slot.name, type=slot.range, create_if_not_exists=True)
else:
collection = self.get_collection(k, create_if_not_exists=True)
logger.debug(f"Replacing using {collection.alias} {collection.target_class_name}")
collection.replace(v)

def commit(self, **kwargs):
Expand Down Expand Up @@ -260,6 +288,8 @@ def create_collection(
raise ValueError(f"Collection name must be provided: alias: {alias} metadata: {metadata}")
collection_cls = self.collection_class
collection = collection_cls(name=name, alias=alias, parent=self, metadata=metadata)
if metadata and metadata.source_location:
collection.load_from_source()
if metadata and metadata.attributes:
sv = self.schema_view
schema = sv.schema
Expand Down Expand Up @@ -318,7 +348,9 @@ def list_collection_names(self, **kwargs) -> Sequence[str]:
"""
return [c.name for c in self.list_collections(**kwargs)]

def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Collection":
def get_collection(
self, name: str, type: Optional[str] = None, create_if_not_exists=True, **kwargs
) -> "Collection":
"""
Get a named collection.
Expand All @@ -336,14 +368,19 @@ def get_collection(self, name: str, create_if_not_exists=True, **kwargs) -> "Col
KeyError: 'Collection NonExistent does not exist'
:param name: name of the collection
:param type: target class name
:param create_if_not_exists: create the collection if it does not exist
"""
if not self._collections:
logger.debug("Initializing collections")
self.init_collections()
if name not in self._collections.keys():
if create_if_not_exists:
self._collections[name] = self.create_collection(name)
if type is None:
type = name
logger.debug(f"Creating new collection: {name} kwargs: {kwargs}")
self._collections[name] = self.create_collection(type, alias=name, **kwargs)
else:
raise KeyError(f"Collection {name} does not exist")
return self._collections[name]
Expand Down Expand Up @@ -470,8 +507,7 @@ def set_schema_view(self, schema_view: Union[str, Path, SchemaView]):
if inlined and slot.range:
if slot.name in self._collections:
coll = self._collections[slot.name]
if not coll.metadata.type:
coll.metadata.type = slot.range
coll.metadata.type = slot.range

def load_schema_view(self, path: Union[str, Path]):
"""
Expand Down Expand Up @@ -538,7 +574,7 @@ def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
>>> db = client.attach_database("duckdb", alias="test")
>>> db.load_schema_view("tests/input/countries/countries.linkml.yaml")
Let's introspet the schema to see what slots are applicable for the class "Country":
Let's introspect the schema to see what slots are applicable for the class "Country":
>>> sv = db.schema_view
>>> for slot in sv.class_induced_slots("Country"):
Expand Down
2 changes: 2 additions & 0 deletions src/linkml_store/api/stores/duckdb/duckdb_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ class DuckDBCollection(Collection):
_table_created: bool = None

def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
logger.debug(f"Inserting {len(objs)}")
if not isinstance(objs, list):
objs = [objs]
if not objs:
return
cd = self.class_definition()
if not cd:
logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}; will induce")
cd = self.induce_class_definition_from_objects(objs)
self._create_table(cd)
table = self._sqla_table(cd)
Expand Down
5 changes: 4 additions & 1 deletion src/linkml_store/api/stores/duckdb/duckdb_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,10 @@ def query(self, query: Query, **kwargs) -> QueryResult:

def init_collections(self):
# TODO: unify schema introspection
schema = introspect_schema(self.engine)
if not self.schema_view:
schema = introspect_schema(self.engine)
else:
schema = self.schema_view.schema
table_names = schema.classes.keys()
if self._collections is None:
self._collections = {}
Expand Down
16 changes: 16 additions & 0 deletions src/linkml_store/api/stores/filesystem/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""
Adapter for DuckDB embedded database.
Handles have the form:
- ``duckdb:///<path>`` for a file-based database
- ``duckdb:///:memory:`` for an in-memory database
"""

from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase

__all__ = [
"DuckDBCollection",
"DuckDBDatabase",
]
Loading

0 comments on commit 0872233

Please sign in to comment.