From 630077f8afa0628ee91357c2bb782d61d9f85db3 Mon Sep 17 00:00:00 2001 From: Jacob Wirth Date: Mon, 15 Jan 2024 17:25:39 -0800 Subject: [PATCH 1/3] Add tests around include_attributes=True --- tests/test_vectors.py | 18 ++++++++++++++++++ turbopuffer/backend.py | 2 ++ turbopuffer/error.py | 2 +- turbopuffer/namespace.py | 1 + turbopuffer/query.py | 7 ++++--- 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tests/test_vectors.py b/tests/test_vectors.py index 01b9212..f199d5f 100644 --- a/tests/test_vectors.py +++ b/tests/test_vectors.py @@ -201,6 +201,24 @@ def check_result(row, expected): for i in range(len(vector_set)): # Use VectorResult in index mode check_result(vector_set[i], expected[i]) + # Test query with all attributes + vector_set = ns.query( + top_k=5, + vector=[0.8, 0.7], + distance_metric='euclidean_squared', + include_vectors=True, + include_attributes=True + ) + expected = [ + tpuf.VectorRow(id=7, vector=[0.7, 0.7], dist=0.01, attributes={'hello': 'world'}), + tpuf.VectorRow(id=10, vector=[1.0, 1.0], dist=0.13, attributes={'test': 'cols'}), + tpuf.VectorRow(id=11, vector=[1.1, 1.1], dist=0.25, attributes={'test': 'cols'}), + tpuf.VectorRow(id=3, vector=[0.3, 0.3], dist=0.41, attributes={'test': 'cols', 'key1': 'three', 'key2': 'c'}), + tpuf.VectorRow(id=6, vector=[0.3, 0.3], dist=0.41, attributes={'test': 'cols', 'key1': 'three', 'key2': 'c'}), + ] + for i in range(len(vector_set)): # Use VectorResult in index mode + check_result(vector_set[i], expected[i]) + # Test query with typed query vector_set = ns.query(tpuf.VectorQuery( top_k=5, diff --git a/turbopuffer/backend.py b/turbopuffer/backend.py index be00820..2f1ec53 100644 --- a/turbopuffer/backend.py +++ b/turbopuffer/backend.py @@ -52,6 +52,8 @@ def make_api_request(self, # before = time.monotonic() json_payload = tpuf.dump_json_bytes(payload) # print('Json time:', time.monotonic() - before) + elif isinstance(payload, bytes): + json_payload = payload else: raise ValueError(f'Unsupported POST payload type: {type(payload)}') diff --git a/turbopuffer/error.py b/turbopuffer/error.py index ac42cad..51760bd 100644 --- a/turbopuffer/error.py +++ b/turbopuffer/error.py @@ -10,4 +10,4 @@ class APIError(TurbopufferError): def __init__(self, status_code: int, status_name: str, message: str): self.status_code = status_code self.status_name = status_name - super().__init__(f'{status_name}: {message}') + super().__init__(f'{status_name} (HTTP {status_code}): {message}') diff --git a/turbopuffer/namespace.py b/turbopuffer/namespace.py index f6d379f..35c6283 100644 --- a/turbopuffer/namespace.py +++ b/turbopuffer/namespace.py @@ -1,3 +1,4 @@ +import json import sys from turbopuffer.vectors import Cursor, VectorResult, VectorColumns, VectorRow, batch_iter from turbopuffer.backend import Backend diff --git a/turbopuffer/query.py b/turbopuffer/query.py index dfde544..da08f23 100644 --- a/turbopuffer/query.py +++ b/turbopuffer/query.py @@ -21,7 +21,7 @@ class VectorQuery: distance_metric: Optional[str] = None top_k: int = 10 include_vectors: bool = False - include_attributes: Optional[List[str]] = None + include_attributes: Optional[Union[List[str], bool]] = None filters: Optional[Dict[str, List[FilterTuple]]] = None def from_dict(source: dict) -> 'VectorQuery': @@ -41,8 +41,9 @@ def __post_init__(self): raise ValueError(f'VectorQuery.vector must a 1d-array, got {self.vector.ndim} dimensions') elif not isinstance(self.vector, list): raise ValueError('VectorQuery.vector must be a list, got:', type(self.vector)) - if self.include_attributes is not None and not isinstance(self.include_attributes, list) and not isinstance(self.include_attributes, bool): - raise ValueError('VectorQuery.include_attributes must be a list or bool, got:', type(self.include_attributes)) + if self.include_attributes is not None: + if not isinstance(self.include_attributes, list) and not isinstance(self.include_attributes, bool): + raise ValueError('VectorQuery.include_attributes must be a list or bool, got:', type(self.include_attributes)) if self.filters is not None: if not isinstance(self.filters, dict): raise ValueError('VectorQuery.filters must be a dict, got:', type(self.filters)) From 9ddfc35379f9870d53005981abf1dc599f2d0fe9 Mon Sep 17 00:00:00 2001 From: Jacob Wirth Date: Wed, 17 Jan 2024 00:41:56 -0800 Subject: [PATCH 2/3] Add another CI variation to catch bug --- .github/workflows/ci_test.yml | 25 +++++++++++++++++++++++++ tests/test_vectors.py | 2 +- turbopuffer/namespace.py | 1 - 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index d21b9d7..9f9edf0 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -40,6 +40,31 @@ jobs: poetry run pytest compatibility: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.12 + uses: actions/setup-python@v3 + with: + python-version: "3.12" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest poetry + poetry install --with test --with compatibility + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + poetry run flake8 . --count --per-file-ignores=vectors.py:F821 --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. Skip package import warnings. + poetry run flake8 . --count --exit-zero --per-file-ignores=vectors.py:F821 --exclude=__init__.py --max-line-length=140 --statistics + - name: Test with pytest + env: + TURBOPUFFER_API_KEY: ${{ secrets.TURBOPUFFER_TEST_API_KEY }} + run: | + poetry run pytest + + compatibility-fast: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 diff --git a/tests/test_vectors.py b/tests/test_vectors.py index f199d5f..ffc77c9 100644 --- a/tests/test_vectors.py +++ b/tests/test_vectors.py @@ -7,7 +7,7 @@ def test_upsert_rows(): ns = tpuf.Namespace(tests.test_prefix + 'client_test') assert str(ns) == f'tpuf-namespace:{tests.test_prefix}client_test' - # Test upsert mutliple dict rows + # Test upsert multiple dict rows ns.upsert([ {'id': 2, 'vector': [2, 2]}, {'id': 7, 'vector': [0.7, 0.7], 'attributes': {'hello': 'world', 'test': 'rows'}}, diff --git a/turbopuffer/namespace.py b/turbopuffer/namespace.py index 35c6283..f6d379f 100644 --- a/turbopuffer/namespace.py +++ b/turbopuffer/namespace.py @@ -1,4 +1,3 @@ -import json import sys from turbopuffer.vectors import Cursor, VectorResult, VectorColumns, VectorRow, batch_iter from turbopuffer.backend import Backend From 1eb4338c36e67f2a4105f0dc746d55110d372e72 Mon Sep 17 00:00:00 2001 From: Jacob Wirth Date: Thu, 25 Jan 2024 15:55:13 +0000 Subject: [PATCH 3/3] Fix numpy compatibility with non-fast JSON serializer --- turbopuffer/__init__.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/turbopuffer/__init__.py b/turbopuffer/__init__.py index f6dae6d..69e1d8c 100644 --- a/turbopuffer/__init__.py +++ b/turbopuffer/__init__.py @@ -1,4 +1,5 @@ import os +import sys api_key = os.environ.get('TURBOPUFFER_API_KEY') api_base_url = os.environ.get('TURBOPUFFER_API_BASE_URL', 'https://api.turbopuffer.com/v1') upsert_batch_size = 5_000 @@ -8,7 +9,19 @@ def dump_json_bytes(obj): return orjson.dumps(obj, option=orjson.OPT_SERIALIZE_NUMPY) except ImportError: import json - def dump_json_bytes(obj): return json.dumps(obj).encode() + + class NumpyEncoder(json.JSONEncoder): + def default(self, obj): + if 'numpy' in sys.modules: + if isinstance(obj, sys.modules['numpy'].integer): + return int(obj) + elif isinstance(obj, sys.modules['numpy'].floating): + return float(obj) + elif isinstance(obj, sys.modules['numpy'].ndarray): + return obj.tolist() + return json.JSONEncoder.default(self, obj) + + def dump_json_bytes(obj): return json.dumps(obj, cls=NumpyEncoder).encode() from turbopuffer.version import VERSION from turbopuffer.namespace import Namespace