
Updated embedding to work with OpenAI's Python v1.3+ and with Azure O… #569

Open · wants to merge 7 commits into base: dev
9 changes: 7 additions & 2 deletions gptcache/embedding/__init__.py
@@ -18,6 +18,7 @@
from gptcache.utils.lazy_import import LazyImport

openai = LazyImport("openai", globals(), "gptcache.embedding.openai")
azureopenai = LazyImport("azureopenai", globals(), "gptcache.embedding.azureopenai")
huggingface = LazyImport("huggingface", globals(), "gptcache.embedding.huggingface")
sbert = LazyImport("sbert", globals(), "gptcache.embedding.sbert")
onnx = LazyImport("onnx", globals(), "gptcache.embedding.onnx")
@@ -36,8 +37,12 @@ def Cohere(model="large", api_key=None):
return cohere.Cohere(model, api_key)


def OpenAI(model="text-embedding-ada-002", api_key=None):
return openai.OpenAI(model, api_key)
def OpenAI(client, model="text-embedding-ada-002"):
return openai.OpenAIEmbedding(client, model)


def AzureOpenAI(client, azure_deployment, model="text-embedding-ada-002"):
return azureopenai.AzureOpenAIEmbedding(client, azure_deployment, model)


def Huggingface(model="distilbert-base-uncased"):
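With this change the factory functions take an already-constructed client instead of an `api_key`. A minimal sketch of how the two factories might now be called; the key, endpoint, and deployment name below are placeholders, and the import aliases only avoid the name clash between the `openai` package and the GPTCache factories:

```python
# Sketch of the new client-first factories; credentials, endpoint and
# deployment names are placeholders.
from openai import OpenAI as OpenAIClient, AzureOpenAI as AzureOpenAIClient

from gptcache.embedding import OpenAI, AzureOpenAI

# Plain OpenAI: the caller now owns client construction (key, base URL, proxies, ...).
client = OpenAIClient(api_key="your_openai_key")
encoder = OpenAI(client, model="text-embedding-ada-002")

# Azure OpenAI: the deployment name routes the request; the model id is only
# used to look up the expected embedding dimension.
azure_client = AzureOpenAIClient(
    azure_endpoint="https://example.openai.azure.com",
    api_key="your_azure_key",
    api_version="2023-05-15",
)
azure_encoder = AzureOpenAI(azure_client, azure_deployment="my-embedding-deployment")
```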
82 changes: 82 additions & 0 deletions gptcache/embedding/azureopenai.py
@@ -0,0 +1,82 @@
from typing import Union

import numpy as np

from gptcache.embedding.base import BaseEmbedding
from gptcache.utils import import_openai

import_openai()

from openai.lib.azure import AzureOpenAI, AsyncAzureOpenAI


class AzureOpenAIEmbedding(BaseEmbedding):
"""Generate text embedding for given text using Azure's OpenAI service.

:param client: Azure OpenAI client with any modifications you intend to use.
:type client: Union[AzureOpenAI, AsyncAzureOpenAI]

:param azure_deployment: deployment name for the embedding model; used to generate the endpoint URL.
:type azure_deployment: str

:param model: model id from the API, defaults to 'text-embedding-ada-002'; only used to determine the embedding dimension.
:type model: str

Example:
.. code-block:: python

from gptcache.embedding import AzureOpenAIEmbedding
from openai import AzureOpenAI

test_sentence = 'Hello, world.'
client = AzureOpenAI()

# You can create different deployments for different embedding models on Azure.
encoder = AzureOpenAIEmbedding(client, azure_deployment='my_embedding_azure_deployment')

embed = encoder.to_embeddings(test_sentence)
"""

def __init__(self,
client: Union[AzureOpenAI, AsyncAzureOpenAI],
azure_deployment: str,
model: str = 'text-embedding-ada-002',
):
"""

:param client: Azure OpenAI Client class
:type client: Union[AzureOpenAI, AsyncAzureOpenAI]
:param azure_deployment: The deployment name for the embedding; used to generate the endpoint url.
:type azure_deployment: str
:param model: The name of the embedding model; only used for determining the dimensions. Defaults to "text-embedding-ada-002"
:type model: str
"""
self.model = model
self.__azure_embedding_model_deployment = azure_deployment
self.__dimension = self.dim_dict().get(self.model, None)
self.client = client

def to_embeddings(self, data, **_):
"""

:param data: the string to convert to an embedding.
:param _: unused keyword arguments.
:return: a float32 numpy array representing the text embedding, in shape of (dim,).
"""
sentence_embeddings = self.client.embeddings.create(
input=data,
model=self.__azure_embedding_model_deployment,
)
return np.array(sentence_embeddings.data[0].embedding).astype("float32")

@property
def dimension(self):
"""Embedding dimension.

:return: embedding dimension
"""
if not self.__dimension:
foo_emb = self.to_embeddings("foo")
self.__dimension = len(foo_emb)
return self.__dimension

@staticmethod
def dim_dict():
return {"text-embedding-ada-002": 1536}
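A usage sketch for the new module on its own (credentials and the deployment name are placeholders); note that `dimension` is resolved lazily by embedding a probe string whenever the model id is not in `dim_dict()`:

```python
from openai import AzureOpenAI
from gptcache.embedding.azureopenai import AzureOpenAIEmbedding

client = AzureOpenAI(
    azure_endpoint="https://example.openai.azure.com",  # placeholder endpoint
    api_key="your_azure_key",                            # placeholder key
    api_version="2023-05-15",
)
encoder = AzureOpenAIEmbedding(client, azure_deployment="ada-embeddings")

vec = encoder.to_embeddings("Hello, world.")
assert vec.shape == (encoder.dimension,)  # (1536,) for text-embedding-ada-002
```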
42 changes: 19 additions & 23 deletions gptcache/embedding/openai.py
@@ -1,4 +1,4 @@
import os
from typing import Union

import numpy as np

@@ -7,39 +7,35 @@

import_openai()

import openai # pylint: disable=C0413
from openai import OpenAI, AsyncOpenAI

class OpenAI(BaseEmbedding):

class OpenAIEmbedding(BaseEmbedding):
"""Generate text embedding for given text using OpenAI.

:param model: model name, defaults to 'text-embedding-ada-002'.
:param client: OpenAI client with any modifications you intend to use.
:type client: Union[OpenAI, AsyncOpenAI]

:param model: model id from the API, defaults to 'text-embedding-ada-002'.
:type model: str
:param api_key: OpenAI API Key. When the parameter is not specified, it will load the key by default if it is available.
:type api_key: str

Example:
.. code-block:: python

from gptcache.embedding import OpenAI
from gptcache.embedding import OpenAIEmbedding
from openai import OpenAI

test_sentence = 'Hello, world.'
encoder = OpenAI(api_key='your_openai_key')
client = OpenAI(api_key='your_openai_key')
encoder = OpenAIEmbedding(client, model="MyEmbeddingModelId")
embed = encoder.to_embeddings(test_sentence)
"""

def __init__(self, model: str = "text-embedding-ada-002", api_key: str = None, api_base: str = None):
if not api_key:
if openai.api_key:
api_key = openai.api_key
else:
api_key = os.getenv("OPENAI_API_KEY")
if not api_base:
if openai.api_base:
api_base = openai.api_base
else:
api_base = os.getenv("OPENAI_API_BASE")
openai.api_key = api_key
self.api_base = api_base # don't override all of openai as we may just want to override for say embeddings
def __init__(self,
client: Union[OpenAI, AsyncOpenAI],
model: str = "text-embedding-ada-002",
):
self.client = client
self.model = model
if model in self.dim_dict():
self.__dimension = self.dim_dict()[model]
@@ -54,8 +50,8 @@ def to_embeddings(self, data, **_):

:return: a text embedding in shape of (dim,).
"""
sentence_embeddings = openai.Embedding.create(model=self.model, input=data, api_base=self.api_base)
return np.array(sentence_embeddings["data"][0]["embedding"]).astype("float32")
sentence_embeddings = self.client.embeddings.create(model=self.model, input=data)
return np.array(sentence_embeddings.data[0].embedding).astype("float32")

@property
def dimension(self):
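To connect the reworked `OpenAIEmbedding` to a cache, the usual GPTCache wiring should still apply. The sketch below follows the project's documented `embedding_func`/`data_manager` pattern; the manager and evaluation imports are assumptions that may need adjusting to the exact GPTCache version in use:

```python
from openai import OpenAI
from gptcache import cache
from gptcache.embedding.openai import OpenAIEmbedding
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

client = OpenAI(api_key="your_openai_key")  # placeholder key
encoder = OpenAIEmbedding(client, model="text-embedding-ada-002")

data_manager = get_data_manager(
    CacheBase("sqlite"),
    VectorBase("faiss", dimension=encoder.dimension),
)
cache.init(
    embedding_func=encoder.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
)
```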
9 changes: 6 additions & 3 deletions requirements.txt
@@ -1,3 +1,6 @@
numpy
cachetools
requests
numpy~=1.26.2
cachetools~=5.3.2
requests~=2.31.0
pydantic~=2.5.1
httpx~=0.25.1
setuptools~=60.2.0
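The visible hunk does not pin `openai` itself, even though the new modules require the v1 client classes. A defensive runtime check such as the following (not part of this PR; the `packaging` dependency is an assumption) could fail fast on older installs:

```python
# Optional guard: refuse to run against a pre-v1.3 openai package.
import openai
from packaging.version import Version

if Version(openai.__version__) < Version("1.3.0"):
    raise RuntimeError(
        f"openai>=1.3 is required for the client-based embeddings, found {openai.__version__}"
    )
```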