diff --git a/src/evidently/experimental/dataset_generators/__init__.py b/src/evidently/experimental/dataset_generators/__init__.py
deleted file mode 100644
index 4bfe1f7c80..0000000000
--- a/src/evidently/experimental/dataset_generators/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from . import _registry
-
-__all__ = ["_registry"]
diff --git a/src/evidently/experimental/dataset_generators/_registry.py b/src/evidently/experimental/dataset_generators/_registry.py
deleted file mode 100644
index 74a027ac6a..0000000000
--- a/src/evidently/experimental/dataset_generators/_registry.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
-from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
-from evidently.experimental.dataset_generators.llm.splitter import Splitter
-from evidently.pydantic_utils import register_type_alias
-from evidently.utils.llm.prompts import PromptTemplate
-
-register_type_alias(
-    BaseDatasetGenerator,
-    "evidently.experimental.dataset_generators.llm.questions.QADatasetFromSeedGenerator",
-    "evidently:dataset_generator:QADatasetFromSeedGenerator",
-)
-register_type_alias(
-    BaseDatasetGenerator,
-    "evidently.experimental.dataset_generators.llm.questions.QADatasetGenerator",
-    "evidently:dataset_generator:QADatasetGenerator",
-)
-register_type_alias(
-    DataCollectionProvider,
-    "evidently.experimental.dataset_generators.llm.index.ChunksDataCollectionProvider",
-    "evidently:data_collecton_provider:ChunksDataCollectionProvider",
-)
-register_type_alias(
-    DataCollectionProvider,
-    "evidently.experimental.dataset_generators.llm.index.FileDataCollectionProvider",
-    "evidently:data_collecton_provider:FileDataCollectionProvider",
-)
-
-register_type_alias(
-    PromptTemplate,
-    "evidently.experimental.dataset_generators.llm.prompts.BaselineAnswerPromptTemplate",
-    "evidently:prompt_template:BaselineAnswerPromptTemplate",
-)
-register_type_alias(
-    PromptTemplate,
-    "evidently.experimental.dataset_generators.llm.prompts.NaiveQuestionsFromContextPromptTemplate",
-    "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate",
-)
-register_type_alias(
-    PromptTemplate,
-    "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromContextPromptTemplate",
-    "evidently:prompt_template:QuestionsFromContextPromptTemplate",
-)
-register_type_alias(
-    PromptTemplate,
-    "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromSeedPromptTemplate",
-    "evidently:prompt_template:QuestionsFromSeedPromptTemplate",
-)
-register_type_alias(
-    PromptTemplate,
-    "evidently.experimental.dataset_generators.llm.prompts.ReformulateQuestionPromptTemplate",
-    "evidently:prompt_template:ReformulateQuestionPromptTemplate",
-)
-register_type_alias(
-    PromptTemplate,
-    "evidently.experimental.dataset_generators.llm.prompts.SimpleQuestionPromptTemplate",
-    "evidently:prompt_template:SimpleQuestionPromptTemplate",
-)
-register_type_alias(
-    Splitter,
-    "evidently.experimental.dataset_generators.llm.splitter.LlamaIndexSplitter",
-    "evidently:splitter:LlamaIndexSplitter",
-)
-register_type_alias(
-    Splitter,
-    "evidently.experimental.dataset_generators.llm.splitter.SimpleSplitter",
-    "evidently:splitter:SimpleSplitter",
-)
diff --git a/src/evidently/experimental/dataset_generators/base.py b/src/evidently/experimental/dataset_generators/base.py
deleted file mode 100644
index 0aefc12c8e..0000000000
--- a/src/evidently/experimental/dataset_generators/base.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from abc import ABC
-from abc import abstractmethod
-
-import pandas as pd
-from typing_extensions import TypeAlias
-
-from evidently.options.base import Options
-from evidently.pydantic_utils import EvidentlyBaseModel
-
-DatasetGeneratorResult: TypeAlias = pd.DataFrame
-
-
-class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
-    class Config:
-        is_base_type = True
-
-    options: Options
-
-    @abstractmethod
-    def generate(self) -> DatasetGeneratorResult:
-        raise NotImplementedError
diff --git a/src/evidently/experimental/dataset_generators/llm/__init__.py b/src/evidently/experimental/dataset_generators/llm/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/src/evidently/experimental/dataset_generators/llm/base.py b/src/evidently/experimental/dataset_generators/llm/base.py
deleted file mode 100644
index 9710610657..0000000000
--- a/src/evidently/experimental/dataset_generators/llm/base.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from typing import Optional
-
-from evidently._pydantic_compat import PrivateAttr
-from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
-from evidently.options.base import Options
-from evidently.utils.llm.wrapper import LLMWrapper
-from evidently.utils.llm.wrapper import get_llm_wrapper
-
-
-class BaseLLMDatasetGenerator(BaseDatasetGenerator):
-    provider: str
-    model: str
-    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
-
-    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
-        if self._llm_wrapper is None:
-            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
-        return self._llm_wrapper
-
-    @property
-    def wrapper(self):
-        return self.get_llm_wrapper(self.options)
diff --git a/src/evidently/experimental/dataset_generators/llm/index.py b/src/evidently/experimental/dataset_generators/llm/index.py
deleted file mode 100644
index 1b5d2c2bf5..0000000000
--- a/src/evidently/experimental/dataset_generators/llm/index.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import abc
-import glob
-import os
-from pathlib import Path
-from typing import List
-from typing import Optional
-
-import chromadb
-from chromadb.types import Collection
-from chromadb.utils import embedding_functions
-
-from evidently.experimental.dataset_generators.llm.splitter import AnySplitter
-from evidently.experimental.dataset_generators.llm.splitter import Splitter
-from evidently.pydantic_utils import EvidentlyBaseModel
-
-Chunk = str
-DEFAULT_CHUNK_SIZE = 512
-DEFAULT_CHUNK_OVERLAP = 20
-
-
-def read_text(filename: str) -> str:
-    file_path = Path(filename)
-    if file_path.suffix.lower() == ".pdf":
-        try:
-            from pypdf import PdfReader
-        except ImportError as e:
-            raise ImportError("Please install pypdf to extract context from .pdf files") from e
-        reader = PdfReader(file_path)
-        text = ""
-        for page_num in range(len(reader.pages)):
-            page = reader.pages[page_num]
-            text += page.extract_text()
-        return text
-    else:
-        return Path(filename).read_text()
-
-
-class DataCollectionProvider(EvidentlyBaseModel, abc.ABC):
-    class Config:
-        is_base_type = True
-
-    chunk_size: int = DEFAULT_CHUNK_SIZE
-    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
-    splitter: AnySplitter = "llama_index"
-
-    @abc.abstractmethod
-    def get_data_collection(self) -> "DataCollection":
-        raise NotImplementedError
-
-    @classmethod
-    def from_files(
-        cls,
-        path: str,
-        chunk_size: int = DEFAULT_CHUNK_SIZE,
-        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
-        splitter: AnySplitter = "llama_index",
-    ) -> "DataCollectionProvider":
-        return FileDataCollectionProvider(
-            path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, splitter=splitter
-        )
-
-    @classmethod
-    def from_chunks(cls, chunks: List[str]):
-        return ChunksDataCollectionProvider(chunks=chunks)
-
-
-class ChunksDataCollectionProvider(DataCollectionProvider):
-    class Config:
-        type_alias = "evidently:data_collecton_provider:ChunksDataCollectionProvider"
-
-    chunks: List[Chunk]
-
-    def get_data_collection(self):
-        dc = DataCollection(name="chunks", chunks=self.chunks)
-        dc.init_collection()
-        return dc
-
-
-class FileDataCollectionProvider(DataCollectionProvider):
-    class Config:
-        type_alias = "evidently:data_collecton_provider:FileDataCollectionProvider"
-
-    path: str
-
-    def get_data_collection(self):
-        file_path = Path(self.path)
-        paths = [self.path] if file_path.is_file() else glob.glob(os.path.join(self.path, "*"))
-
-        splitter = Splitter.from_any(self.splitter, self.chunk_size, self.chunk_overlap)
-        chunks = list(splitter.split([read_text(p) for p in paths]))
-
-        data_collection = DataCollection(name=file_path.name, chunks=chunks)
-        data_collection.init_collection()
-        return data_collection
-
-
-class DataCollection:
-    name: str
-    chunks: List[Chunk]
-    collection: Optional[Collection] = None
-
-    def __init__(self, name: str, chunks: List[str], collection: Optional["Collection"] = None):
-        self.name = name
-        self.chunks = chunks
-        self.collection = collection
-
-    def init_collection(self):
-        if self.collection is None:
-            # fixme: huggingface/tokenizers warns about clean_up_tokenization_spaces
-            import warnings
-
-            os.environ["TOKENIZERS_PARALLELISM"] = "false"
-            warnings.filterwarnings("ignore", category=FutureWarning)
-
-            default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
-                model_name="all-MiniLM-L6-v2",
-            )
-            chroma_client = chromadb.Client()
-            collection = chroma_client.get_or_create_collection(
-                name=self.name,
-                embedding_function=default_embedding_function,
-            )
-            for i, chunk in enumerate(self.chunks):
-                collection.upsert(
-                    ids=str(i),
-                    documents=chunk,
-                )
-            self.collection = collection
-
-    def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]:
-        """
-        Queries the collection with a given question and returns the relevant text chunks.
-
-        Args:
-            question (str): The query or question text to search for.
-            n_results (int): Number of results to retrieve. Default is 3.
-
-        Returns:
-            List[Chunk]: A list of relevant text chunks.
- """ - if self.collection is None: - raise ValueError("Collection is not initialized") - results = self.collection.query( - query_texts=question, - n_results=min(n_results, len(self.chunks)), - ) - - relevant_chunks = [chunk for document in results["documents"] for chunk in document] - return relevant_chunks diff --git a/src/evidently/experimental/dataset_generators/llm/prompts.py b/src/evidently/experimental/dataset_generators/llm/prompts.py deleted file mode 100644 index 9070613036..0000000000 --- a/src/evidently/experimental/dataset_generators/llm/prompts.py +++ /dev/null @@ -1,95 +0,0 @@ -from typing import ClassVar -from typing import List - -from evidently.utils.llm.prompts import BlockPromptTemplate -from evidently.utils.llm.prompts import PromptBlock -from evidently.utils.llm.prompts import WithSystemPrompt -from evidently.utils.llm.prompts import llm_call - - -class SimpleQuestionPromptTemplate(BlockPromptTemplate): - class Config: - type_alias = "evidently:prompt_template:SimpleQuestionPromptTemplate" - - blocks: ClassVar = [ - "Please generate a {question_type} question about this:", - PromptBlock.input("context").anchored(), - PromptBlock.json_output(question="question text", answer="answer text"), - ] - question_type: str = "simple" - - -class QuestionsFromSeedPromptTemplate(BlockPromptTemplate): - class Config: - type_alias = "evidently:prompt_template:QuestionsFromSeedPromptTemplate" - - blocks: ClassVar = [ - """Write for me {number} alternative questions quite similar to the question you got. - The question: """, - PromptBlock.input("seed_question").anchored(), - PromptBlock.string_list_output("questions"), - ] - - @llm_call - def generate(self, seed_question: str, number: int) -> List[str]: ... # type: ignore[empty-body] - - -class QuestionsFromContextPromptTemplate(WithSystemPrompt, BlockPromptTemplate): - class Config: - type_alias = "evidently:prompt_template:QuestionsFromContextPromptTemplate" - - system_prompt: str = "You are an assistant who generates questions based on provided context" - - @llm_call - def generate_questions(self, context: str, number: int) -> List[str]: ... # type: ignore[empty-body] - - -class NaiveQuestionsFromContextPromptTemplate(QuestionsFromContextPromptTemplate): - class Config: - type_alias = "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate" - - blocks: ClassVar = [ - "Generate {number} conceptual questions based on the provided context and " - "can be answered from the information in the provided context.\n" - "Here is a context", - PromptBlock.input("context").anchored(), - "Remain faithful to the above context.\n" - "Avoid providing any preamble!\n" - "Avoid providing any closing statement!", - PromptBlock.string_list_output("questions"), - ] - - -class ReformulateQuestionPromptTemplate(QuestionsFromContextPromptTemplate): - class Config: - type_alias = "evidently:prompt_template:ReformulateQuestionPromptTemplate" - - blocks: ClassVar = [ - """Write for me {number} alternative questions quite similar to the question you got. 
-The question:""", - PromptBlock.input("context").anchored(), - PromptBlock.string_list_output("questions"), - ] - number: int - system_prompt: str = "You are a smart assistant who helps repharase questions" - - -class BaselineAnswerPromptTemplate(WithSystemPrompt, BlockPromptTemplate): - class Config: - type_alias = "evidently:prompt_template:BaselineAnswerPromptTemplate" - - blocks: ClassVar = [ - "Your task is to answer the following query:", - PromptBlock.input("question").anchored(), - "You have access to the following documents which are meant to provide context as you answer the query:", - PromptBlock.input("context").anchored(), - """Please remain faithful to the underlying context, -and deviate from it only if you haven't found the answer in the provided context. -Avoid providing any preamble! -Avoid providing any closing statement!""", - PromptBlock.string_output("answer"), - ] - system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble" - - @llm_call - def generate_answers(self, question: str, context: str): ... diff --git a/src/evidently/experimental/dataset_generators/llm/questions.py b/src/evidently/experimental/dataset_generators/llm/questions.py deleted file mode 100644 index 263d7f5fd7..0000000000 --- a/src/evidently/experimental/dataset_generators/llm/questions.py +++ /dev/null @@ -1,75 +0,0 @@ -import random -from typing import List -from typing import Sequence -from typing import Tuple - -import pandas as pd - -from evidently.experimental.dataset_generators.base import DatasetGeneratorResult -from evidently.experimental.dataset_generators.llm.base import BaseLLMDatasetGenerator -from evidently.experimental.dataset_generators.llm.index import Chunk -from evidently.experimental.dataset_generators.llm.index import DataCollection -from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider -from evidently.experimental.dataset_generators.llm.prompts import BaselineAnswerPromptTemplate -from evidently.experimental.dataset_generators.llm.prompts import NaiveQuestionsFromContextPromptTemplate -from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromContextPromptTemplate -from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromSeedPromptTemplate - -Question = str -Answer = str -GeneratedQuestion = Tuple[Question, Answer, Chunk] -ChunkSet = List[Chunk] - - -class QADatasetGenerator(BaseLLMDatasetGenerator): - class Config: - type_alias = "evidently:dataset_generator:QADatasetGenerator" - - data_collection: DataCollectionProvider - num_questions: int - questions: QuestionsFromContextPromptTemplate = NaiveQuestionsFromContextPromptTemplate() - answers: BaselineAnswerPromptTemplate = BaselineAnswerPromptTemplate() - - def generate(self) -> DatasetGeneratorResult: - documents = self.data_collection.get_data_collection() - chunk_set_count, chunks_in_set_count, questions_per_chunkset = self.get_chunks_and_question_count() - chunk_sets = self.generate_chunksets(documents, chunk_set_count, chunks_in_set_count) - questions: List[Question] = self.generate_questions(chunk_sets, questions_per_chunkset) - relevant_chunks = [documents.find_relevant_chunks(q) for q in questions] - answers = self.generate_answers(questions, relevant_chunks) - return pd.DataFrame({"questions": questions, "answers": answers, "context": relevant_chunks}) - - def get_chunks_and_question_count(self) -> Tuple[int, int, int]: - return 1, 1, self.num_questions - - def generate_chunksets(self, 
-    def generate_chunksets(self, documents: DataCollection, count: int, chunks_per_set: int) -> List[ChunkSet]:
-        return [[random.choice(documents.chunks) for _ in range(chunks_per_set)] for _ in range(count)]
-
-    def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_chunkset: int) -> List[Question]:
-        questions = self.wrapper.run_batch_sync(
-            self.questions.generate_questions(context="\n\n".join(chunks), number=questions_per_chunkset)
-            for chunks in chunk_sets
-        )
-        return [q for qs in questions for q in qs]
-
-    def generate_answers(self, questions: List[Question], relevant_chunks: List[List[Chunk]]) -> List[str]:
-        return self.wrapper.run_batch_sync(
-            self.answers.generate_answers(question=question, context="\n".join(chunks))
-            for question, chunks in zip(questions, relevant_chunks)
-        )
-
-
-class QADatasetFromSeedGenerator(BaseLLMDatasetGenerator):
-    class Config:
-        type_alias = "evidently:dataset_generator:QADatasetFromSeedGenerator"
-
-    seed_question: str
-    num_questions: int
-    prompt: QuestionsFromSeedPromptTemplate = QuestionsFromSeedPromptTemplate()
-
-    def generate(self) -> DatasetGeneratorResult:
-        response = self.wrapper.run_sync(
-            self.prompt.generate(number=self.num_questions, seed_question=self.seed_question)
-        )
-
-        return pd.DataFrame({"questions": response})
diff --git a/src/evidently/experimental/dataset_generators/llm/splitter.py b/src/evidently/experimental/dataset_generators/llm/splitter.py
deleted file mode 100644
index e4b775eb29..0000000000
--- a/src/evidently/experimental/dataset_generators/llm/splitter.py
+++ /dev/null
@@ -1,130 +0,0 @@
-import re
-from abc import ABC
-from abc import abstractmethod
-from enum import Enum
-from typing import ClassVar
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Union
-
-from evidently._pydantic_compat import PrivateAttr
-from evidently.pydantic_utils import EvidentlyBaseModel
-
-
-class TextSource:
-    @classmethod
-    def from_any(cls, text_source: "AnyTextSource"):
-        if isinstance(text_source, TextSource):
-            return text_source
-        if isinstance(text_source, str):
-            return StrSource(text_source)
-        raise NotImplementedError(f"Cannot create TextSource from {text_source.__class__.__name__}")
-
-    @abstractmethod
-    def get_text(self) -> str:
-        raise NotImplementedError
-
-
-class StrSource(TextSource):
-    def __init__(self, value: str):
-        self.value = value
-
-    def get_text(self) -> str:
-        return self.value
-
-
-AnyTextSource = Union[str, bytes, TextSource]
-
-Chunk = str
-Split = str
-
-
-class Splitters(str, Enum):
-    Simple = "simple"
-    LlamaIndex = "llama_index"
-
-
-AnySplitter = Union[str, Splitters, "Splitter"]
-
-
-class Splitter(EvidentlyBaseModel, ABC):
-    class Config:
-        is_base_type = True
-
-    chunk_size: int
-    chunk_overlap: int
-
-    def split(self, texts: Union[AnyTextSource, List[AnyTextSource]]) -> Sequence[Chunk]:
-        if not isinstance(texts, list):
-            texts = [texts]
-
-        for text in texts:
-            yield from self.split_text(TextSource.from_any(text))
-
-    @abstractmethod
-    def split_text(self, text: TextSource) -> Sequence[Chunk]:
-        raise NotImplementedError
-
-    @classmethod
-    def from_any(cls, splitter: AnySplitter, chunk_size: int, chunk_overlap: int, **kwargs):
-        if isinstance(splitter, Splitter):
-            return splitter
-        if isinstance(splitter, str):
-            splitter = Splitters(splitter)
-        if isinstance(splitter, Splitters):
-            if splitter == Splitters.Simple:
-                return SimpleSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-            if splitter == Splitters.LlamaIndex:
-                return LlamaIndexSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
-            raise ValueError(f"Unknown splitter {splitter}")
-        raise NotImplementedError(f"Cannot create splitter from {splitter.__class__.__name__}")
-
-
-class SimpleSplitter(Splitter):
-    class Config:
-        type_alias = "evidently:splitter:SimpleSplitter"
-
-    split_re: ClassVar = re.compile(r"([^,.;。?!]+[,.;。?!]?)")
-
-    def split_text(self, text: TextSource) -> Sequence[Chunk]:
-        current_splits: List[str] = []
-        current_size = 0
-        for split in self.split_re.split(text.get_text()):
-            split_size = len(split)
-            if len(current_splits) > 0 and current_size + split_size > self.chunk_size:
-                yield "".join(current_splits)
-                while current_size > self.chunk_overlap and len(current_splits) > 0:
-                    last, *current_splits = current_splits
-                    last_size = len(last)
-                    current_size -= last_size
-            current_size += split_size
-            current_splits.append(split)
-        if current_size > 0:
-            yield "".join(current_splits)
-
-
-class LlamaIndexSplitter(Splitter):
-    class Config:
-        type_alias = "evidently:splitter:LlamaIndexSplitter"
-
-    separator: str = " "
-    paragraph_separator: Optional[str] = None
-    _splitter = PrivateAttr(None)
-
-    @property
-    def splitter(self):
-        if self._splitter is None:
-            from llama_index.core.node_parser import SentenceSplitter
-            from llama_index.core.node_parser.text.sentence import DEFAULT_PARAGRAPH_SEP
-
-            self._splitter = SentenceSplitter(
-                chunk_size=self.chunk_size,
-                chunk_overlap=self.chunk_overlap,
-                separator=self.separator,
-                paragraph_separator=self.paragraph_separator or DEFAULT_PARAGRAPH_SEP,
-            )
-        return self._splitter
-
-    def split_text(self, text: TextSource) -> Sequence[Chunk]:
-        yield from self.splitter.split_text(text.get_text())
diff --git a/tests/test_pydantic_aliases.py b/tests/test_pydantic_aliases.py
index 0cd96d923c..6e1ee34ace 100644
--- a/tests/test_pydantic_aliases.py
+++ b/tests/test_pydantic_aliases.py
@@ -16,9 +16,6 @@
 from evidently.base_metric import MetricResult
 from evidently.collector.config import CollectorTrigger
 from evidently.collector.storage import CollectorStorage
-from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
-from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
-from evidently.experimental.dataset_generators.llm.splitter import Splitter
 from evidently.features.generated_features import BaseDescriptor
 from evidently.features.generated_features import GeneratedFeatures
 from evidently.features.llm_judge import BaseLLMPromptTemplate
@@ -110,9 +107,6 @@ def test_all_aliases_correct():
         CollectorStorage: "collector_storage",
         BaseLLMPromptTemplate: "prompt_template",
         DashboardPanel: "dashboard_panel",
-        BaseDatasetGenerator: "dataset_generator",
-        Splitter: "splitter",
-        DataCollectionProvider: "data_collecton_provider",
         PromptBlock: "prompt_block",
         PromptTemplate: "prompt_template",
     }