Skip to content

Commit

Permalink
chore: load chunker from config (#270)
Browse files (browse the repository at this point in the history)
  • Loading branch information
cachho authored Jul 17, 2023
1 parent 07ba65d commit 9c58627
Show file tree
Hide file tree
Showing 10 changed files with 48 additions and 69 deletions.
12 changes: 3 additions & 9 deletions docs/advanced/configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Here's the readme example with configuration options.
```python
import os
from embedchain import App
from embedchain.config import InitConfig, AddConfig, QueryConfig
from embedchain.config import InitConfig, AddConfig, QueryConfig, ChunkerConfig
from chromadb.utils import embedding_functions

# Example: use your own embedding function
Expand All @@ -25,14 +25,8 @@ config = InitConfig(ef=embedding_functions.OpenAIEmbeddingFunction(
naval_chat_bot = App(config)

# Example: define your own chunker config for `youtube_video`
youtube_add_config = {
"chunker": {
"chunk_size": 1000,
"chunk_overlap": 100,
"length_function": len,
}
}
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(**youtube_add_config))
chunker_config = ChunkerConfig(chunk_size=1000, chunk_overlap=100, length_function=len)
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(chunker=chunker_config))

add_config = AddConfig()
naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", add_config)
Expand Down
14 changes: 6 additions & 8 deletions embedchain/chunkers/docx_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 1000,
"chunk_overlap": 0,
"length_function": len,
}


class DocxFileChunker(BaseChunker):
"""Chunker for .docx file."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/pdf_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 1000,
"chunk_overlap": 0,
"length_function": len,
}


class PdfFileChunker(BaseChunker):
"""Chunker for PDF file."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/qna_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 300,
"chunk_overlap": 0,
"length_function": len,
}


class QnaPairChunker(BaseChunker):
"""Chunker for QnA pair."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 300,
"chunk_overlap": 0,
"length_function": len,
}


class TextChunker(BaseChunker):
"""Chunker for text."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/web_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 500,
"chunk_overlap": 0,
"length_function": len,
}


class WebPageChunker(BaseChunker):
"""Chunker for web page."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/youtube_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 2000,
"chunk_overlap": 0,
"length_function": len,
}


class YoutubeVideoChunker(BaseChunker):
"""Chunker for Youtube video."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=2000, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
12 changes: 6 additions & 6 deletions embedchain/config/AddConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ class ChunkerConfig(BaseConfig):

def __init__(
self,
chunk_size: Optional[int] = 4000,
chunk_overlap: Optional[int] = 200,
length_function: Optional[Callable[[str], int]] = len,
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
length_function: Optional[Callable[[str], int]] = None,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.length_function = length_function
self.chunk_size = chunk_size if chunk_size else 2000
self.chunk_overlap = chunk_overlap if chunk_overlap else 0
self.length_function = length_function if length_function else len


class LoaderConfig(BaseConfig):
Expand Down
2 changes: 1 addition & 1 deletion embedchain/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .AddConfig import AddConfig # noqa: F401
from .AddConfig import AddConfig, ChunkerConfig # noqa: F401
from .BaseConfig import BaseConfig # noqa: F401
from .ChatConfig import ChatConfig # noqa: F401
from .InitConfig import InitConfig # noqa: F401
Expand Down
7 changes: 2 additions & 5 deletions tests/chunkers/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest

from embedchain.chunkers.text import TextChunker
from embedchain.config import ChunkerConfig


class TestTextChunker(unittest.TestCase):
Expand All @@ -11,11 +12,7 @@ def test_chunks(self):
Test the chunks generated by TextChunker.
# TODO: Not a very precise test.
"""
chunker_config = {
"chunk_size": 10,
"chunk_overlap": 0,
"length_function": len,
}
chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len)
chunker = TextChunker(config=chunker_config)
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

Expand Down

0 comments on commit 9c58627

Please sign in to comment.