Added test_document_loader, test files, and github workflow to run them

bshastry · Oct 5, 2023 · f125371 · f125371
1 parent 809650a
commit f125371
Show file tree

Hide file tree

Showing 9 changed files with 177 additions and 3 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,25 @@
+name: Run Tests
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.8"  # quoted: a bare YAML number like 3.10 would be read as 3.1
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt
+
+      - name: Run tests
+        run: python test_runner.py
diff --git a/docubot.py b/docubot.py
@@ -73,7 +73,7 @@ def build_kb(data_directory: str) -> List[T]:
         data.extend(load_document(doc_name))
 
     print(f"There are {len(data)} pages in the knowledge base")
-    chunks = chunk_data(data, chunk_size=512)
+    chunks = chunk_data(data, chunk_size=512, chunk_overlap=20)
     print(f"These have been split into {len(chunks)} chunks for indexing")
     return chunks
 

diff --git a/document_loaders/document_loaders.py b/document_loaders/document_loaders.py
@@ -143,7 +143,9 @@ def load_document(document_name: str) -> List[T]:
             return None
 
 
-def chunk_data(data: str, chunk_size: int = 512) -> List[T]:
+def chunk_data(
+    data: List[T], chunk_size: int = 512, chunk_overlap: int = 20
+) -> List[T]:
     """
     Splits the input data into chunks of specified size using a RecursiveCharacterTextSplitter.
 
@@ -159,7 +161,7 @@ def chunk_data(data: str, chunk_size: int = 512) -> List[T]:
 
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=chunk_size,
-        chunk_overlap=20,
+        chunk_overlap=chunk_overlap,
         length_function=tiktoken_len,
         separators=["\n\n", "\n", " ", ""],
     )

diff --git a/test_files/test.docx b/test_files/test.docx
diff --git a/test_files/test.md b/test_files/test.md
@@ -0,0 +1,5 @@
+# Test
+
+## Sub heading
+
+Some text
diff --git a/test_files/test.pdf b/test_files/test.pdf
diff --git a/test_files/test.txt b/test_files/test.txt
@@ -0,0 +1 @@
+This is a text file that has more than ten characters.
diff --git a/test_runner.py b/test_runner.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+import unittest
+
+
+def run_tests():
+    """
+    Discovers and runs all tests in the 'tests' directory.
+
+    Uses unittest discovery to collect every test under 'tests' and a
+    TextTestRunner to execute them and print the results.
+
+    Returns:
+    bool: True if every discovered test passed, False otherwise.
+    """
+
+    test_suite = unittest.TestLoader().discover("tests")
+    result = unittest.TextTestRunner().run(test_suite)
+    return result.wasSuccessful()
+
+
+if __name__ == "__main__":
+    import sys
+    # Exit non-zero on failure; otherwise the CI step passes even when tests fail.
+    sys.exit(0 if run_tests() else 1)
diff --git a/tests/test_document_loaders.py b/tests/test_document_loaders.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+"""
+This module contains unit tests for the document loaders module.
+
+The document loaders module provides functions for loading various types of documents, such as PDF, DOCX, Markdown, and plain text files. It also includes a function for loading content from Wikipedia.
+
+The unit tests in this module ensure that the document loading functions return the expected results. They also test the chunk_data function, which is used to split document content into smaller chunks.
+
+The module includes the following test cases:
+
+- test_load_pdf_document: Tests the load_pdf_document function by loading a PDF file and checking that the returned content is a list with a length greater than 0.
+
+- test_load_docx_document: Tests the load_docx_document function by loading a DOCX file and checking that the returned content is a list with a length greater than 0.
+
+- test_load_markdown_document: Tests the load_markdown_document function by loading a Markdown file and checking that the returned content is a list with a length greater than 0.
+
+- test_load_txt_document: Tests the load_txt_document function by loading a plain text file and checking that the returned content is a list with a length greater than 0.
+
+- test_load_from_wikipedia: Tests the load_from_wikipedia function by loading content from Wikipedia for a given query and checking that the returned content is a list with a length greater than 0.
+
+- test_load_document: Tests the load_document function by loading various types of documents (PDF, DOCX, Markdown, and plain text files) and checking that the returned content is a list with a length greater than 0.
+
+- test_chunk_data: Tests the chunk_data function by loading a plain text file, splitting its content into smaller overlapping chunks, and checking that more than one chunk is produced and that each chunk's token count (as measured by tiktoken_len) is less than or equal to the specified chunk size.
+
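+To run the unit tests, run `python test_runner.py` from the repository root; the test file paths (e.g. "test_files/test.pdf") are relative to that directory.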
+"""
+
+import unittest
+from document_loaders.document_loaders import (
+    load_pdf_document,
+    load_docx_document,
+    load_markdown_document,
+    load_txt_document,
+    load_from_wikipedia,
+    load_document,
+    chunk_data,
+)
+from text_utils.text_utils import tiktoken_len
+
+
+class TestDocumentLoaders(unittest.TestCase):
+    """Checks that each loader returns a non-empty list and that chunks respect the token limit."""
+
+    def _assert_non_empty_list(self, contents):
+        # Shared check: the loader produced a list holding at least one item.
+        self.assertIsInstance(contents, list)
+        self.assertGreater(len(contents), 0)
+
+    def test_load_pdf_document(self):
+        self._assert_non_empty_list(load_pdf_document("test_files/test.pdf"))
+
+    def test_load_docx_document(self):
+        self._assert_non_empty_list(load_docx_document("test_files/test.docx"))
+
+    def test_load_markdown_document(self):
+        self._assert_non_empty_list(load_markdown_document("test_files/test.md"))
+
+    def test_load_txt_document(self):
+        self._assert_non_empty_list(load_txt_document("test_files/test.txt"))
+
+    def test_load_from_wikipedia(self):
+        self._assert_non_empty_list(load_from_wikipedia("Python programming language"))
+
+    def test_load_document(self):
+        # Local files are loaded through the generic entry point, one after another.
+        local_files = (
+            "test_files/test.pdf",
+            "test_files/test.docx",
+            "test_files/test.md",
+            "test_files/test.txt",
+        )
+        for path in local_files:
+            self._assert_non_empty_list(load_document(path))
+
+        # For the remote PDF only the return type is checked, not its length.
+        url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
+        self.assertIsInstance(load_document(url), list)
+
+    def test_chunk_data(self):
+        # Both limits are expressed in tokens, not characters.
+        max_tokens = 10
+        overlap_tokens = 5
+        pages = load_document("test_files/test.txt")
+        chunks = chunk_data(
+            pages, chunk_size=max_tokens, chunk_overlap=overlap_tokens
+        )
+        self.assertIsInstance(chunks, list)
+        self.assertGreater(len(chunks), 1)
+        for piece in chunks:
+            self.assertLessEqual(tiktoken_len(piece.page_content), max_tokens)
+
+
+if __name__ == "__main__":
+    unittest.main()