feat(rag): Support RAG SDK (#1322)

eosphoros-ai · Mar 22, 2024 · 8a17099 · 8a17099
1 parent e65732d
commit 8a17099
Show file tree

Hide file tree

Showing 69 changed files with 1,330 additions and 556 deletions.
diff --git a/.mypy.ini b/.mypy.ini
@@ -11,6 +11,9 @@ follow_imports = skip
 [mypy-dbgpt.serve.*]
 follow_imports = skip
 
+[mypy-dbgpt.model.*]
+follow_imports = skip
+
 [mypy-dbgpt.util.*]
 follow_imports = skip
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,5 +1,11 @@
+# Contribution 
 
-To contribute to this GitHub project, you can follow these steps:
+First of all, thank you for considering contributing to this project. 
+It's people like you that make it a reality for the community. There are many ways to contribute, and we appreciate all of them.
+
+This guide will help you get started with contributing to this project.
+
+## Fork The Repository
 
 1. Fork the repository you want to contribute to by clicking the "Fork" button on the project page.
 
@@ -8,71 +14,107 @@ To contribute to this GitHub project, you can follow these steps:
 ```
 git clone https://github.com/<YOUR-GITHUB-USERNAME>/DB-GPT
 ```
+Please replace `<YOUR-GITHUB-USERNAME>` with your GitHub username.
+
+
+## Create A New Development Environment
+
+1. Create a new virtual environment using the following command:
+```
+# Make sure python >= 3.10
+conda create -n dbgpt_env python=3.10
+conda activate dbgpt_env
+```
 
-3. Install the project requirements
+2. Change to the project directory using the following command:
 ```
+cd DB-GPT
+```
+
+3. Install the project from the local source using the following command:
+```
+# it will take some minutes
 pip install -e ".[default]"
 ```
 
-4. Install pre-commit hooks
+4. Install development requirements
+```
+pip install -r requirements/dev-requirements.txt
+pip install -r requirements/lint-requirements.txt
+```
+
+5. Install pre-commit hooks
 ```
 pre-commit install
 ```
 
-5. Create a new branch for your changes using the following command:
+6. Install the `make` command
+The `make` command is installed by default on most Unix-based systems. If you do not
+have it, look up the installation instructions for your operating system.
+
+## New Branch And Make Changes
+
+1. Create a new branch for your changes using the following command:
 ```
-git checkout -b "branch-name"
+git checkout -b <branch-name>
 ```
+Please replace `<branch-name>` with a descriptive name for your branch.
 
-6. Make your changes to the code or documentation.
+2. Make your changes to the code or documentation.
 
-- Example: Improve User Interface or Add Documentation.
+3. Add tests for your changes if necessary.
 
-7. Format the code using the following command:
+4. Format your code using the following command:
 ```
 make fmt
 ```
 
-8. Add the changes to the staging area using the following command:
+5. Run the tests using the following command:
 ```
-git add .
+make test
 ```
 
-9. Make sure the tests pass and your code lints using the following command:
+6. Check types using the following command:
 ```
-make pre-commit
+make mypy
 ```
 
-10. Commit the changes with a meaningful commit message using the following command:
+7. Check lint using the following command:
 ```
-git commit -m "your commit message"
+make fmt-check
+```
+
+8. If all checks pass, you can add and commit your changes using the following commands:
 ```
-11. Push the changes to your forked repository using the following command:
+git add xxxx
 ```
-git push origin branch-name
+Make sure to replace `xxxx` with the files you want to commit.
+
+Then commit your changes using the following command:
+```
+git commit -m "your commit message"
 ```
-12. Go to the GitHub website and navigate to your forked repository.
+Please replace `your commit message` with a meaningful commit message.
 
-13. Click the "New pull request" button.
+It will take some time to get used to the process, but it's worth it. Committing runs
+all git hooks and checks first. If any of them fail, you need to fix the issues and
+then commit again.
 
-14. Select the branch you just pushed to and the branch you want to merge into on the original repository.
+9. Push the changes to your forked repository using the following command:
+```
+git push origin <branch-name>
+```
 
-15. Add a description of your changes and click the "Create pull request" button.
+## Create A Pull Request
 
-16. Wait for the project maintainer to review your changes and provide feedback.
+1. Go to the GitHub website and navigate to your forked repository.
 
-17. Make any necessary changes based on feedback and repeat steps 5-12 until your changes are accepted and merged into the main project.
+2. Click the "New pull request" button.
 
-18. Once your changes are merged, you can update your forked repository and local copy of the repository with the following commands:
+3. Select the branch you just pushed to and the branch you want to merge into on the original repository.
+Add the necessary information about your changes and click "Create pull request".
+
+4. Wait for the project maintainer to review your changes and provide feedback.
 
-```
-git fetch upstream
-git checkout master
-git merge upstream/master
-```
-Finally, delete the branch you created with the following command:
-```
-git branch -d branch-name
-```
 That's it, you made it! 🐣⭐⭐
 
diff --git a/Makefile b/Makefile
@@ -26,8 +26,7 @@ testenv: $(VENV)/.testenv
 
 $(VENV)/.testenv: $(VENV)/bin/activate
 	# $(VENV_BIN)/pip install -e ".[framework]"
-	# $(VENV_BIN)/pip install -e ".[knowledge]"
-	# the openai optional dependency is include framework and knowledge dependencies
+	# the openai optional dependency includes the framework and rag dependencies
 	$(VENV_BIN)/pip install -e ".[openai]"
 	touch $(VENV)/.testenv
 
@@ -100,7 +99,7 @@ package: clean-dist ## Package the project for distribution
 	IS_DEV_MODE=false python setup.py sdist bdist_wheel
 
 .PHONY: upload
-upload: package ## Upload the package to PyPI
+upload: ## Upload the package to PyPI
 	# upload to testpypi: twine upload --repository testpypi dist/*
 	twine upload dist/*
 

diff --git a/dbgpt/_version.py b/dbgpt/_version.py
@@ -1 +1 @@
-version = "0.5.1"
+version = "0.5.2"
diff --git a/dbgpt/app/initialization/embedding_component.py b/dbgpt/app/initialization/embedding_component.py
@@ -82,8 +82,10 @@ def create(
         return self._model
 
     def _load_model(self) -> "Embeddings":
-        from dbgpt.model.cluster.embedding.loader import EmbeddingLoader
-        from dbgpt.model.cluster.worker.embedding_worker import _parse_embedding_params
+        from dbgpt.model.adapter.embeddings_loader import (
+            EmbeddingLoader,
+            _parse_embedding_params,
+        )
         from dbgpt.model.parameter import (
             EMBEDDING_NAME_TO_PARAMETER_CLASS_CONFIG,
             BaseEmbeddingModelParameters,

diff --git a/dbgpt/app/knowledge/service.py b/dbgpt/app/knowledge/service.py
@@ -27,6 +27,8 @@
 from dbgpt.configs.model_config import EMBEDDING_MODEL_CONFIG
 from dbgpt.core import Chunk
 from dbgpt.model import DefaultLLMClient
+from dbgpt.rag.assembler.embedding import EmbeddingAssembler
+from dbgpt.rag.assembler.summary import SummaryAssembler
 from dbgpt.rag.chunk_manager import ChunkParameters
 from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
 from dbgpt.rag.knowledge.base import ChunkStrategy, KnowledgeType
@@ -36,8 +38,6 @@
     SpacyTextSplitter,
 )
 from dbgpt.serve.rag.api.schemas import KnowledgeSyncRequest
-from dbgpt.serve.rag.assembler.embedding import EmbeddingAssembler
-from dbgpt.serve.rag.assembler.summary import SummaryAssembler
 from dbgpt.serve.rag.models.models import KnowledgeSpaceDao, KnowledgeSpaceEntity
 from dbgpt.serve.rag.service.service import Service, SyncStatus
 from dbgpt.storage.vector_store.base import VectorStoreConfig
@@ -347,6 +347,7 @@ def _sync_knowledge_document(
         assembler = EmbeddingAssembler.load_from_knowledge(
             knowledge=knowledge,
             chunk_parameters=chunk_parameters,
+            embeddings=embedding_fn,
             vector_store_connector=vector_store_connector,
         )
         chunk_docs = assembler.get_chunks()

diff --git a/dbgpt/core/awel/dag/base.py b/dbgpt/core/awel/dag/base.py
@@ -710,6 +710,11 @@ def visualize_dag(self, view: bool = True, **kwargs) -> Optional[str]:
         self.print_tree()
         return _visualize_dag(self, view=view, **kwargs)
 
+    def show(self, mermaid: bool = False) -> Any:
+        """Return the graph of current DAG.
+
+        Args:
+            mermaid (bool, optional): Whether to return the Mermaid definition
+                string instead of the graphviz object. Defaults to False.
+
+        Returns:
+            Any: The Mermaid string produced by ``_get_graph`` when ``mermaid``
+                is True, otherwise the graphviz ``Digraph``. Both values are
+                None when graphviz can't be imported.
+        """
+        dot, mermaid_str = _get_graph(self)
+        return mermaid_str if mermaid else dot
+
     def __enter__(self):
         """Enter a DAG context."""
         DAGVar.enter_dag(self)
@@ -813,26 +818,12 @@ def _handle_dag_nodes(
         _handle_dag_nodes(is_down_to_up, level, node, func)
 
 
-def _visualize_dag(
-    dag: DAG, view: bool = True, generate_mermaid: bool = True, **kwargs
-) -> Optional[str]:
-    """Visualize the DAG.
-
-    Args:
-        dag (DAG): The DAG to visualize
-        view (bool, optional): Whether view the DAG graph. Defaults to True.
-        generate_mermaid (bool, optional): Whether to generate a Mermaid syntax file.
-            Defaults to True.
-
-    Returns:
-        Optional[str]: The filename of the DAG graph
-    """
+def _get_graph(dag: DAG):
     try:
         from graphviz import Digraph
     except ImportError:
         logger.warn("Can't import graphviz, skip visualize DAG")
-        return None
-
+        return None, None
     dot = Digraph(name=dag.dag_id)
     mermaid_str = "graph TD;\n"  # Initialize Mermaid graph definition
     # Record the added edges to avoid adding duplicate edges
@@ -851,6 +842,26 @@ def add_edges(node: DAGNode):
 
     for root in dag.root_nodes:
         add_edges(root)
+    return dot, mermaid_str
+
+
+def _visualize_dag(
+    dag: DAG, view: bool = True, generate_mermaid: bool = True, **kwargs
+) -> Optional[str]:
+    """Visualize the DAG.
+
+    Args:
+        dag (DAG): The DAG to visualize
+        view (bool, optional): Whether view the DAG graph. Defaults to True.
+        generate_mermaid (bool, optional): Whether to generate a Mermaid syntax file.
+            Defaults to True.
+
+    Returns:
+        Optional[str]: The filename of the DAG graph
+    """
+    dot, mermaid_str = _get_graph(dag)
+    if not dot:
+        return None
     filename = f"dag-vis-{dag.dag_id}.gv"
     if "filename" in kwargs:
         filename = kwargs["filename"]

diff --git a/dbgpt/core/awel/operators/common_operator.py b/dbgpt/core/awel/operators/common_operator.py
@@ -1,12 +1,13 @@
 """Common operators of AWEL."""
 import asyncio
 import logging
-from typing import Awaitable, Callable, Dict, Generic, List, Optional, Union
+from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, Union
 
 from ..dag.base import DAGContext
 from ..task.base import (
     IN,
     OUT,
+    SKIP_DATA,
     InputContext,
     InputSource,
     JoinFunc,
@@ -276,6 +277,11 @@ async def _do_run(self, dag_ctx: DAGContext) -> TaskOutput[OUT]:
         curr_task_ctx.set_task_output(task_output)
         return task_output
 
+    @classmethod
+    def dummy_input(cls, dummy_data: Any = SKIP_DATA, **kwargs) -> "InputOperator[OUT]":
+        """Create a dummy InputOperator with a given input value.
+
+        Args:
+            dummy_data (Any, optional): The value wrapped with
+                ``InputSource.from_data``. Defaults to ``SKIP_DATA``.
+            **kwargs: Extra keyword arguments forwarded to the constructor of
+                ``cls``.
+
+        Returns:
+            InputOperator[OUT]: A new instance of ``cls`` whose input source is
+                built from ``dummy_data``.
+        """
+        return cls(input_source=InputSource.from_data(dummy_data), **kwargs)
+
 
 class TriggerOperator(InputOperator[OUT], Generic[OUT]):
     """Operator node that triggers the DAG to run."""

diff --git a/dbgpt/datasource/__init__.py b/dbgpt/datasource/__init__.py
@@ -1,6 +1,17 @@
 """Module to define the data source connectors."""
 
+from typing import Any
+
 from .base import BaseConnector  # noqa: F401
-from .rdbms.base import RDBMSConnector  # noqa: F401
+
+
+def __getattr__(name: str) -> Any:
+    """Resolve module attributes on demand.
+
+    ``RDBMSConnector`` is imported from ``.rdbms.base`` inside this function
+    rather than at the top of the module, so the import only runs when that
+    attribute is requested.
+
+    Args:
+        name (str): The attribute name being looked up on this module.
+
+    Returns:
+        Any: The ``RDBMSConnector`` class when ``name`` is "RDBMSConnector".
+
+    Raises:
+        AttributeError: If ``name`` is any other value.
+    """
+    if name == "RDBMSConnector":
+        from .rdbms.base import RDBMSConnector  # noqa: F401
+
+        return RDBMSConnector
+    else:
+        raise AttributeError(f"Could not find: {name} in datasource")
+
 
 __ALL__ = ["BaseConnector", "RDBMSConnector"]
diff --git a/dbgpt/datasource/operators/datasource_operator.py b/dbgpt/datasource/operators/datasource_operator.py
@@ -12,15 +12,15 @@
 class DatasourceOperator(MapOperator[str, Any]):
     """The Datasource Operator."""
 
-    def __init__(self, connection: BaseConnector, **kwargs):
+    def __init__(self, connector: BaseConnector, **kwargs):
         """Create the datasource operator."""
+        # Remaining keyword arguments are forwarded to the parent constructor.
         super().__init__(**kwargs)
-        self._connection = connection
+        self._connector = connector
 
     async def map(self, input_value: str) -> Any:
         """Execute the query."""
+        # Delegate to the synchronous ``query`` through ``blocking_func_to_async``.
         return await self.blocking_func_to_async(self.query, input_value)
 
     def query(self, input_value: str) -> Any:
         """Execute the query."""
-        return self._connection.run_to_df(input_value)
+        # Return whatever ``run_to_df`` of the stored connector produces.
+        return self._connector.run_to_df(input_value)
diff --git a/dbgpt/datasource/rdbms/base.py b/dbgpt/datasource/rdbms/base.py
@@ -3,11 +3,11 @@
 from __future__ import annotations
 
 import logging
+import re
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, cast
 from urllib.parse import quote
 from urllib.parse import quote_plus as urlquote
 
-import regex as re
 import sqlalchemy
 import sqlparse
 from sqlalchemy import MetaData, Table, create_engine, inspect, select, text