chore: update notebook to use latest scrapi code

GoogleCloudPlatform · Sep 27, 2024 · db2710d · db2710d
1 parent 09fdbc1
commit db2710d
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 172 deletions.
diff --git a/examples/vertex_ai_conversation/check_documents_in_datastore.ipynb b/examples/vertex_ai_conversation/check_documents_in_datastore.ipynb
@@ -45,15 +45,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Dependencies\n",
-    "!pip install google-cloud-discoveryengine --quiet\n",
+    "!pip install dfcx-scrapi --quiet\n",
     "\n",
     "from google.colab import auth\n",
     "from google.auth import default\n",
@@ -67,169 +63,53 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# DiscoveryEngine Client and Helper Functions\n"
+    "# USER INPUTS\n",
+    "You can find your `datastore_id` by using the `get_data_stores_map` method in SCRAPI."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "import time\n",
-    "from google.cloud import discoveryengine_v1\n",
-    "from google.api_core import operations_v1, grpc_helpers\n",
-    "from google.longrunning import operations_pb2\n",
-    "from typing import List, Optional\n",
-    "from google.api_core.client_options import ClientOptions\n",
-    "\n",
-    "\n",
-    "def list_documents(\n",
-    "    project_id: str, location: str, datastore_id: str, rate_limit: int = 1):\n",
-    "  \"\"\"Gets a list of docs in a datastore.\"\"\"\n",
-    "  # Create a client\n",
-    "  client_options = (\n",
-    "        ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
-    "        if location != \"global\"\n",
-    "        else None)\n",
-    "    \n",
-    "  client = discoveryengine_v1.DocumentServiceClient(client_options=client_options)\n",
-    "\n",
-    "  request = discoveryengine_v1.ListDocumentsRequest(\n",
-    "      parent=f'projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/0',\n",
-    "      page_size=1000\n",
-    "  )\n",
-    "\n",
-    "  res = client.list_documents(request=request)\n",
-    "\n",
-    "  # setup the list with the first batch of docs\n",
-    "  docs = res.documents\n",
-    "\n",
-    "  while res.next_page_token:\n",
-    "    # implement a rate_limit to prevent quota exhaustion\n",
-    "    time.sleep(rate_limit)\n",
-    "\n",
-    "    request = discoveryengine_v1.ListDocumentsRequest(\n",
-    "      parent=f'projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/0',\n",
-    "      page_size=1000,\n",
-    "      page_token=res.next_page_token\n",
-    "    )\n",
-    "\n",
-    "    res = client.list_documents(request=request)\n",
-    "    docs.extend(res.documents)\n",
-    "\n",
-    "  return docs\n",
-    "\n",
-    "def list_indexed_urls(\n",
-    "    project_id: str,\n",
-    "    location: str,\n",
-    "    datastore_id: str,\n",
-    "    docs: Optional[List[discoveryengine_v1.Document]] = None):\n",
-    "  \"\"\"Get the list of docs in datastore, then parse to only urls.\"\"\"\n",
-    "  if not docs:\n",
-    "    docs = list_documents(project_id, location, datastore_id)\n",
-    "  urls = [doc.content.uri for doc in docs]\n",
+    "from dfcx_scrapi.core.data_stores import DataStores\n",
+    "from dfcx_scrapi.core.search import Search\n",
     "\n",
-    "  return urls\n",
+    "PROJECT_ID = \"\" #@param{type: 'string'}\n",
     "\n",
-    "def search_url(urls: List[str], url: str):\n",
-    "  \"\"\"Searches a url in a list of urls.\"\"\"\n",
-    "  for item in urls:\n",
-    "    if url in item:\n",
-    "      print(item)\n",
+    "s = Search()\n",
+    "ds = DataStores(project_id=PROJECT_ID)\n",
     "\n",
-    "def search_doc_id(\n",
-    "    doc_id: str, docs: Optional[List[discoveryengine_v1.Document]] = None):\n",
-    "  \"\"\"Searches a doc_id in a list of docs.\"\"\"\n",
-    "  if not docs:\n",
-    "    docs = list_documents(project_id, location, datastore_id)\n",
-    "\n",
-    "  doc_found = False\n",
-    "  for doc in docs:\n",
-    "    if doc.parent_document_id == document_id:\n",
-    "      doc_found = True\n",
-    "      print(doc)\n",
-    "\n",
-    "  if not doc_found:\n",
-    "    print(f\"Document not found for provided Doc ID: `{doc_id}`\")\n",
-    "\n",
-    "\n",
-    "def get_operations_status(operation_id: str):\n",
-    "  \"\"\"Get the status of an import operation for Discovery Engine.\"\"\"\n",
-    "  host = \"discoveryengine.googleapis.com\"\n",
-    "  channel = grpc_helpers.create_channel(host)\n",
-    "  client = operations_v1.OperationsClient(channel)\n",
-    "\n",
-    "  response = client.get_operation(operation_id)\n",
-    "\n",
-    "  return response\n",
-    "\n",
-    "PENDING_MESSAGE = \"\"\"\n",
-    "No docs found.\\n\\nIt\\'s likely one of two issues: \\n  [1] Your data store is not finished indexing. \\n  [2] Your data store failed indexing.\\n\n",
-    "If you just added your data store, it can take up to 4 hours before it will become available.\n",
-    "\"\"\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# USER INPUTS\n",
-    "You can find your `datastore_id` by going following these steps:\n",
-    "1. Click on Gen App Builder\n",
-    "2. Select your App / Engine\n",
-    "3. Select your Available Data Store\n",
-    "4. Find your Data Store ID"
+    "ds_map = ds.get_data_stores_map(reverse=True, location=\"global\")\n",
+    "ds_map"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "project_id = '' #@param{type: 'string'}\n",
-    "location = 'global'  #@param{type: 'string'}\n",
-    "datastore_id = ''  #@param{type: 'string'}"
+    "# Access your datastore_id from the ds_map by using the human-readable display name\n",
+    "datastore_id = ds_map[\"my-cool-datastore\"]"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Check Data Store Index Status\n",
-    "Using the `list_documents` method, to check if the data store has finished indexing."
+    "Use the `check_datastore_index_status` method to check whether the data store has finished indexing."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "docs = list_documents(project_id, location, datastore_id)\n",
-    "\n",
-    "if len(docs) == 0:\n",
-    "  print(PENDING_MESSAGE)\n",
-    "else:\n",
-    "  SUCCESS_MESSAGE = f\"\"\"\n",
-    "  Success! 🎉\\n\n",
-    "  Your indexing is complete.\\n\n",
-    "  Your index contains {len(docs)} documents.\n",
-    "  \"\"\"\n",
-    "  print(SUCCESS_MESSAGE)"
+    "s.check_datastore_index_status(datastore_id)"
    ]
   },
   {
@@ -243,14 +123,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "docs = list_documents(project_id, location, datastore_id)\n",
+    "docs = s.list_documents(datastore_id)\n",
     "docs[0]"
    ]
   },
@@ -264,14 +140,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "urls = list_indexed_urls(project_id, location, datastore_id, docs)\n",
+    "urls = s.list_indexed_urls(datastore_id, docs)\n",
     "urls[0]"
    ]
   },
@@ -285,11 +157,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "urls"
@@ -305,11 +173,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import json\n",
@@ -328,14 +192,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "search_url(urls, 'tundra-250')"
+    "s.search_url(urls, 'tundra-250')"
    ]
   },
   {
@@ -351,16 +211,30 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document_id = 'a71d802406f2f0e546b621245e1cbc6a'\n",
+    "\n",
+    "s.search_doc_id(document_id=document_id, docs=docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "List the docs and search for the document ID all at once."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "document_id = 'a71d802406f2f0e546b621245e1cbc6a'\n",
     "\n",
-    "search_doc_id(document_id, docs)"
+    "s.search_doc_id(document_id=document_id, datastore_id=datastore_id)"
    ]
   }
  ],

diff --git a/setup.py b/setup.py
@@ -49,6 +49,7 @@
     install_requires=[
         'google-cloud-dialogflow-cx',
         'google-cloud-aiplatform',
+        'google-cloud-discoveryengine',
         'rouge-score'
         ]
 )