Skip to content

Commit

Permalink
chore: update notebook to use latest scrapi code
Browse files Browse the repository at this point in the history
  • Loading branch information
kmaphoenix committed Sep 27, 2024
1 parent 09fdbc1 commit db2710d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 172 deletions.
218 changes: 46 additions & 172 deletions examples/vertex_ai_conversation/check_documents_in_datastore.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"# Dependencies\n",
"!pip install google-cloud-discoveryengine --quiet\n",
"!pip install dfcx-scrapi --quiet\n",
"\n",
"from google.colab import auth\n",
"from google.auth import default\n",
Expand All @@ -67,169 +63,53 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# DiscoveryEngine Client and Helper Functions\n"
"# USER INPUTS\n",
"You can find your `datastore_id` by using the `get_data_stores_map` method in SCRAPI."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from google.cloud import discoveryengine_v1\n",
"from google.api_core import operations_v1, grpc_helpers\n",
"from google.longrunning import operations_pb2\n",
"from typing import List, Optional\n",
"from google.api_core.client_options import ClientOptions\n",
"\n",
"\n",
"def list_documents(\n",
" project_id: str, location: str, datastore_id: str, rate_limit: int = 1):\n",
" \"\"\"Gets a list of docs in a datastore.\"\"\"\n",
" # Create a client\n",
" client_options = (\n",
" ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
" if location != \"global\"\n",
" else None)\n",
" \n",
" client = discoveryengine_v1.DocumentServiceClient(client_options=client_options)\n",
"\n",
" request = discoveryengine_v1.ListDocumentsRequest(\n",
" parent=f'projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/0',\n",
" page_size=1000\n",
" )\n",
"\n",
" res = client.list_documents(request=request)\n",
"\n",
" # setup the list with the first batch of docs\n",
" docs = res.documents\n",
"\n",
" while res.next_page_token:\n",
" # implement a rate_limit to prevent quota exhaustion\n",
" time.sleep(rate_limit)\n",
"\n",
" request = discoveryengine_v1.ListDocumentsRequest(\n",
" parent=f'projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/0',\n",
" page_size=1000,\n",
" page_token=res.next_page_token\n",
" )\n",
"\n",
" res = client.list_documents(request=request)\n",
" docs.extend(res.documents)\n",
"\n",
" return docs\n",
"\n",
"def list_indexed_urls(\n",
" project_id: str,\n",
" location: str,\n",
" datastore_id: str,\n",
" docs: Optional[List[discoveryengine_v1.Document]] = None):\n",
" \"\"\"Get the list of docs in datastore, then parse to only urls.\"\"\"\n",
" if not docs:\n",
" docs = list_documents(project_id, location, datastore_id)\n",
" urls = [doc.content.uri for doc in docs]\n",
"from dfcx_scrapi.core.data_stores import DataStores\n",
"from dfcx_scrapi.core.search import Search\n",
"\n",
" return urls\n",
"PROJECT_ID = \"\" #@param{type: 'string'}\n",
"\n",
"def search_url(urls: List[str], url: str):\n",
" \"\"\"Searches a url in a list of urls.\"\"\"\n",
" for item in urls:\n",
" if url in item:\n",
" print(item)\n",
"s = Search()\n",
"ds = DataStores(project_id=PROJECT_ID)\n",
"\n",
"def search_doc_id(\n",
" doc_id: str, docs: Optional[List[discoveryengine_v1.Document]] = None):\n",
" \"\"\"Searches a doc_id in a list of docs.\"\"\"\n",
" if not docs:\n",
" docs = list_documents(project_id, location, datastore_id)\n",
"\n",
" doc_found = False\n",
" for doc in docs:\n",
" if doc.parent_document_id == document_id:\n",
" doc_found = True\n",
" print(doc)\n",
"\n",
" if not doc_found:\n",
" print(f\"Document not found for provided Doc ID: `{doc_id}`\")\n",
"\n",
"\n",
"def get_operations_status(operation_id: str):\n",
" \"\"\"Get the status of an import operation for Discovery Engine.\"\"\"\n",
" host = \"discoveryengine.googleapis.com\"\n",
" channel = grpc_helpers.create_channel(host)\n",
" client = operations_v1.OperationsClient(channel)\n",
"\n",
" response = client.get_operation(operation_id)\n",
"\n",
" return response\n",
"\n",
"PENDING_MESSAGE = \"\"\"\n",
"No docs found.\\n\\nIt\\'s likely one of two issues: \\n [1] Your data store is not finished indexing. \\n [2] Your data store failed indexing.\\n\n",
"If you just added your data store, it can take up to 4 hours before it will become available.\n",
"\"\"\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# USER INPUTS\n",
"You can find your `datastore_id` by following these steps:\n",
"1. Click on Gen App Builder\n",
"2. Select your App / Engine\n",
"3. Select your Available Data Store\n",
"4. Find your Data Store ID"
"ds_map = ds.get_data_stores_map(reverse=True, location=\"global\")\n",
"ds_map"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"project_id = '' #@param{type: 'string'}\n",
"location = 'global' #@param{type: 'string'}\n",
"datastore_id = '' #@param{type: 'string'}"
"# Access your datastore_id from the ds_map by using the human-readable display name\n",
"datastore_id = ds_map[\"my-cool-datastore\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Check Data Store Index Status\n",
"Use the `list_documents` method to check whether the data store has finished indexing."
"Use the `check_datastore_index_status` method to check whether the data store has finished indexing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"docs = list_documents(project_id, location, datastore_id)\n",
"\n",
"if len(docs) == 0:\n",
" print(PENDING_MESSAGE)\n",
"else:\n",
" SUCCESS_MESSAGE = f\"\"\"\n",
" Success! 🎉\\n\n",
" Your indexing is complete.\\n\n",
" Your index contains {len(docs)} documents.\n",
" \"\"\"\n",
" print(SUCCESS_MESSAGE)"
"s.check_datastore_index_status(datastore_id)"
]
},
{
Expand All @@ -243,14 +123,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"docs = list_documents(project_id, location, datastore_id)\n",
"docs = s.list_documents(datastore_id)\n",
"docs[0]"
]
},
Expand All @@ -264,14 +140,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"urls = list_indexed_urls(project_id, location, datastore_id, docs)\n",
"urls = s.list_indexed_urls(datastore_id, docs)\n",
"urls[0]"
]
},
Expand All @@ -285,11 +157,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"urls"
Expand All @@ -305,11 +173,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"import json\n",
Expand All @@ -328,14 +192,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"search_url(urls, 'tundra-250')"
"s.search_url(urls, 'tundra-250')"
]
},
{
Expand All @@ -351,16 +211,30 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"metadata": {},
"outputs": [],
"source": [
"document_id = 'a71d802406f2f0e546b621245e1cbc6a'\n",
"\n",
"s.search_doc_id(document_id=document_id, docs=docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"List the documents and search for a document ID in a single call."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_id = 'a71d802406f2f0e546b621245e1cbc6a'\n",
"\n",
"search_doc_id(document_id, docs)"
"s.search_doc_id(document_id=document_id, datastore_id=datastore_id)"
]
}
],
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
install_requires=[
'google-cloud-dialogflow-cx',
'google-cloud-aiplatform',
'google-cloud-discoveryengine',
'rouge-score'
]
)

0 comments on commit db2710d

Please sign in to comment.