diff --git a/.github/utils/generate_json_schema.py b/.github/utils/generate_json_schema.py index 36f124d36c..3ecc311cd4 100644 --- a/.github/utils/generate_json_schema.py +++ b/.github/utils/generate_json_schema.py @@ -1,5 +1,6 @@ import sys import logging +from pathlib import Path logging.basicConfig(level=logging.INFO) @@ -7,4 +8,6 @@ sys.path.append(".") from haystack.nodes._json_schema import update_json_schema -update_json_schema(update_index=True) +update_json_schema( + update_index=True, destination_path=Path(__file__).parent.parent.parent / "haystack" / "json-schemas" +) diff --git a/VERSION.txt b/VERSION.txt index 85807a07f3..f0bb29e763 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.2.1rc0 +1.3.0 diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000..8634435d76 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,25 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. + +SPHINXBUILD := sphinx-build +MAKEINFO := makeinfo + +BUILDDIR := build +SOURCE := _src/ +# SPHINXFLAGS := -a -W -n -A local=1 -d $(BUILDDIR)/doctree +SPHINXFLAGS := -A local=1 -d $(BUILDDIR)/doctree +SPHINXOPTS := $(SPHINXFLAGS) $(SOURCE) + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + $(SPHINXBUILD) -M $@ $(SPHINXOPTS) $(BUILDDIR)/$@ diff --git a/docs/_src/api/openapi/openapi-1.3.0.json b/docs/_src/api/openapi/openapi-1.3.0.json new file mode 100644 index 0000000000..cd388129fb --- /dev/null +++ b/docs/_src/api/openapi/openapi-1.3.0.json @@ -0,0 +1,834 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.3.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + 
"application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + 
}, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + 
"responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + 
"application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + 
"context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + 
"gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + 
"id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": 
"End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json index 8c8ae9c864..cd388129fb 100644 --- a/docs/_src/api/openapi/openapi.json +++ b/docs/_src/api/openapi/openapi.json @@ -2,7 +2,7 @@ "openapi": "3.0.2", "info": { "title": "Haystack REST API", - "version": "1.2.1rc0" + "version": "1.3.0" }, "paths": { "/initialized": { diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000000..7d79440912 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,38 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=_src/ +set BUILDDIR=build +set SPHINXFLAGS=-a -n -A local=1 +set SPHINXOPTS=%SPHINXFLAGS% %SOURCE% +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -b %1 %ALLSPINXOPTS% %BUILDDIR%/%1 +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/release_docs.sh b/docs/release_docs.sh old mode 100644 new mode 100755 index cb7563ef06..9ad53eca72 --- a/docs/release_docs.sh +++ b/docs/release_docs.sh @@ -6,4 +6,4 @@ # Create folder for new docs veresion mkdir "$1" -cp -ar make.bat Makefile _src static templates "$1" \ No newline at end of file +cp -a make.bat Makefile _src "$1" \ No newline at end of file diff --git a/docs/v1.3.0/_src/api/Makefile b/docs/v1.3.0/_src/api/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/docs/v1.3.0/_src/api/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/v1.3.0/_src/api/_static/floating_sidebar.css b/docs/v1.3.0/_src/api/_static/floating_sidebar.css new file mode 100644 index 0000000000..e59adc6722 --- /dev/null +++ b/docs/v1.3.0/_src/api/_static/floating_sidebar.css @@ -0,0 +1,29 @@ +div.sphinxsidebarwrapper { + position: relative; + top: 0px; + padding: 0; +} + +div.sphinxsidebar { + margin: 0; + padding: 0 15px 0 15px; + width: 210px; + float: left; + font-size: 1em; + text-align: left; +} + +div.sphinxsidebar .logo { + font-size: 1.8em; + color: #0A507A; + font-weight: 300; + text-align: center; +} + +div.sphinxsidebar .logo img { + vertical-align: middle; +} + +div.sphinxsidebar .download a img { + vertical-align: middle; +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/api/_templates/xxlayout.html b/docs/v1.3.0/_src/api/_templates/xxlayout.html new file mode 100644 index 0000000000..de71588332 --- /dev/null +++ b/docs/v1.3.0/_src/api/_templates/xxlayout.html @@ -0,0 +1,46 @@ +{# put the sidebar before the body #} +{% block sidebar1 %}{{ sidebar() }}{% endblock %} +{% block sidebar2 %}{% endblock %} + +{% block extrahead %} + +{{ super() }} +{#- if not embedded #} + + +{#- endif #} +{% endblock %} \ No newline at end of file diff --git a/docs/v1.3.0/_src/api/api/crawler.md b/docs/v1.3.0/_src/api/api/crawler.md new file mode 100644 index 0000000000..710570609b --- /dev/null +++ b/docs/v1.3.0/_src/api/api/crawler.md @@ -0,0 +1,101 @@ + + +# Module crawler + + + +## Crawler + +```python +class Crawler(BaseComponent) +``` + +Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc. + +**Example:** +```python +| from haystack.nodes.connector import Crawler +| +| crawler = Crawler(output_dir="crawled_files") +| # crawl Haystack docs, i.e. 
all pages that include haystack.deepset.ai/overview/ +| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], +| filter_urls= ["haystack\.deepset\.ai\/overview\/"]) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True) +``` + +Init object with basic params for crawling (can be overwritten later). + +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl()) +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: +0: Only initial list of urls +1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. +All URLs not matching at least one of the regular expressions will be dropped. +- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content + + + +#### crawl + +```python +def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None) -> List[Path] +``` + +Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON + +file per URL, including text and basic meta data). +You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern. +All parameters are optional here and only meant to overwrite instance attributes at runtime. +If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used. 
+ +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http addresses or single http address +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: +0: Only initial list of urls +1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. +All URLs not matching at least one of the regular expressions will be dropped. +- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content + +**Returns**: + +List of paths where the crawled webpages got stored + + + +#### run + +```python +def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False) -> Tuple[Dict, str] +``` + +Method to be executed when the Crawler is used as a Node within a Haystack pipeline. + +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http addresses or single http address +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: +0: Only initial list of urls +1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. +All URLs not matching at least one of the regular expressions will be dropped. 
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content +- `return_documents`: Return json files content + +**Returns**: + +Tuple({"paths": List of filepaths, ...}, Name of output edge) + diff --git a/docs/v1.3.0/_src/api/api/document_classifier.md b/docs/v1.3.0/_src/api/api/document_classifier.md new file mode 100644 index 0000000000..2f4ae07b62 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/document_classifier.md @@ -0,0 +1,139 @@ + + +# Module base + + + +## BaseDocumentClassifier + +```python +class BaseDocumentClassifier(BaseComponent) +``` + + + +#### timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +# Module transformers + + + +## TransformersDocumentClassifier + +```python +class TransformersDocumentClassifier(BaseDocumentClassifier) +``` + +Transformer based model for document classification using the HuggingFace's transformers framework +(https://github.com/huggingface/transformers). +While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same. +This node classifies documents and adds the output from the classification step to the document's meta data. +The meta field of the document is a dictionary with the following format: +``'meta': {'name': '450_Baelor.txt', 'classification': {'label': 'neutral', 'probability': 0.9997646, ...} }`` + +Classification is run on document's content field by default. If you want it to run on another field, +set the `classification_field` to one of document's meta fields. + +With this document_classifier, you can directly get predictions via predict() + + **Usage example at query time:** + ```python +| ... 
+| retriever = ElasticsearchRetriever(document_store=document_store) +| document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion") +| p = Pipeline() +| p.add_node(component=retriever, name="Retriever", inputs=["Query"]) +| p.add_node(component=document_classifier, name="Classifier", inputs=["Retriever"]) +| res = p.run( +| query="Who is the father of Arya Stark?", +| params={"Retriever": {"top_k": 10}} +| ) +| +| # print the classification results +| print_documents(res, max_text_len=100, print_meta=True) +| # or access the predicted class label directly +| res["documents"][0].to_dict()["meta"]["classification"]["label"] + ``` + +**Usage example at index time:** + ```python +| ... +| converter = TextConverter() +| preprocessor = Preprocessor() +| document_store = ElasticsearchDocumentStore() +| document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion", +| batch_size=16) +| p = Pipeline() +| p.add_node(component=converter, name="TextConverter", inputs=["File"]) +| p.add_node(component=preprocessor, name="Preprocessor", inputs=["TextConverter"]) +| p.add_node(component=document_classifier, name="DocumentClassifier", inputs=["Preprocessor"]) +| p.add_node(component=document_store, name="DocumentStore", inputs=["DocumentClassifier"]) +| p.run(file_paths=file_paths) + ``` + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = -1, classification_field: str = None) +``` + +Load a text classification model from Transformers. 
+ +Available models for the task of text-classification include: +- ``'bhadresh-savani/distilbert-base-uncased-emotion'`` +- ``'Hate-speech-CNERG/dehatebert-mono-english'`` + +Available models for the task of zero-shot-classification include: +- ``'valhalla/distilbart-mnli-12-3'`` +- ``'cross-encoder/nli-distilroberta-base'`` + +See https://huggingface.co/models for full list of available models. +Filter for text classification models: https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads +Filter for zero-shot classification models (NLI): https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads&search=nli + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bhadresh-savani/distilbert-base-uncased-emotion'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `use_gpu`: Whether to use GPU (if available). +- `return_all_scores`: Whether to return all prediction scores or just the one of the predicted class. Only used for task 'text-classification'. +- `task`: 'text-classification' or 'zero-shot-classification' +- `labels`: Only used for task 'zero-shot-classification'. List of strings defining class labels, e.g., +["positive", "negative"] otherwise None. Given a LABEL, the sequence fed to the model is "<cls> sequence to +classify <sep> This example is LABEL . <sep>" and the model predicts whether that sequence is a contradiction +or an entailment. +- `batch_size`: batch size to be processed at once +- `classification_field`: Name of Document's meta field to be used for classification. If left unset, Document.content is used by default. 
+ + + +#### predict + +```python +def predict(documents: List[Document]) -> List[Document] +``` + +Returns documents containing classification result in meta field. + +Documents are updated in place. + +**Arguments**: + +- `documents`: List of Document to classify + +**Returns**: + +List of Document enriched with meta information + diff --git a/docs/v1.3.0/_src/api/api/document_store.md b/docs/v1.3.0/_src/api/api/document_store.md new file mode 100644 index 0000000000..61b69bf7d8 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/document_store.md @@ -0,0 +1,4758 @@ + + +# Module base + + + +## BaseKnowledgeGraph + +```python +class BaseKnowledgeGraph(BaseComponent) +``` + +Base class for implementing Knowledge Graphs. + + + +## BaseDocumentStore + +```python +class BaseDocumentStore(BaseComponent) +``` + +Base class for implementing Document Stores. + + + +#### write\_documents + +```python +@abstractmethod +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Indexes documents for later queries. + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"text": "<the-actual-text>"}. +Optionally: Include meta data via {"text": "<the-actual-text>", +"meta":{"name": "<some-document-name>", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +- `index`: Optional name of index where the documents shall be written to. +If None, the DocumentStore's default index (self.index) will be used. +- `batch_size`: Number of documents that are passed to bulk function at a time. +- `duplicate_documents`: Handle duplicate documents based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicate documents +overwrite: Update any existing documents with the same ID when adding documents. 
+fail: an error is raised if the document ID of the document being added already +exists. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + +**Returns**: + +None + + + +#### get\_all\_documents + +```python +@abstractmethod +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. 
+- `batch_size`: Number of documents that are passed to bulk function at a time. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### get\_all\_documents\_generator + +```python +@abstractmethod +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ +__Example__: +```python +filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } +} +``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### get\_all\_labels\_aggregated + +```python +def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel] +``` + +Return all labels in the DocumentStore, aggregated into MultiLabel objects. + +This aggregation step helps, for example, if you collected multiple possible answers for one question and you +want now all answers bundled together in one place for evaluation. +How they are aggregated is defined by the open_domain and aggregate_by_meta parameters. +If the questions are being asked to a single document (i.e. SQuAD style), you should set open_domain=False to aggregate by question and document. +If the questions are being asked to your full collection of documents, you should set open_domain=True to aggregate just by question. +If the questions are being asked to a subslice of your document set (e.g. product review use cases), +you should set open_domain=True and populate aggregate_by_meta with the names of Label meta fields to aggregate by question and your custom meta fields. 
+For example, in a product review use case, you might set aggregate_by_meta=["product_id"] so that Labels +with the same question but different answers from different documents are aggregated into the one MultiLabel +object, provided that they have the same product_id (to be found in Label.meta["product_id"]) + +**Arguments**: + +- `index`: Name of the index to get the labels from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `open_domain`: When True, labels are aggregated purely based on the question text alone. +When False, labels are aggregated in a closed domain fashion based on the question text +and also the id of the document that the label is tied to. In this setting, this function +might return multiple MultiLabel objects with the same question string. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. 
{'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) +- `aggregate_by_meta`: The names of the Label meta fields by which to aggregate. For example: ["product_id"] +TODO drop params + + + +#### normalize\_embedding + +```python +def normalize_embedding(emb: np.ndarray) -> None +``` + +Performs L2 normalization of embeddings vector inplace. Input can be a single vector (1D array) or a matrix +(2D array). + + + +#### add\_eval\_data + +```python +def add_eval_data(filename: str, doc_index: str = "eval_document", label_index: str = "label", batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None, max_docs: Union[int, bool] = None, open_domain: bool = False, headers: Optional[Dict[str, str]] = None) +``` + +Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it. + +If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise +from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors. + +**Arguments**: + +- `filename`: Name of the file containing evaluation data (json or jsonl) +- `doc_index`: Elasticsearch index where evaluation documents should be stored +- `label_index`: Elasticsearch index where labeled questions should be stored +- `batch_size`: Optional number of documents that are loaded and processed at a time. +When set to None (default) all documents are processed at once. +- `preprocessor`: Optional PreProcessor to preprocess evaluation documents. +It can be used for splitting documents into passages (and assigning labels to corresponding passages). +Currently the PreProcessor does not support split_by sentence, cleaning nor split_overlap != 0. +When set to None (default) preprocessing is disabled. +- `max_docs`: Optional number of documents that will be loaded. +When set to None (default) all available eval documents are used. 
+- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the +same question might be found in different contexts. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### delete\_index + +```python +@abstractmethod +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### run + +```python +def run(documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None) +``` + +Run requests of document stores + +Comment: We will gradually introduce the primitives. The document stores also accept dicts and parse them to documents. +In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function +is therefore only an interim solution until the run function also accepts documents. + +**Arguments**: + +- `documents`: A list of dicts that are documents. +- `headers`: A list of headers. +- `index`: Optional name of index where the documents shall be written to. +If None, the DocumentStore's default index (self.index) will be used. +- `id_hash_keys`: List of the fields that the hashes of the ids are generated from. + + + +## KeywordDocumentStore + +```python +class KeywordDocumentStore(BaseDocumentStore) +``` + +Base class for implementing Document Stores that support keyword searches. 
+ + + +#### query + +```python +@abstractmethod +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number of documents + +that are most relevant to the query as defined by keyword matching algorithms like BM25. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: Custom query to be executed. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### get\_batches\_from\_generator + +```python +def get_batches_from_generator(iterable, n) +``` + +Batch elements of an iterable into fixed-length chunks or blocks. + + + +# Module elasticsearch + + + +## ElasticsearchDocumentStore + +```python +class ElasticsearchDocumentStore(KeywordDocumentStore) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False) +``` + +A DocumentStore using Elasticsearch to store and 
query the documents for our search. + +* Keeps all the logic to store and query documents from Elastic, incl. mapping of fields, adding filters or boosts to your queries, and storing embeddings + * You can either use an existing Elasticsearch index or create a new one via haystack + * Retrievers operate on top of this DocumentStore to find the relevant documents for a query + +**Arguments**: + +- `host`: url(s) of elasticsearch nodes +- `port`: port(s) of elasticsearch nodes +- `username`: username (standard authentication via http_auth) +- `password`: password (standard authentication via http_auth) +- `api_key_id`: ID of the API key (alternative authentication mode to the above http_auth) +- `api_key`: Secret value of the API key (alternative authentication mode to the above http_auth) +- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) +- `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. +- `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. +- `search_fields`: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] +- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). +If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. +- `name_field`: Name of field that contains the title of the doc +- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top) +- `embedding_dim`: Dimensionality of embedding vector (Only needed when using a dense retriever (e.g.
DensePassageRetriever, EmbeddingRetriever) on top) +- `custom_mapping`: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary. +- `analyzer`: Specify the default analyzer from one of the built-ins when creating a new Elasticsearch Index. +Elasticsearch also has built-in analyzers for different languages (e.g. impacting tokenization). More info at: +https://www.elastic.co/guide/en/elasticsearch/reference/7.9/analysis-analyzers.html +- `excluded_meta_data`: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]). +Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors). +- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance +- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine. +- `verify_certs`: Whether to be strict about ca certificates +- `recreate_index`: If set to True, an existing elasticsearch index will be deleted and a new one will be +created using the config you are using for initialization. Be aware that all data in the old index will be +lost if you choose to recreate the index. Be aware that both the document_index and the label_index will +be recreated. +- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case) +..deprecated:: 2.0 +This param is deprecated. In the next major version we will always try to create an index if there is no +existing index (the current behaviour when create_index=True). If you are looking to recreate an +existing index by deleting it first if it already exists, use param recreate_index. +- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g.
bulk) are made visible to search. +If set to 'wait_for', continue only after changes are visible (slow, but safe). +If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion). +More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is +more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model. +- `timeout`: Number of seconds after which an ElasticSearch request times out. +- `return_embedding`: To return document embedding +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the +ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does. +- `scroll`: Determines how long the current index is fixed, e.g. during updating all documents with embeddings. +Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h" +For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html +- `skip_missing_embeddings`: Parameter to control queries based on vector similarity when indexed documents miss embeddings. +Parameter options: (True, False) +False: Raises exception if one or more documents do not have embeddings at query time +True: Query will ignore all documents without embeddings (recommended if you concurrently index and query) +- `synonyms`: List of synonyms can be passed while elasticsearch initialization. 
+For example: [ "foo, bar => baz", + "foozball , foosball" ] +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html +- `synonym_type`: Synonym filter type can be passed. +Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process. +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html +- `use_system_proxy`: Whether to use system proxy. + + + +#### get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + + + +#### get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead +to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default. + + + +#### get\_metadata\_values\_by\_key + +```python +def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict] +``` + +Get values associated with a metadata key. The output is in the format: + +[{"value": "my-value-1", "count": 23}, {"value": "my-value-2", "count": 12}, ... ] + +**Arguments**: + +- `key`: the meta key name to get the values for. +- `query`: narrow down the scope to documents matching the query string. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `index`: Elasticsearch index where the meta values should be searched. If not supplied, +self.index will be used. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Indexes documents for later queries in Elasticsearch. + +Behaviour if a document with the same ID already exists in ElasticSearch: +a) (Default) Throw Elastic's standard error message for duplicate IDs. +b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents. +(This is only relevant if you pass your own ID when initializing a `Document`. 
+If you don't set custom IDs for your Documents or just pass a list of dictionaries here, +they will automatically get UUIDs assigned. See the `Document` class for details) + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"content": ""}. +Optionally: Include meta data via {"content": "", +"meta":{"name": ", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary +should be changed to what you have set for self.content_field and self.name_field. +- `index`: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used. +- `batch_size`: Number of documents that are passed to Elasticsearch's bulk function at a time. +- `duplicate_documents`: Handle duplicate documents based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicate documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document + +**Returns**: + +None + + + +#### write\_labels + +```python +def write_labels(labels: Union[List[Label], List[dict]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) +``` + +Write annotation labels into document store. + +**Arguments**: + +- `labels`: A list of Python dictionaries or a list of Haystack Label objects. 
+- `index`: Elasticsearch index where the labels should be stored. If not supplied, self.label_index will be used. +- `batch_size`: Number of labels that are passed to Elasticsearch's bulk function at a time. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None, index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id + + + +#### get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. + + + +#### get\_label\_count + +```python +def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of labels in the document store + + + +#### get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +#### get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +**Arguments**: + +- `index`: Name of the index to get the documents from. 
If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. 
Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. 
+ + + +#### get\_all\_labels + +```python +def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label] +``` + +Return all labels in the document store + + + +#### query + +```python +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number of documents + +that are most relevant to the query as defined by the BM25 algorithm. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation.
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query). +Optionally, ES `filter` clause can be added where the values of `terms` are placeholders +that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) +names must match with the filters dict supplied in self.retrieve(). 
+:: + + **An example custom_query:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | "filter": [ // optional custom filters + | {"terms": {"year": ${years}}}, + | {"terms": {"quarter": ${quarters}}}, + | {"range": {"date": {"gte": ${date}}}} + | ], + | } + | }, + | } + ``` + + **For this custom_query, a sample retrieve() could be:** + ```python + | self.retrieve(query="Why did the revenue increase?", + | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) + ``` + +Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. +See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. +You will find the highlighted output in the returned Document's meta field by key "highlighted". +:: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. 
+ + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### describe\_documents + +```python +def describe_documents(index=None) +``` + +Return a summary of the documents in the document store + + + +#### update\_embeddings + +```python +def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) +``` + +Updates the embeddings in the document store using the encoding model specified in the retriever. + +This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to update the embeddings. +- `index`: Index name to update +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False, +only documents without embeddings are processed. This mode can be used for +incremental updating of embeddings, wherein, only newly indexed documents +get processed. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries.
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the documents from. If None, the +DocumentStore's default index (self.index) will be used +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. 
Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### delete\_labels + +```python +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete labels in an index. All labels are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the labels from. If None, the +DocumentStore's default label index (self.label_index) will be used +- `ids`: Optional list of IDs to narrow down the labels to be deleted. +- `filters`: Optional filters to narrow down the labels to be deleted. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing elasticsearch index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +## OpenSearchDocumentStore + +```python +class OpenSearchDocumentStore(ElasticsearchDocumentStore) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(verify_certs=False, scheme="https", username="admin", password="admin", port=9200, **kwargs) +``` + +Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service. + +In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using +the KNN plugin that can scale to a large number of documents. 
+ +**Arguments**: + +- `host`: url(s) of elasticsearch nodes +- `port`: port(s) of elasticsearch nodes +- `username`: username (standard authentication via http_auth) +- `password`: password (standard authentication via http_auth) +- `api_key_id`: ID of the API key (alternative authentication mode to the above http_auth) +- `api_key`: Secret value of the API key (alternative authentication mode to the above http_auth) +- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) +- `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. +- `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. +- `search_fields`: Name of fields used by ElasticsearchRetriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] +- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). +If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. +- `name_field`: Name of field that contains the title of the doc +- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top) +Note, that in OpenSearch the similarity type for efficient approximate vector similarity calculations is tied to the embedding field's data type which cannot be changed after creation. +- `embedding_dim`: Dimensionality of embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top) +- `custom_mapping`: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary.
+- `analyzer`: Specify the default analyzer from one of the built-ins when creating a new Elasticsearch Index. +Elasticsearch also has built-in analyzers for different languages (e.g. impacting tokenization). More info at: +https://www.elastic.co/guide/en/elasticsearch/reference/7.9/analysis-analyzers.html +- `excluded_meta_data`: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]). +Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors). +- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance +- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine. +- `verify_certs`: Whether to be strict about ca certificates +- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case) +- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search. +If set to 'wait_for', continue only after changes are visible (slow, but safe). +If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion). +More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is +more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model. +Note, that the use of efficient approximate vector calculations in OpenSearch is tied to embedding_field's data type which cannot be changed after creation. +You won't be able to use approximate vector calculations on an embedding_field which was created with a different similarity value.
+In such cases a fallback to exact but slow vector calculations will happen and a warning will be displayed. +- `timeout`: Number of seconds after which an ElasticSearch request times out. +- `return_embedding`: To return document embedding +- `duplicate_documents`: Handle duplicate documents based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicate documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. +As OpenSearch currently does not support all similarity functions (e.g. dot_product) in exact vector similarity calculations, +we don't make use of exact vector similarity when index_type='flat'. Instead we use the same approximate vector similarity calculations like in 'hnsw', but further optimized for accuracy. +Exact vector similarity is only used as fallback when there's a mismatch between certain requested and indexed similarity types. +In these cases however, a warning will be displayed. See similarity param for more information. +- `scroll`: Determines how long the current index is fixed, e.g. during updating all documents with embeddings. +Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h" +For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html +- `skip_missing_embeddings`: Parameter to control queries based on vector similarity when indexed documents miss embeddings. +Parameter options: (True, False) +False: Raises exception if one or more documents do not have embeddings at query time +True: Query will ignore all documents without embeddings (recommended if you concurrently index and query) +- `synonyms`: List of synonyms can be passed while elasticsearch initialization.
+For example: [ "foo, bar => baz", + "foozball , foosball" ] +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html +- `synonym_type`: Synonym filter type can be passed. +Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process. +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +## OpenDistroElasticsearchDocumentStore + +```python +class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore) +``` + +A DocumentStore which has an Open Distro for Elasticsearch service behind it. 
+ + + +# Module memory + + + +## InMemoryDocumentStore + +```python +class InMemoryDocumentStore(BaseDocumentStore) +``` + +In-memory document store + + + +#### \_\_init\_\_ + +```python +def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000) +``` + +**Arguments**: + +- `index`: The documents are scoped to an index attribute that can be used when writing, querying, +or deleting documents. This parameter sets the default value for document index. +- `label_index`: The default value of index attribute for the labels. +- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top) +- `embedding_dim`: The size of the embedding vector. +- `return_embedding`: To return document embedding +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is +more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicate documents based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicate documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `use_gpu`: Whether to use a GPU or the CPU for calculating embedding similarity. +Falls back to CPU if no GPU is available. +- `scoring_batch_size`: Batch size of documents to calculate similarity for.
Very small batch sizes are inefficient. +Very large batch sizes can overrun GPU memory. In general you want to make sure +you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory. +Since the data is originally stored in CPU memory there is little risk of overrunning memory +when running on CPU. + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Indexes documents for later queries. + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"text": ""}. + Optionally: Include meta data via {"text": "", + "meta": {"name": ", "author": "somebody", ...}} + It can be used for filtering and is accessible in the responses of the Finder. +:param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a + separate index than the documents for search. +:param duplicate_documents: Handle duplicate documents based on parameter options. + Parameter options : ( 'skip','overwrite','fail') + skip: Ignore the duplicate documents + overwrite: Update any existing documents with the same ID when adding documents. + fail: an error is raised if the document ID of the document being added already + exists. +:raises DuplicateDocumentError: Exception trigger on duplicate document +:return: None + + + +#### write\_labels + +```python +def write_labels(labels: Union[List[dict], List[Label]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Write annotation labels into document store.
+ + + +#### get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string. + + + +#### get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None) -> List[Document] +``` + +Fetch documents by specifying a list of text id strings. + + + +#### get\_scores\_torch + +```python +def get_scores_torch(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float] +``` + +Calculate similarity scores between query embedding and a list of documents using torch. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `document_to_search`: List of documents to compare `query_emb` against. + + + +#### get\_scores\_numpy + +```python +def get_scores_numpy(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float] +``` + +Calculate similarity scores between query embedding and a list of documents using numpy. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `document_to_search`: List of documents to compare `query_emb` against. + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. +Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` +To use the same logical operator multiple times on the same level, logical operators take +optionally a list of dictionaries as value. 
+Example: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding + + + +#### update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) +``` + +Updates the embeddings in the the document store using the encoding model specified in the retriever. + +This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text +- `index`: Index name for which embeddings are to be updated. If set to None, the default self.index is used. +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False, +only documents without embeddings are processed. This mode can be used for +incremental updating of embeddings, wherein, only newly indexed documents +get processed. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. 
If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. +Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + +**Returns**: + +None + + + +#### get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. + + + +#### get\_embedding\_count + +```python +def get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +#### get\_label\_count + +```python +def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of labels in the document store. + + + +#### get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get all documents from the document store as a list. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. +Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. The methods returns a Python Generator that yields individual + +documents. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. 
Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. +Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. + + + +#### get\_all\_labels + +```python +def get_all_labels(index: str = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label] +``` + +Return all labels in the document store. + + + +#### delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. 
If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. +Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +#### delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### delete\_labels + +```python +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete labels in an index. All labels are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the labels from. If None, the +DocumentStore's default label index (self.label_index) will be used. +- `ids`: Optional list of IDs to narrow down the labels to be deleted. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +# Module sql + + + +## SQLDocumentStore + +```python +class SQLDocumentStore(BaseDocumentStore) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", duplicate_documents: str = "overwrite", check_same_thread: bool = False, isolation_level: str = None) +``` + +An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL backends. + +**Arguments**: + +- `url`: URL for SQL database as expected by SQLAlchemy. More info here: https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls +- `index`: The documents are scoped to an index attribute that can be used when writing, querying, or deleting documents. +This parameter sets the default value for document index. +- `label_index`: The default value of index attribute for the labels. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. 
+- `check_same_thread`: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior) +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) + + + +#### get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + + + +#### get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch documents by specifying a list of text id strings + + + +#### get\_documents\_by\_vector\_ids + +```python +def get_documents_by_vector_ids(vector_ids: List[str], index: Optional[str] = None, batch_size: int = 10_000) +``` + +Fetch documents by specifying a list of text vector id strings + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. 
+Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_all\_labels + +```python +def get_all_labels(index=None, filters: Optional[dict] = None, headers: Optional[Dict[str, str]] = None) +``` + +Return all labels in the document store + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None +``` + +Indexes documents for later queries. + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"text": ""}. +Optionally: Include meta data via {"text": "", +"meta":{"name": ", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +- `index`: add an optional index attribute to documents. It can be later used for filtering. For instance, +documents for evaluation can be indexed in a separate index than the documents for search. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents +but is considerably slower (default). +fail: an error is raised if the document ID of the document being added already +exists. + +**Returns**: + +None + + + +#### write\_labels + +```python +def write_labels(labels, index=None, headers: Optional[Dict[str, str]] = None) +``` + +Write annotation labels into document store. 
+ + + +#### update\_vector\_ids + +```python +def update_vector_ids(vector_id_map: Dict[str, str], index: Optional[str] = None, batch_size: int = 10_000) +``` + +Update vector_ids for given document_ids. + +**Arguments**: + +- `vector_id_map`: dict containing mapping of document_id -> vector_id. +- `index`: filter documents by the optional index attribute for documents in database. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### reset\_vector\_ids + +```python +def reset_vector_ids(index: Optional[str] = None) +``` + +Set vector IDs for all documents as None + + + +#### update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, str], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id + + + +#### get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. + + + +#### get\_label\_count + +```python +def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of labels in the document store + + + +#### delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. 
+ +**Returns**: + +None + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Example filters: {"name": ["some", "more"], "category": ["only_one"]}. +If filters are provided along with a list of IDs, this method deletes the +intersection of the two query results (documents that match the filters and +have their ID in the list). + +**Returns**: + +None + + + +#### delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### delete\_labels + +```python +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete labels from the document store. All labels are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the labels from. If None, the +DocumentStore's default label index (self.label_index) will be used. +- `ids`: Optional list of IDs to narrow down the labels to be deleted. +- `filters`: Optional filters to narrow down the labels to be deleted. 
+Example filters: {"id": ["9a196e41-f7b5-45b4-bd19-5feb7501c159", "9a196e41-f7b5-45b4-bd19-5feb7501c159"]} or {"query": ["question2"]} + +**Returns**: + +None + + + +# Module faiss + + + +## FAISSDocumentStore + +```python +class FAISSDocumentStore(SQLDocumentStore) +``` + +Document store for very large scale embedding based dense retrievers like the DPR. + +It implements the FAISS library(https://github.com/facebookresearch/faiss) +to perform similarity search on vectors. + +The document text and meta-data (for filtering) are stored using the SQLDocumentStore, while +the vector embeddings are indexed in a FAISS Index. + + + +#### \_\_init\_\_ + +```python +def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, **kwargs, ,) +``` + +**Arguments**: + +- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale +deployment, Postgres is recommended. +- `vector_dim`: Deprecated. Use embedding_dim instead. +- `embedding_dim`: The embedding vector size. Default: 768. +- `faiss_index_factory_str`: Create a new FAISS index of the specified type. +The type is determined from the given string following the conventions +of the original FAISS index factory. +Recommended options: +- "Flat" (default): Best accuracy (= exact). Becomes slow and RAM intense for > 1 Mio docs. +- "HNSW": Graph-based heuristic. If not further specified, + we use the following config: + HNSW64, efConstruction=80 and efSearch=20 +- "IVFx,Flat": Inverted Index. 
Replace x with the number of centroids aka nlist. + Rule of thumb: nlist = 10 * sqrt (num_docs) is a good starting point. +For more details see: +- Overview of indices https://github.com/facebookresearch/faiss/wiki/Faiss-indexes +- Guideline for choosing an index https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index +- FAISS Index factory https://github.com/facebookresearch/faiss/wiki/The-index-factory +Benchmarks: XXX +- `faiss_index`: Pass an existing FAISS Index, i.e. an empty one that you configured manually +or one with docs that you used in Haystack before and want to load again. +- `return_embedding`: To return document embedding. Unlike other document stores, FAISS will return normalized embeddings +- `index`: Name of index in document store to use. +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is +more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence-Transformer model. +In both cases, the returned values in Document.score are normalized to be in range [0,1]: +For `dot_product`: expit(np.asarray(raw_score / 100)) +FOr `cosine`: (raw_score + 1) / 2 +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `faiss_index_path`: Stored FAISS index file. Can be created via calling `save()`. +If specified no other params besides faiss_config_path must be specified. +- `faiss_config_path`: Stored FAISS initial configuration parameters. 
+Can be created via calling `save()` +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index +them right away in FAISS. If not, you can later call update_embeddings() to create & index them. +- `index`: (SQL) index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document + +**Returns**: + +None + + + +#### update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None, batch_size: int = 10_000) +``` + +Updates the embeddings in the the document store using the encoding model specified in the retriever. + +This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text +- `index`: Index name for which embeddings are to be updated. 
If set to None, the default self.index is used. +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False, +only documents without embeddings are processed. This mode can be used for +incremental updating of embeddings, wherein, only newly indexed documents +get processed. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + +**Returns**: + +None + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. Unlike other document stores, FAISS will return normalized embeddings +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int +``` + +Return the count of embeddings in the document store. 
+ + + +#### train\_index + +```python +def train_index(documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None, index: Optional[str] = None) +``` + +Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors. + +The train vectors should come from the same distribution as your final ones. +You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on. + +**Arguments**: + +- `documents`: Documents (incl. the embeddings) +- `embeddings`: Plain embeddings +- `index`: Name of the index to train. If None, the DocumentStore's default index (self.index) will be used. + +**Returns**: + +None + + + +#### delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete all documents from the document store. + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents from the document store. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Example filters: {"name": ["some", "more"], "category": ["only_one"]}. +If filters are provided along with a list of IDs, this method deletes the +intersection of the two query results (documents that match the filters and +have their ID in the list). + +**Returns**: + +None + + + +#### delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. 
The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `top_k`: How many documents to return +- `index`: Index name to query the document from. +- `return_embedding`: To return document embedding. Unlike other document stores, FAISS will return normalized embeddings + + + +#### save + +```python +def save(index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None) +``` + +Save FAISS Index to the specified file. + +**Arguments**: + +- `index_path`: Path to save the FAISS index to. +- `config_path`: Path to save the initial configuration parameters to. +Defaults to the same as the file path, save the extension (.json). +This file contains all the parameters passed to FAISSDocumentStore() +at creation time (for example the SQL path, embedding_dim, etc), and will be +used by the `load` method to restore the index with the appropriate configuration. + +**Returns**: + +None + + + +#### load + +```python +@classmethod +def load(cls, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None) +``` + +Load a saved FAISS index from a file and connect to the SQL database. + +Note: In order to have a correct mapping from FAISS to SQL, + make sure to use the same SQL DB that you used when calling `save()`. + +**Arguments**: + +- `index_path`: Stored FAISS index file. 
Can be created via calling `save()` +- `config_path`: Stored FAISS initial configuration parameters. +Can be created via calling `save()` + + + +# Module milvus1 + + + +## Milvus1DocumentStore + +```python +class Milvus1DocumentStore(SQLDocumentStore) +``` + +Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors. +Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR). +In contrast to FAISS, Milvus ... + - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment + - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index) + - encapsulates multiple ANN libraries (FAISS, ANNOY ...) + +This class uses Milvus for all vector related storage, processing and querying. +The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus +does not allow these data types (yet). + +Usage: +1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md) +2. Run pip install farm-haystack[milvus1] +3. Init a MilvusDocumentStore in Haystack + + + +#### \_\_init\_\_ + +```python +def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, **kwargs) +``` + +**Arguments**: + +- `sql_url`: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB.
For large scale +deployment, Postgres is recommended. If using MySQL then same server can also be used for +Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md. +- `milvus_url`: Milvus server connection URL for storing and processing vectors. +Protocol, host and port will automatically be inferred from the URL. +See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance. +- `connection_pool`: Connection pool type to connect with Milvus server. Default: "SingletonThread". +- `index`: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name"). +- `vector_dim`: Deprecated. Use embedding_dim instead. +- `embedding_dim`: The embedding vector size. Default: 768. +- `index_file_size`: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB. +When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment. +Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one. +As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048. +Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory. +(From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size) +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings. +'cosine' is recommended for Sentence Transformers. +- `index_type`: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy. 
+Some popular options: +- FLAT (default): Exact method, slow +- IVF_FLAT, inverted file based heuristic, fast +- HNSW: Graph based, fast +- ANNOY: Tree based, fast +See: https://milvus.io/docs/v1.0.0/index.md +- `index_param`: Configuration parameters for the chosen index_type needed at indexing time. +For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT. +See https://milvus.io/docs/v1.0.0/index.md +- `search_param`: Configuration parameters for the chosen index_type needed at query time +For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT. +See https://milvus.io/docs/v1.0.0/index.md +- `return_embedding`: To return document embedding. +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None) +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index +them right away in Milvus.
If not, you can later call update_embeddings() to create & index them. +- `index`: (SQL) index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document + +**Returns**: + +None + + + +#### update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None) +``` + +Updates the embeddings in the document store using the encoding model specified in the retriever. + +This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text +- `index`: (SQL) index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False, +only documents without embeddings are processed. This mode can be used for +incremental updating of embeddings, wherein, only newly indexed documents +get processed. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `top_k`: How many documents to return +- `index`: (SQL) index name for storing the docs and metadata +- `return_embedding`: To return document embedding + +**Returns**: + +list of Documents that are the most similar to `query_emb` + + + +#### delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete all documents (from SQL AND Milvus). + +**Arguments**: + +- `index`: (SQL) index name for storing the docs and metadata +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. 
+Example filters: {"name": ["some", "more"], "category": ["only_one"]}. +If filters are provided along with a list of IDs, this method deletes the +intersection of the two query results (documents that match the filters and +have their ID in the list). + +**Returns**: + +None + + + +#### delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store (optionally using filter criteria). + +**Arguments**: + +- `index`: Name of the index to get the documents from. 
If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + +**Arguments**: + +- `id`: ID of the document +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. + + + +#### get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch multiple documents by specifying their IDs (strings) + +**Arguments**: + +- `ids`: List of IDs of the documents +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `batch_size`: is currently not used + + + +#### get\_all\_vectors + +```python +def get_all_vectors(index: Optional[str] = None) -> List[np.ndarray] +``` + +Helper function to dump all vectors stored in Milvus server. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. + +**Returns**: + +List[np.array]: List of vectors. + + + +#### get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int +``` + +Return the count of embeddings in the document store. 
+ + + +# Module milvus2 + + + +## Milvus2DocumentStore + +```python +class Milvus2DocumentStore(SQLDocumentStore) +``` + +Limitations: +Milvus 2.0 so far doesn't support the deletion of documents (https://github.com/milvus-io/milvus/issues/7130). +Therefore, delete_documents() and update_embeddings() won't work yet. + +Differences to 1.x: +Besides big architectural changes that impact performance and reliability 2.0 supports the filtering by scalar data types. +For Haystack users this means you can now run a query using vector similarity and filter for some meta data at the same time! +(See https://milvus.io/docs/v2.0.0/comparison.md for more details) + +Usage: +1. Start a Milvus service via docker (see https://milvus.io/docs/v2.0.0/install_standalone-docker.md) +2. Run pip install farm-haystack[milvus] +3. Init a MilvusDocumentStore() in Haystack + +Overview: +Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors. +Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR). + +In contrast to FAISS, Milvus ... + - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment + - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index) + - encapsulates multiple ANN libraries (FAISS, ANNOY ...) + +This class uses Milvus for all vector related storage, processing and querying. +The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus +does not allow these data types (yet). 
+ + + +#### \_\_init\_\_ + +```python +def __init__(sql_url: str = "sqlite:///", host: str = "localhost", port: str = "19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: str = "IVF_FLAT", index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", id_field: str = "id", custom_fields: Optional[List[Any]] = None, progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, consistency_level: int = 0) +``` + +**Arguments**: + +- `sql_url`: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale +deployment, Postgres is recommended. If using MySQL then same server can also be used for +Milvus metadata. For more details see https://milvus.io/docs/v2.0.0/data_manage.md. +- `milvus_url`: Milvus server connection URL for storing and processing vectors. +Protocol, host and port will automatically be inferred from the URL. +See https://milvus.io/docs/v2.0.0/install_milvus.md for instructions to start a Milvus instance. +- `connection_pool`: Connection pool type to connect with Milvus server. Default: "SingletonThread". +- `index`: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name"). +- `vector_dim`: Deprecated. Use embedding_dim instead. +- `embedding_dim`: The embedding vector size. Default: 768. +- `index_file_size`: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB. +When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment. +Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one. 
+As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048. +Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory. +(From https://milvus.io/docs/v2.0.0/performance_faq.md) +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings. +'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus. +However, you can normalize your embeddings and use `dot_product` to get the same results. +See https://milvus.io/docs/v2.0.0/metric.md. +- `index_type`: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy. +Some popular options: +- FLAT: Exact method, slow +- IVF_FLAT (default): inverted file based heuristic, fast +- HNSW: Graph based, fast +- ANNOY: Tree based, fast +See: https://milvus.io/docs/v2.0.0/index.md +- `index_param`: Configuration parameters for the chosen index_type needed at indexing time. +For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT. +See https://milvus.io/docs/v2.0.0/index.md +- `search_param`: Configuration parameters for the chosen index_type needed at query time +For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT. +See https://milvus.io/docs/v2.0.0/index.md +- `return_embedding`: To return document embedding. +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None) +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index +them right away in Milvus. If not, you can later call `update_embeddings()` to create & index them. +- `index`: (SQL) index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document + + + +#### update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None) +``` + +Updates the embeddings in the document store using the encoding model specified in the retriever.
+ +This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text +- `index`: (SQL) index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False, +only documents without embeddings are processed. This mode can be used for +incremental updating of embeddings, wherein, only newly indexed documents +get processed. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `top_k`: How many documents to return +- `index`: (SQL) index name for storing the docs and metadata +- `return_embedding`: To return document embedding + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) +``` + +Delete all documents (from SQL AND Milvus).
+ +**Arguments**: + +- `index`: (SQL) index name for storing the docs and metadata +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store (optionally using filter criteria). + +**Arguments**: + +- `index`: Name of the index to get the documents from. 
If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + +**Arguments**: + +- `id`: ID of the document +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. + + + +#### get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch multiple documents by specifying their IDs (strings) + +**Arguments**: + +- `ids`: List of IDs of the documents +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +# Module weaviate + + + +## WeaviateDocumentStore + +```python +class WeaviateDocumentStore(BaseDocumentStore) +``` + +Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. +(See https://www.semi.technology/developers/weaviate/current/index.html#what-is-weaviate) + +Some of the key differences in contrast to FAISS & Milvus: +1. 
Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up +2. Allows combination of vector search and scalar filtering, i.e. you can filter for a certain tag and do dense retrieval on that subset +3. Has less variety of ANN algorithms, as of now only HNSW. +4. Requires document ids to be in uuid-format. If wrongly formatted ids are provided at indexing time they will be replaced with uuids automatically. +5. Only supports cosine similarity. + +Weaviate python client is used to connect to the server, more details are here +https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html + +Usage: +1. Start a Weaviate server (see https://www.semi.technology/developers/weaviate/current/getting-started/installation.html) +2. Init a WeaviateDocumentStore in Haystack + +Limitations: +The current implementation is not supporting the storage of labels, so you cannot run any evaluation workflows. + + + +#### \_\_init\_\_ + +```python +def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", **kwargs) +``` + +**Arguments**: + +- `host`: Weaviate server connection URL for storing and processing documents and vectors. +For more details, refer "https://www.semi.technology/developers/weaviate/current/getting-started/installation.html" +- `port`: port of Weaviate instance +- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds).
+- `username`: username (standard authentication via http_auth) +- `password`: password (standard authentication via http_auth) +- `index`: Index name for document text, embedding and metadata (in Weaviate terminology, this is a "Class" in Weaviate schema). +- `embedding_dim`: The embedding vector size. Default: 768. +- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). +If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. +- `name_field`: Name of field that contains the title of the doc +- `similarity`: The similarity function used to compare document vectors. 'cosine' is the only currently supported option and default. +'cosine' is recommended for Sentence Transformers. +- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable. +Currently, only HNSW is supported. +See: https://www.semi.technology/developers/weaviate/current/more-resources/performance.html +- `custom_schema`: Allows to create custom schema in Weaviate, for more details +See https://www.semi.technology/developers/weaviate/current/data-schema/schema-configuration.html +- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-transformers" +For more details, See https://www.semi.technology/developers/weaviate/current/modules/ +- `return_embedding`: To return document embedding. +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already exists. + + + +#### get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its uuid string + + + +#### get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch documents by specifying a list of uuid strings. + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or List of `Documents`. A dummy embedding vector for each document is automatically generated if it is not provided. The document id needs to be in uuid format. Otherwise a correctly formatted uuid will be automatically generated based on the provided id. +- `index`: index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. 
+ +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document + +**Returns**: + +None + + + +#### update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id. +Overwrites only the specified fields, the unspecified ones remain unchanged. + + + +#### get\_embedding\_count + +```python +def get_embedding_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None) -> int +``` + +Return the number of embeddings in the document store, which is the same as the number of documents since +every document has a default embedding. + + + +#### get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. + + + +#### get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### query + +```python +def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number of documents + +that are most relevant to the query as defined by Weaviate semantic search. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. 
+- `custom_query`: Custom query that will be executed using query.raw method, for more details refer to +https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html +- `index`: The name of the index in the DocumentStore from which to retrieve documents + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation.
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: index name for storing the docs and metadata +- `return_embedding`: To return document embedding + + + +#### update\_embeddings + +```python +def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) +``` + +Updates the embeddings in the document store using the encoding model specified in the retriever. + +This can be useful if you want to change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to update the embeddings. +- `index`: Index name to update +- `update_existing_embeddings`: Weaviate mandates an embedding while creating the document itself. +This option must be always true for weaviate and it will update the embeddings for all the documents. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries.
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + +**Returns**: + +None + + + +#### delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. 
Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). + +**Returns**: + +None + + + +#### delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### delete\_labels + +```python +def delete_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### get\_all\_labels + +```python +def get_all_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### get\_label\_count + +```python +def get_label_count() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### write\_labels + +```python +def write_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +# Module graphdb + + + +## GraphDBKnowledgeGraph + +```python +class GraphDBKnowledgeGraph(BaseKnowledgeGraph) +``` + +Knowledge graph store that runs on a GraphDB instance. 
+ + + +#### \_\_init\_\_ + +```python +def __init__(host: str = "localhost", port: int = 7200, username: str = "", password: str = "", index: Optional[str] = None, prefixes: str = "") +``` + +Init the knowledge graph by defining the settings to connect with a GraphDB instance + +**Arguments**: + +- `host`: address of server where the GraphDB instance is running +- `port`: port where the GraphDB instance is running +- `username`: username to login to the GraphDB instance (if any) +- `password`: password to login to the GraphDB instance (if any) +- `index`: name of the index (also called repository) stored in the GraphDB instance +- `prefixes`: definitions of namespaces with a new line after each namespace, e.g., PREFIX hp: + + + +#### create\_index + +```python +def create_index(config_path: Path, headers: Optional[Dict[str, str]] = None) +``` + +Create a new index (also called repository) stored in the GraphDB instance + +**Arguments**: + +- `config_path`: path to a .ttl file with configuration settings, details: +https://graphdb.ontotext.com/documentation/free/configuring-a-repository.html#configure-a-repository-programmatically +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + + + +#### delete\_index + +```python +def delete_index(headers: Optional[Dict[str, str]] = None) +``` + +Delete the index that GraphDBKnowledgeGraph is connected to. This method deletes all data stored in the index. + +**Arguments**: + +- `headers`: Custom HTTP headers to pass to http client (e.g.
{'Authorization': 'Basic YWRtaW46cm9vdA=='}) + + + +#### import\_from\_ttl\_file + +```python +def import_from_ttl_file(index: str, path: Path, headers: Optional[Dict[str, str]] = None) +``` + +Load an existing knowledge graph represented in the form of triples of subject, predicate, and object from a .ttl file into an index of GraphDB + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance where the imported triples shall be stored +- `path`: path to a .ttl containing a knowledge graph +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + + + +#### get\_all\_triples + +```python +def get_all_triples(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored triples. Duplicates are not filtered. + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all triples stored in the index + + + +#### get\_all\_subjects + +```python +def get_all_subjects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored subjects. Duplicates are not filtered. + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all subjects stored in the index + + + +#### get\_all\_predicates + +```python +def get_all_predicates(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored predicates. Duplicates are not filtered. 
+ +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all predicates stored in the index + + + +#### get\_all\_objects + +```python +def get_all_objects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered. + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all objects stored in the index + + + +#### query + +```python +def query(sparql_query: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Execute a SPARQL query on the given index in the GraphDB instance + +**Arguments**: + +- `sparql_query`: SPARQL query that shall be executed +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +query result + + + +# Module deepsetcloud + + + +## DeepsetCloudDocumentStore + +```python +class DeepsetCloudDocumentStore(KeywordDocumentStore) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(api_key: str = None, workspace: str = "default", index: str = "default", duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False) +``` + +A DocumentStore facade enabling you to interact with the documents stored in Deepset Cloud. + +Thus you can run experiments like trying new nodes, pipelines, etc. without having to index your data again. + +DeepsetCloudDocumentStore is not intended for use in production-like scenarios. 
+See https://haystack.deepset.ai/components/document-store for more information. + +**Arguments**: + +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `workspace`: workspace in Deepset Cloud +- `index`: index to access within the Deepset Cloud workspace +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is +more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model. +- `return_embedding`: To return document embedding. + + + +#### get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: Number of documents that are passed to bulk function at a time. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `headers`: Custom HTTP headers to pass to requests + + + +#### query + +```python +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number of documents + +that are most relevant to the query as defined by the BM25 algorithm. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation.
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: Custom query to be executed. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to requests + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Indexes documents for later queries. + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"text": ""}. +Optionally: Include meta data via {"text": "", +"meta":{"name": ", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +- `index`: Optional name of index where the documents shall be written to. +If None, the DocumentStore's default index (self.index) will be used. +- `batch_size`: Number of documents that are passed to bulk function at a time. 
+- `duplicate_documents`: Handle duplicate documents based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicate documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + +**Returns**: + +None + + + +# Module pinecone + + + +## PineconeDocumentStore + +```python +class PineconeDocumentStore(SQLDocumentStore) +``` + +Document store for very large scale embedding based dense retrievers like the DPR. This is a hosted document store, +which means that your vectors will not be stored locally but in the cloud. This means that the similarity +search will be run on the cloud as well. + +It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) +to perform similarity search on vectors. In order to use this document store, you need an API key that you can +obtain by creating an account on the [Pinecone website](https://www.pinecone.io). + +The document text is stored using the SQLDocumentStore, while +the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index. + + + +#### \_\_init\_\_ + +```python +def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite") +``` + +**Arguments**: + +- `api_key`: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)).
+- `environment`: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are +supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required. +- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale +deployment, Postgres is recommended. +- `pinecone_index`: pinecone-client Index object, an index will be initialized or loaded if not specified. +- `embedding_dim`: The embedding vector size. +- `return_embedding`: Whether to return document embeddings. +- `index`: Name of index in document store to use. +- `similarity`: The similarity function used to compare document vectors. `"cosine"` is the default +and is recommended if you are using a Sentence-Transformer model. `"dot_product"` is more performant +with DPR embeddings. +In both cases, the returned values in Document.score are normalized to be in range [0,1]: + - For `"dot_product"`: `expit(np.asarray(raw_score / 100))` + - For `"cosine"`: `(raw_score + 1) / 2` +- `replicas`: The number of replicas. Replicas duplicate the index. They provide higher availability and +throughput. +- `shards`: The number of shards to be used in the index. We recommend using 1 shard per 1GB of data. +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicate documents based on parameter options.\ +Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists.
+ + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or list of `Documents`. If they already contain embeddings, we'll index them +right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. +- `index`: Index name for storing the docs and metadata. +- `batch_size`: Number of documents to process at a time. When working with large number of documents, +batching can help to reduce the memory footprint. +- `duplicate_documents`: handle duplicate documents based on parameter options. +Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. +- `headers`: PineconeDocumentStore does not support headers. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document. + + + +#### update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32) +``` + +Updates the embeddings in the document store using the encoding model specified in the retriever. + +This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the +retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text. +- `index`: Index name for which embeddings are to be updated. If set to `None`, the default `self.index` is +used. 
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to `False`, +only documents without embeddings are processed. This mode can be used for incremental updating of +embeddings, wherein, only newly indexed documents get processed. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: Number of documents to process at a time. When working with large number of documents, +batching can help reduce memory footprint. + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. 
Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: PineconeDocumentStore does not support headers. + + + +#### get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None) -> int +``` + +Return the count of embeddings in the document store. 
+ + + +#### update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, str], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string ID. + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents from the document store. + +**Arguments**: + +- `index`: Index name to delete the documents from. If `None`, the DocumentStore's default index +(`self.index`) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `headers`: PineconeDocumentStore does not support headers.
+ + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR). +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return. +- `index`: The name of the index from which to retrieve documents. +- `return_embedding`: Whether to return document embedding. +- `headers`: PineconeDocumentStore does not support headers. + + + +#### load + +```python +@classmethod +def load(cls) +``` + +Default class method used for loading indexes. Not applicable to the PineconeDocumentStore. + + + +# Module utils + + + +#### eval\_data\_from\_json + +```python +def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Tuple[List[Document], List[Label]] +``` + +Read Documents + Labels from a SQuAD-style file. + +Document and Labels can then be indexed to the DocumentStore and be used for evaluation. + +**Arguments**: + +- `filename`: Path to file in SQuAD format +- `max_docs`: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents. +- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the same question might be found in different contexts. + + + +#### eval\_data\_from\_jsonl + +```python +def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Generator[Tuple[List[Document], List[Label]], None, None] +``` + +Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line. + +Document and Labels can then be indexed to the DocumentStore and be used for evaluation. 
+ +This is a generator which will yield one tuple per iteration containing a list +of batch_size documents and a list with the documents' labels. +If batch_size is set to None, this method will yield all documents and labels. + +**Arguments**: + +- `filename`: Path to file in SQuAD format +- `max_docs`: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents. +- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the same question might be found in different contexts. + + + +#### squad\_json\_to\_jsonl + +```python +def squad_json_to_jsonl(squad_file: str, output_file: str) +``` + +Converts a SQuAD-json-file into jsonl format with one document per line. + +**Arguments**: + +- `squad_file`: SQuAD-file in json format. +- `output_file`: Name of output file (SQuAD in jsonl format) + + + +#### convert\_date\_to\_rfc3339 + +```python +def convert_date_to_rfc3339(date: str) -> str +``` + +Converts a date to RFC3339 format, as Weaviate requires dates to be in RFC3339 format including the time and +timezone. + +If the provided date string does not contain a time and/or timezone, we use 00:00 as default time +and UTC as default time zone. + +This method cannot be part of WeaviateDocumentStore, as this would result in a circular import between weaviate.py +and filter_utils.py. 
+ + + +#### open\_search\_index\_to\_document\_store + +```python +def open_search_index_to_document_store(document_store: "BaseDocumentStore", original_index_name: str, original_content_field: str, original_name_field: Optional[str] = None, included_metadata_fields: Optional[List[str]] = None, excluded_metadata_fields: Optional[List[str]] = None, store_original_ids: bool = True, index: Optional[str] = None, preprocessor: Optional[PreProcessor] = None, batch_size: int = 10_000, host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "admin", password: str = "admin", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, scheme: str = "https", ca_certs: Optional[str] = None, verify_certs: bool = False, timeout: int = 30, use_system_proxy: bool = False) -> "BaseDocumentStore" +``` + +This function provides brownfield support of existing OpenSearch indexes by converting each of the records in + +the provided index to haystack `Document` objects and writing them to the specified `DocumentStore`. It can be used +on a regular basis in order to add new records of the OpenSearch index to the `DocumentStore`. + +**Arguments**: + +- `document_store`: The haystack `DocumentStore` to write the converted `Document` objects to. +- `original_index_name`: OpenSearch index containing the records to be converted. +- `original_content_field`: OpenSearch field containing the text to be put in the `content` field of the +resulting haystack `Document` objects. +- `original_name_field`: Optional OpenSearch field containing the title of the Document. +- `included_metadata_fields`: List of OpenSearch fields that shall be stored in the `meta` field of the +resulting haystack `Document` objects. If `included_metadata_fields` and `excluded_metadata_fields` are `None`, +all the fields found in the OpenSearch records will be kept as metadata. 
You can specify only one of the +`included_metadata_fields` and `excluded_metadata_fields` parameters. +- `excluded_metadata_fields`: List of OpenSearch fields that shall be excluded from the `meta` field of the +resulting haystack `Document` objects. If `included_metadata_fields` and `excluded_metadata_fields` are `None`, +all the fields found in the OpenSearch records will be kept as metadata. You can specify only one of the +`included_metadata_fields` and `excluded_metadata_fields` parameters. +- `store_original_ids`: Whether to store the ID a record had in the original OpenSearch index at the +`"_original_es_id"` metadata field of the resulting haystack `Document` objects. This should be set to `True` +if you want to continuously update the `DocumentStore` with new records inside your OpenSearch index. If this +parameter was set to `False` on the first call of `open_search_index_to_document_store`, +all the indexed Documents in the `DocumentStore` will be overwritten in the second call. +- `index`: Name of index in `document_store` to use to store the resulting haystack `Document` objects. +- `preprocessor`: Optional PreProcessor that will be applied on the content field of the original OpenSearch +record. +- `batch_size`: Number of records to process at once. +- `host`: URL(s) of OpenSearch nodes. +- `port`: Port(s) of OpenSearch nodes. +- `username`: Username (standard authentication via http_auth). +- `password`: Password (standard authentication via http_auth). +- `api_key_id`: ID of the API key (alternative authentication mode to the above http_auth). +- `api_key`: Secret value of the API key (alternative authentication mode to the above http_auth). +- `aws4auth`: Authentication for usage with AWS OpenSearch +(can be generated with the requests-aws4auth package). +- `scheme`: `"https"` or `"http"`, protocol used to connect to your OpenSearch instance. +- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk.
+You can use certifi package with `certifi.where()` to find where the CA certs file is located in your machine. +- `verify_certs`: Whether to be strict about ca certificates. +- `timeout`: Number of seconds after which an OpenSearch request times out. +- `use_system_proxy`: Whether to use system proxy. + + + +#### elasticsearch\_index\_to\_document\_store + +```python +def elasticsearch_index_to_document_store(document_store: "BaseDocumentStore", original_index_name: str, original_content_field: str, original_name_field: Optional[str] = None, included_metadata_fields: Optional[List[str]] = None, excluded_metadata_fields: Optional[List[str]] = None, store_original_ids: bool = True, index: Optional[str] = None, preprocessor: Optional[PreProcessor] = None, batch_size: int = 10_000, host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, timeout: int = 30, use_system_proxy: bool = False) -> "BaseDocumentStore" +``` + +This function provides brownfield support of existing Elasticsearch indexes by converting each of the records in + +the provided index to haystack `Document` objects and writing them to the specified `DocumentStore`. It can be used +on a regular basis in order to add new records of the Elasticsearch index to the `DocumentStore`. + +**Arguments**: + +- `document_store`: The haystack `DocumentStore` to write the converted `Document` objects to. +- `original_index_name`: Elasticsearch index containing the records to be converted. +- `original_content_field`: Elasticsearch field containing the text to be put in the `content` field of the +resulting haystack `Document` objects. +- `original_name_field`: Optional Elasticsearch field containing the title of the Document. 
+- `included_metadata_fields`: List of Elasticsearch fields that shall be stored in the `meta` field of the +resulting haystack `Document` objects. If `included_metadata_fields` and `excluded_metadata_fields` are `None`, +all the fields found in the Elasticsearch records will be kept as metadata. You can specify only one of the +`included_metadata_fields` and `excluded_metadata_fields` parameters. +- `excluded_metadata_fields`: List of Elasticsearch fields that shall be excluded from the `meta` field of the +resulting haystack `Document` objects. If `included_metadata_fields` and `excluded_metadata_fields` are `None`, +all the fields found in the Elasticsearch records will be kept as metadata. You can specify only one of the +`included_metadata_fields` and `excluded_metadata_fields` parameters. +- `store_original_ids`: Whether to store the ID a record had in the original Elasticsearch index at the +`"_original_es_id"` metadata field of the resulting haystack `Document` objects. This should be set to `True` +if you want to continuously update the `DocumentStore` with new records inside your Elasticsearch index. If this +parameter was set to `False` on the first call of `elasticsearch_index_to_document_store`, +all the indexed Documents in the `DocumentStore` will be overwritten in the second call. +- `index`: Name of index in `document_store` to use to store the resulting haystack `Document` objects. +- `preprocessor`: Optional PreProcessor that will be applied on the content field of the original Elasticsearch +record. +- `batch_size`: Number of records to process at once. +- `host`: URL(s) of Elasticsearch nodes. +- `port`: Port(s) of Elasticsearch nodes. +- `username`: Username (standard authentication via http_auth). +- `password`: Password (standard authentication via http_auth). +- `api_key_id`: ID of the API key (alternative authentication mode to the above http_auth).
+- `api_key`: Secret value of the API key (alternative authentication mode to the above http_auth). +- `aws4auth`: Authentication for usage with AWS Elasticsearch +(can be generated with the requests-aws4auth package). +- `scheme`: `"https"` or `"http"`, protocol used to connect to your Elasticsearch instance. +- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. +You can use the certifi package with `certifi.where()` to find where the CA certs file is located on your machine. +- `verify_certs`: Whether to be strict about CA certificates. +- `timeout`: Number of seconds after which an Elasticsearch request times out. +- `use_system_proxy`: Whether to use system proxy. + diff --git a/docs/v1.3.0/_src/api/api/evaluation.md b/docs/v1.3.0/_src/api/api/evaluation.md new file mode 100644 index 0000000000..fed855e80b --- /dev/null +++ b/docs/v1.3.0/_src/api/api/evaluation.md @@ -0,0 +1,147 @@ + + +# Module evaluator + + + +## EvalDocuments + +```python +class EvalDocuments(BaseComponent) +``` + +This is a pipeline node that should be placed after a node that returns a List of Document, e.g., Retriever or +Ranker, in order to assess its performance. Performance metrics are stored in this class and updated as each +sample passes through it. To view the results of the evaluation, call EvalDocuments.print(). Note that results +from this Node may differ from that when calling Retriever.eval() since that is a closed domain evaluation. Have +a look at our evaluation tutorial for more info about open vs closed domain eval ( +https://haystack.deepset.ai/tutorials/evaluation). + +EvalDocuments node is deprecated and will be removed in a future version. +Please use pipeline.eval() instead.
+ + + +#### \_\_init\_\_ + +```python +def __init__(debug: bool = False, open_domain: bool = True, top_k: int = 10) +``` + +**Arguments**: + +- `open_domain`: When True, a document is considered correctly retrieved so long as the answer string can be found within it. +When False, correct retrieval is evaluated based on document_id. +- `debug`: When True, a record of each sample and its evaluation will be stored in EvalDocuments.log +- `top_k`: calculate eval metrics for top k results, e.g., recall@k + + + +#### run + +```python +def run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None) +``` + +Run this node on one sample and its labels + + + +#### print + +```python +def print() +``` + +Print the evaluation results + + + +## EvalAnswers + +```python +class EvalAnswers(BaseComponent) +``` + +This is a pipeline node that should be placed after a Reader in order to assess the performance of the Reader +individually or to assess the extractive QA performance of the whole pipeline. Performance metrics are stored in +this class and updated as each sample passes through it. To view the results of the evaluation, call EvalAnswers.print(). +Note that results from this Node may differ from that when calling Reader.eval() +since that is a closed domain evaluation. Have a look at our evaluation tutorial for more info about +open vs closed domain eval (https://haystack.deepset.ai/tutorials/evaluation). + +EvalAnswers node is deprecated and will be removed in a future version. +Please use pipeline.eval() instead. 
+ + + +#### \_\_init\_\_ + +```python +def __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False) +``` + +**Arguments**: + +- `skip_incorrect_retrieval`: When set to True, this eval will ignore the cases where the retriever returned no correct documents +- `open_domain`: When True, extracted answers are evaluated purely on string similarity rather than the position of the extracted answer +- `sas_model`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. +The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. +Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. +More info in the paper: https://arxiv.org/abs/2108.06130 +Models: +- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. + Not all cross encoders can be used because of different return types. 
+ If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class +- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" +- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" +- Large model for German only: "deepset/gbert-large-sts" +- `debug`: When True, a record of each sample and its evaluation will be stored in EvalAnswers.log + + + +#### run + +```python +def run(labels: List[Label], answers: List[Answer], correct_retrieval: bool) +``` + +Run this node on one sample and its labels + + + +#### print + +```python +def print(mode) +``` + +Print the evaluation results + + + +#### semantic\_answer\_similarity + +```python +def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True) -> Tuple[List[float], List[float]] +``` + +Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1. + +Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels + b) the highest similarity of all predictions to gold labels + +**Arguments**: + +- `predictions`: Predicted answers as list of multiple preds per question +- `gold_labels`: Labels as list of multiple possible answers per question +- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model, should be path or string +pointing to downloadable models. +- `batch_size`: Number of prediction label pairs to encode at once. +- `use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. 
+ +**Returns**: + +top_1_sas, top_k_sas + diff --git a/docs/v1.3.0/_src/api/api/extractor.md b/docs/v1.3.0/_src/api/api/extractor.md new file mode 100644 index 0000000000..26262ef12b --- /dev/null +++ b/docs/v1.3.0/_src/api/api/extractor.md @@ -0,0 +1,58 @@ + + +# Module entity + + + +## EntityExtractor + +```python +class EntityExtractor(BaseComponent) +``` + +This node is used to extract entities out of documents. +The most common use case for this would be as a named entity extractor. +The default model used is dslim/bert-base-NER. +This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only, +or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities. +The entities extracted by this Node will populate Document.entities + + + +#### run + +```python +def run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str] +``` + +This is the method called when this node is used in a pipeline + + + +#### extract + +```python +def extract(text) +``` + +This function can be called to perform entity extraction when using the node in isolation. + + + +#### simplify\_ner\_for\_qa + +```python +def simplify_ner_for_qa(output) +``` + +Returns a simplified version of the output dictionary +with the following structure: +[ + { + answer: { ... } + entities: [ { ... }, {} ] + } +] +The entities included are only the ones that overlap with +the answer itself. + diff --git a/docs/v1.3.0/_src/api/api/file_classifier.md b/docs/v1.3.0/_src/api/api/file_classifier.md new file mode 100644 index 0000000000..363b31a7e2 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/file_classifier.md @@ -0,0 +1,47 @@ + + +# Module file\_type + + + +## FileTypeClassifier + +```python +class FileTypeClassifier(BaseComponent) +``` + +Route files in an Indexing Pipeline to corresponding file converters. 
+ + + +#### \_\_init\_\_ + +```python +def __init__(supported_types: List[str] = DEFAULT_TYPES) +``` + +Node that sends out files on a different output edge depending on their extension. + +**Arguments**: + +- `supported_types`: the file types that this node can distinguish. +Note that it's limited to a maximum of 10 outgoing edges, which +correspond each to a file extension. Such extensions are, by default +`txt`, `pdf`, `md`, `docx`, `html`. Lists containing more than 10 +elements will not be allowed. Lists with duplicate elements will +also be rejected. + + + +#### run + +```python +def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]]) +``` + +Sends out files on a different output edge depending on their extension. + +**Arguments**: + +- `file_paths`: paths to route on different edges. + diff --git a/docs/v1.3.0/_src/api/api/file_converter.md b/docs/v1.3.0/_src/api/api/file_converter.md new file mode 100644 index 0000000000..76d1543630 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/file_converter.md @@ -0,0 +1,455 @@ + + +# Module base + + + +## BaseConverter + +```python +class BaseConverter(BaseComponent) +``` + +Base class for implementing file converters to transform input documents to text format for ingestion in DocumentStore. + + + +#### \_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) +``` + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could be possible candidates for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. + + + +#### convert + +```python +@abstractmethod +def convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> List[Dict[str, Any]] +``` + +Convert a file to a dictionary containing the text and any associated meta data. + +File converters may extract file meta like name or size. In addition to it, user +supplied meta data like author, url, external IDs can be supplied as a dictionary. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Select the file encoding (default is `utf-8`) + + + +#### validate\_language + +```python +def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool +``` + +Validate if the language of the text is one of valid languages. 
+ + + +# Module docx + + + +## DocxToTextConverter + +```python +class DocxToTextConverter(BaseConverter) +``` + + + +#### convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None) -> List[Dict[str, Any]] +``` + +Extract text from a .docx file. + +Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here. +For compliance with other converters we nevertheless opted for keeping the methods name. + +**Arguments**: + +- `file_path`: Path to the .docx file you want to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Not applicable + + + +# Module image + + + +## ImageToTextConverter + +```python +class ImageToTextConverter(BaseConverter) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"]) +``` + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. 
+The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could be possible candidates for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified here +(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html) +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. Run the following line of code to check available language packs: +# List of available languages +print(pytesseract.get_languages(config='')) + + + +#### convert + +```python +def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> List[Dict[str, Any]] +``` + +Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) + +**Arguments**: + +- `file_path`: path to image file +- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents. +Can be any custom keys and values. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could be possible candidates for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages supported by Tesseract +(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). +This option can be used to add test for encoding errors.
If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. + + + +# Module markdown + + + +## MarkdownConverter + +```python +class MarkdownConverter(BaseConverter) +``` + + + +#### convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> List[Dict[str, Any]] +``` + +Reads text from a Markdown file and executes optional preprocessing steps. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `encoding`: Select the file encoding (default is `utf-8`) +- `remove_numeric_tables`: Not applicable +- `valid_languages`: Not applicable + +**Returns**: + +Dict of format {"text": "The text from file", "meta": meta} + + + +#### markdown\_to\_text + +```python +@staticmethod +def markdown_to_text(markdown_string: str) -> str +``` + +Converts a markdown string to plaintext + +**Arguments**: + +- `markdown_string`: String in markdown format + + + +# Module pdf + + + +## PDFToTextConverter + +```python +class PDFToTextConverter(BaseConverter) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) +``` + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could be possible candidates for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. + + + +#### convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "Latin1") -> List[Dict[str, Any]] +``` + +Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html) + +**Arguments**: + +- `file_path`: Path to the .pdf file you want to convert +- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents. +Can be any custom keys and values. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding +of pdftotext. While this works well on many PDFs, it might be needed to switch to "UTF-8" or +others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...). +Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as +"xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here. 
+(See list of available encodings by running `pdftotext -listenc` in the terminal) + + + +## PDFToTextOCRConverter + +```python +class PDFToTextOCRConverter(BaseConverter) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"]) +``` + +Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could be possible candidates for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages supported by Tesseract +(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. + + + +#### convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> List[Dict[str, Any]] +``` + +Convert a file to a dictionary containing the text and any associated meta data. + +File converters may extract file meta like name or size. In addition to it, user +supplied meta data like author, url, external IDs can be supplied as a dictionary. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Select the file encoding (default is `utf-8`) + + + +# Module tika + + + +## TikaConverter + +```python +class TikaConverter(BaseConverter) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) +``` + +**Arguments**: + +- `tika_url`: URL of the Tika server +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. 
+ + + +#### convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None) -> List[Dict[str, Any]] +``` + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Not applicable + +**Returns**: + +a list of pages and the extracted meta data of the file. + + + +# Module txt + + + +## TextConverter + +```python +class TextConverter(BaseConverter) +``` + + + +#### convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> List[Dict[str, Any]] +``` + +Reads text from a txt file and executes optional preprocessing steps. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. 
+The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Select the file encoding (default is `utf-8`) + +**Returns**: + +Dict of format {"text": "The text from file", "meta": meta}} + diff --git a/docs/v1.3.0/_src/api/api/generator.md b/docs/v1.3.0/_src/api/api/generator.md new file mode 100644 index 0000000000..5fa8caa064 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/generator.md @@ -0,0 +1,251 @@ + + +# Module base + + + +## BaseGenerator + +```python +class BaseGenerator(BaseComponent) +``` + +Abstract class for Generators + + + +#### predict + +```python +@abstractmethod +def predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict +``` + +Abstract method to generate answers. + +**Arguments**: + +- `query`: Query +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `top_k`: Number of returned answers + +**Returns**: + +Generated answers plus additional infos in a dict + + + +# Module transformers + + + +## RAGenerator + +```python +class RAGenerator(BaseGenerator) +``` + +Implementation of Facebook's Retrieval-Augmented Generator (https://arxiv.org/abs/2005.11401) based on +HuggingFace's transformers (https://huggingface.co/transformers/model_doc/rag.html). + +Instead of "finding" the answer within a document, these models **generate** the answer. 
+In that sense, RAG follows a similar approach as GPT-3 but it comes with two huge advantages +for real-world applications: +a) it has a manageable model size +b) the answer generation is conditioned on retrieved documents, +i.e. the model can easily adjust to domain documents even after training has finished +(in contrast: GPT-3 relies on the web data seen during training) + +**Example** + +```python +| query = "who got the first nobel prize in physics?" +| +| # Retrieve related documents from retriever +| retrieved_docs = retriever.retrieve(query=query) +| +| # Now generate answer from query and retrieved documents +| generator.predict( +| query=query, +| documents=retrieved_docs, +| top_k=1 +| ) +| +| # Answer +| +| {'query': 'who got the first nobel prize in physics', +| 'answers': +| [{'query': 'who got the first nobel prize in physics', +| 'answer': ' albert einstein', +| 'meta': { 'doc_ids': [...], +| 'doc_scores': [80.42758 ...], +| 'doc_probabilities': [40.71379089355469, ... +| 'content': ['Albert Einstein was a ...] +| 'titles': ['"Albert Einstein"', ...] +| }}]} +``` + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True) +``` + +Load a RAG model from Transformers along with passage_embedding_model. + +See https://huggingface.co/transformers/model_doc/rag.html for more details + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. +'facebook/rag-token-nq', 'facebook/rag-sequence-nq'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. 
+- `retriever`: `DensePassageRetriever` used to embed passages for the docs passed to `predict()`. This is optional and is only needed if the docs you pass don't already contain embeddings in `Document.embedding`. +- `generator_type`: Which RAG generator implementation to use ("token" or "sequence") +- `top_k`: Number of independently generated texts to return +- `max_length`: Maximum length of generated text +- `min_length`: Minimum length of generated text +- `num_beams`: Number of beams for beam search. 1 means no beam search. +- `embed_title`: Embed the title of the passage while generating the embedding +- `prefix`: The prefix used by the generator's tokenizer. +- `use_gpu`: Whether to use GPU. Falls back on CPU if no GPU is available. + + + +#### predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Generate the answer to the input query. The generation will be conditioned on the supplied documents. + +These documents can, for example, be retrieved via the Retriever. + +**Arguments**: + +- `query`: Query +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `top_k`: Number of returned answers + +**Returns**: + +Generated answers plus additional infos in a dict like this: +```python +| {'query': 'who got the first nobel prize in physics', +| 'answers': +| [{'query': 'who got the first nobel prize in physics', +| 'answer': ' albert einstein', +| 'meta': { 'doc_ids': [...], +| 'doc_scores': [80.42758 ...], +| 'doc_probabilities': [40.71379089355469, ... +| 'content': ['Albert Einstein was a ...] +| 'titles': ['"Albert Einstein"', ...] +| }}]} +``` + + + +## Seq2SeqGenerator + +```python +class Seq2SeqGenerator(BaseGenerator) +``` + +A generic sequence-to-sequence generator based on HuggingFace's transformers. + +Text generation is supported by so called auto-regressive language models like GPT2, +XLNet, XLM, Bart, T5 and others.
In fact, any HuggingFace language model that extends +GenerationMixin can be used by Seq2SeqGenerator. + +Moreover, as language models prepare model input in their specific encoding, each model +specified with model_name_or_path parameter in this Seq2SeqGenerator should have an +accompanying model input converter that takes care of prefixes, separator tokens etc. +By default, we provide model input converters for a few well-known seq2seq language models (e.g. ELI5). +It is the responsibility of Seq2SeqGenerator user to ensure an appropriate model input converter +is either already registered or specified on a per-model basis in the Seq2SeqGenerator constructor. + +For more details on custom model input converters refer to _BartEli5Converter + + +See https://huggingface.co/transformers/main_classes/model.html?transformers.generation_utils.GenerationMixin#transformers.generation_utils.GenerationMixin +as well as https://huggingface.co/blog/how-to-generate + +For a list of all text-generation models see https://huggingface.co/models?pipeline_tag=text-generation + +**Example** + +```python +| query = "Why is Dothraki language important?" +| +| # Retrieve related documents from retriever +| retrieved_docs = retriever.retrieve(query=query) +| +| # Now generate answer from query and retrieved documents +| generator.predict( +| query=query, +| documents=retrieved_docs, +| top_k=1 +| ) +| +| # Answer +| +| {'query': 'who got the first nobel prize in physics', +| 'answers': +| [{'query': 'who got the first nobel prize in physics', +| 'answer': ' albert einstein', +| 'meta': { 'doc_ids': [...], +| 'doc_scores': [80.42758 ...], +| 'doc_probabilities': [40.71379089355469, ... +| 'content': ['Albert Einstein was a ...] +| 'titles': ['"Albert Einstein"', ...]
+| }}]} +``` + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True) +``` + +**Arguments**: + +- `model_name_or_path`: a HF model name for auto-regressive language models like GPT2, XLNet, XLM, Bart, T5 etc +- `input_converter`: an optional Callable to prepare model input for the underlying language model +specified in model_name_or_path parameter. The required `__call__` method signature for +the Callable is: +`__call__(tokenizer: PreTrainedTokenizer, query: str, documents: List[Document],` +`top_k: Optional[int] = None) -> BatchEncoding`: +- `top_k`: Number of independently generated texts to return +- `max_length`: Maximum length of generated text +- `min_length`: Minimum length of generated text +- `num_beams`: Number of beams for beam search. 1 means no beam search. +- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available. + + + +#### predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Generate the answer to the input query. The generation will be conditioned on the supplied documents. + +These documents can be retrieved via the Retriever or supplied directly via the `predict` method. + +**Arguments**: + +- `query`: Query +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `top_k`: Number of returned answers + +**Returns**: + +Generated answers + diff --git a/docs/v1.3.0/_src/api/api/other.md b/docs/v1.3.0/_src/api/api/other.md new file mode 100644 index 0000000000..c7c0098a02 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/other.md @@ -0,0 +1,120 @@ + + +# Module docs2answers + + + +## Docs2Answers + +```python +class Docs2Answers(BaseComponent) +``` + +This Node is used to convert retrieved documents into predicted answers format.
+It is useful for situations where you are calling a Retriever-only pipeline via REST API. +This ensures that your output is in a compatible format. + + + +# Module join\_docs + + + +## JoinDocuments + +```python +class JoinDocuments(BaseComponent) +``` + +A node to join documents outputted by multiple retriever nodes. + +The node allows multiple join modes: +* concatenate: combine the documents from multiple nodes. Any duplicate documents are discarded. +* merge: merge scores of documents from multiple nodes. Optionally, each input score can be given a different + `weight` & a `top_k` limit can be set. This mode can also be used for "reranking" retrieved documents. +* reciprocal_rank_fusion: combines the documents based on their rank in multiple nodes. + + + +#### \_\_init\_\_ + +```python +def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None) +``` + +**Arguments**: + +- `join_mode`: `concatenate` to combine documents from multiple retrievers, `merge` to aggregate scores of +individual documents, `reciprocal_rank_fusion` to apply rank based scoring. +- `weights`: A node-wise list (length of list must be equal to the number of input nodes) of weights for +adjusting document scores when using the `merge` join_mode. By default, equal weight is given +to each retriever score. This param is not compatible with the `concatenate` join_mode. +- `top_k_join`: Limit documents to top_k based on the resulting scores of the join. + + + +# Module join\_answers + + + +## JoinAnswers + +```python +class JoinAnswers(BaseComponent) +``` + +A node to join `Answer`s produced by multiple `Reader` nodes. + + + +#### \_\_init\_\_ + +```python +def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None) +``` + +**Arguments**: + +- `join_mode`: `"concatenate"` to combine `Answer`s from multiple `Reader`s. `"merge"` to aggregate scores +of individual `Answer`s.
+- `weights`: A node-wise list (length of list must be equal to the number of input nodes) of weights for +adjusting `Answer` scores when using the `"merge"` join_mode. By default, equal weight is assigned to each +`Reader` score. This parameter is not compatible with the `"concatenate"` join_mode. +- `top_k_join`: Limit `Answer`s to top_k based on the resulting scores of the join. + + + +# Module route\_documents + + + +## RouteDocuments + +```python +class RouteDocuments(BaseComponent) +``` + +A node to split a list of `Document`s by `content_type` or by the values of a metadata field and route them to +different nodes. + + + +#### \_\_init\_\_ + +```python +def __init__(split_by: str = "content_type", metadata_values: Optional[List[str]] = None) +``` + +**Arguments**: + +- `split_by`: Field to split the documents by, either `"content_type"` or a metadata field name. +If this parameter is set to `"content_type"`, the list of `Document`s will be split into a list containing +only `Document`s of type `"text"` (will be routed to `"output_1"`) and a list containing only `Document`s of +type `"table"` (will be routed to `"output_2"`). +If this parameter is set to a metadata field name, you need to specify the parameter `metadata_values` as +well. +- `metadata_values`: If the parameter `split_by` is set to a metadata field name, you need to provide a list +of values to group the `Document`s to. `Document`s whose metadata field is equal to the first value of the +provided list will be routed to `"output_1"`, `Document`s whose metadata field is equal to the second +value of the provided list will be routed to `"output_2"`, etc.
+ diff --git a/docs/v1.3.0/_src/api/api/pipelines.md b/docs/v1.3.0/_src/api/api/pipelines.md new file mode 100644 index 0000000000..0cce6fc5d0 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/pipelines.md @@ -0,0 +1,1407 @@ + + +# Module base + + + +## RootNode + +```python +class RootNode(BaseComponent) +``` + +RootNode feeds inputs together with corresponding params to a Pipeline. + + + +## BasePipeline + +```python +class BasePipeline(ABC) +``` + +Base class for pipelines, providing the most basic methods to load and save them in different ways. +See also the `Pipeline` class for the actual pipeline logic. + + + +#### get\_config + +```python +@abstractmethod +def get_config(return_defaults: bool = False) -> dict +``` + +Returns a configuration for the Pipeline that can be used with `Pipeline.load_from_config()`. + +**Arguments**: + +- `return_defaults`: whether to output parameters that have the default values. + + + +#### to\_code + +```python +def to_code(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False) -> str +``` + +Returns the code to create this pipeline as string. + +**Arguments**: + +- `pipeline_variable_name`: The variable name of the generated pipeline. +Default value is 'pipeline'. +- `generate_imports`: Whether to include the required import statements into the code. +Default value is True. +- `add_comment`: Whether to add a preceding comment that this code has been generated. +Default value is False. + + + +#### to\_notebook\_cell + +```python +def to_notebook_cell(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = True) +``` + +Creates a new notebook cell with the code to create this pipeline. + +**Arguments**: + +- `pipeline_variable_name`: The variable name of the generated pipeline. +Default value is 'pipeline'. +- `generate_imports`: Whether to include the required import statements into the code. +Default value is True. 
+- `add_comment`: Whether to add a preceding comment that this code has been generated. +Default value is True. + + + +#### load\_from\_config + +```python +@classmethod +@abstractmethod +def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a config dict defining the individual components and how they're tied together to form + +a Pipeline. A single config can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +Here's a sample configuration: + + ```python + | { + | "version": "1.0", + | "components": [ + | { # define all the building-blocks for Pipeline + | "name": "MyReader", # custom-name for the component; helpful for visualization & debugging + | "type": "FARMReader", # Haystack Class name for the component + | "params": {"no_ans_boost": -10, "model_name_or_path": "deepset/roberta-base-squad2"}, + | }, + | { + | "name": "MyESRetriever", + | "type": "ElasticsearchRetriever", + | "params": { + | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML + | "custom_query": None, + | }, + | }, + | {"name": "MyDocumentStore", "type": "ElasticsearchDocumentStore", "params": {"index": "haystack_test"}}, + | ], + | "pipelines": [ + | { # multiple Pipelines can be defined using the components from above + | "name": "my_query_pipeline", # a simple extractive-qa Pipeline + | "nodes": [ + | {"name": "MyESRetriever", "inputs": ["Query"]}, + | {"name": "MyReader", "inputs": ["MyESRetriever"]}, + | ], + | } + | ], + | } + ``` + +**Arguments**: + +- `pipeline_config`: the pipeline config as dict +- `pipeline_name`: if the config contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the configuration with environment variables. 
For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + + + +#### load\_from\_yaml + +```python +@classmethod +@abstractmethod +def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form + +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +Here's a sample configuration: + + ```yaml + | version: '1.0' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. +If the pipeline loads correctly regardless, save again the pipeline using `Pipeline.save_to_yaml()` to remove the warning. + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. 
+- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + + + +#### load\_from\_deepset\_cloud + +```python +@classmethod +def load_from_deepset_cloud(cls, pipeline_config_name: str, pipeline_name: str = "query", workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite_with_env_variables: bool = False) +``` + +Load Pipeline from Deepset Cloud defining the individual components and how they're tied together to form + +a Pipeline. A single config can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +In order to get a list of all available pipeline_config_names, call `list_pipelines_on_deepset_cloud()`. +Use the returned `name` as `pipeline_config_name`. + +**Arguments**: + +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +To get a list of all available pipeline_config_names, call `list_pipelines_on_deepset_cloud()`. +- `pipeline_name`: specifies which pipeline to load from config. +Deepset Cloud typically provides a 'query' and a 'index' pipeline per config. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `overwrite_with_env_variables`: Overwrite the config with environment variables. For example, +to change return_no_answer param for a FARMReader, an env +variable 'READER_PARAMS_RETURN_NO_ANSWER=False' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. 
+ + + +#### list\_pipelines\_on\_deepset\_cloud + +```python +@classmethod +def list_pipelines_on_deepset_cloud(cls, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None) -> List[dict] +``` + +Lists all pipeline configs available on Deepset Cloud. + +**Arguments**: + +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. + +Returns: + list of dictionaries: List[dict] + each dictionary: { + "name": str -> `pipeline_config_name` to be used in `load_from_deepset_cloud()`, + "..." -> additional pipeline meta information + } + example: + [{'name': 'my_super_nice_pipeline_config', + 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', + 'status': 'DEPLOYED', + 'created_at': '2022-02-01T09:57:03.803991+00:00', + 'deleted': False, + 'is_default': False, + 'indexing': {'status': 'IN_PROGRESS', + 'pending_file_count': 3, + 'total_file_count': 31}}] + + + +#### save\_to\_deepset\_cloud + +```python +@classmethod +def save_to_deepset_cloud(cls, query_pipeline: BasePipeline, index_pipeline: BasePipeline, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite: bool = False) +``` + +Saves a Pipeline config to Deepset Cloud defining the individual components and how they're tied together to form + +a Pipeline. A single config must declare a query pipeline and a index pipeline. + +**Arguments**: + +- `query_pipeline`: the query pipeline to save. +- `index_pipeline`: the index pipeline to save. +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. 
+If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `overwrite`: Whether to overwrite the config if it already exists. Otherwise, an error is raised. + + + +#### deploy\_on\_deepset\_cloud + +```python +@classmethod +def deploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60) +``` + +Deploys the pipelines of a pipeline config on Deepset Cloud. + +Blocks until the pipelines are successfully deployed, the deployment fails, or the timeout is exceeded. +If the pipelines are already deployed, no action will be taken and an info message will be logged. +If the timeout is exceeded, a TimeoutError will be raised. +If the deployment fails, a DeepsetCloudError will be raised. + +Pipeline config must be present on Deepset Cloud. See save_to_deepset_cloud() for more information. + +**Arguments**: + +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `timeout`: The time in seconds to wait until deployment completes. +If the timeout is exceeded an error will be raised. + + + +#### undeploy\_on\_deepset\_cloud + +```python +@classmethod +def undeploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60) +``` + +Undeploys the pipelines of a pipeline config on Deepset Cloud. + +Blocks until the pipelines are successfully undeployed, the undeployment fails, or the timeout is exceeded.
+If the pipelines are already undeployed, no action will be taken and an info message will be logged. +If the timeout is exceeded, a TimeoutError will be raised. +If the undeployment fails, a DeepsetCloudError will be raised. + +Pipeline config must be present on Deepset Cloud. See save_to_deepset_cloud() for more information. + +**Arguments**: + +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `timeout`: The time in seconds to wait until undeployment completes. +If the timeout is exceeded an error will be raised. + + + +## Pipeline + +```python +class Pipeline(BasePipeline) +``` + +Pipeline brings together building blocks to build a complex search pipeline with Haystack & user-defined components. + +Under-the-hood, a pipeline is represented as a directed acyclic graph of component nodes. It enables custom query +flows with options to branch queries (e.g., extractive qa vs keyword match query), merge candidate documents for a +Reader from multiple Retrievers, or re-ranking of candidate documents. + + + +#### add\_node + +```python +def add_node(component: BaseComponent, name: str, inputs: List[str]) +``` + +Add a new node to the pipeline. + +**Arguments**: + +- `component`: The object to be called when the data is passed to the node. It can be a Haystack component +(like Retriever, Reader, or Generator) or a user-defined object that implements a run() +method to process incoming data from predecessor node. +- `name`: The name for the node. It must not contain any dots. +- `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name +of node is sufficient.
For instance, a 'ElasticsearchRetriever' node would always output a single +edge with a list of documents. It can be represented as ["ElasticsearchRetriever"]. + +In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output +must be specified explicitly as "QueryClassifier.output_2". + + + +#### get\_node + +```python +def get_node(name: str) -> Optional[BaseComponent] +``` + +Get a node from the Pipeline. + +**Arguments**: + +- `name`: The name of the node. + + + +#### set\_node + +```python +def set_node(name: str, component) +``` + +Set the component for a node in the Pipeline. + +**Arguments**: + +- `name`: The name of the node. +- `component`: The component object to be set at the node. + + + +#### run + +```python +def run(query: Optional[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[MultiLabel] = None, documents: Optional[List[Document]] = None, meta: Optional[Union[dict, List[dict]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +Runs the pipeline, one node at a time. + +**Arguments**: + +- `query`: The search query (for query pipelines only) +- `file_paths`: The files to index (for indexing pipelines only) +- `labels`: +- `documents`: +- `meta`: +- `params`: Dictionary of parameters to be dispatched to the nodes. +If you want to pass a param to all nodes, you can just use: {"top_k":10} +If you want to pass it to targeted nodes, you can do: +{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. 
All debug information can +then be found in the dict returned by this method under the key "_debug" + + + +#### eval\_beir + +```python +@classmethod +def eval_beir(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict = {}, query_params: dict = {}, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]] +``` + +Runs information retrieval evaluation of a pipeline using BEIR on a specified BEIR dataset. + +See https://github.com/beir-cellar/beir for more information. + +**Arguments**: + +- `index_pipeline`: The indexing pipeline to use. +- `query_pipeline`: The query pipeline to evaluate. +- `index_params`: The params to use during indexing (see pipeline.run's params). +- `query_params`: The params to use during querying (see pipeline.run's params). +- `dataset`: The BEIR dataset to use. +- `dataset_dir`: The directory to store the dataset to. +- `top_k_values`: The top_k values each metric will be calculated for. +- `keep_index`: Whether to keep the index after evaluation. +If True the index will be kept after beir evaluation. Otherwise it will be deleted immediately afterwards. + Defaults to False. + +Returns a tuple containing the ndcg, map, recall and precision scores. +Each metric is represented by a dictionary containing the scores for each top_k value. + + + +#### eval + +```python +@send_event +def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False) -> EvaluationResult +``` + +Evaluates the pipeline by running the pipeline once per query in debug mode + +and putting together all data that is needed for evaluation, e.g. calculating metrics.
+ +**Arguments**: + +- `labels`: The labels to evaluate on +- `documents`: List of List of Document that the first node in the pipeline should get as input per multilabel. Can be used to evaluate a pipeline that consists of a reader without a retriever. +- `params`: Dictionary of parameters to be dispatched to the nodes. +If you want to pass a param to all nodes, you can just use: {"top_k":10} +If you want to pass it to targeted nodes, you can do: +{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}} +- `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. +The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. +Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. +More info in the paper: https://arxiv.org/abs/2108.06130 +Models: +- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. +Not all cross encoders can be used because of different return types. +If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class +- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" +- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" +- Large model for German only: "deepset/gbert-large-sts" +- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. +- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. 
+- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. +This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. +If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance. +If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance. +The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node. +To this end, labels are used as input to the node instead of the output of the previous node in the pipeline. +The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results based on the +values "integrated" or "isolated" in the column "eval_mode" and the evaluation report then additionally lists the upper bound of each node's evaluation metrics. + + + +#### get\_nodes\_by\_class + +```python +def get_nodes_by_class(class_type) -> List[Any] +``` + +Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses). + +This is for example helpful if you loaded a pipeline and then want to interact directly with the document store. +Example: +| from haystack.document_stores.base import BaseDocumentStore +| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) +| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) + +**Returns**: + +List of components that are an instance of the requested class + + + +#### get\_document\_store + +```python +def get_document_store() -> Optional[BaseDocumentStore] +``` + +Return the document store object used in the current pipeline.
+ +**Returns**: + +Instance of DocumentStore or None + + + +#### draw + +```python +def draw(path: Path = Path("pipeline.png")) +``` + +Create a Graphviz visualization of the pipeline. + +**Arguments**: + +- `path`: the path to save the image. + + + +#### load\_from\_yaml + +```python +@classmethod +def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form + +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +Here's a sample configuration: + + ```yaml + | version: '1.0' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. +If the pipeline loads correctly regardless, save again the pipeline using `Pipeline.save_to_yaml()` to remove the warning. + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. 
+- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + + + +#### load\_from\_config + +```python +@classmethod +def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a config dict defining the individual components and how they're tied together to form + +a Pipeline. A single config can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +Here's a sample configuration: + + ```python + | { + | "version": "0.9", + | "components": [ + | { # define all the building-blocks for Pipeline + | "name": "MyReader", # custom-name for the component; helpful for visualization & debugging + | "type": "FARMReader", # Haystack Class name for the component + | "params": {"no_ans_boost": -10, "model_name_or_path": "deepset/roberta-base-squad2"}, + | }, + | { + | "name": "MyESRetriever", + | "type": "ElasticsearchRetriever", + | "params": { + | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML + | "custom_query": None, + | }, + | }, + | {"name": "MyDocumentStore", "type": "ElasticsearchDocumentStore", "params": {"index": "haystack_test"}}, + | ], + | "pipelines": [ + | { # multiple Pipelines can be defined using the components from above + | "name": "my_query_pipeline", # a simple extractive-qa Pipeline + | "nodes": [ + | {"name": "MyESRetriever", "inputs": ["Query"]}, + | {"name": "MyReader", "inputs": ["MyESRetriever"]}, + | ], + | } + | ], + | } + ``` + +**Arguments**: + +- `pipeline_config`: the pipeline config as dict +- `pipeline_name`: if the config contains multiple pipelines, the pipeline_name to load must be set. 
+- `overwrite_with_env_variables`: Overwrite the configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + + + +#### save\_to\_yaml + +```python +def save_to_yaml(path: Path, return_defaults: bool = False) +``` + +Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`. + +**Arguments**: + +- `path`: path of the output YAML file. +- `return_defaults`: whether to output parameters that have the default values. + + + +#### get\_config + +```python +def get_config(return_defaults: bool = False) -> dict +``` + +Returns a configuration for the Pipeline that can be used with `Pipeline.load_from_config()`. + +**Arguments**: + +- `return_defaults`: whether to output parameters that have the default values. + + + +#### print\_eval\_report + +```python +def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None) +``` + +Prints evaluation report containing a metrics funnel and worst queries for further analysis. + +**Arguments**: + +- `eval_result`: The evaluation result, can be obtained by running eval(). +- `n_wrong_examples`: The number of worst queries to show. +- `metrics_filter`: The metrics to show per node. If None all metrics will be shown. + + + +## RayPipeline + +```python +class RayPipeline(Pipeline) +``` + +Ray (https://ray.io) is a framework for distributed computing. + +Ray allows distributing a Pipeline's components across a cluster of machines. The individual components of a +Pipeline can be independently scaled. For instance, an extractive QA Pipeline deployment can have three replicas +of the Reader and a single replica for the Retriever. It enables efficient resource utilization by horizontally +scaling Components. 
+ +To set the number of replicas, add `replicas` in the YAML config for the node in a pipeline: + + ```yaml + | components: + | ... + | + | pipelines: + | - name: ray_query_pipeline + | type: RayPipeline + | nodes: + | - name: ESRetriever + | replicas: 2 # number of replicas to create on the Ray cluster + | inputs: [ Query ] + ``` + +A RayPipeline can only be created with a YAML Pipeline config. +>>> from haystack.pipeline import RayPipeline +>>> pipeline = RayPipeline.load_from_yaml(path="my_pipelines.yaml", pipeline_name="my_query_pipeline") +>>> pipeline.run(query="What is the capital of Germany?") + +By default, a RayPipeline creates an instance of Ray Serve locally. To connect to an existing Ray instance, +set the `address` parameter when creating the RayPipeline instance. + + + +#### \_\_init\_\_ + +```python +def __init__(address: str = None, **kwargs) +``` + +**Arguments**: + +- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started. +- `kwargs`: Optional parameters for initializing Ray. + + + +#### load\_from\_yaml + +```python +@classmethod +def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, **kwargs) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form + +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed.
+ +Here's a sample configuration: + + ```yaml + | version: '0.9' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | type: RayPipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | replicas: 2 # number of replicas to create on the Ray cluster + | - name: MyReader + | inputs: [MyESRetriever] + ``` + + +Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. +If the pipeline loads correctly regardless, save again the pipeline using `RayPipeline.save_to_yaml()` to remove the warning. + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. +- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started. 
+ + + +## \_RayDeploymentWrapper + +```python +class _RayDeploymentWrapper() +``` + +Ray Serve supports calling of `__init__` methods on the Classes to create "deployment" instances. + +In case of Haystack, some Components like Retrievers have complex init methods that need objects +like Document Stores. + +This wrapper class encapsulates the initialization of Components. Given a Component Class +name, it creates an instance using the YAML Pipeline config. + + + +#### \_\_init\_\_ + +```python +def __init__(pipeline_config: dict, component_name: str) +``` + +Create an instance of Component. + +**Arguments**: + +- `pipeline_config`: Pipeline YAML parsed as a dict. +- `component_name`: Component Class name. + + + +#### \_\_call\_\_ + +```python +def __call__(*args, **kwargs) +``` + +Ray calls this method, which is then redirected to the corresponding component's run(). + + + +## \_HaystackBeirRetrieverAdapter + +```python +class _HaystackBeirRetrieverAdapter() +``` + + + +#### \_\_init\_\_ + +```python +def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict, query_params: dict) +``` + +Adapter mimicking a BEIR retriever used by BEIR's EvaluateRetrieval class to run BEIR evaluations on Haystack Pipelines. + +This has nothing to do with Haystack's retriever classes. +See https://github.com/beir-cellar/beir/blob/main/beir/retrieval/evaluation.py. + +**Arguments**: + +- `index_pipeline`: The indexing pipeline to use. +- `query_pipeline`: The query pipeline to evaluate. +- `index_params`: The params to use during indexing (see pipeline.run's params). +- `query_params`: The params to use during querying (see pipeline.run's params). + + + +# Module standard\_pipelines + + + +## BaseStandardPipeline + +```python +class BaseStandardPipeline(ABC) +``` + +Base class for pre-made standard Haystack pipelines. +This class does not inherit from Pipeline.
+ + + +#### add\_node + +```python +def add_node(component, name: str, inputs: List[str]) +``` + +Add a new node to the pipeline. + +**Arguments**: + +- `component`: The object to be called when the data is passed to the node. It can be a Haystack component +(like Retriever, Reader, or Generator) or a user-defined object that implements a run() +method to process incoming data from predecessor node. +- `name`: The name for the node. It must not contain any dots. +- `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name +of node is sufficient. For instance, a 'ElasticsearchRetriever' node would always output a single +edge with a list of documents. It can be represented as ["ElasticsearchRetriever"]. + +In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output +must be specified explicitly as "QueryClassifier.output_2". + + + +#### get\_node + +```python +def get_node(name: str) +``` + +Get a node from the Pipeline. + +**Arguments**: + +- `name`: The name of the node. + + + +#### set\_node + +```python +def set_node(name: str, component) +``` + +Set the component for a node in the Pipeline. + +**Arguments**: + +- `name`: The name of the node. +- `component`: The component object to be set at the node. + + + +#### draw + +```python +def draw(path: Path = Path("pipeline.png")) +``` + +Create a Graphviz visualization of the pipeline. + +**Arguments**: + +- `path`: the path to save the image. + + + +#### save\_to\_yaml + +```python +def save_to_yaml(path: Path, return_defaults: bool = False) +``` + +Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`. + +**Arguments**: + +- `path`: path of the output YAML file. +- `return_defaults`: whether to output parameters that have the default values. 
+ + + +#### load\_from\_yaml + +```python +@classmethod +def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form + +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +Here's a sample configuration: + + ```yaml + | version: '0.8' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + + + +#### get\_nodes\_by\_class + +```python +def get_nodes_by_class(class_type) -> List[Any] +``` + +Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses). 
+ +This is for example helpful if you loaded a pipeline and then want to interact directly with the document store. +Example: +```python +| from haystack.document_stores.base import BaseDocumentStore +| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) +| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) +``` + +**Returns**: + +List of components that are an instance of the requested class + + + +#### get\_document\_store + +```python +def get_document_store() -> Optional[BaseDocumentStore] +``` + +Return the document store object used in the current pipeline. + +**Returns**: + +Instance of DocumentStore or None + + + +#### eval + +```python +def eval(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, add_isolated_node_eval: bool = False) -> EvaluationResult +``` + +Evaluates the pipeline by running the pipeline once per query in debug mode + +and putting together all data that is needed for evaluation, e.g. calculating metrics. + +**Arguments**: + +- `labels`: The labels to evaluate on +- `params`: Params for the `retriever` and `reader`. For instance, +params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model to be used for sas value calculation, +should be path or string pointing to downloadable models. +- `add_isolated_node_eval`: Whether to additionally evaluate the reader based on labels as input instead of output of previous node in pipeline + + + +## ExtractiveQAPipeline + +```python +class ExtractiveQAPipeline(BaseStandardPipeline) +``` + +Pipeline for Extractive Question Answering. 
+ + + +#### \_\_init\_\_ + +```python +def __init__(reader: BaseReader, retriever: BaseRetriever) +``` + +**Arguments**: + +- `reader`: Reader instance +- `retriever`: Retriever instance + + + +#### run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: The search query string. +- `params`: Params for the `retriever` and `reader`. For instance, +params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## DocumentSearchPipeline + +```python +class DocumentSearchPipeline(BaseStandardPipeline) +``` + +Pipeline for semantic document search. + + + +#### \_\_init\_\_ + +```python +def __init__(retriever: BaseRetriever) +``` + +**Arguments**: + +- `retriever`: Retriever instance + + + +#### run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `reader`. For instance, params={"Retriever": {"top_k": 10}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## GenerativeQAPipeline + +```python +class GenerativeQAPipeline(BaseStandardPipeline) +``` + +Pipeline for Generative Question Answering. 
+ + + +#### \_\_init\_\_ + +```python +def __init__(generator: BaseGenerator, retriever: BaseRetriever) +``` + +**Arguments**: + +- `generator`: Generator instance +- `retriever`: Retriever instance + + + +#### run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `generator`. For instance, +params={"Retriever": {"top_k": 10}, "Generator": {"top_k": 5}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## SearchSummarizationPipeline + +```python +class SearchSummarizationPipeline(BaseStandardPipeline) +``` + +Pipeline that retrieves documents for a query and then summarizes those documents. + + + +#### \_\_init\_\_ + +```python +def __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False) +``` + +**Arguments**: + +- `summarizer`: Summarizer instance +- `retriever`: Retriever instance +- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer +format used in other QA pipelines (True). With the latter, you can use this +pipeline as a "drop-in replacement" for other QA pipelines. + + + +#### run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `summarizer`. For instance, +params={"Retriever": {"top_k": 10}, "Summarizer": {"generate_single_summary": True}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. 
By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## FAQPipeline + +```python +class FAQPipeline(BaseStandardPipeline) +``` + +Pipeline for finding similar FAQs using semantic document search. + + + +#### \_\_init\_\_ + +```python +def __init__(retriever: BaseRetriever) +``` + +**Arguments**: + +- `retriever`: Retriever instance + + + +#### run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever`. For instance, params={"Retriever": {"top_k": 10}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## TranslationWrapperPipeline + +```python +class TranslationWrapperPipeline(BaseStandardPipeline) +``` + +Takes an existing search pipeline and adds one "input translation node" after the Query and one +"output translation" node just before returning the results + + + +#### \_\_init\_\_ + +```python +def __init__(input_translator: BaseTranslator, output_translator: BaseTranslator, pipeline: BaseStandardPipeline) +``` + +Wrap a given `pipeline` with the `input_translator` and `output_translator`. + +**Arguments**: + +- `input_translator`: A Translator node that shall translate the input query from language A to B +- `output_translator`: A Translator node that shall translate the pipeline results from language B to A +- `pipeline`: The pipeline object (e.g. ExtractiveQAPipeline) you want to "wrap". +Note that pipelines with split or merge nodes are currently not supported. 
+ + + +## QuestionGenerationPipeline + +```python +class QuestionGenerationPipeline(BaseStandardPipeline) +``` + +A simple pipeline that takes documents as input and generates +questions that it thinks can be answered by the documents. + + + +## RetrieverQuestionGenerationPipeline + +```python +class RetrieverQuestionGenerationPipeline(BaseStandardPipeline) +``` + +A simple pipeline that takes a query as input, performs retrieval, and then generates +questions that it thinks can be answered by the retrieved documents. + + + +## QuestionAnswerGenerationPipeline + +```python +class QuestionAnswerGenerationPipeline(BaseStandardPipeline) +``` + +This is a pipeline which takes a document as input, generates questions that the model thinks can be answered by +this document, and then performs question answering of this questions using that single document. + + + +## MostSimilarDocumentsPipeline + +```python +class MostSimilarDocumentsPipeline(BaseStandardPipeline) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore) +``` + +Initialize a Pipeline for finding the most similar documents to a given document. + +This pipeline can be helpful if you already show a relevant document to your end users and they want to search for just similar ones. + +**Arguments**: + +- `document_store`: Document Store instance with already stored embeddings. 
+ + + +#### run + +```python +def run(document_ids: List[str], top_k: int = 5) +``` + +**Arguments**: + +- `document_ids`: document ids +- `top_k`: How many document ids to return for each input document + diff --git a/docs/v1.3.0/_src/api/api/preprocessor.md b/docs/v1.3.0/_src/api/api/preprocessor.md new file mode 100644 index 0000000000..aab435cb26 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/preprocessor.md @@ -0,0 +1,99 @@ + + +# Module base + + + +## BasePreProcessor + +```python +class BasePreProcessor(BaseComponent) +``` + + + +#### process + +```python +@abstractmethod +def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict] +``` + +Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. + + + +# Module preprocessor + + + +## PreProcessor + +```python +class PreProcessor(BasePreProcessor) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en") +``` + +**Arguments**: + +- `clean_header_footer`: Use heuristic to remove footers and headers across different pages by searching +for the longest common string. This heuristic uses exact matches and therefore +works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4" +or similar. +- `clean_whitespace`: Strip whitespaces before or after each line in the text. +- `clean_empty_lines`: Remove more than two empty lines in the text.
+- `remove_substrings`: Remove specified substrings from the text. +- `split_by`: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting. +- `split_length`: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if split_length -> 10 & split_by -> +"sentence", then each output document will have 10 sentences. +- `split_overlap`: Word overlap between two adjacent documents after a split. +Setting this to a positive number essentially enables the sliding window approach. +For example, if split_by -> `word`, +split_length -> 5 & split_overlap -> 2, then the splits would be like: +[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11]. +Set the value to 0 to ensure there is no overlap among the documents after splitting. +- `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set +to True, the individual split will always have complete sentences & +the number of words will be <= split_length. +- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more. + + + +#### process + +```python +def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict] +``` + +Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents. + + + +#### clean + +```python +def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str]) -> dict +``` + +Perform document cleaning on a single document and return a single document.
This method will deal with whitespaces, headers, footers +and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__(). + + + +#### split + +```python +def split(document: dict, split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool) -> List[dict] +``` + +Perform document splitting on a single document. This method can split on different units, at different lengths, +with different strides. It can also respect sentence boundaries. Its exact functionality is defined by +the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents. + diff --git a/docs/v1.3.0/_src/api/api/primitives.md b/docs/v1.3.0/_src/api/api/primitives.md new file mode 100644 index 0000000000..0a4c02efd9 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/primitives.md @@ -0,0 +1,433 @@ + + +# Module schema + + + +## Document + +```python +@dataclass +class Document() +``` + + + +#### \_\_init\_\_ + +```python +def __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None) +``` + +One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. + +Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in +many other places that manipulate or interact with document-level data. + +Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text +into smaller passages. We'll have one Document per passage in this case. + +Each document has a unique ID. This can be supplied by the user or generated automatically. +It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. 
Labels) + +There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`. + +**Arguments**: + +- `content`: Content of the document. For most cases, this will be text, but it can be a table or image. +- `content_type`: One of "text", "table" or "image". Haystack components can use this to adjust their +handling of Documents and check compatibility. +- `id`: Unique ID for the document. If not supplied by the user, we'll generate one automatically by +creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`. +- `score`: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker). +In the range of [0,1], where 1 means extremely relevant. +- `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed). +- `embedding`: Vector encoding of the text +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes. +If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]). +In this case the id will be generated by using the content and the defined metadata. + + + +#### to\_dict + +```python +def to_dict(field_map={}) -> Dict +``` + +Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the + +resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that +they are serialized / stored in other places (e.g.
elasticsearch) +Example: +| doc = Document(content="some text", content_type="text") +| doc.to_dict(field_map={"custom_content_field": "content"}) +| >>> {"custom_content_field": "some text", "content_type": "text"} + +**Arguments**: + +- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes + +**Returns**: + +dict with content of the Document + + + +#### from\_dict + +```python +@classmethod +def from_dict(cls, dict, field_map={}, id_hash_keys=None) +``` + +Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the + +input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that +they are serialized / stored in other places (e.g. elasticsearch) +Example: +| my_dict = {"custom_content_field": "some text", "content_type": "text"} +| Document.from_dict(my_dict, field_map={"custom_content_field": "content"}) + +**Arguments**: + +- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes + +**Returns**: + +Document created from the content of the input dict + + + +#### \_\_lt\_\_ + +```python +def __lt__(other) +``` + +Enable sorting of Documents by score + + + +## Span + +```python +@dataclass +class Span() +``` + + + +#### end + +Defining a sequence of characters (Text span) or cells (Table span) via start and end index. + +For extractive QA: Character where answer starts/ends +For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table) + +**Arguments**: + +- `start`: Position where the span starts +- `end`: Position where the span ends + + + +## Answer + +```python +@dataclass +class Answer() +``` + + + +#### meta + +The fundamental object in Haystack to represent any type of Answers (e.g. extractive QA, generative QA or TableQA). + +For example, it's used within some Nodes like the Reader, but also in the REST API.
+ +**Arguments**: + +- `answer`: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string. +- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model +(i.e. we can locate an exact answer string in one of the documents) or from a generative model +(i.e. no pointer to a specific document, no offsets ...). +- `score`: The relevance score of the Answer determined by a model (e.g. Reader or Generator). +In the range of [0,1], where 1 means extremely relevant. +- `context`: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...) +- `offsets_in_document`: List of `Span` objects with start and end positions of the answer **in the +document** (as stored in the document store). +For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start` +For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start` +(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here) +- `offsets_in_context`: List of `Span` objects with start and end positions of the answer **in the +context** (i.e. the surrounding text/table of a certain window size). +For extractive QA: Character where answer starts => `Answer.offsets_in_context[0].start` +For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start` +(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here) +- `document_id`: ID of the document that the answer was located in (if any) +- `meta`: Dict that can be used to associate any kind of custom meta data with the answer. +In extractive QA, this will carry the meta data of the document where the answer was found.
+ + + +#### \_\_lt\_\_ + +```python +def __lt__(other) +``` + +Enable sorting of Answers by score + + + +## Label + +```python +@dataclass +class Label() +``` + + + +#### \_\_init\_\_ + +```python +def __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None, filters: Optional[dict] = None) +``` + +Object used to represent label/feedback in a standardized way within Haystack. + +This includes labels from datasets like SQuAD, annotations from labeling tools, +or user feedback from the Haystack REST API. + +**Arguments**: + +- `query`: the question (or query) for finding answers. +- `document`: the `Document` this label refers to. +- `answer`: the answer object. +- `is_correct_answer`: whether the sample is positive or negative. +- `is_correct_document`: in case of a negative sample (is_correct_answer is False), there could be two cases; +incorrect answer but correct document & incorrect document. This flag denotes if +the returned document was correct. +- `origin`: the source for the labels. It can be used later for filtering. +- `id`: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically. +- `no_answer`: whether the question is unanswerable. +- `pipeline_id`: pipeline identifier (any str) that was involved for generating this label (in-case of user feedback). +- `created_at`: Timestamp of creation with format yyyy-MM-dd HH:mm:ss. +Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S"). +- `updated_at`: Timestamp of update with format yyyy-MM-dd HH:mm:ss. +Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S"). +- `meta`: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed).
+- `filters`: filters that should be applied to the query to rule out non-relevant documents. For example, if there are different correct answers +in a DocumentStore depending on the retrieved document and the answer in this label is correct only on condition of the filters. + + + +## MultiLabel + +```python +@dataclass +class MultiLabel() +``` + + + +#### \_\_init\_\_ + +```python +def __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False) +``` + +There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated + +answers for one question or multiple documents that contain the information you want for a query. +This class is "syntactic sugar" that simplifies the work with such a list of related Labels. +It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are +automatically created at init time. For example, MultiLabel.no_answer allows you to easily access if any of the +underlying Labels provided a text answer and therefore demonstrates that there is indeed a possible answer. + +**Arguments**: + +- `labels`: A list of labels that belong to a similar query and shall be "grouped" together +- `drop_negative_labels`: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI) +- `drop_no_answers`: Whether to drop labels that specify the answer is impossible + + + +## EvaluationResult + +```python +class EvaluationResult() +``` + + + +#### \_\_init\_\_ + +```python +def __init__(node_results: Dict[str, pd.DataFrame] = None) -> None +``` + +Convenience class to store, pass and interact with results of a pipeline evaluation run (e.g. pipeline.eval()). + +Detailed results are stored as one dataframe per node. This class makes them more accessible and provides +convenience methods to work with them. +For example, you can calculate eval metrics, get detailed reports or simulate different top_k settings.
+ +Example: +```python +| eval_results = pipeline.eval(...) +| +| # derive detailed metrics +| eval_results.calculate_metrics() +| +| # show summary of incorrect queries +| eval_results.wrong_examples() +``` + +Each row of the underlying DataFrames contains either an answer or a document that has been retrieved during evaluation. +Rows are enriched with basic infos like rank, query, type or node. +Additional answer or document specific evaluation infos like gold labels +and metrics depicting whether the row matches the gold labels are included, too. +The DataFrames have the following schema: +- multilabel_id: the id of the multilabel, which is unique for the pair of query and filters +- query: the query +- filters: the filters used with the query +- gold_answers (answers only): the answers to be given +- answer (answers only): the answer +- context (answers only): the surrounding context of the answer within the document +- exact_match (answers only): metric depicting if the answer exactly matches the gold label +- f1 (answers only): metric depicting how well the answer overlaps with the gold label on token basis +- sas (answers only, optional): metric depicting how well the answer matches the gold label on a semantic basis +- gold_document_contents (documents only): the contents of the gold documents +- content (documents only): the content of the document +- gold_id_match (documents only): metric depicting whether one of the gold document ids matches the document +- answer_match (documents only): metric depicting whether the document contains the answer +- gold_id_or_answer_match (documents only): metric depicting whether one of the former two conditions is met +- rank: rank or 1-based-position in result list +- document_id: the id of the document that has been retrieved or that contained the answer +- gold_document_ids: the documents to be retrieved +- offsets_in_document (answers only): the position or offsets within the document the answer was found +-
gold_offsets_in_documents (answers only): the position or offsets of the gold answer within the document +- type: 'answer' or 'document' +- node: the node name +- eval_mode: evaluation mode depicting whether the evaluation was executed in integrated or isolated mode. + Check pipeline.eval()'s add_isolated_node_eval param for more information. + +**Arguments**: + +- `node_results`: the evaluation Dataframes per pipeline node + + + +#### calculate\_metrics + +```python +def calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, doc_relevance_col: str = "gold_id_match", eval_mode: str = "integrated") -> Dict[str, Dict[str, float]] +``` + +Calculates proper metrics for each node. + +For document returning nodes default metrics are: +- mrr (Mean Reciprocal Rank: see https://en.wikipedia.org/wiki/Mean_reciprocal_rank) +- map (Mean Average Precision: see https://en.wikipedia.org/wiki/Evaluation_measures_%28information_retrieval%29#Mean_average_precision) +- ndcg (Normalized Discounted Cumulative Gain: see https://en.wikipedia.org/wiki/Discounted_cumulative_gain) +- precision (Precision: How many of the returned documents were relevant?) +- recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?) +- recall_single_hit (Recall for Question Answering: How many of the queries returned at least one relevant document?) + +For answer returning nodes default metrics are: +- exact_match (How many of the queries returned the exact answer?) +- f1 (How well do the returned results overlap with any gold answer on token basis?) +- sas if a SAS model has been provided during pipeline.eval() (How semantically similar is the prediction to the gold answers?) + +Lower top_k values for reader and retriever than the actual values during the eval run can be simulated. +E.g. top_1_f1 for reader nodes can be calculated by setting simulated_top_k_reader=1.
+ +Results for reader nodes with applied simulated_top_k_retriever should be considered with caution +as there are situations the result can heavily differ from an actual eval run with corresponding top_k_retriever. + +**Arguments**: + +- `simulated_top_k_reader`: simulates top_k param of reader +- `simulated_top_k_retriever`: simulates top_k param of retriever. +remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k +- `doc_relevance_col`: column in the underlying eval table that contains the relevance criteria for documents. +values can be: 'gold_id_match', 'answer_match', 'gold_id_or_answer_match' +- `eval_mode`: the input on which the node was evaluated on. +Usually nodes get evaluated on the prediction provided by its predecessor nodes in the pipeline (value='integrated'). +However, as the quality of the node itself can heavily depend on the node's input and thus the predecessor's quality, +you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your node. +For example when evaluating the reader use value='isolated' to simulate a perfect retriever in an ExtractiveQAPipeline. +Values can be 'integrated', 'isolated'. +Default value is 'integrated'. + + + +#### wrong\_examples + +```python +def wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, doc_relevance_col: str = "gold_id_match", document_metric: str = "recall_single_hit", answer_metric: str = "f1", eval_mode: str = "integrated") -> List[Dict] +``` + +Returns the worst performing queries. + +Worst performing queries are calculated based on the metric +that is either a document metric or an answer metric according to the node type. + +Lower top_k values for reader and retriever than the actual values during the eval run can be simulated. +See calculate_metrics() for more information. 
+ +**Arguments**: + +- `simulated_top_k_reader`: simulates top_k param of reader +- `simulated_top_k_retriever`: simulates top_k param of retriever. +remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k +- `doc_relevance_col`: column that contains the relevance criteria for documents. +values can be: 'gold_id_match', 'answer_match', 'gold_id_or_answer_match' +- `document_metric`: the document metric worst queries are calculated with. +values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision' +- `answer_metric`: the answer metric worst queries are calculated with. +values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model. +- `eval_mode`: the input on which the node was evaluated on. +Usually nodes get evaluated on the prediction provided by its predecessor nodes in the pipeline (value='integrated'). +However, as the quality of the node itself can heavily depend on the node's input and thus the predecessor's quality, +you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your node. +For example when evaluating the reader use value='isolated' to simulate a perfect retriever in an ExtractiveQAPipeline. +Values can be 'integrated', 'isolated'. +Default value is 'integrated'. + + + +#### save + +```python +def save(out_dir: Union[str, Path]) +``` + +Saves the evaluation result. + +The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder. + +**Arguments**: + +- `out_dir`: Path to the target folder the csvs will be saved. + + + +#### load + +```python +@classmethod +def load(cls, load_dir: Union[str, Path]) +``` + +Loads the evaluation result from disk. Expects one csv file per node. See save() for further information. + +**Arguments**: + +- `load_dir`: The directory containing the csv files.
+ diff --git a/docs/v1.3.0/_src/api/api/query_classifier.md b/docs/v1.3.0/_src/api/api/query_classifier.md new file mode 100644 index 0000000000..fb7b2a7c4a --- /dev/null +++ b/docs/v1.3.0/_src/api/api/query_classifier.md @@ -0,0 +1,151 @@ + + +# Module base + + + +## BaseQueryClassifier + +```python +class BaseQueryClassifier(BaseComponent) +``` + +Abstract class for Query Classifiers + + + +# Module sklearn + + + +## SklearnQueryClassifier + +```python +class SklearnQueryClassifier(BaseQueryClassifier) +``` + +A node to classify an incoming query into one of two categories using a lightweight sklearn model. Depending on the result, the query flows to a different branch in your pipeline +and the further processing can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` from this node. + +**Example**: + + ```python + |{ + |pipe = Pipeline() + |pipe.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) + |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) + + |# Keyword queries will use the ElasticRetriever + |pipe.run("kubernetes aws") + + |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + |pipe.run("How to manage kubernetes on aws") + + ``` + + Models: + + Pass your own `Sklearn` binary classification model or use one of the following pretrained ones: + 1) Keywords vs. 
Questions/Statements (Default) + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle) + output_1 => question/statement + output_2 => keyword query + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + + 2) Questions vs. Statements + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle) + output_1 => question + output_2 => statement + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: Union[ + str, Any + ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[ + str, Any + ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle") +``` + +**Arguments**: + +- `model_name_or_path`: Gradient boosting based binary classifier to classify between keyword vs statement/question +queries or statement vs question queries. +- `vectorizer_name_or_path`: A ngram based Tfidf vectorizer for extracting features from query. + + + +# Module transformers + + + +## TransformersQueryClassifier + +```python +class TransformersQueryClassifier(BaseQueryClassifier) +``` + +A node to classify an incoming query into one of two categories using a (small) BERT transformer model. 
+Depending on the result, the query flows to a different branch in your pipeline and the further processing +can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` +from this node. + +**Example**: + + ```python + |{ + |pipe = Pipeline() + |pipe.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) + |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) + + |# Keyword queries will use the ElasticRetriever + |pipe.run("kubernetes aws") + + |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + |pipe.run("How to manage kubernetes on aws") + + ``` + + Models: + + Pass your own `Transformer` binary classification model from file/huggingface or use one of the following + pretrained ones hosted on Huggingface: + 1) Keywords vs. Questions/Statements (Default) + model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection" + output_1 => question/statement + output_2 => keyword query + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + + 2) Questions vs. Statements + `model_name_or_path`="shahrukhx01/question-vs-statement-classifier" + output_1 => question + output_2 => statement + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + + + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", use_gpu: bool = True) +``` + +**Arguments**: + +- `model_name_or_path`: Transformer based fine tuned mini bert model for query classification +- `use_gpu`: Whether to use GPU (if available). 
+ diff --git a/docs/v1.3.0/_src/api/api/question_generator.md b/docs/v1.3.0/_src/api/api/question_generator.md new file mode 100644 index 0000000000..b6c3b792f3 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/question_generator.md @@ -0,0 +1,41 @@ + + +# Module question\_generator + + + +## QuestionGenerator + +```python +class QuestionGenerator(BaseComponent) +``` + +The Question Generator takes only a document as input and outputs questions that it thinks can be +answered by this document. In our current implementation, input texts are split into chunks of 50 words +with a 10 word overlap. This is because the default model `valhalla/t5-base-e2e-qg` seems to generate only +about 3 questions per passage regardless of length. Our approach prioritizes the creation of more questions +over processing efficiency (T5 is able to digest much more than 50 words at once). The returned questions +generally come in an order dictated by the order of their answers i.e. early questions in the list generally +come from earlier in the document. + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:") +``` + +Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is + +implemented as a Seq2SeqLM in HuggingFace Transformers. Note that this style of question generation (where the only input +is a document) is sometimes referred to as end-to-end question generation. Answer-supervised question +generation is not currently supported. + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. "valhalla/t5-base-e2e-qg". +See https://huggingface.co/models for full list of available models. 
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available. + diff --git a/docs/v1.3.0/_src/api/api/ranker.md b/docs/v1.3.0/_src/api/api/ranker.md new file mode 100644 index 0000000000..e9de4491b3 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/ranker.md @@ -0,0 +1,154 @@ + + +# Module base + + + +## BaseRanker + +```python +class BaseRanker(BaseComponent) +``` + + + +#### timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +#### eval + +```python +def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict +``` + +Performs evaluation of the Ranker. + +Ranker is evaluated in the same way as a Retriever based on whether it finds the correct document given the query string and at which +position in the ranking of documents the correct document is. + +| Returns a dict containing the following metrics: + + - "recall": Proportion of questions for which correct document is among retrieved documents + - "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank. + Only considers the highest ranked relevant document. + - "map": Mean of average precision for each question. Rewards retrievers that give relevant + documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``, + average precision is normalized by the number of retrieved relevant documents per query. + If ``open_domain=False``, average precision is normalized by the number of all relevant documents + per query. 
+ +**Arguments**: + +- `label_index`: Index/Table in DocumentStore where labeled questions are stored +- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored +- `top_k`: How many documents to return per query +- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is +contained in the retrieved docs (common approach in open-domain QA). +If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids +are within ids explicitly stated in the labels. +- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary +contains the keys "predictions" and "metrics". + + + +# Module sentence\_transformers + + + +## SentenceTransformersRanker + +```python +class SentenceTransformersRanker(BaseRanker) +``` + +Sentence Transformer based pre-trained Cross-Encoder model for Document Re-ranking (https://huggingface.co/cross-encoder). +Re-Ranking can be used on top of a retriever to boost the performance for document search. This is particularly useful if the retriever has a high recall but is bad in sorting the documents by relevance. + +SentenceTransformerRanker handles Cross-Encoder models + - use a single logit as similarity score e.g. cross-encoder/ms-marco-MiniLM-L-12-v2 + - use two output logits (no_answer, has_answer) e.g. deepset/gbert-base-germandpr-reranking +https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers + +| With a SentenceTransformersRanker, you can: + - directly get predictions via predict() + +Usage example: +... 
+retriever = ElasticsearchRetriever(document_store=document_store) +ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2") +p = Pipeline() +p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) +p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None) +``` + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. +'cross-encoder/ms-marco-MiniLM-L-12-v2'. +See https://huggingface.co/cross-encoder for full list of available models +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `top_k`: The maximum number of documents to return +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +The strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). + + + +#### predict\_batch + +```python +def predict_batch(query_doc_list: List[dict], top_k: int = None, batch_size: int = None) +``` + +Use loaded Ranker model to, for a list of queries, rank each query's supplied list of Document. + +Returns list of dictionary of query and list of document sorted by (desc.) 
similarity with query + +**Arguments**: + +- `query_doc_list`: List of dictionaries containing queries with their retrieved documents +- `top_k`: The maximum number of answers to return for each query +- `batch_size`: Number of samples the model receives in one batch for inference + +**Returns**: + +List of dictionaries containing query and ranked list of Document + + + +#### predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document] +``` + +Use loaded ranker model to re-rank the supplied list of Document. + +Returns list of Document sorted by (desc.) similarity with the query. + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document to be re-ranked +- `top_k`: The maximum number of documents to return + +**Returns**: + +List of Document + diff --git a/docs/v1.3.0/_src/api/api/reader.md b/docs/v1.3.0/_src/api/api/reader.md new file mode 100644 index 0000000000..8e280d2609 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/reader.md @@ -0,0 +1,814 @@ + + +# Module base + + + +## BaseReader + +```python +class BaseReader(BaseComponent) +``` + + + +#### run\_batch + +```python +def run_batch(query_doc_list: List[Dict], top_k: Optional[int] = None) +``` + +An unoptimized implementation of running Reader queries in batch + + + +#### timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +# Module farm + + + +## FARMReader + +```python +class FARMReader(BaseReader) +``` + +Transformer based model for extractive Question Answering using the FARM framework (https://github.com/deepset-ai/FARM). +While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same.
+ +| With a FARMReader, you can: + + - directly get predictions via predict() + - fine-tune the model on QA data via train() + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None, **kwargs, ,) +``` + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased', +'deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `context_window_size`: The size, in characters, of the window around the answer span that is used when +displaying the context around the answer. +- `batch_size`: Number of samples the model receives in one batch for inference. +Memory consumption is much lower in inference mode. Recommendation: Increase the batch size +to a value so only a single batch is used. +- `use_gpu`: Whether to use GPU (if available) +- `no_ans_boost`: How much the no_answer logit is boosted/increased. +If set to 0 (default), the no_answer logit is not changed. +If a negative number, there is a lower chance of "no_answer" being predicted. +If a positive number, there is an increased chance of "no_answer" +- `return_no_answer`: Whether to include no_answer predictions in the results. 
+- `top_k`: The maximum number of answers to return +- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text). +Note that this is not the number of "final answers" you will receive +(see `top_k` in FARMReader.predict() or Finder.get_answers() for that) +and that FARM includes no_answer in the sorted list of predictions. +- `top_k_per_sample`: How many answers to extract from each small text passage that the model can process at once +(one "candidate doc" is usually split into many smaller "passages"). +You usually want a very small value here, as it slows down inference +and you don't gain much of quality by having multiple answers from one passage. +Note that this is not the number of "final answers" you will receive +(see `top_k` in FARMReader.predict() or Finder.get_answers() for that) +and that FARM includes no_answer in the sorted list of predictions. +- `num_processes`: The number of processes for `multiprocessing.Pool`. Set to value of 0 to disable +multiprocessing. Set to None to let Inferencer determine optimum number. If you +want to debug the Language Model, you might need to disable multiprocessing! +- `max_seq_len`: Max sequence length of one input text for the model +- `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``) +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered. +The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. +- `use_confidence_scores`: Sets the type of score that is returned with every predicted answer. +`True` => a scaled confidence / relevance score between [0, 1]. 
+This score can also be further calibrated on your dataset via self.eval() +(see https://haystack.deepset.ai/components/reader#confidence-scores) . +`False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit +from the model for the predicted span. +- `proxies`: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'} +- `local_files_only`: Whether to force checking for local files only (and forbid downloads) +- `force_download`: Whether to force a (re-)download even if the model exists locally in the cache. +- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + + +#### train + +```python +def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo")) +``` + +Fine-tune a model on a QA dataset. Options: + +- Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data) +- Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g.
using your labels collected via the haystack annotation tool) + +Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps. +If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint. + +**Arguments**: + +- `data_dir`: Path to directory containing your training data in SQuAD style +- `train_filename`: Filename of training data +- `dev_filename`: Filename of dev / eval data +- `test_filename`: Filename of test data +- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here +that gets split off from training data for eval. +- `use_gpu`: Whether to use GPU (if available) +- `batch_size`: Number of samples the model receives in one batch for training +- `n_epochs`: Number of iterations on the whole training data set +- `learning_rate`: Learning rate of the optimizer +- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down. +- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached. +Until that point LR is increasing linearly. After that it's decreasing again linearly. +Options for different schedules are available in FARM. +- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset +- `save_dir`: Path to store the final model +- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing. +Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set. +Set to None to use all CPU cores minus one. +- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model. +Available options: +None (Don't use AMP) +"O0" (Normal FP32 training) +"O1" (Mixed Precision => Recommended) +"O2" (Almost FP16) +"O3" (Pure FP16). 
+See details on: https://nvidia.github.io/apex/amp.html +- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual +checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created. +- `checkpoint_every`: save a train checkpoint after this many steps of training. +- `checkpoints_to_keep`: maximum number of train checkpoints to save. +- `caching`: whether or not to use caching for preprocessed dataset +- `cache_path`: Path to cache the preprocessed dataset +- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used. + +**Returns**: + +None + + + +#### distil\_prediction\_layer\_from + +```python +def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0) +``` + +Fine-tune a model on a QA dataset using logit-based distillation. You need to provide a teacher model that is already finetuned on the dataset + +and a student model that will be trained using the teacher's logits. The idea of this is to increase the accuracy of a lightweight student model +using a more complex teacher.
+Originally proposed in: https://arxiv.org/pdf/1503.02531.pdf +This can also be considered as the second stage of distillation finetuning as described in the TinyBERT paper: +https://arxiv.org/pdf/1909.10351.pdf +**Example** +```python +student = FARMReader(model_name_or_path="prajjwal1/bert-medium") +teacher = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2") +student.distil_prediction_layer_from(teacher, data_dir="squad2", train_filename="train.json", test_filename="dev.json", + learning_rate=3e-5, distillation_loss_weight=1.0, temperature=5) +``` + +Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps. +If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint. + +**Arguments**: + +- `teacher_model`: Model whose logits will be used to improve accuracy +- `data_dir`: Path to directory containing your training data in SQuAD style +- `train_filename`: Filename of training data +- `dev_filename`: Filename of dev / eval data +- `test_filename`: Filename of test data +- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here +that gets split off from training data for eval. +- `use_gpu`: Whether to use GPU (if available) +- `student_batch_size`: Number of samples the student model receives in one batch for training +- `teacher_batch_size`: Number of samples the teacher model receives in one batch for distillation +- `n_epochs`: Number of iterations on the whole training data set +- `learning_rate`: Learning rate of the optimizer +- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down. +- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached. +Until that point LR is increasing linearly. After that it's decreasing again linearly. +Options for different schedules are available in FARM.
+- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset +- `save_dir`: Path to store the final model +- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing. +Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set. +Set to None to use all CPU cores minus one. +- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model. +Available options: +None (Don't use AMP) +"O0" (Normal FP32 training) +"O1" (Mixed Precision => Recommended) +"O2" (Almost FP16) +"O3" (Pure FP16). +See details on: https://nvidia.github.io/apex/amp.html +- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual +checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created. +- `checkpoint_every`: save a train checkpoint after this many steps of training. +- `checkpoints_to_keep`: maximum number of train checkpoints to save. +- `caching`: whether or not to use caching for preprocessed dataset and teacher logits +- `cache_path`: Path to cache the preprocessed dataset and teacher logits +- `distillation_loss_weight`: The weight of the distillation loss. A higher weight means the teacher outputs are more important. +- `distillation_loss`: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits) +- `temperature`: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model. +- `tinybert_loss`: Whether to use the TinyBERT loss function for distillation. 
This requires the student to be a TinyBERT model and the teacher to be a finetuned version of bert-base-uncased. +- `tinybert_epochs`: Number of epochs to train the student model with the TinyBERT loss function. After this many epochs, the student model is trained with the regular distillation loss function. +- `tinybert_learning_rate`: Learning rate to use when training the student model with the TinyBERT loss function. +- `tinybert_train_filename`: Filename of training data to use when training the student model with the TinyBERT loss function. To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script. If not specified, the training data from the original training is used. +- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used. + +**Returns**: + +None + + + +#### distil\_intermediate\_layers\_from + +```python +def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None) +``` + +The first stage of distillation finetuning as described in the TinyBERT paper: + +https://arxiv.org/pdf/1909.10351.pdf +**Example** +```python +student = FARMReader(model_name_or_path="prajjwal1/bert-medium") 
+teacher = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D") +student.distil_intermediate_layers_from(teacher, data_dir="squad2", train_filename="train.json", test_filename="dev.json", + learning_rate=3e-5, distillation_loss_weight=1.0, temperature=5) +``` + +Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps. +If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint. + +**Arguments**: + +- `teacher_model`: Model whose logits will be used to improve accuracy +- `data_dir`: Path to directory containing your training data in SQuAD style +- `train_filename`: Filename of training data. To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script +- `dev_filename`: Filename of dev / eval data +- `test_filename`: Filename of test data +- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here +that gets split off from training data for eval. +- `use_gpu`: Whether to use GPU (if available) +- `student_batch_size`: Number of samples the student model receives in one batch for training +- `student_batch_size`: Number of samples the teacher model receives in one batch for distillation +- `n_epochs`: Number of iterations on the whole training data set +- `learning_rate`: Learning rate of the optimizer +- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down. +- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached. +Until that point LR is increasing linearly. After that it's decreasing again linearly. +Options for different schedules are available in FARM. +- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset +- `save_dir`: Path to store the final model +- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing. 
+Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set. +Set to None to use all CPU cores minus one. +- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model. +Available options: +None (Don't use AMP) +"O0" (Normal FP32 training) +"O1" (Mixed Precision => Recommended) +"O2" (Almost FP16) +"O3" (Pure FP16). +See details on: https://nvidia.github.io/apex/amp.html +- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual +checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created. +- `checkpoint_every`: save a train checkpoint after this many steps of training. +- `checkpoints_to_keep`: maximum number of train checkpoints to save. +- `caching`: whether or not to use caching for preprocessed dataset and teacher logits +- `cache_path`: Path to cache the preprocessed dataset and teacher logits +- `distillation_loss_weight`: The weight of the distillation loss. A higher weight means the teacher outputs are more important. +- `distillation_loss`: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits) +- `temperature`: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model. +- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used. 
+ +**Returns**: + +None + + + +#### update\_parameters + +```python +def update_parameters(context_window_size: Optional[int] = None, no_ans_boost: Optional[float] = None, return_no_answer: Optional[bool] = None, max_seq_len: Optional[int] = None, doc_stride: Optional[int] = None) +``` + +Hot update parameters of a loaded Reader. It may not be safe when processing concurrent requests. + + + +#### save + +```python +def save(directory: Path) +``` + +Saves the Reader model so that it can be reused at a later point in time. + +**Arguments**: + +- `directory`: Directory where the Reader model should be saved + + + +#### predict\_batch + +```python +def predict_batch(query_doc_list: List[dict], top_k: int = None, batch_size: int = None) +``` + +Use loaded QA model to find answers for a list of queries in each query's supplied list of Document. + +Returns list of dictionaries containing answers sorted by (desc.) score + +**Arguments**: + +- `query_doc_list`: List of dictionaries containing queries with their retrieved documents +- `top_k`: The maximum number of answers to return for each query +- `batch_size`: Number of samples the model receives in one batch for inference + +**Returns**: + +List of dictionaries containing query and answers + + + +#### predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) +``` + +Use loaded QA model to find answers for a query in the supplied list of Document. + +Returns dictionaries containing answers sorted by (desc.) score. +Example: + ```python + |{ + | 'query': 'Who is the father of Arya Stark?', + | 'answers':[Answer( + | 'answer': 'Eddard,', + | 'context': "She travels with her father, Eddard, to King's Landing when he is", + | 'score': 0.9787139466668613, + | 'offsets_in_context': [Span(start=29, end=35)], + | 'offsets_in_document': [Span(start=347, end=353)], + | 'document_id': '88d1ed769d003939d3a0d28034464ab2' + | ),...
+ | ] + |} + ``` + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + + + +#### eval\_on\_file + +```python +def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None) +``` + +Performs evaluation on a SQuAD-formatted file. + +Returns a dict containing the following metrics: + - "EM": exact match score + - "f1": F1-Score + - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer + +**Arguments**: + +- `data_dir`: The directory in which the test set can be found +- `test_filename`: The name of the file containing the test data in SQuAD format. +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. + + + +#### eval + +```python +def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False) +``` + +Performs evaluation on evaluation documents in the DocumentStore. + +Returns a dict containing the following metrics: + - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers + - "f1": Average overlap between predicted answers and their corresponding correct answers + - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer + +**Arguments**: + +- `document_store`: DocumentStore containing the evaluation documents +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. 
+- `label_index`: Index/Table name where labeled questions are stored +- `doc_index`: Index/Table name where documents that are used for evaluation are stored +- `label_origin`: Field name where the gold labels are stored +- `calibrate_conf_scores`: Whether to calibrate the temperature for temperature scaling of the confidence scores + + + +#### calibrate\_confidence\_scores + +```python +def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label") +``` + +Calibrates confidence scores on evaluation documents in the DocumentStore. + +**Arguments**: + +- `document_store`: DocumentStore containing the evaluation documents +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. +- `label_index`: Index/Table name where labeled questions are stored +- `doc_index`: Index/Table name where documents that are used for evaluation are stored +- `label_origin`: Field name where the gold labels are stored + + + +#### predict\_on\_texts + +```python +def predict_on_texts(question: str, texts: List[str], top_k: Optional[int] = None) +``` + +Use loaded QA model to find answers for a question in the supplied list of Document. + +Returns dictionaries containing answers sorted by (desc.) score. +Example: + ```python + |{ + | 'question': 'Who is the father of Arya Stark?', + | 'answers':[ + | {'answer': 'Eddard,', + | 'context': " She travels with her father, Eddard, to King's Landing when he is ", + | 'offset_answer_start': 147, + | 'offset_answer_end': 154, + | 'score': 0.9787139466668613, + | 'document_id': '1337' + | },... 
+ | ] + |} + ``` + +**Arguments**: + +- `question`: Question string +- `texts`: List of documents as string type +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing question and answers + + + +#### convert\_to\_onnx + +```python +@classmethod +def convert_to_onnx(cls, model_name: str, output_path: Path, convert_to_float16: bool = False, quantize: bool = False, task_type: str = "question_answering", opset_version: int = 11) +``` + +Convert a PyTorch BERT model to ONNX format and write to ./onnx-export dir. The converted ONNX model + +can be loaded in the `FARMReader` using the export path as `model_name_or_path` param. + +Usage: + + `from haystack.reader.farm import FARMReader + from pathlib import Path + onnx_model_path = Path("roberta-onnx-model") + FARMReader.convert_to_onnx(model_name="deepset/bert-base-cased-squad2", output_path=onnx_model_path) + reader = FARMReader(onnx_model_path)` + +**Arguments**: + +- `model_name`: transformers model name +- `output_path`: Path to output the converted model +- `convert_to_float16`: Many models use float32 precision by default. With the half precision of float16, +inference is faster on Nvidia GPUs with Tensor core like T4 or V100. On older GPUs, +float32 could still be more performant. +- `quantize`: convert floating point numbers to integers +- `task_type`: Type of task for the model. Available options: "question_answering" or "embeddings". +- `opset_version`: ONNX opset version + + + +# Module transformers + + + +## TransformersReader + +```python +class TransformersReader(BaseReader) +``` + +Transformer based model for extractive Question Answering using the HuggingFace's transformers framework +(https://github.com/huggingface/transformers). +While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
+With this reader, you can directly get predictions via predict() + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 4, return_no_answers: bool = True, max_seq_len: int = 256, doc_stride: int = 128) +``` + +Load a QA model from Transformers. + +Available models include: + +- ``'distilbert-base-uncased-distilled-squad'`` +- ``'bert-large-cased-whole-word-masking-finetuned-squad'`` +- ``'bert-large-uncased-whole-word-masking-finetuned-squad'`` + +See https://huggingface.co/models for full list of available QA models + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased', +'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `context_window_size`: Num of chars (before and after the answer) to return as "context" for each answer. +The context usually helps users to understand if the answer really makes sense. +- `use_gpu`: Whether to use GPU (if available). +- `top_k`: The maximum number of answers to return +- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text). +Note that this is not the number of "final answers" you will receive +(see `top_k` in TransformersReader.predict() or Finder.get_answers() for that) +and that no_answer can be included in the sorted list of predictions.
+- `return_no_answers`: If True, the HuggingFace Transformers model could return a "no_answer" (i.e. when there is an unanswerable question) +If False, it cannot return a "no_answer". Note that `no_answer_boost` is unfortunately not available with TransformersReader. +If you would like to set no_answer_boost, use a `FARMReader`. +- `max_seq_len`: max sequence length of one input text for the model +- `doc_stride`: length of striding window for splitting long texts (used if len(text) > max_seq_len) + + + +#### predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) +``` + +Use loaded QA model to find answers for a query in the supplied list of Document. + +Returns dictionaries containing answers sorted by (desc.) score. +Example: + + ```python + |{ + | 'query': 'Who is the father of Arya Stark?', + | 'answers':[ + | {'answer': 'Eddard,', + | 'context': " She travels with her father, Eddard, to King's Landing when he is ", + | 'offset_answer_start': 147, + | 'offset_answer_end': 154, + | 'score': 0.9787139466668613, + | 'document_id': '1337' + | },... + | ] + |} + ``` + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + + + +# Module table + + + +## TableReader + +```python +class TableReader(BaseReader) +``` + +Transformer-based model for extractive Question Answering on Tables with TaPas +using the HuggingFace's transformers framework (https://github.com/huggingface/transformers). 
+With this reader, you can directly get predictions via predict() + +**Example**: + +```python +from haystack import Document +from haystack.reader import TableReader +import pandas as pd + +table_reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq") +data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["57", "46", "60"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], +} +table = pd.DataFrame(data) +document = Document(content=table, content_type="table") +query = "When was DiCaprio born?" +prediction = table_reader.predict(query=query, documents=[document]) +answer = prediction["answers"][0].answer # "10 june 1996" +``` + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256) +``` + +Load a TableQA model from Transformers. + +Available models include: + +- ``'google/tapas-base-finetuned-wtq'`` +- ``'google/tapas-base-finetuned-wikisql-supervised'`` +- ``'deepset/tapas-large-nq-hn-reader'`` +- ``'deepset/tapas-large-nq-reader'`` + +See https://huggingface.co/models?pipeline_tag=table-question-answering +for full list of available TableQA models. + +The nq-reader models are able to provide confidence scores, but cannot handle questions that need aggregation +over multiple cells. The returned answers are sorted first by a general table score and then by answer span +scores. +All the other models can handle aggregation questions, but don't provide reasonable confidence scores. + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model, e.g. 'google/tapas-base-finetuned-wtq'. +See https://huggingface.co/models?pipeline_tag=table-question-answering for full list of available models.
+- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, +or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `use_gpu`: Whether to use GPU or CPU. Falls back on CPU if no GPU is available. +- `top_k`: The maximum number of answers to return +- `top_k_per_candidate`: How many answers to extract for each candidate table that is coming from +the retriever. +- `return_no_answer`: Whether to include no_answer predictions in the results. +(Only applicable with nq-reader models.) +- `max_seq_len`: Max sequence length of one input table for the model. If the number of tokens of +query + table exceed max_seq_len, the table will be truncated by removing rows until the +input size fits the model. + + + +#### predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Use loaded TableQA model to find answers for a query in the supplied list of Documents + +of content_type ``'table'``. + +Returns dictionary containing query and list of Answer objects sorted by (desc.) score. +WARNING: The answer scores are not reliable, as they are always extremely high, even if + a question cannot be answered by a given table. + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer. Documents should be +of content_type ``'table'``. +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + + + +## RCIReader + +```python +class RCIReader(BaseReader) +``` + +Table Reader model based on Glass et al. (2021)'s Row-Column-Intersection model. +See the original paper for more details: +Glass, Michael, et al. (2021): "Capturing Row and Column Semantics in Transformer Based Question Answering over Tables" +(https://aclanthology.org/2021.naacl-main.96/) + +Each row and each column is given a score with regard to the query by two separate models. 
The score of each cell +is then calculated as the sum of the corresponding row score and column score. Accordingly, the predicted answer is +the cell with the highest score. + +Pros and Cons of RCIReader compared to TableReader: ++ Provides meaningful confidence scores ++ Allows larger tables as input +- Does not support aggregation over table cells +- Slower + + + +#### \_\_init\_\_ + +```python +def __init__(row_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-row", column_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-col", row_model_version: Optional[str] = None, column_model_version: Optional[str] = None, row_tokenizer: Optional[str] = None, column_tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, max_seq_len: int = 256) +``` + +Load an RCI model from Transformers. + +Available models include: + +- ``'michaelrglass/albert-base-rci-wikisql-row'`` + ``'michaelrglass/albert-base-rci-wikisql-col'`` +- ``'michaelrglass/albert-base-rci-wtq-row'`` + ``'michaelrglass/albert-base-rci-wtq-col'`` + +**Arguments**: + +- `row_model_name_or_path`: Directory of a saved row scoring model or the name of a public model +- `column_model_name_or_path`: Directory of a saved column scoring model or the name of a public model +- `row_model_version`: The version of row model to use from the HuggingFace model hub. +Can be tag name, branch name, or commit hash. +- `column_model_version`: The version of column model to use from the HuggingFace model hub. +Can be tag name, branch name, or commit hash. +- `row_tokenizer`: Name of the tokenizer for the row model (usually the same as model) +- `column_tokenizer`: Name of the tokenizer for the column model (usually the same as model) +- `use_gpu`: Whether to use GPU or CPU. Falls back on CPU if no GPU is available. +- `top_k`: The maximum number of answers to return +- `max_seq_len`: Max sequence length of one input table for the model. 
If the number of tokens of +query + table exceeds max_seq_len, the table will be truncated by removing rows until the +input size fits the model. + + + +#### predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Use loaded RCI models to find answers for a query in the supplied list of Documents + +of content_type ``'table'``. + +Returns dictionary containing query and list of Answer objects sorted by (desc.) score. +The existing RCI models on the HF model hub don't allow aggregation, therefore, the answer will always be +composed of a single cell. + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer. Documents should be +of content_type ``'table'``. +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + diff --git a/docs/v1.3.0/_src/api/api/retriever.md b/docs/v1.3.0/_src/api/api/retriever.md new file mode 100644 index 0000000000..90e7db44fd --- /dev/null +++ b/docs/v1.3.0/_src/api/api/retriever.md @@ -0,0 +1,854 @@ + + +# Module base + + + +## BaseGraphRetriever + +```python +class BaseGraphRetriever(BaseComponent) +``` + +Base class for knowledge graph retrievers. + + + +## BaseRetriever + +```python +class BaseRetriever(BaseComponent) +``` + +Base class for regular retrievers. + + + +#### retrieve + +```python +@abstractmethod +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number of documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query.
+- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +#### eval + +```python +def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False, headers: Optional[Dict[str, str]] = None) -> dict +``` + +Performs evaluation on the Retriever. + +Retriever is evaluated based on whether it finds the correct document given the query string and at which +position in the ranking of documents the correct document is. + +| Returns a dict containing the following metrics: + + - "recall": Proportion of questions for which correct document is among retrieved documents + - "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank. + Only considers the highest ranked relevant document. + - "map": Mean of average precision for each question. Rewards retrievers that give relevant + documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``, + average precision is normalized by the number of retrieved relevant documents per query. + If ``open_domain=False``, average precision is normalized by the number of all relevant documents + per query. + +**Arguments**: + +- `label_index`: Index/Table in DocumentStore where labeled questions are stored +- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored +- `top_k`: How many documents to return per query +- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is +contained in the retrieved docs (common approach in open-domain QA). 
+If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids +are within ids explicitly stated in the labels. +- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary +contains the keys "predictions" and "metrics". +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +# Module sparse + + + +## ElasticsearchRetriever + +```python +class ElasticsearchRetriever(BaseRetriever) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(document_store: KeywordDocumentStore, top_k: int = 10, custom_query: Optional[str] = None) +``` + +**Arguments**: + +- `document_store`: an instance of an ElasticsearchDocumentStore to retrieve documents from. +- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query). + Optionally, ES `filter` clause can be added where the values of `terms` are placeholders + that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) + names must match with the filters dict supplied in self.retrieve(). + :: + + **An example custom_query:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | "filter": [ // optional custom filters + | {"terms": {"year": ${years}}}, + | {"terms": {"quarter": ${quarters}}}, + | {"range": {"date": {"gte": ${date}}}} + | ], + | } + | }, + | } + ``` + + **For this custom_query, a sample retrieve() could be:** + ```python +| self.retrieve(query="Why did the revenue increase?", +| filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) +``` + + Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. 
+ See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. + You will find the highlighted output in the returned Document's meta field by key "highlighted". + :: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` +- `top_k`: How many documents to return per query. + + + +#### retrieve + +```python +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. 
+ + + +## ElasticsearchFilterOnlyRetriever + +```python +class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever) +``` + +Naive "Retriever" that returns all documents that match the given filters. No impact of query at all. +Helpful for benchmarking, testing and if you want to do QA on small documents without an "active" retriever. + + + +#### retrieve + +```python +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +## TfidfRetriever + +```python +class TfidfRetriever(BaseRetriever) +``` + +Read all documents from a SQL backend. + +Split documents into smaller units (eg, paragraphs or pages) to reduce the +computations when text is passed on to a Reader for QA. + +It uses sklearn's TfidfVectorizer to compute a tf-idf matrix. + + + +#### \_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True) +``` + +**Arguments**: + +- `document_store`: an instance of a DocumentStore to retrieve documents from. +- `top_k`: How many documents to return per query. 
+- `auto_fit`: Whether to automatically update tf-idf matrix by calling fit() after new documents have been added + + + +#### retrieve + +```python +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents + + + +#### fit + +```python +def fit() +``` + +Performing training on this class according to the TF-IDF algorithm. + + + +# Module dense + + + +## DensePassageRetriever + +```python +class DensePassageRetriever(BaseRetriever) +``` + +Retriever that uses a bi-encoder (one transformer for query, one transformer for passage). +See the original paper for more details: +Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Question Answering." +(https://arxiv.org/abs/2004.04906). 
+ + + +#### \_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) +``` + +Init the Retriever incl. the two encoder models from a local or remote model checkpoint. + +The checkpoint format matches huggingface transformers' model format + +**Example:** + + ```python + | # remote model from FAIR + | DensePassageRetriever(document_store=your_doc_store, + | query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + | passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base") + | # or from local path + | DensePassageRetriever(document_store=your_doc_store, + | query_embedding_model="model_directory/question-encoder", + | passage_embedding_model="model_directory/context-encoder") + ``` + +**Arguments**: + +- `document_store`: An instance of DocumentStore from which to retrieve documents. +- `query_embedding_model`: Local path or remote name of question encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models +Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"`` +- `passage_embedding_model`: Local path or remote name of passage encoder checkpoint. 
The format equals the +one used by hugging-face transformers' modelhub models +Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"`` +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `max_seq_len_query`: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down. +- `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down. +- `top_k`: How many documents to return per query. +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of questions or passages to encode at once. In case of multiple GPUs, this will be the total batch size. +- `embed_title`: Whether to concatenate title and passage to a text pair that is then used to create the embedding. +This is the approach used in the original paper and is likely to improve performance if your +titles contain meaningful information for retrieval (topic, entities etc.). +The title is expected to be present in doc.meta["name"] and can be supplied in the documents +before writing them to the DocumentStore like this: +{"text": "my text", "meta": {"name": "my title"}}. +- `use_fast_tokenizers`: Whether to use fast Rust tokenizers +- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name. +If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. +- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training. +Options: `dot_product` (Default) or `cosine` +- `global_loss_buffer_size`: Buffer size for all_gather() in DDP. +Increase if errors like "encoded data exceeds max_size ..." come up +- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training +will only use the first device provided in this list. +- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + + +#### retrieve + +```python +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. 
+- `index`: The name of the index in the DocumentStore from which to retrieve documents + + + +#### embed\_queries + +```python +def embed_queries(texts: List[str]) -> List[np.ndarray] +``` + +Create embeddings for a list of queries using the query encoder + +**Arguments**: + +- `texts`: Queries to embed + +**Returns**: + +Embeddings, one per input query + + + +#### embed\_documents + +```python +def embed_documents(docs: List[Document]) -> List[np.ndarray] +``` + +Create embeddings for a list of documents using the passage encoder + +**Arguments**: + +- `docs`: List of Document objects used to represent documents / passages in a standardized way within Haystack. + +**Returns**: + +Embeddings of documents / passages. Shape: (batch_size, embedding_dim) + + + +#### train + +```python +def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, multiprocessing_strategy: Optional[str] = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder") +``` + +Train a DensePassageRetrieval model + +**Arguments**: + +- `data_dir`: Directory where training file, dev file and test file are present +- `train_filename`: training filename +- `dev_filename`: development set filename, file to be used by model in eval step of training +- `test_filename`: test set filename, file to be used by model in test step after training +- `max_samples`: maximum number of input samples to convert. 
Can be used for debugging a smaller dataset. +- `max_processes`: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo. +It can be set to 1 to disable the use of multiprocessing or make debugging easier. +- `multiprocessing_strategy`: Set the multiprocessing sharing strategy, this can be one of file_descriptor/file_system depending on your OS. +If your system has low limits for the number of open file descriptors, and you can’t raise them, +you should use the file_system strategy. +- `dev_split`: The proportion of the train set that will be sliced. Only works if dev_filename is set to None +- `batch_size`: total number of samples in 1 batch of data +- `embed_title`: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage +- `num_hard_negatives`: number of hard negative passages (passages which are very similar (high score by BM25) to query but do not contain the answer) +- `num_positives`: number of positive passages +- `n_epochs`: number of epochs to train the model on +- `evaluate_every`: number of training steps after which evaluation is run +- `n_gpu`: number of gpus to train on +- `learning_rate`: learning rate of optimizer +- `epsilon`: epsilon parameter of optimizer +- `weight_decay`: weight decay parameter of optimizer +- `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done +- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are: +"O0" (FP32) +"O1" (Mixed Precision) +"O2" (Almost FP16) +"O3" (Pure FP16). 
+For more information, refer to: https://nvidia.github.io/apex/amp.html +- `optimizer_name`: what optimizer to use (default: AdamW) +- `num_warmup_steps`: number of warmup steps +- `optimizer_correct_bias`: Whether to correct bias in optimizer +- `save_dir`: directory where models are saved +- `query_encoder_save_dir`: directory inside save_dir where query_encoder model files are saved +- `passage_encoder_save_dir`: directory inside save_dir where passage_encoder model files are saved + + + +#### save + +```python +def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") +``` + +Save DensePassageRetriever to the specified directory. + +**Arguments**: + +- `save_dir`: Directory to save to. +- `query_encoder_dir`: Directory in save_dir that contains query encoder model. +- `passage_encoder_dir`: Directory in save_dir that contains passage encoder model. + +**Returns**: + +None + + + +#### load + +```python +@classmethod +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", infer_tokenizer_classes: bool = False) +``` + +Load DensePassageRetriever from the specified directory. + + + +## TableTextRetriever + +```python +class TableTextRetriever(BaseRetriever) +``` + +Retriever that uses a tri-encoder to jointly retrieve among a database consisting of text passages and tables +(one transformer for query, one transformer for text passages, one transformer for tables). +See the original paper for more details: +Kostić, Bogdan, et al. 
(2021): "Multi-modal Retrieval of Tables and Texts Using Tri-encoder Models" +(https://arxiv.org/abs/2108.04049). + + + +#### \_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) +``` + +Init the Retriever incl. the three encoder models from a local or remote model checkpoint. + +The checkpoint format matches huggingface transformers' model format + +**Arguments**: + +- `document_store`: An instance of DocumentStore from which to retrieve documents. +- `query_embedding_model`: Local path or remote name of question encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models. +- `passage_embedding_model`: Local path or remote name of passage encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models. +- `table_embedding_model`: Local path or remote name of table encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `max_seq_len_query`: Longest length of each query sequence. 
Maximum number of tokens for the query text. Longer ones will be cut down." +- `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down." +- `top_k`: How many documents to return per query. +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of questions or passages to encode at once. In case of multiple gpus, this will be the total batch size. +- `embed_meta_fields`: Concatenate the provided meta fields and text passage / table to a text pair that is +then used to create the embedding. +This is the approach used in the original paper and is likely to improve +performance if your titles contain meaningful information for retrieval +(topic, entities etc.). +- `use_fast_tokenizers`: Whether to use fast Rust tokenizers +- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name. +If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. +- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training. +Options: `dot_product` (Default) or `cosine` +- `global_loss_buffer_size`: Buffer size for all_gather() in DDP. +Increase if errors like "encoded data exceeds max_size ..." come up +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). 
Note: as multi-GPU training is currently not implemented for TableTextRetriever, +training will only use the first device provided in this list. +- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + + +#### embed\_queries + +```python +def embed_queries(texts: List[str]) -> List[np.ndarray] +``` + +Create embeddings for a list of queries using the query encoder + +**Arguments**: + +- `texts`: Queries to embed + +**Returns**: + +Embeddings, one per input queries + + + +#### embed\_documents + +```python +def embed_documents(docs: List[Document]) -> List[np.ndarray] +``` + +Create embeddings for a list of text documents and / or tables using the text passage encoder and + +the table encoder. + +**Arguments**: + +- `docs`: List of Document objects used to represent documents / passages in +a standardized way within Haystack. + +**Returns**: + +Embeddings of documents / passages. 
Shape: (batch_size, embedding_dim) + + + +#### train + +```python +def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/mm_retrieval", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", table_encoder_save_dir: str = "table_encoder") +``` + +Train a TableTextRetrieval model. + +**Arguments**: + +- `data_dir`: Directory where training file, dev file and test file are present. +- `train_filename`: Training filename. +- `dev_filename`: Development set filename, file to be used by model in eval step of training. +- `test_filename`: Test set filename, file to be used by model in test step after training. +- `max_samples`: Maximum number of input samples to convert. Can be used for debugging a smaller dataset. +- `max_processes`: The maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo. +It can be set to 1 to disable the use of multiprocessing or make debugging easier. +- `dev_split`: The proportion of the train set that will be sliced. Only works if dev_filename is set to None. +- `batch_size`: Total number of samples in 1 batch of data. +- `embed_meta_fields`: Concatenate meta fields with each passage and table. +The default setting in official MMRetrieval embeds page title, +section title and caption with the corresponding table and title with +corresponding text passage. 
+- `num_hard_negatives`: Number of hard negative passages (passages which are +very similar (high score by BM25) to query but do not contain the answer). +- `num_positives`: Number of positive passages. +- `n_epochs`: Number of epochs to train the model on. +- `evaluate_every`: Number of training steps after which evaluation is run. +- `n_gpu`: Number of gpus to train on. +- `learning_rate`: Learning rate of optimizer. +- `epsilon`: Epsilon parameter of optimizer. +- `weight_decay`: Weight decay parameter of optimizer. +- `grad_acc_steps`: Number of steps to accumulate gradient over before back-propagation is done. +- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are: +"O0" (FP32) +"O1" (Mixed Precision) +"O2" (Almost FP16) +"O3" (Pure FP16). +For more information, refer to: https://nvidia.github.io/apex/amp.html +- `optimizer_name`: What optimizer to use (default: AdamW). +- `num_warmup_steps`: Number of warmup steps. +- `optimizer_correct_bias`: Whether to correct bias in optimizer. +- `save_dir`: Directory where models are saved. +- `query_encoder_save_dir`: Directory inside save_dir where query_encoder model files are saved. +- `passage_encoder_save_dir`: Directory inside save_dir where passage_encoder model files are saved. +- `table_encoder_save_dir`: Directory inside save_dir where table_encoder model files are saved. + + + +#### save + +```python +def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") +``` + +Save TableTextRetriever to the specified directory. + +**Arguments**: + +- `save_dir`: Directory to save to. +- `query_encoder_dir`: Directory in save_dir that contains query encoder model. +- `passage_encoder_dir`: Directory in save_dir that contains passage encoder model. +- `table_encoder_dir`: Directory in save_dir that contains table encoder model. 
+ +**Returns**: + +None + + + +#### load + +```python +@classmethod +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder", infer_tokenizer_classes: bool = False) +``` + +Load TableTextRetriever from the specified directory. + + + +## EmbeddingRetriever + +```python +class EmbeddingRetriever(BaseRetriever) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None) +``` + +**Arguments**: + +- `document_store`: An instance of DocumentStore from which to retrieve documents. +- `embedding_model`: Local path or name of model in Hugging Face's model hub such as ``'sentence-transformers/all-MiniLM-L6-v2'`` +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of documents to encode at once. +- `max_seq_len`: Longest length of each document sequence. Maximum number of tokens for the document text. Longer ones will be cut down. +- `model_format`: Name of framework that was used for saving the model. 
Options: +- ``'farm'`` +- ``'transformers'`` +- ``'sentence_transformers'`` +- `pooling_strategy`: Strategy for combining the embeddings from the model (for farm / transformers models only). +Options: + +- ``'cls_token'`` (sentence vector) +- ``'reduce_mean'`` (sentence vector) +- ``'reduce_max'`` (sentence vector) +- ``'per_token'`` (individual token vectors) +- `emb_extraction_layer`: Number of layer from which the embeddings shall be extracted (for farm / transformers models only). +Default: -1 (very last layer). +- `top_k`: How many documents to return per query. +- `progress_bar`: If true displays progress bar during embedding. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, +training will only use the first device provided in this list. +- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + + +#### retrieve + +```python +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. 
+ +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents + + + +#### embed\_queries + +```python +def embed_queries(texts: List[str]) -> List[np.ndarray] +``` + +Create embeddings for a list of queries. + +**Arguments**: + +- `texts`: Queries to embed + +**Returns**: + +Embeddings, one per input queries + + + +#### embed\_documents + +```python +def embed_documents(docs: List[Document]) -> List[np.ndarray] +``` + +Create embeddings for a list of documents. + +**Arguments**: + +- `docs`: List of documents to embed + +**Returns**: + +Embeddings, one per input document + + + +# Module text2sparql + + + +## Text2SparqlRetriever + +```python +class Text2SparqlRetriever(BaseGraphRetriever) +``` + +Graph retriever that uses a pre-trained Bart model to translate natural language questions +given in text form to queries in SPARQL format. +The generated SPARQL query is executed on a knowledge graph. + + + +#### \_\_init\_\_ + +```python +def __init__(knowledge_graph, model_name_or_path, top_k: int = 1) +``` + +Init the Retriever by providing a knowledge graph and a pre-trained BART model + +**Arguments**: + +- `knowledge_graph`: An instance of BaseKnowledgeGraph on which to execute SPARQL queries. +- `model_name_or_path`: Name of or path to a pre-trained BartForConditionalGeneration model. +- `top_k`: How many SPARQL queries to generate per text query. + + + +#### retrieve + +```python +def retrieve(query: str, top_k: Optional[int] = None) +``` + +Translate a text query to SPARQL and execute it on the knowledge graph to retrieve a list of answers + +**Arguments**: + +- `query`: Text query that shall be translated to SPARQL and then executed on the knowledge graph +- `top_k`: How many SPARQL queries to generate per text query. 
+ + + +#### format\_result + +```python +def format_result(result) +``` + +Generate formatted dictionary output with text answer and additional info + +**Arguments**: + +- `result`: The result of a SPARQL query as retrieved from the knowledge graph + diff --git a/docs/v1.3.0/_src/api/api/summarizer.md b/docs/v1.3.0/_src/api/api/summarizer.md new file mode 100644 index 0000000000..9116984371 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/summarizer.md @@ -0,0 +1,141 @@ + + +# Module base + + + +## BaseSummarizer + +```python +class BaseSummarizer(BaseComponent) +``` + +Abstract class for Summarizer + + + +#### predict + +```python +@abstractmethod +def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document] +``` + +Abstract method for creating a summary. + +**Arguments**: + +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document. +If set to "True", all docs will be joined to a single string that will then +be summarized. +Important: The summary will depend on the order of the supplied documents! + +**Returns**: + +List of Documents, where Document.text contains the summarization and Document.meta["context"] +the original, not summarized text + + + +# Module transformers + + + +## TransformersSummarizer + +```python +class TransformersSummarizer(BaseSummarizer) +``` + +Transformer based model to summarize the documents using the HuggingFace's transformers framework + +You can use any model that has been fine-tuned on a summarization task. For example: +'`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. +See the up-to-date list of available models on +`huggingface.co/models `__ + +**Example** + +```python +| docs = [Document(text="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions." 
+| "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by" +| "the shutoffs which were expected to last through at least midday tomorrow.")] +| +| # Summarize +| summary = summarizer.predict( +| documents=docs, +| generate_single_summary=True +| ) +| +| # Show results (List of Documents, containing summary and original text) +| print(summary) +| +| [ +| { +| "text": "California's largest electricity provider has turned off power to hundreds of thousands of customers.", +| ... +| "meta": { +| "context": "PGE stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. ..." +| }, +| ... +| }, +``` + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False) +``` + +Load a Summarization model from Transformers. + +See the up-to-date list of available models at +https://huggingface.co/models?filter=summarization + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. +'facebook/rag-token-nq', 'facebook/rag-sequence-nq'. +See https://huggingface.co/models?filter=summarization for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `max_length`: Maximum length of summarized text +- `min_length`: Minimum length of summarized text +- `use_gpu`: Whether to use GPU (if available). 
+- `clean_up_tokenization_spaces`: Whether or not to clean up the potential extra spaces in the text output +- `separator_for_single_summary`: If `generate_single_summary=True` in `predict()`, we need to join all docs +into a single text. This separator appears between those subsequent docs. +- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document. +If set to "True", all docs will be joined to a single string that will then +be summarized. +Important: The summary will depend on the order of the supplied documents! + + + +#### predict + +```python +def predict(documents: List[Document], generate_single_summary: Optional[bool] = None, truncation: bool = True) -> List[Document] +``` + +Produce the summarization from the supplied documents. + +These documents can, for example, be retrieved via the Retriever. + +**Arguments**: + +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document. +If set to "True", all docs will be joined to a single string that will then +be summarized. +Important: The summary will depend on the order of the supplied documents! +- `truncation`: Truncate to a maximum length accepted by the model + +**Returns**: + +List of Documents, where Document.text contains the summarization and Document.meta["context"] +the original, not summarized text + diff --git a/docs/v1.3.0/_src/api/api/translator.md b/docs/v1.3.0/_src/api/api/translator.md new file mode 100644 index 0000000000..8b6e3ceeb3 --- /dev/null +++ b/docs/v1.3.0/_src/api/api/translator.md @@ -0,0 +1,113 @@ + + +# Module base + + + +## BaseTranslator + +```python +class BaseTranslator(BaseComponent) +``` + +Abstract class for a Translator component that translates either a query or a doc from language A to language B. 
+ + + +#### translate + +```python +@abstractmethod +def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]] +``` + +Translate the passed query or a list of documents from language A to B. + + + +#### run + +```python +def run(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) +``` + +Method that gets executed when this class is used as a Node in a Haystack Pipeline + + + +# Module transformers + + + +## TransformersTranslator + +```python +class TransformersTranslator(BaseTranslator) +``` + +Translator component based on Seq2Seq models from Huggingface's transformers library. +Exemplary use cases: +- Translate a query from Language A to B (e.g. if you only have good models + documents in language B) +- Translate a document from Language A to B (e.g. 
if you want to return results in the native language of the user) + +We currently recommend using OPUS models (see __init__() for details) + +**Example:** + +```python +| DOCS = [ +| Document(content="Heinz von Foerster was an Austrian American scientist combining physics and philosophy, +| and widely attributed as the originator of Second-order cybernetics.") +| ] +| translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") +| res = translator.translate(documents=DOCS, query=None) +``` + + + +#### \_\_init\_\_ + +```python +def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True) +``` + +Initialize the translator with a model that fits your targeted languages. While we support all seq2seq + +models from Hugging Face's model hub, we recommend using the OPUS models from Helsinki-NLP. They provide plenty +of different models, usually one model per language pair and translation direction. +They have a pretty standardized naming that should help you find the right model: +- "Helsinki-NLP/opus-mt-en-de" => translating from English to German +- "Helsinki-NLP/opus-mt-de-en" => translating from German to English +- "Helsinki-NLP/opus-mt-fr-en" => translating from French to English +- "Helsinki-NLP/opus-mt-hi-en" => translating from Hindi to English +... + +They also have a few multilingual models that support multiple languages at once. + +**Arguments**: + +- `model_name_or_path`: Name of the seq2seq model that shall be used for translation. +Can be a remote name from Huggingface's modelhub or a local path. +- `tokenizer_name`: Optional tokenizer name. If not supplied, `model_name_or_path` will also be used for the +tokenizer. +- `max_seq_len`: The maximum sentence length the model accepts. (Optional) +- `clean_up_tokenization_spaces`: Whether or not to clean up the tokenization spaces. 
(default True) +- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available. + + + +#### translate + +```python +def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]] +``` + +Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated. + +**Arguments**: + +- `results`: Generated QA pairs to translate +- `query`: The query string to translate +- `documents`: The documents to translate +- `dict_key`: If you pass a dictionary in `documents`, you can specify here the field which shall be translated. + diff --git a/docs/v1.3.0/_src/api/conf.py b/docs/v1.3.0/_src/api/conf.py new file mode 100644 index 0000000000..46046eccc0 --- /dev/null +++ b/docs/v1.3.0/_src/api/conf.py @@ -0,0 +1,52 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import sphinx_rtd_theme +import os +import sys + +sys.path.append("/Users/deepset/deepset/haystack") + + +# -- Project information ----------------------------------------------------- + +project = "Haystack" +copyright = "2020, deepset" +author = "deepset" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["sphinx.ext.autodoc", "sphinx_rtd_theme"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] diff --git a/docs/v1.3.0/_src/api/img/annotation_tool.png b/docs/v1.3.0/_src/api/img/annotation_tool.png new file mode 100644 index 0000000000..eb2c601d9e Binary files /dev/null and b/docs/v1.3.0/_src/api/img/annotation_tool.png differ diff --git a/docs/v1.3.0/_src/api/img/code_snippet_usage.png b/docs/v1.3.0/_src/api/img/code_snippet_usage.png new file mode 100644 index 0000000000..e7d836bd9c Binary files /dev/null and b/docs/v1.3.0/_src/api/img/code_snippet_usage.png differ diff --git a/docs/v1.3.0/_src/api/img/colab_gpu_runtime.jpg b/docs/v1.3.0/_src/api/img/colab_gpu_runtime.jpg new file mode 100644 index 0000000000..883180b97e Binary files /dev/null and b/docs/v1.3.0/_src/api/img/colab_gpu_runtime.jpg differ diff --git a/docs/v1.3.0/_src/api/img/sketched_concepts_white.png b/docs/v1.3.0/_src/api/img/sketched_concepts_white.png new file mode 100644 index 0000000000..9fe5fd5c94 Binary files /dev/null and b/docs/v1.3.0/_src/api/img/sketched_concepts_white.png differ diff --git 
a/docs/v1.3.0/_src/api/index.rst b/docs/v1.3.0/_src/api/index.rst new file mode 100644 index 0000000000..42ff660913 --- /dev/null +++ b/docs/v1.3.0/_src/api/index.rst @@ -0,0 +1,16 @@ +.. Haystack documentation master file, created by + sphinx-quickstart on Tue Jul 28 14:14:55 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + api/database + api/retriever + api/reader + api/indexing + api/rest_api + api/file_converters + api/finder diff --git a/docs/v1.3.0/_src/api/make.bat b/docs/v1.3.0/_src/api/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/docs/v1.3.0/_src/api/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/v1.3.0/_src/api/openapi/openapi-1.1.0.json b/docs/v1.3.0/_src/api/openapi/openapi-1.1.0.json new file mode 100644 index 0000000000..f5fcee5b74 --- /dev/null +++ b/docs/v1.3.0/_src/api/openapi/openapi-1.1.0.json @@ -0,0 +1,834 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.1.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + 
"description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through 
the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": 
"full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + 
"content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + 
"type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + 
"is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": 
"#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + 
}, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/api/openapi/openapi-1.2.0.json b/docs/v1.3.0/_src/api/openapi/openapi-1.2.0.json new file mode 100644 index 0000000000..36971bd89f --- /dev/null +++ b/docs/v1.3.0/_src/api/openapi/openapi-1.2.0.json @@ -0,0 +1,834 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.2.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + 
"search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": 
{ + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param 
can be used to limit response size for large documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou 
can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + 
"type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" 
+ } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": 
{ + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": 
"object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/api/openapi/openapi-1.2.1rc0.json b/docs/v1.3.0/_src/api/openapi/openapi-1.2.1rc0.json new file mode 100644 index 0000000000..8c8ae9c864 --- /dev/null +++ b/docs/v1.3.0/_src/api/openapi/openapi-1.2.1rc0.json @@ -0,0 +1,834 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.2.1rc0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + 
} + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + 
"operationId": "post_feedback_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + 
"schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + 
} + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to retrieve all documents.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo delete all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": 
"delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + 
"clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + 
"title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": 
"QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/api/openapi/openapi.json b/docs/v1.3.0/_src/api/openapi/openapi.json new file mode 100644 index 0000000000..8c8ae9c864 --- /dev/null +++ b/docs/v1.3.0/_src/api/openapi/openapi.json @@ -0,0 +1,834 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.2.1rc0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver 
is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + 
"feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": 
"get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see 
https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to retrieve all documents.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete 
Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo delete all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + 
"Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", 
+ "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + 
"$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git 
a/docs/v1.3.0/_src/api/pydoc/answer-generator.yml b/docs/v1.3.0/_src/api/pydoc/answer-generator.yml new file mode 100644 index 0000000000..7cc6c234b5 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/answer-generator.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/answer_generator] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: generator.md diff --git a/docs/v1.3.0/_src/api/pydoc/crawler.yml b/docs/v1.3.0/_src/api/pydoc/crawler.yml new file mode 100644 index 0000000000..952e5f5b5e --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/crawler.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/connector] + modules: ['crawler'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: crawler.md diff --git a/docs/v1.3.0/_src/api/pydoc/document-classifier.yml b/docs/v1.3.0/_src/api/pydoc/document-classifier.yml new file mode 100644 index 0000000000..de1dea292a --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/document-classifier.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/document_classifier] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + 
- type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: document_classifier.md diff --git a/docs/v1.3.0/_src/api/pydoc/document-store.yml b/docs/v1.3.0/_src/api/pydoc/document-store.yml new file mode 100644 index 0000000000..ae233e1567 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/document-store.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/document_stores] + modules: ['base', 'elasticsearch', 'memory', 'sql', 'faiss', 'milvus1', 'milvus2', 'weaviate', 'graphdb', 'deepsetcloud', 'pinecone', 'utils'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: document_store.md diff --git a/docs/v1.3.0/_src/api/pydoc/evaluation.yml b/docs/v1.3.0/_src/api/pydoc/evaluation.yml new file mode 100644 index 0000000000..364db4ef33 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/evaluation.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/evaluator] + modules: ['evaluator'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: evaluation.md diff --git a/docs/v1.3.0/_src/api/pydoc/extractor.yml b/docs/v1.3.0/_src/api/pydoc/extractor.yml new file mode 100644 index 0000000000..a7c6ff1793 --- /dev/null +++ 
b/docs/v1.3.0/_src/api/pydoc/extractor.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/extractor] + modules: ['entity'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: extractor.md diff --git a/docs/v1.3.0/_src/api/pydoc/file-classifier.yml b/docs/v1.3.0/_src/api/pydoc/file-classifier.yml new file mode 100644 index 0000000000..6a59289344 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/file-classifier.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/file_classifier] + modules: ['file_type'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: file_classifier.md diff --git a/docs/v1.3.0/_src/api/pydoc/file-converters.yml b/docs/v1.3.0/_src/api/pydoc/file-converters.yml new file mode 100644 index 0000000000..5bb109525b --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/file-converters.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/file_converter] + modules: ['base', 'docx', 'image', 'markdown', 'pdf', 'tika', 'txt'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + 
add_method_class_prefix: false + add_member_class_prefix: false + filename: file_converter.md diff --git a/docs/v1.3.0/_src/api/pydoc/other.yml b/docs/v1.3.0/_src/api/pydoc/other.yml new file mode 100644 index 0000000000..feef4c00d2 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/other.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/other] + modules: ['docs2answers', 'join_docs', 'join_answers', 'route_documents'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: other.md diff --git a/docs/v1.3.0/_src/api/pydoc/pipelines.yml b/docs/v1.3.0/_src/api/pydoc/pipelines.yml new file mode 100644 index 0000000000..07da6fd2d1 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/pipelines.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/pipelines] + modules: ['base', 'standard_pipelines'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: pipelines.md diff --git a/docs/v1.3.0/_src/api/pydoc/preprocessor.yml b/docs/v1.3.0/_src/api/pydoc/preprocessor.yml new file mode 100644 index 0000000000..49298cffb3 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/preprocessor.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/preprocessor] + modules: ['base', 'preprocessor'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + 
expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: preprocessor.md diff --git a/docs/v1.3.0/_src/api/pydoc/primitives.yml b/docs/v1.3.0/_src/api/pydoc/primitives.yml new file mode 100644 index 0000000000..3262e1cbc3 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/primitives.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/] + modules: ['schema'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: primitives.md diff --git a/docs/v1.3.0/_src/api/pydoc/query-classifier.yml b/docs/v1.3.0/_src/api/pydoc/query-classifier.yml new file mode 100644 index 0000000000..79027e42e7 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/query-classifier.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/query_classifier] + modules: ['base', 'sklearn', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: query_classifier.md diff --git a/docs/v1.3.0/_src/api/pydoc/question-generator.yml b/docs/v1.3.0/_src/api/pydoc/question-generator.yml new file mode 100644 index 0000000000..8253f78bab --- /dev/null +++ 
b/docs/v1.3.0/_src/api/pydoc/question-generator.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/question_generator] + modules: ['question_generator'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: question_generator.md diff --git a/docs/v1.3.0/_src/api/pydoc/ranker.yml b/docs/v1.3.0/_src/api/pydoc/ranker.yml new file mode 100644 index 0000000000..0b341ce3e0 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/ranker.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/ranker] + modules: ['base', 'sentence_transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: ranker.md diff --git a/docs/v1.3.0/_src/api/pydoc/reader.yml b/docs/v1.3.0/_src/api/pydoc/reader.yml new file mode 100644 index 0000000000..fd886742a6 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/reader.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/reader] + modules: ['base', 'farm', 'transformers', 'table'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + 
add_member_class_prefix: false + filename: reader.md diff --git a/docs/v1.3.0/_src/api/pydoc/retriever.yml b/docs/v1.3.0/_src/api/pydoc/retriever.yml new file mode 100644 index 0000000000..43041f3bc3 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/retriever.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/retriever] + modules: ['base', 'sparse', 'dense', 'text2sparql'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: retriever.md diff --git a/docs/v1.3.0/_src/api/pydoc/summarizer.yml b/docs/v1.3.0/_src/api/pydoc/summarizer.yml new file mode 100644 index 0000000000..0aae78b600 --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/summarizer.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/summarizer] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: summarizer.md diff --git a/docs/v1.3.0/_src/api/pydoc/translator.yml b/docs/v1.3.0/_src/api/pydoc/translator.yml new file mode 100644 index 0000000000..76cdedec5a --- /dev/null +++ b/docs/v1.3.0/_src/api/pydoc/translator.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/translator] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + 
do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: translator.md diff --git a/docs/v1.3.0/_src/benchmarks/farm_per_component.html b/docs/v1.3.0/_src/benchmarks/farm_per_component.html new file mode 100644 index 0000000000..6a9d3d5cea --- /dev/null +++ b/docs/v1.3.0/_src/benchmarks/farm_per_component.html @@ -0,0 +1,48 @@ + + + + + + + +
+ + + diff --git a/docs/v1.3.0/_src/benchmarks/reader_performance.json b/docs/v1.3.0/_src/benchmarks/reader_performance.json new file mode 100644 index 0000000000..be935fe271 --- /dev/null +++ b/docs/v1.3.0/_src/benchmarks/reader_performance.json @@ -0,0 +1,44 @@ +{ + "chart_type": "BarChart", + "title": "Reader Performance", + "subtitle": "Time and Accuracy Benchmarks", + "description": "Performance benchmarks of different Readers that can be used off-the-shelf in Haystack. Some models are geared towards speed, while others are more performance-focused. Accuracy is measured as F1 score and speed as passages/sec (with passages of 384 tokens). Each Reader is benchmarked using the SQuAD v2.0 development set, which contains 11866 question answer pairs. When tokenized using the BERT tokenizer and split using a sliding window approach, these become 12350 passages that are passed into the model. We set max_seq_len=384 and doc_stride=128. These benchmarking tests are run using an AWS p3.2xlarge instance with an Nvidia V100 GPU with this script. Please note that we are using the FARMReader class rather than the TransformersReader class. 
Also, the F1 measure that is reported here is in fact calculated on token level, rather than word level as is done in the official SQuAD script.", + "bars": "horizontal", + "columns": [ + "Model", + "F1", + "Speed (passages/sec)" + ], + "data": [ + { + "F1": 82.58860575299658, + "Speed": 125.81040525892848, + "Model": "RoBERTa" + }, + { + "F1": 78.87858491007042, + "Speed": 260.6443097981493, + "Model": "MiniLM" + }, + { + "F1": 74.31182400443286, + "Speed": 121.08066567525722, + "Model": "BERT base" + }, + { + "F1": 83.26306774734308, + "Speed": 42.21949937744112, + "Model": "BERT large" + }, + { + "F1": 84.50422699207468, + "Speed": 42.07400844838985, + "Model": "XLM-RoBERTa" + }, + { + "F1": 42.31925844723574, + "Speed": 222.91207128366702, + "Model": "DistilBERT" + } + ] +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/benchmarks/retriever_map.json b/docs/v1.3.0/_src/benchmarks/retriever_map.json new file mode 100644 index 0000000000..51e0687cf3 --- /dev/null +++ b/docs/v1.3.0/_src/benchmarks/retriever_map.json @@ -0,0 +1,204 @@ +{ + "chart_type": "LineChart", + "title": "Retriever Accuracy", + "subtitle": "mAP at different number of docs", + "description": "Here you can see how the mean avg. precision (mAP) of the retriever decays as the number of documents increases. 
The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.", + "columns": [ + "n_docs", + "BM25 / Elasticsearch", + "DPR / Elasticsearch", + "DPR / FAISS (flat)", + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)", + "Sentence Transformers / Elasticsearch" + ], + "axis": [ + { + "x": "Number of docs", + "y": "mAP" + } + ], + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 10000, + "map": 66.26543444531747 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 1000, + "map": 90.06638620360428 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 10000, + "map": 87.11255142468549 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 10000, + "map": 89.51337675393017 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "map": 88.24421129104469 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "map": 86.54606328368976 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "map": 56.25299537353825 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 500000, + "map": 45.595090262466535 + }, + { + 
"model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "map": 82.74686664920836 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 500000, + "map": 76.49564526892904 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "map": 86.54606328368973 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "map": 84.33419639513305 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 500000, + "map": 75.73062475537202 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "map": 81.63864883662649 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "map": 73.57986207906387 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 1000, + "map": 74.20444712972909 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "map": 89.8709701490436 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "map": 92.76308330349686 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "map": 89.00403653862938 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "map": 85.7342431384476 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "map": 80.85588135082547 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "map": 77.5426462347698 + } + ] +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/benchmarks/retriever_performance.json b/docs/v1.3.0/_src/benchmarks/retriever_performance.json new file mode 100644 index 0000000000..dbb9340481 --- /dev/null +++ 
b/docs/v1.3.0/_src/benchmarks/retriever_performance.json @@ -0,0 +1,88 @@ +{ + "chart_type": "BarChart", + "title": "Retriever Performance", + "subtitle": "Time and Accuracy Benchmarks", + "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embeddings are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. 
Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.", + "bars": "horizontal", + "columns": [ + "Model", + "mAP", + "Index Speed (docs/sec)", + "Query Speed (queries/sec)" + ], + "series": { + "s0": "map", + "s1": "time", + "s2": "time" + }, + "axes": { + "label": "map", + "time_side": "top", + "time_label": "seconds" + }, + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "index_speed": 71.36964873196698, + "query_speed": 5.192368815242574, + "map": 86.54606328368976 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "index_speed": 485.5602670200369, + "query_speed": 103.0884393334727, + "map": 56.25299537353825 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "index_speed": 119.52937722555107, + "query_speed": 6.385621466857457, + "map": 82.74686664920836 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "index_speed": 100.01184910084558, + "query_speed": 6.6270933964840415, + "map": 86.54606328368973 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "index_speed": 89.90389306648805, + "query_speed": 39.7839528511866, + "map": 84.33419639513305 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "index_speed": 116.00982709720004, + "query_speed": 28.57264344960955, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "index_speed": 115.61076852516383, + "query_speed": 38.80526238789059, + "map": 81.63864883662649 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "index_speed": 70.05381128388427, + "query_speed": 15.306895223372484, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "index_speed": 70.31004397719536, + "query_speed": 24.95733865947408, + "map": 85.7342431384476 + } + ] +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/benchmarks/retriever_speed.json 
b/docs/v1.3.0/_src/benchmarks/retriever_speed.json new file mode 100644 index 0000000000..7877d2a358 --- /dev/null +++ b/docs/v1.3.0/_src/benchmarks/retriever_speed.json @@ -0,0 +1,204 @@ +{ + "chart_type": "LineChart", + "title": "Retriever Speed", + "subtitle": "Query Speed at different number of docs", + "description": "Here you can see how the query speed of different Retriever / DocumentStore combinations scales as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.", + "columns": [ + "n_docs", + "BM25 / Elasticsearch", + "DPR / Elasticsearch", + "DPR / FAISS (flat)", + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)", + "Sentence Transformers / Elasticsearch" + ], + "axis": [ + { + "x": "Number of docs", + "y": "Queries/sec" + } + ], + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 1000, + "query_speed": 34.22768858415144 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 10000, + "query_speed": 22.197089725786853 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 10000, + "query_speed": 127.11481826852273 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 1000, + "query_speed": 47.51341215808855 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 10000, + "query_speed": 29.74515869340777 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "query_speed": 42.49634272581313 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "query_speed": 27.684040507849826 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 1000, + "query_speed": 43.36685860983961 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 10000, + "query_speed": 41.819147130090286 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "query_speed": 41.12204778755844 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 10000, + "query_speed": 
37.86882443918513 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "query_speed": 41.14803671045185 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "query_speed": 40.072871546542935 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "query_speed": 5.192368815242574 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 500000, + "query_speed": 1.0337466563959614 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "query_speed": 103.0884393334727 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 500000, + "query_speed": 78.95037031647355 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "query_speed": 6.385621466857457 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 500000, + "query_speed": 1.4175454254854258 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "query_speed": 6.6270933964840415 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "query_speed": 1.5394964631878052 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "query_speed": 39.7839528511866 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 500000, + "query_speed": 39.84177061191119 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "query_speed": 28.57264344960955 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 500000, + "query_speed": 15.645867393099733 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "query_speed": 38.80526238789059 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "query_speed": 37.15717318924075 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 1000, + "query_speed": 282.95914917837337 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "query_speed": 29.061163356184426 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "query_speed": 24.834414667596725 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + 
"query_speed": 15.306895223372484 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "query_speed": 29.10621389658101 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "query_speed": 26.92417300437131 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "query_speed": 24.95733865947408 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "query_speed": 11.33271222977541 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "query_speed": 24.13921492357397 + } + ] +} \ No newline at end of file diff --git a/docs/v1.3.0/_src/conf.py b/docs/v1.3.0/_src/conf.py new file mode 100644 index 0000000000..fcbedb6dcd --- /dev/null +++ b/docs/v1.3.0/_src/conf.py @@ -0,0 +1,88 @@ +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = "Haystack" +copyright = "2020, deepset" +author = "deepset" + +# The full version, including alpha/beta/rc tags + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "IPython.sphinxext.ipython_console_highlighting", + "sphinx_rtd_theme", + "sphinx_tabs.tabs", + "sphinx_copybutton", + "nbsphinx", + "sphinx.ext.autosectionlabel", + "sphinx_markdown_builder", + "recommonmark", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["../templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["build/*"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# This fixes weird spacing between bullet points in lists +html4_writer = True + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["../static"] + +# -- Added configuration ----------------------------------------------------- + +# Define master file which is by default contents.rst +master_doc = "index" + +# Logo for the title +html_logo = "img/logo.png" + +# Custom css +# html_context = {"css_files":["_static/custom.css"]} + +# Additional layouts +html_additional_pages = {"index": "pages/index.html"} + +# The file extensions of source files. 
+source_suffix = {".rst": "restructuredtext", ".txt": "restructuredtext", ".md": "markdown"} + +# -- Add autodocs for __init__() methods ------------------------------------- + + +def skip(app, what, name, obj, would_skip, options): + if name == "__init__": + return False + return would_skip + + +def setup(app): + # Custom css + app.add_stylesheet("rtd_theme.css") + app.connect("autodoc-skip-member", skip) diff --git a/docs/v1.3.0/_src/img/annotation_tool.png b/docs/v1.3.0/_src/img/annotation_tool.png new file mode 100644 index 0000000000..eb2c601d9e Binary files /dev/null and b/docs/v1.3.0/_src/img/annotation_tool.png differ diff --git a/docs/v1.3.0/_src/img/code_snippet_usage.png b/docs/v1.3.0/_src/img/code_snippet_usage.png new file mode 100644 index 0000000000..e7d836bd9c Binary files /dev/null and b/docs/v1.3.0/_src/img/code_snippet_usage.png differ diff --git a/docs/v1.3.0/_src/img/colab_gpu_runtime.jpg b/docs/v1.3.0/_src/img/colab_gpu_runtime.jpg new file mode 100644 index 0000000000..883180b97e Binary files /dev/null and b/docs/v1.3.0/_src/img/colab_gpu_runtime.jpg differ diff --git a/docs/v1.3.0/_src/img/concepts_haystack_handdrawn.png b/docs/v1.3.0/_src/img/concepts_haystack_handdrawn.png new file mode 100644 index 0000000000..e24e86b755 Binary files /dev/null and b/docs/v1.3.0/_src/img/concepts_haystack_handdrawn.png differ diff --git a/docs/v1.3.0/_src/img/concepts_haystack_v2.png b/docs/v1.3.0/_src/img/concepts_haystack_v2.png new file mode 100644 index 0000000000..d5f973b8d5 Binary files /dev/null and b/docs/v1.3.0/_src/img/concepts_haystack_v2.png differ diff --git a/docs/v1.3.0/_src/img/demo.png b/docs/v1.3.0/_src/img/demo.png new file mode 100644 index 0000000000..9fec970fb4 Binary files /dev/null and b/docs/v1.3.0/_src/img/demo.png differ diff --git a/docs/v1.3.0/_src/img/first_time_contributor_enable_access.png b/docs/v1.3.0/_src/img/first_time_contributor_enable_access.png new file mode 100644 index 0000000000..8161b6ec0a Binary files /dev/null 
and b/docs/v1.3.0/_src/img/first_time_contributor_enable_access.png differ diff --git a/docs/v1.3.0/_src/img/fork_action_config.png b/docs/v1.3.0/_src/img/fork_action_config.png new file mode 100644 index 0000000000..253d841382 Binary files /dev/null and b/docs/v1.3.0/_src/img/fork_action_config.png differ diff --git a/docs/v1.3.0/_src/img/haystack_logo_blue_banner.png b/docs/v1.3.0/_src/img/haystack_logo_blue_banner.png new file mode 100644 index 0000000000..a95bca428b Binary files /dev/null and b/docs/v1.3.0/_src/img/haystack_logo_blue_banner.png differ diff --git a/docs/v1.3.0/_src/img/haystack_logo_blue_banner_social_media.png b/docs/v1.3.0/_src/img/haystack_logo_blue_banner_social_media.png new file mode 100644 index 0000000000..28732d7097 Binary files /dev/null and b/docs/v1.3.0/_src/img/haystack_logo_blue_banner_social_media.png differ diff --git a/docs/v1.3.0/_src/img/haystack_logo_colored.png b/docs/v1.3.0/_src/img/haystack_logo_colored.png new file mode 100644 index 0000000000..4e7e3cfa3a Binary files /dev/null and b/docs/v1.3.0/_src/img/haystack_logo_colored.png differ diff --git a/docs/v1.3.0/_src/img/logo.png b/docs/v1.3.0/_src/img/logo.png new file mode 100644 index 0000000000..8a8aa7b665 Binary files /dev/null and b/docs/v1.3.0/_src/img/logo.png differ diff --git a/docs/v1.3.0/_src/img/main_example.gif b/docs/v1.3.0/_src/img/main_example.gif new file mode 100644 index 0000000000..f93ad80543 Binary files /dev/null and b/docs/v1.3.0/_src/img/main_example.gif differ diff --git a/docs/v1.3.0/_src/img/retriever_reader.png b/docs/v1.3.0/_src/img/retriever_reader.png new file mode 100644 index 0000000000..50a5c451c9 Binary files /dev/null and b/docs/v1.3.0/_src/img/retriever_reader.png differ diff --git a/docs/v1.3.0/_src/img/search.png b/docs/v1.3.0/_src/img/search.png new file mode 100644 index 0000000000..db54b0211e Binary files /dev/null and b/docs/v1.3.0/_src/img/search.png differ diff --git a/docs/v1.3.0/_src/img/sketched_concepts_white.png 
b/docs/v1.3.0/_src/img/sketched_concepts_white.png new file mode 100644 index 0000000000..9fe5fd5c94 Binary files /dev/null and b/docs/v1.3.0/_src/img/sketched_concepts_white.png differ diff --git a/docs/v1.3.0/_src/img/streamlit_ui_screenshot.png b/docs/v1.3.0/_src/img/streamlit_ui_screenshot.png new file mode 100644 index 0000000000..b4a96d3b18 Binary files /dev/null and b/docs/v1.3.0/_src/img/streamlit_ui_screenshot.png differ diff --git a/docs/v1.3.0/_src/img/streamlit_ui_screenshot_eval_mode.PNG b/docs/v1.3.0/_src/img/streamlit_ui_screenshot_eval_mode.PNG new file mode 100755 index 0000000000..c05cad28ba Binary files /dev/null and b/docs/v1.3.0/_src/img/streamlit_ui_screenshot_eval_mode.PNG differ diff --git a/docs/v1.3.0/_src/img/tutorial11_custompipelines_pipeline_ensemble.png b/docs/v1.3.0/_src/img/tutorial11_custompipelines_pipeline_ensemble.png new file mode 100644 index 0000000000..56b58cbe35 Binary files /dev/null and b/docs/v1.3.0/_src/img/tutorial11_custompipelines_pipeline_ensemble.png differ diff --git a/docs/v1.3.0/_src/img/tutorial11_decision_nodes_pipeline_classifier.png b/docs/v1.3.0/_src/img/tutorial11_decision_nodes_pipeline_classifier.png new file mode 100644 index 0000000000..28a4eb4d30 Binary files /dev/null and b/docs/v1.3.0/_src/img/tutorial11_decision_nodes_pipeline_classifier.png differ diff --git a/docs/v1.3.0/_src/img/zenhub_board.png b/docs/v1.3.0/_src/img/zenhub_board.png new file mode 100644 index 0000000000..d1b9efb3c9 Binary files /dev/null and b/docs/v1.3.0/_src/img/zenhub_board.png differ diff --git a/docs/v1.3.0/_src/img/zenhub_issue.png b/docs/v1.3.0/_src/img/zenhub_issue.png new file mode 100644 index 0000000000..6edae7e46f Binary files /dev/null and b/docs/v1.3.0/_src/img/zenhub_issue.png differ diff --git a/docs/v1.3.0/_src/img/zenhub_roadmap.png b/docs/v1.3.0/_src/img/zenhub_roadmap.png new file mode 100644 index 0000000000..ae88154ed6 Binary files /dev/null and b/docs/v1.3.0/_src/img/zenhub_roadmap.png differ diff --git 
a/docs/v1.3.0/_src/index.rst b/docs/v1.3.0/_src/index.rst new file mode 100644 index 0000000000..07fb63bdb5 --- /dev/null +++ b/docs/v1.3.0/_src/index.rst @@ -0,0 +1,21 @@ +.. toctree:: + :caption: Usage + :maxdepth: 3 + :numbered: + + usage/index + +.. toctree:: + :caption: Tutorials + :maxdepth: 3 + :numbered: + + tutorials/index + +.. toctree:: + :caption: API + :maxdepth: 3 + :numbered: + + api/index + diff --git a/docs/v1.3.0/_src/tutorials/Makefile b/docs/v1.3.0/_src/tutorials/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/v1.3.0/_src/tutorials/conf.py b/docs/v1.3.0/_src/tutorials/conf.py new file mode 100644 index 0000000000..4511b84159 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/conf.py @@ -0,0 +1,51 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = "Tutorials" +copyright = "2020, deepset" +author = "deepset" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["IPython.sphinxext.ipython_console_highlighting"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "alabaster" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] diff --git a/docs/v1.3.0/_src/tutorials/index.rst b/docs/v1.3.0/_src/tutorials/index.rst new file mode 100644 index 0000000000..4351a5f784 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/index.rst @@ -0,0 +1,13 @@ +Tutorials +==================================== + +.. 
toctree:: + :maxdepth: 4 + :caption: Contents: + + 1) Using Haystack to search through your own documents + 2) Make Haystack understand your jargon + 3) Connect Haystack to your Datastore of choice + 4) Answer incoming questions using FAQ pages + 5) Benchmark the different components of Haystack + 6) SoTA: Powerup Haystack with DPR diff --git a/docs/v1.3.0/_src/tutorials/make.bat b/docs/v1.3.0/_src/tutorials/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/v1.3.0/_src/tutorials/tutorials/1.md b/docs/v1.3.0/_src/tutorials/tutorials/1.md new file mode 100644 index 0000000000..2f544bb815 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/1.md @@ -0,0 +1,286 @@ + + +# Build Your First QA System + + + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb) + +Question Answering can be used in a variety of use cases. A very common one: Using it to navigate through complex knowledge bases or long documents ("search setting"). 
+ +A "knowledge base" could for example be your website, an internal wiki or a collection of financial reports. +In this tutorial we will work on a slightly different domain: "Game of Thrones". + +Let's see how we can use a bunch of Wikipedia articles to answer a variety of questions about the +marvellous seven kingdoms. + + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + + +```python +from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers +from haystack.nodes import FARMReader, TransformersReader +``` + +## Document Store + +Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`. + +**Here:** We recommend Elasticsearch as it comes preloaded with features like [full-text queries](https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html), [BM25 retrieval](https://www.elastic.co/elasticon/conf/2016/sf/improved-text-scoring-with-bm25), and [vector storage for text embeddings](https://www.elastic.co/guide/en/elasticsearch/reference/7.6/dense-vector.html). 
+ +**Alternatives:** If you are unable to set up an Elasticsearch instance, then follow [Tutorial 3](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb) for using SQL/InMemory document stores. + +**Hint**: This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can configure Haystack to work with your existing document stores. + +### Start an Elasticsearch server +You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source. + + +```python +# Recommended: Start Elasticsearch using Docker via the Haystack utility function +from haystack.utils import launch_es + +launch_es() +``` + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + + +```python +# Connect to Elasticsearch + +from haystack.document_stores import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") +``` + +## Preprocessing of documents + +Haystack provides a customizable pipeline for: + - converting files into texts + - cleaning texts + - splitting texts + - writing them to a Document Store + +In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index them in Elasticsearch. 
+ + +```python +# Let's first fetch some documents that we want to query +# Here: 517 Wikipedia articles for Game of Thrones +doc_dir = "data/tutorial1" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Convert files to dicts +# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers) +# It must take a str as input, and return a str. +dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# We now have a list of dictionaries that we can write to our document store. +# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself. +# The default format here is: +# { +# 'content': "", +# 'meta': {'name': "", ...} +# } +# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and +# can be accessed later for filtering or shown in the responses of the Pipeline) + +# Let's have a look at the first 3 entries: +print(dicts[:3]) + +# Now, let's write the dicts containing documents to our DB. +document_store.write_documents(dicts) +``` + +## Initialize Retriever, Reader, & Pipeline + +### Retriever + +Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered. +They use some simple but fast algorithm. + +**Here:** We use Elasticsearch's default BM25 algorithm + +**Alternatives:** + +- Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters +- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging +- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. 
created via Sentence-BERT) +- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6) + + +```python +from haystack.nodes import ElasticsearchRetriever + +retriever = ElasticsearchRetriever(document_store=document_store) +``` + + +```python +# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store. + +# from haystack.nodes import TfidfRetriever +# retriever = TfidfRetriever(document_store=document_store) +``` + +### Reader + +A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based +on powerful, but slower deep learning models. + +Haystack currently supports Readers based on the frameworks FARM and Transformers. +With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models). + +**Here:** a medium sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2) + +**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package) + +**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy) + +**Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible" + +#### FARMReader + + +```python +# Load a local model or any of the QA models on +# Hugging Face's model hub (https://huggingface.co/models) + +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +``` + +#### TransformersReader + + +```python +# Alternative: +# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) +``` + +### Pipeline + +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. 
+Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). + + +```python +from haystack.pipelines import ExtractiveQAPipeline + +pipe = ExtractiveQAPipeline(reader, retriever) +``` + +## Voilà! Ask a question! + + +```python +# You can configure how many candidates the reader and retriever shall return +# The higher top_k_retriever, the better (but also the slower) your answers. +prediction = pipe.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +``` + + +```python +# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}}) +# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}}) +``` + + +```python +# Now you can either print the object directly... +from pprint import pprint + +pprint(prediction) + +# Sample output: +# { +# 'answers': [ , +# , +# ... +# ] +# 'documents': [ , +# , +# ... +# ], +# 'no_ans_gap': 11.688868522644043, +# 'node_id': 'Reader', +# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}}, +# 'query': 'Who is the father of Arya Stark?', +# 'root_node': 'Query' +# } +``` + + +```python +# ...or use a util to simplify the output +# Change `minimum` to `medium` or `all` to raise the level of detail +print_answers(prediction, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.3.0/_src/tutorials/tutorials/10.md b/docs/v1.3.0/_src/tutorials/tutorials/10.md new file mode 100644 index 0000000000..6b71c70893 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/10.md @@ -0,0 +1,158 @@ + + +# Question Answering on a Knowledge Graph + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.ipynb) + +Haystack allows storing and querying knowledge graphs with the help of pre-trained models that translate text queries to SPARQL queries. +This tutorial demonstrates how to load an existing knowledge graph into haystack, load a pre-trained retriever, and execute text queries on the knowledge graph. +The training of models that translate text queries into SPARQL queries is currently not supported. + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,graphdb] +``` + + +```python +# Here are some imports that we'll need + +import subprocess +import time +from pathlib import Path + +from haystack.nodes import Text2SparqlRetriever +from haystack.document_stores import GraphDBKnowledgeGraph +from haystack.utils import fetch_archive_from_http +``` + +## Downloading Knowledge Graph and Model + + +```python +# Let's first fetch some triples that we want to store in our knowledge graph +# Here: exemplary triples from the wizarding world +graph_dir = "data/tutorial10" +s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip" +fetch_archive_from_http(url=s3_url, output_dir=graph_dir) + +# Fetch a pre-trained BART model that translates text queries to SPARQL queries +model_dir = "../saved_models/tutorial10_knowledge_graph/" +s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip" +fetch_archive_from_http(url=s3_url, output_dir=model_dir) +``` + +## Launching a GraphDB instance + + +```python +# Unfortunately, there seems to be no good way to run GraphDB in colab environments +# In your local environment, you could start a GraphDB server with docker +# Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/ +print("Starting GraphDB ...") +status = subprocess.run( + [ + "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11" + ], + shell=True, +) +if status.returncode: + raise Exception( + "Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?" 
+ ) +time.sleep(5) +``` + +## Creating a new GraphDB repository (also known as index in haystack's document stores) + + +```python +# Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index +kg = GraphDBKnowledgeGraph(index="tutorial_10_index") + +# Delete the index as it might have been already created in previous runs +kg.delete_index() + +# Create the index based on a configuration file +kg.create_index(config_path=Path(graph_dir + "repo-config.ttl")) + +# Import triples of subject, predicate, and object statements from a ttl file +kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir + "triples.ttl")) +print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}") +print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.") +``` + + +```python +# Define prefixes for names of resources so that we can use shorter resource names in queries +prefixes = """PREFIX rdf: +PREFIX xsd: +PREFIX hp: +""" +kg.prefixes = prefixes + +# Load a pre-trained model that translates text queries to SPARQL queries +kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=model_dir + "hp_v3.4") +``` + +## Query Execution + +We can now ask questions that will be answered by our knowledge graph! +One limitation though: our pre-trained model can only generate questions about resources it has seen during training. +Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph. +E.g. "Harry" -> "hp:Harry_potter" + + +```python +query = "In which house is Harry Potter?" +print(f'Translating the text query "{query}" to a SPARQL query and executing it on the knowledge graph...') +result = kgqa_retriever.retrieve(query=query) +print(result) +# Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . 
} +# Correct answer: Gryffindor + +print("Executing a SPARQL query with prefixed names of resources...") +result = kgqa_retriever._query_kg( + sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }" +) +print(result) +# Paraphrased question: Who is the keeper of keys and grounds? +# Correct answer: Rubeus Hagrid + +print("Executing a SPARQL query with full names of resources...") +result = kgqa_retriever._query_kg( + sparql_query="select distinct ?obj where { ?obj . }" +) +print(result) +# Paraphrased question: What is the patronus of Hermione? +# Correct answer: Otter +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/11.md b/docs/v1.3.0/_src/tutorials/tutorials/11.md new file mode 100644 index 0000000000..d3f2a8065d --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/11.md @@ -0,0 +1,418 @@ + + +# Pipelines Tutorial + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial11_Pipelines.ipynb) + +In this tutorial, you will learn how the `Pipeline` class acts as a connector between all the different +building blocks that are 
found in Haystack. Whether you are using a Reader, Generator, Summarizer +or Retriever (or 2), the `Pipeline` class will help you build a Directed Acyclic Graph (DAG) that +determines how to route the output of one component into the input of another. + + + + +## Setting Up the Environment + +Let's start by making sure we have a GPU running to get decent speed in this tutorial. +In Google Colab, you can change to a GPU runtime in the menu: +- **Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + +These lines install Haystack through pip. + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] + +# Install pygraphviz +!apt install libgraphviz-dev +!pip install pygraphviz +``` + +If running from Colab or a non-Docker environment, you will want to start Elasticsearch from source. + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + +## Initialization + +Then let's fetch some data (in this case, pages from the Game of Thrones wiki) and prepare it so that it can +be indexed into our `DocumentStore`. + + +```python +from haystack.utils import ( + print_answers, + print_documents, + fetch_archive_from_http, + convert_files_to_dicts, + clean_wiki_text, +) + +# Download and prepare data - 517 Wikipedia articles for Game of Thrones +doc_dir = "data/tutorial11" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt11.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# convert files to dicts containing documents that can be indexed to our datastore +got_dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) +``` + +Here we initialize the core components that we will be gluing together using the `Pipeline` class. +We have a `DocumentStore`, an `ElasticsearchRetriever` and a `FARMReader`. +These can be combined to create a classic Retriever-Reader pipeline that is designed +to perform Open Domain Question Answering. 
+ + +```python +from haystack import Pipeline +from haystack.utils import launch_es +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.nodes import ElasticsearchRetriever, EmbeddingRetriever, FARMReader + + +# Initialize DocumentStore and index documents +launch_es() +document_store = ElasticsearchDocumentStore() +document_store.delete_documents() +document_store.write_documents(got_dicts) + +# Initialize Sparse retriever +es_retriever = ElasticsearchRetriever(document_store=document_store) + +# Initialize dense retriever +embedding_retriever = EmbeddingRetriever( + document_store, + model_format="sentence_transformers", + embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", +) +document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False) + +# Initialize reader +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") +``` + +## Prebuilt Pipelines + +Haystack features many prebuilt pipelines that cover common tasks. +Here we have an `ExtractiveQAPipeline` (the successor to the now deprecated `Finder` class). 
+ + +```python +from haystack.pipelines import ExtractiveQAPipeline + +# Prebuilt pipeline +p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever) +res = p_extractive_premade.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +print_answers(res, details="minimum") +``` + +If you want to just do the retrieval step, you can use a `DocumentSearchPipeline` + + +```python +from haystack.pipelines import DocumentSearchPipeline + +p_retrieval = DocumentSearchPipeline(es_retriever) +res = p_retrieval.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}}) +print_documents(res, max_text_len=200) +``` + +Or if you want to use a `Generator` instead of a `Reader`, +you can initialize a `GenerativeQAPipeline` like this: + + +```python +from haystack.pipelines import GenerativeQAPipeline, FAQPipeline +from haystack.nodes import RAGenerator + +# We set this to True so that the document store returns document embeddings with each document +# This is needed by the Generator +document_store.return_embedding = True + +# Initialize generator +rag_generator = RAGenerator() + +# Generative QA +p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever) +res = p_generator.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}}) +print_answers(res, details="minimum") + +# We are setting this to False so that in later pipelines, +# we get a cleaner printout +document_store.return_embedding = False +``` + +Haystack features prebuilt pipelines to do: +- just document search (DocumentSearchPipeline), +- document search with summarization (SearchSummarizationPipeline) +- generative QA (GenerativeQAPipeline) +- FAQ style QA (FAQPipeline) +- translated search (TranslationWrapperPipeline) +To find out more about these pipelines, have a look at our [documentation](https://haystack.deepset.ai/docs/latest/pipelinesmd) + + +With 
any Pipeline, whether prebuilt or custom constructed, +you can save a diagram showing how all the components are connected. + +![image](https://user-images.githubusercontent.com/1563902/102451716-54813700-4039-11eb-881e-f3c01b47ca15.png) + + +```python +p_extractive_premade.draw("pipeline_extractive_premade.png") +p_retrieval.draw("pipeline_retrieval.png") +p_generator.draw("pipeline_generator.png") +``` + +## Custom Pipelines + +Now we are going to rebuild the `ExtractiveQAPipelines` using the generic Pipeline class. +We do this by adding the building blocks that we initialized as nodes in the graph. + + +```python +# Custom built extractive QA pipeline +p_extractive = Pipeline() +p_extractive.add_node(component=es_retriever, name="Retriever", inputs=["Query"]) +p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"]) + +# Now we can run it +res = p_extractive.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +print_answers(res, details="minimum") +p_extractive.draw("pipeline_extractive.png") +``` + +Pipelines offer a very simple way to ensemble together different components. +In this example, we are going to combine the power of an `EmbeddingRetriever` +with the keyword based `ElasticsearchRetriever`. +See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why +we might want to combine a dense and sparse retriever. + +![image](https://github.com/deepset-ai/haystack/blob/master/docs/_src/img/tutorial11_custompipelines_pipeline_ensemble.png?raw=true) + +Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together. 
+ + +```python +from haystack.pipelines import JoinDocuments + +# Create ensembled pipeline +p_ensemble = Pipeline() +p_ensemble.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) +p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"]) +p_ensemble.add_node( + component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"] +) +p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"]) +p_ensemble.draw("pipeline_ensemble.png") + +# Run pipeline +res = p_ensemble.run( + query="Who is the father of Arya Stark?", params={"EmbeddingRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}} +) +print_answers(res, details="minimum") +``` + +## Custom Nodes + +Nodes are relatively simple objects +and we encourage our users to design their own if they don't see one that fits their use case. + +The only requirements are: +- Create a class that inherits `BaseComponent`. +- Add a method run() to your class. Add the mandatory and optional arguments it needs to process. These arguments must be passed as input to the pipeline, inside `params`, or output by preceding nodes. +- Add processing logic inside the run() (e.g. reformatting the query). +- Return a tuple that contains your output data (for the next node) +and the name of the outgoing edge (by default "output_1" for nodes that have one output) +- Add a class attribute outgoing_edges = 1 that defines the number of output options from your node. You only need a higher number here if you have a decision node (see below). 
+ +Here we have a template for a Node: + + +```python +from haystack import BaseComponent +from typing import Optional + + +class CustomNode(BaseComponent): + outgoing_edges = 1 + + def run(self, query: str, my_optional_param: Optional[int]): + # process the inputs + output = {"my_output": ...} + return output, "output_1" +``` + +## Decision Nodes + +Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run. +One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader. +With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful. + +![image](https://github.com/deepset-ai/haystack/blob/master/docs/_src/img/tutorial11_decision_nodes_pipeline_classifier.png?raw=true) + +Though this looks very similar to the ensembled pipeline shown above, +the key difference is that only one of the retrievers is run for each request. +By contrast both retrievers are always run in the ensembled approach. + +Below, we define a very naive `QueryClassifier` and show how to use it: + + +```python +class CustomQueryClassifier(BaseComponent): + outgoing_edges = 2 + + def run(self, query: str): + if "?" 
in query: + return {}, "output_2" + else: + return {}, "output_1" + + +# Here we build the pipeline +p_classifier = Pipeline() +p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"]) +p_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) +p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"]) +p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]) +p_classifier.draw("pipeline_classifier.png") + +# Run only the dense retriever on the full sentence query +res_1 = p_classifier.run(query="Who is the father of Arya Stark?") +print("Embedding Retriever Results" + "\n" + "=" * 15) +print_answers(res_1) + +# Run only the sparse retriever on a keyword based query +res_2 = p_classifier.run(query="Arya Stark father") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_2) +``` + +## Evaluation Nodes + +We have also designed a set of nodes that can be used to evaluate the performance of a system. +Have a look at our [tutorial](https://haystack.deepset.ai/docs/latest/tutorial5md) to get hands on with the code and learn more about Evaluation Nodes! + +## Debugging Pipelines + +You can print out debug information from nodes in your pipelines in a few different ways. + + +```python +# 1) You can set the `debug` attribute of a given node. +es_retriever.debug = True + +# 2) You can provide `debug` as a parameter when running your pipeline +result = p_classifier.run(query="Who is the father of Arya Stark?", params={"ESRetriever": {"debug": True}}) + +# 3) You can provide the `debug` parameter to all nodes in your pipeline +result = p_classifier.run(query="Who is the father of Arya Stark?", params={"debug": True}) + +result["_debug"] +``` + +## YAML Configs + +A full `Pipeline` can be defined in a YAML file and simply loaded. 
+Having your pipeline available in a YAML is particularly useful +when you move between experimentation and production environments. +Just export the YAML from your notebook / IDE and import it into your production environment. +It also helps with version control of pipelines, +allows you to share your pipeline easily with colleagues, +and simplifies the configuration of pipeline parameters in production. + +It consists of two main sections: you define all objects (e.g. a reader) in components +and then stick them together into a pipeline in pipelines. +You can also set one component to be multiple nodes of a pipeline or to be a node across multiple pipelines. +It will be loaded just once in memory and therefore doesn't hurt your resources more than actually needed. + +The contents of a YAML file should look something like this: + +```yaml +version: '0.7' +components: # define all the building-blocks for Pipeline +- name: MyReader # custom-name for the component; helpful for visualization & debugging + type: FARMReader # Haystack Class name for the component + params: + no_ans_boost: -10 + model_name_or_path: deepset/roberta-base-squad2 +- name: MyESRetriever + type: ElasticsearchRetriever + params: + document_store: MyDocumentStore # params can reference other components defined in the YAML + custom_query: null +- name: MyDocumentStore + type: ElasticsearchDocumentStore + params: + index: haystack_test +pipelines: # multiple Pipelines can be defined using the components from above +- name: my_query_pipeline # a simple extractive-qa Pipeline + nodes: + - name: MyESRetriever + inputs: [Query] + - name: MyReader + inputs: [MyESRetriever] +``` + +To load, simply call: +``` python +pipeline.load_from_yaml(Path("sample.yaml")) +``` + +## Conclusion + +The possibilities are endless with the `Pipeline` class and we hope that this tutorial will inspire you +to build custom pipelines that really work for your use case! 
+ +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/12.md b/docs/v1.3.0/_src/tutorials/tutorials/12.md new file mode 100644 index 0000000000..0ccb72fdcc --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/12.md @@ -0,0 +1,168 @@ + + +# Long-Form Question Answering + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial12_LFQA.ipynb) + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install -q git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss] +``` + + +```python +from haystack.utils import convert_files_to_dicts, fetch_archive_from_http, clean_wiki_text +from haystack.nodes import Seq2SeqGenerator +``` + +### Document Store + +FAISS is a library for efficient similarity search on a cluster of dense vectors. +The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under-the-hood +to store the document text and other meta data. The vector embeddings of the text are +indexed on a FAISS Index that later is queried for searching answers. +The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for +faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor. +For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + + +```python +from haystack.document_stores import FAISSDocumentStore + +document_store = FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat") +``` + +### Cleaning & indexing documents + +Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore + + +```python +# Let's first get some files that we want to use +doc_dir = "data/tutorial12" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Convert files to dicts +dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# Now, let's write the dicts containing documents to our DB. 
+document_store.write_documents(dicts) +``` + +### Initialize Retriever and Reader/Generator + +#### Retriever + +We use a `DensePassageRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore` + + + + +```python +from haystack.nodes import DensePassageRetriever + +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki", + passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki", +) + +document_store.update_embeddings(retriever) +``` + +Before we blindly use the `DensePassageRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents. + + +```python +from haystack.utils import print_documents +from haystack.pipelines import DocumentSearchPipeline + +p_retrieval = DocumentSearchPipeline(retriever) +res = p_retrieval.run(query="Tell me something about Arya Stark?", params={"Retriever": {"top_k": 10}}) +print_documents(res, max_text_len=512) +``` + +#### Reader/Generator + +Similar to previous Tutorials we now initialize our reader/generator. + +Here we use a `Seq2SeqGenerator` with the *vblagoje/bart_lfqa* model (see: https://huggingface.co/vblagoje/bart_lfqa) + + + + +```python +generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa") +``` + +### Pipeline + +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). 
+ + +```python +from haystack.pipelines import GenerativeQAPipeline + +pipe = GenerativeQAPipeline(generator, retriever) +``` + +## Voilà! Ask a question! + + +```python +pipe.run( + query="How did Arya Stark's character get portrayed in a television adaptation?", params={"Retriever": {"top_k": 3}} +) +``` + + +```python +pipe.run(query="Why is Arya Stark an unusual character?", params={"Retriever": {"top_k": 3}}) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/13.md b/docs/v1.3.0/_src/tutorials/tutorials/13.md new file mode 100644 index 0000000000..2fc143b1bb --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/13.md @@ -0,0 +1,187 @@ + + +# Question Generation + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial13_Question_generation.ipynb) + +This is a bare bones tutorial showing what is possible with the QuestionGenerator Nodes and Pipelines which automatically +generate questions which the question generation model thinks can be answered by a given document. 
+ +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + + +```python +# Imports needed to run this notebook + +from pprint import pprint +from tqdm import tqdm +from haystack.nodes import QuestionGenerator, ElasticsearchRetriever, FARMReader +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.pipelines import ( + QuestionGenerationPipeline, + RetrieverQuestionGenerationPipeline, + QuestionAnswerGenerationPipeline, +) +from haystack.utils import launch_es, print_questions +``` + +Let's start an Elasticsearch instance with one of the options below: + + +```python +# Option 1: Start Elasticsearch service via Docker +launch_es() +``` + + +```python +# Option 2: In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + +Let's initialize some core components + + +```python +text1 = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace." 
+text2 = "Princess Arya Stark is the third child and second daughter of Lord Eddard Stark and his wife, Lady Catelyn Stark. She is the sister of the incumbent Westerosi monarchs, Sansa, Queen in the North, and Brandon, King of the Andals and the First Men. After narrowly escaping the persecution of House Stark by House Lannister, Arya is trained as a Faceless Man at the House of Black and White in Braavos, using her abilities to avenge her family. Upon her return to Westeros, she exacts retribution for the Red Wedding by exterminating the Frey male line." +text3 = "Dry Cleaning are an English post-punk band who formed in South London in 2018.[3] The band is composed of vocalist Florence Shaw, guitarist Tom Dowse, bassist Lewis Maynard and drummer Nick Buxton. They are noted for their use of spoken word primarily in lieu of sung vocals, as well as their unconventional lyrics. Their musical stylings have been compared to Wire, Magazine and Joy Division.[4] The band released their debut single, 'Magic of Meghan' in 2019. Shaw wrote the song after going through a break-up and moving out of her former partner's apartment the same day that Meghan Markle and Prince Harry announced they were engaged.[5] This was followed by the release of two EPs that year: Sweet Princess in August and Boundary Road Snacks and Drinks in October. The band were included as part of the NME 100 of 2020,[6] as well as DIY magazine's Class of 2020.[7] The band signed to 4AD in late 2020 and shared a new single, 'Scratchcard Lanyard'.[8] In February 2021, the band shared details of their debut studio album, New Long Leg. 
They also shared the single 'Strong Feelings'.[9] The album, which was produced by John Parish, was released on 2 April 2021.[10]" + +docs = [{"content": text1}, {"content": text2}, {"content": text3}] + +# Initialize document store and write in the documents +document_store = ElasticsearchDocumentStore() +document_store.write_documents(docs) + +# Initialize Question Generator +question_generator = QuestionGenerator() +``` + +## Question Generation Pipeline + +The most basic version of a question generator pipeline takes a document as input and outputs generated questions +which the document can answer. + + +```python +question_generation_pipeline = QuestionGenerationPipeline(question_generator) +for idx, document in enumerate(document_store): + + print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n") + result = question_generation_pipeline.run(documents=[document]) + print_questions(result) +``` + +## Retriever Question Generation Pipeline + +This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these. 
+ + +```python +retriever = ElasticsearchRetriever(document_store=document_store) +rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator) + +print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n") +result = rqg_pipeline.run(query="Arya Stark") +print_questions(result) +``` + +## Question Answer Generation Pipeline + +This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using +a Reader model + + +```python +reader = FARMReader("deepset/roberta-base-squad2") +qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader) +for idx, document in enumerate(tqdm(document_store)): + + print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n") + result = qag_pipeline.run(documents=[document]) + print_questions(result) +``` + +## Translated Question Answer Generation Pipeline +Trained models for Question Answer Generation are not available in many languages other than English. Haystack +provides a workaround for that issue by machine-translating a pipeline's inputs and outputs with the +TranslationWrapperPipeline. The following example generates German questions and answers on a German text +document - by using an English model for Question Answer Generation. + + +```python +# Fill the document store with a German document. +text1 = "Python ist eine interpretierte Hochsprachenprogrammiersprache für allgemeine Zwecke. Sie wurde von Guido van Rossum entwickelt und 1991 erstmals veröffentlicht. Die Design-Philosophie von Python legt den Schwerpunkt auf die Lesbarkeit des Codes und die Verwendung von viel Leerraum (Whitespace)." 
+docs = [{"content": text1}] +document_store.delete_documents() +document_store.write_documents(docs) + +# Load machine translation models +from haystack.nodes import TransformersTranslator + +in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en") +out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") + +# Wrap the previously defined QuestionAnswerGenerationPipeline +from haystack.pipelines import TranslationWrapperPipeline + +pipeline_with_translation = TranslationWrapperPipeline( + input_translator=in_translator, output_translator=out_translator, pipeline=qag_pipeline +) + +for idx, document in enumerate(tqdm(document_store)): + print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n") + result = pipeline_with_translation.run(documents=[document]) + print_questions(result) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/14.md b/docs/v1.3.0/_src/tutorials/tutorials/14.md new file mode 100644 index 0000000000..4182aada4a --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/14.md @@ -0,0 +1,362 @@ + + +# Query Classifier Tutorial +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial14_Query_Classifier.ipynb) + +In this tutorial we introduce the query classifier. The goal of introducing this feature was to optimize the overall flow of a Haystack pipeline by detecting the nature of user queries. Haystack can now detect primarily three types of queries using either a light-weight SKLearn Gradient Boosted classifier or a more robust Transformer based classifier. The three categories of queries are as follows: + + +### 1. Keyword Queries: +Such queries don't have semantic meaning and merely consist of keywords. For instance, these three are examples of keyword queries. + +* arya stark father +* jon snow country +* arya stark younger brothers + +### 2. Interrogative Queries: +In such queries users usually ask a question. Regardless of the presence of "?" in the query, the goal here is to detect the intent of the user, i.e. whether any question is asked or not in the query. For example: + +* who is the father of arya stark ? +* which country was jon snow filmed ? +* who are the younger brothers of arya stark ? + +### 3. 
Declarative Queries: +Such queries are a variation of keyword queries, however, there is a semantic relationship between words. For example: + +* Arya stark was a daughter of a lord. +* Jon snow was filmed in a country in UK. +* Bran was brother of a princess. + +In this tutorial, you will learn how the `TransformersQueryClassifier` and `SklearnQueryClassifier` classes can be used to intelligently route your queries, based on the nature of the user query. Also, you can choose between a lightweight gradient boosted classifier or a transformer based classifier. + +Furthermore, there are two types of classifiers you can use out of the box from Haystack. +1. Keyword vs Statement/Question Query Classifier +2. Statement vs Question Query Classifier + +As evident from the name the first classifier detects the keywords search queries and semantic statements like sentences/questions. The second classifier differentiates between question based queries and declarative sentences. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + +These lines are to install Haystack through pip + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] + +# Install pygraphviz +!apt install libgraphviz-dev +!pip install pygraphviz + +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! 
chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + +If running from Colab or a no Docker environment, you will want to start Elasticsearch from source + +## Initialization + +Here are some core imports + +Then let's fetch some data (in this case, pages from the Game of Thrones wiki) and prepare it so that it can +be indexed into our `DocumentStore` + + +```python +from haystack.utils import print_answers, fetch_archive_from_http, convert_files_to_dicts, clean_wiki_text, launch_es +from haystack.pipelines import Pipeline, RootNode +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.nodes import ( + ElasticsearchRetriever, + DensePassageRetriever, + FARMReader, + TransformersQueryClassifier, + SklearnQueryClassifier, +) + +# Download and prepare data - 517 Wikipedia articles for Game of Thrones +doc_dir = "data/tutorial14" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# convert files to dicts containing documents that can be indexed to our datastore +got_dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# Initialize DocumentStore and index documents +launch_es() +document_store = ElasticsearchDocumentStore() +document_store.delete_documents() +document_store.write_documents(got_dicts) + +# Initialize Sparse retriever +es_retriever = ElasticsearchRetriever(document_store=document_store) + +# Initialize dense retriever +dpr_retriever = DensePassageRetriever(document_store) +document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False) + +reader = 
FARMReader(model_name_or_path="deepset/roberta-base-squad2") +``` + +## Keyword vs Question/Statement Classifier + +The keyword vs question/statement query classifier essentially distinguishes between the keyword queries and statements/questions. So you can intelligently route to different retrieval nodes based on the nature of the query. Using this classifier can potentially yield the following benefits: + +* Getting better search results (e.g. by routing only proper questions to DPR / QA branches and not keyword queries) +* Lower GPU costs (e.g. if 50% of your traffic is only keyword queries you could just use elastic here and save the GPU resources for the other 50% of traffic with semantic queries) + +![image](https://user-images.githubusercontent.com/6007894/127831511-f55bad86-4b4f-4b54-9889-7bba37e475c6.png) + + +Below, we define a `SklearnQueryClassifier` and show how to use it: + +Read more about the trained model and dataset used [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + +```python +# Here we build the pipeline +sklearn_keyword_classifier = Pipeline() +sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"]) +sklearn_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) +sklearn_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) +sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) +sklearn_keyword_classifier.draw("pipeline_classifier.png") +``` + + +```python +# Run only the dense retriever on the full sentence query +res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?") +print("DPR Results" + "\n" + "=" * 15) +print_answers(res_1, details="minimum") + +# Run only the sparse retriever on a keyword based query +res_2 = 
sklearn_keyword_classifier.run(query="arya stark father") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_2, details="minimum") +``` + + +```python +# Run only the dense retriever on the full sentence query +res_3 = sklearn_keyword_classifier.run(query="which country was jon snow filmed ?") +print("DPR Results" + "\n" + "=" * 15) +print_answers(res_3, details="minimum") + +# Run only the sparse retriever on a keyword based query +res_4 = sklearn_keyword_classifier.run(query="jon snow country") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_4, details="minimum") +``` + + +```python +# Run only the dense retriever on the full sentence query +res_5 = sklearn_keyword_classifier.run(query="who are the younger brothers of arya stark ?") +print("DPR Results" + "\n" + "=" * 15) +print_answers(res_5, details="minimum") + +# Run only the sparse retriever on a keyword based query +res_6 = sklearn_keyword_classifier.run(query="arya stark younger brothers") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_6, details="minimum") +``` + +## Transformer Keyword vs Question/Statement Classifier + +Firstly, it's essential to understand the trade-offs between SkLearn and Transformer query classifiers. The transformer classifier is more accurate than SkLearn classifier however, it requires more memory and most probably GPU for faster inference however the transformer size is roughly `50 MBs`. Whereas, SkLearn is less accurate however is much more faster and doesn't require GPU for inference. 
+ +Below, we define a `TransformersQueryClassifier` and show how to use it: + +Read more about the trained model and dataset used [here](https://huggingface.co/shahrukhx01/bert-mini-finetune-question-detection) + + +```python +# Here we build the pipeline +transformer_keyword_classifier = Pipeline() +transformer_keyword_classifier.add_node( + component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"] +) +transformer_keyword_classifier.add_node( + component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"] +) +transformer_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) +transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) +transformer_keyword_classifier.draw("pipeline_classifier.png") +``` + + +```python +# Run only the dense retriever on the full sentence query +res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?") +print("DPR Results" + "\n" + "=" * 15) +print_answers(res_1, details="minimum") + +# Run only the sparse retriever on a keyword based query +res_2 = transformer_keyword_classifier.run(query="arya stark father") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_2, details="minimum") +``` + + +```python +# Run only the dense retriever on the full sentence query +res_3 = transformer_keyword_classifier.run(query="which country was jon snow filmed ?") +print("DPR Results" + "\n" + "=" * 15) +print_answers(res_3, details="minimum") + +# Run only the sparse retriever on a keyword based query +res_4 = transformer_keyword_classifier.run(query="jon snow country") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_4, details="minimum") +``` + + +```python +# Run only the dense retriever on the full sentence query +res_5 = transformer_keyword_classifier.run(query="who are the younger brothers of arya stark ?") +print("DPR Results" + "\n" + "=" * 15) 
+print_answers(res_5, details="minimum") + +# Run only the sparse retriever on a keyword based query +res_6 = transformer_keyword_classifier.run(query="arya stark younger brothers") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_6, details="minimum") +``` + +## Question vs Statement Classifier + +One possible use case of this classifier could be to route queries after the document retrieval to only send questions to QA reader and in case of declarative sentence, just return the DPR/ES results back to user to enhance user experience and only show answers when user explicitly asks it. + +![image](https://user-images.githubusercontent.com/6007894/127864452-f931ea7f-2e62-4f59-85dc-056d56eb9295.png) + + +Below, we define a `TransformersQueryClassifier` and show how to use it: + +Read more about the trained model and dataset used [here](https://huggingface.co/shahrukhx01/question-vs-statement-classifier) + + +```python +# Here we build the pipeline +transformer_question_classifier = Pipeline() +transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) +transformer_question_classifier.add_node( + component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"), + name="QueryClassifier", + inputs=["DPRRetriever"], +) +transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"]) +transformer_question_classifier.draw("question_classifier.png") + +# Run only the QA reader on the question query +res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?") +print("DPR Results" + "\n" + "=" * 15) +print_answers(res_1, details="minimum") + +# Show only DPR results +res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_2, details="minimum") +``` + +## Standalone Query Classifier +Below we run queries classifiers 
standalone to better understand their outputs on each of the three types of queries + + +```python +# Here we create the keyword vs question/statement query classifier +from haystack.nodes import TransformersQueryClassifier + +queries = [ + "arya stark father", + "jon snow country", + "who is the father of arya stark", + "which country was jon snow filmed?", +] + +keyword_classifier = TransformersQueryClassifier() + +for query in queries: + result = keyword_classifier.run(query=query) + if result[1] == "output_1": + category = "question/statement" + else: + category = "keyword" + + print(f"Query: {query}, raw_output: {result}, class: {category}") +``` + + +```python +# Here we create the question vs statement query classifier +from haystack.nodes import TransformersQueryClassifier + +queries = [ + "Lord Eddard was the father of Arya Stark.", + "Jon Snow was filmed in United Kingdom.", + "who is the father of arya stark?", + "Which country was jon snow filmed in?", +] + +question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier") + +for query in queries: + result = question_classifier.run(query=query) + if result[1] == "output_1": + category = "question" + else: + category = "statement" + + print(f"Query: {query}, raw_output: {result}, class: {category}") +``` + +## Conclusion + +The query classifier gives you more room to be creative with your pipelines and to use different retrieval nodes in a flexible fashion. Moreover, as in the case of the Question vs Statement classifier, you can also choose which queries you want to send to the reader. 
+ +Finally, you also have the possibility of bringing your own classifier and plugging it into either `TransformersQueryClassifier(model_name_or_path="")` or using the `SklearnQueryClassifier(model_name_or_path="url_to_classifier_or_file_path_as_pickle", vectorizer_name_or_path="url_to_vectorizer_or_file_path_as_pickle")` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/15.md b/docs/v1.3.0/_src/tutorials/tutorials/15.md new file mode 100644 index 0000000000..1cacbb1369 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/15.md @@ -0,0 +1,361 @@ + + +# Open-Domain QA on Tables +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial15_TableQA.ipynb) + +This tutorial shows you how to perform question-answering on tables using the `TableTextRetriever` or `ElasticsearchRetriever` as retriever node and the `TableReader` as reader node. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. 
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] + +# The TaPAs-based TableReader requires the torch-scatter library +!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html + +# Install pygraphviz for visualization of Pipelines +!apt install libgraphviz-dev +!pip install pygraphviz +``` + +### Start an Elasticsearch server +You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source. + + +```python +# Recommended: Start Elasticsearch using Docker via the Haystack utility function +from haystack.utils import launch_es + +launch_es() +``` + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + + +```python +# Connect to Elasticsearch +from haystack.document_stores import ElasticsearchDocumentStore + +# We want to use a small model producing 512-dimensional embeddings, so we need to set embedding_dim to 512 +document_index = "document" +document_store = ElasticsearchDocumentStore( + host="localhost", username="", password="", index=document_index, embedding_dim=512 +) +``` + +## Add Tables to DocumentStore +To quickly demonstrate the capabilities of the `TableTextRetriever` and the `TableReader` we use a subset of 1000 tables of the [Open Table-and-Text Question Answering (OTT-QA) dataset](https://github.com/wenhuchen/OTT-QA). + +Just as text passages, tables are represented as `Document` objects in Haystack. The content field, though, is a pandas DataFrame instead of a string. + + +```python +# Let's first fetch some tables that we want to query +# Here: 1000 tables from OTT-QA +from haystack.utils import fetch_archive_from_http + +doc_dir = "data/tutorial15" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + + +```python +# Add the tables to the DocumentStore + +import json +from haystack import Document +import pandas as pd + + +def read_ottqa_tables(filename): + processed_tables = [] + with open(filename) as tables: + tables = json.load(tables) + for key, table in tables.items(): + current_columns = table["header"] + current_rows = table["data"] + current_df = pd.DataFrame(columns=current_columns, data=current_rows) + current_doc_title = table["title"] + current_section_title = table["section_title"] + document = Document( + content=current_df, + content_type="table", + meta={"title": current_doc_title, "section_title": current_section_title}, + id=key, + ) + processed_tables.append(document) + + return processed_tables + + +tables = read_ottqa_tables(f"{doc_dir}/ottqa_tables_sample.json") 
+document_store.write_documents(tables, index=document_index) + +# Showing content field and meta field of one of the Documents of content_type 'table' +print(tables[0].content) +print(tables[0].meta) +``` + +## Initialize Retriever, Reader, & Pipeline + +### Retriever + +Retrievers help narrow down the scope for the Reader to a subset of tables where a given question could be answered. +They use some simple but fast algorithm. + +**Here:** We use the `TableTextRetriever` capable of retrieving relevant content among a database +of texts and tables using dense embeddings. It is an extension of the `DensePassageRetriever` and consists of three encoders (one query encoder, one text passage encoder and one table encoder) that create embeddings in the same vector space. More details on the `TableTextRetriever` and how it is trained can be found in [this paper](https://arxiv.org/abs/2108.04049). + +**Alternatives:** + +- `ElasticsearchRetriever` that uses the BM25 algorithm + + + +```python +from haystack.nodes.retriever import TableTextRetriever + +retriever = TableTextRetriever( + document_store=document_store, + query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder", + passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder", + table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder", + embed_meta_fields=["title", "section_title"], +) +``` + + +```python +# Add table embeddings to the tables in DocumentStore +document_store.update_embeddings(retriever=retriever) +``` + + +```python +## Alternative: ElasticsearchRetriever +# from haystack.nodes.retriever import ElasticsearchRetriever +# retriever = ElasticsearchRetriever(document_store=document_store) +``` + + +```python +# Try the Retriever +from haystack.utils import print_documents + +retrieved_tables = retriever.retrieve("How many twin buildings are under construction?", top_k=5) +# Get highest scored table +print(retrieved_tables[0].content) +``` + +### Reader +The 
`TableReader` is based on TaPas, a transformer-based language model capable of grasping the two-dimensional structure of a table. It scans the tables returned by the retriever and extracts the answer. The available TableReader models can be found [here](https://huggingface.co/models?pipeline_tag=table-question-answering&sort=downloads). + +**Notice**: The `TableReader` will return an answer for each table, even if the query cannot be answered by the table. Furthermore, the confidence scores are not useful as of now, given that they will *always* be very high (i.e. 1 or close to 1). + + +```python +from haystack.nodes import TableReader + +reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_seq_len=512) +``` + + +```python +# Try the TableReader on one Table (highest-scored retrieved table from previous section) + +table_doc = document_store.get_document_by_id("List_of_tallest_twin_buildings_and_structures_in_the_world_1") +print(table_doc.content) +``` + + +```python +from haystack.utils import print_answers + +prediction = reader.predict(query="How many twin buildings are under construction?", documents=[table_doc]) +print_answers(prediction, details="all") +``` + +The offsets in the `offsets_in_document` and `offsets_in_context` fields indicate the table cells that the model predicts to be part of the answer. They need to be interpreted on the linearized table, i.e., a flat list containing all of the table cells. + +In the `Answer`'s meta field, you can find the aggregation operator used to construct the answer (in this case `COUNT`) and the answer cells as strings. + + +```python +print(f"Predicted answer: {prediction['answers'][0].answer}") +print(f"Meta field: {prediction['answers'][0].meta}") +``` + +### Pipeline +The Retriever and the Reader can be combined into a pipeline in order to first retrieve relevant tables and then extract the answer. 
+ +**Notice**: Given that the `TableReader` does not provide useful confidence scores and returns an answer for each of the tables, the sorting of the answers might be not helpful. + + +```python +# Initialize pipeline +from haystack import Pipeline + +table_qa_pipeline = Pipeline() +table_qa_pipeline.add_node(component=retriever, name="TableTextRetriever", inputs=["Query"]) +table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["TableTextRetriever"]) +``` + + +```python +prediction = table_qa_pipeline.run("How many twin buildings are under construction?") +print_answers(prediction, details="minimum") +``` + +# Open-Domain QA on Text and Tables +With haystack, you not only have the possibility to do QA on texts or tables, solely, but you can also use both texts and tables as your source of information. + +To demonstrate this, we add 1,000 sample text passages from the OTT-QA dataset. + + +```python +# Add 1,000 text passages from OTT-QA to our document store. + + +def read_ottqa_texts(filename): + processed_passages = [] + with open(filename) as passages: + passages = json.load(passages) + for title, content in passages.items(): + title = title[6:] + title = title.replace("_", " ") + document = Document(content=content, content_type="text", meta={"title": title}) + processed_passages.append(document) + + return processed_passages + + +passages = read_ottqa_texts(f"{doc_dir}/ottqa_texts_sample.json") +document_store.write_documents(passages, index=document_index) +``` + + +```python +document_store.update_embeddings(retriever=retriever, update_existing_embeddings=False) +``` + +## Pipeline for QA on Combination of Text and Tables +We are using one node for retrieving both texts and tables, the `TableTextRetriever`. 
In order to do question-answering on the Documents coming from the `TableTextRetriever`, we need to route Documents of type `"text"` to a `FARMReader` (or alternatively `TransformersReader`) and Documents of type `"table"` to a `TableReader`. + +To achieve this, we make use of two additional nodes: +- `RouteDocuments`: Splits the List of Documents retrieved by the `TableTextRetriever` into two lists containing only Documents of type `"text"` or `"table"`, respectively. +- `JoinAnswers`: Takes Answers coming from two different Readers (in this case `FARMReader` and `TableReader`) and joins them into a single list of Answers. + + +```python +from haystack.nodes import FARMReader, RouteDocuments, JoinAnswers + +text_reader = FARMReader("deepset/roberta-base-squad2") +# In order to get meaningful scores from the TableReader, use "deepset/tapas-large-nq-hn-reader" or +# "deepset/tapas-large-nq-reader" as TableReader models. The disadvantage of these models is, however, +# that they are not capable of doing aggregations over multiple table cells. +table_reader = TableReader("deepset/tapas-large-nq-hn-reader") +route_documents = RouteDocuments() +join_answers = JoinAnswers() +``` + + +```python +text_table_qa_pipeline = Pipeline() +text_table_qa_pipeline.add_node(component=retriever, name="TableTextRetriever", inputs=["Query"]) +text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["TableTextRetriever"]) +text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"]) +text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"]) +text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"]) +``` + + +```python +# Let's have a look at the structure of the combined Table and Text QA pipeline. 
+from IPython import display + +text_table_qa_pipeline.draw() +display.Image("pipeline.png") +``` + + +```python +# Example query whose answer resides in a text passage +predictions = text_table_qa_pipeline.run(query="Who is Aleksandar Trifunovic?") +``` + + +```python +# We can see both text passages and tables as contexts of the predicted answers. +print_answers(predictions, details="minimum") +``` + + +```python +# Example query whose answer resides in a table +predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?") +``` + + +```python +# We can see both text passages and tables as contexts of the predicted answers. +print_answers(predictions, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.3.0/_src/tutorials/tutorials/16.md b/docs/v1.3.0/_src/tutorials/tutorials/16.md new file mode 100644 index 0000000000..c622091ae7 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/16.md @@ -0,0 +1,266 @@ + + +# Extending your Metadata using DocumentClassifiers at Index Time + +[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb) + +With DocumentClassifier it's possible to automatically enrich your documents with categories, sentiments, topics or whatever metadata you like. This metadata could be used for efficient filtering or further processing. Say you have some categories your users typically filter on. If the documents are tagged manually with these categories, you could automate this process by training a model. Or you can leverage the full power and flexibility of zero shot classification. All you need to do is pass your categories to the classifier, no labels required. This tutorial shows how to integrate it in your indexing pipeline. + +DocumentClassifier adds the classification result (label and score) to Document's meta property. +Hence, we can use it to classify documents at index time. \ +The result can be accessed at query time: for example by applying a filter for "classification.label". + +This tutorial will show you how to integrate a classification model into your preprocessing steps and how you can filter for this additional metadata at query time. In the last section we show how to put it all together and create an indexing pipeline. + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab, ocr] + +!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz +!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin + +# Install pygraphviz +!apt install libgraphviz-dev +!pip install pygraphviz +``` + + +```python +# Here are the imports we need +from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore +from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, ElasticsearchRetriever +from haystack.schema import Document +from haystack.utils import convert_files_to_dicts, fetch_archive_from_http, print_answers +``` + + +```python +# This fetches some sample files to work with + +doc_dir = "data/tutorial16" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + +## Read and preprocess documents + + + +```python +# note that you can also use the document classifier before applying the PreProcessor, e.g. before splitting your documents + +all_docs = convert_files_to_dicts(dir_path=doc_dir) +preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False) +docs_sliding_window = preprocessor_sliding_window.process(all_docs) +``` + +## Apply DocumentClassifier + +We can enrich the document metadata at index time using any transformers document classifier model. While traditional classification models are trained to predict one of a few "hard-coded" classes and required a dedicated training dataset, zero-shot classification is super flexible and you can easily switch the classes the model should predict on the fly. Just supply them via the labels param. 
+Here we use a zero shot model that is supposed to classify our documents into 'music', 'natural language processing' and 'history'. Feel free to change them to whatever you like to classify. \ +These classes can later on be accessed at query time. + + +```python +doc_classifier = TransformersDocumentClassifier( + model_name_or_path="cross-encoder/nli-distilroberta-base", + task="zero-shot-classification", + labels=["music", "natural language processing", "history"], + batch_size=16, +) +``` + + +```python +# we can also use any other transformers model besides zero shot classification + +# doc_classifier_model = 'bhadresh-savani/distilbert-base-uncased-emotion' +# doc_classifier = TransformersDocumentClassifier(model_name_or_path=doc_classifier_model, batch_size=16, use_gpu=-1) +``` + + +```python +# we could also specify a different field we want to run the classification on + +# doc_classifier = TransformersDocumentClassifier(model_name_or_path="cross-encoder/nli-distilroberta-base", +# task="zero-shot-classification", +# labels=["music", "natural language processing", "history"], +# batch_size=16, use_gpu=-1, +# classification_field="description") +``` + + +```python +# convert to Document using a fieldmap for custom content fields the classification should run on +docs_to_classify = [Document.from_dict(d) for d in docs_sliding_window] +``` + + +```python +# classify using gpu, batch_size makes sure we do not run out of memory +classified_docs = doc_classifier.predict(docs_to_classify) +``` + + +```python +# let's see how it looks: there should be a classification result in the meta entry containing labels and scores. +print(classified_docs[0].to_dict()) +``` + +## Indexing + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! 
chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + + +```python +# Connect to Elasticsearch +document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") +``` + + +```python +# Now, let's write the docs to our DB. +document_store.delete_all_documents() +document_store.write_documents(classified_docs) +``` + + +```python +# check if indexed docs contain classification results +test_doc = document_store.get_all_documents()[0] +print( + f'document {test_doc.id} with content \n\n{test_doc.content}\n\nhas label {test_doc.meta["classification"]["label"]}' +) +``` + +## Querying the data + +All we have to do to filter for one of our classes is to set a filter on "classification.label". + + +```python +# Initialize QA-Pipeline +from haystack.pipelines import ExtractiveQAPipeline + +retriever = ElasticsearchRetriever(document_store=document_store) +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +pipe = ExtractiveQAPipeline(reader, retriever) +``` + + +```python +## Voilà! 
Ask a question while filtering for "music"-only documents +prediction = pipe.run( + query="What is heavy metal?", + params={"Retriever": {"top_k": 10, "filters": {"classification.label": ["music"]}}, "Reader": {"top_k": 5}}, +) +``` + + +```python +print_answers(prediction, details="high") +``` + +## Wrapping it up in an indexing pipeline + + +```python +from pathlib import Path +from haystack.pipelines import Pipeline +from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter +``` + + +```python +file_type_classifier = FileTypeClassifier() +text_converter = TextConverter() +pdf_converter = PDFToTextConverter() +docx_converter = DocxToTextConverter() + +indexing_pipeline_with_classification = Pipeline() +indexing_pipeline_with_classification.add_node( + component=file_type_classifier, name="FileTypeClassifier", inputs=["File"] +) +indexing_pipeline_with_classification.add_node( + component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"] +) +indexing_pipeline_with_classification.add_node( + component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"] +) +indexing_pipeline_with_classification.add_node( + component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"] +) +indexing_pipeline_with_classification.add_node( + component=preprocessor_sliding_window, + name="Preprocessor", + inputs=["TextConverter", "PdfConverter", "DocxConverter"], +) +indexing_pipeline_with_classification.add_node( + component=doc_classifier, name="DocumentClassifier", inputs=["Preprocessor"] +) +indexing_pipeline_with_classification.add_node( + component=document_store, name="DocumentStore", inputs=["DocumentClassifier"] +) +indexing_pipeline_with_classification.draw("index_time_document_classifier.png") + +document_store.delete_documents() +txt_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".txt"] +pdf_files = [f for f in 
Path(doc_dir).iterdir() if f.suffix == ".pdf"] +docx_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".docx"] +indexing_pipeline_with_classification.run(file_paths=txt_files) +indexing_pipeline_with_classification.run(file_paths=pdf_files) +indexing_pipeline_with_classification.run(file_paths=docx_files) + +document_store.get_all_documents()[0] +``` + + +```python +# we can store this pipeline and use it from the REST-API +indexing_pipeline_with_classification.save_to_yaml("indexing_pipeline_with_classification.yaml") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.3.0/_src/tutorials/tutorials/2.md b/docs/v1.3.0/_src/tutorials/tutorials/2.md new file mode 100644 index 0000000000..bf2314f007 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/2.md @@ -0,0 +1,157 @@ + + +# Fine-tuning a Model on Your Own Data + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb) + +For many use cases it is sufficient to just use one of the existing public models that were trained on SQuAD or other public QA datasets (e.g. Natural Questions). 
+However, if you have domain-specific questions, fine-tuning your model on custom examples will very likely boost your performance. +While this varies by domain, we saw that ~ 2000 examples can easily increase performance by +5-20%. + +This tutorial shows you how to fine-tune a pretrained model on your own dataset. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + + +```python +from haystack.nodes import FARMReader +from haystack.utils import fetch_archive_from_http +``` + + +## Create Training Data + +There are two ways to generate training data + +1. **Annotation**: You can use the [annotation tool](https://haystack.deepset.ai/guides/annotation) to label your data, i.e. highlighting answers to your questions in a document. The tool supports structuring your workflow with organizations, projects, and users. The labels can be exported in SQuAD format that is compatible for training with Haystack. + +![Snapshot of the annotation tool](https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/_src/img/annotation_tool.png) + +2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's [REST API interface](https://github.com/deepset-ai/haystack#rest-api). This includes a customizable user feedback API for providing feedback on the answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data for fine-tuning your model further. 
+ + +## Fine-tune your model + +Once you have collected training data, you can fine-tune your base models. +We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format). +We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer Learning effects. + +**Recommendation**: Run training on a GPU. +If you are using Colab: Enable this in the menu "Runtime" > "Change Runtime type" > Select "GPU" in dropdown. +Then change the `use_gpu` arguments below to `True` + + +```python +reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True) +data_dir = "data/squad20" +# data_dir = "PATH/TO_YOUR/TRAIN_DATA" +reader.train(data_dir=data_dir, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model") +``` + + +```python +# Saving the model happens automatically at the end of training into the `save_dir` you specified +# However, you could also save a reader manually again via: +reader.save(directory="my_model") +``` + + +```python +# If you want to load it at a later point, just do: +new_reader = FARMReader(model_name_or_path="my_model") +``` + +## Distill your model +In this case, we have used "distilbert-base-uncased" as our base model. This model was trained using a process called distillation. In this process, a bigger model is trained first and is used to train a smaller model which increases its accuracy. This is why "distilbert-base-uncased" can achieve quite competitive performance while being very small. + +Sometimes, however, you can't use an already distilled model and have to distil it yourself. For this case, haystack has implemented [distillation features](https://haystack.deepset.ai/guides/model-distillation). + +### Augmenting your training data +To get the most out of model distillation, we recommend increasing the size of your training data by using data augmentation. 
You can do this by running the [`augment_squad.py` script](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/augment_squad.py): + + +```python +# Downloading script +!wget https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/utils/augment_squad.py + +doc_dir = "data/tutorial2" + +# Downloading smaller glove vector file (only for demonstration purposes) +glove_url = "https://nlp.stanford.edu/data/glove.6B.zip" +fetch_archive_from_http(url=glove_url, output_dir=doc_dir) + +# Downloading very small dataset to make tutorial faster (please use a bigger dataset for real use cases) +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/squad_small.json.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Just replace the path with your dataset and adjust the output (also please remove glove path to use bigger glove vector file) +!python augment_squad.py --squad_path squad_small.json --output_path augmented_dataset.json --multiplication_factor 2 --glove_path glove.6B.300d.txt +``` + +In this case, we use a multiplication factor of 2 to keep this example lightweight. Usually you would use a factor like 20 depending on the size of your training data. Augmenting this small dataset with a multiplication factor of 2 should take about 5 to 10 minutes to run on one V100 GPU. + +### Running distillation +Distillation in haystack is done in two steps: First, you run intermediate layer distillation on the augmented dataset to ensure the two models behave similarly. After that, you run the prediction layer distillation on the non-augmented dataset to optimize the model for your specific task. + +If you want, you can leave out the intermediate layer distillation step and only run the prediction layer distillation. This way you also do not need to perform data augmentation. However, this will make the model significantly less accurate. + + +```python +# Loading a fine-tuned model as teacher e.g.
"deepset/bert-base-uncased-squad2" +teacher = FARMReader(model_name_or_path="my_model", use_gpu=True) + +# You can use any pre-trained language model as student that uses the same tokenizer as the teacher model. +# The number of the layers in the teacher model also needs to be a multiple of the number of the layers in the student. +student = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D", use_gpu=True) + +student.distil_intermediate_layers_from(teacher, data_dir=".", train_filename="augmented_dataset.json", use_gpu=True) +student.distil_prediction_layer_from(teacher, data_dir="data/squad20", train_filename="dev-v2.0.json", use_gpu=True) + +student.save(directory="my_distilled_model") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/3.md b/docs/v1.3.0/_src/tutorials/tutorials/3.md new file mode 100644 index 0000000000..f785e9c776 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/3.md @@ -0,0 +1,230 @@ + + +# Build a QA System Without Elasticsearch + +[![Open In
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb) + +Haystack provides alternatives to Elasticsearch for developing quick prototypes. + +You can use an `InMemoryDocumentStore` or a `SQLDocumentStore` (with SQLite) as the document store. + +If you are interested in more feature-rich Elasticsearch, then please refer to Tutorial 1. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + + +```python +from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers +from haystack.nodes import FARMReader, TransformersReader +``` + +## Document Store + + + +```python +# In-Memory Document Store +from haystack.document_stores import InMemoryDocumentStore + +document_store = InMemoryDocumentStore() +``` + + +```python +# SQLite Document Store +# from haystack.document_stores import SQLDocumentStore +# document_store = SQLDocumentStore(url="sqlite:///qa.db") +``` + +## Preprocessing of documents + +Haystack provides a customizable pipeline for: + - converting files into texts + - cleaning texts + - splitting texts + - writing them to a Document Store + +In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index them in the document store.
+ + +```python +# Let's first get some documents that we want to query +# Here: 517 Wikipedia articles for Game of Thrones +doc_dir = "data/tutorial3" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# convert files to dicts containing documents that can be indexed to our datastore +# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers) +# It must take a str as input, and return a str. +dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# We now have a list of dictionaries that we can write to our document store. +# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_dicts() and create the dictionaries yourself. +# The default format here is: {"name": "<some-document-name>", "text": "<the-actual-text>"} + +# Let's have a look at the first 3 entries: +print(dicts[:3]) +# Now, let's write the docs to our DB. +document_store.write_documents(dicts) +``` + +## Initialize Retriever, Reader & Pipeline + +### Retriever + +Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered. + +With InMemoryDocumentStore or SQLDocumentStore, you can use the TfidfRetriever. For more retrievers, please refer to Tutorial 1. + + +```python +# An in-memory TfidfRetriever based on Pandas dataframes + +from haystack.nodes import TfidfRetriever + +retriever = TfidfRetriever(document_store=document_store) +``` + +### Reader + +A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based +on powerful, but slower deep learning models. + +Haystack currently supports Readers based on the frameworks FARM and Transformers. +With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).
+ +**Here:** a medium sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2) + +**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package) + +**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy) + +**Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible" + +#### FARMReader + + +```python +# Load a local model or any of the QA models on +# Hugging Face's model hub (https://huggingface.co/models) + +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +``` + +#### TransformersReader + + +```python +# Alternative: +# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) +``` + +### Pipeline + +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). + + +```python +from haystack.pipelines import ExtractiveQAPipeline + +pipe = ExtractiveQAPipeline(reader, retriever) +``` + +## Voilà! Ask a question! + + +```python +# You can configure how many candidates the reader and retriever shall return +# The higher top_k for retriever, the better (but also the slower) your answers. 
+prediction = pipe.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +``` + + +```python +# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}}) +# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}}) +``` + + +```python +# Now you can either print the object directly... +from pprint import pprint + +pprint(prediction) + +# Sample output: +# { +# 'answers': [ , +# , +# ... +# ] +# 'documents': [ , +# , +# ... +# ], +# 'no_ans_gap': 11.688868522644043, +# 'node_id': 'Reader', +# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}}, +# 'query': 'Who is the father of Arya Stark?', +# 'root_node': 'Query' +# } +``` + + +```python +# ...or use a util to simplify the output +# Change `minimum` to `medium` or `all` to raise the level of detail +print_answers(prediction, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/4.md b/docs/v1.3.0/_src/tutorials/tutorials/4.md new file mode 100644 index 0000000000..3fa80497fd --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/4.md @@ -0,0 +1,183 @@ + + +# Utilizing existing FAQs for Question Answering + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb) + +While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data. + +**Pros**: + +- Very fast at inference time +- Utilize existing FAQ data +- Quite good control over answers + +**Cons**: + +- Generalizability: We can only answer questions that are similar to existing ones in FAQ + +In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + + +```python +from haystack.document_stores import ElasticsearchDocumentStore + +from haystack.nodes import EmbeddingRetriever +import pandas as pd +import requests +``` + +### Start an Elasticsearch server +You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source. + + +```python +# Recommended: Start Elasticsearch using Docker via the Haystack utility function +from haystack.utils import launch_es, fetch_archive_from_http + +launch_es() +``` + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + +### Init the DocumentStore +In contrast to Tutorial 1 (extractive QA), we: + +* specify the name of our `text_field` in Elasticsearch that we want to return as an answer +* specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question +* set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results + + +```python +from haystack.document_stores import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore( + host="localhost", + username="", + password="", + index="document", + embedding_field="question_emb", + embedding_dim=384, + excluded_meta_data=["question_emb"], +) +``` + +### Create a Retriever using embeddings +Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones). +We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings. + + +```python +retriever = EmbeddingRetriever( + document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2", use_gpu=True +) +``` + +### Prepare & Index FAQ data +We create a pandas dataframe containing some FAQ data (i.e curated pairs of question + answer) and index those in elasticsearch. 
+Here: We download some question-answer pairs related to COVID-19 + + +```python +# Download +doc_dir = "data/tutorial4" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Get dataframe with columns "question", "answer" and some custom metadata +df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv") +# Minimal cleaning +df.fillna(value="", inplace=True) +df["question"] = df["question"].apply(lambda x: x.strip()) +print(df.head()) + +# Get embeddings for our questions from the FAQs +questions = list(df["question"].values) +df["question_emb"] = retriever.embed_queries(texts=questions) +df = df.rename(columns={"question": "content"}) + +# Convert Dataframe to list of dicts and index them in our DocumentStore +docs_to_index = df.to_dict(orient="records") +document_store.write_documents(docs_to_index) +``` + +### Ask questions +Initialize a Pipeline (this time without a reader) and ask questions + + +```python +from haystack.pipelines import FAQPipeline + +pipe = FAQPipeline(retriever=retriever) +``` + + +```python +from haystack.utils import print_answers + +prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}}) +print_answers(prediction, details="medium") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/5.md b/docs/v1.3.0/_src/tutorials/tutorials/5.md new file mode 100644 index 0000000000..9c038dcf65 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/5.md @@ -0,0 +1,375 @@ + + +# Evaluation of a Pipeline and its Components + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial5_Evaluation.ipynb) + +To be able to make a statement about the quality of results a question-answering pipeline or any other pipeline in haystack produces, it is important to evaluate it. Furthermore, evaluation allows determining which components of the pipeline can be improved. +The results of the evaluation can be saved as CSV files, which contain all the information to calculate additional metrics later on or inspect individual predictions. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + +## Start an Elasticsearch server +You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source. + + +```python +# If Docker is available: Start Elasticsearch as docker container +# from haystack.utils import launch_es +# launch_es() + +# Alternative in Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + +## Fetch, Store And Preprocess the Evaluation Dataset + + +```python +from haystack.utils import fetch_archive_from_http + +# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers +doc_dir = "data/tutorial5" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + + +```python +# make sure these indices do not collide with existing ones, the indices will be wiped clean before data is inserted +doc_index = "tutorial5_docs" +label_index = "tutorial5_labels" +``` + + +```python +# Connect to Elasticsearch +from haystack.document_stores import ElasticsearchDocumentStore + +# Connect to Elasticsearch +document_store = ElasticsearchDocumentStore( + host="localhost", + username="", + password="", + index=doc_index, + label_index=label_index, + embedding_field="emb", + embedding_dim=768, + excluded_meta_data=["emb"], +) +``` + + +```python +from haystack.nodes import PreProcessor + +# Add evaluation data to Elasticsearch Document Store +# We first delete the custom tutorial indices to not have duplicate elements +# and also split our documents into shorter passages using the PreProcessor +preprocessor = PreProcessor( + split_length=200, + split_overlap=0, + split_respect_sentence_boundary=False, + clean_empty_lines=False, + clean_whitespace=False, +) +document_store.delete_documents(index=doc_index) +document_store.delete_documents(index=label_index) + +# The add_eval_data() method converts the given dataset in json format into Haystack document and label objects. Those objects are then indexed in their respective document and label index in the document store. The method can be used with any dataset in SQuAD format. 
+document_store.add_eval_data( + filename="../data/nq/nq_dev_subset_v2.json", doc_index=doc_index, label_index=label_index, preprocessor=preprocessor +) +``` + +## Initialize the Two Components of an ExtractiveQAPipeline: Retriever and Reader + + +```python +# Initialize Retriever +from haystack.nodes import ElasticsearchRetriever + +retriever = ElasticsearchRetriever(document_store=document_store) + +# Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever) +# The EmbeddingRetriever uses a single transformer based encoder model for query and document. +# In contrast, DensePassageRetriever uses two separate encoders for both. + +# Please make sure the "embedding_dim" parameter in the DocumentStore above matches the output dimension of your models! +# Please also take care that the PreProcessor splits your files into chunks that can be completely converted with +# the max_seq_len limitations of Transformers +# The SentenceTransformer model "sentence-transformers/multi-qa-mpnet-base-dot-v1" generally works well with the EmbeddingRetriever on any kind of English text. 
+# For more information and suggestions on different models check out the documentation at: https://www.sbert.net/docs/pretrained_models.html + +# from haystack.nodes import EmbeddingRetriever, DensePassageRetriever +# retriever = EmbeddingRetriever(document_store=document_store, model_format="sentence_transformers", +# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1") +# retriever = DensePassageRetriever(document_store=document_store, +# query_embedding_model="facebook/dpr-question_encoder-single-nq-base", +# passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", +# use_gpu=True, +# max_seq_len_passage=256, +# embed_title=True) +# document_store.update_embeddings(retriever, index=doc_index) +``` + + +```python +# Initialize Reader +from haystack.nodes import FARMReader + +reader = FARMReader("deepset/roberta-base-squad2", top_k=4, return_no_answer=True) + +# Define a pipeline consisting of the initialized retriever and reader +from haystack.pipelines import ExtractiveQAPipeline + +pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever) + +# The evaluation also works with any other pipeline. +# For example you could use a DocumentSearchPipeline as an alternative: + +# from haystack.pipelines import DocumentSearchPipeline +# pipeline = DocumentSearchPipeline(retriever=retriever) +``` + +## Evaluation of an ExtractiveQAPipeline +Here we evaluate retriever and reader in open domain fashion on the full corpus of documents, i.e., a document is considered +correctly retrieved if it contains the gold answer string within it. The reader is evaluated based purely on the +predicted answer string, regardless of which document this came from and the position of the extracted span. + +The generation of predictions is separated from the calculation of metrics. This allows you to run the computation-heavy model predictions only once and then iterate flexibly on the metrics or reports you want to generate.
+ + + +```python +from haystack.schema import EvaluationResult, MultiLabel + +# We can load evaluation labels from the document store +# We are also opting to filter out no_answer samples +eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=False) +eval_labels = [label for label in eval_labels if not label.no_answer] # filter out no_answer cases + +## Alternative: Define queries and labels directly + +# eval_labels = [ +# MultiLabel( +# labels=[ +# Label( +# query="who is written in the book of life", +# answer=Answer( +# answer="every person who is destined for Heaven or the World to Come", +# offsets_in_context=[Span(374, 434)] +# ), +# document=Document( +# id='1b090aec7dbd1af6739c4c80f8995877-0', +# content_type="text", +# content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is +# about the book mentioned in Christian and Jewish religious teachings...' +# ), +# is_correct_answer=True, +# is_correct_document=True, +# origin="gold-label" +# ) +# ] +# ) +# ] + +# Similar to pipeline.run() we can execute pipeline.eval() +eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}}) +``` + + +```python +# The EvaluationResult contains a pandas dataframe for each pipeline node. +# That's why there are two dataframes in the EvaluationResult of an ExtractiveQAPipeline. 
+ +retriever_result = eval_result["Retriever"] +retriever_result.head() +``` + + +```python +reader_result = eval_result["Reader"] +reader_result.head() +``` + + +```python +# We can filter for all documents retrieved for a given query +query = "who is written in the book of life" +retriever_book_of_life = retriever_result[retriever_result["query"] == query] +``` + + +```python +# We can also filter for all answers predicted for a given query +reader_book_of_life = reader_result[reader_result["query"] == query] +``` + + +```python +# Save the evaluation result so that we can reload it later and calculate evaluation metrics without running the pipeline again. +eval_result.save("../") +``` + +## Calculating Evaluation Metrics +Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions, +such as F1-score of each individual prediction of the Reader node or recall of the retriever. +To learn more about the metrics, see [Evaluation Metrics](https://haystack.deepset.ai/guides/evaluation#metrics-retrieval) + + +```python +saved_eval_result = EvaluationResult.load("../") +metrics = saved_eval_result.calculate_metrics() +print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}') +print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}') +print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}') +print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}') +print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}') + +print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}') +print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}') +``` + +## Generating an Evaluation Report +A summary of the evaluation results can be printed to get a quick overview. It includes some aggregated metrics and also shows a few wrongly predicted examples. 
+ + +```python +pipeline.print_eval_report(saved_eval_result) +``` + +## Advanced Evaluation Metrics +As an advanced evaluation metric, semantic answer similarity (SAS) can be calculated. This metric takes into account whether the meaning of a predicted answer is similar to the annotated gold answer rather than just doing string comparison. +To this end SAS relies on pre-trained models. For English, we recommend "cross-encoder/stsb-roberta-large", whereas for German we recommend "deepset/gbert-large-sts". A good multilingual model is "sentence-transformers/paraphrase-multilingual-mpnet-base-v2". +More info on this metric can be found in our [paper](https://arxiv.org/abs/2108.06130) or in our [blog post](https://www.deepset.ai/blog/semantic-answer-similarity-to-evaluate-qa). + + +```python +advanced_eval_result = pipeline.eval( + labels=eval_labels, params={"Retriever": {"top_k": 1}}, sas_model_name_or_path="cross-encoder/stsb-roberta-large" +) + +metrics = advanced_eval_result.calculate_metrics() +print(metrics["Reader"]["sas"]) +``` + +## Isolated Evaluation Mode +The isolated node evaluation uses labels as input to the Reader node instead of the output of the preceding Retriever node. +Thereby, we can additionally calculate the upper bounds of the evaluation metrics of the Reader. Note that even with isolated evaluation enabled, integrated evaluation will still be running. + + + +```python +eval_result_with_upper_bounds = pipeline.eval( + labels=eval_labels, params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 5}}, add_isolated_node_eval=True +) +``` + + +```python +pipeline.print_eval_report(eval_result_with_upper_bounds) +``` + +## Evaluation of Individual Components: Retriever +Sometimes you might want to evaluate individual components, for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself. +Here we evaluate only the retriever, based on whether the gold_label document is retrieved.
+ + +```python +## Evaluate Retriever on its own +# Note that no_answer samples are omitted when evaluation is performed with this method +retriever_eval_results = retriever.eval(top_k=5, label_index=label_index, doc_index=doc_index) +# Retriever Recall is the proportion of questions for which the correct document containing the answer is +# among the retrieved documents +print("Retriever Recall:", retriever_eval_results["recall"]) +# Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank +print("Retriever Mean Avg Precision:", retriever_eval_results["map"]) +``` + +Just as a sanity check, we can compare the recall from `retriever.eval()` with the multi hit recall from `pipeline.eval(add_isolated_node_eval=True)`. +These two recall metrics are only comparable since we chose to filter out no_answer samples when generating eval_labels. + + + +```python +metrics = eval_result_with_upper_bounds.calculate_metrics() +print(metrics["Retriever"]["recall_multi_hit"]) +``` + +## Evaluation of Individual Components: Reader +Here we evaluate only the reader in a closed domain fashion i.e. the reader is given one query +and its corresponding relevant document and metrics are calculated on whether the right position in this text is selected by +the model as the answer span (i.e.
SQuAD style) + + +```python +# Evaluate Reader on its own +reader_eval_results = reader.eval(document_store=document_store, label_index=label_index, doc_index=doc_index) +# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch +# reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device) + +# Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer +print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"]) +# Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer +print("Reader Exact Match:", reader_eval_results["EM"]) +# Reader F1-Score is the average overlap between the predicted answers and the correct answers +print("Reader F1-Score:", reader_eval_results["f1"]) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/6.md b/docs/v1.3.0/_src/tutorials/tutorials/6.md new file mode 100644 index 0000000000..7aaeea2797 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/6.md @@ -0,0 +1,248 @@ + + +# Better Retrieval via "Dense Passage Retrieval" + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb) + +### Importance of Retrievers + +The Retriever has a huge impact on the performance of our overall search pipeline. + + +### Different types of Retrievers +#### Sparse +Family of algorithms based on counting the occurrences of words (bag-of-words) resulting in very sparse vectors with length = vocab size. + +**Examples**: BM25, TF-IDF + +**Pros**: Simple, fast, well explainable + +**Cons**: Relies on exact keyword matches between query and text + + +#### Dense +These retrievers use neural network models to create "dense" embedding vectors. Within this family there are two different approaches: + +a) Single encoder: Use a **single model** to embed both query and passage. +b) Dual-encoder: Use **two models**, one to embed the query and one to embed the passage + +Recent work suggests that dual encoders work better, likely because they can deal better with the different nature of query and passage (length, style, syntax ...). 
+ +**Examples**: REALM, DPR, Sentence-Transformers + +**Pros**: Captures semantic similarity instead of "word matches" (e.g. synonyms, related topics ...) + +**Cons**: Computationally more heavy, initial training of model + + +### "Dense Passage Retrieval" + +In this Tutorial, we want to highlight one "Dense Dual-Encoder" called Dense Passage Retriever. +It was introduced by Karpukhin et al. (2020, https://arxiv.org/abs/2004.04906). + +Original Abstract: + +_"Open-domain question answering relies on efficient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented using dense representations alone, where embeddings are learned from a small number of questions and passages by a simple dual-encoder framework. When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene-BM25 system largely by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks."_ + +Paper: https://arxiv.org/abs/2004.04906 +Original Code: https://fburl.com/qa-dpr + + +*Use this* [link](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb) *to open the notebook in Google Colab.* + + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#!
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss] +``` + + +```python +from haystack.utils import clean_wiki_text, convert_files_to_dicts, fetch_archive_from_http, print_answers +from haystack.nodes import FARMReader, TransformersReader +``` + +### Document Store + +#### Option 1: FAISS + +FAISS is a library for efficient similarity search on a cluster of dense vectors. +The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under-the-hood +to store the document text and other meta data. The vector embeddings of the text are +indexed on a FAISS Index that later is queried for searching answers. +The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for +faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor. +For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + + +```python +from haystack.document_stores import FAISSDocumentStore + +document_store = FAISSDocumentStore(faiss_index_factory_str="Flat") +``` + +#### Option 2: Milvus + +Milvus is an open source database library that is also optimized for vector similarity searches like FAISS. +Like FAISS it has both a "Flat" and "HNSW" mode but it outperforms FAISS when it comes to dynamic data management. +It does require a little more setup, however, as it is run through Docker and requires the setup of some config files. +See [their docs](https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md) for more details. + + +```python +# Milvus cannot be run on Colab, so this cell is commented out. +# To run Milvus you need Docker (versions below 2.0.0) or a docker-compose (versions >= 2.0.0), neither of which is available on Colab.
+# See Milvus' documentation for more details: https://milvus.io/docs/install_standalone-docker.md + +# !pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[milvus] + +# from haystack.utils import launch_milvus +# from haystack.document_stores import MilvusDocumentStore + +# launch_milvus() +# document_store = MilvusDocumentStore() +``` + +### Cleaning & indexing documents + +Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore + + +```python +# Let's first get some files that we want to use +doc_dir = "data/tutorial6" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Convert files to dicts +dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# Now, let's write the dicts containing documents to our DB. +document_store.write_documents(dicts) +``` + +### Initialize Retriever, Reader & Pipeline + +#### Retriever + +**Here:** We use a `DensePassageRetriever` + +**Alternatives:** + +- The `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters +- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g.
created via Sentence-BERT) +- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging + + +```python +from haystack.nodes import DensePassageRetriever + +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", + max_seq_len_query=64, + max_seq_len_passage=256, + batch_size=16, + use_gpu=True, + embed_title=True, + use_fast_tokenizers=True, +) +# Important: +# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all +# previously indexed documents and update their embedding representation. +# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. +# At query time, we only need to embed the query and compare it to the existing doc embeddings which is very fast. +document_store.update_embeddings(retriever) +``` + +#### Reader + +Similar to previous Tutorials we now initialize our reader. + +Here we use a FARMReader with the *deepset/roberta-base-squad2* model (see: https://huggingface.co/deepset/roberta-base-squad2) + + + +##### FARMReader + + +```python +# Load a local model or any of the QA models on +# Hugging Face's model hub (https://huggingface.co/models) + +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +``` + +### Pipeline + +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
+ + +```python +from haystack.pipelines import ExtractiveQAPipeline + +pipe = ExtractiveQAPipeline(reader, retriever) +``` + +## Voilà! Ask a question! + + +```python +# You can configure how many candidates the reader and retriever shall return +# The higher top_k for retriever, the better (but also the slower) your answers. +prediction = pipe.run( + query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +``` + + +```python +print_answers(prediction, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/7.md b/docs/v1.3.0/_src/tutorials/tutorials/7.md new file mode 100644 index 0000000000..dfa9a91c35 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/7.md @@ -0,0 +1,189 @@ + + +# Generative QA with "Retrieval-Augmented Generation" + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb) + +While extractive QA highlights the span of text that answers a query, +generative QA can return a novel text answer that it has composed. 
+In this tutorial, you will learn how to set up a generative system using the +[RAG model](https://arxiv.org/abs/2005.11401) which conditions the +answer generator on a set of retrieved documents. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + +Here are the packages and imports that we'll need: + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss] +``` + + +```python +from typing import List +import requests +import pandas as pd +from haystack import Document +from haystack.document_stores import FAISSDocumentStore +from haystack.nodes import RAGenerator, DensePassageRetriever +from haystack.utils import fetch_archive_from_http +``` + +Let's download a csv containing some sample text and preprocess the data. + + + +```python +# Download sample +doc_dir = "data/tutorial7/" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Create dataframe with columns "title" and "text" +df = pd.read_csv("small_generator_dataset.csv", sep=",") +# Minimal cleaning +df.fillna(value="", inplace=True) + +print(df.head()) +``` + +We can cast our data into Haystack Document objects. 
+Alternatively, we can also just use dictionaries with "text" and "meta" fields + + +```python +# Use data to initialize Document objects +titles = list(df["title"].values) +texts = list(df["text"].values) +documents: List[Document] = [] +for title, text in zip(titles, texts): + documents.append(Document(content=text, meta={"name": title or ""})) +``` + +Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator. +FAISS is chosen here since it is optimized vector storage. + + +```python +# Initialize FAISS document store. +# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding +document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True) + +# Initialize DPR Retriever to encode documents, encode question and query documents +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", + use_gpu=True, + embed_title=True, +) + +# Initialize RAG Generator +generator = RAGenerator( + model_name_or_path="facebook/rag-token-nq", + use_gpu=True, + top_k=1, + max_length=200, + min_length=2, + embed_title=True, + num_beams=2, +) +``` + +We write documents to the DocumentStore, first by deleting any remaining documents then calling `write_documents()`. +The `update_embeddings()` method uses the retriever to create an embedding for each document. 
+ + + +```python +# Delete existing documents in documents store +document_store.delete_documents() + +# Write documents to document store +document_store.write_documents(documents) + +# Add documents embeddings to index +document_store.update_embeddings(retriever=retriever) +``` + +Here are our questions: + + +```python +QUESTIONS = [ + "who got the first nobel prize in physics", + "when is the next deadpool movie being released", + "which mode is used for short wave broadcast service", + "who is the owner of reading football club", + "when is the next scandal episode coming out", + "when is the last time the philadelphia won the superbowl", + "what is the most current adobe flash player version", + "how many episodes are there in dragon ball z", + "what is the first step in the evolution of the eye", + "where is gall bladder situated in human body", + "what is the main mineral in lithium batteries", + "who is the president of usa right now", + "where do the greasers live in the outsiders", + "panda is a national animal of which country", + "what is the name of manchester united stadium", +] +``` + +Now let's run our system! +The retriever will pick out a small subset of documents that it finds relevant. +These are used to condition the generator as it generates the answer. +What it should return then are novel text spans that form an answer to your question! + + +```python +# Or alternatively use the Pipeline class +from haystack.pipelines import GenerativeQAPipeline +from haystack.utils import print_answers + +pipe = GenerativeQAPipeline(generator=generator, retriever=retriever) +for question in QUESTIONS: + res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}}) + print_answers(res, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/8.md b/docs/v1.3.0/_src/tutorials/tutorials/8.md new file mode 100644 index 0000000000..ffbd75de36 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/8.md @@ -0,0 +1,207 @@ + + +# Preprocessing + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial8_Preprocessing.ipynb) + +Haystack includes a suite of tools to extract text from different file types, normalize white space +and split text into smaller pieces to optimize retrieval. +These data preprocessing steps can have a big impact on the system's performance and effective handling of data is key to getting the most out of Haystack. + +Ultimately, Haystack expects data to be provided as a list of documents in the following dictionary format: +``` python +docs = [ + { + 'content': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +``` + +This tutorial will show you all the tools that Haystack provides to help you cast your data into this format. + + +```python +# Install the latest release of Haystack in your own environment +#!
pip install farm-haystack + +# Install the latest master of Haystack +from pathlib import Path + +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr] + +!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz +!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin +``` + + +```python +# Here are the imports we need +from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor +from haystack.utils import convert_files_to_dicts, fetch_archive_from_http +``` + + +```python +# This fetches some sample files to work with + +doc_dir = "data/tutorial8" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial8.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + +## Converters + +Haystack's converter classes are designed to help you turn files on your computer into the documents +that can be processed by the Haystack pipeline. +There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika. +The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected. +For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.
+ + +```python +# Here are some examples of how you would use file converters + +converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"]) +doc_txt = converter.convert(file_path=Path(f"{doc_dir}/classics.txt"), meta=None)[0] + +converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) +doc_pdf = converter.convert(file_path=Path(f"{doc_dir}/bert.pdf"), meta=None)[0] + +converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"]) +doc_docx = converter.convert(file_path=Path(f"{doc_dir}/heavy_metal.docx"), meta=None)[0] +``` + + +```python +# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory. + +all_docs = convert_files_to_dicts(dir_path=doc_dir) +``` + +## PreProcessor + +The PreProcessor class is designed to help you clean text and split text into sensible units. +File splitting can have a very significant impact on the system's performance and is absolutely mandatory for Dense Passage Retrieval models. +In general, we recommend you split the text from your files into small documents of around 100 words for dense retrieval methods +and no more than 10,000 words for sparse methods. +Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd) +and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details. + + +```python +# This is a default usage of the PreProcessor. +# Here, it performs cleaning of consecutive whitespaces +# and splits a single large document into smaller documents. 
+# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences +# Note how the single document passed into the PreProcessor gets split into 5 smaller documents + +preprocessor = PreProcessor( + clean_empty_lines=True, + clean_whitespace=True, + clean_header_footer=False, + split_by="word", + split_length=100, + split_respect_sentence_boundary=True, +) +docs_default = preprocessor.process(doc_txt) +print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}") +``` + +## Cleaning + +- `clean_empty_lines` will normalize 3 or more consecutive empty lines to be just two empty lines +- `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text +- `clean_header_footer` will remove any long header or footer texts that are repeated on each page + +## Splitting +By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end +midway through a sentence. +This will help reduce the possibility of answer phrases being split between two documents. +This feature can be turned off by setting `split_respect_sentence_boundary=False`. + + +```python +# Not respecting sentence boundary vs respecting sentence boundary + +preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False) +docs_nrsb = preprocessor_nrsb.process(doc_txt) + +print("RESPECTING SENTENCE BOUNDARY") +end_text = docs_default[0]["content"][-50:] +print('End of document: "...' + end_text + '"') +print() +print("NOT RESPECTING SENTENCE BOUNDARY") +end_text_nrsb = docs_nrsb[0]["content"][-50:] +print('End of document: "...' + end_text_nrsb + '"') +``` + +A commonly used strategy to split long documents, especially in the field of Question Answering, +is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this: + +- doc1 = words[0:10] +- doc2 = words[7:17] +- doc3 = words[14:24] +- ... + +You can use this strategy by following the code below.
+ + +```python +# Sliding window approach + +preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False) +docs_sliding_window = preprocessor_sliding_window.process(doc_txt) + +doc1 = docs_sliding_window[0]["content"][:200] +doc2 = docs_sliding_window[1]["content"][:100] +doc3 = docs_sliding_window[2]["content"][:100] + +print('Document 1: "' + doc1 + '..."') +print('Document 2: "' + doc2 + '..."') +print('Document 3: "' + doc3 + '..."') +``` + +## Bringing it all together + + +```python +all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial") +preprocessor = PreProcessor( + clean_empty_lines=True, + clean_whitespace=True, + clean_header_footer=False, + split_by="word", + split_length=100, + split_respect_sentence_boundary=True, +) +docs = preprocessor.process(all_docs) + +print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.3.0/_src/tutorials/tutorials/9.md b/docs/v1.3.0/_src/tutorials/tutorials/9.md new file mode 100644 index 0000000000..e0b1b34dfd --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/9.md @@ -0,0 +1,250 @@ + + +# Training Your Own "Dense Passage Retrieval" Model + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial9_DPR_training.ipynb) + +Haystack contains all the tools needed to train your own Dense Passage Retrieval model. +This tutorial will guide you through the steps required to create a retriever that is specifically tailored to your domain. + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + + +```python +# Here are some imports that we'll need + +from haystack.nodes import DensePassageRetriever +from haystack.utils import fetch_archive_from_http +from haystack.document_stores import InMemoryDocumentStore +``` + +## Training Data + +DPR training is performed using Information Retrieval data. +More specifically, you want to feed in pairs of queries and relevant documents. + +To train a model, we will need a dataset that has the same format as the original DPR training data.
+Each data point in the dataset should have the following dictionary structure. + +``` python + { + "dataset": str, + "question": str, + "answers": list of str + "positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} + "negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} + "hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} + } +``` + +`positive_ctxs` are context passages which are relevant to the query. +In some datasets, queries might have more than one positive context +in which case you can set the `num_positives` parameter to be higher than the default 1. +Note that `num_positives` needs to be lower or equal to the minimum number of `positive_ctxs` for queries in your data. +If you have an unequal number of positive contexts per example, +you might want to generate some soft labels by retrieving similar contexts which contain the answer. + +DPR is standardly trained using a method known as in-batch negatives. +This means that positive contexts for a given query are treated as negative contexts for the other queries in the batch. +Doing so allows for a high degree of computational efficiency, thus allowing the model to be trained on large amounts of data. + +`negative_ctxs` is not actually used in Haystack's DPR training so we recommend you set it to an empty list. +They were used by the original DPR authors in an experiment to compare it against the in-batch negatives method. + +`hard_negative_ctxs` are passages that are not relevant to the query. +In the original DPR paper, these are fetched using a retriever to find the most relevant passages to the query. +Passages which contain the answer text are filtered out. 
+ +If you'd like to convert your SQuAD format data into something that can train a DPR model, +check out the utility script at [`haystack/retriever/squad_to_dpr.py`](https://github.com/deepset-ai/haystack/blob/master/haystack/retriever/squad_to_dpr.py) + +## Using Question Answering Data + +Question Answering datasets can sometimes be used as training data. +Google's Natural Questions dataset is sufficiently large +and contains enough unique passages that it can be converted into a DPR training set. +This is done simply by considering answer-containing passages as relevant documents to the query. + +The SQuAD dataset, however, is not as suited to this use case since its question and answer pairs +are created on only a very small slice of Wikipedia documents. + +## Download Original DPR Training Data + +WARNING: These files are large! The train set is 7.4GB and the dev set is 800MB + +We can download the original DPR training data with the following cell. +Note that this data is probably only useful if you are trying to train from scratch. + + +```python +# Download original DPR data +# WARNING: the train set is 7.4GB and the dev set is 800MB + +doc_dir = "data/dpr_training/" + +s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz" +s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz" + +fetch_archive_from_http(s3_url_train, output_dir=doc_dir + "train/") +fetch_archive_from_http(s3_url_dev, output_dir=doc_dir + "dev/") +``` + +## Option 1: Training DPR from Scratch + +The default variables that we provide below are chosen to train a DPR model from scratch. +Here, both passage and query embedding models are initialized using BERT base +and the model is trained using Google's Natural Questions dataset (in a format specialised for DPR).
+ +If you are working in a language other than English, +you will want to initialize the passage and query embedding models with a language model that supports your language +and also provide a dataset in your language. + + +```python +# Here are the variables to specify our training data, the models that we use to initialize DPR +# and the directory where we'll be saving the model + +doc_dir = "data/dpr_training/" + +train_filename = "train/biencoder-nq-train.json" +dev_filename = "dev/biencoder-nq-dev.json" + +query_model = "bert-base-uncased" +passage_model = "bert-base-uncased" + +save_dir = "../saved_models/dpr" +``` + +## Option 2: Finetuning DPR + +If you have your own domain specific question answering or information retrieval dataset, +you might instead be interested in finetuning a pretrained DPR model. +In this case, you would initialize both query and passage models using the original pretrained model. +You will want to load something like this set of variables instead of the ones above + + +```python +# Here are the variables you might want to use instead of the set above +# in order to perform finetuning + +doc_dir = "PATH_TO_YOUR_DATA_DIR" +train_filename = "TRAIN_FILENAME" +dev_filename = "DEV_FILENAME" + +query_model = "facebook/dpr-question_encoder-single-nq-base" +passage_model = "facebook/dpr-ctx_encoder-single-nq-base" + +save_dir = "../saved_models/dpr" +``` + +## Initialization + +Here we want to initialize our model either with plain language model weights for training from scratch +or else with pretrained DPR weights for finetuning. +We follow the [original DPR parameters](https://github.com/facebookresearch/DPR#best-hyperparameter-settings) +for their max passage length but set max query length to 64 since queries are very rarely longer.
+ + +```python +## Initialize DPR model + +retriever = DensePassageRetriever( + document_store=InMemoryDocumentStore(), + query_embedding_model=query_model, + passage_embedding_model=passage_model, + max_seq_len_query=64, + max_seq_len_passage=256, +) +``` + +## Training + +Let's start training and save our trained model! + +On a V100 GPU, you can fit up to batch size 16 so we set gradient accumulation steps to 8 in order +to simulate the batch size 128 of the original DPR experiment. + +When `embed_title=True`, the document title is prepended to the input text sequence with a `[SEP]` token +between it and document text. + +When training from scratch with the above variables, 1 epoch takes around an hour and we reached the following performance: + +``` +loss: 0.046580662854042276 +task_name: text_similarity +acc: 0.992524064068483 +f1: 0.8804297774366846 +acc_and_f1: 0.9364769207525838 +average_rank: 0.19631619339984652 +report: + precision recall f1-score support + +hard_negative 0.9961 0.9961 0.9961 201887 + positive 0.8804 0.8804 0.8804 6515 + + accuracy 0.9925 208402 + macro avg 0.9383 0.9383 0.9383 208402 + weighted avg 0.9925 0.9925 0.9925 208402 + +``` + + +```python +# Start training our model and save it when it is finished + +retriever.train( + data_dir=doc_dir, + train_filename=train_filename, + dev_filename=dev_filename, + test_filename=dev_filename, + n_epochs=1, + batch_size=16, + grad_acc_steps=8, + save_dir=save_dir, + evaluate_every=3000, + embed_title=True, + num_positives=1, + num_hard_negatives=1, +) +``` + +## Loading + +Loading our newly trained model is simple! + + +```python +reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=None) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/convert_ipynb.py b/docs/v1.3.0/_src/tutorials/tutorials/convert_ipynb.py new file mode 100644 index 0000000000..3df5c1eec1 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/convert_ipynb.py @@ -0,0 +1,31 @@ +import re + +from nbconvert import MarkdownExporter +import os +from pathlib import Path +from headers import headers + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + test = [atoi(c) for c in re.split("(\d+)", text)] + return test + + +dir = Path("../../../../tutorials") + +notebooks = [x for x in os.listdir(dir) if x[-6:] == ".ipynb"] +# sort notebooks based on numbers within name of notebook +notebooks = sorted(notebooks, key=lambda x: natural_keys(x)) + + +e = MarkdownExporter(exclude_output=True) +for i, nb in enumerate(notebooks): + body, resources = e.from_filename(dir / nb) + print(f"Processing {dir}/{nb}") + with open(str(i + 1) + ".md", "w", encoding="utf-8") as f: + f.write(headers[i + 1] + "\n\n") + f.write(body) diff --git a/docs/v1.3.0/_src/tutorials/tutorials/headers.py b/docs/v1.3.0/_src/tutorials/tutorials/headers.py new file mode 100644 index 0000000000..b6ae875802 --- /dev/null +++ b/docs/v1.3.0/_src/tutorials/tutorials/headers.py @@ -0,0 +1,130 @@ +headers = { + 1: """""", + 2: """""", + 3: """""", + 4: """""", + 5: """""", + 6: """""", + 7: """""", + 8: """""", + 9: """""", + 10: """""", + 11: """""", + 12: """""", + 13: 
"""""", + 14: """""", + 15: """""", + 16: """""", +} diff --git a/docs/v1.3.0/_src/usage/Makefile b/docs/v1.3.0/_src/usage/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/docs/v1.3.0/_src/usage/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/v1.3.0/_src/usage/conf.py b/docs/v1.3.0/_src/usage/conf.py new file mode 100644 index 0000000000..12ac810874 --- /dev/null +++ b/docs/v1.3.0/_src/usage/conf.py @@ -0,0 +1,51 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = "Usage" +copyright = "2020, deepset" +author = "deepset" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["sphinx_tabs.tabs", "sphinx_copybutton", "nbsphinx", "sphinx.ext.autosectionlabel"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "alabaster" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] diff --git a/docs/v1.3.0/_src/usage/index.rst b/docs/v1.3.0/_src/usage/index.rst new file mode 100644 index 0000000000..1a615bf974 --- /dev/null +++ b/docs/v1.3.0/_src/usage/index.rst @@ -0,0 +1,23 @@ +.. Haystack documentation master file, created by + sphinx-quickstart on Tue Jul 28 16:18:42 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Haystack is all you need to scale the latest advances in Question Answering technologies to large collections of documents. 
+Whether you're building a search system, asking standard questions to an incoming stream of documents or +extending your chatbot, Haystack has you covered. + +This section covers the core concepts and practical usage of Haystack. See our tutorials page for end to end examples +or our API page for comprehensive descriptions of individual functions. + +.. toctree:: + :maxdepth: 4 + :caption: Contents: + + usage/intro + usage/get_started + usage/database + usage/retriever + usage/reader + usage/domain_adaptation + usage/terms diff --git a/docs/v1.3.0/_src/usage/make.bat b/docs/v1.3.0/_src/usage/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/docs/v1.3.0/_src/usage/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/v1.3.0/_src/usage/usage/annotation.md b/docs/v1.3.0/_src/usage/usage/annotation.md new file mode 100644 index 0000000000..880b4108f4 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/annotation.md @@ -0,0 +1,69 @@ +# Annotation Tool + +- Create labels with different techniques: Come up with questions (+ answers) while reading passages (SQuAD style) or have a set of predefined questions and look for answers in the document (~ Natural Questions). 
+- Structure your work via organizations, projects, users +- Upload your documents or import labels from an existing SQuAD-style dataset +- Export your labels in SQuAD Format + +![image](../../img/annotation_tool.png) + +# Hosted version + Signup here: [Haystack Annotation Tool](https://annotate.deepset.ai/login) + +# Local version (Docker) + +1. Configure credentials & database in the [`docker-compose.yml`](https://github.com/deepset-ai/haystack/blob/master/annotation_tool/docker-compose.yml): + +The credentials should match in database image and application configuration. + + DEFAULT_ADMIN_EMAIL: "example@example.com" + DEFAULT_ADMIN_PASSWORD: "DEMO-PASSWORD" + + PROD_DB_NAME: "databasename" + PROD_DB_USERNAME: "somesafeuser" + PROD_DB_PASSWORD: "somesafepassword" + + + POSTGRES_USER: "somesafeuser" + POSTGRES_PASSWORD: "somesafepassword" + POSTGRES_DB: "databasename" + + +2. Run docker-compose by executing `docker-compose up`. + + +3. The UI should be available at `localhost:7001`. + +# Usage +The manual (of a slightly earlier version) can be found [here](https://drive.google.com/file/d/1Wv3OIC0Z7ibHIzOm9Xw_r0gjTFmpl-33/view). While it doesn't include all latest features, the basic workflow and tips for label quality are still the same. + +# Annotation FAQ + +1. What is a good question? + - A good question is a fact-seeking question that can be answered with an entity (person, organisation, location, etc.) or explanation. A bad question is ambiguous, incomprehensible, dependent on clear false presuppositions, opinion seeking, or not clearly a request for factual information. + - The question should ask about information present in the text passage given. It should not be answerable only with additional knowledge or your interpretation. + - Do not copy paste answer text into the question. Good questions do not contain the exact same words as the answer or the context around the answer. 
The question should be a reformulation with synonyms and in different order as the context of the answer. + - Questions should be very precise natural questions you would ask when you want information from another person. +2. How many questions should you ask per text passage? + - Maximally ask 20 questions per passage + - Some text passages are not suited for 20 questions. Do not make up very constructed and complicated questions just to fill up the 20 - move on to the next text. + - Try to ask questions covering the whole passage and focus on questions covering important information. Do not only ask questions about a single sentence in that passage. +3. What is a good answer span? + - Always mark whole words. Do not start or end the answer within a word. + - For short answers: The answer should be as short and as close to a spoken human answer as possible. Do not include punctuation. + - For long answers: Please mark whole sentences with punctuation. The sentences can also pick up parts of the question, or mark even whole text passages. Mark passages only if they are not too large (e.g. not more than 8-10 sentences). +4. How do I differentiate long vs short answers? + - If there is a short answer possible you should always select short answer over long answer. + - Short precise answers like numbers or a few words are short answers. + - Long answers include lists of possibilities or multiple sentences are needed to answer the question correctly. +5. How to handle multiple possible answers to a single question? + - As of now there is no functionality to mark multiple answers per single question. + - Workaround: You can add a question with the same text but different answer selection by using the button below the question list (Button reads “custom question”) +6. What to do with grammatically wrong or incorrectly spelled questions? + - Include them. When users use the tool and ask questions they will likely contain grammar and spelling errors, too. 
+ - Exception: The question needs to be understandable without reading and interpretation of the corresponding text passage. If you do not understand the question, please mark the question as “I don’t understand the question”. +7. What to do with text passages that are not properly converted or contain (in part) information that cannot be labelled (e.g. just lists or garbage text)? + - Please do not annotate this text + - You can write down what is missing, or the cause why you cannot label the text + the text number and title. +8. Which browser to use? + - Please use the Chrome browser. The tool is not tested for other browsers. diff --git a/docs/v1.3.0/_src/usage/usage/document_store.md b/docs/v1.3.0/_src/usage/usage/document_store.md new file mode 100644 index 0000000000..bca65eac2d --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/document_store.md @@ -0,0 +1,362 @@ + + + +# DocumentStores + +You can think of the DocumentStore as a "database" that: +- stores your texts and meta data +- provides them to the retriever at query time + +There are different DocumentStores in Haystack to fit different use cases and tech stacks. + +## Initialisation + +Initialising a new DocumentStore within Haystack is straight forward. + +
+ +
+ + +
+ +[Install](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html) +Elasticsearch and then [start](https://www.elastic.co/guide/en/elasticsearch/reference/current/starting-elasticsearch.html) +an instance. + +If you have Docker set up, we recommend pulling the Docker image and running it. +```bash +docker pull docker.elastic.co/elasticsearch/elasticsearch:7.9.2 +docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2 +``` + +Note that we also have a utility function `haystack.utils.launch_es` that can start up an Elasticsearch instance. + +Next you can initialize the Haystack object that will connect to this instance. + +```python +from haystack.document_stores import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore() +``` + +Note that we also support [OpenSearch](https://opensearch.org/). +Follow [their documentation](https://opensearch.org/docs/) +to run it and connect to it using Haystack's `OpenSearchDocumentStore` class. + +We further support [AWS Elasticsearch Service](https://aws.amazon.com/elasticsearch-service/) with [signed requests](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html): +Use e.g. [aws-requests-auth](https://github.com/davidmuller/aws-requests-auth) to create an auth object and pass it as `aws4auth` to the `ElasticsearchDocumentStore` constructor. + +
+
+ +
+ + +
+ +Follow the [official documentation](https://www.milvus.io/docs/v1.0.0/milvus_docker-cpu.md) to start a Milvus instance via Docker. +Note that we also have a utility function `haystack.utils.launch_milvus` that can start up a Milvus instance. + +You can initialize the Haystack object that will connect to this instance as follows: +```python +from haystack.document_stores import MilvusDocumentStore + +document_store = MilvusDocumentStore() +``` + +
+
+ +
+ + +
+ +The `FAISSDocumentStore` requires no external setup. Start it by simply using this line. +```python +from haystack.document_stores import FAISSDocumentStore + +document_store = FAISSDocumentStore(faiss_index_factory_str="Flat") +``` + +
+
+ +
+ + +
+ +The `InMemoryDocumentStore()` requires no external setup. Start it by simply using this line. +```python +from haystack.document_stores import InMemoryDocumentStore + +document_store = InMemoryDocumentStore() +``` + +
+
+ +
+ + +
+ +The `SQLDocumentStore` requires SQLite, PostgreSQL or MySQL to be installed and started. +Note that SQLite already comes packaged with most operating systems. + +```python +from haystack.document_stores import SQLDocumentStore + +document_store = SQLDocumentStore() +``` + +
+
+ +
+ + +
+ +The `WeaviateDocumentStore` requires a running Weaviate Server. +You can start a basic instance like this (see the [Weaviate docs](https://www.semi.technology/developers/weaviate/current/) for details): +``` + docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.7.2 +``` + +Afterwards, you can use it in Haystack: +```python +from haystack.document_stores import WeaviateDocumentStore + +document_store = WeaviateDocumentStore() +``` + +
+
+ +
+ + +
+ +See the official [OpenSearch documentation](https://opensearch.org/docs/opensearch/install/docker/) on how to install and start an instance. + +If you have Docker set up, we recommend pulling the Docker image and running it. +```bash +docker pull opensearchproject/opensearch:1.0.0 +docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.0 +``` + +Note that we also have a utility function `haystack.utils.launch_opensearch` that can start up an OpenSearch instance. + +Next you can initialize the Haystack object that will connect to this instance. + +```python +from haystack.document_stores import OpenSearchDocumentStore + +document_store = OpenSearchDocumentStore() +``` + +
+
+ +
+ +Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes. +See API documentation for more info. + +## Input Format + +DocumentStores expect Documents in dictionary form, like that below. +They are loaded using the `DocumentStore.write_documents()` method. +See [Preprocessing](/docs/latest/preprocessingmd) for more information on the cleaning and splitting steps that will help you maximize Haystack's performance. + +[//]: # (Add link to preprocessing section) + +```python +from haystack.document_stores import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore() +dicts = [ + { + 'content': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +document_store.write_documents(dicts) +``` + +## Writing Documents (Sparse Retrievers) + +Haystack allows you to store documents in an optimised fashion so that query times can be kept low. +For **sparse**, keyword based retrievers such as BM25 and TF-IDF, +you simply have to call `DocumentStore.write_documents()`. +The creation of the inverted index which optimises querying speed is handled automatically. + +```python +document_store.write_documents(dicts) +``` + +## Writing Documents (Dense Retrievers) + +For **dense** neural network based retrievers like Dense Passage Retrieval, or Embedding Retrieval, +indexing involves computing the Document embeddings which will be compared against the Query embedding. + +The storing of the text is handled by `DocumentStore.write_documents()` and the computation of the +embeddings is started by `DocumentStore.update_embeddings()`. + +```python +document_store.write_documents(dicts) +document_store.update_embeddings(retriever) +``` + +This step is computationally intensive since it will engage the transformer based encoders. +Having GPU acceleration will significantly speed this up.
+ + + +## Choosing the Right Document Store + +The Document Stores have different characteristics. You should choose one depending on the maturity of your project, the use case and technical environment: + +
+ +
+ + +
+ +**Pros:** +- Fast & accurate sparse retrieval with many tuning options +- Basic support for dense retrieval +- Production-ready +- Also supports Open Distro + +**Cons:** +- Slow for dense retrieval with more than ~1 million documents + +
+
+ +
+ + +
+ +**Pros:** +- Scalable DocumentStore that excels at handling vectors (hence suited to dense retrieval methods like DPR) +- Encapsulates multiple ANN libraries (e.g. FAISS and ANNOY) and provides added reliability +- Runs as a separate service (e.g. a Docker container) +- Allows dynamic data management + +**Cons:** +- No efficient sparse retrieval + +
+
+ +
+ + +
+ +**Pros:** +- Fast & accurate dense retrieval +- Highly scalable due to approximate nearest neighbour algorithms (ANN) +- Many options to tune dense retrieval via different index types (more info [here](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index)) + +**Cons:** +- No efficient sparse retrieval + +
+
+ +
+ + +
+ +**Pros:** +- Simple +- Exists already in many environments + +**Cons:** +- Only compatible with minimal TF-IDF Retriever +- Bad retrieval performance +- Not recommended for production + +
+
+ +
+ + +
+ +**Pros:** +- Simple & fast to test +- No database requirements +- Supports MySQL, PostgreSQL and SQLite + +**Cons:** +- Not scalable +- Not persisting your data on disk + +
+
+ + +
+ + +
+ +**Pros:** +- Simple vector search +- Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up +- Allows combination of vector search and scalar filtering, i.e. you can filter for a certain tag and do dense retrieval on that subset + +**Cons:** +- Fewer options for ANN algorithms than FAISS or Milvus +- No BM25 / TF-IDF retrieval + +
+
+ +
+ + +
+ +**Pros:** +- Fully open source fork of Elasticsearch +- Has support for Approximate Nearest Neighbours vector search + +**Cons:** +- Its ANN algorithms seem a little less performant than FAISS or Milvus in our benchmarks + +
+
+ +
+ +
+ +#### Our Recommendations + +**Restricted environment:** Use the `InMemoryDocumentStore`, if you are just giving Haystack a quick try on a small sample and are working in a restricted environment that complicates running Elasticsearch or other databases + +**Allrounder:** Use the `ElasticsearchDocumentStore`, if you want to evaluate the performance of different retrieval options (dense vs. sparse) and are aiming for a smooth transition from PoC to production + +**Vector Specialist:** Use the `MilvusDocumentStore`, if you want to focus on dense retrieval and possibly deal with larger datasets + +
diff --git a/docs/v1.3.0/_src/usage/usage/domain_adaptation.md b/docs/v1.3.0/_src/usage/usage/domain_adaptation.md new file mode 100644 index 0000000000..dede4d79cc --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/domain_adaptation.md @@ -0,0 +1,157 @@ + + +# Domain Adaptation + +## Generalisation + +In our experience, language models trained on SQuAD show very strong general question answering capabilities. +Though SQuAD is composed entirely of Wikipedia articles, these models are flexible enough to deal with many different styles of text. + +Before trying to adapt these models to your domain, we’d recommend trying one of the off the shelf models. +We’ve found that these models are often flexible enough for a wide range of use cases. + +
+ +**Intuition** + +Most people probably don’t know what an HP Valve is. +But you don’t always need to know what an HP Valve is to answer “What is connected to an HP Valve?” +The answer might be there in plain language. +In the same way, many QA models have a good enough grasp of language to answer questions about concepts in an unseen domain. + +
+ + +## Finetuning + +Any model that can be loaded into Haystack can also be finetuned within Haystack. +Simply provide the domain specific dataset and call `Reader.train()` on an initialised model. + +``` +reader.train(data_dir=train_data, + train_filename="dev-v2.0.json", + n_epochs=1, + save_dir="my_model") +``` + +At the end of training, the finetuned model will be saved in the specified `save_dir` and can be loaded as a `Reader`. + +
+ +**Recommendation** + +See Tutorial 2 for a runnable example of this process. +If you’re interested in measuring how much your model has improved, +please also check out Tutorial 5 which walks through the steps needed to perform evaluation. + +
+ +## Generating Labels + +Using our [Haystack Annotate tool](https://annotate.deepset.ai/login) (Beta), +you can easily create a labelled dataset using your own documents featuring your own question/ answer pairs. + + + +![image](./../../img/annotation_tool.png) + +Features include: + + +* Structured workspaces via organisations, projects and users + + +* Easy upload of your own documents and labels in a variety of formats (txt, pdf, SQuAD style) + + +* Export of labels to be used directly in Haystack + +Annotate also supports two different workflows: + + +* Think up questions and answers while reading passages (SQuAD style) + + +* Have a set of predefined questions and look for answers in the document (~ Natural Questions style) + +## User Feedback + +A simpler and faster process to finetune models to your domain is to utilise user feedback. +Dedicated annotation work can be costly and time consuming +but user feedback is an efficient alternative since it allows for labels to be generated by users, for users, +all while the system is already in production. +We, for example, have used a simple thumbs up / down system in our demos to allow +users to give feedback. 
+ +![image](../../img/demo.png) + +Through the Rest API, users can annotate each Haystack result as being either: + +* Correct +* Correct document but wrong text +* Wrong document and wrong text + +To get started, follow these steps: + +* Start up the REST API + * The simplest way to do this is to call `docker-compose up` from the root directory of the Haystack repository + * Alternatively, run `gunicorn rest_api.application:app -b 0.0.0.0:8000 -k uvicorn.workers.UvicornWorker -t 300` +* Make a POST request to the `doc-qa-feedback` endpoint with the following payload: +``` +{ + "question": "string", + "is_correct_answer": true, + "document_id": "string", + "model_id": 0, + "is_correct_document": true, + "answer": "string", + "offset_start_in_doc": 0 +} +``` +To fill out all these values, you can use the response from an executed search request on the `doc-qa` endpoint. + +* Export your labels in SQuAD format by making a GET request to the `export-doc-qa-feedback` endpoint +``` python +# SQUAD format +{ + "data": [ + { + "title": "Normans", + "paragraphs": [ + { + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the...", + "qas": [ + { + "question": "In what country is Normandy located?", + "id": "56ddde6b9a695914005b9628", + "answers": [ + { + "text": "France", + "answer_start": 159 + } + ] + } + ] + } + ] + } + ] +} +``` + +User feedback labelling also works with FAQ style QA. +Refer to the Swagger API documentation (http://127.0.0.1:8000/docs) for more details. + +This labelled data can then be used to +augment the training data and enable `Reader` finetuning. +Alternatively, they can also be used to form an evaluation set to +measure the performance of trained models, +or monitor how well the model is doing in a live environment. 
+ diff --git a/docs/v1.3.0/_src/usage/usage/faq.md b/docs/v1.3.0/_src/usage/usage/faq.md new file mode 100644 index 0000000000..37b09ff9b8 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/faq.md @@ -0,0 +1,90 @@ + + +# Frequently Asked Questions + +## Why am I seeing duplicate answers being returned? + +The ElasticsearchDocumentStore and MilvusDocumentStore rely on Elasticsearch and Milvus backend services which +persist after your Python script has finished running. +If you rerun your script without deleting documents, you could end up with duplicate +copies of your documents in your database. +The easiest way to avoid this is to call `DocumentStore.delete_documents()` after initialization +to ensure that you are working with an empty DocumentStore. + +DocumentStores also have a `duplicate_documents` argument in their `__init__()` and `write_documents` methods +where you can define whether you'd like to skip writing duplicates, overwrite existing duplicates or raise an error when there are duplicates. + +## How can I make sure that my GPU is being engaged when I use Haystack? + +You will want to ensure that a CUDA enabled GPU is being engaged when Haystack is running (you can check by running `nvidia-smi -l` on your command line). +Components which can be sped up by GPU have a `use_gpu` argument in their constructor which you will want to set to `True`. + +## How do I speed up my predictions? + +There are many different ways to speed up the performance of your Haystack system. + +The Reader is usually the most computationally expensive component in a pipeline +and you can often speed up your system by using a smaller model, like `deepset/minilm-uncased-squad2` (see [benchmarks](https://huggingface.co/deepset/minilm-uncased-squad2)). This usually comes with a small trade-off in accuracy. + +You can reduce the workload on the Reader by instructing the Retriever to pass on fewer documents. +This is done by setting the Retriever's `top_k` parameter to a lower value. 
+ +Making sure that your documents are shorter can also increase the speed of your system. You can split +your documents into smaller chunks by using the `PreProcessor` (see [tutorial](https://haystack.deepset.ai/docs/latest/tutorial11md)). + +For more optimization suggestions, have a look at our [optimization page](https://haystack.deepset.ai/docs/latest/optimizationmd) +and also our [blogs](https://medium.com/deepset-ai). + +## How do I use Haystack for my language? + +The components in Haystack, such as the `Retriever` or the `Reader`, are designed in a language agnostic way. However, you may +have to set certain parameters or load models pretrained for your language in order to get good performance out of Haystack. +See our [languages page](https://haystack.deepset.ai/docs/latest/languagesmd) for more details. + +## How can I add metadata to my documents so that I can apply filters? + +When providing your documents in the input format (see [here](https://haystack.deepset.ai/docs/latest/documentstoremd#Input-Format)) +you can provide metadata information as a dictionary under the `meta` key. At query time, you can provide a `filters` argument +(most likely through `Pipeline.run()`) that specifies the accepted values for a certain metadata field +(for an example of what a `filters` dictionary might look like, please refer to [this example](https://haystack.deepset.ai/docs/latest/apiretrievermd#__init__)). + +## How can I see predictions during evaluation? + +To see predictions during evaluation, you want to initialize the `EvalDocuments` or `EvalAnswers` with `debug=True`. +This causes their `EvalDocuments.log` or `EvalAnswers.log` to be populated with a record of each prediction made. + +## How can I serve my Haystack model? + +Haystack models can be wrapped in a REST API. For basic details on how to set this up, please refer to this section +on our [GitHub page](https://github.com/deepset-ai/haystack/blob/master/README.md#7-rest-api). 
+More comprehensive documentation coming soon! + +## How can I interpret the confidence scores being returned by the Reader? + +The confidence scores range from 0 to 1 and reflect how confident the model is in each prediction that it makes. +Having a confidence score is particularly useful in cases where you need Haystack to work with a certain accuracy threshold. +Many of our users have built systems where predictions below a certain confidence value are routed on to a fallback system. + +For more information on model confidence and how to tune it, please refer to [this section](https://haystack.deepset.ai/docs/latest/readermd#Confidence-Scores). + +## My documents aren't showing up in my DocumentStore even though I've called `DocumentStore.write_documents()` + +When indexing, retrieving or querying for documents from a DocumentStore, you can specify an `index` on which to perform this action. +This can be specified in almost all methods of `DocumentStore` as well as `Retriever.retrieve()`. +Ensure that you are performing all of these operations on the same index! +Note that this also applies at evaluation where labels are written into their own separate DocumentStore index. + +## What is the difference between the FARMReader and the TransformersReader? + +In short, the FARMReader uses a QA pipeline implementation that comes from our own +[FARM framework](https://github.com/deepset-ai/FARM) that we can more easily update and also optimize for performance. +By contrast, the TransformersReader uses a QA pipeline implementation that comes from HuggingFace's [Transformers](https://github.com/huggingface/transformers). +See [this section](https://haystack.deepset.ai/docs/latest/readermd#Deeper-Dive-FARM-vs-Transformers) +for more details about their differences! 
\ No newline at end of file diff --git a/docs/v1.3.0/_src/usage/usage/generator.md b/docs/v1.3.0/_src/usage/usage/generator.md new file mode 100644 index 0000000000..2409961b80 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/generator.md @@ -0,0 +1,36 @@ + + +# Generator + +While extractive QA highlights the span of text that answers a query, +generative QA can return a novel text answer that it has composed. +The best current approaches, such as [Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401) and [LFQA](https://yjernite.github.io/lfqa.html), +can draw upon both the knowledge they gained during language model pretraining (parametric memory) +as well as passages provided to them by a retriever (non-parametric memory). +With the advent of Transformer based retrieval methods such as [Dense Passage Retrieval](https://arxiv.org/abs/2004.04906), +retriever and generator can be trained concurrently from a single loss signal. + +
+ +**Tutorial** + +Check out our tutorial notebooks for a guide on how to build your own generative QA system with RAG ([here](/docs/latest/tutorial7md)) +or with LFQA ([here](/docs/latest/tutorial12md)). + +
+ +Pros +* More appropriately phrased answers +* Able to synthesize information from different texts +* Can draw on latent knowledge stored in language model + +Cons +* Not easy to track what piece of information the generator is basing its response off of + diff --git a/docs/v1.3.0/_src/usage/usage/get_started.md b/docs/v1.3.0/_src/usage/usage/get_started.md new file mode 100644 index 0000000000..c727fdafbf --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/get_started.md @@ -0,0 +1,166 @@ + + +# Get Started + +## Installation + +
+ +
+ + +
+The most straightforward way to install Haystack is through pip.

+ +```python +$ pip install farm-haystack +``` + +
+
+ +
+ + +
+If you’d like to run a specific, unreleased version of Haystack, or make edits to the way Haystack runs, +you’ll want to install it using `git` and `pip --editable`. +This clones a copy of the repo to a local directory and runs Haystack from there.

+ +```python +$ git clone https://github.com/deepset-ai/haystack.git +$ cd haystack +$ pip install --editable . +``` + +By default, this will give you the latest version of the master branch. Use regular git commands to switch between different branches and commits. +
+
+ +
+ +Note: On Windows, add the arg `-f https://download.pytorch.org/whl/torch_stable.html` to install PyTorch correctly. + +## The Building Blocks of Haystack + +Here’s a sample of some Haystack code showing a question answering system using a retriever and a reader. +For a working code example, check out our [starter tutorial](/docs/latest/tutorial1md). + +```python +# DocumentStore: holds all your data +document_store = ElasticsearchDocumentStore() + +# Clean & load your documents into the DocumentStore +dicts = convert_files_to_dicts(doc_dir, clean_func=clean_wiki_text) +document_store.write_documents(dicts) + +# Retriever: A fast and simple algorithm to identify the most promising candidate documents +retriever = ElasticsearchRetriever(document_store) + +# Reader: Powerful but slower neural network trained for QA +model_name = "deepset/roberta-base-squad2" +reader = FARMReader(model_name) + +# Pipeline: Combines all the components +pipe = ExtractiveQAPipeline(reader, retriever) + +# Voilà! Ask a question! +question = "Who is the father of Sansa Stark?" +prediction = pipe.run(query=question) +print_answers(prediction) +``` + +## Loading Documents into the DocumentStore + +In Haystack, DocumentStores expect Documents in a dictionary format. They are loaded as follows: + +```python +document_store = ElasticsearchDocumentStore() +dicts = [ + { + 'text': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +document_store.write_documents(dicts) +``` + +When we talk about Documents in Haystack, we are referring specifically to the individual blocks of text that are being held in the DocumentStore. +You might want to use all the text in one file as a Document, or split it into multiple Documents. +This splitting can have a big impact on speed and performance. + +
+ +**Tip:** If Haystack is running very slowly, you might want to try splitting your text into smaller Documents. +If you want an improvement to performance, you might want to try concatenating text to make larger Documents. +See [Optimization](/docs/latest/optimizationmd) for more details. + + +
+ +## Running Search Queries + +There are many different flavours of search that can be created using Haystack. +But to give just one example of what can be achieved, let's look more closely at +an Open Domain Question Answering (ODQA) Pipeline. + + +**Querying** in an ODQA system involves searching for an answer to a given question within the full document store. +This process will: + + +* make the Retriever filter for a small set of relevant candidate documents + + +* get the Reader to process this set of candidate documents + + +* return potential answers to the given question + +Usually, there are tight time constraints on querying and so it needs to be a lightweight operation. +When documents are loaded, Haystack will precompute any of the results that might be useful at query time. + +In Haystack, querying is performed with a `Pipeline` object which connects the reader to the retriever. + +```python +# Pipeline: Combines all the components +pipe = ExtractiveQAPipeline(reader, retriever) + +# Voilà! Ask a question! +question = "Who is the father of Sansa Stark?" +prediction = pipe.run(query=question) +print_answers(prediction) +``` + +When the query is complete, you can expect to see results that look something like this: + +```python +[ + { 'answer': 'Eddard', + 'context': 's Nymeria after a legendary warrior queen. She travels ' + "with her father, Eddard, to King's Landing when he is made " + 'Hand of the King. Before she leaves,' + }, ... +] +``` + +## Custom Search Pipelines + +Haystack provides many different building blocks for you to mix and match. +They include: +- Readers +- Retrievers (sparse and dense) +- DocumentStores +- Summarizers +- Generators +- Translators + +These can all be combined in the configuration that you want. +Have a look at our [Pipelines page](/docs/latest/pipelinesmd) to see what's possible! 
\ No newline at end of file diff --git a/docs/v1.3.0/_src/usage/usage/intro.md b/docs/v1.3.0/_src/usage/usage/intro.md new file mode 100644 index 0000000000..71037d23c9 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/intro.md @@ -0,0 +1,67 @@ + + +# What is Haystack? + +Haystack is an **open-source framework** for building **search systems** that work intelligently over large document collections. +Recent advances in NLP have enabled the application of question answering, retrieval and summarization to real world settings +and Haystack is designed to be the bridge between research and industry. + + +* **Latest NLP models**: Utilize all transformer based models (BERT, RoBERTa, MiniLM, DPR ...) and smoothly switch when new ones get published + +* **Flexible databases**: Load data into and query from a range of databases such as Elasticsearch, Milvus, FAISS, SQL and more + +* **Scalability**: Production-ready deployments that scale to millions of documents + +* **End-to-End**: All tooling you need to implement, evaluate, improve and run a search system + +* **Domain adaptation**: Fine-tune models to your own domain & improve them continuously via user feedback + +## Retriever-Reader + +The most common system built with Haystack is the Retriever-Reader Pipeline which is designed to optimize for both +speed and performance on the task of Open Domain Question Answering. +In practice, this is a great backbone for creating a search system that can handle detailed full sentence queries. + + +![image](./../../img/retriever_reader.png) + +**Readers**, also known as Open-Domain QA systems in Machine Learning speak, +are powerful models that do close analysis of documents and perform the core task of question answering. +The Readers in Haystack are trained from the latest transformer based language models and can be significantly sped up using GPU acceleration. +However, it is not currently feasible to use the Reader directly on a large collection of documents. 
+ + + +The **Retriever** assists the Reader by acting as a lightweight filter that reduces the number of documents that the Reader has to process. +It does this by: + + +* Scanning through all documents in the database + + +* Quickly identifying the relevant and dismissing the irrelevant + + +* Passing on only a small candidate set of documents to the Reader + +Current methods fall into one of the two categories: + + +* sparse + * keyword based + * fast indexing and querying + * e.g. BM25 + + +* dense + * neural network based + * computationally heavy indexing but fast querying + * e.g. Dense Passage Retrieval diff --git a/docs/v1.3.0/_src/usage/usage/knowledge_graph.md b/docs/v1.3.0/_src/usage/usage/knowledge_graph.md new file mode 100644 index 0000000000..b3469f0f69 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/knowledge_graph.md @@ -0,0 +1,108 @@ + + +# Question Answering on a Knowledge Graph + +Haystack allows loading and querying knowledge graphs. In particular, Haystack can: + +* Load an existing knowledge graph given as a .ttl file +* Execute SPARQL queries on a knowledge graph +* Execute text queries on the knowledge graph by translating them to SPARQL queries with the help of a pre-trained seq2seq model + +Haystack's knowledge graph functionalities are still in a very early stage. Thus, don't expect our [exemplary tutorial](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.py) to work on your custom dataset out-of-the-box. +Two classes implement the functionalities: GraphDBKnowledgeGraph and Text2SparqlRetriever. + +## GraphDBKnowledgeGraph + +GraphDBKnowledgeGraph is a triple store similar to Haystack's document stores. Currently, it is the only implementation of the BaseKnowledgeGraph class. +GraphDBKnowledgeGraph runs on GraphDB. The licensing of GraphDB is rather complicated and it's more than unfortunate that GraphDB cannot be used right away in colab notebooks. 
+On your local machine, you can start a GraphDB instance by running: + +```docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11``` + +By default, GraphDBKnowledgeGraph connects to a GraphDB instance running on localhost at port 7200. +Similar to Haystack's ElasticsearchDocumentStore, the only additional setting needed is an index name. +(Note that GraphDB internally calls these indices repositories.) + +```kg = GraphDBKnowledgeGraph(index="tutorial_10_index")``` + +Indices can be deleted and created with ```GraphDBKnowledgeGraph.delete_index()``` and ```GraphDBKnowledgeGraph.create_index(config_path)```. +```config_path``` needs to point to a .ttl file that contains configuration settings (see [GraphDB documentation](https://graphdb.ontotext.com/documentation/free/configuring-a-repository.html#configure-a-repository-programmatically) for details or use the file from our [tutorial](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.py)). It starts with something like: + +``` +# +# Sesame configuration template for a GraphDB Free repository +# +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rep: <http://www.openrdf.org/config/repository#> . +@prefix sr: <http://www.openrdf.org/config/repository/sail#> . +@prefix sail: <http://www.openrdf.org/config/sail#> . +@prefix owlim: <http://www.ontotext.com/trree/owlim#> . + +[] a rep:Repository ; + rep:repositoryID "tutorial_10_index" ; + rdfs:label "tutorial 10 index" ; +... +``` + +GraphDBKnowledgeGraph can load an existing knowledge graph represented in the form of a .ttl file with the method ```GraphDBKnowledgeGraph.import_from_ttl_file(index, path)```, where path points to a ttl file starting with something like: + +``` +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . +@prefix hp: <https://deepset.ai/harry_potter/> . + +hp:Gryffindor hp:source_url "https://harrypotter.fandom.com/wiki/Gryffindor"^^xsd:string . +hp:Gryffindor rdf:type hp:House_ . +hp:Gryffindor hp:name hp:Gryffindor . +hp:Gryffindor hp:founder hp:Godric_gryffindor . +... 
+``` + +```GraphDBKnowledgeGraph.get_all_triples()``` returns all loaded triples in the form of subject, predicate, and object. It is helpful to check whether the loading of a .ttl file was successful. + +```GraphDBKnowledgeGraph.query(sparql_query)``` executes SPARQL queries on the knowledge graph. However, we usually do not want to use this method directly but use it through a retriever. + +## Text2SparqlRetriever +Text2SparqlRetriever can execute SPARQL queries translated from text but also any other custom SPARQL queries. Currently, it is the only implementation of the BaseGraphRetriever class. +Internally, Text2SparqlRetriever uses a pre-trained BART model to translate text questions to queries in SPARQL format. + +```Text2SparqlRetriever.retrieve(query)``` can be called with a text query, which is then automatically translated to a SPARQL query. + +```Text2SparqlRetriever._query_kg(sparql_query)``` can be called with a SPARQL query. + +## Trying Question Answering on Knowledge Graphs with Custom Data +If you want to use your custom data you would first need to have your custom knowledge graph in the format of a .ttl file. +You can load your custom graph and execute SPARQL queries with ```Text2SparqlRetriever._query_kg(sparql_query)```. To allow the use of abbreviations of namespaces, GraphDBKnowledgeGraph needs to know about them: + +``` +prefixes = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> + PREFIX hp: <https://deepset.ai/harry_potter/> + """ +kg.prefixes = prefixes +``` + +If you suspect you are having issues because of abbreviations of namespaces not mapped correctly, you can always try to execute a SPARQL query with the full namespace: + +```Text2SparqlRetriever._query_kg(sparql_query="select distinct ?obj where { <https://deepset.ai/harry_potter/Hermione_granger> <https://deepset.ai/harry_potter/patronus> ?obj . }")``` + +instead of using the abbreviated form: + +```Text2SparqlRetriever._query_kg(sparql_query="select distinct ?obj where { hp:Hermione_granger hp:patronus ?obj . 
+ }")``` + +If you would like to translate text queries to SPARQL queries for your custom data and use ```Text2SparqlRetriever.retrieve(query)```, significantly more effort is necessary. +We provide an exemplary pre-trained model in our [tutorial](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.py). +One limitation is that this pre-trained model can only generate queries about resources it has seen during training. +Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph. +For example, it can translate "Harry" to "hp:Harry_potter" only because we trained it to do so. + +Unfortunately, our pre-trained model for translating text queries does not work with your custom data. +Instead, you need to train your own model. It needs to be trained according to the [seq2seq example for summarization with BART in transformers](https://github.com/huggingface/transformers/tree/master/examples/legacy/seq2seq). +Haystack currently does not support the training of text2sparql models. We don't have concrete plans to extend the functionality, but we are more than open to contributions. Don't hesitate to reach out! diff --git a/docs/v1.3.0/_src/usage/usage/languages.md b/docs/v1.3.0/_src/usage/usage/languages.md new file mode 100644 index 0000000000..754f38c4f1 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/languages.md @@ -0,0 +1,333 @@ + + +# Languages Other Than English + +Haystack is well suited to open-domain QA on languages other than English. +While our defaults are tuned for English, +you will find some tips and tricks here for using Haystack in your language. + +## Preprocessor + +
+ +**Note** + +This feature will be implemented by [this PR.](https://github.com/deepset-ai/haystack/pull/1160) + +
+ +The PreProcessor's sentence tokenization is language specific. +If you are using the PreProcessor on a language other than English, +make sure to set the `language` argument when initializing it. + +## Retrievers + +The sparse retriever methods themselves (BM25, TF-IDF) are language agnostic. +Their only requirement is that the text be split into words. +The ElasticsearchDocumentStore relies on an analyzer to impose word boundaries, +but also to handle punctuation, casing and stop words. + +The default analyzer is an English analyzer. +While it can still work decently for a large range of languages, +you will want to set it to your language's analyzer for optimal performance. +In some cases, such as with Thai, the default analyzer is completely incompatible. +See [this page](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html) +for the full list of language specific analyzers. + +```python +from haystack.document_store import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore(analyzer="thai") +``` + +The models used in dense retrievers are language specific. +Be sure to check the language of the model used in your EmbeddingRetriever. +The default model that is loaded in the DensePassageRetriever is for English. + +We have created a [German DensePassageRetriever model](https://deepset.ai/germanquad) and know other teams who work on further languages. +If you have a language model and a question answering dataset in your own language, you can also train a DPR model using Haystack! +Below is a simplified example. +See [our tutorial](/docs/latest/tutorial9md) and also the [API reference](/docs/latest/apiretrievermd#train) for `DensePassageRetriever.train()` for more details. 
+ +```python +from haystack.retriever import DensePassageRetriever + +dense_passage_retriever = DensePassageRetriever(document_store) +dense_passage_retriever.train( + data_dir="path/to/your/data", + train_filename="train.json", + dev_filename="dev.json", + test_filename="test.json", + batch_size=16, + embed_title=True, + num_hard_negatives=1, + n_epochs=3) +``` + +## Readers + +While models are comparatively more performant on English, +thanks to a wealth of available English training data, +there are a couple of QA models for other languages that are directly usable in Haystack. + +
+ +
+ + +
+ +
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader("deepset/gelectra-large-germanquad") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader("illuin/camembert-base-fquad") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader("uer/roberta-base-chinese-extractive-qa") +# or +reader = FARMReader("wptoux/albert-chinese-large-qa") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader("mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es") +# or +reader = FARMReader("mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader("pierreguillou/bert-base-cased-squad-v1.1-portuguese") +# or +reader = FARMReader("pucpr/bioBERTpt-squad-v1.1-portuguese") + +``` + +
+
+ + +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader("deepset/xlm-roberta-large-squad2") +``` + +
+
+ +
+ +
+
+ +
+ + +
+ +
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("deepset/gelectra-large-germanquad") +``` + +
+
+ + +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("illuin/camembert-base-fquad") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("uer/roberta-base-chinese-extractive-qa") +# or +reader = TransformersReader("wptoux/albert-chinese-large-qa") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es") +# or +reader = TransformersReader("mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es") +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("pierreguillou/bert-base-cased-squad-v1.1-portuguese") +# or +reader = TransformersReader("pucpr/bioBERTpt-squad-v1.1-portuguese") + +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("deepset/xlm-roberta-large-squad2") +``` + +
+
+ +
+ +
+
+ +
+ +We are the creators of the **German** model and you can find out more about it [here](https://deepset.ai/germanquad) + +The **French**, **Italian**, **Spanish**, **Portuguese** and **Chinese** models are monolingual language models trained on versions of the SQuAD dataset in their respective languages +and their authors report decent results in their model cards +(e.g. [here](https://huggingface.co/illuin/camembert-base-fquad) and [here](https://huggingface.co/mrm8488/bert-italian-finedtuned-squadv1-it-alfa)). +There also exist Korean QA models on the model hub but their performance is not reported. + +The **zero-shot model** that is shown above is a **multilingual XLM-RoBERTa Large** that is trained on English SQuAD. +It is clear, from our [evaluations](https://huggingface.co/deepset/xlm-roberta-large-squad2#model_card), +that the model has been able to transfer some of its English QA capabilities to other languages, +but still its performance lags behind that of the monolingual models. +Nonetheless, if there is not yet a monolingual model for your language and it is one of the 100 supported by XLM-RoBERTa, +this zero-shot model may serve as a decent first baseline. + diff --git a/docs/v1.3.0/_src/usage/usage/optimization.md b/docs/v1.3.0/_src/usage/usage/optimization.md new file mode 100644 index 0000000000..816d560a80 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/optimization.md @@ -0,0 +1,107 @@ + + +# Optimization + +## Speeding up Reader + +In most pipelines, the Reader will be the most computationally expensive component. +If this is a step that you would like to speed up, you can opt for a smaller Reader model +that can process more passages in the same amount of time. + +On our [benchmarks page](https://haystack.deepset.ai/bm/benchmarks), you will find a comparison of +many of the common model architectures. While our default recommendation is RoBERTa, +MiniLM offers much faster processing for only a minimal drop in accuracy. 
+You can find the models that we've trained on [the HuggingFace Model Hub](https://huggingface.co/deepset). + +## GPU acceleration + +The transformer based models used in Haystack are designed to be run on a GPU enabled machine. +The design of these models means that they greatly benefit from the parallel processing capabilities of graphics cards. +If Haystack has successfully detected a graphics card, you should see these lines in your console output. + +``` +INFO - farm.utils - Using device: CUDA +INFO - farm.utils - Number of GPUs: 1 +``` + +You can track the work load on your CUDA enabled Nvidia GPU by tracking the output of `nvidia-smi -l` on the command line +while your Haystack program is running. + +## Document Length + +Document length has a very direct impact on the speed of the Reader +which is why we recommend using the `PreProcessor` class to clean and split your documents. +**If you halve the length of your documents, you will halve the workload placed onto your Reader.** + +For **sparse retrievers**, very long documents pose a challenge since the signal of the relevant section of text +can get washed out by the rest of the document. +To get a good balance between Reader speed and Retriever performance, we split documents to a maximum of 500 words. +If there is no Reader in the pipeline following the Retriever, we recommend that **documents be no longer than 10,000 words**. + +**Dense retrievers** are limited in the length of text that they can read in one pass. +As such, it is important that documents are not longer than the dense retriever's maximum input length. +By default, Haystack's DensePassageRetriever model has a maximum length of 256 tokens. +As such, we recommend that documents contain significantly fewer words. +We have found decent performance with **documents around 100 words long**. + +## Respecting Sentence Boundaries + +When splitting documents, it is generally not a good idea to let document boundaries fall in the middle of sentences. 
+Doing so means that each document will contain incomplete sentence fragments +which may be hard for both retriever and reader to interpret. +It is therefore recommended to set `split_respect_sentence_boundary=True` when initializing your `PreProcessor`. + +## Choosing the Right top-k Values + +The `top-k` parameter in both the `Retriever` and `Reader` determines how many results they return. +More specifically, `Retriever` `top-k` dictates how many retrieved documents are passed on to the next stage, +while `Reader` `top-k` determines how many answer candidates to show. + +In our experiments, we have found that **`Retriever` `top_k=10` +gives decent overall performance** and so we have set this as the default in Haystack. + +The choice of `Retriever` `top-k` is a trade-off between speed and accuracy, +especially when there is a `Reader` in the pipeline. +Setting it higher means passing more documents to the `Reader`, +thus reducing the chance that the answer-containing passage is missed. +However, passing more documents to the `Reader` will create a larger workload for the component. + +These parameters can easily be tweaked as follows if using a `Pipeline`: +``` python +answers = pipeline.run(query="What did Einstein work on?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}) +``` +or like this if directly calling the `Retriever`: +``` python +retrieved_docs = retriever.retrieve(query="What did Einstein work on?", top_k=10) +``` + +## Metadata Filtering + +Metadata can be attached to the documents which you index into your DocumentStore (see the input data format [here](/docs/latest/retrievermd)). +At query time, you can apply filters based on this metadata to limit the scope of your search and ensure your answers +come from a specific slice of your data. + +For example, if you have a set of annual reports from various companies, +you may want to perform a search on just a specific year, or on a small selection of companies. 
+This can reduce the work load of the retriever and also ensure that you get more relevant results. + +Filters are applied via the `filters` argument of the `Retriever` class. In practice, this argument will probably +be passed into the `Pipeline.run()` call, which will then route it on to the `Retriever` class +(see the Arguments section on the [Pipelines page](/docs/latest/pipelinesmd) for an explanation). + +```python +pipeline.run( + query="Why did the revenue increase?", + filters={ + "years": ["2019"], + "companies": ["BMW", "Mercedes"] + } +) +``` diff --git a/docs/v1.3.0/_src/usage/usage/pipelines.md b/docs/v1.3.0/_src/usage/usage/pipelines.md new file mode 100644 index 0000000000..e0aa93b4e4 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/pipelines.md @@ -0,0 +1,188 @@ + + +# Pipelines + +### Flexibility powered by DAGs +In order to build modern search pipelines, you need two things: powerful building blocks and an easy way to stick them together. +The `Pipeline` class is exactly built for this purpose and enables many search scenarios beyond QA. +The core idea is that you can build a Directed Acyclic Graph (DAG) where each node is one building block (Reader, Retriever, Generator ...). +Here's a simple example for a standard Open-Domain QA Pipeline: + +```python +from haystack import Pipeline + +p = Pipeline() +p.add_node(component=retriever, name="ESRetriever1", inputs=["Query"]) +p.add_node(component=reader, name="QAReader", inputs=["ESRetriever1"]) +res = p.run(query="What did Einstein work on?") +``` + +You can **draw the DAG** to better inspect what you are building: +```python +p.draw(path="custom_pipe.png") +``` +![image](https://user-images.githubusercontent.com/1563902/102451716-54813700-4039-11eb-881e-f3c01b47ca15.png) + +### Arguments + +Each node in a Pipeline defines the arguments the run() method accepts. The Pipeline class takes care of passing relevant +arguments to the node. 
In addition to mandatory inputs like `query`, the `run()` method accepts optional node parameters like +`top_k` with the `params` argument. For instance, `params={"top_k": 5}` will set the `top_k` of all nodes to 5. To +target params to a specific node, the node name can be explicitly specified as `params={"Retriever": {"top_k": 5}}`. + + +```python +res = pipeline.run( + query="What did Einstein work on?", + params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}} +) +``` + +### YAML File Definitions + +For your convenience, there is also the option of defining and loading pipelines in YAML files. +Having your pipeline available in a YAML is particularly useful when +you move between experimentation and production environments. +Just export the YAML from your notebook / IDE and import it into your production environment. +It also helps with version control of pipelines, allows you to share your pipeline easily with colleagues, +and simplifies the configuration of pipeline parameters in production. 
+ +For example, you can define and save a simple Retriever Reader pipeline by saving the following to a file: + +```yaml +version: '0.7' + +components: # define all the building-blocks for Pipeline +- name: MyReader # custom-name for the component; helpful for visualization & debugging + type: FARMReader # Haystack Class name for the component + params: + no_ans_boost: -10 + model_name_or_path: deepset/roberta-base-squad2 +- name: MyESRetriever + type: ElasticsearchRetriever + params: + document_store: MyDocumentStore # params can reference other components defined in the YAML + custom_query: null +- name: MyDocumentStore + type: ElasticsearchDocumentStore + params: + index: haystack_test + +pipelines: # multiple Pipelines can be defined using the components from above +- name: my_query_pipeline # a simple extractive-qa Pipeline + nodes: + - name: MyESRetriever + inputs: [Query] + - name: MyReader + inputs: [MyESRetriever] +``` + +To load, simply call: + +```python +pipeline.load_from_yaml(Path("sample.yaml")) +``` + +For another example YAML config, check out [this file](https://github.com/deepset-ai/haystack/blob/master/rest_api/pipeline/pipelines.yaml). + +### Multiple retrievers +You can now also use multiple Retrievers and join their results: +```python +p = Pipeline() +p.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"]) +p.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"]) +p.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "DPRRetriever"]) +p.add_node(component=reader, name="QAReader", inputs=["JoinResults"]) +res = p.run(query="What did Einstein work on?", params={"ESRetriever": {"top_k": 1}, "DPRRetriever": {"top_k": 3}}) +``` +![image](https://user-images.githubusercontent.com/1563902/102451782-7bd80400-4039-11eb-9046-01b002a783f8.png) + +### Custom nodes +It is easy to build custom nodes. Just respect the following requirements: + +1. 
Create a Class that inherits from `BaseComponent`. +2. Add a `run()` method to your class with any parameters it needs to process the input. Ensure that the parameters are either passed with `params` to the pipeline or are returned by the preceding nodes. +3. Do whatever you want within `run()` (e.g., reformatting the query). +4. Return a tuple that contains your output data (for the next node) and the name of the outgoing edge `output_dict, "output_1"`. +5. Add a class attribute `outgoing_edges = 1` that defines your node's number of output options. You only need a higher number here if you have a decision node (see below). + +### Decision nodes +Or you can add decision nodes where only one "branch" is executed afterwards. This allows you, for example, to classify an incoming query and, depending on the result, route it to different modules: +![image](https://user-images.githubusercontent.com/1563902/102452199-41229b80-403a-11eb-9365-7038697e7c3e.png) +```python + from haystack import BaseComponent, Pipeline + + class QueryClassifier(BaseComponent): + outgoing_edges = 2 + + def run(self, query): + if "?" in query: + return {}, "output_1" + + else: + return {}, "output_2" + + pipe = Pipeline() + pipe.add_node(component=QueryClassifier(), name="QueryClassifier", inputs=["Query"]) + pipe.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) + pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_2"]) + pipe.add_node(component=JoinDocuments(join_mode="concatenate"), name="JoinResults", + inputs=["ESRetriever", "DPRRetriever"]) + pipe.add_node(component=reader, name="QAReader", inputs=["JoinResults"]) + res = p.run(query="What did Einstein work on?", params={"ESRetriever": {"top_k": 1}, "DPRRetriever": {"top_k": 3}}) +``` + +### Evaluation nodes + +There are nodes in Haystack that are used to evaluate the performance of readers, retrievers and combined systems. 
+To get hands on with this kind of node, have a look at the [evaluation tutorial](/docs/latest/tutorial5md). + +### Default Pipelines (replacing the "Finder") +Last but not least, we added some "Default Pipelines" that allow you to run standard patterns with very few lines of code. +This is replacing the `Finder` class which was deprecated with Haystack 0.6.0 . + +``` +from haystack.pipeline import DocumentSearchPipeline, ExtractiveQAPipeline, Pipeline, JoinDocuments + +# Extractive QA +qa_pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever) +res = qa_pipe.run(query="When was Kant born?", params={"Retriever": {"top_k": 3}, "Reader": {"top_k": 5}}) + +# Document Search +doc_pipe = DocumentSearchPipeline(retriever=retriever) +res = doc_pipe.run(query="Physics Einstein", params={"Retriever": {"top_k": 3}}) + +# Generative QA +doc_pipe = GenerativeQAPipeline(generator=rag_generator, retriever=retriever) +res = doc_pipe.run(query="Physics Einstein", params={"Retriever": {"top_k": 3}}) + +# FAQ based QA +doc_pipe = FAQPipeline(retriever=retriever) +res = doc_pipe.run(query="How can I change my address?", params={"Retriever": {"top_k": 3}}) + +``` +So to migrate your QA system from the deprecated `Finder` to `ExtractiveQAPipeline` you'd need to: +``` +# 1. Change import +from haystack.pipeline import ExtractiveQAPipeline + +# 2. Replace the Finder +qa_pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever) + +# 3. Replace get_answers() with run() +res = qa_pipe.run(query="When was Kant born?", top_k_retriever=3, top_k_reader=5) + +# 4. Access your results from ["documents"] rather than ["answers"] +print(res["documents"]) +``` +See also the [Pipelines API documentation](/docs/latest/apipipelinesmd) for more details. + +We plan many more features around the new pipelines incl. parallelized execution, distributed execution, dry runs - so stay tuned ... 
+ diff --git a/docs/v1.3.0/_src/usage/usage/preprocessing.md b/docs/v1.3.0/_src/usage/usage/preprocessing.md new file mode 100644 index 0000000000..a285cc44ce --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/preprocessing.md @@ -0,0 +1,144 @@ + + +# Preprocessing + +Haystack includes a suite of tools to: + +* extract text from different file types, +* normalize white space +* split text into smaller pieces to optimize retrieval + +
+ +Check out our [preprocessing tutorial](/docs/latest/tutorial8md) if you'd like to start working with code examples already! + +
+ +These data preprocessing steps can have a big impact on the system's performance +and effective handling of data is key to getting the most out of Haystack. + +The Document Store expects its inputs to come in the following format. +The sections below will show you all the tools you'll need to ready your data for storing. + +```python +docs = [ + { + 'content': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +``` + +## File Conversion + +There is a range of different file converters in Haystack that +can extract text from files and cast them into the unified dictionary format shown above. +Haystack features support for txt, pdf and docx files and there is even a converter that leverages Apache Tika. +Please refer to [the API docs](/docs/latest/file_convertersmd) to see which converter best suits you. + +
+ +
+ + +
+ +```python +from haystack.file_converter import PDFToTextConverter + +converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["de","en"]) +doc = converter.convert(file_path=file, meta=None) +``` + +
+
+ +
+ + +
+ +```python +from haystack.file_converter import DocxToTextConverter + +converter = DocxToTextConverter(remove_numeric_tables=True, valid_languages=["de","en"]) +doc = converter.convert(file_path=file, meta=None) +``` + +
+
+ +
+ + +
+ + +Haystack also has a `convert_files_to_dicts()` utility function that will convert +all txt or pdf files in a given folder into this dictionary format. + +```python +from haystack.preprocessor.utils import convert_files_to_dicts + +docs = convert_files_to_dicts(dir_path=doc_dir) +``` + +
+
+ +
+ +## Web Crawler + +In Haystack, you will find a web crawler that will help you scrape text from websites and save it to file. +See the [API documentation](https://haystack.deepset.ai/docs/latest/apicrawlermd) for more details. + +```python +from haystack.connector import Crawler + +crawler = Crawler() +docs = crawler.crawl(urls=["https://haystack.deepset.ai/docs/latest/get_startedmd"], + output_dir="crawled_files", + filter_urls= ["haystack\.deepset\.ai\/docs\/"]) +``` + +## PreProcessor + +While each of the above conversion methods produces documents that are already in the format expected by the Document Store, +it is recommended that they are further processed in order to ensure optimal Retriever and Reader performance. +The `PreProcessor` takes one of the documents created by the converter as input, +performs various cleaning steps and splits them into multiple smaller documents. + +For suggestions on how best to split your documents, see [Optimization](/docs/latest/optimizationmd) + +```python +from haystack.preprocessor import PreProcessor + +doc = converter.convert(file_path=file, meta=None) +processor = PreProcessor( + clean_empty_lines=True, + clean_whitespace=True, + clean_header_footer=True, + split_by="word", + split_length=200, + split_respect_sentence_boundary=True, + split_overlap=0 +) +docs = processor.process(doc) +``` + +* `clean_empty_lines` will normalize 3 or more consecutive empty lines to just two empty lines +* `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text +* `clean_header_footer` will remove any long header or footer texts that are repeated on each page +* `split_by` determines what unit the document is split by: `'word'`, `'sentence'` or `'passage'` +* `split_length` sets a maximum number of `'word'`, `'sentence'` or `'passage'` units per output document +* `split_respect_sentence_boundary` ensures that document boundaries do not fall in the middle of sentences +* `split_overlap` sets 
the amount of overlap between two adjacent documents after a split. Setting this to a positive number essentially enables the sliding window approach. + diff --git a/docs/v1.3.0/_src/usage/usage/query_classifier.md b/docs/v1.3.0/_src/usage/usage/query_classifier.md new file mode 100644 index 0000000000..4673e914d4 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/query_classifier.md @@ -0,0 +1,188 @@ + + +# Query Classifier + +Queries come in all shapes and forms. A keyword-based search differs from a question posed in natural language. In Haystack, we can account for these differences by integrating a special node into our QA pipeline: the query classifier. + +A query classifier puts each incoming query into one of two predefined classes, and routes it to the appropriate section of the pipeline. +Haystack comes with classifiers to distinguish between the three most common query types (Keywords, Question, Statement) and allows two different types of models (SKlearn and Transformer). + +Using a query classifier can potentially yield the following benefits: + +* Getting better search results (e.g. by routing only proper questions to DPR / QA branches and not keyword queries) +* Less GPU costs (e.g. if 50% of your traffic is only keyword queries you could just use elastic here and save the GPU resources for the other 50% of traffic with semantic queries) + + +### Common Query types + +#### 1. Keyword Queries: +Such queries don't have semantic meaning, merely consist of keywords and the order of words does not matter: +* arya stark father +* jon snow country +* arya stark younger brothers + +#### 2. Questions (Interrogative Queries): +In such queries users ask a question in a complete, "natural" sentence. Regardless of the presence of "?" in the query the goal here is to detect the intent of the user whether any question is asked or not in the query: + +* who is the father of arya stark? 
+* which country was jon snow filmed in +* who are the younger brothers of arya stark? + +#### 3. Statements (Declarative Queries): +Such queries consist also of a regular, natural sentence with semantic relations between the words. However, they are rather a statement than a question: + +* Arya stark was a daughter of a lord. +* Show countries that Jon snow was filmed in. +* List all brothers of Arya. + +### Usage standalone: Try a Query Classifier +To test how a query classifier works before integrating it into a pipeline, you can run it just as an individual component: + +```python +from haystack.pipeline import TransformersQueryClassifier + +queries = ["Arya Stark father","Jon Snow UK", + "who is the father of arya stark?","Which country was jon snow filmed in?"] + +question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection") +# Or Sklearn based: + +for query in queries: + result = question_classifier.run(query=query) + if result[1] == "output_1": + category = "question" + else: + category = "keywords" + + print(f"Query: {query}, raw_output: {result}, class: {category}") + +# Returns: +# Query: Arya Stark father, raw_output: ({'query': 'Arya Stark father'}, 'output_2'), class: keywords +# Query: Jon Snow UK, raw_output: ({'query': 'Jon Snow UK'}, 'output_2'), class: keywords +# Query: who is the father of arya stark?, raw_output: ({'query': 'who is the father of arya stark?'}, 'output_1'), class: question +# Query: Which country was jon snow filmed in?, raw_output: ({'query': 'Which country was jon snow filmed in?'}, 'output_1'), class: question + +``` +Note how the node returns two objects: the query (e.g.'Arya Stark father') and the name of the output edge (e.g. "output_2"). This information can be leveraged in a pipeline for routing the query to the next node. 
+ +### Usage in a pipeline: Use different retrievers depending on the query type + +You can use a Query Classifier within a pipeline as a "decision node". Depending on the output of the classifier other parts of the pipeline will be executed. For example, we can route keyword queries to an ElasticsearchRetriever and semantic queries (questions/statements) to DPR. + +![image](https://user-images.githubusercontent.com/6007894/127831511-f55bad86-4b4f-4b54-9889-7bba37e475c6.png) + +Below, we define a pipeline with a `TransformersQueryClassifier` that routes questions/statements to the node's `output_1` and keyword queries to `output_2`. We leverage this structure in the pipeline by connecting the DPRRetriever to `QueryClassifier.output_1` and the ESRetriever to `QueryClassifier.output_2`. + +```python +from haystack.pipeline import TransformersQueryClassifier, Pipeline +from haystack.utils import print_answers + +query_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection") + +pipe = Pipeline() +pipe.add_node(component=query_classifier, name="QueryClassifier", inputs=["Query"]) +pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) +pipe.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) + +# Pass a question -> run DPR +res_1 = pipe.run( + query="Who is the father of Arya Stark?", + top_k_retriever=10 +) + +# Pass keywords -> run the ElasticsearchRetriever +res_2 = pipe.run( + query="arya stark father", + top_k_retriever=10 +) + +``` +### Usage in a pipeline: Run QA only on proper questions + +If you add QA to an existing search system, it can make sense to only use it for real questions that come in and keep a basic document search with elasticsearch for the remaining keyword queries. 
You can use a Query Classifier to build such a hybrid pipeline: + +```python +haystack.pipeline import TransformersQueryClassifier, Pipeline +from haystack.utils import print_answers + +query_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier") + +pipe = Pipeline() +pipe.add_node(component=query_classifier, name="QueryClassifier", inputs=["Query"]) +pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) +pipe.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) +pipe.add_node(component=reader, name="QAReader", inputs=["DPRRetriever"]) + +# Pass a question -> run DPR + QA -> return answers +res_1 = pipe.run( + query="Who is the father of Arya Stark?", + top_k_retriever=10 +) + +# Pass keywords -> run only ElasticsearchRetriever -> return docs +res_2 = pipe.run( + query="arya stark father", + top_k_retriever=10 +) + +``` + + +### Which models are available? +The transformer classifier is more accurate than the SkLearn classifier as it can use the context and order of words. However, it requires more memory and most probably GPU for faster inference. You can mitigate those down sides by choosing a very small transformer model. The default models we trained are using a mini BERT architecture which is only about `50 MB` in size and allows relatively fast inference on CPU. + +#### Transformers +Pass your own `Transformer` binary classification model from file/huggingface or use one of the following pretrained ones hosted on Huggingface: +1) Keywords vs. Questions/Statements (Default) + + ```python + TransformersQueryClassifier(model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection") + # output_1 => question/statement + # output_2 => keyword query + ``` + + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + +2) Questions vs. 
Statements + ```python + TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier") + # output_1 => question + # output_2 => statement + ``` + + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + + +#### Sklearn +Pass your own `Sklearn` binary classification model or use one of the following pretrained Gradient boosting models: + +1) Keywords vs. Questions/Statements (Default) + + ```python + SklearnQueryClassifier(query_classifier = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", + query_vectorizer = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle") + + # output_1 => question/statement + # output_2 => keyword query + ``` + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + +2) Questions vs. Statements + + ```python + SklearnQueryClassifier(query_classifier = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle", + query_vectorizer = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle") + + output_1 => question + output_2 => statement + ``` + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) diff --git a/docs/v1.3.0/_src/usage/usage/question_generator.md b/docs/v1.3.0/_src/usage/usage/question_generator.md new file mode 100644 index 0000000000..81ae731261 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/question_generator.md @@ -0,0 +1,45 @@ + + +# Question Generator + +
+ +**Running examples** + +Have a look at our [tutorial notebook](/docs/latest/tutorial13md) if you'd like to start trying out Question Generation straight away! + +
+ +The Question Generation module is used to generate SQuAD style questions on a given document. + +This module is useful when it comes to labelling in a new domain. It can be used to generate questions quickly for an +annotator to answer. If used in conjunction with a trained Reader model, you can automatically generate question answer +pairs. High impact annotations can then be created if a human annotator looks over these pairs and corrects the incorrect predictions. + +Question generation is also a good way to make large documents more navigable. Generated questions can +quickly give the user a sense of what information is contained within the document, thus acting as a kind of summarization. + +To initialize a question generator, simply call: + +```python +from haystack.question_generator import QuestionGenerator + +question_generator = QuestionGenerator() +``` + +This loads the [`valhalla/t5-base-e2e-qg`](https://huggingface.co/valhalla/t5-base-e2e-qg) model by default which is a T5 model trained on SQuAD for question generation. + +To run the node in isolation, simply use the `generate()` method: + +```python +result = question_generator.generate(text="Nirvana was an American rock band formed in Aberdeen, Washington in 1987.") +``` + +Otherwise, the node can be used in a pipeline where its `run()` method will be called. diff --git a/docs/v1.3.0/_src/usage/usage/ranker.md b/docs/v1.3.0/_src/usage/usage/ranker.md new file mode 100644 index 0000000000..e829dcfbfd --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/ranker.md @@ -0,0 +1,55 @@ + + +# Ranker + +There are pure "semantic document search" use cases that do not need question answering functionality but only document ranking. +While the [Retriever](/docs/latest/retrievermd) is a perfect fit for document retrieval, we can further improve its results with the Ranker. +For example, BM25 (sparse retriever) does not take into account semantics of the documents and the query but only their keywords. 
+The Ranker can re-rank the results of the retriever step by taking semantics into account. +Similar to the Reader, it is based on the latest language models. +Instead of returning answers, it returns documents in re-ranked order. + +Without a Ranker and its re-ranking step, the querying process is faster but the query results might be of lower quality. +If you want to do "semantic document search" instead of a question answering, try first with a Retriever only. +In case the semantic similarity of the query and the resulting documents is low, add a Ranker. + +Note that a Ranker needs to be initialised with a model trained on a text pair classification task. +You can train the model also with the train() method of the Ranker. +Alternatively, [this example](https://github.com/deepset-ai/FARM/blob/master/examples/text_pair_classification.py) shows how to train a text pair classification model in FARM. + + +## FARMRanker + +### Description + +The FARMRanker consists of a Transformer-based model for document re-ranking using the TextPairClassifier of [FARM](https://github.com/deepset-ai/FARM). +Given a text pair of query and passage, the TextPairClassifier either predicts label "1" if the pair is similar or label "0" if they are dissimilar (accompanied with a probability). +While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same. +With a FARMRanker, you can: +* Directly get predictions (re-ranked version of the supplied list of Document) via predict() if supplying a pre-trained model +* Take a plain language model (e.g. `bert-base-cased`) and train it for TextPairClassification via train() + +### Initialisation + +```python +from haystack.document_store import ElasticsearchDocumentStore +from haystack.retriever import ElasticsearchRetriever +from haystack.ranker import FARMRanker +from haystack import Pipeline + +document_store = ElasticsearchDocumentStore() +... 
+retriever = ElasticsearchRetriever(document_store) +ranker = FARMRanker(model_name_or_path="saved_models/roberta-base-asnq-binary") +... +p = Pipeline() +p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) +p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) +``` diff --git a/docs/v1.3.0/_src/usage/usage/reader.md b/docs/v1.3.0/_src/usage/usage/reader.md new file mode 100644 index 0000000000..2cd26d352a --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/reader.md @@ -0,0 +1,386 @@ + + +# Reader + +The Reader, also known as Open-Domain QA systems in Machine Learning speak, +is the core component that enables Haystack to find the answers that you need. +Haystack’s Readers are: + + +* built on the latest transformer based language models + + +* strong in their grasp of semantics + + +* sensitive to syntactic structure + + +* state-of-the-art in QA tasks like SQuAD and Natural Questions + +
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +model = "deepset/roberta-base-squad2" +reader = FARMReader(model, use_gpu=True) +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +model = "deepset/roberta-base-squad2" +reader = TransformersReader(model, use_gpu=1) +``` + +
+
+ +
+ +While these models can work on CPU, it is recommended that they are run using GPUs to keep query times low. + +## Choosing the Right Model + +In Haystack, you can start using pretrained QA models simply by providing its HuggingFace Model Hub name to the Reader. +The loading of model weights is handled by Haystack, +and you have the option of using the QA pipeline from deepset FARM or HuggingFace Transformers (see FARM vs Transformers for details). + +Currently, there are a lot of different models out there and it can be rather overwhelming trying to pick the one that fits your use case. +To get you started, we have a few recommendations for you to try out. + +
+ +
+ + +
+ +
+ +
+ + +
+ +**An optimised variant of BERT and a great starting point.** + +```python +from haystack.reader import FARMReader + +reader = FARMReader("deepset/roberta-base-squad2") +``` + +* **Pro**: Strong all round model + +* **Con**: There are other models that are either faster or more accurate + +
+
+ +
+ + +
+ +**A cleverly distilled model that sacrifices a little accuracy for speed.** + +```python +from haystack.reader import FARMReader + +reader = FARMReader("deepset/minilm-uncased-squad2") +``` + +* **Pro**: Inference speed up to 50% faster than BERT base + +* **Con**: Still doesn’t match the best base sized models in accuracy + +
+
+ +
+ + +
+ +**Large, powerful, SotA model.** + +```python +from haystack.reader import FARMReader + +reader = FARMReader("ahotrod/albert_xxlargev1_squad2_512") +``` + +* **Pro**: Better accuracy than any other open source model in QA + +* **Con**: The computational power needed makes it impractical for most use cases + +
+
+ +
+ +
+
+ +
+ + +
+ +
+ +
+ + +
+ +**An optimised variant of BERT and a great starting point.** + +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("deepset/roberta-base-squad2") +``` + +* **Pro**: Strong all round model + +* **Con**: There are other models that are either faster or more accurate + +
+
+ +
+ + +
+ +**A cleverly distilled model that sacrifices a little accuracy for speed.** + +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("deepset/minilm-uncased-squad2") +``` + +* **Pro**: Inference speed up to 50% faster than BERT base + +* **Con**: Still doesn’t match the best base sized models in accuracy + +
+
+ +
+ + +
+ +**Large, powerful, SotA model.** + +```python +from haystack.reader import TransformersReader + +reader = TransformersReader("ahotrod/albert_xxlargev1_squad2_512") +``` + +* **Pro**: Better accuracy than any other open source model in QA + +* **Con**: The computational power needed makes it impractical for most use cases + +
+
+ +
+ +
+
+ +
+ +
+ +**Recommendations:** + +**All-rounder**: In the class of base sized models trained on SQuAD, **RoBERTa** has shown better performance than BERT +and can be capably handled by any machine equipped with a single NVidia V100 GPU. +We recommend this as the starting point for anyone wanting to create a performant and computationally reasonable instance of Haystack. + +**Built for Speed**: If speed and GPU memory are more of a priority to you than accuracy, +you should try the MiniLM model. +It is a smaller model that is trained to mimic larger models through the distillation process, +and it outperforms the BERT base on SQuAD even though it is about 40% smaller. + + +**State of the Art Accuracy**: For most, **ALBERT XXL** will be too large to feasibly work with. +But if performance is your sole concern, and you have the computational resources, +you might like to try ALBERT XXL which has set SoTA performance on SQuAD 2.0. + + + +
+ + +## Confidence Scores + +When printing the full results of a Reader, +you will see that each prediction is accompanied +by a value in the range of 0 to 1 reflecting the model's confidence in that prediction. + +In the output of `print_answers()`, you will find the model confidence in a dictionary key called `confidence`. + +```python +from haystack.utils import print_answers + +print_answers(prediction, details="all") +``` + +```python +{ + 'answers': [ + { 'answer': 'Eddard', + 'context': 's Nymeria after a legendary warrior queen. ' + 'She travels with her father, Eddard, to ' + "King's Landing when he is made Hand of the " + 'King. Before she leaves,', + 'confidence': 0.9899835586547852, + ... + }, + ] +} +``` + +In order to align this probability score with the model's accuracy, finetuning needs to be performed +on a specific dataset. +To this end, the reader has a method `calibrate_confidence_scores(document_store, device, label_index, doc_index, label_origin)`. +The parameters of this method are the same as for the `eval()` method because the calibration of confidence scores is performed on a dataset that comes with gold labels. +The calibration calls the `eval()` method internally and therefore needs a DocumentStore containing labeled questions and evaluation documents. + +Have a look at this [FARM tutorial](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering_confidence.py) +to see how to compare calibrated confidence scores with uncalibrated confidence scores within FARM. +Note that a finetuned confidence score is specific to the domain that it is finetuned on. +There is no guarantee that this performance can transfer to a new domain. + +Having a confidence score is particularly useful in cases where you need Haystack to work with a certain accuracy threshold. +Many of our users have built systems where predictions below a certain confidence value are routed +on to a fallback system. 
+ + + +## Deeper Dive: FARM vs Transformers + +Apart from the **model weights**, Haystack Readers contain all the components found in end-to-end open domain QA systems. +This includes **tokenization**, **embedding computation**, **span prediction** and **candidate aggregation**. +While the handling of model weights is the same between the FARM and Transformers libraries, their QA pipelines differ in some ways. +The major points are: + + +* The **TransformersReader** will sometimes predict the same span twice while duplicates are removed in the **FARMReader** + + +* The **FARMReader** currently uses the tokenizers from the HuggingFace Transformers library while the **TransformersReader** uses the tokenizers from the HuggingFace Tokenizers library + + +* Start and end logits are normalized per passage and multiplied in the **TransformersReader** while they are summed and not normalised in the **FARMReader** + +If you’re interested in the finer details of these points, have a look at [this](https://github.com/deepset-ai/haystack/issues/248#issuecomment-661977237) GitHub comment. + +We see value in maintaining both kinds of Readers since Transformers is a very familiar library to many of Haystack’s users +but we at deepset can more easily update and optimise the FARM pipeline for speed and performance. + + +Haystack also has a close integration with FARM which means that you can further fine-tune your Readers on labelled data using a FARMReader. +See our tutorials for an end-to-end example or below for a shortened example. 
+ +```python +from haystack.reader import FARMReader + +# Initialise Reader +model = "deepset/roberta-base-squad2" +reader = FARMReader(model) + +# Perform finetuning +train_data = "PATH/TO_YOUR/TRAIN_DATA" +train_filename = "train.json" +save_dir = "finetuned_model" +reader.train(train_data, train_filename, save_dir=save_dir) + +# Load +finetuned_reader = FARMReader(save_dir) +``` + +## Deeper Dive: From Language Model to Haystack Reader + +Language models form the core of most modern NLP systems and that includes the Readers in Haystack. +They build a general understanding of language when performing training tasks such as Masked Language Modeling or Replaced Token Detection +on large amounts of text. +Well trained language models capture the word distribution in one or more languages +but more importantly, convert input text into a set of word vectors that capture elements of syntax and semantics. + +In order to convert a language model into a Reader model, it needs first to be trained on a Question Answering dataset. +To do so requires the addition of a question answering prediction head on top of the language model. +The task can be thought of as a token classification task where every input token is assigned a probability of being +either the start or end token of the correct answer. +In cases where the answer is not contained within the passage, the prediction head is also expected to return a `no_answer` prediction. + + +Since language models are limited in the number of tokens which they can process in a single forward pass, +a sliding window mechanism is implemented to handle variable length documents. +This functions by slicing the document into overlapping passages of (approximately) `max_seq_length` +that are each offset by `doc_stride` number of tokens. +These can be set when the Reader is initialized. + +
+ +
+ + +
+ +```python +from haystack.reader import FARMReader + +reader = FARMReader(... max_seq_len=384, doc_stride=128 ...) +``` + +
+
+ +
+ + +
+ +```python +from haystack.reader import TransformersReader + +reader = TransformersReader(... max_seq_len=384, doc_stride=128 ...) +``` + +
+
+ +
+ +Predictions are made on each individual passage and the process of aggregation picks the best candidates across all passages. +If you’d like to learn more about what is happening behind the scenes, have a look at [this](https://medium.com/deepset-ai/modern-question-answering-systems-explained-4d0913744097) article. diff --git a/docs/v1.3.0/_src/usage/usage/retriever.md b/docs/v1.3.0/_src/usage/usage/retriever.md new file mode 100644 index 0000000000..94bc1401fe --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/retriever.md @@ -0,0 +1,281 @@ + + +# Retriever + +The Retriever is a lightweight filter that can quickly go through the full document store and pass on a set of candidate documents that are relevant to the query. +When used in combination with a Reader, it is a tool for sifting out the obvious negative cases, saving the Reader from doing more work than it needs to and speeding up the querying process. + +
+ +**Recommendations** + +* BM25 (sparse) + +* Dense Passage Retrieval (dense) + +
+ + + +Note that not all Retrievers can be paired with every DocumentStore. +Here are the combinations which are supported: + +| | Memory | Elasticsearch | SQL | FAISS | Milvus | +| --- | --- | --- | ---- | ---- | ---- | +| BM25 | N | Y | N | N | N | +| TF-IDF | Y | Y | Y | N | N | +| Embedding | Y | Y | N | Y | Y | +| DPR | Y | Y | N | Y | Y | + +See [Optimization](/docs/latest/optimizationmd) for suggestions on how to choose top-k values. + + +## TF-IDF + +### Description + +TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions: + + +* documents that have more lexical overlap with the query are more likely to be relevant + +* words that occur in fewer documents are more significant than words that occur in many documents + +Given a query, a tf-idf score is computed for each document as follows: + +```python +score = tf * idf +``` + +Where: + + +* `tf` is how many times words in the query occur in that document. + + +* `idf` is the inverse of the fraction of documents containing the word. + +In practice, both terms are usually log normalised. + +### Initialisation + +```python +from haystack.document_store import InMemoryDocumentStore +from haystack.retriever.sparse import TfidfRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = InMemoryDocumentStore() +... +retriever = TfidfRetriever(document_store) +... +p = ExtractiveQAPipeline(reader, retriever) +``` + +## BM25 (Recommended) + +### Description + +BM25 is a variant of TF-IDF that we recommend you use if you are looking for a retrieval method that does not need a neural network for indexing. 
+It improves upon its predecessor in two main aspects: + + +* It saturates `tf` after a set number of occurrences of the given term in the document + + +* It normalises by document length so that short documents are favoured over long documents if they have the same amount of word overlap with the query + +### Initialisation + +```python +from haystack.document_store import ElasticsearchDocumentStore +from haystack.retriever import ElasticsearchRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = ElasticsearchDocumentStore() +... +retriever = ElasticsearchRetriever(document_store) +... +p = ExtractiveQAPipeline(reader, retriever) +``` + +See [this](https://www.elastic.co/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables) blog post for more details about the algorithm. + + +## Dense Passage Retrieval (Recommended) + +### Description + +[Dense Passage Retrieval](https://arxiv.org/abs/2004.04906) is a highly performant retrieval method that calculates relevance using dense representations. +Key features: + + +* One BERT base model to encode documents + + +* One BERT base model to encode queries + + +* Ranking of documents done by dot product similarity between query and document embeddings + + +Indexing using DPR is comparatively expensive in terms of required computation since all documents in the database need to be processed through the transformer. +The embeddings that are created in this step can be stored in FAISS, a database optimized for vector similarity. +DPR can also work with the ElasticsearchDocumentStore or the InMemoryDocumentStore. + +There are two design decisions that have made DPR particularly performant. 
+ + +* Separate encoders for document and query help since queries are much shorter than documents + + +* Training with ‘In-batch negatives’ (gold labels are treated as negative examples for other samples in the same batch) is highly efficient + +In Haystack, you can simply download the pretrained encoders needed to start using DPR. +If you’d like to learn how to set up a DPR based system, have a look at the [tutorial](/docs/latest/tutorial6md)! + +### Initialisation + +
+ +**Tip** + +When using DPR, it is recommended that you use the dot product similarity function since that is how it is trained. +To do so, simply provide `similarity="dot_product"` when initializing the DocumentStore +as is done in the code example below. + +
+ +```python +from haystack.document_store import FAISSDocumentStore +from haystack.retriever import DensePassageRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = FAISSDocumentStore(similarity="dot_product") +... +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base" +) +... +finder = ExtractiveQAPipeline(reader, retriever) +``` + +
+ +**Training DPR:** Haystack supports training of your own DPR model! Check out the [tutorial](/docs/latest/tutorial9md) to see how this is done! + +
+ + + +## Embedding Retrieval + +### Description + +In Haystack, you also have the option of using a single transformer model to encode document and query. +One style of model that is suited to this kind of retrieval is that of [Sentence Transformers](https://github.com/UKPLab/sentence-transformers). +These models are trained in Siamese Networks and use triplet loss such that they learn to embed similar sentences near to each other in a shared embedding space. + +They are particularly suited to cases where your query input is similar in style to that of the documents in your database +i.e. when you are searching for the most similar documents. +This is not inherently suited to query based search where the length, language and format of the query usually significantly differs from the searched for text. + +
+ +**Tip** + +When using Sentence Transformer models, we recommend that you use a cosine similarity function. +To do so, simply provide `similarity="cosine"` when initializing the DocumentStore +as is done in the code example below. + +
+ +### Initialisation + +```python +from haystack.document_store import ElasticsearchDocumentStore +from haystack.retriever import EmbeddingRetriever +from haystack.pipeline import ExtractiveQAPipeline + +document_store = ElasticsearchDocumentStore(similarity="cosine") +... +retriever = EmbeddingRetriever(document_store=document_store, + embedding_model="deepset/sentence_bert") +... +p = ExtractiveQAPipeline(reader, retriever) +``` + +## Deeper Dive: Dense vs Sparse + +Broadly speaking, retrieval methods can be split into two categories: **dense** and **sparse**. + +**Sparse** methods, like TF-IDF and BM25, operate by looking for shared keywords between the document and query. +They are: + + +* simple but effective + + +* don’t need to be trained + + +* work on any language + +More recently, **dense** approaches such as Dense Passage Retrieval (DPR) have shown even better performance than their sparse counterparts. +These methods embed both document and query into a shared embedding space using deep neural networks +and the top candidates are the nearest neighbour documents to the query. +They are: + + +* powerful but computationally more expensive especially during indexing + + +* trained using labelled datasets + + +* language specific + +### Qualitative Differences + +There are also some qualitative differences between these two types. +For example, sparse methods treat text as a bag-of-words meaning that they **do not take word order and syntax into account**, +while the latest generation of dense methods use transformer based encoders +which are designed to be **sensitive** to these factors. + +Also dense methods are very capable of building strong semantic representations of text, +but they **struggle when encountering out-of-vocabulary** words such as new names. +By contrast, sparse methods don’t need to learn representations of words, +they only care about whether they are present or absent in the text. 
+As such, **they handle out-of-vocabulary words with no problem**. + + +### Indexing + +Dense methods perform indexing by processing all the documents through a neural network and storing the resulting vectors. +This is a much more expensive operation than the creation of the inverted-index in sparse methods +and will require significant computational power and time. + + +### Terminology + + + +The terms **dense** and **sparse** refer to the representations that the algorithms build for each document and query. +**Sparse** methods characterise texts using vectors with one dimension corresponding to each word in the vocabulary. +Dimensions will be zero if the word is absent and non-zero if it is present. +Since most documents contain only a small subset of the full vocabulary, +these vectors are considered sparse since non-zero values are few and far between. + +**Dense** methods, by contrast, pass text as input into neural network encoders +and represent text in a vector of a manually defined size (usually 768). +Though individual dimensions are not mapped to any corresponding vocabulary or linguistic feature, +each dimension encodes some information about the text. +There are rarely 0s in these vectors hence their relative density. diff --git a/docs/v1.3.0/_src/usage/usage/roadmap.md b/docs/v1.3.0/_src/usage/usage/roadmap.md new file mode 100644 index 0000000000..47835bf4b3 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/roadmap.md @@ -0,0 +1,45 @@ + + +# Open roadmap + +We believe open-source is more than open source code. It's a lot about people, collaboration, transparency and trust. +Therefore, we decided to be as open as possible with our roadmap and sprint planning. +In fact, you can see all of it in real-time on GitHub. +We hope this helps to clarify the direction of the open-source project and inspires discussions in the community. 
+ +## How to access it + +### Add Zenhub Plugin +We chose Zenhub, as it allows a close integration with GitHub and real-time sharing of roadmaps and sprints. +Once you have installed the browser plugin below, you will see additional tabs and information on the Haystack GitHub page. + +Zenhub Plugin: https://www.zenhub.com/extension + +### Roadmap +_Zenhub Tab -> Roadmap (left menu)_ + +Here you can find our most recent roadmap with the **high-level projects** that are planned for the upcoming quarters. +We update it regularly and refine the projects as they come closer in time. + +![image](../../img/zenhub_roadmap.png) + +### Board +_Zenhub Tab -> Board (left menu)_ + +If you are interested in the **operational tasks** and their status, you can find our agile board here. + +![image](../../img/zenhub_board.png) +### Additional issue details & Releases +_Right panel in regular GitHub issues_ + +With Zenhub you can also see some additional tags in every GitHub issue. +For those of you who wonder about the next release date: We aim for releases every ~ 4 weeks and will tag, a bit in advance, the issues that need to be finished before each release. + +![image](../../img/zenhub_issue.png) diff --git a/docs/v1.3.0/_src/usage/usage/summarizer.md b/docs/v1.3.0/_src/usage/usage/summarizer.md new file mode 100644 index 0000000000..93bde9cdaf --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/summarizer.md @@ -0,0 +1,56 @@ + + +# Summarizer + +Retrievers are excellent at returning a set of candidate documents, +but you might not have the time to read through them all. +Haystack's Summarizer is here to help you make sense of the documents at a glance. + +There is a full integration with Huggingface Transformers and using any of their summarization +models is as simple as providing the model name. +See the up-to-date list of available models [here](https://huggingface.co/models?filter=summarization). 
+By default, the Google [Pegasus](https://ai.googleblog.com/2020/06/pegasus-state-of-art-model-for.html) model is loaded. + +```python +from haystack.summarizer import TransformersSummarizer +from haystack.schema import Document + +docs = [Document("PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.\ + The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by\ + the shutoffs which were expected to last through at least midday tomorrow.")] + +summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum") +summary = summarizer.predict(documents=docs, generate_single_summary=True) +``` + +The contents of `summary` should contain both the summarization and the original document text. + +```python +[ + { + "text": "California's largest electricity provider has turned off power to hundreds of thousands of customers.", + "meta": { + "context": "PGE stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions." + }, + ... + } +] +``` + +The summarizer can also function as a node in a pipeline. + +```python +from haystack.pipeline import Pipeline + +p = Pipeline() +p.add_node(component=retriever, name="ESRetriever1", inputs=["Query"]) +p.add_node(component=summarizer, name="Summarizer", inputs=["ESRetriever1"]) +res = p.run(query="What did Einstein work on?", top_k_retriever=10) +``` diff --git a/docs/v1.3.0/_src/usage/usage/terms.md b/docs/v1.3.0/_src/usage/usage/terms.md new file mode 100644 index 0000000000..fb77485096 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/terms.md @@ -0,0 +1,65 @@ + + +# Glossary + +**BERT** - A popular, transformer based language model which has been improved upon but is still considered a common benchmark. + +**Dense** - Vectors that contain many non-zero values are considered dense. +Retrieval methods can also be called dense if they create dense vector representations of documents. 
+ +**Document** - A Document in Haystack refers to the individual pieces of text that are stored in the DocumentStore. +Multiple Documents might originally come from the one file. +It is ultimately up to you how to divide up your corpus into Documents. + +**Document Store** - The component in Haystack that stores the text documents and their metadata. +Can have a variety of backends such as Elasticsearch, SQL or FAISS. + +**FARM** - An open-source transfer learning [framework](https://github.com/deepset-ai/FARM) by deepset. +FARM’s question answering models are used in Haystack’s Readers. + +**Indexing** - To store data in a database in a way that optimises retrieval time. +The exact steps involved in indexing depend on what kind of retrieval method is chosen. + +**Language Model** - The component in an NLP model that stores general language understanding, but no task specific knowledge. + +**Model Hub** - The [repository](https://huggingface.co/models) set up by HuggingFace where trained models can be saved to and loaded from. +With Haystack, you can directly load and use any question answering model found on the model hub. + +**Neural Network** - A machine learning architecture composed of artificial neurons that learn a task when exposed to labelled training data. + +**Prediction Head** - The modelling component that adapts the general knowledge of the language model for a specific task. +In question answering models (and hence in Haystack Readers), this is usually a single layer neural network. + +**Querying** - The task of returning relevant documents from a database. + +**Question Answering (QA)** - A popular task in the world of NLP where systems have to find answers to questions. +The term is generally used to refer to extractive question answering, +where a system has to find the minimal text span in a given document that contains the answer to the question. +Note however, that it may also refer to abstractive question answering or FAQ matching. 
+ +**Reader** - The component in Haystack that does the closest reading of a document to extract +the exact text which answers a question. +It is, at its core, a trained Question Answering model. + +**Retriever** - A lightweight filter that selects only the most relevant documents for the Reader to further process. + +**Semantic Search** - A style of search that relies not on the matching of exact string forms +but on the similarity of meaning between a query and a piece of text. + +**Sparse** - Vectors that are composed primarily of zeros are called sparse. +Retrieval methods are also considered sparse if they build sparse vector representations of documents. + +**SQuAD** - The [Stanford Question Answering Dataset](https://rajpurkar.github.io/SQuAD-explorer/) is the defacto standard QA dataset. +The documents are paragraphs from Wikipedia and the question / answer pairs are created by human annotators. + +**Transformers** - Originally refers to the deep learning architecture that is composed of stacked self-attention layers +(first conceptualised [here](https://arxiv.org/pdf/1706.03762.pdf)). +Can also refer to HuggingFace’s [repository](https://github.com/huggingface/transformers) +which contains implementations of popular model architectures. diff --git a/docs/v1.3.0/_src/usage/usage/translator.md b/docs/v1.3.0/_src/usage/usage/translator.md new file mode 100644 index 0000000000..409e30148d --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/translator.md @@ -0,0 +1,61 @@ + + +# Translator + +Texts come in different languages. This is not different for search and there are plenty of options to deal with it. +One of them is actually to translate the incoming query, the documents or the search results. + +Let's imagine you have an English corpus of technical docs, but the mother tongue of many of your users is French. +You can use a Translator node in your pipeline to +1. Translate the incoming query from French to English +2. 
Search in your English corpus for the right document / answer +3. Translate the results back from English to French + +
+ +**Example (Stand-alone Translator)** + +You can use the Translator component directly to translate your query or document(s): +```python +from haystack.schema import Document +from haystack.translator import TransformersTranslator + +DOCS = [ + Document( + text="""Heinz von Foerster was an Austrian American scientist + combining physics and philosophy, and widely attributed + as the originator of Second-order cybernetics.""" + ) + ] +translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-fr") +res = translator.translate(documents=DOCS, query=None) +``` + +**Example (Wrapping another Pipeline)** + +You can also wrap one of your existing pipelines and "add" the translation nodes at the beginning and at the end of your pipeline. +For example, let's translate the incoming query from French to English, then do our document retrieval and then translate the results back from English to French: + +```python +from haystack.pipeline import TranslationWrapperPipeline, DocumentSearchPipeline +from haystack.translator import TransformersTranslator + +pipeline = DocumentSearchPipeline(retriever=my_dpr_retriever) + +in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-fr-en") +out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-fr") + +pipeline_with_translation = TranslationWrapperPipeline(input_translator=in_translator, + output_translator=out_translator, + pipeline=pipeline) +``` + + +
diff --git a/docs/v1.3.0/_src/usage/usage/use_cases.md b/docs/v1.3.0/_src/usage/usage/use_cases.md new file mode 100644 index 0000000000..49311328c7 --- /dev/null +++ b/docs/v1.3.0/_src/usage/usage/use_cases.md @@ -0,0 +1,69 @@ + + + +# Use cases + +## Semantic Search System + +Take the leap from using keyword search on your own documents to semantic search with Haystack. + + +* Store your documents in the database of your choice (Elasticsearch, SQL, in memory, FAISS) + + +* Perform question driven queries. + +Expect to see results that highlight the very sentence that contains the answer to your question. +Thanks to the power of Transformer based language models, results are chosen based on compatibility in meaning +rather than lexical overlap. + + + +![image](../../img/search.png) + +## Information Extractor + +Automate the extraction of relevant information from a set of documents that pertain to the same topics but for different entities. + +Haystack can: + + +* Apply a set of standard questions to each document in a store + + +* Return a NO_ANSWER if a given document does not contain the answer to a question + +Say you have the financial reports for different companies over different years. +You can gather a set of standard questions which are applicable to each financial report, +like *what is the revenue forecast for 2020?* or *what are the main sources of income?*. +Haystack will try to find an answer for each question within each document! + +We’ve seen this style of application be particularly effective in the sphere of finance and patent law +but we see a lot of potential in using this to gain a better overview of academic papers and internal business documents. + + +## FAQ Style Question Answering + +Leverage existing FAQ documents and semantic similarity search to answer new incoming questions. 
+The workflow is as follows: + + +* Store a set of FAQ documents in Haystack + + +* The user presents a new question + + +* Haystack will find the closest match to the new question in the FAQ documents + + +* The user will be presented with the most similar Question Answer pair + +Haystack’s flexibility allows you to give new users more dynamic access to your existing documentation. diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index f66e9c3791..6f44d024f1 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -24,7 +24,7 @@ try: from numba import njit # pylint: disable=import-error except (ImportError, ModuleNotFoundError): - logger.info("Numba not found, replacing njit() with no-op implementation. " "Enable it with 'pip install numba'.") + logger.info("Numba not found, replacing njit() with no-op implementation. Enable it with 'pip install numba'.") def njit(f): return f @@ -47,6 +47,7 @@ def run(self, sparql_query: str, index: Optional[str] = None, headers: Optional[ output = {"sparql_result": result} return output, "output_1" + @abstractmethod def query(self, sparql_query: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None): raise NotImplementedError diff --git a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json index 064b817925..921e8a1ea0 100644 --- a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json +++ b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-1.2.1rc0.schema.json", + "$id": "https://haystack.deepset.ai/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json", "title": "Haystack Pipeline", "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. 
For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", "type": "object", @@ -12,6 +12,9 @@ "oneOf": [ { "const": "1.2.1rc0" + }, + { + "const": "1.3.0" } ] }, @@ -45,6 +48,9 @@ { "$ref": "#/definitions/OpenSearchDocumentStoreComponent" }, + { + "$ref": "#/definitions/PineconeDocumentStoreComponent" + }, { "$ref": "#/definitions/SQLDocumentStoreComponent" }, @@ -901,10 +907,6 @@ "title": "Parameters", "type": "object", "properties": { - "host": { - "title": "Host", - "default": "https://admin:admin@localhost:9200/" - }, "similarity": { "title": "Similarity", "default": "cosine" @@ -969,6 +971,102 @@ ], "additionalProperties": false }, + "PineconeDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PineconeDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "environment": { + "title": "Environment", + "default": "us-west1-gcp", + "type": "string" + }, + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///pinecone_document_store.db", + "type": "string" + }, + "pinecone_index": { + "title": "Pinecone Index", + "type": "string", + "default": null + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "replicas": { + "title": "Replicas", + "default": 1, + "type": "integer" + }, + "shards": { + "title": "Shards", + 
"default": 1, + "type": "integer" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + } + }, + "required": [ + "api_key" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "SQLDocumentStoreComponent": { "type": "object", "properties": { @@ -1399,9 +1497,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, @@ -1670,9 +1765,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, @@ -2752,9 +2844,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, @@ -3097,9 +3186,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, diff --git a/haystack/json-schemas/haystack-pipeline-unstable.schema.json b/haystack/json-schemas/haystack-pipeline-unstable.schema.json index 4f49f36310..2f3a958ffb 100644 --- a/haystack/json-schemas/haystack-pipeline-unstable.schema.json +++ b/haystack/json-schemas/haystack-pipeline-unstable.schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://haystack.deepset.ai/json-schemas/haystack-pipeline-unstable.schema.json", + "$id": "https://haystack.deepset.ai/haystack/json-schemas/haystack-pipeline-unstable.schema.json", "title": "Haystack Pipeline", "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. 
For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", "type": "object", @@ -15,6 +15,9 @@ }, { "const": "1.2.1rc0" + }, + { + "const": "1.3.0" } ] }, @@ -48,6 +51,9 @@ { "$ref": "#/definitions/OpenSearchDocumentStoreComponent" }, + { + "$ref": "#/definitions/PineconeDocumentStoreComponent" + }, { "$ref": "#/definitions/SQLDocumentStoreComponent" }, @@ -904,10 +910,6 @@ "title": "Parameters", "type": "object", "properties": { - "host": { - "title": "Host", - "default": "https://admin:admin@localhost:9200/" - }, "similarity": { "title": "Similarity", "default": "cosine" @@ -972,6 +974,102 @@ ], "additionalProperties": false }, + "PineconeDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PineconeDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "environment": { + "title": "Environment", + "default": "us-west1-gcp", + "type": "string" + }, + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///pinecone_document_store.db", + "type": "string" + }, + "pinecone_index": { + "title": "Pinecone Index", + "type": "string", + "default": null + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "replicas": { + "title": "Replicas", + "default": 1, + "type": "integer" + }, + "shards": { + "title": "Shards", + "default": 1, + 
"type": "integer" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + } + }, + "required": [ + "api_key" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, "SQLDocumentStoreComponent": { "type": "object", "properties": { @@ -1402,9 +1500,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, @@ -1673,9 +1768,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, @@ -2755,9 +2847,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, @@ -3100,9 +3189,6 @@ "type": "array", "items": { "anyOf": [ - { - "type": "integer" - }, { "type": "string" }, diff --git a/haystack/json-schemas/haystack-pipeline.schema.json b/haystack/json-schemas/haystack-pipeline.schema.json index bc868e97ee..f8e15aecb4 100644 --- a/haystack/json-schemas/haystack-pipeline.schema.json +++ b/haystack/json-schemas/haystack-pipeline.schema.json @@ -55,6 +55,9 @@ "oneOf": [ { "const": "1.2.1rc0" + }, + { + "const": "1.3.0" } ] } diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py index e9a880476a..f02e5862f1 100644 --- a/haystack/nodes/_json_schema.py +++ b/haystack/nodes/_json_schema.py @@ -360,9 +360,7 @@ def new_version_entry(version): def update_json_schema( - update_index: bool, - destination_path: Path = JSON_SCHEMAS_PATH, - index_path: Path = JSON_SCHEMAS_PATH / "haystack-pipeline.schema.json", + update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: Path = "haystack-pipeline.schema.json" ): # Locate the latest 
schema's path latest_schema_path = destination_path / Path( @@ -426,23 +424,24 @@ def update_json_schema( # Update the JSON schema index too if update_index: - index = load(index_path) + index = load(destination_path / index_name) index["oneOf"][-1]["allOf"][0]["properties"]["version"]["oneOf"] = supported_versions_block - dump(index, index_path) + dump(index, destination_path / index_name) # Dump the new schema file new_schema["$id"] = f"{SCHEMA_URL}{filename}" unstable_versions_block = [{"const": haystack_version}] new_schema["properties"]["version"]["oneOf"] = [{"const": haystack_version}] dump(new_schema, destination_path / filename) + logger.info(f"Schema saved in {destination_path / filename}") # Update schema index with a whole new entry if update_index: - index = load(index_path) + index = load(destination_path / index_name) new_entry = new_version_entry(haystack_version) if all(new_entry != entry for entry in index["oneOf"]): index["oneOf"].append(new_version_entry(haystack_version)) - dump(index, index_path) + dump(index, destination_path / index_name) # If the two schemas are compatible, no need to write a new one: # Just add the new version to the list of versions supported by @@ -479,12 +478,13 @@ def update_json_schema( unstable_versions_block = supported_versions_block latest_schema["properties"]["version"]["oneOf"] = supported_versions_block dump(latest_schema, latest_schema_path) + logger.info(f"Schema updated in {destination_path / latest_schema_path}") # Update the JSON schema index too if update_index: - index = load(index_path) + index = load(destination_path / index_name) index["oneOf"][-1]["allOf"][0]["properties"]["version"]["oneOf"] = supported_versions_block - dump(index, index_path) + dump(index, destination_path / index_name) # Update the unstable schema (for tests and internal use). 
unstable_filename = "haystack-pipeline-unstable.schema.json" @@ -492,3 +492,4 @@ def update_json_schema( unstable_schema["$id"] = f"{SCHEMA_URL}{unstable_filename}" unstable_schema["properties"]["version"]["oneOf"] = [{"const": "unstable"}] + unstable_versions_block dump(unstable_schema, destination_path / unstable_filename) + logger.info(f"Unstable schema saved in {destination_path / unstable_filename}")