-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit d0b3400
Showing
24 changed files
with
3,488 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
[flake8] | ||
exclude = | ||
venv | ||
.venv | ||
__pycache__ | ||
notebooks | ||
data | ||
# Recommend matching the black line length (default 88), | ||
# rather than using the flake8 default of 79: | ||
max-line-length = 88 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
name: Push Event Workflow | ||
|
||
on: push | ||
|
||
jobs: | ||
unit-testing: | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- name: Checkout code | ||
uses: actions/checkout@v2 | ||
|
||
- name: Install Package | ||
run: pip install flake8 | ||
|
||
- name: Run tests | ||
run: flake8 insight_engine/embeddings.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
# Local data | ||
data/local_data/ | ||
|
||
# Secrets | ||
.streamlit/secrets.toml | ||
|
||
# VSCode | ||
.vscode/ | ||
|
||
# TODO | ||
TODO.md | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
*.py,cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
cover/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
.webassets-cache | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
.pybuilder/ | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pdm | ||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. | ||
#pdm.lock | ||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it | ||
# in version control. | ||
# https://pdm.fming.dev/#use-with-ide | ||
.pdm.toml | ||
|
||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm | ||
__pypackages__/ | ||
|
||
# Celery stuff | ||
celerybeat-schedule | ||
celerybeat.pid | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
# pytype static type analyzer | ||
.pytype/ | ||
|
||
# Cython debug symbols | ||
cython_debug/ | ||
|
||
# PyCharm | ||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can | ||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore | ||
# and can be added to the global gitignore or merged into this file. For a more nuclear | ||
# option (not recommended) you can uncomment the following to ignore the entire idea folder. | ||
#.idea/ | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[server] | ||
maxUploadSize = 15 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2023 Glory Olusola | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
<h1 align="center"> | ||
📖Insight Engine | ||
</h1> | ||
|
||
Accurate answers and instant citations for your documents. | ||
|
||
## 🔧 Features | ||
|
||
- Upload documents 📁(PDF, DOCX, TXT) and get answers to questions about them. | ||
- Cite sources📚 for the answers, with excerpts from the text. | ||
|
||
## 💻 Running Locally | ||
|
||
1. Clone the repository📂 | ||
|
||
```bash | ||
git clone https://github.com/mmz-001/insight_engine | ||
cd insight_engine | ||
``` | ||
|
||
2. Install dependencies with [Poetry](https://python-poetry.org/) and activate virtual environment🔨 | ||
|
||
```bash | ||
poetry install | ||
poetry shell | ||
``` | ||
|
||
3. Run the Streamlit server🚀 | ||
|
||
```bash | ||
cd engine | ||
streamlit run main.py | ||
``` | ||
|
||
## 🚀 Upcoming Features | ||
|
||
- Add support for more formats (e.g. webpages 🕸️, PPTX 📊, etc.) | ||
- Highlight relevant phrases in citations 🔦 | ||
- Support scanned documents with OCR 📝 | ||
- More customization options (e.g. chain type 🔗, chunk size📏, etc.) |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Questions | ||
|
||
These are some questions you can ask from the example data to test KnowledgeGPT. | ||
|
||
## Paul Graham Essay | ||
|
||
1. Why did the author want to get rich? | ||
2. What are some important things the author learned at Interleaf? | ||
3. Why did the author drop out of art school? | ||
4. What was the author's first company? | ||
5. What made the author leave Y Combinator? | ||
6. Why did the author go to Italy? | ||
7. What advice does the author give to aspiring entrepreneurs? | ||
8. What kinds of projects have the author pursued in his career? | ||
9. Why did the author switch art schools? | ||
10. Who is Rtm? | ||
|
||
## Employment Contract | ||
|
||
1. What is the value of my stock options and when can I exercise them? | ||
2. What insurance options are available to me? | ||
3. Are there any mandatory arbitration clauses included in the contract? | ||
4. Are there any mandatory arbitration clauses included in this contract? | ||
5. What termination benefits am I entitled to? | ||
6. What is the vacation policy? | ||
7. What is the salary and bonus structure? | ||
|
||
## HyDE Paper | ||
|
||
> Link to paper [here](https://arxiv.org/abs/2212.10496) | ||
1. Explain what a Hypothetical Document Embedding is to a five-year-old |
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# flake8: noqa | ||
import streamlit as st | ||
|
||
|
||
def faq():
    """Render the FAQ section as Markdown via ``st.markdown``.

    The text explains how DocGPT works, what happens to uploaded data,
    how to read the citation numbers shown under each source, and how
    accurate the answers are. Nothing is returned; the content is written
    to whichever Streamlit container is active when this is called.
    """
    # The Markdown body is kept flush-left so the headings and paragraphs
    # are not interpreted as an indented code block.
    st.markdown(
        """
# FAQ
## How does DocGPT work?
When you upload a document, it will be divided into smaller chunks
and stored in a special type of database called a vector index
that allows for semantic search and retrieval.
When you ask a question, DocGPT will search through the
document chunks and find the most relevant ones using the vector index.
Then, it will use GPT3 to generate a final answer.
## Is my data safe?
Yes, your data is safe. DocGPT does not store your documents or
questions. All uploaded data is deleted after you close the browser tab.
## What do the numbers mean under each source?
For a PDF document, you will see a citation number like this: 3-12.
The first number is the page number and the second number is
the chunk number on that page. For DOCX and TXT documents,
the first number is set to 1 and the second number is the chunk number.
## Are the answers 100% accurate?
No, the answers are not 100% accurate. DocGPT uses GPT-3 to generate
answers. GPT-3 is a powerful language model, but it sometimes makes mistakes
and is prone to hallucinations. Also, DocGPT uses semantic search
to find the most relevant chunks and does not see the entire document,
which means that it may not be able to find all the relevant information and
may not be able to answer all questions (especially summary-type questions
or questions that require a lot of context from the document).
But for most use cases, DocGPT is very accurate and can answer
most questions. Always check with the sources to make sure that the answers
are correct.
"""
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import streamlit as st | ||
|
||
from engine.components.faq import faq | ||
|
||
|
||
def set_openai_api_key(api_key: str):
    """Store ``api_key`` in Streamlit session state as ``"OPENAI_API_KEY"``."""
    session = st.session_state
    session["OPENAI_API_KEY"] = api_key
|
||
|
||
def sidebar():
    """Fill the Streamlit sidebar with usage steps, an About blurb and the FAQ."""
    # Assemble the static Markdown up front so the ``with`` block below
    # reads as a plain sequence of render calls.
    how_to_use = "".join(
        (
            "## How to use\n",
            "1. Upload a pdf, docx, or txt file📄 (Currently we don't support scanned PDF)\n",  # noqa: E501
            "2. Ask a question about the document💬\n",
            " Or you can ask DocGPT to give you some questions about the document💬\n",
        )
    )
    about_text = "".join(
        (
            "📖DocGPT allows you to ask questions about your ",
            "documents and get accurate answers with instant citations. ",
            "You can use it to research a paper or practice your exam. ",
        )
    )
    status_text = "This tool is a work in progress. "

    with st.sidebar:
        st.markdown(how_to_use)

        # NOTE: the "OpenAI API Key" text input that used to sit here, and the
        # matching call to set_openai_api_key, are currently disabled.

        st.markdown("---")
        st.markdown("# About")
        st.markdown(about_text)
        st.markdown(status_text)

        # NOTE(review): assumes the FAQ belongs inside the sidebar container;
        # confirm against the original file's indentation.
        faq()
Oops, something went wrong.