You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Why are the PDF files read again and again when I use PDFImageReader from phidata, even though those PDF files are already present in pgvector? If I set PDFReader as the default reader, it skips reading the files. Please help me out.
#1947
Open
mahendra867 opened this issue
Jan 30, 2025
· 1 comment
Originally posted by mahendra867 January 30, 2025
When I set PDFReader as the default reader, the logic works fine: PDF files that are already stored are skipped. But when I set PDFImageReader as the default, it does not skip reading the PDF files even though they already exist in the same table. What is the issue? Please help me out.
code
import os
from dotenv import load_dotenv
from phi.agent import Agent
from phi.embedder.azure_openai import AzureOpenAIEmbedder
from phi.knowledge.pdf import PDFKnowledgeBase, PDFImageReader, PDFReader
from phi.vectordb.pgvector import PgVector, SearchType
from phi.model.openai import OpenAIChat
from phi.storage.agent.postgres import PgAgentStorage
from sqlalchemy import create_engine, inspect, text
from phi.vectordb.pgvector.index import Ivfflat, HNSW
from phi.embedder.openai import OpenAIEmbedder
# Function to check if a PDF file is already in the database
def is_pdf_in_db(engine, schema, table_name, pdf_name):
    """Return True if a row named after ``pdf_name`` exists in the table.

    The lookup key is ``pdf_name`` with its final extension removed, compared
    against the ``name`` column of ``schema.table_name``.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        schema: Schema that contains the table. It is interpolated into the
            SQL text as-is, so it must come from trusted configuration.
        table_name: Table to query; interpolated the same way as ``schema``.
        pdf_name: File name of the PDF, e.g. ``"report.pdf"``.

    Returns:
        bool: Whether at least one matching row was found.
    """
    pdf_name_no_ext = os.path.splitext(pdf_name)[0]  # Remove the .pdf extension
    print(f"Checking if PDF '{pdf_name_no_ext}' exists in table '{table_name}'...")
    with engine.connect() as connection:
        result = connection.execute(
            # Only existence matters, so stop at the first matching row.
            text(f"SELECT 1 FROM {schema}.{table_name} WHERE name = :name LIMIT 1"),
            {"name": pdf_name_no_ext},
        )
        # Fetch while the connection is still open.
        exists = result.fetchone() is not None
    print(f"PDF '{pdf_name_no_ext}' exists in table '{table_name}': {exists}")
    return exists
# Set up the PDF knowledge base with vector database
print("Setting up PDF knowledge base with vector database...")
pdf_knowledge_base = PDFKnowledgeBase(
    path="/home/ubuntu/rag-strider-arm64/agentic_new_rag/pdfs",
    vector_db=PgVector(
        table_name="updated_rag81",
        schema="ai",
        db_url=db_url,
        search_type=SearchType.hybrid,
        vector_index=HNSW(),
        embedder=OpenAIEmbedder(
            api_key=OPENAI_API_KEY,
            model="text-embedding-ada-002",
            dimensions=1536,
            encoding_format="float",
        ),
    ),
    reader=PDFReader(),  # Use a default reader
    # NOTE(review): the original passed `documents=3`; the knowledge-base
    # field for the number of retrieved documents is presumably
    # `num_documents` -- confirm against the installed phidata version.
    num_documents=3,
)
# Define the PgAgentStorage with connection to database
# Before loading, check if the table exists and create if not
if not check_table_exists(engine, "ai", "updated_rag81"):
    print("Table does not exist. Creating the table...")
    pdf_knowledge_base.load(recreate=True, upsert=True)  # Create the table
else:
    print("Table exists. Skipping table creation...")
    # Keep the existing table and load with skip_existing=True.
    pdf_knowledge_base.load(recreate=False, skip_existing=True)
# Check if PDFs are already in the database and process accordingly
# Raw string: in a normal literal "\a" is the BEL escape character, which
# silently corrupts this Windows path.
pdf_folder = r"D:\Geak Minds Projects\agentic_new_rag\pdfs"
for pdf_file in os.listdir(pdf_folder):
    if not pdf_file.endswith(".pdf"):
        continue
    if is_pdf_in_db(engine, "ai", "updated_rag81", pdf_file):
        print(f"Skipping {pdf_file}, already in database.")
        continue
    print(f"Processing {pdf_file}...")
    # Process the PDF file with PDFImageReader
    pdf_reader = PDFImageReader(chunk=True)
    # NOTE(review): the return value of read() is discarded and nothing in
    # this loop writes to the table, so is_pdf_in_db() will presumably report
    # the same file as missing on the next run and it will be read again --
    # confirm whether the documents should be loaded into pdf_knowledge_base.
    pdf_reader.read(os.path.join(pdf_folder, pdf_file))
# Initialize the RAG agent
print("Initializing RAG Agent...")
rag_agent = Agent(
    name="Agentic RAG Application",
    agent_id="rag-agent",
    model=OpenAIChat(id="gpt-4o-mini"),
    knowledge=pdf_knowledge_base,
    add_context=True,
    search_knowledge=True,
    read_chat_history=True,
    debug_mode=True,
    # storage=storage,
    description=(
        "You are an intelligent retrieval assistant specialized in utilizing knowledge stored in "
        "a curated set of documents related to the Cybersecurity "
    ),
    markdown=True,
)
print("RAG Agent initialized.")

# Print the agent's response to a query
rag_agent.print_response(
    "give me Figure 5 - Assessment file directory after script execution",
    stream=True,
)
The text was updated successfully, but these errors were encountered:
Also, please tell me why I am not able to use a recursive-chunking text splitter with PDFKnowledgeBase. Why does phidata only allow me to use fixed chunking? I think it is a drawback of phidata that it does not let me use recursive chunking.
Discussed in #1946
Originally posted by mahendra867 January 30, 2025
When I set PDFReader as the default reader, the logic works fine: PDF files that are already stored are skipped. But when I set PDFImageReader as the default, it does not skip reading the PDF files even though they already exist in the same table. What is the issue? Please help me out.
code
import os
from dotenv import load_dotenv
from phi.agent import Agent
from phi.embedder.azure_openai import AzureOpenAIEmbedder
from phi.knowledge.pdf import PDFKnowledgeBase, PDFImageReader, PDFReader
from phi.vectordb.pgvector import PgVector, SearchType
from phi.model.openai import OpenAIChat
from phi.storage.agent.postgres import PgAgentStorage
from sqlalchemy import create_engine, inspect, text
from phi.vectordb.pgvector.index import Ivfflat, HNSW
from phi.embedder.openai import OpenAIEmbedder
# Load environment variables
load_dotenv()
print("Environment variables loaded.")

# Fetch API keys and endpoint from environment variables
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Connection URL shared by the vector database and the SQLAlchemy engine.
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
# Function to check if the table exists
def check_table_exists(engine, schema, table_name):
    """Return True if ``table_name`` exists in ``schema``.

    Args:
        engine: SQLAlchemy engine to inspect.
        schema: Schema in which to look for the table.
        table_name: Name of the table to look for.

    Returns:
        The result of the inspector's ``has_table`` check.
    """
    print(f"Checking if table '{table_name}' exists in schema '{schema}'...")
    inspector = inspect(engine)
    exists = inspector.has_table(table_name, schema=schema)
    print(f"Table '{table_name}' exists: {exists}")
    return exists
# Function to check if a PDF file is already in the database
def is_pdf_in_db(engine, schema, table_name, pdf_name):
    """Return True if a row named after ``pdf_name`` exists in the table.

    The lookup key is ``pdf_name`` with its final extension removed, compared
    against the ``name`` column of ``schema.table_name``.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        schema: Schema that contains the table. It is interpolated into the
            SQL text as-is, so it must come from trusted configuration.
        table_name: Table to query; interpolated the same way as ``schema``.
        pdf_name: File name of the PDF, e.g. ``"report.pdf"``.

    Returns:
        bool: Whether at least one matching row was found.
    """
    pdf_name_no_ext = os.path.splitext(pdf_name)[0]  # Remove the .pdf extension
    print(f"Checking if PDF '{pdf_name_no_ext}' exists in table '{table_name}'...")
    with engine.connect() as connection:
        result = connection.execute(
            # Only existence matters, so stop at the first matching row.
            text(f"SELECT 1 FROM {schema}.{table_name} WHERE name = :name LIMIT 1"),
            {"name": pdf_name_no_ext},
        )
        # Fetch while the connection is still open.
        exists = result.fetchone() is not None
    print(f"PDF '{pdf_name_no_ext}' exists in table '{table_name}': {exists}")
    return exists
# Set up the PDF knowledge base with vector database
print("Setting up PDF knowledge base with vector database...")
pdf_knowledge_base = PDFKnowledgeBase(
    path="/home/ubuntu/rag-strider-arm64/agentic_new_rag/pdfs",
    vector_db=PgVector(
        table_name="updated_rag81",
        schema="ai",
        db_url=db_url,
        search_type=SearchType.hybrid,
        vector_index=HNSW(),
        embedder=OpenAIEmbedder(
            api_key=OPENAI_API_KEY,
            model="text-embedding-ada-002",
            dimensions=1536,
            encoding_format="float",
        ),
    ),
    reader=PDFReader(),  # Use a default reader
    # NOTE(review): the original passed `documents=3`; the knowledge-base
    # field for the number of retrieved documents is presumably
    # `num_documents` -- confirm against the installed phidata version.
    num_documents=3,
)
# Define the PgAgentStorage with connection to database
# storage = PgAgentStorage(table_name="updated_rag81", schema="ai", db_url=db_url)
# NOTE(review): this message is printed even though the storage line above is
# commented out, so no PgAgentStorage is actually created here.
print("PgAgentStorage initialized.")

# Create a SQLAlchemy engine
engine = create_engine(db_url)

# Before loading, check if the table exists and create if not
if not check_table_exists(engine, "ai", "updated_rag81"):
    print("Table does not exist. Creating the table...")
    pdf_knowledge_base.load(recreate=True, upsert=True)  # Create the table
else:
    print("Table exists. Skipping table creation...")
    # Keep the existing table and load with skip_existing=True.
    pdf_knowledge_base.load(recreate=False, skip_existing=True)
# Check if PDFs are already in the database and process accordingly
# Raw string: in a normal literal "\a" is the BEL escape character, which
# silently corrupts this Windows path.
pdf_folder = r"D:\Geak Minds Projects\agentic_new_rag\pdfs"
for pdf_file in os.listdir(pdf_folder):
    if not pdf_file.endswith(".pdf"):
        continue
    if is_pdf_in_db(engine, "ai", "updated_rag81", pdf_file):
        print(f"Skipping {pdf_file}, already in database.")
        continue
    print(f"Processing {pdf_file}...")
    # Process the PDF file with PDFImageReader
    pdf_reader = PDFImageReader(chunk=True)
    # NOTE(review): the return value of read() is discarded and nothing in
    # this loop writes to the table, so is_pdf_in_db() will presumably report
    # the same file as missing on the next run and it will be read again --
    # confirm whether the documents should be loaded into pdf_knowledge_base.
    pdf_reader.read(os.path.join(pdf_folder, pdf_file))
# Initialize the RAG agent
print("Initializing RAG Agent...")
rag_agent = Agent(
    name="Agentic RAG Application",
    agent_id="rag-agent",
    model=OpenAIChat(id="gpt-4o-mini"),
    knowledge=pdf_knowledge_base,
    add_context=True,
    search_knowledge=True,
    read_chat_history=True,
    debug_mode=True,
    # storage=storage,
    description=(
        "You are an intelligent retrieval assistant specialized in utilizing knowledge stored in "
        "a curated set of documents related to the Cybersecurity "
    ),
    markdown=True,
)
print("RAG Agent initialized.")

# Print the agent's response to a query
rag_agent.print_response(
    "give me Figure 5 - Assessment file directory after script execution",
    stream=True,
)
The text was updated successfully, but these errors were encountered: