-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.py
More file actions
99 lines (81 loc) · 3.36 KB
/
index.py
File metadata and controls
99 lines (81 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import tempfile

import streamlit as st
from dotenv import load_dotenv

from app.loaders import load_and_chunk_files
from app.vectorstore import store_chunks, get_vectorstore, get_bm25_retriever
from app.chain import build_llm_chain, retrieve_hybrid_docs, rerank_documents
from app.pdf_handler import upload_pdfs, save_uploaded_files, list_uploaded_files, delete_uploaded_file
# Load environment variables (API keys etc.) from a local .env file.
load_dotenv()
# Must be the first Streamlit call in the script.
st.set_page_config(page_title="📄 Chat with PDF", layout="wide")
st.title("AI PDF RAG Agent: Chat with your PDFs")
# Directory where submitted PDFs are stored between reruns.
UPLOAD_DIR = "uploaded_files"
os.makedirs(UPLOAD_DIR, exist_ok=True)
# STEP 1: Sidebar upload + config
# upload_pdfs() renders the sidebar widgets and returns the user's choices:
# the files, whether the submit button was pressed, the backend selection,
# and the chunking parameters used by the loaders below.
uploaded_files, submitted, vectorstore_backend, chunk_size, overlap = upload_pdfs()
vectorstore = None
all_chunks = []
# STEP 2: Preview chunks for uploaded PDFs (optional)
# Shown only before the user submits, so they can sanity-check the
# chunking parameters.
if uploaded_files and not submitted:
    st.sidebar.subheader("📑 Chunk Preview (First 5 Chunks of Each PDF)")
    for file in uploaded_files:
        # Spill the upload to a scratch path so the disk-based loader can read it.
        tmp_path = os.path.join(tempfile.gettempdir(), file.name)
        with open(tmp_path, "wb") as f:
            f.write(file.read())
        # Rewind the in-memory buffer: .read() consumed it, and a later
        # submit (save_uploaded_files) needs to read the same object again.
        file.seek(0)
        # Chunk each PDF individually so we really show the first 5 chunks
        # of EACH file, as the subheader promises. (Previously all files
        # were pooled and only the first 5 chunks overall were previewed.)
        per_file_chunks = load_and_chunk_files(
            [tmp_path], chunk_size=chunk_size, chunk_overlap=overlap
        )
        st.sidebar.markdown(f"**{file.name}**")
        for i, doc in enumerate(per_file_chunks[:5]):
            st.sidebar.markdown(f"**Chunk {i+1}**: {doc.page_content[:200]}...")
# STEP 3: Load + Index PDFs
# Runs once per submit: saves the PDFs, chunks them, and builds both the
# dense (FAISS) and sparse (BM25) retrieval indexes.
if uploaded_files and submitted:
    file_paths = save_uploaded_files(uploaded_files)
    st.sidebar.success(f"Uploaded {len(file_paths)} file(s)")
    with st.spinner("🔍 Indexing PDFs with FAISS..."):
        all_chunks = load_and_chunk_files(
            file_paths,
            chunk_size=chunk_size,
            chunk_overlap=overlap,
        )
    st.success(f"✅ Indexed {len(all_chunks)} chunks from uploaded PDFs!")
    # Store chunks in FAISS
    vectorstore = store_chunks(all_chunks)
    bm25 = get_bm25_retriever(all_chunks)
    # Persist the indexes: Streamlit re-executes this whole script on every
    # interaction, so plain locals are lost and the Q&A section would
    # otherwise disappear on the very next rerun.
    st.session_state["vectorstore"] = vectorstore
    st.session_state["bm25"] = bm25
# STEP 3b: Clear vectorstore
if st.sidebar.button("🧹 Clear Vectorstore"):
    # Prefer the local store built this run; otherwise fall back to any
    # store persisted in session_state (None if nothing was ever indexed).
    store = vectorstore or st.session_state.get("vectorstore")
    if store:
        store.delete_all()
    all_chunks = []
    # pop(..., None) is safe even when the keys were never set.
    st.session_state.pop("vectorstore", None)
    st.session_state.pop("bm25", None)
    st.sidebar.success("Vectorstore cleared!")
    # st.experimental_rerun() was removed in Streamlit >= 1.27; use
    # st.rerun() when available but stay compatible with older installs.
    (getattr(st, "rerun", None) or st.experimental_rerun)()
# STEP 3c: Manage uploaded PDFs
# Lists every file currently on disk with a per-file delete button.
st.sidebar.subheader("📄 Manage Uploaded PDFs")
existing_files = list_uploaded_files()
for f in existing_files:
    col1, col2 = st.sidebar.columns([4, 1])
    col1.markdown(f"- {f}")
    # The filename doubles as the widget key, keeping each button unique.
    if col2.button("🗑️ Delete", key=f):
        delete_uploaded_file(f)
        # st.experimental_rerun() was removed in Streamlit >= 1.27; use
        # st.rerun() when available but stay compatible with older installs.
        (getattr(st, "rerun", None) or st.experimental_rerun)()
# STEP 4: Question input
if vectorstore is None:
    # Recover the index persisted by the indexing step — without this the
    # Q&A section only existed during the single rerun that built the
    # index. Returns None when nothing has been indexed yet.
    vectorstore = st.session_state.get("vectorstore")
if vectorstore:
    st.header("💬 Ask a question")
    query = st.text_input("Enter your question", key="user_question")
    if query:
        # Hybrid retrieval, then cross-encoder-style rerank before the LLM.
        retrieved_docs = retrieve_hybrid_docs(query, vectorstore)
        reranked_docs = rerank_documents(query, retrieved_docs)
        chain = build_llm_chain()
        st.subheader("🤖 Answer:")
        response_container = st.empty()
        response = chain.invoke({"question": query, "docs": reranked_docs})
        response_container.markdown(response)
        # Surface the evidence the answer was grounded on.
        st.sidebar.subheader("🔍 Retrieved Chunks")
        if reranked_docs:
            for i, doc in enumerate(reranked_docs):
                st.sidebar.markdown(f"**Chunk {i+1}**")
                st.sidebar.caption(doc.page_content[:400])
        else:
            st.sidebar.info("No chunks retrieved yet.")