-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.py
More file actions
99 lines (81 loc) · 3.36 KB
/
index.py
File metadata and controls
99 lines (81 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import tempfile

import streamlit as st
from dotenv import load_dotenv

from app.loaders import load_and_chunk_files
from app.vectorstore import store_chunks, get_vectorstore, get_bm25_retriever
from app.chain import build_llm_chain, retrieve_hybrid_docs, rerank_documents
from app.pdf_handler import upload_pdfs, save_uploaded_files, list_uploaded_files, delete_uploaded_file
# Load environment variables (API keys etc.) from a local .env file.
load_dotenv()
# Must be the first Streamlit call in the script.
st.set_page_config(page_title="📄 Chat with PDF", layout="wide")
st.title("AI PDF RAG Agent: Chat with your PDFs")
# Directory where submitted PDFs are stored between reruns.
UPLOAD_DIR = "uploaded_files"
os.makedirs(UPLOAD_DIR, exist_ok=True)
# STEP 1: Sidebar upload + config
# upload_pdfs() renders the sidebar widgets and returns the user's choices:
# the files, whether the submit button was pressed, the backend selection,
# and the chunking parameters used by the loaders below.
uploaded_files, submitted, vectorstore_backend, chunk_size, overlap = upload_pdfs()
vectorstore = None
all_chunks = []
# STEP 2: Preview chunks for uploaded PDFs (optional)
# Shown only before the user submits, so they can sanity-check the
# chunking parameters.
if uploaded_files and not submitted:
    st.sidebar.subheader("📑 Chunk Preview (First 5 Chunks of Each PDF)")
    for file in uploaded_files:
        # Spill the upload to a scratch path so the disk-based loader can read it.
        tmp_path = os.path.join(tempfile.gettempdir(), file.name)
        with open(tmp_path, "wb") as f:
            f.write(file.read())
        # Rewind the in-memory buffer: .read() consumed it, and a later
        # submit (save_uploaded_files) needs to read the same object again.
        file.seek(0)
        # Chunk each PDF individually so we really show the first 5 chunks
        # of EACH file, as the subheader promises. (Previously all files
        # were pooled and only the first 5 chunks overall were previewed.)
        per_file_chunks = load_and_chunk_files(
            [tmp_path], chunk_size=chunk_size, chunk_overlap=overlap
        )
        st.sidebar.markdown(f"**{file.name}**")
        for i, doc in enumerate(per_file_chunks[:5]):
            st.sidebar.markdown(f"**Chunk {i+1}**: {doc.page_content[:200]}...")
# STEP 3: Load + Index PDFs
# Runs once per submit: saves the PDFs, chunks them, and builds both the
# dense (FAISS) and sparse (BM25) retrieval indexes.
if uploaded_files and submitted:
    file_paths = save_uploaded_files(uploaded_files)
    st.sidebar.success(f"Uploaded {len(file_paths)} file(s)")
    with st.spinner("🔍 Indexing PDFs with FAISS..."):
        all_chunks = load_and_chunk_files(
            file_paths,
            chunk_size=chunk_size,
            chunk_overlap=overlap,
        )
    st.success(f"✅ Indexed {len(all_chunks)} chunks from uploaded PDFs!")
    # Store chunks in FAISS
    vectorstore = store_chunks(all_chunks)
    bm25 = get_bm25_retriever(all_chunks)
    # Persist the indexes: Streamlit re-executes this whole script on every
    # interaction, so plain locals are lost and the Q&A section would
    # otherwise disappear on the very next rerun.
    st.session_state["vectorstore"] = vectorstore
    st.session_state["bm25"] = bm25
# STEP 3b: Clear vectorstore
if st.sidebar.button("🧹 Clear Vectorstore"):
    # Prefer the local store built this run; otherwise fall back to any
    # store persisted in session_state (None if nothing was ever indexed).
    store = vectorstore or st.session_state.get("vectorstore")
    if store:
        store.delete_all()
    all_chunks = []
    # pop(..., None) is safe even when the keys were never set.
    st.session_state.pop("vectorstore", None)
    st.session_state.pop("bm25", None)
    st.sidebar.success("Vectorstore cleared!")
    # st.experimental_rerun() was removed in Streamlit >= 1.27; use
    # st.rerun() when available but stay compatible with older installs.
    (getattr(st, "rerun", None) or st.experimental_rerun)()
# STEP 3c: Manage uploaded PDFs
# Lists every file currently on disk with a per-file delete button.
st.sidebar.subheader("📄 Manage Uploaded PDFs")
existing_files = list_uploaded_files()
for f in existing_files:
    col1, col2 = st.sidebar.columns([4, 1])
    col1.markdown(f"- {f}")
    # The filename doubles as the widget key, keeping each button unique.
    if col2.button("🗑️ Delete", key=f):
        delete_uploaded_file(f)
        # st.experimental_rerun() was removed in Streamlit >= 1.27; use
        # st.rerun() when available but stay compatible with older installs.
        (getattr(st, "rerun", None) or st.experimental_rerun)()
# STEP 4: Question input
if vectorstore is None:
    # Recover the index persisted by the indexing step — without this the
    # Q&A section only existed during the single rerun that built the
    # index. Returns None when nothing has been indexed yet.
    vectorstore = st.session_state.get("vectorstore")
if vectorstore:
    st.header("💬 Ask a question")
    query = st.text_input("Enter your question", key="user_question")
    if query:
        # Hybrid retrieval, then cross-encoder-style rerank before the LLM.
        retrieved_docs = retrieve_hybrid_docs(query, vectorstore)
        reranked_docs = rerank_documents(query, retrieved_docs)
        chain = build_llm_chain()
        st.subheader("🤖 Answer:")
        response_container = st.empty()
        response = chain.invoke({"question": query, "docs": reranked_docs})
        response_container.markdown(response)
        # Surface the evidence the answer was grounded on.
        st.sidebar.subheader("🔍 Retrieved Chunks")
        if reranked_docs:
            for i, doc in enumerate(reranked_docs):
                st.sidebar.markdown(f"**Chunk {i+1}**")
                st.sidebar.caption(doc.page_content[:400])
        else:
            st.sidebar.info("No chunks retrieved yet.")