diging · Girik1105 · Jun 10, 2026 · Jun 12, 2026
diff --git a/hospexplorer/ask/kb_connector.py b/hospexplorer/ask/kb_connector.py
@@ -113,6 +113,60 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None):
     raise last_exc
 
 
+def update_pdf_in_kb(doc_id, file_bytes, filename, title, url=None):
+    """Update an existing PDF in the MCP KB server by document id.
+
+    Calls POST /docs/pdf/update on the MCP KB server with multipart form data.
+    The KB server re-extracts and replaces the document's chunks in place,
+    keeping the same document id. Mirrors add_pdf_to_kb's retry policy.
+
+    Args:
+        doc_id: Id of the KB document to replace; sent as a string form field.
+        file_bytes: Content uploaded as the "file" part with content type
+            application/pdf.
+        filename: Name attached to the uploaded file part; also used in logs.
+        title: Document title, sent as a form field.
+        url: Optional source URL; only sent when truthy.
+
+    Returns:
+        The KB server's response body decoded from JSON.
+
+    Raises:
+        httpx.TimeoutException: On any timeout; re-raised without retrying.
+        httpx.TransportError: When every attempt failed with a non-timeout
+            transport error (the last one is re-raised).
+        httpx.HTTPStatusError: From raise_for_status() on an error response;
+            not caught here, so it is never retried.
+    """
+    headers = {
+        "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
+    }
+    endpoint = f"{settings.KB_MCP_HOST}/docs/pdf/update"
+
+    data = {"doc_id": str(doc_id), "title": title}
+    if url:
+        data["url"] = url
+
+    # Retry only non-timeout transport errors. A timeout is re-raised at once
+    # because it likely means the KB received the file and is still processing it.
+    # max(1, ...) guarantees at least one attempt even if the setting is < 1.
+    attempts = max(1, settings.KB_MCP_PDF_RETRIES)
+    last_exc = None
+    for attempt in range(1, attempts + 1):
+        # rebuild files each attempt: httpx consumes the stream on send
+        files = {"file": (filename, file_bytes, "application/pdf")}
+        try:
+            with httpx.Client() as client:
+                response = client.post(
+                    endpoint,
+                    headers=headers,
+                    files=files,
+                    data=data,
+                    timeout=settings.KB_MCP_PDF_TIMEOUT,
+                )
+            response.raise_for_status()
+            return response.json()
+        # This handler must come before the TransportError one:
+        # TimeoutException is a TransportError subclass, so the broader clause
+        # would otherwise catch timeouts and retry them.
+        except httpx.TimeoutException as e:
+            logger.warning(
+                "KB PDF update timed out for %s: %s; not retrying (KB may still be processing)",
+                filename, e,
+            )
+            raise
+        except httpx.TransportError as e:
+            last_exc = e
+            # Out of attempts: skip the sleep and fall through to the final raise.
+            if attempt == attempts:
+                break
+            # Exponential backoff in seconds: 1, 2, 4, ...
+            backoff = 2 ** (attempt - 1)
+            logger.warning(
+                "KB PDF update failed (attempt %d/%d) for %s: %s; retrying in %ds",
+                attempt, attempts, filename, e, backoff,
+            )
+            time.sleep(backoff)
+
+    # Only reached when every attempt ended in a non-timeout transport error.
+    raise last_exc
+
+
 def download_kb_pdf(doc_id):
     """Download the original PDF bytes for a KB document.
 

diff --git a/hospexplorer/ask/templates/kb/resources.html b/hospexplorer/ask/templates/kb/resources.html
@@ -522,29 +522,35 @@ <h5 class="kb-section-heading">PDFs in KB but not tracked internally</h5>
                         'X-CSRFToken': '{{ csrf_token }}',
                         'Content-Type': 'application/json',
                     },
-                    body: JSON.stringify({ url: doc.url, title: doc.title }),
+                    body: JSON.stringify({ url: doc.url, title: doc.title, doc_id: doc.doc_id }),
                 });
                 const data = await response.json();
                 if (data.success) {
                     this.untracked = this.untracked.filter(d => d.doc_id !== doc.doc_id);
 
-                    const tbody = document.querySelector('#websites-pane .kb-table tbody');
-                    if (tbody) {
-                        const tr = document.createElement('tr');
-                        tr.className = 'kb-row-maroon';
-                        const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url;
-                        const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
-                        tr.innerHTML = `
-                            <td>${doc.title}</td>
-                            <td><a href="${doc.url}" target="_blank" rel="noopener" class="kb-link text-truncate d-inline-block" style="max-width: 300px;">${truncUrl}</a></td>
-                            <td class="text-muted">${now}</td>
-                            <td>In Sync</td>
-                            {% if can_change %}<td></td>{% endif %}
-                        `;
-                        tbody.prepend(tr);
+                    if (data.merged) {
+                        // Linked an existing Hopper row — that row is already shown in
+                        // the table, so refresh its status instead of adding a duplicate.
+                        this.statusMap[data.id] = 'in_kb';
+                        this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`);
+                    } else {
+                        const tbody = document.querySelector('#websites-pane .kb-table tbody');
+                        if (tbody) {
+                            const tr = document.createElement('tr');
+                            tr.className = 'kb-row-maroon';
+                            const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url;
+                            const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
+                            tr.innerHTML = `
+                                <td>${doc.title}</td>
+                                <td><a href="${doc.url}" target="_blank" rel="noopener" class="kb-link text-truncate d-inline-block" style="max-width: 300px;">${truncUrl}</a></td>
+                                <td class="text-muted">${now}</td>
+                                <td>In Sync</td>
+                                {% if can_change %}<td></td>{% endif %}
+                            `;
+                            tbody.prepend(tr);
+                        }
+                        this.showToast(`Now tracking "${doc.title}" in Hopper.`);
                     }
-
-                    this.showToast(`Now tracking "${doc.title}" in Hopper.`);
                 } else {
                     this.showToast(data.error || 'Failed to track resource.', 'error');
                 }
@@ -570,24 +576,29 @@ <h5 class="kb-section-heading">PDFs in KB but not tracked internally</h5>
                     this.untrackedPdfs = this.untrackedPdfs.filter(d => d.doc_id !== doc.doc_id);
                     this.pdfStatusMap[data.id] = 'in_kb';
 
-                    const tbody = document.querySelector('#pdfs-pane .kb-table tbody');
-                    if (tbody) {
-                        const tr = document.createElement('tr');
-                        tr.className = 'kb-row-maroon';
-                        const filename = data.filename || '';
-                        const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' : filename;
-                        const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
-                        tr.innerHTML = `
-                            <td>${data.title}</td>
-                            <td class="text-muted">${truncFilename}</td>
-                            <td class="text-muted">${now}</td>
-                            <td>In Sync</td>
-                            {% if can_change_pdf %}<td></td>{% endif %}
-                        `;
-                        tbody.prepend(tr);
+                    if (data.merged) {
+                        // Linked an existing Hopper row — its row is already shown and its
+                        // status was just set above, so don't add a duplicate row.
+                        this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`);
+                    } else {
+                        const tbody = document.querySelector('#pdfs-pane .kb-table tbody');
+                        if (tbody) {
+                            const tr = document.createElement('tr');
+                            tr.className = 'kb-row-maroon';
+                            const filename = data.filename || '';
+                            const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' : filename;
+                            const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
+                            tr.innerHTML = `
+                                <td>${data.title}</td>
+                                <td class="text-muted">${truncFilename}</td>
+                                <td class="text-muted">${now}</td>
+                                <td>In Sync</td>
+                                {% if can_change_pdf %}<td></td>{% endif %}
+                            `;
+                            tbody.prepend(tr);
+                        }
+                        this.showToast(`Now tracking "${doc.title}" in Hopper.`);
                     }
-
-                    this.showToast(`Now tracking "${doc.title}" in Hopper.`);
                 } else {
                     this.showToast(data.error || 'Failed to track PDF.', 'error');
                 }

diff --git a/hospexplorer/ask/views.py b/hospexplorer/ask/views.py
@@ -15,7 +15,7 @@
 from ask.tasks import run_llm_task
 from django.core.files.base import ContentFile
 
-from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, delete_kb_document, download_kb_pdf
+from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, update_pdf_in_kb, delete_kb_document, download_kb_pdf
 
 logger = logging.getLogger(__name__)
 
@@ -335,16 +335,36 @@ def kb_add_resource(request):
 
     url = body.get("url", "").strip()
     title = body.get("title", "").strip()
+    doc_id = body.get("doc_id")
     if not url:
         return JsonResponse({"success": False, "error": "URL is required."}, status=400)
 
+    try:
+        doc_id = int(doc_id) if doc_id is not None else None
+    except (TypeError, ValueError):
+        doc_id = None
+
+    # Merge, don't reject: if a Hopper resource already exists for this URL
+    # (e.g. created but never linked to the KB), link it to the KB doc instead
+    # of creating a duplicate. .first() is safe against pre-existing duplicates.
+    existing = WebsiteResource.objects.filter(url=url).first()
+    if existing is not None:
+        update_fields = ["modifier", "modified_at"]
+        existing.modifier = request.user
+        if doc_id is not None:
+            existing.mcp_kb_document_id = doc_id
+            update_fields.append("mcp_kb_document_id")
+        existing.save(update_fields=update_fields)
+        return JsonResponse({"success": True, "id": existing.id, "merged": True})
+
     resource = WebsiteResource.objects.create(
         url=url,
         title=title or url,
+        mcp_kb_document_id=doc_id,
         creator=request.user,
         modifier=request.user,
     )
-    return JsonResponse({"success": True, "id": resource.id})
+    return JsonResponse({"success": True, "id": resource.id, "merged": False})
 
 
 @login_required
@@ -374,9 +394,39 @@ def kb_add_pdf_resource(request):
     except (TypeError, ValueError):
         return JsonResponse({"success": False, "error": "doc_id is required."}, status=400)
 
-    if PDFResource.objects.filter(mcp_kb_document_id=doc_id).exists():
-        return JsonResponse({"success": False, "error": "Already tracked in Hopper."}, status=400)
+    # Merge, don't reject. 1) Already linked to this KB doc -> idempotent no-op.
+    already = PDFResource.objects.filter(mcp_kb_document_id=doc_id).first()
+    if already is not None:
+        return JsonResponse({
+            "success": True,
+            "id": already.id,
+            "title": already.title,
+            "filename": already.file.name if already.file else "",
+            "merged": True,
+        })
+
+    # 2) An unlinked local row whose title matches the KB doc (e.g. a zip upload
+    #    that never linked to the KB) -> link it in place, no re-download.
+    if title:
+        candidate = PDFResource.objects.filter(
+            mcp_kb_document_id__isnull=True, title=title
+        ).first()
+        if candidate is not None:
+            candidate.mcp_kb_document_id = doc_id
+            candidate.modifier = request.user
+            candidate.status = PDFResource.Status.SUCCESS
+            candidate.save(update_fields=[
+                "mcp_kb_document_id", "modifier", "status", "modified_at",
+            ])
+            return JsonResponse({
+                "success": True,
+                "id": candidate.id,
+                "title": candidate.title,
+                "filename": candidate.file.name if candidate.file else "",
+                "merged": True,
+            })
 
+    # 3) No local row -> download from the KB and create a new tracked row.
     try:
         filename, content = download_kb_pdf(doc_id)
     except httpx.ConnectError:
@@ -407,6 +457,7 @@ def kb_add_pdf_resource(request):
         "id": resource.id,
         "title": resource.title,
         "filename": resource.file.name if resource.file else "",
+        "merged": False,
     })
 
 
@@ -551,7 +602,15 @@ def kb_add_pdf_to_mcp(request):
         resource.file.open("rb")
         file_bytes = resource.file.read()
         resource.file.close()
-        result = add_pdf_to_kb(file_bytes, resource.file.name.split("/")[-1], resource.title)
+        filename = resource.file.name.split("/")[-1]
+        # Re-ingest of an already-linked resource updates the existing KB doc by
+        # id (no duplicate); an unlinked resource is added as a new KB doc.
+        if resource.mcp_kb_document_id:
+            result = update_pdf_in_kb(
+                resource.mcp_kb_document_id, file_bytes, filename, resource.title
+            )
+        else:
+            result = add_pdf_to_kb(file_bytes, filename, resource.title)
         resource.mcp_kb_document_id = result.get("doc_id")
         resource.modifier = request.user
         resource.save(update_fields=["mcp_kb_document_id", "modifier", "modified_at"])