diff --git a/hospexplorer/ask/kb_connector.py b/hospexplorer/ask/kb_connector.py
index b374a55..9ddb27a 100644
--- a/hospexplorer/ask/kb_connector.py
+++ b/hospexplorer/ask/kb_connector.py
@@ -113,6 +113,60 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None):
raise last_exc
+def update_pdf_in_kb(doc_id, file_bytes, filename, title, url=None):
+ """Update an existing PDF in the MCP KB server by document id.
+
+ Calls POST /docs/pdf/update on the MCP KB server with multipart form data.
+ The KB server re-extracts and replaces the document's chunks in place,
+ keeping the same document id. Mirrors add_pdf_to_kb's retry policy.
+
+ Args:
+ doc_id: Id of the KB document to update; sent as the "doc_id" form
+ field after conversion with str().
+ file_bytes: PDF content, uploaded as the "file" part with content
+ type "application/pdf".
+ filename: Name attached to the uploaded file part; also used in the
+ warning log messages.
+ title: Sent as the "title" form field.
+ url: Optional; sent as the "url" form field only when truthy.
+
+ Returns:
+ The response body decoded with response.json().
+
+ Raises:
+ httpx.TimeoutException: Re-raised immediately, without retrying.
+ httpx.TransportError: The last non-timeout transport error, once
+ max(1, settings.KB_MCP_PDF_RETRIES) attempts have all failed.
+ httpx.HTTPStatusError: From response.raise_for_status() on an error
+ status; not caught here, so it is never retried.
+ """
+ headers = {
+ "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
+ }
+ endpoint = f"{settings.KB_MCP_HOST}/docs/pdf/update"
+
+ # form fields sent alongside the file part
+ data = {"doc_id": str(doc_id), "title": title}
+ if url:
+ data["url"] = url
+
+ # only retry on transport errors — a timeout
+ # likely means the KB received the file and is still processing it
+ # max(1, ...) guarantees at least one attempt even if the setting is 0 or negative
+ attempts = max(1, settings.KB_MCP_PDF_RETRIES)
+ last_exc = None
+ for attempt in range(1, attempts + 1):
+ # rebuild files each attempt: httpx consumes the stream on send
+ files = {"file": (filename, file_bytes, "application/pdf")}
+ try:
+ with httpx.Client() as client:
+ response = client.post(
+ endpoint,
+ headers=headers,
+ files=files,
+ data=data,
+ timeout=settings.KB_MCP_PDF_TIMEOUT,
+ )
+ response.raise_for_status()
+ return response.json()
+ # this handler must stay above the TransportError one: httpx.TimeoutException
+ # is a TransportError subclass and would otherwise be retried
+ except httpx.TimeoutException as e:
+ logger.warning(
+ "KB PDF update timed out for %s: %s; not retrying (KB may still be processing)",
+ filename, e,
+ )
+ raise
+ except httpx.TransportError as e:
+ last_exc = e
+ # out of attempts: leave the loop and re-raise below, skipping the sleep
+ if attempt == attempts:
+ break
+ # exponential backoff in seconds: 1, 2, 4, ...
+ backoff = 2 ** (attempt - 1)
+ logger.warning(
+ "KB PDF update failed (attempt %d/%d) for %s: %s; retrying in %ds",
+ attempt, attempts, filename, e, backoff,
+ )
+ time.sleep(backoff)
+
+ # reached only when every attempt failed with a non-timeout transport error
+ raise last_exc
+
+
def download_kb_pdf(doc_id):
"""Download the original PDF bytes for a KB document.
diff --git a/hospexplorer/ask/templates/kb/resources.html b/hospexplorer/ask/templates/kb/resources.html
index ea9374c..4b69883 100644
--- a/hospexplorer/ask/templates/kb/resources.html
+++ b/hospexplorer/ask/templates/kb/resources.html
@@ -522,29 +522,35 @@
PDFs in KB but not tracked internally
'X-CSRFToken': '{{ csrf_token }}',
'Content-Type': 'application/json',
},
- body: JSON.stringify({ url: doc.url, title: doc.title }),
+ body: JSON.stringify({ url: doc.url, title: doc.title, doc_id: doc.doc_id }),
});
const data = await response.json();
if (data.success) {
this.untracked = this.untracked.filter(d => d.doc_id !== doc.doc_id);
- const tbody = document.querySelector('#websites-pane .kb-table tbody');
- if (tbody) {
- const tr = document.createElement('tr');
- tr.className = 'kb-row-maroon';
- const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url;
- const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
- tr.innerHTML = `
- ${doc.title} |
- ${truncUrl} |
- ${now} |
- In Sync |
- {% if can_change %} | {% endif %}
- `;
- tbody.prepend(tr);
+ if (data.merged) {
+ // Linked an existing Hopper row — that row is already shown in
+ // the table, so refresh its status instead of adding a duplicate.
+ this.statusMap[data.id] = 'in_kb';
+ this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`);
+ } else {
+ const tbody = document.querySelector('#websites-pane .kb-table tbody');
+ if (tbody) {
+ const tr = document.createElement('tr');
+ tr.className = 'kb-row-maroon';
+ const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url;
+ const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
+ tr.innerHTML = `
+ ${doc.title} |
+ ${truncUrl} |
+ ${now} |
+ In Sync |
+ {% if can_change %} | {% endif %}
+ `;
+ tbody.prepend(tr);
+ }
+ this.showToast(`Now tracking "${doc.title}" in Hopper.`);
}
-
- this.showToast(`Now tracking "${doc.title}" in Hopper.`);
} else {
this.showToast(data.error || 'Failed to track resource.', 'error');
}
@@ -570,24 +576,29 @@ PDFs in KB but not tracked internally
this.untrackedPdfs = this.untrackedPdfs.filter(d => d.doc_id !== doc.doc_id);
this.pdfStatusMap[data.id] = 'in_kb';
- const tbody = document.querySelector('#pdfs-pane .kb-table tbody');
- if (tbody) {
- const tr = document.createElement('tr');
- tr.className = 'kb-row-maroon';
- const filename = data.filename || '';
- const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' : filename;
- const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
- tr.innerHTML = `
- ${data.title} |
- ${truncFilename} |
- ${now} |
- In Sync |
- {% if can_change_pdf %} | {% endif %}
- `;
- tbody.prepend(tr);
+ if (data.merged) {
+ // Linked an existing Hopper row — its row is already shown and its
+ // status was just set above, so don't add a duplicate row.
+ this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`);
+ } else {
+ const tbody = document.querySelector('#pdfs-pane .kb-table tbody');
+ if (tbody) {
+ const tr = document.createElement('tr');
+ tr.className = 'kb-row-maroon';
+ const filename = data.filename || '';
+ const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' : filename;
+ const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
+ tr.innerHTML = `
+ ${data.title} |
+ ${truncFilename} |
+ ${now} |
+ In Sync |
+ {% if can_change_pdf %} | {% endif %}
+ `;
+ tbody.prepend(tr);
+ }
+ this.showToast(`Now tracking "${doc.title}" in Hopper.`);
}
-
- this.showToast(`Now tracking "${doc.title}" in Hopper.`);
} else {
this.showToast(data.error || 'Failed to track PDF.', 'error');
}
diff --git a/hospexplorer/ask/views.py b/hospexplorer/ask/views.py
index 6135717..84cab08 100644
--- a/hospexplorer/ask/views.py
+++ b/hospexplorer/ask/views.py
@@ -15,7 +15,7 @@
from ask.tasks import run_llm_task
from django.core.files.base import ContentFile
-from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, delete_kb_document, download_kb_pdf
+from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, update_pdf_in_kb, delete_kb_document, download_kb_pdf
logger = logging.getLogger(__name__)
@@ -335,16 +335,36 @@ def kb_add_resource(request):
url = body.get("url", "").strip()
title = body.get("title", "").strip()
+ doc_id = body.get("doc_id")
if not url:
return JsonResponse({"success": False, "error": "URL is required."}, status=400)
+ try:
+ doc_id = int(doc_id) if doc_id is not None else None
+ except (TypeError, ValueError):
+ doc_id = None
+
+ # Merge, don't reject: if a Hopper resource already exists for this URL
+ # (e.g. created but never linked to the KB), link it to the KB doc instead
+ # of creating a duplicate. .first() is safe against pre-existing duplicates.
+ existing = WebsiteResource.objects.filter(url=url).first()
+ if existing is not None:
+ update_fields = ["modifier", "modified_at"]
+ existing.modifier = request.user
+ if doc_id is not None:
+ existing.mcp_kb_document_id = doc_id
+ update_fields.append("mcp_kb_document_id")
+ existing.save(update_fields=update_fields)
+ return JsonResponse({"success": True, "id": existing.id, "merged": True})
+
resource = WebsiteResource.objects.create(
url=url,
title=title or url,
+ mcp_kb_document_id=doc_id,
creator=request.user,
modifier=request.user,
)
- return JsonResponse({"success": True, "id": resource.id})
+ return JsonResponse({"success": True, "id": resource.id, "merged": False})
@login_required
@@ -374,9 +394,39 @@ def kb_add_pdf_resource(request):
except (TypeError, ValueError):
return JsonResponse({"success": False, "error": "doc_id is required."}, status=400)
- if PDFResource.objects.filter(mcp_kb_document_id=doc_id).exists():
- return JsonResponse({"success": False, "error": "Already tracked in Hopper."}, status=400)
+ # Merge, don't reject. 1) Already linked to this KB doc -> idempotent no-op.
+ already = PDFResource.objects.filter(mcp_kb_document_id=doc_id).first()
+ if already is not None:
+ return JsonResponse({
+ "success": True,
+ "id": already.id,
+ "title": already.title,
+ "filename": already.file.name if already.file else "",
+ "merged": True,
+ })
+
+ # 2) An unlinked local row whose title matches the KB doc (e.g. a zip upload
+ # that never linked to the KB) -> link it in place, no re-download.
+ if title:
+ candidate = PDFResource.objects.filter(
+ mcp_kb_document_id__isnull=True, title=title
+ ).first()
+ if candidate is not None:
+ candidate.mcp_kb_document_id = doc_id
+ candidate.modifier = request.user
+ candidate.status = PDFResource.Status.SUCCESS
+ candidate.save(update_fields=[
+ "mcp_kb_document_id", "modifier", "status", "modified_at",
+ ])
+ return JsonResponse({
+ "success": True,
+ "id": candidate.id,
+ "title": candidate.title,
+ "filename": candidate.file.name if candidate.file else "",
+ "merged": True,
+ })
+ # 3) No local row -> download from the KB and create a new tracked row.
try:
filename, content = download_kb_pdf(doc_id)
except httpx.ConnectError:
@@ -407,6 +457,7 @@ def kb_add_pdf_resource(request):
"id": resource.id,
"title": resource.title,
"filename": resource.file.name if resource.file else "",
+ "merged": False,
})
@@ -551,7 +602,15 @@ def kb_add_pdf_to_mcp(request):
resource.file.open("rb")
file_bytes = resource.file.read()
resource.file.close()
- result = add_pdf_to_kb(file_bytes, resource.file.name.split("/")[-1], resource.title)
+ filename = resource.file.name.split("/")[-1]
+ # Re-ingest of an already-linked resource updates the existing KB doc by
+ # id (no duplicate); an unlinked resource is added as a new KB doc.
+ if resource.mcp_kb_document_id:
+ result = update_pdf_in_kb(
+ resource.mcp_kb_document_id, file_bytes, filename, resource.title
+ )
+ else:
+ result = add_pdf_to_kb(file_bytes, filename, resource.title)
resource.mcp_kb_document_id = result.get("doc_id")
resource.modifier = request.user
resource.save(update_fields=["mcp_kb_document_id", "modifier", "modified_at"])