def update_pdf_in_kb(doc_id, file_bytes, filename, title, url=None):
    """Re-upload a PDF for an existing document on the MCP KB server.

    Issues a multipart ``POST`` to ``/docs/pdf/update`` carrying the target
    ``doc_id``, the new ``title``, the optional ``url`` and the PDF bytes.
    Per the endpoint's contract (not verifiable from this module), the KB
    server re-extracts the PDF and swaps the document's chunks in place, so
    the document id is unchanged. The retry behaviour follows the same policy
    as ``add_pdf_to_kb``.

    Args:
        doc_id: Identifier of the KB document to replace; sent as a string.
        file_bytes: Raw PDF content to upload.
        filename: File name reported in the multipart ``file`` part.
        title: Title sent alongside the document.
        url: Optional source URL; only included in the form when truthy.

    Returns:
        The decoded JSON body of the KB server's response.

    Raises:
        httpx.TimeoutException: Re-raised immediately, without retrying.
        httpx.TransportError: The last transport failure once all attempts
            have been used up.
        httpx.HTTPStatusError: From ``raise_for_status()`` on an error
            response; it is not a transport error, so it is never retried.
    """
    auth_headers = {
        "Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
    }
    target = f"{settings.KB_MCP_HOST}/docs/pdf/update"

    form_fields = {"doc_id": str(doc_id), "title": title}
    if url:
        form_fields["url"] = url

    # Always make at least one attempt, even if the setting is zero/negative.
    max_attempts = max(1, settings.KB_MCP_PDF_RETRIES)
    failure = None
    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        # A fresh multipart payload is assembled for every attempt.
        upload = {"file": (filename, file_bytes, "application/pdf")}
        try:
            with httpx.Client() as client:
                response = client.post(
                    target,
                    headers=auth_headers,
                    files=upload,
                    data=form_fields,
                    timeout=settings.KB_MCP_PDF_TIMEOUT,
                )
                response.raise_for_status()
                return response.json()
        except httpx.TimeoutException as exc:
            # Must be handled before TransportError (its parent class): a
            # timeout likely means the KB already has the file and is still
            # working on it, so a retry could submit it twice.
            logger.warning(
                "KB PDF update timed out for %s: %s; not retrying (KB may still be processing)",
                filename, exc,
            )
            raise
        except httpx.TransportError as exc:
            failure = exc
            if attempt < max_attempts:
                # Exponential backoff: 1s, 2s, 4s, ...
                delay = 2 ** (attempt - 1)
                logger.warning(
                    "KB PDF update failed (attempt %d/%d) for %s: %s; retrying in %ds",
                    attempt, max_attempts, filename, exc, delay,
                )
                time.sleep(delay)

    # Every attempt ended in a transport error; surface the most recent one.
    raise failure
diff --git a/hospexplorer/ask/templates/kb/resources.html b/hospexplorer/ask/templates/kb/resources.html index ea9374c..4b69883 100644 --- a/hospexplorer/ask/templates/kb/resources.html +++ b/hospexplorer/ask/templates/kb/resources.html @@ -522,29 +522,35 @@
PDFs in KB but not tracked internally
'X-CSRFToken': '{{ csrf_token }}', 'Content-Type': 'application/json', }, - body: JSON.stringify({ url: doc.url, title: doc.title }), + body: JSON.stringify({ url: doc.url, title: doc.title, doc_id: doc.doc_id }), }); const data = await response.json(); if (data.success) { this.untracked = this.untracked.filter(d => d.doc_id !== doc.doc_id); - const tbody = document.querySelector('#websites-pane .kb-table tbody'); - if (tbody) { - const tr = document.createElement('tr'); - tr.className = 'kb-row-maroon'; - const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url; - const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' }); - tr.innerHTML = ` - ${doc.title} - ${truncUrl} - ${now} - In Sync - {% if can_change %}{% endif %} - `; - tbody.prepend(tr); + if (data.merged) { + // Linked an existing Hopper row — that row is already shown in + // the table, so refresh its status instead of adding a duplicate. + this.statusMap[data.id] = 'in_kb'; + this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`); + } else { + const tbody = document.querySelector('#websites-pane .kb-table tbody'); + if (tbody) { + const tr = document.createElement('tr'); + tr.className = 'kb-row-maroon'; + const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url; + const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' }); + tr.innerHTML = ` + ${doc.title} + ${truncUrl} + ${now} + In Sync + {% if can_change %}{% endif %} + `; + tbody.prepend(tr); + } + this.showToast(`Now tracking "${doc.title}" in Hopper.`); } - - this.showToast(`Now tracking "${doc.title}" in Hopper.`); } else { this.showToast(data.error || 'Failed to track resource.', 'error'); } @@ -570,24 +576,29 @@
PDFs in KB but not tracked internally
this.untrackedPdfs = this.untrackedPdfs.filter(d => d.doc_id !== doc.doc_id); this.pdfStatusMap[data.id] = 'in_kb'; - const tbody = document.querySelector('#pdfs-pane .kb-table tbody'); - if (tbody) { - const tr = document.createElement('tr'); - tr.className = 'kb-row-maroon'; - const filename = data.filename || ''; - const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' : filename; - const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' }); - tr.innerHTML = ` - ${data.title} - ${truncFilename} - ${now} - In Sync - {% if can_change_pdf %}{% endif %} - `; - tbody.prepend(tr); + if (data.merged) { + // Linked an existing Hopper row — its row is already shown and its + // status was just set above, so don't add a duplicate row. + this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`); + } else { + const tbody = document.querySelector('#pdfs-pane .kb-table tbody'); + if (tbody) { + const tr = document.createElement('tr'); + tr.className = 'kb-row-maroon'; + const filename = data.filename || ''; + const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' 
: filename; + const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' }); + tr.innerHTML = ` + ${data.title} + ${truncFilename} + ${now} + In Sync + {% if can_change_pdf %}{% endif %} + `; + tbody.prepend(tr); + } + this.showToast(`Now tracking "${doc.title}" in Hopper.`); } - - this.showToast(`Now tracking "${doc.title}" in Hopper.`); } else { this.showToast(data.error || 'Failed to track PDF.', 'error'); } diff --git a/hospexplorer/ask/views.py b/hospexplorer/ask/views.py index 6135717..84cab08 100644 --- a/hospexplorer/ask/views.py +++ b/hospexplorer/ask/views.py @@ -15,7 +15,7 @@ from ask.tasks import run_llm_task from django.core.files.base import ContentFile -from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, delete_kb_document, download_kb_pdf +from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, update_pdf_in_kb, delete_kb_document, download_kb_pdf logger = logging.getLogger(__name__) @@ -335,16 +335,36 @@ def kb_add_resource(request): url = body.get("url", "").strip() title = body.get("title", "").strip() + doc_id = body.get("doc_id") if not url: return JsonResponse({"success": False, "error": "URL is required."}, status=400) + try: + doc_id = int(doc_id) if doc_id is not None else None + except (TypeError, ValueError): + doc_id = None + + # merge, don't reject if a Hopper resource already exists for this URL + # (e.g. created but never linked to the KB), link it to the KB doc instead + # of creating a duplicate. 
.first() is safe against pre-existing duplicates + existing = WebsiteResource.objects.filter(url=url).first() + if existing is not None: + update_fields = ["modifier", "modified_at"] + existing.modifier = request.user + if doc_id is not None: + existing.mcp_kb_document_id = doc_id + update_fields.append("mcp_kb_document_id") + existing.save(update_fields=update_fields) + return JsonResponse({"success": True, "id": existing.id, "merged": True}) + resource = WebsiteResource.objects.create( url=url, title=title or url, + mcp_kb_document_id=doc_id, creator=request.user, modifier=request.user, ) - return JsonResponse({"success": True, "id": resource.id}) + return JsonResponse({"success": True, "id": resource.id, "merged": False}) @login_required @@ -374,9 +394,39 @@ def kb_add_pdf_resource(request): except (TypeError, ValueError): return JsonResponse({"success": False, "error": "doc_id is required."}, status=400) - if PDFResource.objects.filter(mcp_kb_document_id=doc_id).exists(): - return JsonResponse({"success": False, "error": "Already tracked in Hopper."}, status=400) + # Merge, don't reject. 1) Already linked to this KB doc -> idempotent no-op. + already = PDFResource.objects.filter(mcp_kb_document_id=doc_id).first() + if already is not None: + return JsonResponse({ + "success": True, + "id": already.id, + "title": already.title, + "filename": already.file.name if already.file else "", + "merged": True, + }) + + # 2) An unlinked local row whose title matches the KB doc (e.g. a zip upload + # that never linked to the KB) -> link it in place, no re-download. 
+ if title: + candidate = PDFResource.objects.filter( + mcp_kb_document_id__isnull=True, title=title + ).first() + if candidate is not None: + candidate.mcp_kb_document_id = doc_id + candidate.modifier = request.user + candidate.status = PDFResource.Status.SUCCESS + candidate.save(update_fields=[ + "mcp_kb_document_id", "modifier", "status", "modified_at", + ]) + return JsonResponse({ + "success": True, + "id": candidate.id, + "title": candidate.title, + "filename": candidate.file.name if candidate.file else "", + "merged": True, + }) + # 3) No local row -> download from the KB and create a new tracked row. try: filename, content = download_kb_pdf(doc_id) except httpx.ConnectError: @@ -407,6 +457,7 @@ def kb_add_pdf_resource(request): "id": resource.id, "title": resource.title, "filename": resource.file.name if resource.file else "", + "merged": False, }) @@ -551,7 +602,15 @@ def kb_add_pdf_to_mcp(request): resource.file.open("rb") file_bytes = resource.file.read() resource.file.close() - result = add_pdf_to_kb(file_bytes, resource.file.name.split("/")[-1], resource.title) + filename = resource.file.name.split("/")[-1] + # Re-ingest of an already-linked resource updates the existing KB doc by + # id (no duplicate); an unlinked resource is added as a new KB doc. + if resource.mcp_kb_document_id: + result = update_pdf_in_kb( + resource.mcp_kb_document_id, file_bytes, filename, resource.title + ) + else: + result = add_pdf_to_kb(file_bytes, filename, resource.title) resource.mcp_kb_document_id = result.get("doc_id") resource.modifier = request.user resource.save(update_fields=["mcp_kb_document_id", "modifier", "modified_at"])