Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions hospexplorer/ask/kb_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,60 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None):
raise last_exc


def update_pdf_in_kb(doc_id, file_bytes, filename, title, url=None):
    """Replace the contents of an existing KB PDF, addressed by document id.

    Sends a multipart POST to ``/docs/pdf/update`` on the MCP KB server with
    the PDF bytes plus ``doc_id``, ``title`` and (when given) ``url`` form
    fields. The KB server is expected to re-extract the PDF and replace the
    document's chunks in place, keeping the same document id.

    Retry policy (intended to mirror add_pdf_to_kb):
      * a timeout is logged and re-raised immediately, because the KB has
        likely received the file and is still processing it;
      * any other transport error is retried, up to
        ``settings.KB_MCP_PDF_RETRIES`` attempts (at least one), sleeping
        1s, 2s, 4s, ... between attempts; the last error is raised once the
        attempts are used up.

    Returns the decoded JSON body of the KB response. An HTTP error status
    propagates from ``response.raise_for_status()`` without being retried.
    """
    auth_headers = {"Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}"}
    update_url = f"{settings.KB_MCP_HOST}/docs/pdf/update"

    form_fields = {"doc_id": str(doc_id), "title": title}
    if url:
        form_fields["url"] = url

    max_attempts = max(1, settings.KB_MCP_PDF_RETRIES)
    last_error = None
    for completed in range(max_attempts):
        attempt_no = completed + 1
        # Build the multipart payload afresh for every attempt so each send
        # starts from an unconsumed upload.
        upload = {"file": (filename, file_bytes, "application/pdf")}
        try:
            with httpx.Client() as client:
                response = client.post(
                    update_url,
                    headers=auth_headers,
                    files=upload,
                    data=form_fields,
                    timeout=settings.KB_MCP_PDF_TIMEOUT,
                )
                response.raise_for_status()
                return response.json()
        except httpx.TimeoutException as exc:
            # Must be handled before TransportError (its parent class):
            # timeouts are never retried.
            logger.warning(
                "KB PDF update timed out for %s: %s; not retrying (KB may still be processing)",
                filename, exc,
            )
            raise
        except httpx.TransportError as exc:
            last_error = exc
            if attempt_no < max_attempts:
                delay = 2 ** completed
                logger.warning(
                    "KB PDF update failed (attempt %d/%d) for %s: %s; retrying in %ds",
                    attempt_no, max_attempts, filename, exc, delay,
                )
                time.sleep(delay)

    # Every attempt ended in a (non-timeout) transport error.
    raise last_error


def download_kb_pdf(doc_id):
"""Download the original PDF bytes for a KB document.

Expand Down
79 changes: 45 additions & 34 deletions hospexplorer/ask/templates/kb/resources.html
Original file line number Diff line number Diff line change
Expand Up @@ -522,29 +522,35 @@ <h5 class="kb-section-heading">PDFs in KB but not tracked internally</h5>
'X-CSRFToken': '{{ csrf_token }}',
'Content-Type': 'application/json',
},
body: JSON.stringify({ url: doc.url, title: doc.title }),
body: JSON.stringify({ url: doc.url, title: doc.title, doc_id: doc.doc_id }),
});
const data = await response.json();
if (data.success) {
this.untracked = this.untracked.filter(d => d.doc_id !== doc.doc_id);

const tbody = document.querySelector('#websites-pane .kb-table tbody');
if (tbody) {
const tr = document.createElement('tr');
tr.className = 'kb-row-maroon';
const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url;
const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
tr.innerHTML = `
<td>${doc.title}</td>
<td><a href="${doc.url}" target="_blank" rel="noopener" class="kb-link text-truncate d-inline-block" style="max-width: 300px;">${truncUrl}</a></td>
<td class="text-muted">${now}</td>
<td>In Sync</td>
{% if can_change %}<td></td>{% endif %}
`;
tbody.prepend(tr);
if (data.merged) {
// Linked an existing Hopper row — that row is already shown in
// the table, so refresh its status instead of adding a duplicate.
this.statusMap[data.id] = 'in_kb';
this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`);
} else {
const tbody = document.querySelector('#websites-pane .kb-table tbody');
if (tbody) {
const tr = document.createElement('tr');
tr.className = 'kb-row-maroon';
const truncUrl = doc.url.length > 60 ? doc.url.slice(0, 57) + '...' : doc.url;
const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
tr.innerHTML = `
<td>${doc.title}</td>
<td><a href="${doc.url}" target="_blank" rel="noopener" class="kb-link text-truncate d-inline-block" style="max-width: 300px;">${truncUrl}</a></td>
<td class="text-muted">${now}</td>
<td>In Sync</td>
{% if can_change %}<td></td>{% endif %}
`;
tbody.prepend(tr);
}
this.showToast(`Now tracking "${doc.title}" in Hopper.`);
}

this.showToast(`Now tracking "${doc.title}" in Hopper.`);
} else {
this.showToast(data.error || 'Failed to track resource.', 'error');
}
Expand All @@ -570,24 +576,29 @@ <h5 class="kb-section-heading">PDFs in KB but not tracked internally</h5>
this.untrackedPdfs = this.untrackedPdfs.filter(d => d.doc_id !== doc.doc_id);
this.pdfStatusMap[data.id] = 'in_kb';

const tbody = document.querySelector('#pdfs-pane .kb-table tbody');
if (tbody) {
const tr = document.createElement('tr');
tr.className = 'kb-row-maroon';
const filename = data.filename || '';
const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' : filename;
const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
tr.innerHTML = `
<td>${data.title}</td>
<td class="text-muted">${truncFilename}</td>
<td class="text-muted">${now}</td>
<td>In Sync</td>
{% if can_change_pdf %}<td></td>{% endif %}
`;
tbody.prepend(tr);
if (data.merged) {
// Linked an existing Hopper row — its row is already shown and its
// status was just set above, so don't add a duplicate row.
this.showToast(`Linked "${doc.title}" to the existing Hopper resource.`);
} else {
const tbody = document.querySelector('#pdfs-pane .kb-table tbody');
if (tbody) {
const tr = document.createElement('tr');
tr.className = 'kb-row-maroon';
const filename = data.filename || '';
const truncFilename = filename.length > 40 ? filename.slice(0, 37) + '...' : filename;
const now = new Date().toLocaleString('en-US', { month: 'short', day: 'numeric', year: 'numeric', hour: 'numeric', minute: '2-digit' });
tr.innerHTML = `
<td>${data.title}</td>
<td class="text-muted">${truncFilename}</td>
<td class="text-muted">${now}</td>
<td>In Sync</td>
{% if can_change_pdf %}<td></td>{% endif %}
`;
tbody.prepend(tr);
}
this.showToast(`Now tracking "${doc.title}" in Hopper.`);
}

this.showToast(`Now tracking "${doc.title}" in Hopper.`);
} else {
this.showToast(data.error || 'Failed to track PDF.', 'error');
}
Expand Down
69 changes: 64 additions & 5 deletions hospexplorer/ask/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from ask.tasks import run_llm_task
from django.core.files.base import ContentFile

from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, delete_kb_document, download_kb_pdf
from ask.kb_connector import list_kb_documents, add_website_to_kb, add_pdf_to_kb, update_pdf_in_kb, delete_kb_document, download_kb_pdf

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -335,16 +335,36 @@ def kb_add_resource(request):

url = body.get("url", "").strip()
title = body.get("title", "").strip()
doc_id = body.get("doc_id")
if not url:
return JsonResponse({"success": False, "error": "URL is required."}, status=400)

try:
doc_id = int(doc_id) if doc_id is not None else None
except (TypeError, ValueError):
doc_id = None

# Merge, don't reject: if a Hopper resource already exists for this URL
# (e.g. created but never linked to the KB), link it to the KB doc instead
# of creating a duplicate. .first() is safe against pre-existing duplicates.
existing = WebsiteResource.objects.filter(url=url).first()
if existing is not None:
update_fields = ["modifier", "modified_at"]
existing.modifier = request.user
if doc_id is not None:
existing.mcp_kb_document_id = doc_id
update_fields.append("mcp_kb_document_id")
existing.save(update_fields=update_fields)
return JsonResponse({"success": True, "id": existing.id, "merged": True})

resource = WebsiteResource.objects.create(
url=url,
title=title or url,
mcp_kb_document_id=doc_id,
creator=request.user,
modifier=request.user,
)
return JsonResponse({"success": True, "id": resource.id})
return JsonResponse({"success": True, "id": resource.id, "merged": False})


@login_required
Expand Down Expand Up @@ -374,9 +394,39 @@ def kb_add_pdf_resource(request):
except (TypeError, ValueError):
return JsonResponse({"success": False, "error": "doc_id is required."}, status=400)

if PDFResource.objects.filter(mcp_kb_document_id=doc_id).exists():
return JsonResponse({"success": False, "error": "Already tracked in Hopper."}, status=400)
# Merge, don't reject. 1) Already linked to this KB doc -> idempotent no-op.
already = PDFResource.objects.filter(mcp_kb_document_id=doc_id).first()
if already is not None:
return JsonResponse({
"success": True,
"id": already.id,
"title": already.title,
"filename": already.file.name if already.file else "",
"merged": True,
})

# 2) An unlinked local row whose title matches the KB doc (e.g. a zip upload
# that never linked to the KB) -> link it in place, no re-download.
if title:
candidate = PDFResource.objects.filter(
mcp_kb_document_id__isnull=True, title=title
).first()
if candidate is not None:
candidate.mcp_kb_document_id = doc_id
candidate.modifier = request.user
candidate.status = PDFResource.Status.SUCCESS
candidate.save(update_fields=[
"mcp_kb_document_id", "modifier", "status", "modified_at",
])
return JsonResponse({
"success": True,
"id": candidate.id,
"title": candidate.title,
"filename": candidate.file.name if candidate.file else "",
"merged": True,
})

# 3) No local row -> download from the KB and create a new tracked row.
try:
filename, content = download_kb_pdf(doc_id)
except httpx.ConnectError:
Expand Down Expand Up @@ -407,6 +457,7 @@ def kb_add_pdf_resource(request):
"id": resource.id,
"title": resource.title,
"filename": resource.file.name if resource.file else "",
"merged": False,
})


Expand Down Expand Up @@ -551,7 +602,15 @@ def kb_add_pdf_to_mcp(request):
resource.file.open("rb")
file_bytes = resource.file.read()
resource.file.close()
result = add_pdf_to_kb(file_bytes, resource.file.name.split("/")[-1], resource.title)
filename = resource.file.name.split("/")[-1]
# Re-ingest of an already-linked resource updates the existing KB doc by
# id (no duplicate); an unlinked resource is added as a new KB doc.
if resource.mcp_kb_document_id:
result = update_pdf_in_kb(
resource.mcp_kb_document_id, file_bytes, filename, resource.title
)
else:
result = add_pdf_to_kb(file_bytes, filename, resource.title)
resource.mcp_kb_document_id = result.get("doc_id")
resource.modifier = request.user
resource.save(update_fields=["mcp_kb_document_id", "modifier", "modified_at"])
Expand Down