Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion src/ks_xlsx_parser/chunking/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import logging

from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary
from ks_xlsx_parser.models.common import CellCoord, EdgeType
from ks_xlsx_parser.models.common import CellCoord, EdgeType, col_number_to_letter
from ks_xlsx_parser.models.sheet import SheetDTO
from ks_xlsx_parser.models.workbook import WorkbookDTO
from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer
Expand Down Expand Up @@ -154,6 +154,28 @@ def _block_to_chunk(
for coord in block.key_cells
]

# Hidden-content provenance: record which parts of this chunk Excel
# hides — the whole sheet, specific rows, or specific columns — so
# downstream consumers can flag or filter hidden data instead of
# silently treating it as visible. Rows/cols are scoped to this
# chunk's range; the renderers also mark them inline ([hidden]).
rng = block.cell_range
metadata: dict[str, object] = {}
if sheet.properties.is_hidden:
metadata["sheet_hidden"] = True
hidden_rows = sorted(
r for r in sheet.hidden_rows
if rng.top_left.row <= r <= rng.bottom_right.row
)
if hidden_rows:
metadata["hidden_rows"] = hidden_rows
hidden_cols = sorted(
c for c in sheet.hidden_cols
if rng.top_left.col <= c <= rng.bottom_right.col
)
if hidden_cols:
metadata["hidden_cols"] = [col_number_to_letter(c) for c in hidden_cols]

return ChunkDTO(
sheet_name=block.sheet_name,
block_type=block.block_type,
Expand All @@ -166,6 +188,7 @@ def _block_to_chunk(
render_html=render_html,
render_text=render_text,
token_count=token_count,
metadata=metadata,
)

def _chart_to_chunk(self, chart) -> ChunkDTO:
Expand Down
1 change: 1 addition & 0 deletions src/ks_xlsx_parser/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def to_json(self) -> dict[str, Any]:
"cells": _chunk_cells(c, self.workbook),
"key_cells": c.key_cells,
"named_ranges": c.named_ranges,
"metadata": c.metadata,
"dependency_summary": {
"upstream_refs": c.dependency_summary.upstream_refs,
"downstream_refs": c.dependency_summary.downstream_refs,
Expand Down
16 changes: 9 additions & 7 deletions src/ks_xlsx_parser/rendering/html_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,30 +67,30 @@ def render_block(self, block: BlockDTO) -> str:
BlockType.ASSUMPTIONS_TABLE,
)

sheet_hidden_attr = (
' data-sheet-hidden="true"' if self._sheet.properties.is_hidden else ""
)
parts: list[str] = []
parts.append(
f'<table data-sheet="{html.escape(block.sheet_name)}" '
f'data-range="{rng.to_a1()}" '
f'data-block-type="{block.block_type.value}">'
f'data-block-type="{block.block_type.value}"{sheet_hidden_attr}>'
)

for row_idx, row in enumerate(rows):
if row in self._sheet.hidden_rows:
continue
# Hidden rows are emitted (flagged data-hidden) rather than dropped.
row_hidden = row in self._sheet.hidden_rows

is_header_row = row_idx == 0 and is_first_row_header
tag = "th" if is_header_row else "td"
wrapper = "thead" if is_header_row else "tbody"

if row_idx == 0 and is_header_row:
parts.append("<thead>")
elif row_idx == 1 and is_first_row_header:
parts.append("<tbody>")

parts.append("<tr>")
parts.append('<tr data-hidden="true">' if row_hidden else "<tr>")
for col in cols:
if col in self._sheet.hidden_cols:
continue
if (row, col) in skip_cells:
continue

Expand All @@ -106,6 +106,8 @@ def render_block(self, block: BlockDTO) -> str:

# Build cell attributes
attrs = [f'data-ref="{cell_ref}"']
if row_hidden or col in self._sheet.hidden_cols:
attrs.append('data-hidden="true"')
if rowspan > 1:
attrs.append(f'rowspan="{rowspan}"')
if colspan > 1:
Expand Down
155 changes: 94 additions & 61 deletions src/ks_xlsx_parser/rendering/text_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,37 +101,78 @@ def __init__(self, sheet: SheetDTO):

def render_block(self, block: BlockDTO) -> str:
"""
Render a block as plain text with coordinate context.

Format:
[Sheet1!A1:D10] (table: "SalesData")
| A | B | C | D |
|----------|---------|--------|---------|
| Product | Q1 | Q2 | Q3 |
| Widget A | 100 | 150 | 200 |
...
Render a block as a plain-text / Markdown table with coordinate
context.

The grid is a *standard* Markdown table whose header row holds the
block's real column names (for ``TABLE`` / ``ASSUMPTIONS_TABLE``
blocks, mirroring :class:`HtmlRenderer`'s ``<thead>`` behaviour).
Excel column letters are published on the bracket line as a
``cols:`` map rather than occupying the header row, and a leading
``row`` gutter carries the Excel row number of every line. Together
the ``cols:`` map and the row gutter let an agent reconstruct a full
A1 reference (column ``Amount`` + row ``3`` → ``B3``) without the
column letters masquerading as the table's headers and defeating
downstream header detection.

Hidden rows and columns are *included* (not dropped) and flagged
``[hidden]`` — in the gutter for rows, in the ``cols:`` map for
columns.

Format::

[Sheet1!A1:D3] (table) cols: A=Product, B=Q1, C=Q2, D=Q3
| row | Product | Q1 | Q2 | Q3 |
|-----|----------|-----|-----|-----|
| 2 | Widget A | 100 | 150 | 200 |
"""
rng = block.cell_range
rows = range(rng.top_left.row, rng.bottom_right.row + 1)
cols = range(rng.top_left.col, rng.bottom_right.col + 1)
rows = list(range(rng.top_left.row, rng.bottom_right.row + 1))
cols = list(range(rng.top_left.col, rng.bottom_right.col + 1))

# Mirror HtmlRenderer: for these block types the first row carries the
# real column names, so it becomes the Markdown header row.
first_row_is_header = block.block_type in (
BlockType.TABLE,
BlockType.ASSUMPTIONS_TABLE,
)

lines: list[str] = []

# Header with location and type
# --- Bracket line + column-letter map ------------------------------
type_label = block.block_type.value.replace("_", " ")
header = f"[{block.sheet_name}!{rng.to_a1()}] ({type_label})"
if self._sheet.properties.is_hidden:
header += " [hidden sheet]"
if block.table_name:
header += f' table: "{block.table_name}"'

# Column letters live here (not as a grid row) so the grid header is
# free to hold real names while an agent can still map name → letter.
col_descs: list[str] = []
for col in cols:
desc = col_number_to_letter(col)
if first_row_is_header:
name_cell = self._sheet.get_cell(rng.top_left.row, col)
name = (
_flatten_cell_text(_cell_render_value(name_cell))
if name_cell
else ""
)
if name:
desc = f"{desc}={name}"
if col in self._sheet.hidden_cols:
desc += " [hidden]"
col_descs.append(desc)
header += " cols: " + ", ".join(col_descs)
lines.append(header)

# Compute column widths using the SAME rendering rules the data
# rows will use, including the trailing `[=]` formula marker.
# Otherwise `[=]` inflates a cell past col_width post-hoc and
# spuriously triggers the long-value fallback below.
# Otherwise `[=]` inflates a cell past col_width post-hoc.
col_widths: dict[int, int] = {}
for col in cols:
col_letter = col_number_to_letter(col)
max_width = len(col_letter)
max_width = len(col_number_to_letter(col))
for row in rows:
cell = self._sheet.get_cell(row, col)
if cell is None:
Expand All @@ -143,63 +184,55 @@ def render_block(self, block: BlockDTO) -> str:
max_width = max(max_width, len(val))
col_widths[col] = min(max_width, 30) # Cap at 30 for alignment; text may overflow

# Column header row
col_headers = []
for col in cols:
if col in self._sheet.hidden_cols:
continue
letter = col_number_to_letter(col)
col_headers.append(letter.ljust(col_widths[col]))
lines.append("| " + " | ".join(col_headers) + " |")
lines.append(
"|-" + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols) + "-|"
)

# Data rows
is_first_data = True
# Row-number gutter: gives every value a row coordinate so an agent
# can form a full A1 reference. Hidden rows are flagged here.
gutter_header = "row"
gutter: dict[int, str] = {}
for row in rows:
label = str(row)
if row in self._sheet.hidden_rows:
continue
label += " [hidden]"
gutter[row] = label
gutter_width = max([len(gutter_header), *(len(g) for g in gutter.values())])

def _row(gutter_cell: str, values: list[str]) -> str:
return "| " + " | ".join([gutter_cell.ljust(gutter_width), *values]) + " |"

def _sep() -> str:
return (
"|-"
+ "-|-".join(["-" * gutter_width, *("-" * col_widths[c] for c in cols)])
+ "-|"
)

def _cells(row: int) -> list[str]:
values = []
for col in cols:
if col in self._sheet.hidden_cols:
continue
cell = self._sheet.get_cell(row, col)
val = _cell_render_value(cell) if cell else ""

if cell and cell.formula and not val.startswith("="):
val = f"{val} [=]"

# Markdown table rows are single-line; collapse embedded newlines
# Markdown rows are single-line; collapse embedded newlines
# (common in headers like "租金\n天数") so they don't break the grid.
val = _flatten_cell_text(val)

# Long-value fallback: only triggers if the rendered string
# genuinely exceeds the (now consistently-computed) column
# width — i.e. the column was capped at 30. We still emit
# the full retrieval value (no truncation) and let the
# alignment overflow; truncating destroys retrievability.
# Full retrieval value (no truncation); alignment may overflow.
values.append(val.ljust(col_widths[col]))

line = "| " + " | ".join(values) + " |"
lines.append(line)

# Add separator after first row if it looks like a header
if is_first_data and block.block_type in (
BlockType.TABLE,
BlockType.ASSUMPTIONS_TABLE,
):
lines.append(
"|-"
+ "-|-".join(
"-" * col_widths[c]
for c in cols
if c not in self._sheet.hidden_cols
)
+ "-|"
)
is_first_data = False
return values

# Header row: real first row for tables, else Excel column letters.
if first_row_is_header:
lines.append(_row(gutter_header, _cells(rng.top_left.row)))
lines.append(_sep())
data_rows = rows[1:]
else:
letters = [col_number_to_letter(c).ljust(col_widths[c]) for c in cols]
lines.append(_row(gutter_header, letters))
lines.append(_sep())
data_rows = rows

# Data rows (hidden rows/cols included; hidden rows flagged in gutter).
for row in data_rows:
lines.append(_row(gutter[row], _cells(row)))

return "\n".join(lines)

Expand Down
82 changes: 82 additions & 0 deletions tests/test_rendering.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,27 @@ def test_data_ref_attributes(self, simple_workbook):

assert 'data-ref="A1"' in html

def test_hidden_cells_included_and_flagged(self, hidden_rows_cols_workbook):
    """Hidden rows and columns survive HTML rendering and are flagged.

    The HTML renderer must emit hidden cells marked with ``data-hidden``
    instead of dropping them, matching the text renderer.
    """
    from ks_xlsx_parser.pipeline import parse_workbook

    result = parse_workbook(str(hidden_rows_cols_workbook))
    rendered = "\n".join(chunk.render_html for chunk in result.chunks)

    # The flag itself must be present somewhere in the combined markup.
    assert 'data-hidden="true"' in rendered
    # Content of hidden row 3 is still rendered.
    assert "R3C1" in rendered
    # Content of hidden column B is still rendered.
    assert "R1C2" in rendered

def test_hidden_sheet_flagged_on_table(self, multi_sheet_workbook):
    """Every table rendered from a hidden worksheet has `data-sheet-hidden`."""
    from ks_xlsx_parser.pipeline import parse_workbook

    result = parse_workbook(str(multi_sheet_workbook))
    hidden_tables = [
        chunk.render_html
        for chunk in result.chunks
        if chunk.sheet_name == "Hidden"
    ]

    # Guard against a vacuous pass: the hidden sheet must yield chunks.
    assert hidden_tables
    for markup in hidden_tables:
        assert 'data-sheet-hidden="true"' in markup


class TestTextRendering:
"""Test plain text / markdown rendering."""
Expand Down Expand Up @@ -150,3 +171,64 @@ def test_numeric_cells_render_raw_not_display_formatted(self):
assert "0.002668" in text
assert "e-03" not in text
assert "e+03" not in text

def test_table_header_uses_real_names_not_column_letters(self, table_workbook):
    """The Markdown grid header holds the table's real column names.

    Excel column letters are published as a ``cols:`` map on the bracket
    line, and the grid starts with a leading ``row`` gutter, so downstream
    "find the real header" logic sees 'Product', not 'A'.
    """
    from ks_xlsx_parser.pipeline import parse_workbook

    chunks = parse_workbook(str(table_workbook)).chunks
    # Use a default so a missing chunk is reported as an assertion failure
    # with a message, rather than an opaque StopIteration error.
    text = next(
        (c.render_text for c in chunks if "Product" in c.render_text),
        None,
    )
    assert text is not None, "no rendered chunk contains 'Product'"

    grid_lines = [ln for ln in text.splitlines() if ln.startswith("|")]
    assert grid_lines, "rendered text contains no Markdown grid rows"
    header_cells = [c.strip() for c in grid_lines[0].split("|")[1:-1]]
    # Gutter first, then the *real* header names — not bare column letters.
    # Slicing keeps a too-short header a failure instead of an IndexError.
    assert header_cells[:2] == ["row", "Product"]
    # Column letters are published as a letter→name map instead.
    assert "cols: A=Product" in text

def test_hidden_rows_and_cols_are_extracted_and_flagged(
    self, hidden_rows_cols_workbook
):
    """Hidden rows and columns appear in the text render, flagged `[hidden]`.

    They are rendered rather than dropped; the flag goes in the `cols:` map
    for columns and in the row gutter for rows.
    """
    from ks_xlsx_parser.pipeline import parse_workbook

    result = parse_workbook(str(hidden_rows_cols_workbook))
    rendered = "\n".join(chunk.render_text for chunk in result.chunks)

    # A cell in hidden column B.
    assert "R1C2" in rendered
    # A cell in hidden row 3.
    assert "R3C1" in rendered
    assert "[hidden]" in rendered

def test_hidden_rows_and_cols_recorded_in_chunk_metadata(
    self, hidden_rows_cols_workbook
):
    """Hidden rows and columns are exposed as structured chunk metadata.

    The metadata is scoped to each chunk's range and exists in addition to
    the inline text markers.
    """
    from ks_xlsx_parser.pipeline import parse_workbook

    result = parse_workbook(str(hidden_rows_cols_workbook))

    # Collect the hidden rows/columns reported by every chunk.
    seen_rows = set()
    seen_cols = set()
    for chunk in result.chunks:
        seen_rows.update(chunk.metadata.get("hidden_rows", []))
        seen_cols.update(chunk.metadata.get("hidden_cols", []))

    assert 3 in seen_rows  # row 3 is hidden
    assert "B" in seen_cols  # column B is hidden

def test_hidden_sheet_marked_in_metadata_and_render(self, multi_sheet_workbook):
    """Chunks from a hidden worksheet are kept and marked as hidden.

    Each chunk from the hidden sheet carries `sheet_hidden` metadata and a
    `[hidden sheet]` marker in its text render. Chunks from visible sheets
    carry neither.
    """
    from ks_xlsx_parser.pipeline import parse_workbook

    result = parse_workbook(str(multi_sheet_workbook))

    # Partition the chunks by whether they come from the "Hidden" sheet.
    from_hidden = []
    from_visible = []
    for chunk in result.chunks:
        if chunk.sheet_name == "Hidden":
            from_hidden.append(chunk)
        else:
            from_visible.append(chunk)

    assert from_hidden, "hidden sheet should still be parsed and chunked"

    for chunk in from_hidden:
        assert chunk.metadata.get("sheet_hidden") is True
    for chunk in from_hidden:
        assert "[hidden sheet]" in chunk.render_text
    for chunk in from_visible:
        assert "sheet_hidden" not in chunk.metadata
    for chunk in from_visible:
        assert "[hidden sheet]" not in chunk.render_text
Loading