diff --git a/src/ks_xlsx_parser/chunking/chunker.py b/src/ks_xlsx_parser/chunking/chunker.py
index 7057e78..db17100 100644
--- a/src/ks_xlsx_parser/chunking/chunker.py
+++ b/src/ks_xlsx_parser/chunking/chunker.py
@@ -14,7 +14,7 @@
import logging
from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary
-from ks_xlsx_parser.models.common import CellCoord, EdgeType
+from ks_xlsx_parser.models.common import CellCoord, EdgeType, col_number_to_letter
from ks_xlsx_parser.models.sheet import SheetDTO
from ks_xlsx_parser.models.workbook import WorkbookDTO
from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer
@@ -154,6 +154,28 @@ def _block_to_chunk(
for coord in block.key_cells
]
+ # Hidden-content provenance: record which parts of this chunk Excel
+ # hides — the whole sheet, specific rows, or specific columns — so
+ # downstream consumers can flag or filter hidden data instead of
+ # silently treating it as visible. Rows/cols are scoped to this
+ # chunk's range; the renderers also mark them inline ([hidden] / data-hidden).
+ rng = block.cell_range
+ metadata: dict[str, object] = {}
+ if sheet.properties.is_hidden:
+ metadata["sheet_hidden"] = True
+ hidden_rows = sorted(
+ r for r in sheet.hidden_rows
+ if rng.top_left.row <= r <= rng.bottom_right.row
+ )
+ if hidden_rows:
+ metadata["hidden_rows"] = hidden_rows
+ hidden_cols = sorted(
+ c for c in sheet.hidden_cols
+ if rng.top_left.col <= c <= rng.bottom_right.col
+ )
+ if hidden_cols:
+ metadata["hidden_cols"] = [col_number_to_letter(c) for c in hidden_cols]
+
return ChunkDTO(
sheet_name=block.sheet_name,
block_type=block.block_type,
@@ -166,6 +188,7 @@ def _block_to_chunk(
render_html=render_html,
render_text=render_text,
token_count=token_count,
+ metadata=metadata,
)
def _chart_to_chunk(self, chart) -> ChunkDTO:
diff --git a/src/ks_xlsx_parser/pipeline.py b/src/ks_xlsx_parser/pipeline.py
index 1451074..b74b199 100644
--- a/src/ks_xlsx_parser/pipeline.py
+++ b/src/ks_xlsx_parser/pipeline.py
@@ -195,6 +195,7 @@ def to_json(self) -> dict[str, Any]:
"cells": _chunk_cells(c, self.workbook),
"key_cells": c.key_cells,
"named_ranges": c.named_ranges,
+ "metadata": c.metadata,
"dependency_summary": {
"upstream_refs": c.dependency_summary.upstream_refs,
"downstream_refs": c.dependency_summary.downstream_refs,
diff --git a/src/ks_xlsx_parser/rendering/html_renderer.py b/src/ks_xlsx_parser/rendering/html_renderer.py
index 50a8b21..8584b2f 100644
--- a/src/ks_xlsx_parser/rendering/html_renderer.py
+++ b/src/ks_xlsx_parser/rendering/html_renderer.py
@@ -67,30 +67,30 @@ def render_block(self, block: BlockDTO) -> str:
BlockType.ASSUMPTIONS_TABLE,
)
+ sheet_hidden_attr = (
+ ' data-sheet-hidden="true"' if self._sheet.properties.is_hidden else ""
+ )
parts: list[str] = []
parts.append(
f'
'
+ f'data-block-type="{block.block_type.value}"{sheet_hidden_attr}>'
)
for row_idx, row in enumerate(rows):
- if row in self._sheet.hidden_rows:
- continue
+ # Hidden rows are emitted (flagged data-hidden) rather than dropped.
+ row_hidden = row in self._sheet.hidden_rows
is_header_row = row_idx == 0 and is_first_row_header
tag = "th" if is_header_row else "td"
- wrapper = "thead" if is_header_row else "tbody"
if row_idx == 0 and is_header_row:
parts.append("")
elif row_idx == 1 and is_first_row_header:
parts.append("")
- parts.append("")
+ parts.append('
' if row_hidden else "
")
for col in cols:
- if col in self._sheet.hidden_cols:
- continue
if (row, col) in skip_cells:
continue
@@ -106,6 +106,8 @@ def render_block(self, block: BlockDTO) -> str:
# Build cell attributes
attrs = [f'data-ref="{cell_ref}"']
+ if row_hidden or col in self._sheet.hidden_cols:
+ attrs.append('data-hidden="true"')
if rowspan > 1:
attrs.append(f'rowspan="{rowspan}"')
if colspan > 1:
diff --git a/src/ks_xlsx_parser/rendering/text_renderer.py b/src/ks_xlsx_parser/rendering/text_renderer.py
index e2c15f4..f308443 100644
--- a/src/ks_xlsx_parser/rendering/text_renderer.py
+++ b/src/ks_xlsx_parser/rendering/text_renderer.py
@@ -101,37 +101,78 @@ def __init__(self, sheet: SheetDTO):
def render_block(self, block: BlockDTO) -> str:
"""
- Render a block as plain text with coordinate context.
-
- Format:
- [Sheet1!A1:D10] (table: "SalesData")
- | A | B | C | D |
- |----------|---------|--------|---------|
- | Product | Q1 | Q2 | Q3 |
- | Widget A | 100 | 150 | 200 |
- ...
+ Render a block as a plain-text / Markdown table with coordinate
+ context.
+
+ The grid is a *standard* Markdown table whose header row holds the
+ block's real column names (for ``TABLE`` / ``ASSUMPTIONS_TABLE``
+ blocks, mirroring :class:`HtmlRenderer`'s ``<thead>`` behaviour).
+ Excel column letters are published on the bracket line as a
+ ``cols:`` map rather than occupying the header row, and a leading
+ ``row`` gutter carries the Excel row number of every data line. Together
+ the ``cols:`` map and the row gutter let an agent reconstruct a full
+ A1 reference (column ``Amount`` + row ``3`` → ``B3``) without the
+ column letters masquerading as the table's headers and defeating
+ downstream header detection.
+
+ Hidden rows and columns are *included* (not dropped) and flagged
+ ``[hidden]`` — in the gutter for rows, in the ``cols:`` map for
+ columns.
+
+ Format::
+
+ [Sheet1!A1:D2] (table) cols: A=Product, B=Q1, C=Q2, D=Q3
+ | row | Product | Q1 | Q2 | Q3 |
+ |-----|----------|-----|-----|-----|
+ | 2 | Widget A | 100 | 150 | 200 |
"""
rng = block.cell_range
- rows = range(rng.top_left.row, rng.bottom_right.row + 1)
- cols = range(rng.top_left.col, rng.bottom_right.col + 1)
+ rows = list(range(rng.top_left.row, rng.bottom_right.row + 1))
+ cols = list(range(rng.top_left.col, rng.bottom_right.col + 1))
+
+ # Mirror HtmlRenderer: for these block types the first row carries the
+ # real column names, so it becomes the Markdown header row.
+ first_row_is_header = block.block_type in (
+ BlockType.TABLE,
+ BlockType.ASSUMPTIONS_TABLE,
+ )
lines: list[str] = []
- # Header with location and type
+ # --- Bracket line + column-letter map ------------------------------
type_label = block.block_type.value.replace("_", " ")
header = f"[{block.sheet_name}!{rng.to_a1()}] ({type_label})"
+ if self._sheet.properties.is_hidden:
+ header += " [hidden sheet]"
if block.table_name:
header += f' table: "{block.table_name}"'
+
+ # Column letters live here (not as a grid row) so the grid header is
+ # free to hold real names while an agent can still map name → letter.
+ col_descs: list[str] = []
+ for col in cols:
+ desc = col_number_to_letter(col)
+ if first_row_is_header:
+ name_cell = self._sheet.get_cell(rng.top_left.row, col)
+ name = (
+ _flatten_cell_text(_cell_render_value(name_cell))
+ if name_cell
+ else ""
+ )
+ if name:
+ desc = f"{desc}={name}"
+ if col in self._sheet.hidden_cols:
+ desc += " [hidden]"
+ col_descs.append(desc)
+ header += " cols: " + ", ".join(col_descs)
lines.append(header)
# Compute column widths using the SAME rendering rules the data
# rows will use, including the trailing `[=]` formula marker.
- # Otherwise `[=]` inflates a cell past col_width post-hoc and
- # spuriously triggers the long-value fallback below.
+ # Otherwise `[=]` inflates a cell past col_width post-hoc.
col_widths: dict[int, int] = {}
for col in cols:
- col_letter = col_number_to_letter(col)
- max_width = len(col_letter)
+ max_width = len(col_number_to_letter(col))
for row in rows:
cell = self._sheet.get_cell(row, col)
if cell is None:
@@ -143,63 +184,55 @@ def render_block(self, block: BlockDTO) -> str:
max_width = max(max_width, len(val))
col_widths[col] = min(max_width, 30) # Cap at 30 for alignment; text may overflow
- # Column header row
- col_headers = []
- for col in cols:
- if col in self._sheet.hidden_cols:
- continue
- letter = col_number_to_letter(col)
- col_headers.append(letter.ljust(col_widths[col]))
- lines.append("| " + " | ".join(col_headers) + " |")
- lines.append(
- "|-" + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols) + "-|"
- )
-
- # Data rows
- is_first_data = True
+ # Row-number gutter: gives every value a row coordinate so an agent
+ # can form a full A1 reference. Hidden rows are flagged here.
+ gutter_header = "row"
+ gutter: dict[int, str] = {}
for row in rows:
+ label = str(row)
if row in self._sheet.hidden_rows:
- continue
+ label += " [hidden]"
+ gutter[row] = label
+ gutter_width = max([len(gutter_header), *(len(g) for g in gutter.values())])
+ def _row(gutter_cell: str, values: list[str]) -> str:
+ return "| " + " | ".join([gutter_cell.ljust(gutter_width), *values]) + " |"
+
+ def _sep() -> str:
+ return (
+ "|-"
+ + "-|-".join(["-" * gutter_width, *("-" * col_widths[c] for c in cols)])
+ + "-|"
+ )
+
+ def _cells(row: int) -> list[str]:
values = []
for col in cols:
- if col in self._sheet.hidden_cols:
- continue
cell = self._sheet.get_cell(row, col)
val = _cell_render_value(cell) if cell else ""
-
if cell and cell.formula and not val.startswith("="):
val = f"{val} [=]"
-
- # Markdown table rows are single-line; collapse embedded newlines
+ # Markdown rows are single-line; collapse embedded newlines
# (common in headers like "租金\n天数") so they don't break the grid.
val = _flatten_cell_text(val)
-
- # Long-value fallback: only triggers if the rendered string
- # genuinely exceeds the (now consistently-computed) column
- # width — i.e. the column was capped at 30. We still emit
- # the full retrieval value (no truncation) and let the
- # alignment overflow; truncating destroys retrievability.
+ # Full retrieval value (no truncation); alignment may overflow.
values.append(val.ljust(col_widths[col]))
-
- line = "| " + " | ".join(values) + " |"
- lines.append(line)
-
- # Add separator after first row if it looks like a header
- if is_first_data and block.block_type in (
- BlockType.TABLE,
- BlockType.ASSUMPTIONS_TABLE,
- ):
- lines.append(
- "|-"
- + "-|-".join(
- "-" * col_widths[c]
- for c in cols
- if c not in self._sheet.hidden_cols
- )
- + "-|"
- )
- is_first_data = False
+ return values
+
+ # Header row: real first row for tables, else Excel column letters.
+ if first_row_is_header:
+ lines.append(_row(gutter_header, _cells(rng.top_left.row)))
+ lines.append(_sep())
+ data_rows = rows[1:]
+ else:
+ letters = [col_number_to_letter(c).ljust(col_widths[c]) for c in cols]
+ lines.append(_row(gutter_header, letters))
+ lines.append(_sep())
+ data_rows = rows
+
+ # Data rows (hidden rows/cols included; hidden rows flagged in gutter).
+ for row in data_rows:
+ lines.append(_row(gutter[row], _cells(row)))
return "\n".join(lines)
diff --git a/tests/test_rendering.py b/tests/test_rendering.py
index 3a757ec..e8ee73a 100644
--- a/tests/test_rendering.py
+++ b/tests/test_rendering.py
@@ -61,6 +61,27 @@ def test_data_ref_attributes(self, simple_workbook):
assert 'data-ref="A1"' in html
+ def test_hidden_cells_included_and_flagged(self, hidden_rows_cols_workbook):
+ """Hidden rows/columns are emitted in the HTML (flagged
+ `data-hidden`) rather than dropped — matching the text renderer."""
+ from ks_xlsx_parser.pipeline import parse_workbook
+
+ html = "\n".join(
+ c.render_html for c in parse_workbook(str(hidden_rows_cols_workbook)).chunks
+ )
+ assert 'data-hidden="true"' in html
+ assert "R3C1" in html # content of hidden row 3
+ assert "R1C2" in html # content of hidden column B
+
+ def test_hidden_sheet_flagged_on_table(self, multi_sheet_workbook):
+ """Tables from a hidden worksheet carry `data-sheet-hidden`."""
+ from ks_xlsx_parser.pipeline import parse_workbook
+
+ chunks = parse_workbook(str(multi_sheet_workbook)).chunks
+ hidden_html = [c.render_html for c in chunks if c.sheet_name == "Hidden"]
+ assert hidden_html
+ assert all('data-sheet-hidden="true"' in h for h in hidden_html)
+
class TestTextRendering:
"""Test plain text / markdown rendering."""
@@ -150,3 +171,64 @@ def test_numeric_cells_render_raw_not_display_formatted(self):
assert "0.002668" in text
assert "e-03" not in text
assert "e+03" not in text
+
+ def test_table_header_uses_real_names_not_column_letters(self, table_workbook):
+ """Option A: the grid header holds the table's real column names while
+ Excel column letters move to a `cols:` map on the bracket line, with a
+ leading `row` gutter — so downstream 'find the real header' logic sees
+ 'Product', not 'A'."""
+ from ks_xlsx_parser.pipeline import parse_workbook
+
+ chunks = parse_workbook(str(table_workbook)).chunks
+ text = next(c.render_text for c in chunks if "Product" in c.render_text)
+
+ grid_lines = [ln for ln in text.splitlines() if ln.startswith("|")]
+ header_cells = [c.strip() for c in grid_lines[0].split("|")[1:-1]]
+ # Gutter first, then the *real* header names — not bare column letters.
+ assert header_cells[0] == "row"
+ assert header_cells[1] == "Product"
+ # Column letters are published as a letter=name map ("A=Product") instead.
+ assert "cols: A=Product" in text
+
+ def test_hidden_rows_and_cols_are_extracted_and_flagged(
+ self, hidden_rows_cols_workbook
+ ):
+ """Hidden rows/columns are rendered (not dropped) and flagged
+ `[hidden]` — in the `cols:` map for columns, the gutter for rows."""
+ from ks_xlsx_parser.pipeline import parse_workbook
+
+ text = "\n".join(
+ c.render_text for c in parse_workbook(str(hidden_rows_cols_workbook)).chunks
+ )
+ assert "R1C2" in text # a cell in hidden column B
+ assert "R3C1" in text # a cell in hidden row 3
+ assert "[hidden]" in text
+
+ def test_hidden_rows_and_cols_recorded_in_chunk_metadata(
+ self, hidden_rows_cols_workbook
+ ):
+ """Hidden rows/columns are stored as structured chunk metadata
+ (scoped to the chunk's range), not just inline text markers."""
+ from ks_xlsx_parser.pipeline import parse_workbook
+
+ chunks = parse_workbook(str(hidden_rows_cols_workbook)).chunks
+ hidden_rows = {r for c in chunks for r in c.metadata.get("hidden_rows", [])}
+ hidden_cols = {col for c in chunks for col in c.metadata.get("hidden_cols", [])}
+ assert 3 in hidden_rows # row 3 is hidden
+ assert "B" in hidden_cols # column B is hidden
+
+ def test_hidden_sheet_marked_in_metadata_and_render(self, multi_sheet_workbook):
+ """A hidden worksheet is still parsed and chunked, and every chunk
+ from it carries `sheet_hidden` metadata and a `[hidden sheet]` render
+ marker; visible-sheet chunks carry neither."""
+ from ks_xlsx_parser.pipeline import parse_workbook
+
+ chunks = parse_workbook(str(multi_sheet_workbook)).chunks
+ hidden = [c for c in chunks if c.sheet_name == "Hidden"]
+ visible = [c for c in chunks if c.sheet_name != "Hidden"]
+
+ assert hidden, "hidden sheet should still be parsed and chunked"
+ assert all(c.metadata.get("sheet_hidden") is True for c in hidden)
+ assert all("[hidden sheet]" in c.render_text for c in hidden)
+ assert all("sheet_hidden" not in c.metadata for c in visible)
+ assert all("[hidden sheet]" not in c.render_text for c in visible)