knowledgestack · arnav2 · Jun 5, 2026 · Jun 5, 2026
diff --git a/src/ks_xlsx_parser/chunking/chunker.py b/src/ks_xlsx_parser/chunking/chunker.py
@@ -14,7 +14,7 @@
 import logging
 
 from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary
-from ks_xlsx_parser.models.common import CellCoord, EdgeType
+from ks_xlsx_parser.models.common import CellCoord, EdgeType, col_number_to_letter
 from ks_xlsx_parser.models.sheet import SheetDTO
 from ks_xlsx_parser.models.workbook import WorkbookDTO
 from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer
@@ -154,6 +154,28 @@ def _block_to_chunk(
             for coord in block.key_cells
         ]
 
+        # Hidden-content provenance: record which parts of this chunk Excel
+        # hides — the whole sheet, specific rows, or specific columns — so
+        # downstream consumers can flag or filter hidden data instead of
+        # silently treating it as visible. Rows/cols are scoped to this
+        # chunk's range; renderers also mark them inline ([hidden] / data-hidden).
+        rng = block.cell_range
+        metadata: dict[str, object] = {}
+        if sheet.properties.is_hidden:
+            metadata["sheet_hidden"] = True
+        hidden_rows = sorted(
+            r for r in sheet.hidden_rows
+            if rng.top_left.row <= r <= rng.bottom_right.row
+        )
+        if hidden_rows:
+            metadata["hidden_rows"] = hidden_rows
+        hidden_cols = sorted(
+            c for c in sheet.hidden_cols
+            if rng.top_left.col <= c <= rng.bottom_right.col
+        )
+        if hidden_cols:
+            metadata["hidden_cols"] = [col_number_to_letter(c) for c in hidden_cols]
+
         return ChunkDTO(
             sheet_name=block.sheet_name,
             block_type=block.block_type,
@@ -166,6 +188,7 @@ def _block_to_chunk(
             render_html=render_html,
             render_text=render_text,
             token_count=token_count,
+            metadata=metadata,
         )
 
     def _chart_to_chunk(self, chart) -> ChunkDTO:

diff --git a/src/ks_xlsx_parser/pipeline.py b/src/ks_xlsx_parser/pipeline.py
@@ -195,6 +195,7 @@ def to_json(self) -> dict[str, Any]:
                     "cells": _chunk_cells(c, self.workbook),
                     "key_cells": c.key_cells,
                     "named_ranges": c.named_ranges,
+                    "metadata": c.metadata,
                     "dependency_summary": {
                         "upstream_refs": c.dependency_summary.upstream_refs,
                         "downstream_refs": c.dependency_summary.downstream_refs,

diff --git a/src/ks_xlsx_parser/rendering/html_renderer.py b/src/ks_xlsx_parser/rendering/html_renderer.py
@@ -67,30 +67,30 @@ def render_block(self, block: BlockDTO) -> str:
             BlockType.ASSUMPTIONS_TABLE,
         )
 
+        sheet_hidden_attr = (
+            ' data-sheet-hidden="true"' if self._sheet.properties.is_hidden else ""
+        )
         parts: list[str] = []
         parts.append(
             f'<table data-sheet="{html.escape(block.sheet_name)}" '
             f'data-range="{rng.to_a1()}" '
-            f'data-block-type="{block.block_type.value}">'
+            f'data-block-type="{block.block_type.value}"{sheet_hidden_attr}>'
         )
 
         for row_idx, row in enumerate(rows):
-            if row in self._sheet.hidden_rows:
-                continue
+            # Hidden rows are emitted (flagged data-hidden) rather than dropped.
+            row_hidden = row in self._sheet.hidden_rows
 
             is_header_row = row_idx == 0 and is_first_row_header
             tag = "th" if is_header_row else "td"
-            wrapper = "thead" if is_header_row else "tbody"
 
             if row_idx == 0 and is_header_row:
                 parts.append("<thead>")
             elif row_idx == 1 and is_first_row_header:
                 parts.append("<tbody>")
 
-            parts.append("<tr>")
+            parts.append('<tr data-hidden="true">' if row_hidden else "<tr>")
             for col in cols:
-                if col in self._sheet.hidden_cols:
-                    continue
                 if (row, col) in skip_cells:
                     continue
 
@@ -106,6 +106,8 @@ def render_block(self, block: BlockDTO) -> str:
 
                 # Build cell attributes
                 attrs = [f'data-ref="{cell_ref}"']
+                if row_hidden or col in self._sheet.hidden_cols:
+                    attrs.append('data-hidden="true"')
                 if rowspan > 1:
                     attrs.append(f'rowspan="{rowspan}"')
                 if colspan > 1:

diff --git a/src/ks_xlsx_parser/rendering/text_renderer.py b/src/ks_xlsx_parser/rendering/text_renderer.py
@@ -101,37 +101,78 @@ def __init__(self, sheet: SheetDTO):
 
     def render_block(self, block: BlockDTO) -> str:
         """
-        Render a block as plain text with coordinate context.
-
-        Format:
-            [Sheet1!A1:D10] (table: "SalesData")
-            | A        | B       | C      | D       |
-            |----------|---------|--------|---------|
-            | Product  | Q1      | Q2     | Q3      |
-            | Widget A | 100     | 150    | 200     |
-            ...
+        Render a block as a plain-text / Markdown table with coordinate
+        context.
+
+        The grid is a *standard* Markdown table whose header row holds the
+        block's real column names (for ``TABLE`` / ``ASSUMPTIONS_TABLE``
+        blocks, mirroring :class:`HtmlRenderer`'s ``<thead>`` behaviour).
+        Excel column letters are published on the bracket line as a
+        ``cols:`` map rather than occupying the header row, and a leading
+        ``row`` gutter carries the Excel row number of every line. Together
+        the ``cols:`` map and the row gutter let an agent reconstruct a full
+        A1 reference (column ``Amount`` + row ``3`` → ``B3``) without the
+        column letters masquerading as the table's headers and defeating
+        downstream header detection.
+
+        Hidden rows and columns are *included* (not dropped) and flagged
+        ``[hidden]`` — in the gutter for rows, in the ``cols:`` map for
+        columns.
+
+        Format::
+
+            [Sheet1!A1:D2] (table) cols: A=Product, B=Q1, C=Q2, D=Q3
+            | row | Product  | Q1  | Q2  | Q3  |
+            |-----|----------|-----|-----|-----|
+            | 2   | Widget A | 100 | 150 | 200 |
         """
         rng = block.cell_range
-        rows = range(rng.top_left.row, rng.bottom_right.row + 1)
-        cols = range(rng.top_left.col, rng.bottom_right.col + 1)
+        rows = list(range(rng.top_left.row, rng.bottom_right.row + 1))
+        cols = list(range(rng.top_left.col, rng.bottom_right.col + 1))
+
+        # Mirror HtmlRenderer: for these block types the first row carries the
+        # real column names, so it becomes the Markdown header row.
+        first_row_is_header = block.block_type in (
+            BlockType.TABLE,
+            BlockType.ASSUMPTIONS_TABLE,
+        )
 
         lines: list[str] = []
 
-        # Header with location and type
+        # --- Bracket line + column-letter map ------------------------------
         type_label = block.block_type.value.replace("_", " ")
         header = f"[{block.sheet_name}!{rng.to_a1()}] ({type_label})"
+        if self._sheet.properties.is_hidden:
+            header += " [hidden sheet]"
         if block.table_name:
             header += f' table: "{block.table_name}"'
+
+        # Column letters live here (not as a grid row) so the grid header is
+        # free to hold real names while an agent can still map name → letter.
+        col_descs: list[str] = []
+        for col in cols:
+            desc = col_number_to_letter(col)
+            if first_row_is_header:
+                name_cell = self._sheet.get_cell(rng.top_left.row, col)
+                name = (
+                    _flatten_cell_text(_cell_render_value(name_cell))
+                    if name_cell
+                    else ""
+                )
+                if name:
+                    desc = f"{desc}={name}"
+            if col in self._sheet.hidden_cols:
+                desc += " [hidden]"
+            col_descs.append(desc)
+        header += " cols: " + ", ".join(col_descs)
         lines.append(header)
 
         # Compute column widths using the SAME rendering rules the data
         # rows will use, including the trailing `[=]` formula marker.
-        # Otherwise `[=]` inflates a cell past col_width post-hoc and
-        # spuriously triggers the long-value fallback below.
+        # Otherwise `[=]` inflates a cell past col_width post-hoc.
         col_widths: dict[int, int] = {}
         for col in cols:
-            col_letter = col_number_to_letter(col)
-            max_width = len(col_letter)
+            max_width = len(col_number_to_letter(col))
             for row in rows:
                 cell = self._sheet.get_cell(row, col)
                 if cell is None:
@@ -143,63 +184,55 @@ def render_block(self, block: BlockDTO) -> str:
                 max_width = max(max_width, len(val))
             col_widths[col] = min(max_width, 30)  # Cap at 30 for alignment; text may overflow
 
-        # Column header row
-        col_headers = []
-        for col in cols:
-            if col in self._sheet.hidden_cols:
-                continue
-            letter = col_number_to_letter(col)
-            col_headers.append(letter.ljust(col_widths[col]))
-        lines.append("| " + " | ".join(col_headers) + " |")
-        lines.append(
-            "|-" + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols) + "-|"
-        )
-
-        # Data rows
-        is_first_data = True
+        # Row-number gutter: gives every value a row coordinate so an agent
+        # can form a full A1 reference. Hidden rows are flagged here.
+        gutter_header = "row"
+        gutter: dict[int, str] = {}
         for row in rows:
+            label = str(row)
             if row in self._sheet.hidden_rows:
-                continue
+                label += " [hidden]"
+            gutter[row] = label
+        gutter_width = max([len(gutter_header), *(len(g) for g in gutter.values())])
 
+        def _row(gutter_cell: str, values: list[str]) -> str:
+            return "| " + " | ".join([gutter_cell.ljust(gutter_width), *values]) + " |"
+
+        def _sep() -> str:
+            return (
+                "|-"
+                + "-|-".join(["-" * gutter_width, *("-" * col_widths[c] for c in cols)])
+                + "-|"
+            )
+
+        def _cells(row: int) -> list[str]:
             values = []
             for col in cols:
-                if col in self._sheet.hidden_cols:
-                    continue
                 cell = self._sheet.get_cell(row, col)
                 val = _cell_render_value(cell) if cell else ""
-
                 if cell and cell.formula and not val.startswith("="):
                     val = f"{val} [=]"
-
-                # Markdown table rows are single-line; collapse embedded newlines
+                # Markdown rows are single-line; collapse embedded newlines
                 # (common in headers like "租金\n天数") so they don't break the grid.
                 val = _flatten_cell_text(val)
-
-                # Long-value fallback: only triggers if the rendered string
-                # genuinely exceeds the (now consistently-computed) column
-                # width — i.e. the column was capped at 30. We still emit
-                # the full retrieval value (no truncation) and let the
-                # alignment overflow; truncating destroys retrievability.
+                # Full retrieval value (no truncation); alignment may overflow.
                 values.append(val.ljust(col_widths[col]))
-
-            line = "| " + " | ".join(values) + " |"
-            lines.append(line)
-
-            # Add separator after first row if it looks like a header
-            if is_first_data and block.block_type in (
-                BlockType.TABLE,
-                BlockType.ASSUMPTIONS_TABLE,
-            ):
-                lines.append(
-                    "|-"
-                    + "-|-".join(
-                        "-" * col_widths[c]
-                        for c in cols
-                        if c not in self._sheet.hidden_cols
-                    )
-                    + "-|"
-                )
-            is_first_data = False
+            return values
+
+        # Header row: real first row for tables, else Excel column letters.
+        if first_row_is_header:
+            lines.append(_row(gutter_header, _cells(rng.top_left.row)))
+            lines.append(_sep())
+            data_rows = rows[1:]
+        else:
+            letters = [col_number_to_letter(c).ljust(col_widths[c]) for c in cols]
+            lines.append(_row(gutter_header, letters))
+            lines.append(_sep())
+            data_rows = rows
+
+        # Data rows (hidden rows/cols included; hidden rows flagged in gutter).
+        for row in data_rows:
+            lines.append(_row(gutter[row], _cells(row)))
 
         return "\n".join(lines)
 

diff --git a/tests/test_rendering.py b/tests/test_rendering.py
@@ -61,6 +61,27 @@ def test_data_ref_attributes(self, simple_workbook):
 
         assert 'data-ref="A1"' in html
 
+    def test_hidden_cells_included_and_flagged(self, hidden_rows_cols_workbook):
+        """Hidden rows/columns are emitted in the HTML (flagged
+        `data-hidden`) rather than dropped — matching the text renderer."""
+        from ks_xlsx_parser.pipeline import parse_workbook
+
+        html = "\n".join(
+            c.render_html for c in parse_workbook(str(hidden_rows_cols_workbook)).chunks
+        )
+        assert 'data-hidden="true"' in html
+        assert "R3C1" in html  # content of hidden row 3
+        assert "R1C2" in html  # content of hidden column B
+
+    def test_hidden_sheet_flagged_on_table(self, multi_sheet_workbook):
+        """Tables from a hidden worksheet carry `data-sheet-hidden`."""
+        from ks_xlsx_parser.pipeline import parse_workbook
+
+        chunks = parse_workbook(str(multi_sheet_workbook)).chunks
+        hidden_html = [c.render_html for c in chunks if c.sheet_name == "Hidden"]
+        assert hidden_html
+        assert all('data-sheet-hidden="true"' in h for h in hidden_html)
+
 
 class TestTextRendering:
     """Test plain text / markdown rendering."""
@@ -150,3 +171,64 @@ def test_numeric_cells_render_raw_not_display_formatted(self):
         assert "0.002668" in text
         assert "e-03" not in text
         assert "e+03" not in text
+
+    def test_table_header_uses_real_names_not_column_letters(self, table_workbook):
+        """Option A: the grid header holds the table's real column names while
+        Excel column letters move to a `cols:` map on the bracket line, with a
+        leading `row` gutter — so downstream 'find the real header' logic sees
+        'Product', not 'A'."""
+        from ks_xlsx_parser.pipeline import parse_workbook
+
+        chunks = parse_workbook(str(table_workbook)).chunks
+        text = next(c.render_text for c in chunks if "Product" in c.render_text)
+
+        grid_lines = [ln for ln in text.splitlines() if ln.startswith("|")]
+        header_cells = [c.strip() for c in grid_lines[0].split("|")[1:-1]]
+        # Gutter first, then the *real* header names — not bare column letters.
+        assert header_cells[0] == "row"
+        assert header_cells[1] == "Product"
+        # Column letters are published as a letter=name map instead.
+        assert "cols: A=Product" in text
+
+    def test_hidden_rows_and_cols_are_extracted_and_flagged(
+        self, hidden_rows_cols_workbook
+    ):
+        """Hidden rows/columns are rendered (not dropped) and flagged
+        `[hidden]` — in the `cols:` map for columns, the gutter for rows."""
+        from ks_xlsx_parser.pipeline import parse_workbook
+
+        text = "\n".join(
+            c.render_text for c in parse_workbook(str(hidden_rows_cols_workbook)).chunks
+        )
+        assert "R1C2" in text  # a cell in hidden column B
+        assert "R3C1" in text  # a cell in hidden row 3
+        assert "[hidden]" in text
+
+    def test_hidden_rows_and_cols_recorded_in_chunk_metadata(
+        self, hidden_rows_cols_workbook
+    ):
+        """Hidden rows/columns are stored as structured chunk metadata
+        (scoped to the chunk's range), not just inline text markers."""
+        from ks_xlsx_parser.pipeline import parse_workbook
+
+        chunks = parse_workbook(str(hidden_rows_cols_workbook)).chunks
+        hidden_rows = {r for c in chunks for r in c.metadata.get("hidden_rows", [])}
+        hidden_cols = {col for c in chunks for col in c.metadata.get("hidden_cols", [])}
+        assert 3 in hidden_rows  # row 3 is hidden
+        assert "B" in hidden_cols  # column B is hidden
+
+    def test_hidden_sheet_marked_in_metadata_and_render(self, multi_sheet_workbook):
+        """A hidden worksheet is still parsed and chunked, and every chunk
+        from it carries `sheet_hidden` metadata and a `[hidden sheet]` render
+        marker; visible-sheet chunks carry neither."""
+        from ks_xlsx_parser.pipeline import parse_workbook
+
+        chunks = parse_workbook(str(multi_sheet_workbook)).chunks
+        hidden = [c for c in chunks if c.sheet_name == "Hidden"]
+        visible = [c for c in chunks if c.sheet_name != "Hidden"]
+
+        assert hidden, "hidden sheet should still be parsed and chunked"
+        assert all(c.metadata.get("sheet_hidden") is True for c in hidden)
+        assert all("[hidden sheet]" in c.render_text for c in hidden)
+        assert all("sheet_hidden" not in c.metadata for c in visible)
+        assert all("[hidden sheet]" not in c.render_text for c in visible)