diff --git a/src/ks_xlsx_parser/chunking/chunker.py b/src/ks_xlsx_parser/chunking/chunker.py index 7057e78..db17100 100644 --- a/src/ks_xlsx_parser/chunking/chunker.py +++ b/src/ks_xlsx_parser/chunking/chunker.py @@ -14,7 +14,7 @@ import logging from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary -from ks_xlsx_parser.models.common import CellCoord, EdgeType +from ks_xlsx_parser.models.common import CellCoord, EdgeType, col_number_to_letter from ks_xlsx_parser.models.sheet import SheetDTO from ks_xlsx_parser.models.workbook import WorkbookDTO from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer @@ -154,6 +154,28 @@ def _block_to_chunk( for coord in block.key_cells ] + # Hidden-content provenance: record which parts of this chunk Excel + # hides — the whole sheet, specific rows, or specific columns — so + # downstream consumers can flag or filter hidden data instead of + # silently treating it as visible. Rows/cols are scoped to this + # chunk's range; the renderers also mark them inline ([hidden]). + rng = block.cell_range + metadata: dict[str, object] = {} + if sheet.properties.is_hidden: + metadata["sheet_hidden"] = True + hidden_rows = sorted( + r for r in sheet.hidden_rows + if rng.top_left.row <= r <= rng.bottom_right.row + ) + if hidden_rows: + metadata["hidden_rows"] = hidden_rows + hidden_cols = sorted( + c for c in sheet.hidden_cols + if rng.top_left.col <= c <= rng.bottom_right.col + ) + if hidden_cols: + metadata["hidden_cols"] = [col_number_to_letter(c) for c in hidden_cols] + return ChunkDTO( sheet_name=block.sheet_name, block_type=block.block_type, @@ -166,6 +188,7 @@ def _block_to_chunk( render_html=render_html, render_text=render_text, token_count=token_count, + metadata=metadata, ) def _chart_to_chunk(self, chart) -> ChunkDTO: diff --git a/src/ks_xlsx_parser/pipeline.py b/src/ks_xlsx_parser/pipeline.py index 1451074..b74b199 100644 --- a/src/ks_xlsx_parser/pipeline.py +++ b/src/ks_xlsx_parser/pipeline.py @@ -195,6 +195,7 @@ def to_json(self) -> dict[str, Any]: "cells": _chunk_cells(c, self.workbook), "key_cells": c.key_cells, "named_ranges": c.named_ranges, + "metadata": c.metadata, "dependency_summary": { "upstream_refs": c.dependency_summary.upstream_refs, "downstream_refs": c.dependency_summary.downstream_refs, diff --git a/src/ks_xlsx_parser/rendering/html_renderer.py b/src/ks_xlsx_parser/rendering/html_renderer.py index 50a8b21..8584b2f 100644 --- a/src/ks_xlsx_parser/rendering/html_renderer.py +++ b/src/ks_xlsx_parser/rendering/html_renderer.py @@ -67,30 +67,30 @@ def render_block(self, block: BlockDTO) -> str: BlockType.ASSUMPTIONS_TABLE, ) + sheet_hidden_attr = ( + ' data-sheet-hidden="true"' if self._sheet.properties.is_hidden else "" + ) parts: list[str] = [] parts.append( f'' + f'data-block-type="{block.block_type.value}"{sheet_hidden_attr}>' ) for row_idx, row in enumerate(rows): - if row in self._sheet.hidden_rows: - continue + # Hidden rows are emitted (flagged data-hidden) rather than dropped. + row_hidden = row in self._sheet.hidden_rows is_header_row = row_idx == 0 and is_first_row_header tag = "th" if is_header_row else "td" - wrapper = "thead" if is_header_row else "tbody" if row_idx == 0 and is_header_row: parts.append("") elif row_idx == 1 and is_first_row_header: parts.append("") - parts.append("") + parts.append('' if row_hidden else "") for col in cols: - if col in self._sheet.hidden_cols: - continue if (row, col) in skip_cells: continue @@ -106,6 +106,8 @@ def render_block(self, block: BlockDTO) -> str: # Build cell attributes attrs = [f'data-ref="{cell_ref}"'] + if row_hidden or col in self._sheet.hidden_cols: + attrs.append('data-hidden="true"') if rowspan > 1: attrs.append(f'rowspan="{rowspan}"') if colspan > 1: diff --git a/src/ks_xlsx_parser/rendering/text_renderer.py b/src/ks_xlsx_parser/rendering/text_renderer.py index e2c15f4..f308443 100644 --- a/src/ks_xlsx_parser/rendering/text_renderer.py +++ b/src/ks_xlsx_parser/rendering/text_renderer.py @@ -101,37 +101,78 @@ def __init__(self, sheet: SheetDTO): def render_block(self, block: BlockDTO) -> str: """ - Render a block as plain text with coordinate context. - - Format: - [Sheet1!A1:D10] (table: "SalesData") - | A | B | C | D | - |----------|---------|--------|---------| - | Product | Q1 | Q2 | Q3 | - | Widget A | 100 | 150 | 200 | - ... + Render a block as a plain-text / Markdown table with coordinate + context. + + The grid is a *standard* Markdown table whose header row holds the + block's real column names (for ``TABLE`` / ``ASSUMPTIONS_TABLE`` + blocks, mirroring :class:`HtmlRenderer`'s ```` behaviour). + Excel column letters are published on the bracket line as a + ``cols:`` map rather than occupying the header row, and a leading + ``row`` gutter carries the Excel row number of every line. Together + the ``cols:`` map and the row gutter let an agent reconstruct a full + A1 reference (column ``Amount`` + row ``3`` → ``B3``) without the + column letters masquerading as the table's headers and defeating + downstream header detection. + + Hidden rows and columns are *included* (not dropped) and flagged + ``[hidden]`` — in the gutter for rows, in the ``cols:`` map for + columns. + + Format:: + + [Sheet1!A1:D3] (table) cols: A=Product, B=Q1, C=Q2, D=Q3 + | row | Product | Q1 | Q2 | Q3 | + |-----|----------|-----|-----|-----| + | 2 | Widget A | 100 | 150 | 200 | """ rng = block.cell_range - rows = range(rng.top_left.row, rng.bottom_right.row + 1) - cols = range(rng.top_left.col, rng.bottom_right.col + 1) + rows = list(range(rng.top_left.row, rng.bottom_right.row + 1)) + cols = list(range(rng.top_left.col, rng.bottom_right.col + 1)) + + # Mirror HtmlRenderer: for these block types the first row carries the + # real column names, so it becomes the Markdown header row. + first_row_is_header = block.block_type in ( + BlockType.TABLE, + BlockType.ASSUMPTIONS_TABLE, + ) lines: list[str] = [] - # Header with location and type + # --- Bracket line + column-letter map ------------------------------ type_label = block.block_type.value.replace("_", " ") header = f"[{block.sheet_name}!{rng.to_a1()}] ({type_label})" + if self._sheet.properties.is_hidden: + header += " [hidden sheet]" if block.table_name: header += f' table: "{block.table_name}"' + + # Column letters live here (not as a grid row) so the grid header is + # free to hold real names while an agent can still map name → letter. + col_descs: list[str] = [] + for col in cols: + desc = col_number_to_letter(col) + if first_row_is_header: + name_cell = self._sheet.get_cell(rng.top_left.row, col) + name = ( + _flatten_cell_text(_cell_render_value(name_cell)) + if name_cell + else "" + ) + if name: + desc = f"{desc}={name}" + if col in self._sheet.hidden_cols: + desc += " [hidden]" + col_descs.append(desc) + header += " cols: " + ", ".join(col_descs) lines.append(header) # Compute column widths using the SAME rendering rules the data # rows will use, including the trailing `[=]` formula marker. - # Otherwise `[=]` inflates a cell past col_width post-hoc and - # spuriously triggers the long-value fallback below. + # Otherwise `[=]` inflates a cell past col_width post-hoc. col_widths: dict[int, int] = {} for col in cols: - col_letter = col_number_to_letter(col) - max_width = len(col_letter) + max_width = len(col_number_to_letter(col)) for row in rows: cell = self._sheet.get_cell(row, col) if cell is None: @@ -143,63 +184,55 @@ def render_block(self, block: BlockDTO) -> str: max_width = max(max_width, len(val)) col_widths[col] = min(max_width, 30) # Cap at 30 for alignment; text may overflow - # Column header row - col_headers = [] - for col in cols: - if col in self._sheet.hidden_cols: - continue - letter = col_number_to_letter(col) - col_headers.append(letter.ljust(col_widths[col])) - lines.append("| " + " | ".join(col_headers) + " |") - lines.append( - "|-" + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols) + "-|" - ) - - # Data rows - is_first_data = True + # Row-number gutter: gives every value a row coordinate so an agent + # can form a full A1 reference. Hidden rows are flagged here. + gutter_header = "row" + gutter: dict[int, str] = {} for row in rows: + label = str(row) if row in self._sheet.hidden_rows: - continue + label += " [hidden]" + gutter[row] = label + gutter_width = max([len(gutter_header), *(len(g) for g in gutter.values())]) + def _row(gutter_cell: str, values: list[str]) -> str: + return "| " + " | ".join([gutter_cell.ljust(gutter_width), *values]) + " |" + + def _sep() -> str: + return ( + "|-" + + "-|-".join(["-" * gutter_width, *("-" * col_widths[c] for c in cols)]) + + "-|" + ) + + def _cells(row: int) -> list[str]: values = [] for col in cols: - if col in self._sheet.hidden_cols: - continue cell = self._sheet.get_cell(row, col) val = _cell_render_value(cell) if cell else "" - if cell and cell.formula and not val.startswith("="): val = f"{val} [=]" - - # Markdown table rows are single-line; collapse embedded newlines + # Markdown rows are single-line; collapse embedded newlines # (common in headers like "租金\n天数") so they don't break the grid. val = _flatten_cell_text(val) - - # Long-value fallback: only triggers if the rendered string - # genuinely exceeds the (now consistently-computed) column - # width — i.e. the column was capped at 30. We still emit - # the full retrieval value (no truncation) and let the - # alignment overflow; truncating destroys retrievability. + # Full retrieval value (no truncation); alignment may overflow. values.append(val.ljust(col_widths[col])) - - line = "| " + " | ".join(values) + " |" - lines.append(line) - - # Add separator after first row if it looks like a header - if is_first_data and block.block_type in ( - BlockType.TABLE, - BlockType.ASSUMPTIONS_TABLE, - ): - lines.append( - "|-" - + "-|-".join( - "-" * col_widths[c] - for c in cols - if c not in self._sheet.hidden_cols - ) - + "-|" - ) - is_first_data = False + return values + + # Header row: real first row for tables, else Excel column letters. + if first_row_is_header: + lines.append(_row(gutter_header, _cells(rng.top_left.row))) + lines.append(_sep()) + data_rows = rows[1:] + else: + letters = [col_number_to_letter(c).ljust(col_widths[c]) for c in cols] + lines.append(_row(gutter_header, letters)) + lines.append(_sep()) + data_rows = rows + + # Data rows (hidden rows/cols included; hidden rows flagged in gutter). + for row in data_rows: + lines.append(_row(gutter[row], _cells(row))) return "\n".join(lines) diff --git a/tests/test_rendering.py b/tests/test_rendering.py index 3a757ec..e8ee73a 100644 --- a/tests/test_rendering.py +++ b/tests/test_rendering.py @@ -61,6 +61,27 @@ def test_data_ref_attributes(self, simple_workbook): assert 'data-ref="A1"' in html + def test_hidden_cells_included_and_flagged(self, hidden_rows_cols_workbook): + """Hidden rows/columns are emitted in the HTML (flagged + `data-hidden`) rather than dropped — matching the text renderer.""" + from ks_xlsx_parser.pipeline import parse_workbook + + html = "\n".join( + c.render_html for c in parse_workbook(str(hidden_rows_cols_workbook)).chunks + ) + assert 'data-hidden="true"' in html + assert "R3C1" in html # content of hidden row 3 + assert "R1C2" in html # content of hidden column B + + def test_hidden_sheet_flagged_on_table(self, multi_sheet_workbook): + """Tables from a hidden worksheet carry `data-sheet-hidden`.""" + from ks_xlsx_parser.pipeline import parse_workbook + + chunks = parse_workbook(str(multi_sheet_workbook)).chunks + hidden_html = [c.render_html for c in chunks if c.sheet_name == "Hidden"] + assert hidden_html + assert all('data-sheet-hidden="true"' in h for h in hidden_html) + class TestTextRendering: """Test plain text / markdown rendering.""" @@ -150,3 +171,64 @@ def test_numeric_cells_render_raw_not_display_formatted(self): assert "0.002668" in text assert "e-03" not in text assert "e+03" not in text + + def test_table_header_uses_real_names_not_column_letters(self, table_workbook): + """Option A: the grid header holds the table's real column names while + Excel column letters move to a `cols:` map on the bracket line, with a + leading `row` gutter — so downstream 'find the real header' logic sees + 'Product', not 'A'.""" + from ks_xlsx_parser.pipeline import parse_workbook + + chunks = parse_workbook(str(table_workbook)).chunks + text = next(c.render_text for c in chunks if "Product" in c.render_text) + + grid_lines = [ln for ln in text.splitlines() if ln.startswith("|")] + header_cells = [c.strip() for c in grid_lines[0].split("|")[1:-1]] + # Gutter first, then the *real* header names — not bare column letters. + assert header_cells[0] == "row" + assert header_cells[1] == "Product" + # Column letters are published as a name→letter map instead. + assert "cols: A=Product" in text + + def test_hidden_rows_and_cols_are_extracted_and_flagged( + self, hidden_rows_cols_workbook + ): + """Hidden rows/columns are rendered (not dropped) and flagged + `[hidden]` — in the `cols:` map for columns, the gutter for rows.""" + from ks_xlsx_parser.pipeline import parse_workbook + + text = "\n".join( + c.render_text for c in parse_workbook(str(hidden_rows_cols_workbook)).chunks + ) + assert "R1C2" in text # a cell in hidden column B + assert "R3C1" in text # a cell in hidden row 3 + assert "[hidden]" in text + + def test_hidden_rows_and_cols_recorded_in_chunk_metadata( + self, hidden_rows_cols_workbook + ): + """Hidden rows/columns are stored as structured chunk metadata + (scoped to the chunk's range), not just inline text markers.""" + from ks_xlsx_parser.pipeline import parse_workbook + + chunks = parse_workbook(str(hidden_rows_cols_workbook)).chunks + hidden_rows = {r for c in chunks for r in c.metadata.get("hidden_rows", [])} + hidden_cols = {col for c in chunks for col in c.metadata.get("hidden_cols", [])} + assert 3 in hidden_rows # row 3 is hidden + assert "B" in hidden_cols # column B is hidden + + def test_hidden_sheet_marked_in_metadata_and_render(self, multi_sheet_workbook): + """A hidden worksheet is still parsed and chunked, and every chunk + from it carries `sheet_hidden` metadata and a `[hidden sheet]` render + marker; visible-sheet chunks carry neither.""" + from ks_xlsx_parser.pipeline import parse_workbook + + chunks = parse_workbook(str(multi_sheet_workbook)).chunks + hidden = [c for c in chunks if c.sheet_name == "Hidden"] + visible = [c for c in chunks if c.sheet_name != "Hidden"] + + assert hidden, "hidden sheet should still be parsed and chunked" + assert all(c.metadata.get("sheet_hidden") is True for c in hidden) + assert all("[hidden sheet]" in c.render_text for c in hidden) + assert all("sheet_hidden" not in c.metadata for c in visible) + assert all("[hidden sheet]" not in c.render_text for c in visible)