Does MiniExcel provide an option to ignore dirty data rows? Commercial libraries such as Aspose.Cells seem to support retrieving MaxDataRow and MaxDataColumn. At the moment, I’m using the following approach, which has proven to efficiently obtain the maximum data row and column:
/// <summary>
/// 获取工作表的真实数据边界(基于底层流式物理扫描)
/// </summary>
/// <param name="filePath">Excel 文件物理路径</param>
/// <param name="sheetName">UI 上显示的 Sheet 名称</param>
/// <returns>真实的最大行和最大列 (1-based)。若无有效数据则返回 (0, 0),下标为1。</returns>
public static (int MaxDataRow, int MaxDataColumn) GetMaxDataCell(string filePath, string sheetName)
{
int maxRow = 0; int maxCol = 0;
// 必须使用 FileShare.ReadWrite,防止被其他进程或 Excel 客户端锁死抛出 IOException
using (var fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
using (var archive = new ZipArchive(fs, ZipArchiveMode.Read))
{
// 不能写死 sheet1.xml,必须通过 workbook.xml + rels 找出真实物理路径
string physicalSheetPath = ResolveWorksheetEntryPath(archive, sheetName);
if (string.IsNullOrEmpty(physicalSheetPath))
return (0, 0);
var sheetEntry = archive.GetEntry(physicalSheetPath);
if (sheetEntry == null)
return (0, 0);
// 直接打开底层 Deflate 压缩流,彻底踢掉 StreamReader 的 UTF-8 转码开销
using (var stream = sheetEntry.Open())
{
// 使用 L1 Cache 友好的 64KB 纯字节块进行读取
byte[] buffer = new byte[65536];
int bytesRead;
int state = 0;
bool hasValidRef = false;
int pendingRow = 0, pendingCol = 0;
// 不再有任何字符解码、字符串截取分配,仅做底层 byte 的数学位运算
while ((bytesRead = stream.Read(buffer, 0, buffer.Length)) > 0)
{
for (int i = 0; i < bytesRead; i++)
{
byte b = buffer[i];
// 【物理降维】中文字符在 UTF-8 下首字节 >= 128,而我们的目标均在 ASCII 范围内。
// 遇到中文字节会由于不匹配任何 ASCII case 而极速掠过,完全无需额外处理逻辑。
switch (state)
{
case 0: // 寻找 '<'
if (b == (byte)'<') state = 1;
break;
case 1: // 确认是单元格标签 '<c'
if (b == (byte)'c') state = 2;
else state = 0;
break;
case 2: // <c 之后
if (b == (byte)' ') { state = 3; hasValidRef = false; }
else if (b == (byte)'>') { state = 8; hasValidRef = false; }
else state = 0;
break;
// ================== 属性解析隔离区 (严格 DFA) ==================
case 3: // 【准备迎接新属性】(此时必定处于空格后)
if (b == (byte)'r') state = 4; // 精准命中首字母为 r 的属性
else if (b == (byte)' ') state = 3; // 忽略连续空格
else if (b == (byte)'/') state = 7; // 自闭合准备 (幽灵标签)
else if (b == (byte)'>') state = 8; // 属性区结束
else state = 22; // 其他无关属性名 (如 t, s, rPh)
break;
case 4: // 刚刚读取了首字母 'r'
if (b == (byte)'=') state = 5; // 确认为 r=
else state = 22; // 误报,降级为无关属性去跳过
break;
case 5: // 刚刚读取了 "r="
if (b == (byte)'"') { state = 6; pendingRow = 0; pendingCol = 0; }
else state = 22; // 格式错误,降级跳过
break;
case 6: // 【高价值区域】:处于 r="..." 内部提取坐标
if (b >= (byte)'A' && b <= (byte)'Z')
{
pendingCol = pendingCol * 26 + (b - (byte)'A' + 1); // 零分配,直接转 int
}
else if (b >= (byte)'a' && b <= (byte)'z')
{
pendingCol = pendingCol * 26 + (b - (byte)'a' + 1); // 向下兼容小写引用
}
else if (b >= (byte)'0' && b <= (byte)'9')
{
pendingRow = pendingRow * 10 + (b - (byte)'0'); // 零分配,直接转 int
}
else if (b == (byte)'"')
{
hasValidRef = true; // 完美提取!
state = 24; // 进入属性结束状态
}
else state = 20; // 坐标内混入非法字符,按废弃字符串处理
break;
// --- 无关属性跳过区 ---
case 22: // 正在跳过无关属性的名称区
if (b == (byte)'=') state = 23;
else if (b == (byte)'/') state = 7;
else if (b == (byte)'>') state = 8;
break;
case 23: // 刚刚读取了无关属性的 '='
if (b == (byte)'"') state = 20; // 正式进入无关字符串
break;
case 20: // 处于无关属性的 attr="..." 内部
if (b == (byte)'"') state = 24; // 无关字符串结束
break;
case 24: // 【一个属性闭合完毕】(刚刚读取完闭合的 '"')
if (b == (byte)' ') state = 3; // 遇到空格,回归“准备迎接新属性”
else if (b == (byte)'/') state = 7;
else if (b == (byte)'>') state = 8;
else state = 22; // 容错:假如连着写没空格 attr="1"b="2"
break;
// ===============================================================
case 7: // 遇到了 '/'
if (b == (byte)'>') state = 0; // 捕获 <c r="A1" /> 幽灵标签,直接重置放弃!
else state = 22; // 属性名里的假动作
break;
case 8: // 遇到了 <c ... > 的闭合 '>',等待子节点
if (b == (byte)'<') state = 9; // 紧接着开启子节点
else if (b != (byte)' ' && b != (byte)'\r' && b != (byte)'\n' && b != (byte)'\t') state = 0;
break;
case 9: // 遇到了 <c...><
// 碰到 <v> (值) 或 <f> (公式),确认是真数据
if (b == (byte)'v' || b == (byte)'f')
{
if (hasValidRef)
{
if (pendingRow > maxRow) maxRow = pendingRow;
if (pendingCol > maxCol) maxCol = pendingCol;
hasValidRef = false;
}
state = 0;
}
else if (b == (byte)'i') state = 10;
else if (b == (byte)'/') state = 0; // </c> 闭合标签
else state = 8;
break;
case 10: // 遇到了 <c...><i
// 碰到 <is> (内联字符串),确认是真数据
if (b == (byte)'s')
{
if (hasValidRef)
{
if (pendingRow > maxRow) maxRow = pendingRow;
if (pendingCol > maxCol) maxCol = pendingCol;
hasValidRef = false;
}
state = 0;
}
else state = 8;
break;
}
}
}
}
}
return (maxRow, maxCol);
}
/// <summary>
/// 通过 workbook.xml 和 rels 正确解析物理 sheet 路径 (由于这部分极小,保留使用标准 XmlReader 以保稳定)
/// </summary>
private static string ResolveWorksheetEntryPath(ZipArchive archive, string sheetName)
{
var workbookEntry = archive.GetEntry("xl/workbook.xml");
if (workbookEntry == null) return null;
var relsEntry = archive.GetEntry("xl/_rels/workbook.xml.rels");
if (relsEntry == null) return null;
string sheetRelationshipId = null;
// 1. 在 workbook.xml 中通过 UI 上的 Name 找到底层的 R:Id
using (var stream = workbookEntry.Open())
using (var reader = XmlReader.Create(stream, new XmlReaderSettings { IgnoreWhitespace = true }))
{
while (reader.Read())
{
if (reader.NodeType == XmlNodeType.Element && reader.LocalName == "sheet")
{
if (string.Equals(reader.GetAttribute("name"), sheetName, StringComparison.OrdinalIgnoreCase))
{
sheetRelationshipId =
reader.GetAttribute("r:id") ??
reader.GetAttribute("id", "http://schemas.openxmlformats.org/officeDocument/2006/relationships");
break;
}
}
}
}
if (string.IsNullOrEmpty(sheetRelationshipId)) return null;
// 2. 在 rels 表中通过 R:Id 映射出真正的物理 XML 路径 (比如 xl/worksheets/sheet3.xml)
using (var stream = relsEntry.Open())
using (var reader = XmlReader.Create(stream, new XmlReaderSettings { IgnoreWhitespace = true }))
{
while (reader.Read())
{
if (reader.NodeType == XmlNodeType.Element && reader.LocalName == "Relationship")
{
if (string.Equals(reader.GetAttribute("Id"), sheetRelationshipId, StringComparison.Ordinal))
{
string target = reader.GetAttribute("Target");
if (string.IsNullOrWhiteSpace(target)) return null;
if (target.StartsWith("/")) return target.TrimStart('/');
if (target.StartsWith("xl/", StringComparison.OrdinalIgnoreCase)) return target;
return "xl/" + target.TrimStart('/');
}
}
}
}
return null;
}
Does MiniExcel provide an option to ignore dirty data rows? Commercial libraries such as Aspose.Cells seem to support retrieving MaxDataRow and MaxDataColumn. At the moment, I’m using the following approach, which has proven to efficiently obtain the maximum data row and column: