diff --git a/frontend/package.json b/frontend/package.json
index 08b298260e3..4e117cd05cc 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -47,6 +47,7 @@
"d3-shape": "2.1.0",
"dagre": "0.8.5",
"file-saver": "2.0.5",
+ "file-type": "^22.0.1",
"fuse.js": "6.5.3",
"html2canvas": "1.4.1",
"jointjs": "3.5.4",
diff --git a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.html b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.html
index fd0ba3af152..7092d3294e4 100644
--- a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.html
+++ b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.html
@@ -44,6 +44,36 @@
nzType="warning"
nzMessage="Preview of the file type is currently not supported">
+
+
+
+
+
+
+
+
+ 0 || canOpenInWorkflow" class="file-metadata-strip">
+
+ {{ item.label }}
+ {{ item.value }}
+
+
+
- | {{ column }} |
+
+ {{ column }}
+
+ {{ fileMetadata?.columnTypes?.[i] }}
+ 0">
+ {{ fileMetadata?.nullCounts?.[i] }} null
+
+
+ |
@@ -79,6 +119,14 @@
alt="{{filePath}}"
class="full-size-image" />
+
+
+
+
+
diff --git a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.scss b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.scss
index e6424f529d8..c692589f767 100644
--- a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.scss
+++ b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.scss
@@ -40,3 +40,65 @@
max-width: 90%;
max-height: 90%;
}
+
+// Wrapping flex row, separated from whatever follows by a thin bottom rule.
+.file-metadata-strip {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+  padding: 6px 0;
+  margin-bottom: 10px;
+  border-bottom: 1px solid #f0f0f0;
+}
+
+// Compact single-line chip with a light background and rounded border.
+.metadata-pill {
+  display: inline-flex;
+  align-items: center;
+  gap: 4px;
+  white-space: nowrap;
+  font-size: 12px;
+  padding: 2px 8px 2px 6px;
+  border: 1px solid #e8e8e8;
+  border-radius: 4px;
+  background: #fafafa;
+}
+
+// Muted, medium-weight text.
+.metadata-label {
+  font-weight: 500;
+  color: #8c8c8c;
+}
+
+// Near-black text.
+.metadata-value {
+  color: #262626;
+}
+
+// Semi-bold text.
+.column-name {
+  font-weight: 600;
+}
+
+// Regular-weight flex row placed slightly below the preceding element.
+.column-meta {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  margin-top: 2px;
+  font-weight: 400;
+}
+
+// Small blue monospace tag.
+.column-type-tag {
+  display: inline-block;
+  padding: 0 6px;
+  font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+  font-size: 11px;
+  color: #1890ff;
+  background: #e6f4ff;
+  border: 1px solid #91caff;
+  border-radius: 3px;
+}
+
+// Small warning-coloured text.
+.column-null-hint {
+  color: #d4380d;
+  font-size: 11px;
+}
+
+// An auto left margin pushes the element to the far end of a flex row.
+.open-in-workflow-btn {
+  margin-left: auto;
+}
diff --git a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.spec.ts b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.spec.ts
index 9e70a444df8..568dac0ecd5 100644
--- a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.spec.ts
+++ b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.spec.ts
@@ -19,11 +19,13 @@
import { TestBed } from "@angular/core/testing";
import { HttpClientTestingModule } from "@angular/common/http/testing";
-import { UserDatasetFileRendererComponent } from "./user-dataset-file-renderer.component";
+import { UserDatasetFileRendererComponent, MIME_TYPES, getMimeType, inferColumnSchema } from "./user-dataset-file-renderer.component";
import { DatasetService } from "../../../../../service/user/dataset/dataset.service";
import { NotificationService } from "../../../../../../common/service/notification/notification.service";
import { DomSanitizer } from "@angular/platform-browser";
import { commonTestProviders } from "../../../../../../common/testing/test-utils";
+import { Router } from "@angular/router";
+import { WorkflowPersistService } from "../../../../../../common/service/workflow-persist/workflow-persist.service";
describe("UserDatasetFileRendererComponent", () => {
let component: UserDatasetFileRendererComponent;
@@ -34,7 +36,15 @@ describe("UserDatasetFileRendererComponent", () => {
providers: [
DatasetService,
NotificationService,
- { provide: DomSanitizer, useValue: { bypassSecurityTrustUrl: vi.fn() } },
+ WorkflowPersistService,
+ { provide: Router, useValue: { navigate: vi.fn() } },
+ {
+ provide: DomSanitizer,
+ useValue: {
+ bypassSecurityTrustUrl: vi.fn((url: string) => url),
+ bypassSecurityTrustResourceUrl: vi.fn((url: string) => url),
+ },
+ },
...commonTestProviders,
],
});
@@ -42,15 +52,250 @@ describe("UserDatasetFileRendererComponent", () => {
component = fixture.componentInstance;
});
- it("should return true for supported MIME type", () => {
- const supportedMimeType = "image/jpeg"; // Example of a supported MIME type
- const result = component.isPreviewSupported(supportedMimeType);
- expect(result).toBe(true);
+  describe("isPreviewSupported", () => {
+    it("should return true for known MIME types", () => {
+      // Same three MIME strings as before, checked in the same order.
+      for (const mime of ["image/jpeg", "application/pdf", "application/x-parquet"]) {
+        expect(component.isPreviewSupported(mime)).toBe(true);
+      }
+    });
+
+    it("should return false only for unidentified binary (octet-stream)", () => {
+      const supported = component.isPreviewSupported(MIME_TYPES.OCTET_STREAM);
+      expect(supported).toBe(false);
+    });
   });
- it("should return false for unsupported MIME type", () => {
- const unsupportedMimeType = "application/unknown"; // Example of an unsupported MIME type
- const result = component.isPreviewSupported(unsupportedMimeType);
- expect(result).toBe(false);
+  describe("getMimeType (extension-based fallback)", () => {
+    // Shared assertion: the filename must resolve to exactly the expected MIME string.
+    const expectMime = (filename: string, expected: string): void => {
+      expect(getMimeType(filename)).toBe(expected);
+    };
+
+    it("should resolve common image extensions", () => {
+      expectMime("photo.jpg", MIME_TYPES.JPEG);
+      expectMime("photo.PNG", MIME_TYPES.PNG);
+      expectMime("anim.gif", MIME_TYPES.GIF);
+    });
+
+    it("should resolve xlsx separately from xls", () => {
+      expectMime("data.xlsx", MIME_TYPES.XLSX);
+      expectMime("data.xls", MIME_TYPES.MSEXCEL);
+    });
+
+    it("should resolve data format extensions", () => {
+      expectMime("data.parquet", MIME_TYPES.PARQUET);
+      expectMime("data.arrow", MIME_TYPES.ARROW);
+      expectMime("data.feather", MIME_TYPES.ARROW);
+    });
+
+    it("should return octet-stream for unknown extensions", () => {
+      expectMime("file.xyz", MIME_TYPES.OCTET_STREAM);
+      expectMime("noextension", MIME_TYPES.OCTET_STREAM);
+    });
+  });
+
+  describe("detectMimeType (magic byte detection)", () => {
+    // Blob factories: each returns a thunk so the Blob is built inside the test body.
+    const bytes =
+      (...values: number[]) =>
+      (): Blob =>
+        new Blob([new Uint8Array(values)]);
+    const text =
+      (content: string) =>
+      (): Blob =>
+        new Blob([content], { type: "text/plain" });
+
+    // The same 10-byte HDF5 prefix is reused by the three HDF5 cases.
+    const hdf5 = bytes(0x89, 0x48, 0x44, 0x46, 0x0d, 0x0a, 0x1a, 0x0a, 0x00, 0x00);
+
+    // One entry per test, in the original order; `filename` is passed only when present.
+    const cases: { name: string; blob: () => Blob; filename?: string; expected: string }[] = [
+      {
+        name: "should detect Parquet files from PAR1 magic bytes",
+        blob: bytes(0x50, 0x41, 0x52, 0x31, 0x00, 0x00, 0x00, 0x00),
+        expected: MIME_TYPES.PARQUET,
+      },
+      {
+        name: "should detect Arrow IPC files from ARROW1 magic bytes",
+        blob: bytes(0x41, 0x52, 0x52, 0x4f, 0x57, 0x31, 0x00, 0x00),
+        expected: MIME_TYPES.ARROW,
+      },
+      {
+        name: "should detect JSON via text sniffing (object)",
+        blob: text('{"key": "value"}'),
+        expected: MIME_TYPES.JSON,
+      },
+      {
+        name: "should detect JSON via text sniffing (array)",
+        blob: text('[1, 2, 3]'),
+        expected: MIME_TYPES.JSON,
+      },
+      {
+        name: "should detect CSV via text sniffing",
+        blob: text("name,age,city\nAlice,30,LA\nBob,25,NY"),
+        expected: MIME_TYPES.CSV,
+      },
+      {
+        name: "should detect Markdown via text sniffing",
+        blob: text("# My Title\n\nSome content here"),
+        expected: MIME_TYPES.MD,
+      },
+      {
+        name: "should detect plain text when content is printable ASCII",
+        blob: text("Hello, world! This is plain text."),
+        expected: MIME_TYPES.TXT,
+      },
+      {
+        name: "should return octet-stream for unidentifiable binary",
+        blob: bytes(0x00, 0x01, 0x02, 0x80, 0xff, 0xfe, 0x7f, 0x03),
+        expected: MIME_TYPES.OCTET_STREAM,
+      },
+      {
+        name: "should detect HDF5 from magic bytes (generic .h5)",
+        blob: hdf5,
+        filename: "model.h5",
+        expected: MIME_TYPES.HDF5,
+      },
+      {
+        name: "should refine HDF5 to H5AD by extension",
+        blob: hdf5,
+        filename: "scrna.h5ad",
+        expected: MIME_TYPES.H5AD,
+      },
+      {
+        name: "should refine HDF5 to H5SEURAT by extension",
+        blob: hdf5,
+        filename: "pbmc.h5seurat",
+        expected: MIME_TYPES.H5SEURAT,
+      },
+      {
+        name: "should detect Python pickle from \\x80 + protocol byte",
+        blob: bytes(0x80, 0x04, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00),
+        expected: MIME_TYPES.PICKLE,
+      },
+      {
+        name: "should detect NumPy .npy from magic bytes",
+        blob: bytes(0x93, 0x4e, 0x55, 0x4d, 0x50, 0x59, 0x01, 0x00, 0x00, 0x00),
+        expected: MIME_TYPES.NPY,
+      },
+      {
+        name: "should detect GGUF from magic bytes",
+        blob: bytes(0x47, 0x47, 0x55, 0x46, 0x03, 0x00, 0x00, 0x00),
+        expected: MIME_TYPES.GGUF,
+      },
+      {
+        name: "should detect Safetensors via extension fallback",
+        blob: bytes(0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
+        filename: "model.safetensors",
+        expected: MIME_TYPES.SAFETENSORS,
+      },
+      {
+        name: "should detect ONNX via extension fallback",
+        blob: bytes(0x08, 0x07, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00),
+        filename: "resnet.onnx",
+        expected: MIME_TYPES.ONNX,
+      },
+      {
+        name: "should detect VCF from header line",
+        blob: text("##fileformat=VCFv4.2\n##source=test\n"),
+        expected: MIME_TYPES.VCF,
+      },
+      {
+        name: "should detect FASTA from > prefix",
+        blob: text(">seq1\nACGTACGT\n>seq2\nTGCATGCA\n"),
+        expected: MIME_TYPES.FASTA,
+      },
+      {
+        name: "should detect FASTQ from 4-line @/+ pattern",
+        blob: text("@read1\nACGT\n+\n!!!!\n@read2\nTGCA\n+\n!!!!\n"),
+        expected: MIME_TYPES.FASTQ,
+      },
+    ];
+
+    for (const { name, blob, filename, expected } of cases) {
+      it(name, async () => {
+        const input = blob();
+        const result =
+          filename === undefined
+            ? await component.detectMimeType(input)
+            : await component.detectMimeType(input, filename);
+        expect(result).toBe(expected);
+      });
+    }
+  });
+
+ describe("parser helpers", () => {
+ it("should parse a NumPy v1.0 header", async () => {
+ // Construct a minimal valid .npy v1 file: magic + version + uint16 header_len + ASCII header
+ const headerText = "{'descr': '> 8) & 0xff;
+ buf.set(headerBytes, 10);
+ const blob = new Blob([buf]);
+ const result = await (component as any).parseNpyHeader(blob);
+ expect(result?.dtype).toBe(" {
+ const header = JSON.stringify({
+ "layer.weight": { dtype: "F32", shape: [128, 64], data_offsets: [0, 32768] },
+ "layer.bias": { dtype: "F32", shape: [128], data_offsets: [32768, 33280] },
+ __metadata__: { format: "pt" },
+ });
+ const headerBytes = new TextEncoder().encode(header);
+ const lenBytes = new Uint8Array(8);
+ let len = headerBytes.length;
+ for (let i = 0; i < 8; i++) {
+ lenBytes[i] = len & 0xff;
+ len = Math.floor(len / 256);
+ }
+ const blob = new Blob([lenBytes, headerBytes]);
+ const result = await (component as any).parseSafetensorsHeader(blob);
+ expect(result?.tensorCount).toBe(2);
+ expect(result?.parameterCount).toBe(128 * 64 + 128);
+ expect(result?.sampleNames).toEqual(["layer.weight", "layer.bias"]);
+ });
+
+    it("should infer column types from tabular sample data", () => {
+      // Three rows, five columns; the second column has one empty cell.
+      const { types, nullCounts, samples } = inferColumnSchema(
+        [
+          ["Alice", "30", "75000.50", "true", "2024-01-15"],
+          ["Bob", "25", "60000.00", "false", "2024-03-22"],
+          ["Carol", "", "82000.75", "true", "2024-05-10"],
+        ],
+        5
+      );
+      expect(types).toEqual(["string", "integer", "double", "boolean", "date"]);
+      expect(nullCounts).toEqual([0, 1, 0, 0, 0]);
+      expect(samples).toEqual(["Alice", "30", "75000.50", "true", "2024-01-15"]);
+    });
+
+    it("should fall back to string for all-null columns", () => {
+      const { types, nullCounts } = inferColumnSchema(
+        [
+          ["a", ""],
+          ["b", ""],
+        ],
+        2
+      );
+      expect(types).toEqual(["string", "string"]);
+      expect(nullCounts).toEqual([0, 2]);
+    });
+
+    it("should expose canOpenInWorkflow whenever a filePath is set", () => {
+      for (const path of ["/x/y/v1/data.csv", "/x/y/v1/model.safetensors"]) {
+        component.filePath = path;
+        expect(component.canOpenInWorkflow).toBe(true);
+      }
+    });
+
+    it("should not expose canOpenInWorkflow when no file is selected", () => {
+      component.filePath = "";
+      expect(component.canOpenInWorkflow).toBe(false);
+    });
+
+
+    it("should parse a GGUF header", async () => {
+      // 24-byte little-endian header: "GGUF", version 3, 722 tensors, 16 metadata entries.
+      const view = new DataView(new ArrayBuffer(24));
+      [0x47, 0x47, 0x55, 0x46].forEach((byte, offset) => view.setUint8(offset, byte));
+      view.setUint32(4, 3, true);
+      view.setUint32(8, 722, true); // upper four bytes of the 8-byte field stay zero
+      view.setUint32(16, 16, true); // upper four bytes of the 8-byte field stay zero
+      const blob = new Blob([new Uint8Array(view.buffer)]);
+      const result = await (component as any).parseGgufHeader(blob);
+      expect(result?.version).toBe(3);
+      expect(result?.tensorCount).toBe(722);
+      expect(result?.metadataKvCount).toBe(16);
+    });
});
});
diff --git a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts
index 861479ca5a5..564e7867ad4 100644
--- a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts
+++ b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-file-renderer/user-dataset-file-renderer.component.ts
@@ -17,14 +17,23 @@
* under the License.
*/
-import { Component, EventEmitter, Input, OnChanges, OnDestroy, OnInit, Output, SimpleChanges } from "@angular/core";
+import { ChangeDetectorRef, Component, EventEmitter, Input, OnChanges, OnDestroy, OnInit, Output, SimpleChanges } from "@angular/core";
import { DatasetService } from "../../../../../service/user/dataset/dataset.service";
import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy";
import * as Papa from "papaparse";
import { ParseResult } from "papaparse";
-import { DomSanitizer, SafeUrl } from "@angular/platform-browser";
-import readXlsxFile from "read-excel-file";
+import { DomSanitizer, SafeResourceUrl, SafeUrl } from "@angular/platform-browser";
+import readXlsxFile, { readSheetNames } from "read-excel-file";
import { NotificationService } from "../../../../../../common/service/notification/notification.service";
+import { formatSize } from "../../../../../../common/util/size-formatter.util";
+import { Router } from "@angular/router";
+import {
+ DEFAULT_WORKFLOW_NAME,
+ WorkflowPersistService,
+} from "../../../../../../common/service/workflow-persist/workflow-persist.service";
+import { GuiConfigService } from "../../../../../../common/service/gui-config.service";
+import { ExecutionMode, WorkflowContent } from "../../../../../../common/type/workflow";
+import { DASHBOARD_USER_WORKSPACE } from "../../../../../../app-routing.constant";
import { NgStyle, NgIf, NgFor } from "@angular/common";
import { NzSpinComponent } from "ng-zorro-antd/spin";
import { NzAlertComponent } from "ng-zorro-antd/alert";
@@ -38,6 +47,9 @@ import {
} from "ng-zorro-antd/table";
import { MarkdownComponent } from "ngx-markdown";
import { NgxJsonViewerModule } from "ngx-json-viewer";
+import { fileTypeFromBlob } from "file-type";
+import { NzButtonComponent } from "ng-zorro-antd/button";
+import { NzIconDirective } from "ng-zorro-antd/icon";
export const MIME_TYPES = {
JPEG: "image/jpeg",
@@ -45,6 +57,9 @@ export const MIME_TYPES = {
PNG: "image/png",
WEBP: "image/webp",
GIF: "image/gif",
+ AVIF: "image/avif",
+ BMP: "image/bmp",
+ TIFF: "image/tiff",
CSV: "text/csv",
TXT: "text/plain",
MD: "text/markdown",
@@ -53,35 +68,474 @@ export const MIME_TYPES = {
PDF: "application/pdf",
MSWORD: "application/msword",
MSEXCEL: "application/vnd.ms-excel",
+ XLSX: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ PPTX: "application/vnd.openxmlformats-officedocument.presentationml.presentation",
MSPOWERPOINT: "application/vnd.ms-powerpoint",
MP4: "video/mp4",
MP3: "audio/mpeg",
- OCTET_STREAM: "application/octet-stream", // Default binary format
+ WAV: "audio/wav",
+ FLAC: "audio/flac",
+ WEBM: "video/webm",
+ MOV: "video/quicktime",
+ ARROW: "application/x-arrow",
+ PARQUET: "application/x-parquet",
+ // ML / scientific data formats
+ HDF5: "application/x-hdf5",
+ H5AD: "application/x-h5ad",
+ H5SEURAT: "application/x-h5seurat",
+ LOOM: "application/x-loom",
+ PICKLE: "application/x-python-pickle",
+ NPY: "application/x-numpy-array",
+ NPZ: "application/x-numpy-archive",
+ SAFETENSORS: "application/x-safetensors",
+ GGUF: "application/x-gguf",
+ PYTORCH: "application/x-pytorch",
+ KERAS: "application/x-keras",
+ ONNX: "application/x-onnx",
+ RDS: "application/x-rds",
+ // Bioinformatics text
+ FASTA: "application/x-fasta",
+ FASTQ: "application/x-fastq",
+ VCF: "application/x-vcf",
+ OCTET_STREAM: "application/octet-stream",
};
+/**
+ * Resolve a MIME type from a filename's extension alone (no content inspection).
+ *
+ * The extension is the text after the last "." and is matched case-insensitively.
+ * A name with no "." has no extension, so it is never looked up in the table
+ * (a file literally named "csv" is not treated as CSV).
+ *
+ * @param filename file name or path
+ * @returns the mapped MIME type, or `MIME_TYPES.OCTET_STREAM` when the extension is missing or unknown
+ */
export function getMimeType(filename: string): string {
- const extension = filename.split(".").pop()?.toUpperCase();
- return extension && MIME_TYPES[extension as keyof typeof MIME_TYPES]
- ? MIME_TYPES[extension as keyof typeof MIME_TYPES]
- : MIME_TYPES.OCTET_STREAM;
+  // Keys are upper-case extensions; several extensions may share one MIME type.
+  const extensionMap: Record<string, string> = {
+    JPG: MIME_TYPES.JPEG,
+    JPEG: MIME_TYPES.JPEG,
+    PNG: MIME_TYPES.PNG,
+    WEBP: MIME_TYPES.WEBP,
+    GIF: MIME_TYPES.GIF,
+    AVIF: MIME_TYPES.AVIF,
+    BMP: MIME_TYPES.BMP,
+    TIFF: MIME_TYPES.TIFF,
+    TIF: MIME_TYPES.TIFF,
+    CSV: MIME_TYPES.CSV,
+    TSV: MIME_TYPES.CSV,
+    TXT: MIME_TYPES.TXT,
+    MD: MIME_TYPES.MD,
+    HTML: MIME_TYPES.HTML,
+    HTM: MIME_TYPES.HTML,
+    JSON: MIME_TYPES.JSON,
+    JSONL: MIME_TYPES.TXT,
+    PDF: MIME_TYPES.PDF,
+    DOC: MIME_TYPES.MSWORD,
+    XLS: MIME_TYPES.MSEXCEL,
+    XLSX: MIME_TYPES.XLSX,
+    DOCX: MIME_TYPES.DOCX,
+    PPTX: MIME_TYPES.PPTX,
+    PPT: MIME_TYPES.MSPOWERPOINT,
+    MP4: MIME_TYPES.MP4,
+    MP3: MIME_TYPES.MP3,
+    WAV: MIME_TYPES.WAV,
+    FLAC: MIME_TYPES.FLAC,
+    WEBM: MIME_TYPES.WEBM,
+    MOV: MIME_TYPES.MOV,
+    ARROW: MIME_TYPES.ARROW,
+    FEATHER: MIME_TYPES.ARROW,
+    PARQUET: MIME_TYPES.PARQUET,
+    // ML / scientific
+    H5: MIME_TYPES.HDF5,
+    HDF5: MIME_TYPES.HDF5,
+    H5AD: MIME_TYPES.H5AD,
+    H5SEURAT: MIME_TYPES.H5SEURAT,
+    LOOM: MIME_TYPES.LOOM,
+    PKL: MIME_TYPES.PICKLE,
+    PICKLE: MIME_TYPES.PICKLE,
+    JOBLIB: MIME_TYPES.PICKLE,
+    NPY: MIME_TYPES.NPY,
+    NPZ: MIME_TYPES.NPZ,
+    SAFETENSORS: MIME_TYPES.SAFETENSORS,
+    GGUF: MIME_TYPES.GGUF,
+    PT: MIME_TYPES.PYTORCH,
+    PTH: MIME_TYPES.PYTORCH,
+    KERAS: MIME_TYPES.KERAS,
+    ONNX: MIME_TYPES.ONNX,
+    RDS: MIME_TYPES.RDS,
+    // Bioinformatics text
+    FASTA: MIME_TYPES.FASTA,
+    FA: MIME_TYPES.FASTA,
+    FNA: MIME_TYPES.FASTA,
+    FFN: MIME_TYPES.FASTA,
+    FAA: MIME_TYPES.FASTA,
+    FASTQ: MIME_TYPES.FASTQ,
+    FQ: MIME_TYPES.FASTQ,
+    VCF: MIME_TYPES.VCF,
+  };
+  // Without a dot there is no extension; an empty key never matches the table.
+  const dotIndex = filename.lastIndexOf(".");
+  const ext = dotIndex >= 0 ? filename.slice(dotIndex + 1).toUpperCase() : "";
+  return extensionMap[ext] ?? MIME_TYPES.OCTET_STREAM;
+}
+
+/**
+ * Format a duration in seconds as `m:ss`, or `h:mm:ss` once it reaches an hour.
+ * Fractional seconds are dropped; negative or non-finite input yields an em dash.
+ */
+export function formatDuration(seconds: number): string {
+  if (seconds < 0 || !isFinite(seconds)) {
+    return "—";
+  }
+  const pad2 = (part: number): string => String(part).padStart(2, "0");
+  const whole = Math.floor(seconds);
+  const hours = Math.floor(whole / 3600);
+  const minutes = Math.floor((whole % 3600) / 60);
+  const secs = whole % 60;
+  return hours > 0 ? `${hours}:${pad2(minutes)}:${pad2(secs)}` : `${minutes}:${pad2(secs)}`;
}
-// the size limits for all preview-supported types
+/**
+ * Largest file size, in bytes, for which a preview is attempted (1 GiB).
+ *
+ * Memory notes carried over from the original author:
+ * - "Identify-only" types (HDF5, Parquet, Arrow, pickle, model containers, ...) need only
+ *   roughly the first 16 bytes for magic-byte detection.
+ * - Header-parse types (Safetensors, GGUF, NumPy .npy) need only the first few KB.
+ * - For both, the real cost of a 1 GB limit is download time, because the dataset service
+ *   streams the entire file.
+ * - Full-content render types (CSV via Papa.parse, XLSX, JSON, large text) use memory in
+ *   proportion to file size, so a browser may slow down or run out of memory well before
+ *   1 GB. The guard deliberately does not block these; the user can close the tab if it
+ *   struggles.
+ */
+const MAX_PREVIEW_SIZE = 1024 ** 3;
+
+// Per-MIME-type preview size limits — also used as a pre-fetch guard.
+// NOTE: despite the `_MB` suffix in the name, every value is a byte count.
+// Every type listed here now shares the single MAX_PREVIEW_SIZE ceiling.
export const MIME_TYPE_SIZE_LIMITS_MB = {
- [MIME_TYPES.JPEG]: 5 * 1024 * 1024, // 5 MB
- [MIME_TYPES.PNG]: 5 * 1024 * 1024, // 5 MB
- [MIME_TYPES.WEBP]: 5 * 1024 * 1024, // 5 MB
- [MIME_TYPES.GIF]: 10 * 1024 * 1024, // 10 MB
- [MIME_TYPES.CSV]: 2 * 1024 * 1024, // 2 MB for text-based data files
- [MIME_TYPES.TXT]: 1 * 1024 * 1024, // 1 MB for plain text files
- [MIME_TYPES.MD]: 1 * 1024 * 1024, // 1 MB for MD files
- [MIME_TYPES.JSON]: 1 * 1024 * 1024, // 1 MB for JSON files
- [MIME_TYPES.MSEXCEL]: 10 * 1024 * 1024, // 10 MB for Excel spreadsheets
- [MIME_TYPES.MP4]: 50 * 1024 * 1024, // 50 MB for MP4 videos
- [MIME_TYPES.MP3]: 10 * 1024 * 1024, // 10 MB for MP3 audio files
- [MIME_TYPES.OCTET_STREAM]: 5 * 1024 * 1024, // Default size for other binary formats
+ [MIME_TYPES.JPEG]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.PNG]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.WEBP]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.GIF]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.AVIF]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.BMP]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.TIFF]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.CSV]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.TXT]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.MD]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.JSON]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.PDF]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.MSEXCEL]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.XLSX]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.DOCX]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.PPTX]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.MP4]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.WEBM]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.MOV]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.MP3]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.WAV]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.FLAC]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.ARROW]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.PARQUET]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.HDF5]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.H5AD]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.H5SEURAT]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.LOOM]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.PICKLE]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.NPY]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.NPZ]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.SAFETENSORS]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.GGUF]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.PYTORCH]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.KERAS]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.ONNX]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.RDS]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.FASTA]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.FASTQ]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.VCF]: MAX_PREVIEW_SIZE,
+ [MIME_TYPES.OCTET_STREAM]: MAX_PREVIEW_SIZE,
};
+/**
+ * Optional metadata extracted from a previewed file. Every field is optional;
+ * which ones are populated depends on the detected file type.
+ */
+export interface FileMetadata {
+  fileSize?: number;
+
+  /** Image. */
+  imageWidth?: number;
+  imageHeight?: number;
+
+  /** Video. */
+  videoDuration?: number;
+  videoWidth?: number;
+  videoHeight?: number;
+
+  /** Audio. */
+  audioDuration?: number;
+
+  /** Tabular. */
+  rowCount?: number;
+  columnCount?: number;
+  columnNames?: Array<string>;
+  sheetCount?: number;
+
+  /** JSON. */
+  jsonTopLevelType?: "object" | "array";
+  jsonItemCount?: number;
+  jsonPreviewKeys?: Array<string>;
+
+  /** Text / markdown. */
+  lineCount?: number;
+  wordCount?: number;
+  charCount?: number;
+  headingCount?: number;
+
+  /** PDF. */
+  pageCount?: number;
+
+  /** ML model / tensor data. */
+  /** E.g. "PyTorch", "Keras", "ONNX", "Safetensors", "GGUF", "TensorFlow". */
+  modelFormat?: string;
+  /** E.g. "HDF5", "ZIP archive", "gzip". */
+  containerFormat?: string;
+  tensorCount?: number;
+  parameterCount?: number;
+  sampleTensorNames?: Array<string>;
+
+  /** NumPy. */
+  dtype?: string;
+  shape?: Array<number>;
+
+  /** GGUF. */
+  ggufVersion?: number;
+  metadataKvCount?: number;
+
+  /** Bioinformatics. */
+  sequenceCount?: number;
+  sequenceCountIsExact?: boolean;
+  variantCount?: number;
+  variantCountIsExact?: boolean;
+
+  /** Rich tabular schema (CSV / XLSX). */
+  /** Inferred type per column: "integer", "double", "boolean", "date", "string". */
+  columnTypes?: Array<string>;
+  /** Count of empty cells per column (in sample). */
+  nullCounts?: Array<number>;
+  /** First non-null value per column. */
+  sampleValues?: Array<string>;
+
+  /** JSON schema. */
+  jsonMaxDepth?: number;
+  /** For object roots. */
+  jsonKeyTypes?: Array<{ key: string; type: string }>;
+  /** For array roots: uniform type or "mixed". */
+  jsonArrayElementType?: string;
+
+  /** PDF /Info dictionary. */
+  pdfTitle?: string;
+  pdfAuthor?: string;
+  pdfCreator?: string;
+  pdfProducer?: string;
+  pdfVersion?: string;
+  pdfEncrypted?: boolean;
+
+  /** Markdown structure. */
+  codeBlockCount?: number;
+  linkCount?: number;
+  imageCount?: number;
+  listItemCount?: number;
+
+  /** Plain text / encoding. */
+  /** E.g. "UTF-8 BOM", "UTF-8", "ASCII". */
+  encoding?: string;
+  emptyLineCount?: number;
+  avgLineLength?: number;
+  maxLineLength?: number;
+
+  /** NumPy enhanced. */
+  totalElements?: number;
+  /** "little-endian" or "big-endian". */
+  byteOrder?: string;
+  fortranOrder?: boolean;
+
+  /** Safetensors enhanced. */
+  dtypeBreakdown?: Array<{ dtype: string; params: number }>;
+  largestTensor?: { name: string; shape: Array<number>; params: number };
+  safetensorsMetadata?: Array<{ key: string; value: string }>;
+
+  /** GGUF enhanced. */
+  ggufArchitecture?: string;
+  ggufQuantization?: string;
+
+  /** FASTA enhanced. */
+  totalBases?: number;
+  /** Fraction in the range 0..1. */
+  gcContent?: number;
+  minSequenceLength?: number;
+  maxSequenceLength?: number;
+  avgSequenceLength?: number;
+  isProtein?: boolean;
+
+  /** VCF enhanced. */
+  vcfSampleCount?: number;
+  vcfChromosomes?: Array<string>;
+}
+
+/**
+ * Size threshold in bytes (50 MB). For larger files the download is skipped and only
+ * extension-based identification plus a "how to load" hint is shown, because the
+ * full-blob download from the dataset service is the main source of preview lag.
+ */
+export const FULL_PREVIEW_MAX_BYTES = 50 * 2 ** 20; // 50 MB
+
+/**
+ * One-line "how to load" or "what is this" message per format, keyed by MIME type.
+ * Used both when content was downloaded (in renderByMimeType) and when the
+ * download was skipped (in showOversizedFileInfo).
+ * MIME types without an entry have no hint.
+ */
+export const TYPE_LOADING_HINTS: Record<string, string> = {
+  [MIME_TYPES.PARQUET]: "Parquet file. Use the Parquet File Scan operator in Texera to analyze this data.",
+  [MIME_TYPES.ARROW]: "Arrow / Feather file. Use the Arrow File Scan operator in Texera.",
+  [MIME_TYPES.HDF5]: "HDF5 binary container (Keras .h5 or scientific dataset). Load with h5py / rhdf5.",
+  [MIME_TYPES.H5AD]: "AnnData (.h5ad) — single-cell expression matrix. Load with scanpy.read_h5ad().",
+  [MIME_TYPES.H5SEURAT]: "Seurat HDF5 object (.h5seurat). Load with SeuratDisk::LoadH5Seurat() in R.",
+  [MIME_TYPES.LOOM]: "Loom (.loom) single-cell expression. Load with loompy / scanpy in Python.",
+  [MIME_TYPES.RDS]: "R serialized object (.rds) — Seurat / SCE / fitted model. Load with readRDS() in R.",
+  [MIME_TYPES.PICKLE]: "Python pickle — serialized model or dataset. Load with pickle.load() in Python.",
+  [MIME_TYPES.PYTORCH]: "PyTorch checkpoint (.pt/.pth). Load with torch.load() in Python.",
+  [MIME_TYPES.KERAS]: "Keras v3 model (.keras). Load with tf.keras.models.load_model() in Python.",
+  [MIME_TYPES.ONNX]: "ONNX model (.onnx). Load with onnxruntime; inspect at netron.app.",
+  [MIME_TYPES.SAFETENSORS]: "Safetensors file. Load with safetensors.torch.load_file() in Python.",
+  [MIME_TYPES.GGUF]: "GGUF model (llama.cpp / quantized LLM).",
+  [MIME_TYPES.NPY]: "NumPy array (.npy). Load with numpy.load() in Python.",
+  [MIME_TYPES.NPZ]: "NumPy archive (.npz) — ZIP of .npy arrays. Load with numpy.load().",
+  [MIME_TYPES.CSV]: "CSV file. Use the CSV File Scan operator in Texera.",
+  [MIME_TYPES.JSON]: "JSON file. Use the JSONL File Scan operator (or Python UDF for nested objects).",
+  [MIME_TYPES.XLSX]: "Excel spreadsheet (.xlsx). Convert to CSV or use a Python UDF with openpyxl.",
+  [MIME_TYPES.MSEXCEL]: "Excel spreadsheet (.xls). Convert to CSV or use a Python UDF.",
+  [MIME_TYPES.FASTA]: "FASTA sequence file. Parse with Biopython SeqIO.",
+  [MIME_TYPES.FASTQ]: "FASTQ reads file. Parse with Biopython SeqIO.",
+  [MIME_TYPES.VCF]: "VCF variant file. Parse with pyvcf / cyvcf2.",
+};
+
+/**
+ * Label one cell value with a coarse type: "null", "integer", "double",
+ * "boolean", "date" or "string". Patterns are tried in order; first match wins.
+ */
+function inferCellType(value: string): string {
+  if (value == null || value === "") {
+    return "null";
+  }
+  const patterns: [string, RegExp][] = [
+    ["integer", /^-?\d+$/],
+    ["double", /^-?\d+\.\d+$/],
+    ["double", /^-?\d+\.?\d*[eE][-+]?\d+$/],
+    ["boolean", /^(true|false|True|False|TRUE|FALSE)$/],
+    ["date", /^\d{4}-\d{2}-\d{2}(?:[T ]\d{2}:\d{2}(?::\d{2})?)?$/],
+  ];
+  const hit = patterns.find(([, pattern]) => pattern.test(value));
+  return hit ? hit[0] : "string";
+}
+
+/**
+ * Infer per-column type, null count, and a sample value from tabular data rows.
+ *
+ * Only the first `sampleLimit` rows are scanned. Each cell is trimmed and classified;
+ * empty or missing cells count as nulls. A column's type is the most frequent
+ * non-null cell type, and "string" when every scanned cell is null.
+ * A column holding both "integer" and "double" cells is widened: the integer cells
+ * are counted as doubles, so a column such as 1, 2, 3.5 is reported as "double".
+ *
+ * @param dataRows rows of cell values
+ * @param columnCount number of columns to describe
+ * @param sampleLimit maximum number of rows to scan (default 50)
+ * @returns parallel arrays indexed by column: inferred type, null count, first non-null value ("" if none)
+ */
+export function inferColumnSchema(
+  dataRows: string[][],
+  columnCount: number,
+  sampleLimit: number = 50
+): { types: string[]; nullCounts: number[]; samples: string[] } {
+  const types: string[] = [];
+  const nullCounts: number[] = [];
+  const samples: string[] = [];
+  const rowsToScan = Math.min(dataRows.length, sampleLimit);
+
+  for (let c = 0; c < columnCount; c++) {
+    const typeCounts: Record<string, number> = {};
+    let nullCount = 0;
+    let firstNonNull = "";
+
+    for (let r = 0; r < rowsToScan; r++) {
+      // Short rows (and holes) yield undefined, which is treated as an empty cell.
+      const raw = dataRows[r]?.[c];
+      const val = raw == null ? "" : String(raw).trim();
+      const t = inferCellType(val);
+      if (t === "null") {
+        nullCount++;
+      } else {
+        if (firstNonNull === "") firstNonNull = val;
+        typeCounts[t] = (typeCounts[t] ?? 0) + 1;
+      }
+    }
+
+    // Numeric widening: integers are representable as doubles, never the reverse.
+    const integerCount = typeCounts["integer"] ?? 0;
+    const doubleCount = typeCounts["double"] ?? 0;
+    if (integerCount > 0 && doubleCount > 0) {
+      typeCounts["double"] = doubleCount + integerCount;
+      delete typeCounts["integer"];
+    }
+
+    // The sort is stable, so ties keep first-seen order.
+    const ranked = Object.entries(typeCounts).sort((a, b) => b[1] - a[1]);
+    types.push(ranked[0]?.[0] ?? "string");
+    nullCounts.push(nullCount);
+    samples.push(firstNonNull);
+  }
+  return { types, nullCounts, samples };
+}
+
+/**
+ * Compute the maximum nesting depth of an arbitrary JSON value.
+ *
+ * A scalar sits at `depth`; the children of an array or object sit one level deeper.
+ * With the default start: `5` → 1, `[]` → 1, `[1]` → 2, `{"a": [1]}` → 3.
+ * The walk uses an explicit stack instead of recursion, so very deeply nested input
+ * cannot overflow the call stack.
+ *
+ * @param value the parsed JSON value to measure
+ * @param depth depth assigned to `value` itself (default 1)
+ * @returns the deepest level reached
+ */
+function jsonMaxDepth(value: unknown, depth = 1): number {
+  let deepest = depth;
+  const pending: [unknown, number][] = [[value, depth]];
+
+  for (let next = pending.pop(); next !== undefined; next = pending.pop()) {
+    const [node, level] = next;
+    if (level > deepest) deepest = level;
+    if (node === null || typeof node !== "object") continue;
+
+    const children: unknown[] = Array.isArray(node) ? node : Object.values(node as Record<string, unknown>);
+    for (const child of children) {
+      if (child !== null && typeof child === "object") {
+        pending.push([child, level + 1]);
+      } else if (level + 1 > deepest) {
+        // Scalars are leaves: record their depth without queueing them.
+        deepest = level + 1;
+      }
+    }
+  }
+  return deepest;
+}
+
+/**
+ * Human-readable type label for a JS value: "null", "array(<length>)",
+ * or otherwise whatever `typeof` reports.
+ */
+function jsTypeLabel(value: unknown): string {
+  if (Array.isArray(value)) {
+    return `array(${value.length})`;
+  }
+  return value === null ? "null" : typeof value;
+}
+
+/**
+ * Extract /Info dictionary fields from a PDF's raw text. Heuristic but robust for unencrypted PDFs.
+ *
+ * - `version` comes from a leading `%PDF-x.y` marker, if present.
+ * - `encrypted` is true when an `/Encrypt` key appears anywhere in the text.
+ * - `title`, `author`, `creator` and `producer` are read only from the parenthesized
+ *   string form; each is `undefined` when absent or empty after trimming.
+ *
+ * @param rawText the PDF content decoded as text
+ * @returns the fields that could be recovered
+ */
+function extractPdfInfo(rawText: string): {
+  title?: string;
+  author?: string;
+  creator?: string;
+  producer?: string;
+  version?: string;
+  encrypted?: boolean;
+} {
+  const result: ReturnType<typeof extractPdfInfo> = {};
+  const versionMatch = rawText.match(/^%PDF-(\d+\.\d+)/);
+  if (versionMatch) result.version = versionMatch[1];
+  result.encrypted = /\/Encrypt\b/.test(rawText);
+
+  // Match `/Title (value)` or `/Title <hex>` — only the parenthesized form is reliably plain text
+  const fieldRe = (name: string) => new RegExp(`/${name}\\s*\\(([^)\\\\]*(?:\\\\.[^)\\\\]*)*)\\)`);
+  const grab = (name: string): string | undefined => {
+    const m = rawText.match(fieldRe(name));
+    if (!m) return undefined;
+    // PDF strings can contain \( \) \\ escapes — unescape minimally
+    return m[1].replace(/\\([()\\])/g, "$1").trim() || undefined;
+  };
+  result.title = grab("Title");
+  result.author = grab("Author");
+  result.creator = grab("Creator");
+  result.producer = grab("Producer");
+  return result;
+}
+
+/**
+ * Summarize FASTA text: record count, residue totals, GC fraction, sequence-length
+ * statistics, and a protein-vs-nucleotide guess.
+ *
+ * A ">" outside a header starts a new record; everything up to the next newline is header text.
+ * Spaces, tabs and carriage returns inside sequence data are ignored.
+ * Min/max lengths consider only records that contain at least one residue.
+ */
+function summarizeFasta(text: string): {
+  sequenceCount: number;
+  totalBases: number;
+  gcContent: number;
+  minLen: number;
+  maxLen: number;
+  avgLen: number;
+  isProtein: boolean;
+} {
+  // Single pass over the characters — no per-line array is built for multi-MB input.
+  const NUCLEOTIDES = "ACGTUNacgtun";
+  let headerOpen = false;
+  let records = 0;
+  let runLength = 0;
+  let residues = 0;
+  let gc = 0;
+  let nonNucleotide = 0;
+  let shortest = Infinity;
+  let longest = 0;
+
+  // Fold the record just ended into the min/max stats, then reset its length.
+  const closeRecord = (): void => {
+    if (records > 0 && runLength > 0) {
+      shortest = Math.min(shortest, runLength);
+      longest = Math.max(longest, runLength);
+    }
+    runLength = 0;
+  };
+
+  for (let pos = 0; pos < text.length; pos++) {
+    const ch = text[pos];
+    if (ch === "\n") {
+      headerOpen = false;
+      continue;
+    }
+    if (headerOpen) continue;
+
+    switch (ch) {
+      case ">":
+        closeRecord();
+        records++;
+        headerOpen = true;
+        break;
+      case "\r":
+      case " ":
+      case "\t":
+        break;
+      default:
+        runLength++;
+        residues++;
+        if ("GCgc".includes(ch)) gc++;
+        if (!NUCLEOTIDES.includes(ch)) nonNucleotide++;
+    }
+  }
+  closeRecord();
+
+  return {
+    sequenceCount: records,
+    totalBases: residues,
+    gcContent: residues > 0 ? gc / residues : 0,
+    minLen: shortest === Infinity ? 0 : shortest,
+    maxLen: longest,
+    avgLen: records > 0 ? residues / records : 0,
+    // More than 10% non-ACGTUN characters is taken to mean protein.
+    isProtein: residues > 0 && nonNucleotide / residues > 0.1,
+  };
+}
+
@UntilDestroy()
@Component({
selector: "texera-user-dataset-file-renderer",
@@ -101,14 +555,30 @@ export const MIME_TYPE_SIZE_LIMITS_MB = {
NzTbodyComponent,
MarkdownComponent,
NgxJsonViewerModule,
+ NzButtonComponent,
+ NzIconDirective,
],
})
export class UserDatasetFileRendererComponent implements OnInit, OnChanges, OnDestroy {
- private DEFAULT_MAX_SIZE = 5 * 1024 * 1024; // 5 MB
+ /** Fallback size ceiling in bytes, used when MIME_TYPE_SIZE_LIMITS_MB has no entry for a MIME type. */
+ private DEFAULT_MAX_SIZE = 1024 * 1024 * 1024; // 1 GB
+
+ // For text-based formats we slice to this size before parsing/rendering.
+ // Reading 1 GB as a UTF-16 string in JS would balloon to ~2 GB and likely crash the tab.
+ private static readonly PREVIEW_TEXT_BYTES = 10 * 1024 * 1024; // 10 MB
+
+ /**
+  * Cap a blob at PREVIEW_TEXT_BYTES for text previews.
+  *
+  * @param blob the full file content.
+  * @returns the blob itself with `truncated: false` when it fits within the limit, otherwise
+  *          its leading PREVIEW_TEXT_BYTES bytes with `truncated: true`.
+  */
+ private getPreviewSlice(blob: Blob): { slice: Blob; truncated: boolean } {
+   const maxBytes = UserDatasetFileRendererComponent.PREVIEW_TEXT_BYTES;
+   const truncated = blob.size > maxBytes;
+   const slice = truncated ? blob.slice(0, maxBytes) : blob;
+   return { slice, truncated };
+ }
+
+ /** True when text content shown is from a slice rather than the whole file. */
+ public previewTruncated: boolean = false;
public fileURL: string | undefined;
- // safe url is used to display some formats including image
public safeFileURL: SafeUrl | undefined;
+ public safeResourceFileURL: SafeResourceUrl | undefined;
// table related control
public displayCSV: boolean = false;
@@ -131,10 +601,18 @@ export class UserDatasetFileRendererComponent implements OnInit, OnChanges, OnDe
// audio
public displayMP3: boolean = false;
- // plain text & octet stream related control
+ // PDF
+ public displayPDF: boolean = false;
+
+ // plain text
public displayPlainText: boolean = false;
public textContent: string = "";
+ // shown for detectable-but-unpreviewable types (Parquet, Arrow, DOCX, PPTX)
+ public detectedTypeMessage: string = "";
+
+ public fileMetadata: FileMetadata | undefined = undefined;
+
// control flags
public isLoading: boolean = false;
public isFileSizeUnloadable = false;
@@ -142,33 +620,107 @@ export class UserDatasetFileRendererComponent implements OnInit, OnChanges, OnDe
public isFileTypePreviewUnsupported: boolean = false;
public currentFile: File | undefined = undefined;
- @Input()
- isMaximized: boolean = false;
-
- @Input()
- did: number | undefined;
- @Input()
- dvid: number | undefined;
+ @Input() isMaximized: boolean = false;
+ @Input() did: number | undefined;
+ @Input() dvid: number | undefined;
+ @Input() filePath: string = "";
+ @Input() fileSize?: number;
+ @Input() isLogin: boolean = false;
- @Input()
- filePath: string = "";
-
- @Input()
- fileSize?: number;
-
- @Input()
- isLogin: boolean = false;
-
- @Output()
- loadFile = new EventEmitter<{ file: string; prefix: string }>();
+ @Output() loadFile = new EventEmitter<{ file: string; prefix: string }>();
+ /**
+  * Dependencies are injected as private parameter properties. Within this component they are
+  * used to fetch file blobs (datasetService), show notifications (notificationService),
+  * request change detection (cdr), navigate to the workflow editor (router), create
+  * workflows (workflowPersistService) and read default workflow settings (config).
+  */
constructor(
private datasetService: DatasetService,
private sanitizer: DomSanitizer,
- private notificationService: NotificationService
+ private notificationService: NotificationService,
+ private cdr: ChangeDetectorRef,
+ private router: Router,
+ private workflowPersistService: WorkflowPersistService,
+ private config: GuiConfigService
) {}
+ /**
+  * Whether the "open in workflow" action can be offered. It applies to every file type,
+  * so the only requirement is that a file path is set.
+  */
+ get canOpenInWorkflow(): boolean {
+   return Boolean(this.filePath);
+ }
+
+ /**
+  * Pick the scan operator type for a file, based on the MIME type that getMimeType()
+  * derives from its path. The result is what onOpenInWorkflow() sends as the `addOp`
+  * query param.
+  *
+  * @param filePath path of the dataset file.
+  * @returns "CSVFileScan", "JSONLFileScan", "ArrowFileScan" or "ParquetFileScan" for the
+  *          matching tabular formats, "FileScan" for the listed image / PDF / audio / video /
+  *          text types, and null for everything else.
+  */
+ private static getOperatorTypeForFile(filePath: string): string | null {
+   const mime = getMimeType(filePath);
+
+   // Formats with a dedicated scan operator.
+   if (mime === MIME_TYPES.CSV) return "CSVFileScan";
+   if (mime === MIME_TYPES.JSON) return "JSONLFileScan";
+   if (mime === MIME_TYPES.ARROW) return "ArrowFileScan";
+   if (mime === MIME_TYPES.PARQUET) return "ParquetFileScan";
+
+   // Formats handled by the generic file scan operator.
+   const genericScanMimes: readonly string[] = [
+     MIME_TYPES.PNG,
+     MIME_TYPES.JPEG,
+     MIME_TYPES.WEBP,
+     MIME_TYPES.GIF,
+     MIME_TYPES.AVIF,
+     MIME_TYPES.BMP,
+     MIME_TYPES.TIFF,
+     MIME_TYPES.PDF,
+     MIME_TYPES.MP3,
+     MIME_TYPES.MP4,
+     MIME_TYPES.WAV,
+     MIME_TYPES.FLAC,
+     MIME_TYPES.WEBM,
+     MIME_TYPES.MOV,
+     MIME_TYPES.TXT,
+     MIME_TYPES.MD,
+   ];
+   return genericScanMimes.includes(mime) ? "FileScan" : null;
+ }
+
+ /**
+  * Create a new, empty workflow named after the current file and navigate to the editor.
+  *
+  * When getOperatorTypeForFile() maps the file to a scan operator, the navigation carries
+  * the `addOp` (operator type) and `fileName` (full dataset path) query params; they are
+  * consumed on the workspace side — see
+  * `workspace.component.ts:handlePendingOperatorAddition()`.
+  *
+  * Failures (create request, missing workflow id, blocked or failed navigation) are reported
+  * through notificationService.
+  */
+ onOpenInWorkflow(): void {
+   // Snapshot the path once: the @Input can change while the create request is in flight,
+   // and `addOp` and `fileName` must describe the same file.
+   const filePath = this.filePath;
+   // pop() yields "" for a path ending in "/", so test for emptiness rather than null.
+   const fileName = filePath.split("/").pop();
+   const workflowName = fileName ? `Analysis of ${fileName}` : DEFAULT_WORKFLOW_NAME;
+   const addOp = UserDatasetFileRendererComponent.getOperatorTypeForFile(filePath);
+   const workflowContent: WorkflowContent = {
+     operators: [],
+     commentBoxes: [],
+     links: [],
+     operatorPositions: {},
+     settings: {
+       dataTransferBatchSize: this.config.env.defaultDataTransferBatchSize,
+       executionMode: this.config.env.defaultExecutionMode ?? ExecutionMode.PIPELINED,
+     },
+   };
+   this.workflowPersistService
+     .createWorkflow(workflowContent, workflowName)
+     .pipe(untilDestroyed(this))
+     .subscribe({
+       next: created => {
+         const wid = created?.workflow?.wid;
+         if (wid == null) {
+           this.notificationService.error("Workflow created but no ID was returned.");
+           return;
+         }
+         // Query params tell the workspace component which operator to auto-add and which
+         // file path to wire into its fileName property. The workspace strips them on use.
+         const queryParams = addOp ? { addOp, fileName: filePath } : undefined;
+         this.router.navigate([DASHBOARD_USER_WORKSPACE, wid], { queryParams }).then(
+           navigated => {
+             if (!navigated) {
+               this.notificationService.error("Navigation to the workflow editor was blocked.");
+             }
+           },
+           // navigate() rejects on router errors; surface them instead of leaving an
+           // unhandled promise rejection.
+           () => this.notificationService.error("Navigation to the workflow editor failed.")
+         );
+       },
+       error: () => this.notificationService.error("Failed to create workflow"),
+     });
+ }
+
+ /** Angular init hook: triggers the first content load by calling reloadFileContent(). */
ngOnInit(): void {
this.reloadFileContent();
}
@@ -194,105 +746,894 @@ export class UserDatasetFileRendererComponent implements OnInit, OnChanges, OnDe
+ /** Id of the most recent reloadFileContent() call; responses carrying an older id are ignored. */
+ private reloadRequestId = 0;
+
+ /**
+  * Reset the preview and load the file at `filePath`.
+  *
+  * Files larger than FULL_PREVIEW_MAX_BYTES are not downloaded; showOversizedFileInfo() is
+  * called with the extension-derived type instead. Otherwise the blob is fetched, its type is
+  * detected from content, the per-type size limit is re-checked against the real blob size,
+  * and the blob is handed to renderByMimeType().
+  *
+  * Each call invalidates the previous one, so a slow response for an earlier file can
+  * neither overwrite the current preview nor clear the current loading state.
+  */
reloadFileContent() {
this.turnOffAllDisplay();
+   const requestId = ++this.reloadRequestId;
+   // A superseded request no longer clears the loading flag itself, so clear it here;
+   // it is raised again below if this call actually starts a download.
+   this.isLoading = false;
- // Pre-check - file size
- const mimeType = getMimeType(this.filePath);
- if (!this.isPreviewSupported(mimeType)) {
- this.onFileTypePreviewUnsupported();
+   const extensionMime = getMimeType(this.filePath);
+
+   // Skip the full download for large files. The dataset service streams the entire blob;
+   // for a 500 MB file we'd wait 30+ seconds just to read its first 16 magic bytes. Above
+   // the threshold, fall back to extension-based identification + a "how to load" hint.
+   if (this.fileSize != null && this.fileSize > FULL_PREVIEW_MAX_BYTES) {
+     this.showOversizedFileInfo(extensionMime);
return;
}
- const limit = MIME_TYPE_SIZE_LIMITS_MB[mimeType] ?? this.DEFAULT_MAX_SIZE;
- if (this.fileSize != null && this.fileSize > limit) {
+
+   // Hard upper bound (defensive): even small types shouldn't load anything past this.
+   const preCheckLimit = MIME_TYPE_SIZE_LIMITS_MB[extensionMime] ?? this.DEFAULT_MAX_SIZE;
+   if (this.fileSize != null && this.fileSize > preCheckLimit) {
this.onFileSizeNotLoadable();
return;
}
- // Load file
+   if (!this.did || !this.dvid || !this.filePath) return;
+
this.isLoading = true;
- if (this.did && this.dvid && this.filePath != "") {
- this.datasetService
- .retrieveDatasetVersionSingleFile(this.filePath, this.isLogin)
- .pipe(untilDestroyed(this))
- .subscribe({
- next: blob => {
- this.isLoading = false;
- const blobMimeType = getMimeType(this.filePath);
- if (!this.isPreviewSupported(blobMimeType)) {
- this.onFileTypePreviewUnsupported();
- return;
- }
- const MaxSize = MIME_TYPE_SIZE_LIMITS_MB[blobMimeType] || this.DEFAULT_MAX_SIZE;
- const fileSize = blob.size;
- if (fileSize > MaxSize) {
- this.onFileSizeNotLoadable();
- this.notificationService.warning(`File ${this.filePath} is too large to be previewed`);
- return;
- }
- this.currentFile = new File([blob], this.filePath, { type: blob.type });
- // Handle different file types
- switch (blobMimeType) {
- case MIME_TYPES.PNG:
- case MIME_TYPES.JPEG:
- case MIME_TYPES.WEBP:
- case MIME_TYPES.GIF:
- this.displayImage = true;
- this.loadSafeURL(blob);
- break;
- case MIME_TYPES.MP4:
- this.displayMP4 = true;
- this.loadSafeURL(blob);
- break;
-
- case MIME_TYPES.MP3:
- this.displayMP3 = true;
- this.loadSafeURL(blob);
- break;
-
- case MIME_TYPES.MSEXCEL:
- readXlsxFile(blob).then(rows => {
- let parsedData: string[][] = [];
- rows.forEach(row => {
- // Convert each cell in the row to a string
- let stringRow = row.map(cell => (cell ? cell.toString() : ""));
- // Add the string array to the main array
- parsedData.push(stringRow);
- });
- if (parsedData.length > 0) {
- this.loadTabularFile(parsedData);
- this.displayXlsx = true;
- }
- });
- break;
- case MIME_TYPES.CSV:
- this.displayCSV = true;
- // Handle CSV display
- Papa.parse(this.currentFile, {
- complete: (results: ParseResult) => {
- if (results.data.length > 0) {
- this.loadTabularFile(results.data);
- }
- },
- error: error => {
- console.error("Error parsing file:", error);
- this.onFileLoadingError();
- },
- });
- break;
- case MIME_TYPES.MD:
- this.displayMarkdown = true;
- this.readFileAsText(blob);
- break;
- case MIME_TYPES.JSON:
- this.displayJson = true;
- this.readFileAsText(blob);
- break;
- case MIME_TYPES.TXT:
- default:
- this.displayPlainText = true;
- this.readFileAsText(blob);
- break;
+   this.datasetService
+     .retrieveDatasetVersionSingleFile(this.filePath, this.isLogin)
+     .pipe(untilDestroyed(this))
+     .subscribe({
+       next: async (blob: Blob) => {
+         // A newer reloadFileContent() call owns the view now; drop this response.
+         if (requestId !== this.reloadRequestId) return;
+         this.isLoading = false;
+
+         try {
+           const detectedMime = await this.detectMimeType(blob, this.filePath);
+           // Detection is asynchronous, so the request may have been superseded meanwhile.
+           if (requestId !== this.reloadRequestId) return;
+
+           // Post-detection size check against the now-known type limit
+           const sizeLimit = MIME_TYPE_SIZE_LIMITS_MB[detectedMime] ?? this.DEFAULT_MAX_SIZE;
+           if (blob.size > sizeLimit) {
+             this.onFileSizeNotLoadable();
+             this.notificationService.warning(`File ${this.filePath} is too large to preview`);
+             return;
+           }
+
+           // currentFile is built lazily inside the CSV case (the only consumer); avoids an
+           // extra in-memory copy of the blob for every other type.
+           this.renderByMimeType(blob, detectedMime);
+         } catch {
+           // An exception in an async handler would otherwise become an unhandled rejection.
+           if (requestId === this.reloadRequestId) this.onFileLoadingError();
+         }
+       },
+       error: () => {
+         if (requestId === this.reloadRequestId) this.onFileLoadingError();
+       },
+     });
+ }
+
+ /**
+  * Detect the MIME type of a blob from its content, using the file name only as a hint.
+  *
+  * Strategies, in order (the first match wins):
+  *  1. file-type library (magic bytes). A ZIP result is refined by extension to PyTorch
+  *     (.pt/.pth), Keras (.keras) or NPZ (.npz); a gzip result with extension .rds becomes RDS.
+  *  2. Manual magic bytes: Parquet, Arrow IPC, HDF5 (refined by extension to h5ad / h5seurat /
+  *     loom), NumPy .npy, GGUF, Python pickle.
+  *  3. Extension fallback for Safetensors and ONNX.
+  *  4. Text sniffing on the first 4096 bytes: JSON, Markdown, VCF, FASTA, FASTQ, CSV, and
+  *     finally plain text when the first 512 bytes are valid UTF-8 without control characters.
+  *
+  * Every strategy is best effort: a failure inside one strategy falls through to the next.
+  * Blobs are read through FileReader (readBlobBytes / readBlobText).
+  *
+  * @param blob file content.
+  * @param fileName optional name or path; only the text after the last "." is used.
+  * @returns the detected MIME type, or MIME_TYPES.OCTET_STREAM when nothing matched.
+  */
+ async detectMimeType(blob: Blob, fileName?: string): Promise<string> {
+   const ext = (fileName ?? "").split(".").pop()?.toLowerCase() ?? "";
+
+   // 1. file-type library covers images, video, audio, PDF, Office (ZIP-based), and more.
+   if (typeof fileTypeFromBlob === "function") {
+     try {
+       const result = await fileTypeFromBlob(blob);
+       if (result) {
+         // Refine generic container types (ZIP, gzip) using extension hints
+         if (result.mime === "application/zip") {
+           if (ext === "pt" || ext === "pth") return MIME_TYPES.PYTORCH;
+           if (ext === "keras") return MIME_TYPES.KERAS;
+           if (ext === "npz") return MIME_TYPES.NPZ;
+         }
+         if (result.mime === "application/gzip" && ext === "rds") return MIME_TYPES.RDS;
+         return result.mime;
+       }
+     } catch (_) {
+       // Best effort: fall through to the manual checks.
+     }
+   }
+
+   // 2. Manual magic bytes for formats not in file-type's signature list.
+   try {
+     const header = await this.readBlobBytes(blob.slice(0, 16));
+     const startsWith = (...signature: number[]): boolean => signature.every((b, i) => header[i] === b);
+
+     // Parquet: "PAR1"
+     if (startsWith(0x50, 0x41, 0x52, 0x31)) return MIME_TYPES.PARQUET;
+     // Arrow IPC: "ARROW1"
+     if (startsWith(0x41, 0x52, 0x52, 0x4f, 0x57, 0x31)) return MIME_TYPES.ARROW;
+     // HDF5: \x89HDF\r\n\x1a\n
+     if (startsWith(0x89, 0x48, 0x44, 0x46, 0x0d, 0x0a, 0x1a, 0x0a)) {
+       // Refine HDF5 sub-types by extension (all use identical magic bytes)
+       if (ext === "h5ad") return MIME_TYPES.H5AD;
+       if (ext === "h5seurat") return MIME_TYPES.H5SEURAT;
+       if (ext === "loom") return MIME_TYPES.LOOM;
+       return MIME_TYPES.HDF5;
+     }
+     // NumPy .npy: \x93NUMPY
+     if (startsWith(0x93, 0x4e, 0x55, 0x4d, 0x50, 0x59)) return MIME_TYPES.NPY;
+     // GGUF: ASCII "GGUF"
+     if (startsWith(0x47, 0x47, 0x55, 0x46)) return MIME_TYPES.GGUF;
+     // Python pickle, protocols 2-5: PROTO opcode \x80 followed by the protocol number.
+     // Only these two bytes are checked, so this is a weak signature.
+     if (header[0] === 0x80 && header[1] >= 0x02 && header[1] <= 0x05) {
+       return MIME_TYPES.PICKLE;
+     }
+   } catch (_) {
+     // Best effort: fall through to the extension / text checks.
+   }
+
+   // 3. Extension-based fallback for opaque binaries lacking reliable magic bytes
+   if (ext === "safetensors") return MIME_TYPES.SAFETENSORS;
+   if (ext === "onnx") return MIME_TYPES.ONNX;
+
+   // 4. Text sniffing for formats with no fixed magic bytes
+   try {
+     const sample = await this.readBlobText(blob.slice(0, 4096));
+     const trimmed = sample.trimStart();
+     const lines = trimmed.split("\n");
+     const firstLine = lines[0] ?? "";
+
+     if (trimmed.startsWith("{") || trimmed.startsWith("[")) {
+       return MIME_TYPES.JSON;
+     }
+     if (trimmed.startsWith("# ") || trimmed.startsWith("## ")) {
+       return MIME_TYPES.MD;
+     }
+     // VCF: header line starts with ##fileformat=VCF
+     if (firstLine.startsWith("##fileformat=VCF")) {
+       return MIME_TYPES.VCF;
+     }
+     // FASTA: first non-blank line starts with '>'
+     if (firstLine.startsWith(">")) {
+       return MIME_TYPES.FASTA;
+     }
+     // FASTQ: 4-line record pattern — line 1 starts '@', line 3 starts '+'
+     if (lines.length >= 4 && lines[0].startsWith("@") && lines[2].startsWith("+")) {
+       return MIME_TYPES.FASTQ;
+     }
+     // CSV heuristic: first line has at least 3 comma-separated fields
+     if (firstLine.split(",").length >= 3) {
+       return MIME_TYPES.CSV;
+     }
+
+     // Plain text: the leading bytes must be valid UTF-8 with no control characters other
+     // than tab / LF / CR. An ASCII-only test would classify any text containing accented
+     // or non-Latin characters as binary.
+     const bytes = await this.readBlobBytes(blob.slice(0, 512));
+     if (typeof TextDecoder === "function") {
+       // fatal: invalid UTF-8 throws (caught below, so the result is octet-stream).
+       // stream: a multi-byte character cut off by the 512-byte slice is not an error.
+       const decoded = new TextDecoder("utf-8", { fatal: true }).decode(bytes, { stream: true });
+       const isControl = (code: number): boolean =>
+         (code < 32 && code !== 9 && code !== 10 && code !== 13) || (code >= 127 && code <= 159);
+       let hasControl = false;
+       for (let i = 0; i < decoded.length && !hasControl; i++) {
+         hasControl = isControl(decoded.charCodeAt(i));
+       }
+       if (!hasControl) return MIME_TYPES.TXT;
+     } else if (bytes.every(b => b === 9 || b === 10 || b === 13 || (b >= 32 && b <= 126))) {
+       // No TextDecoder in this environment: fall back to the ASCII-only check.
+       return MIME_TYPES.TXT;
+     }
+   } catch (_) {
+     // Unreadable or not valid UTF-8: treat as unidentified binary.
+   }
+
+   return MIME_TYPES.OCTET_STREAM;
+ }
+
+ /**
+  * Parse the header of a NumPy .npy file.
+  *
+  * Layout read here: bytes 0-5 magic (\x93NUMPY), byte 6 major version, byte 7 minor version,
+  * then the header length (uint16 LE for major 1, uint32 LE for major >= 2) followed by the
+  * header text. Only the first 4096 bytes of the blob are read, so a longer header is
+  * matched against its leading part only.
+  *
+  * @param blob the .npy file content.
+  * @returns dtype, shape, byte order and Fortran flag (each undefined when not found in the
+  *          header text), or null when the blob is too short, lacks the magic, or reading fails.
+  */
+ private async parseNpyHeader(
+   blob: Blob
+ ): Promise<{ dtype?: string; shape?: number[]; byteOrder?: string; fortranOrder?: boolean } | null> {
+   try {
+     const head = await this.readBlobBytes(blob.slice(0, 4096));
+     const magic = [0x93, 0x4e, 0x55, 0x4d, 0x50, 0x59];
+     if (!magic.every((b, i) => head[i] === b)) return null;
+
+     const major = head[6];
+     const headerStart = major >= 2 ? 12 : 10;
+     // Without the full length field the values below would be read from missing bytes.
+     if (head.length < headerStart) return null;
+     // ">>> 0" keeps the 32-bit length unsigned; "<< 24" alone can yield a negative number.
+     const headerLen =
+       major >= 2
+         ? (head[8] | (head[9] << 8) | (head[10] << 16) | (head[11] << 24)) >>> 0
+         : head[8] | (head[9] << 8);
+     const headerText = new TextDecoder().decode(head.slice(headerStart, headerStart + headerLen));
+
+     const dtypeMatch = headerText.match(/['"]descr['"]\s*:\s*['"]([^'"]+)['"]/);
+     const shapeMatch = headerText.match(/['"]shape['"]\s*:\s*\(([^)]*)\)/);
+     const fortranMatch = headerText.match(/['"]fortran_order['"]\s*:\s*(True|False)/);
+
+     const shape = shapeMatch
+       ? shapeMatch[1]
+           .split(",")
+           .map(s => s.trim())
+           .filter(s => s.length > 0)
+           .map(Number)
+       : undefined;
+     const dtype = dtypeMatch?.[1];
+
+     // dtype prefix: '<' = little-endian, '>' = big-endian, '|' = byte order N/A, '=' = native
+     let byteOrder: string | undefined;
+     if (dtype) {
+       if (dtype.startsWith("<")) byteOrder = "little-endian";
+       else if (dtype.startsWith(">")) byteOrder = "big-endian";
+       else if (dtype.startsWith("|")) byteOrder = "n/a";
+       else if (dtype.startsWith("=")) byteOrder = "native";
+     }
+     const fortranOrder = fortranMatch ? fortranMatch[1] === "True" : undefined;
+     return { dtype, shape, byteOrder, fortranOrder };
+   } catch {
+     return null;
+   }
+ }
+
+ /**
+  * Parse the JSON header of a Safetensors file.
+  *
+  * Layout read here: bytes 0-7 hold the header length (uint64 LE), followed by that many
+  * bytes of JSON. Every key except `__metadata__` is treated as a tensor entry with `shape`
+  * and `dtype`; `__metadata__` is treated as free-form key/value pairs.
+  *
+  * Parameter counts are the product of each tensor's shape. A scalar tensor (shape `[]`)
+  * counts as 1 parameter; an entry without a shape array counts as 0.
+  *
+  * @param blob the .safetensors file content.
+  * @returns tensor statistics (first 5 names, per-dtype parameter totals sorted descending,
+  *          largest tensor, up to 6 metadata pairs), or null when the header is missing,
+  *          empty, larger than 100 MB, not a JSON object, or cannot be read.
+  */
+ private async parseSafetensorsHeader(blob: Blob): Promise<{
+   tensorCount: number;
+   parameterCount: number;
+   sampleNames: string[];
+   dtypeBreakdown: { dtype: string; params: number }[];
+   largestTensor?: { name: string; shape: number[]; params: number };
+   metadata?: { key: string; value: string }[];
+ } | null> {
+   try {
+     const lenBytes = await this.readBlobBytes(blob.slice(0, 8));
+     if (lenBytes.length < 8) return null;
+     // uint64 LE — JS can read up to 53 bits safely; header is always small (KB-MB)
+     let headerLen = 0;
+     for (let i = 0; i < 8; i++) headerLen += lenBytes[i] * Math.pow(256, i);
+     if (headerLen <= 0 || headerLen > 100 * 1024 * 1024) return null;
+
+     const headerText = await this.readBlobText(blob.slice(8, 8 + headerLen));
+     const json: unknown = JSON.parse(headerText);
+     // The header must be a JSON object keyed by tensor name.
+     if (typeof json !== "object" || json === null || Array.isArray(json)) return null;
+     const header = json as Record<string, unknown>;
+
+     const names = Object.keys(header).filter(k => k !== "__metadata__");
+     let paramCount = 0;
+     const dtypeMap: Record<string, number> = {};
+     let largest: { name: string; shape: number[]; params: number } | undefined;
+     for (const name of names) {
+       const entry = header[name];
+       const fields = typeof entry === "object" && entry !== null ? (entry as Record<string, unknown>) : {};
+       const declaredShape = Array.isArray(fields.shape) ? (fields.shape as number[]) : undefined;
+       const shape = declaredShape ?? [];
+       const dtype = typeof fields.dtype === "string" ? fields.dtype : "?";
+       // reduce() over an empty shape yields 1, which is the element count of a scalar.
+       const params = declaredShape ? declaredShape.reduce((a, b) => a * b, 1) : 0;
+       paramCount += params;
+       dtypeMap[dtype] = (dtypeMap[dtype] ?? 0) + params;
+       if (!largest || params > largest.params) largest = { name, shape, params };
+     }
+
+     const dtypeBreakdown = Object.entries(dtypeMap)
+       .sort((a, b) => b[1] - a[1])
+       .map(([dtype, params]) => ({ dtype, params }));
+
+     const rawMeta = header["__metadata__"];
+     const meta = typeof rawMeta === "object" && rawMeta !== null ? (rawMeta as Record<string, unknown>) : {};
+     const metadata = Object.entries(meta)
+       .slice(0, 6)
+       .map(([key, value]) => ({ key, value: String(value) }));
+
+     return {
+       tensorCount: names.length,
+       parameterCount: paramCount,
+       sampleNames: names.slice(0, 5),
+       dtypeBreakdown,
+       largestTensor: largest,
+       metadata: metadata.length > 0 ? metadata : undefined,
+     };
+   } catch {
+     return null;
+   }
+ }
+
+ /**
+  * Parse the fixed-size start of a GGUF file.
+  *
+  * Layout read here: bytes 0-3 "GGUF" magic, bytes 4-7 version (uint32 LE), bytes 8-15 tensor
+  * count (uint64 LE), bytes 16-23 metadata key/value count (uint64 LE). The 64-bit counts are
+  * accumulated as JS numbers, so values above 2^53 lose precision.
+  *
+  * @param blob the GGUF file content.
+  * @returns version, tensor count and metadata KV count, or null when the blob is shorter
+  *          than 24 bytes, lacks the magic, or cannot be read.
+  */
+ private async parseGgufHeader(
+   blob: Blob
+ ): Promise<{ version: number; tensorCount: number; metadataKvCount: number } | null> {
+   try {
+     const head = await this.readBlobBytes(blob.slice(0, 24));
+     // A truncated header would otherwise produce NaN counts from missing bytes.
+     if (head.length < 24) return null;
+     const magic = [0x47, 0x47, 0x55, 0x46]; // "GGUF"
+     if (!magic.every((b, i) => head[i] === b)) return null;
+
+     // Little-endian unsigned read; arithmetic (not bitwise) so the result never goes negative.
+     const readUintLE = (offset: number, byteCount: number): number => {
+       let value = 0;
+       for (let i = 0; i < byteCount; i++) value += head[offset + i] * Math.pow(256, i);
+       return value;
+     };
+
+     return {
+       version: readUintLE(4, 4),
+       tensorCount: readUintLE(8, 8),
+       metadataKvCount: readUintLE(16, 8),
+     };
+   } catch {
+     return null;
+   }
+ }
+
+ /**
+  * Read a blob fully into memory through FileReader.
+  *
+  * @param blob the blob (or slice) to read.
+  * @returns a promise that resolves with the blob's bytes and rejects with the reader's error.
+  */
+ private readBlobBytes(blob: Blob): Promise<Uint8Array> {
+   return new Promise<Uint8Array>((resolve, reject) => {
+     const reader = new FileReader();
+     reader.onload = () => resolve(new Uint8Array(reader.result as ArrayBuffer));
+     reader.onerror = () => reject(reader.error);
+     reader.readAsArrayBuffer(blob);
+   });
+ }
+
+ /**
+  * Read a blob as text through FileReader.readAsText, without passing an explicit encoding.
+  *
+  * @param blob the blob (or slice) to read.
+  * @returns a promise that resolves with the decoded text and rejects with the reader's error.
+  */
+ private readBlobText(blob: Blob): Promise<string> {
+   return new Promise<string>((resolve, reject) => {
+     const reader = new FileReader();
+     reader.onload = () => resolve(reader.result as string);
+     reader.onerror = () => reject(reader.error);
+     reader.readAsText(blob);
+   });
+ }
+
+ /**
+  * Whether a MIME type can be rendered or at least described.
+  *
+  * @param mimeType the MIME type to test.
+  * @returns false only for MIME_TYPES.OCTET_STREAM (unidentified binary), true otherwise.
+  */
+ isPreviewSupported(mimeType: string): boolean {
+   const isUnidentifiedBinary = mimeType === MIME_TYPES.OCTET_STREAM;
+   return !isUnidentifiedBinary;
+ }
+
+ /**
+  * Flatten `fileMetadata` into ordered label/value pairs for display.
+  *
+  * Only fields that are present produce an item. Returns an empty array when no metadata
+  * is set. A new array is built on every access.
+  */
+ get metadataItems(): { label: string; value: string }[] {
+   const m = this.fileMetadata;
+   if (!m) return [];
+   const items: { label: string; value: string }[] = [];
+   const add = (label: string, value: string): void => {
+     items.push({ label, value });
+   };
+
+   // Images
+   if (m.imageWidth != null && m.imageHeight != null) {
+     add("Dimensions", `${m.imageWidth} × ${m.imageHeight} px`);
+     const gcd = (a: number, b: number): number => (b === 0 ? a : gcd(b, a % b));
+     const g = gcd(m.imageWidth, m.imageHeight);
+     // g is 0 only for a 0 × 0 image; dividing by it would render "NaN:NaN".
+     if (g > 0) add("Aspect ratio", `${m.imageWidth / g}:${m.imageHeight / g}`);
+   }
+
+   // Video / audio
+   if (m.videoDuration != null) add("Duration", formatDuration(m.videoDuration));
+   if (m.videoWidth != null && m.videoHeight != null) add("Resolution", `${m.videoWidth} × ${m.videoHeight}`);
+   if (m.audioDuration != null) add("Duration", formatDuration(m.audioDuration));
+
+   // Tabular
+   if (m.rowCount != null) add("Rows", m.rowCount.toLocaleString());
+   if (m.columnCount != null) add("Columns", m.columnCount.toLocaleString());
+   if (m.sheetCount != null) add("Sheets", m.sheetCount.toLocaleString());
+   if (m.columnNames?.length) {
+     const preview = m.columnNames.slice(0, 8).join(", ");
+     const more = m.columnNames.length > 8 ? ` +${m.columnNames.length - 8} more` : "";
+     add("Fields", preview + more);
+   }
+
+   // JSON overview
+   if (m.jsonTopLevelType != null) {
+     const countLabel = m.jsonTopLevelType === "array" ? "Items" : "Keys";
+     add("JSON", m.jsonTopLevelType);
+     if (m.jsonItemCount != null) add(countLabel, m.jsonItemCount.toLocaleString());
+     if (m.jsonPreviewKeys?.length) add("Preview", m.jsonPreviewKeys.join(", "));
+   }
+
+   // Text counts
+   if (m.lineCount != null) add("Lines", m.lineCount.toLocaleString());
+   if (m.wordCount != null) add("Words", m.wordCount.toLocaleString());
+   if (m.charCount != null) add("Characters", m.charCount.toLocaleString());
+   if (m.headingCount != null) add("Headings", m.headingCount.toLocaleString());
+
+   if (m.pageCount != null) add("Pages", `~${m.pageCount}`);
+
+   // ML / scientific
+   if (m.modelFormat) add("Format", m.modelFormat);
+   if (m.containerFormat) add("Container", m.containerFormat);
+   if (m.dtype) add("dtype", m.dtype);
+   if (m.shape?.length) add("Shape", `(${m.shape.join(", ")})`);
+   if (m.tensorCount != null) add("Tensors", m.tensorCount.toLocaleString());
+   if (m.parameterCount != null) add("Parameters", `~${m.parameterCount.toLocaleString()}`);
+   if (m.sampleTensorNames?.length) add("Tensors (first)", m.sampleTensorNames.join(", "));
+   if (m.ggufVersion != null) add("GGUF version", `v${m.ggufVersion}`);
+   if (m.metadataKvCount != null) add("Metadata KV", m.metadataKvCount.toLocaleString());
+
+   // JSON schema details
+   if (m.jsonMaxDepth != null) add("Max depth", m.jsonMaxDepth.toLocaleString());
+   if (m.jsonArrayElementType) add("Element type", m.jsonArrayElementType);
+   if (m.jsonKeyTypes?.length) {
+     add("Schema", m.jsonKeyTypes.map(kt => `${kt.key}: ${kt.type}`).join(", "));
+   }
+
+   // PDF /Info
+   if (m.pdfVersion) add("PDF version", m.pdfVersion);
+   if (m.pdfTitle) add("Title", m.pdfTitle);
+   if (m.pdfAuthor) add("Author", m.pdfAuthor);
+   if (m.pdfCreator) add("Creator", m.pdfCreator);
+   if (m.pdfProducer) add("Producer", m.pdfProducer);
+   if (m.pdfEncrypted) add("Encrypted", "Yes");
+
+   // Markdown structure (zero counts are omitted)
+   if (m.codeBlockCount) add("Code blocks", m.codeBlockCount.toLocaleString());
+   if (m.linkCount) add("Links", m.linkCount.toLocaleString());
+   if (m.imageCount) add("Images", m.imageCount.toLocaleString());
+   if (m.listItemCount) add("List items", m.listItemCount.toLocaleString());
+
+   // Plain text encoding/structure
+   if (m.encoding) add("Encoding", m.encoding);
+   if (m.emptyLineCount != null && m.emptyLineCount > 0) add("Blank lines", m.emptyLineCount.toLocaleString());
+   if (m.avgLineLength != null && m.avgLineLength > 0) add("Avg line", `${Math.round(m.avgLineLength)} chars`);
+   if (m.maxLineLength != null && m.maxLineLength > 0) add("Max line", `${m.maxLineLength.toLocaleString()} chars`);
+
+   // NumPy details
+   if (m.totalElements != null) add("Elements", m.totalElements.toLocaleString());
+   if (m.byteOrder) add("Byte order", m.byteOrder);
+   if (m.fortranOrder != null) add("Order", m.fortranOrder ? "Fortran (column)" : "C (row)");
+
+   // Safetensors details
+   if (m.dtypeBreakdown?.length) {
+     add("Dtypes", m.dtypeBreakdown.map(d => `${d.dtype}: ${d.params.toLocaleString()}`).join(", "));
+   }
+   if (m.largestTensor) {
+     const t = m.largestTensor;
+     add("Largest tensor", `${t.name} (${t.shape.join("×")}, ${t.params.toLocaleString()} params)`);
+   }
+   if (m.safetensorsMetadata?.length) {
+     for (const kv of m.safetensorsMetadata) {
+       add(kv.key, kv.value);
+     }
+   }
+
+   // GGUF details
+   if (m.ggufArchitecture) add("Architecture", m.ggufArchitecture);
+   if (m.ggufQuantization) add("Quantization", m.ggufQuantization);
+
+   // Bioinformatics
+   if (m.sequenceCount != null) {
+     add(m.sequenceCountIsExact ? "Sequences" : "Sequences (sampled)", m.sequenceCount.toLocaleString());
+   }
+   if (m.variantCount != null) {
+     add(m.variantCountIsExact ? "Variants" : "Variants (sampled)", m.variantCount.toLocaleString());
+   }
+   if (m.totalBases != null) add("Total bases", m.totalBases.toLocaleString());
+   if (m.gcContent != null) add("GC content", `${(m.gcContent * 100).toFixed(1)}%`);
+   if (m.isProtein) add("Sequence type", "Protein");
+   if (m.minSequenceLength != null && m.maxSequenceLength != null) {
+     const shortest = m.minSequenceLength.toLocaleString();
+     const longest = m.maxSequenceLength.toLocaleString();
+     const average = Math.round(m.avgSequenceLength ?? 0).toLocaleString();
+     add("Length range", `${shortest}–${longest} (avg ${average})`);
+   }
+   if (m.vcfSampleCount != null && m.vcfSampleCount > 0) add("Samples", m.vcfSampleCount.toLocaleString());
+   if (m.vcfChromosomes?.length) add("Chromosomes", m.vcfChromosomes.slice(0, 8).join(", "));
+
+   return items;
+ }
+
+ private renderByMimeType(blob: Blob, mimeType: string): void {
+ if (mimeType.startsWith("image/")) {
+ this.displayImage = true;
+ this.loadSafeURL(blob);
+ this.fileMetadata = { fileSize: blob.size };
+ const img = new Image();
+ img.onload = () => {
+ this.fileMetadata = { ...this.fileMetadata, imageWidth: img.naturalWidth, imageHeight: img.naturalHeight };
+ this.cdr.markForCheck();
+ };
+ img.src = this.fileURL!;
+ return;
+ }
+
+ if (mimeType.startsWith("video/")) {
+ this.displayMP4 = true;
+ this.loadSafeURL(blob);
+ this.fileMetadata = { fileSize: blob.size };
+ const video = document.createElement("video");
+ video.preload = "metadata";
+ video.onloadedmetadata = () => {
+ this.fileMetadata = {
+ ...this.fileMetadata,
+ videoDuration: video.duration,
+ videoWidth: video.videoWidth,
+ videoHeight: video.videoHeight,
+ };
+ this.cdr.markForCheck();
+ URL.revokeObjectURL(video.src);
+ };
+ video.src = URL.createObjectURL(blob);
+ return;
+ }
+
+ if (mimeType.startsWith("audio/")) {
+ this.displayMP3 = true;
+ this.loadSafeURL(blob);
+ this.fileMetadata = { fileSize: blob.size };
+ const audio = document.createElement("audio");
+ audio.preload = "metadata";
+ audio.onloadedmetadata = () => {
+ this.fileMetadata = { ...this.fileMetadata, audioDuration: audio.duration };
+ this.cdr.markForCheck();
+ URL.revokeObjectURL(audio.src);
+ };
+ audio.src = URL.createObjectURL(blob);
+ return;
+ }
+
+ switch (mimeType) {
+ case MIME_TYPES.PDF:
+ this.displayPDF = true;
+ this.loadSafeURL(blob);
+ this.fileMetadata = { fileSize: blob.size };
+ // Read first 200KB for /Info + version + page count; tail 50KB for trailer (where /Info often lives)
+ Promise.all([
+ this.readBlobText(blob.slice(0, 200 * 1024)),
+ this.readBlobText(blob.slice(Math.max(0, blob.size - 50 * 1024))),
+ ]).then(([head, tail]) => {
+ const combined = head + "\n" + tail;
+ const exact = (combined.match(/\/Type\s*\/Page\b/g) ?? []).length;
+ const fallback = Math.ceil((combined.match(/\/Page\b/g) ?? []).length / 2);
+ const pageCount = exact > 0 ? exact : fallback || undefined;
+ const info = extractPdfInfo(combined);
+ this.fileMetadata = {
+ ...this.fileMetadata,
+ pageCount,
+ pdfTitle: info.title,
+ pdfAuthor: info.author,
+ pdfCreator: info.creator,
+ pdfProducer: info.producer,
+ pdfVersion: info.version,
+ pdfEncrypted: info.encrypted,
+ };
+ this.cdr.markForCheck();
+ });
+ break;
+
+ case MIME_TYPES.MSEXCEL:
+ case MIME_TYPES.XLSX:
+ Promise.all([readXlsxFile(blob), readSheetNames(blob)]).then(([rows, sheetNames]) => {
+ const parsedData = rows.map(row => row.map(cell => (cell != null ? cell.toString() : "")));
+ if (parsedData.length > 0) {
+ this.loadTabularFile(parsedData);
+ this.displayXlsx = true;
+ const header = parsedData[0];
+ const dataRows = parsedData.slice(1).filter(r => r.some(c => c !== ""));
+ const schema = inferColumnSchema(dataRows, header.length);
+ this.fileMetadata = {
+ fileSize: blob.size,
+ rowCount: dataRows.length,
+ columnCount: header.length,
+ columnNames: header,
+ sheetCount: sheetNames.length,
+ columnTypes: schema.types,
+ nullCounts: schema.nullCounts,
+ sampleValues: schema.samples,
+ };
+ this.cdr.markForCheck();
+ }
+ });
+ break;
+
+ case MIME_TYPES.CSV: {
+ this.displayCSV = true;
+ const { slice: csvSlice, truncated: csvTruncated } = this.getPreviewSlice(blob);
+ this.previewTruncated = csvTruncated;
+ // Papa.parse needs a File-like; build it from the slice only — no need to keep the full blob.
+ const fileToParse = new File([csvSlice], this.filePath, { type: MIME_TYPES.CSV });
+ Papa.parse(fileToParse, {
+ complete: (results: ParseResult) => {
+ if (results.data.length > 0) {
+ this.loadTabularFile(results.data);
+ const header: string[] = results.data[0].map(String);
+ const dataRows = (results.data.slice(1) as any[][])
+ .filter(r => r.some((c: any) => c !== ""))
+ .map(r => r.map((c: any) => (c == null ? "" : String(c))));
+ const schema = inferColumnSchema(dataRows, header.length);
+ this.fileMetadata = {
+ fileSize: blob.size,
+ rowCount: dataRows.length,
+ columnCount: header.length,
+ columnNames: header,
+ columnTypes: schema.types,
+ nullCounts: schema.nullCounts,
+ sampleValues: schema.samples,
+ };
+ this.cdr.markForCheck();
}
},
+ error: () => this.onFileLoadingError(),
+ });
+ break;
+ }
+
+ case MIME_TYPES.MD: {
+ this.displayMarkdown = true;
+ const { slice: mdSlice, truncated: mdTruncated } = this.getPreviewSlice(blob);
+ this.previewTruncated = mdTruncated;
+ this.readBlobText(mdSlice).then(text => {
+ this.textContent = text;
+ const lines = text.split("\n");
+ // Strip fenced code blocks to count them; also count inline elements
+ const codeBlockCount = (text.match(/^```/gm) ?? []).length / 2;
+ const linkCount = (text.match(/\[[^\]]+\]\([^)]+\)/g) ?? []).length;
+ const imageCount = (text.match(/!\[[^\]]*\]\([^)]+\)/g) ?? []).length;
+ const listItemCount = lines.filter(l => /^\s*[-*+]\s/.test(l) || /^\s*\d+\.\s/.test(l)).length;
+ this.fileMetadata = {
+ fileSize: blob.size,
+ lineCount: lines.length,
+ wordCount: text.trim() ? text.trim().split(/\s+/).length : 0,
+ headingCount: lines.filter(l => /^#{1,6}\s/.test(l)).length,
+ codeBlockCount: Math.floor(codeBlockCount),
+ linkCount: linkCount - imageCount, // image syntax is link syntax + leading '!'
+ imageCount,
+ listItemCount,
+ };
+ this.cdr.markForCheck();
+ });
+ break;
+ }
+
+ case MIME_TYPES.JSON: {
+ this.displayJson = true;
+ const { slice: jsonSlice, truncated: jsonTruncated } = this.getPreviewSlice(blob);
+ this.previewTruncated = jsonTruncated;
+ this.readBlobText(jsonSlice).then(text => {
+ this.textContent = text;
+ try {
+ const parsed = JSON.parse(text);
+ const isArray = Array.isArray(parsed);
+ const keys = isArray ? null : Object.keys(parsed);
+ const maxDepth = jsonMaxDepth(parsed);
+ let jsonKeyTypes: { key: string; type: string }[] | undefined;
+ let jsonArrayElementType: string | undefined;
+ if (isArray && parsed.length > 0) {
+ const elementTypes = new Set(parsed.slice(0, 20).map(jsTypeLabel));
+ jsonArrayElementType = elementTypes.size === 1 ? [...elementTypes][0] : "mixed";
+ } else if (!isArray && keys) {
+ jsonKeyTypes = keys.slice(0, 8).map(k => ({
+ key: k,
+ type: jsTypeLabel((parsed as Record)[k]),
+ }));
+ }
+ this.fileMetadata = {
+ fileSize: blob.size,
+ jsonTopLevelType: isArray ? "array" : "object",
+ jsonItemCount: isArray ? parsed.length : keys!.length,
+ jsonPreviewKeys: isArray
+ ? parsed.slice(0, 5).map((_: unknown, i: number) => `[${i}]`)
+ : keys!.slice(0, 8),
+ jsonMaxDepth: maxDepth,
+ jsonKeyTypes,
+ jsonArrayElementType,
+ };
+ } catch {
+ // Truncated JSON or invalid — fall back to raw text view
+ this.fileMetadata = { fileSize: blob.size };
+ }
+ this.cdr.markForCheck();
+ });
+ break;
+ }
+
+ case MIME_TYPES.PARQUET:
+ this.detectedTypeMessage =
+ "Parquet file detected. Use the Parquet File Scan operator in Texera to analyze this data.";
+ this.fileMetadata = { fileSize: blob.size };
+ break;
+
+ case MIME_TYPES.ARROW:
+ this.detectedTypeMessage =
+ "Arrow/Feather file detected. Use the Arrow File Scan operator in Texera to analyze this data.";
+ this.fileMetadata = { fileSize: blob.size };
+ break;
+
+ case MIME_TYPES.DOCX:
+ this.detectedTypeMessage = "Word document (.docx) detected. Rich document preview is not yet supported.";
+ this.fileMetadata = { fileSize: blob.size };
+ break;
+
+ case MIME_TYPES.PPTX:
+ this.detectedTypeMessage = "PowerPoint (.pptx) detected. Presentation preview is not yet supported.";
+ this.fileMetadata = { fileSize: blob.size };
+ break;
+
+ // --- ML / scientific data formats ---
+
+ case MIME_TYPES.HDF5:
+ this.detectedTypeMessage =
+ "HDF5 binary container detected. Likely a model (Keras .h5) or scientific dataset. Load with h5py / rhdf5.";
+ this.fileMetadata = { fileSize: blob.size, containerFormat: "HDF5" };
+ break;
+
+ case MIME_TYPES.H5AD:
+ this.detectedTypeMessage =
+ "AnnData (.h5ad) detected — single-cell expression matrix in HDF5. Load with scanpy.read_h5ad() in Python.";
+ this.fileMetadata = { fileSize: blob.size, containerFormat: "HDF5" };
+ break;
+
+ case MIME_TYPES.H5SEURAT:
+ this.detectedTypeMessage =
+ "Seurat HDF5 object (.h5seurat) detected. Load with SeuratDisk::LoadH5Seurat() in R.";
+ this.fileMetadata = { fileSize: blob.size, containerFormat: "HDF5" };
+ break;
+
+ case MIME_TYPES.LOOM:
+ this.detectedTypeMessage =
+ "Loom (.loom) detected — single-cell expression in HDF5. Load with loompy / scanpy in Python.";
+ this.fileMetadata = { fileSize: blob.size, containerFormat: "HDF5" };
+ break;
+
+ case MIME_TYPES.RDS:
+ this.detectedTypeMessage =
+ "R serialized object (.rds) detected — commonly a Seurat / SingleCellExperiment / fitted model. Load with readRDS() in R.";
+ this.fileMetadata = { fileSize: blob.size, containerFormat: "gzip" };
+ break;
+
+ case MIME_TYPES.PICKLE:
+ this.detectedTypeMessage =
+ "Python pickle detected — typically a serialized model (sklearn / joblib) or dataset. Load with pickle.load() in Python.";
+ this.fileMetadata = { fileSize: blob.size };
+ break;
+
+ case MIME_TYPES.PYTORCH:
+ this.detectedTypeMessage =
+ "PyTorch checkpoint (.pt/.pth) detected. Load with torch.load() in Python.";
+ this.fileMetadata = { fileSize: blob.size, modelFormat: "PyTorch", containerFormat: "ZIP archive" };
+ break;
+
+ case MIME_TYPES.KERAS:
+ this.detectedTypeMessage =
+ "Keras v3 model (.keras) detected. Load with tf.keras.models.load_model() in Python.";
+ this.fileMetadata = { fileSize: blob.size, modelFormat: "Keras", containerFormat: "ZIP archive" };
+ break;
+
+ case MIME_TYPES.ONNX:
+ this.detectedTypeMessage =
+ "ONNX model (.onnx) detected — portable neural network. Load with onnxruntime or netron.app for inspection.";
+ this.fileMetadata = { fileSize: blob.size, modelFormat: "ONNX" };
+ break;
+
+ case MIME_TYPES.NPY:
+ this.parseNpyHeader(blob).then(info => {
+ const shapeStr = info?.shape ? info.shape.join(" × ") : "?";
+ const totalElements = info?.shape?.reduce((a, b) => a * b, 1);
+ this.detectedTypeMessage = `NumPy array (.npy) detected — ${info?.dtype ?? "?"} array of shape (${shapeStr}).`;
+ this.fileMetadata = {
+ fileSize: blob.size,
+ dtype: info?.dtype,
+ shape: info?.shape,
+ totalElements,
+ byteOrder: info?.byteOrder,
+ fortranOrder: info?.fortranOrder,
+ };
+ this.cdr.markForCheck();
+ });
+ break;
+
+ case MIME_TYPES.NPZ:
+ this.detectedTypeMessage =
+ "NumPy archive (.npz) detected — ZIP of .npy arrays. Load with numpy.load() and access via dict-like API.";
+ this.fileMetadata = { fileSize: blob.size, containerFormat: "ZIP archive" };
+ break;
+
+ case MIME_TYPES.SAFETENSORS:
+ this.parseSafetensorsHeader(blob).then(info => {
+ if (info) {
+ const paramStr = info.parameterCount.toLocaleString();
+ this.detectedTypeMessage = `Safetensors model detected — ${info.tensorCount} tensors, ~${paramStr} parameters.`;
+ this.fileMetadata = {
+ fileSize: blob.size,
+ modelFormat: "Safetensors",
+ tensorCount: info.tensorCount,
+ parameterCount: info.parameterCount,
+ sampleTensorNames: info.sampleNames,
+ dtypeBreakdown: info.dtypeBreakdown,
+ largestTensor: info.largestTensor,
+ safetensorsMetadata: info.metadata,
+ };
+ } else {
+ this.detectedTypeMessage = "Safetensors file detected. Load with safetensors.torch.load_file() in Python.";
+ this.fileMetadata = { fileSize: blob.size, modelFormat: "Safetensors" };
+ }
+ this.cdr.markForCheck();
+ });
+ break;
+
+ case MIME_TYPES.GGUF:
+ this.parseGgufHeader(blob).then(info => {
+ if (info) {
+ this.detectedTypeMessage = `GGUF model detected — v${info.version}, ${info.tensorCount} tensors, ${info.metadataKvCount} metadata entries.`;
+ this.fileMetadata = {
+ fileSize: blob.size,
+ modelFormat: "GGUF",
+ ggufVersion: info.version,
+ tensorCount: info.tensorCount,
+ metadataKvCount: info.metadataKvCount,
+ };
+ } else {
+ this.detectedTypeMessage = "GGUF model detected (llama.cpp / quantized LLM format).";
+ this.fileMetadata = { fileSize: blob.size, modelFormat: "GGUF" };
+ }
+ this.cdr.markForCheck();
+ });
+ break;
+
+ // --- Bioinformatics text formats — render as plain text plus record-count metadata ---
+
+ case MIME_TYPES.FASTA: {
+ this.displayPlainText = true;
+ const { slice: faSlice, truncated: faTruncated } = this.getPreviewSlice(blob);
+ this.previewTruncated = faTruncated;
+ this.readBlobText(faSlice).then(text => {
+ this.textContent = text;
+ const stats = summarizeFasta(text);
+ this.fileMetadata = {
+ fileSize: blob.size,
+ lineCount: text.split("\n").length,
+ sequenceCount: stats.sequenceCount,
+ sequenceCountIsExact: !faTruncated,
+ totalBases: stats.totalBases,
+ gcContent: stats.isProtein ? undefined : stats.gcContent,
+ minSequenceLength: stats.minLen,
+ maxSequenceLength: stats.maxLen,
+ avgSequenceLength: stats.avgLen,
+ isProtein: stats.isProtein,
+ };
+ this.cdr.markForCheck();
+ });
+ break;
+ }
+
+ case MIME_TYPES.FASTQ: {
+ this.displayPlainText = true;
+ const { slice: fqSlice, truncated: fqTruncated } = this.getPreviewSlice(blob);
+ this.previewTruncated = fqTruncated;
+ this.readBlobText(fqSlice).then(text => {
+ this.textContent = text;
+ const lineCount = text.split("\n").filter(l => l.length > 0).length;
+ this.fileMetadata = {
+ fileSize: blob.size,
+ lineCount: text.split("\n").length,
+ sequenceCount: Math.floor(lineCount / 4),
+ sequenceCountIsExact: !fqTruncated,
+ };
+ this.cdr.markForCheck();
+ });
+ break;
+ }
+
+ case MIME_TYPES.VCF: {
+ this.displayPlainText = true;
+ const { slice: vcfSlice, truncated: vcfTruncated } = this.getPreviewSlice(blob);
+ this.previewTruncated = vcfTruncated;
+ this.readBlobText(vcfSlice).then(text => {
+ this.textContent = text;
+ const lines = text.split("\n");
+ const variantLines = lines.filter(l => l.length > 0 && !l.startsWith("#"));
+ // Sample names are tab-separated columns after the 9 fixed VCF fields on the #CHROM header line
+ const chromHeader = lines.find(l => l.startsWith("#CHROM"));
+ const headerFields = chromHeader ? chromHeader.split("\t") : [];
+ const vcfSampleCount = headerFields.length > 9 ? headerFields.length - 9 : 0;
+ const chromSet = new Set();
+ for (const line of variantLines.slice(0, 5000)) {
+ const chr = line.split("\t", 1)[0];
+ if (chr) chromSet.add(chr);
+ if (chromSet.size >= 30) break;
+ }
+ this.fileMetadata = {
+ fileSize: blob.size,
+ lineCount: lines.length,
+ variantCount: variantLines.length,
+ variantCountIsExact: !vcfTruncated,
+ vcfSampleCount,
+ vcfChromosomes: [...chromSet].slice(0, 12),
+ };
+ this.cdr.markForCheck();
});
+ break;
+ }
+
+ case MIME_TYPES.OCTET_STREAM:
+ this.onFileTypePreviewUnsupported();
+ break;
+
+ default: {
+ this.displayPlainText = true;
+ const { slice: txtSlice, truncated: txtTruncated } = this.getPreviewSlice(blob);
+ this.previewTruncated = txtTruncated;
+ Promise.all([this.readBlobBytes(blob.slice(0, 3)), this.readBlobText(txtSlice)]).then(([head, text]) => {
+ this.textContent = text;
+ const lines = text.split("\n");
+ const lineLens = lines.map(l => l.length);
+ const totalLen = lineLens.reduce((a, b) => a + b, 0);
+ const emptyLineCount = lineLens.filter(n => n === 0).length;
+ const maxLineLength = lineLens.length > 0 ? Math.max(...lineLens) : 0;
+ // BOM detection: UTF-8 BOM is EF BB BF; otherwise assume ASCII/UTF-8
+ let encoding = "UTF-8";
+ if (head[0] === 0xef && head[1] === 0xbb && head[2] === 0xbf) encoding = "UTF-8 BOM";
+ else if (lines.every(l => /^[\x00-\x7F]*$/.test(l))) encoding = "ASCII";
+ this.fileMetadata = {
+ fileSize: blob.size,
+ lineCount: lines.length,
+ wordCount: text.trim() ? text.trim().split(/\s+/).length : 0,
+ charCount: text.length,
+ emptyLineCount,
+ avgLineLength: lines.length > 0 ? totalLen / lines.length : 0,
+ maxLineLength,
+ encoding,
+ };
+ this.cdr.markForCheck();
+ });
+ }
}
}
@@ -305,17 +1646,26 @@ export class UserDatasetFileRendererComponent implements OnInit, OnChanges, OnDe
this.displayJson = false;
this.displayMP4 = false;
this.displayMP3 = false;
+ this.displayPDF = false;
+ this.detectedTypeMessage = "";
+ this.fileMetadata = undefined;
this.isLoading = false;
this.isFileLoadingError = false;
this.isFileSizeUnloadable = false;
this.isFileTypePreviewUnsupported = false;
- // garbage collection
if (this.fileURL) {
URL.revokeObjectURL(this.fileURL);
}
- if (this.safeFileURL) {
- URL.revokeObjectURL(this.safeFileURL.toString());
- }
+ this.fileURL = undefined;
+ this.safeFileURL = undefined;
+ this.safeResourceFileURL = undefined;
+ // Clear cached content so memory is reclaimed when switching files; without these,
+ // a previously-loaded 10 MB text or 100K-row table would persist on the component.
+ this.textContent = "";
+ this.tableContent = [];
+ this.tableDataHeader = [];
+ this.currentFile = undefined;
+ this.previewTruncated = false;
}
onFileLoadingError() {
@@ -333,49 +1683,37 @@ export class UserDatasetFileRendererComponent implements OnInit, OnChanges, OnDe
this.isFileTypePreviewUnsupported = true;
}
- isPreviewSupported(mimeType: string) {
- return mimeType !== MIME_TYPES.OCTET_STREAM && Object.hasOwnProperty.call(MIME_TYPE_SIZE_LIMITS_MB, mimeType);
- }
-
- private readFileAsText(blob: Blob) {
- const txtReader = new FileReader();
- txtReader.onload = (event: any) => {
- this.textContent = event.target.result;
- };
- txtReader.readAsText(blob);
+ /**
+  * Sets the informational message shown when a file's preview is skipped because of its size.
+  *
+  * The message starts from the entry in TYPE_LOADING_HINTS keyed by `extensionMime`, if one
+  * exists, and appends the file size; without an entry a generic "preview skipped" message is
+  * used. When `this.fileSize` is null/undefined the size is rendered as "very large".
+  * NOTE(review): this method only writes the message and requests change detection; skipping
+  * the actual download is presumably the caller's job — confirm at the call site.
+  *
+  * @param extensionMime MIME type key used to look up the loading hint.
+  */
+ private showOversizedFileInfo(extensionMime: string): void {
+   const typeHint = TYPE_LOADING_HINTS[extensionMime];
+   const readableSize = this.fileSize == null ? "very large" : formatSize(this.fileSize);
+   if (typeHint) {
+     this.detectedTypeMessage = `${typeHint} (Preview skipped — file is ${readableSize}.)`;
+   } else {
+     this.detectedTypeMessage = `File is ${readableSize} — full preview skipped to avoid browser lag. Open in a workflow operator to analyze.`;
+   }
+   // Ask Angular to re-check this component so the new message is rendered.
+   this.cdr.markForCheck();
}
- private loadSafeURL(blob: Blob) {
+ /**
+ * Creates an object URL for `blob` and stores it on the component in three forms:
+ * - `fileURL`: the raw string returned by URL.createObjectURL (this is the value the reset
+ *   logic passes to URL.revokeObjectURL);
+ * - `safeFileURL`: the same URL wrapped with `bypassSecurityTrustUrl`;
+ * - `safeResourceFileURL`: the same URL wrapped with `bypassSecurityTrustResourceUrl`
+ *   (presumably for resource-URL bindings such as an iframe/embed preview — confirm against
+ *   the template).
+ * NOTE(review): calling this again without resetting first would overwrite `fileURL` without
+ * revoking the previous object URL — confirm callers always reset before loading a new file.
+ */
+ private loadSafeURL(blob: Blob): void {
this.fileURL = URL.createObjectURL(blob);
this.safeFileURL = this.sanitizer.bypassSecurityTrustUrl(this.fileURL);
+ this.safeResourceFileURL = this.sanitizer.bypassSecurityTrustResourceUrl(this.fileURL);
}
- private loadTabularFile(data: any[][]) {
+
+ /**
+ * Populates the table preview from parsed tabular data.
+ * The first row becomes `tableDataHeader`; the remaining rows become `tableContent` after
+ * short rows are padded to the header length and fully-empty rows are dropped.
+ * Does nothing when `data` is empty. Rows longer than the header are left as they are.
+ * Note: padding uses `push`, so the row arrays inside `data` are mutated in place.
+ */
+ private loadTabularFile(data: any[][]): void {
if (data.length > 0) {
- // Extract the header (first row)
this.tableDataHeader = data[0];
-
- // Process the rest of the rows
this.tableContent = data
.slice(1)
.map(row => {
- // Normalize the row length to match the header length
- while (row.length < this.tableDataHeader.length) {
- row.push("");
- }
+ // Pad short rows with "" so every row has at least as many cells as the header.
+ while (row.length < this.tableDataHeader.length) row.push("");
return row;
})
- .filter(row => {
- // filter out all empty row
- let areCellAllEmpty = true;
- for (const cell in row) {
- if (cell != "") {
- areCellAllEmpty = false;
- break;
- }
- }
- return !areCellAllEmpty;
- });
+ // Keep only rows with at least one cell that is not exactly the empty string.
+ .filter(row => row.some(cell => cell !== ""));
}
}
}
diff --git a/frontend/src/app/workspace/component/workspace.component.ts b/frontend/src/app/workspace/component/workspace.component.ts
index 9968c26f647..e96a53959ea 100644
--- a/frontend/src/app/workspace/component/workspace.component.ts
+++ b/frontend/src/app/workspace/component/workspace.component.ts
@@ -36,6 +36,7 @@ import { Workflow } from "../../common/type/workflow";
import { OperatorMetadataService } from "../service/operator-metadata/operator-metadata.service";
import { UndoRedoService } from "../service/undo-redo/undo-redo.service";
import { WorkflowActionService } from "../service/workflow-graph/model/workflow-action.service";
+import { WorkflowUtilService } from "../service/workflow-graph/util/workflow-util.service";
import { NzMessageService } from "ng-zorro-antd/message";
import { debounceTime, distinctUntilChanged, filter, switchMap, throttleTime } from "rxjs/operators";
import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy";
@@ -117,6 +118,7 @@ export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy {
private undoRedoService: UndoRedoService,
private workflowPersistService: WorkflowPersistService,
private workflowActionService: WorkflowActionService,
+ private workflowUtilService: WorkflowUtilService,
private location: Location,
private route: ActivatedRoute,
private operatorMetadataService: OperatorMetadataService,
@@ -186,6 +188,38 @@ export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy {
this.workflowActionService.clearWorkflow();
}
+ /**
+  * Reads the `addOp` + `fileName` query params (set by the dataset file renderer's "Open in
+  * workflow" button) and adds the corresponding scan operator with its `fileName` property
+  * prefilled. Returns without doing anything unless both params are present and non-empty.
+  *
+  * Once an addition has been attempted — whether it succeeded or failed — only `addOp` and
+  * `fileName` are removed from the URL, so a refresh doesn't add the operator again. Every
+  * other query param and the fragment are preserved, and the current history entry is
+  * replaced so navigating Back cannot bring the one-shot params back.
+  *
+  * Failures while creating or adding the operator are not thrown; they are reported to the
+  * user through the notification service.
+  *
+  * Intended to run once the workflow has finished loading, so that operator metadata and the
+  * workflow graph are ready for modification.
+  */
+ handlePendingOperatorAddition(): void {
+   const params = this.route.snapshot.queryParams;
+   const addOp: string | undefined = params.addOp;
+   const fileName: string | undefined = params.fileName;
+   if (!addOp || !fileName) return;
+   try {
+     const operator = this.workflowUtilService.getNewOperatorPredicate(addOp);
+     // Place near the upper-left of the visible viewport (offset by the paper's translation).
+     const origin = this.workflowActionService.getJointGraphWrapper().getMainJointPaper()?.translate();
+     const point = { x: 400 - (origin?.tx ?? 0), y: 200 - (origin?.ty ?? 0) };
+     this.workflowActionService.addOperator(operator, point);
+     // Set the file path through the action service's mutation API instead of assigning to
+     // the predicate directly (operatorProperties is readonly).
+     this.workflowActionService.setOperatorProperty(operator.operatorID, {
+       ...operator.operatorProperties,
+       fileName,
+     });
+   } catch {
+     this.notificationService.error(`Could not pre-add operator "${addOp}" — drag it in manually.`);
+   } finally {
+     // Remove only the two params consumed here: with "merge" handling, null-valued entries
+     // are dropped while unrelated query params on the route are kept. Passing `{}` without
+     // a handling strategy would wipe every query param.
+     this.router.navigate([], {
+       relativeTo: this.route,
+       queryParams: { addOp: null, fileName: null },
+       queryParamsHandling: "merge",
+       preserveFragment: true,
+       replaceUrl: true,
+     });
+   }
+ }
+
registerAutoPersistWorkflow(): void {
// make sure it is only registered once
if (this.autoPersistRegistered) {
@@ -260,6 +294,9 @@ export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy {
this.setLoadingState(false);
this.registerAutoPersistWorkflow();
this.triggerCenter();
+ // If the user arrived via "Open in workflow" from the dataset file renderer,
+ // honor the addOp + fileName query params now that the workflow is fully loaded.
+ this.handlePendingOperatorAddition();
},
() => {
this.workflowActionService.resetAsNewWorkflow();
diff --git a/frontend/yarn.lock b/frontend/yarn.lock
index 6a4ae4330c4..2d2851d7885 100644
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@@ -2059,6 +2059,13 @@ __metadata:
languageName: node
linkType: hard
+"@borewit/text-codec@npm:^0.2.1":
+ version: 0.2.2
+ resolution: "@borewit/text-codec@npm:0.2.2"
+ checksum: 10c0/2d3fb132bc6a132914a8fbf8e9ff2fa1ead210ecc395b28bb7355bd7719548a5e351ffe39f21c3bee8048f6cabd99eabd404bb5cc809cad9cba25abed19d271f
+ languageName: node
+ linkType: hard
+
"@bufbuild/protobuf@npm:^2.0.0, @bufbuild/protobuf@npm:^2.5.0":
version: 2.12.0
resolution: "@bufbuild/protobuf@npm:2.12.0"
@@ -5869,6 +5876,23 @@ __metadata:
languageName: node
linkType: hard
+"@tokenizer/inflate@npm:^0.4.1":
+ version: 0.4.1
+ resolution: "@tokenizer/inflate@npm:0.4.1"
+ dependencies:
+ debug: "npm:^4.4.3"
+ token-types: "npm:^6.1.1"
+ checksum: 10c0/9817516efe21d1ce3bdfb80a1f94efc8981064ce3873448ba79f4d81d96c0694c484c289bd042d346ae5536cf77f5aa9a367d39c3df700eb610761b7c306b4de
+ languageName: node
+ linkType: hard
+
+"@tokenizer/token@npm:^0.3.0":
+ version: 0.3.0
+ resolution: "@tokenizer/token@npm:0.3.0"
+ checksum: 10c0/7ab9a822d4b5ff3f5bca7f7d14d46bdd8432528e028db4a52be7fbf90c7f495cc1af1324691dda2813c6af8dc4b8eb29de3107d4508165f9aa5b53e7d501f155
+ languageName: node
+ linkType: hard
+
"@tsconfig/node10@npm:^1.0.7":
version: 1.0.12
resolution: "@tsconfig/node10@npm:1.0.12"
@@ -10419,6 +10443,18 @@ __metadata:
languageName: node
linkType: hard
+"file-type@npm:^22.0.1":
+ version: 22.0.1
+ resolution: "file-type@npm:22.0.1"
+ dependencies:
+ "@tokenizer/inflate": "npm:^0.4.1"
+ strtok3: "npm:^10.3.5"
+ token-types: "npm:^6.1.2"
+ uint8array-extras: "npm:^1.5.0"
+ checksum: 10c0/45b70a10196d46965eadd7835ec408c1c07b4fd2ed395e9bbcc0ad63d93f7bf6d076d0e970673b754577002019c8858825bc71ccc07ca7c0e49ac0c2b7e1839f
+ languageName: node
+ linkType: hard
+
"fill-range@npm:^7.1.1":
version: 7.1.1
resolution: "fill-range@npm:7.1.1"
@@ -11065,6 +11101,7 @@ __metadata:
eslint-plugin-rxjs: "npm:5.0.3"
eslint-plugin-rxjs-angular: "npm:2.0.1"
file-saver: "npm:2.0.5"
+ file-type: "npm:^22.0.1"
fs-extra: "npm:10.0.1"
fuse.js: "npm:6.5.3"
git-describe: "npm:4.1.0"
@@ -11460,7 +11497,7 @@ __metadata:
languageName: node
linkType: hard
-"ieee754@npm:1.2.1, ieee754@npm:^1.1.13":
+"ieee754@npm:1.2.1, ieee754@npm:^1.1.13, ieee754@npm:^1.2.1":
version: 1.2.1
resolution: "ieee754@npm:1.2.1"
checksum: 10c0/b0782ef5e0935b9f12883a2e2aa37baa75da6e66ce6515c168697b42160807d9330de9a32ec1ed73149aea02e0d822e572bca6f1e22bdcbd2149e13b050b17bb
@@ -17140,6 +17177,15 @@ __metadata:
languageName: node
linkType: hard
+"strtok3@npm:^10.3.5":
+ version: 10.3.5
+ resolution: "strtok3@npm:10.3.5"
+ dependencies:
+ "@tokenizer/token": "npm:^0.3.0"
+ checksum: 10c0/8d2477b239054c9f1f5b14a65d531147ca158ab9887fdc2d0938e77b7ec8891fb683b58254c7643afd5d98a421a59207534d491762b111f58c795071ecbe9fd1
+ languageName: node
+ linkType: hard
+
"style-loader@npm:^3.3.0":
version: 3.3.4
resolution: "style-loader@npm:3.3.4"
@@ -17450,6 +17496,17 @@ __metadata:
languageName: node
linkType: hard
+"token-types@npm:^6.1.1, token-types@npm:^6.1.2":
+ version: 6.1.2
+ resolution: "token-types@npm:6.1.2"
+ dependencies:
+ "@borewit/text-codec": "npm:^0.2.1"
+ "@tokenizer/token": "npm:^0.3.0"
+ ieee754: "npm:^1.2.1"
+ checksum: 10c0/8786e28e3cb65b9e890bc3c38def98e6dfe4565538237f8c0e47dbe549ed8f5f00de8dc464717868308abb4729f1958f78f69e1c4c3deebbb685729113a6fee8
+ languageName: node
+ linkType: hard
+
"totalist@npm:^1.0.0":
version: 1.1.0
resolution: "totalist@npm:1.1.0"
@@ -17798,6 +17855,13 @@ __metadata:
languageName: node
linkType: hard
+"uint8array-extras@npm:^1.5.0":
+ version: 1.5.0
+ resolution: "uint8array-extras@npm:1.5.0"
+ checksum: 10c0/0e74641ac7dadb02eadefc1ccdadba6010e007757bda824960de3c72bbe2b04e6d3af75648441f412148c4103261d54fcb60be45a2863beb76643a55fddba3bd
+ languageName: node
+ linkType: hard
+
"underscore@npm:>=1.8.3":
version: 1.13.8
resolution: "underscore@npm:1.13.8"