From 761add475ca98d7eb8d4d622b3afcc7ae1f0a83a Mon Sep 17 00:00:00 2001 From: Tanishq Gandhi Date: Fri, 15 May 2026 21:57:55 -0700 Subject: [PATCH] feat: add smart source and visual trace --- agent-service/src/agent/prompts.test.ts | 51 ++ agent-service/src/agent/prompts.ts | 2 + agent-service/src/types/agent.test.ts | 27 + agent-service/src/types/agent.ts | 1 + .../texera/web/TexeraWebApplication.scala | 1 + .../resource/SmartFileInferenceResource.scala | 114 +++++ .../web/service/ExecutionResultService.scala | 47 +- .../service/ExecutionResultServiceSpec.scala | 29 ++ build.sbt | 13 + .../texera/amber/util/ImageFormatUtils.scala | 74 +++ common/workflow-operator/build.sbt | 46 ++ .../texera/amber/operator/LogicalOp.scala | 4 + .../operator/fileSplit/FileSplitOpDesc.scala | 106 ++++ .../operator/fileSplit/FileSplitOpExec.scala | 58 +++ .../source/scan/FolderInputResolver.scala | 127 +++++ .../scan/file/FileScanSourceOpDesc.scala | 18 +- .../scan/file/FileScanSourceOpExec.scala | 27 +- .../source/scan/file/FileScanUtils.scala | 14 +- .../source/scan/smart/CSVDialectSniffer.scala | 144 ++++++ .../source/scan/smart/FormatDetector.scala | 125 +++++ .../source/scan/smart/ParquetUtils.scala | 208 ++++++++ .../source/scan/smart/SmartFileFormat.java | 66 +++ .../scan/smart/SmartFileInferencer.scala | 476 ++++++++++++++++++ .../scan/smart/SmartFileSourceOpDesc.scala | 137 +++++ .../scan/smart/SmartFileSourceOpExec.scala | 345 +++++++++++++ .../fileSplit/FileSplitOpDescSpec.scala | 53 ++ .../fileSplit/FileSplitOpExecSpec.scala | 69 +++ .../scan/file/FileScanSourceOpDescSpec.scala | 59 +++ .../scan/smart/CSVDialectSnifferSpec.scala | 60 +++ .../scan/smart/FormatDetectorSpec.scala | 86 ++++ .../smart/SmartFileSourceOpDescSpec.scala | 307 +++++++++++ .../smart/SmartFileSourceOpExecSpec.scala | 115 +++++ ...user-dataset-version-filetree.component.ts | 9 +- .../dataset-file-selector.component.ts | 1 + .../dataset-selection-modal.component.html | 1 + 
.../dataset-selection-modal.component.ts | 3 +- ...perator-property-edit-frame.component.html | 24 + ...perator-property-edit-frame.component.scss | 14 + .../operator-property-edit-frame.component.ts | 136 ++++- .../result-table-cell.utils.spec.ts | 34 ++ .../result-table-cell.utils.ts | 22 + .../result-table-frame.component.html | 9 +- .../result-table-frame.component.scss | 8 + .../result-table-frame.component.ts | 9 + .../visual-trace-panel.component.html | 136 +++++ .../visual-trace-panel.component.scss | 346 +++++++++++++ .../visual-trace-panel.component.spec.ts | 98 ++++ .../visual-trace-panel.component.ts | 85 ++++ ...visualization-frame-content.component.html | 2 + .../visualization-frame-content.component.ts | 87 +++- .../component/workspace.component.html | 1 + .../component/workspace.component.ts | 2 + .../smart-file-inference.service.ts | 73 +++ .../visual-trace/visual-trace.service.ts | 39 ++ .../visual-trace/visual-trace.utils.spec.ts | 224 +++++++++ .../visual-trace/visual-trace.utils.ts | 293 +++++++++++ .../workspace/types/visual-trace.interface.ts | 52 ++ .../src/assets/operator_images/FileSplit.png | Bin 0 -> 1657 bytes .../assets/operator_images/SmartFileScan.png | Bin 0 -> 6977 bytes 59 files changed, 4664 insertions(+), 53 deletions(-) create mode 100644 agent-service/src/agent/prompts.test.ts create mode 100644 agent-service/src/types/agent.test.ts create mode 100644 amber/src/main/scala/org/apache/texera/web/resource/SmartFileInferenceResource.scala create mode 100644 common/workflow-core/src/main/scala/org/apache/texera/amber/util/ImageFormatUtils.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDesc.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExec.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolver.scala create mode 
100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSniffer.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetector.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/ParquetUtils.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileFormat.java create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileInferencer.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDesc.scala create mode 100644 common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDescSpec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExecSpec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSnifferSpec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetectorSpec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDescSpec.scala create mode 100644 common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExecSpec.scala create mode 100644 frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.spec.ts create mode 100644 frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.ts create mode 100644 
frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.html create mode 100644 frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.scss create mode 100644 frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.spec.ts create mode 100644 frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.ts create mode 100644 frontend/src/app/workspace/service/smart-file-inference/smart-file-inference.service.ts create mode 100644 frontend/src/app/workspace/service/visual-trace/visual-trace.service.ts create mode 100644 frontend/src/app/workspace/service/visual-trace/visual-trace.utils.spec.ts create mode 100644 frontend/src/app/workspace/service/visual-trace/visual-trace.utils.ts create mode 100644 frontend/src/app/workspace/types/visual-trace.interface.ts create mode 100644 frontend/src/assets/operator_images/FileSplit.png create mode 100644 frontend/src/assets/operator_images/SmartFileScan.png diff --git a/agent-service/src/agent/prompts.test.ts b/agent-service/src/agent/prompts.test.ts new file mode 100644 index 00000000000..b036c76eaf0 --- /dev/null +++ b/agent-service/src/agent/prompts.test.ts @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { describe, expect, test } from "bun:test"; +import { buildSystemPrompt } from "./prompts"; +import { WorkflowSystemMetadata } from "./util/workflow-system-metadata"; + +describe("buildSystemPrompt", () => { + test("includes both operator type and display name", () => { + const metadata = new WorkflowSystemMetadata(); + metadata.loadFromMetadata({ + operators: [ + { + operatorType: "SmartFileScan", + operatorVersion: "1", + jsonSchema: { properties: { fileName: { type: "string" } }, required: ["fileName"] }, + additionalMetadata: { + userFriendlyName: "Smart Source", + operatorGroupName: "Data Input", + operatorDescription: "Auto-detects files and folders.", + inputPorts: [], + outputPorts: [{}], + }, + }, + ], + groups: [], + }); + + const prompt = buildSystemPrompt(metadata, ["SmartFileScan"]); + + expect(prompt).toContain("## SmartFileScan"); + expect(prompt).toContain("Display name: Smart Source"); + expect(prompt).toContain("Description: Auto-detects files and folders."); + }); +}); diff --git a/agent-service/src/agent/prompts.ts b/agent-service/src/agent/prompts.ts index 064eed2e3e5..ca3b542c463 100644 --- a/agent-service/src/agent/prompts.ts +++ b/agent-service/src/agent/prompts.ts @@ -268,10 +268,12 @@ function buildAllowedOperatorSchemas( for (const operatorType of operatorTypes) { const compactSchema = metadataStore.getCompactSchema(operatorType); const description = metadataStore.getDescription(operatorType); + const displayName = metadataStore.getAdditionalMetadata(operatorType)?.userFriendlyName; if (compactSchema) { schemas.push( `## ${operatorType}\n` + + (displayName ? `Display name: ${displayName}\n` : "") + (description ? 
`Description: ${description}\n` : "") + `Schema:\n\`\`\`json\n${JSON.stringify(compactSchema, null, 2)}\n\`\`\`` ); diff --git a/agent-service/src/types/agent.test.ts b/agent-service/src/types/agent.test.ts new file mode 100644 index 00000000000..abc4e73acf2 --- /dev/null +++ b/agent-service/src/types/agent.test.ts @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { DEFAULT_AGENT_SETTINGS } from "./agent"; + +describe("DEFAULT_AGENT_SETTINGS", () => { + test("allows the smart source operator by default", () => { + expect(DEFAULT_AGENT_SETTINGS.allowedOperatorTypes).toContain("SmartFileScan"); + }); +}); diff --git a/agent-service/src/types/agent.ts b/agent-service/src/types/agent.ts index 765f5a7cb46..74cb6230c16 100644 --- a/agent-service/src/types/agent.ts +++ b/agent-service/src/types/agent.ts @@ -87,6 +87,7 @@ export const DEFAULT_AGENT_SETTINGS: Omit = { executionTimeoutMs: 240000, maxSteps: 100, allowedOperatorTypes: [ + "SmartFileScan", "CSVFileScan", "Filter", "Projection", diff --git a/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala b/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala index 98b7c68c974..2390e38ea22 100644 --- a/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala +++ b/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala @@ -130,6 +130,7 @@ class TexeraWebApplication environment.servlets.setSessionHandler(new SessionHandler) environment.jersey.register(classOf[SystemMetadataResource]) + environment.jersey.register(classOf[SmartFileInferenceResource]) // environment.jersey().register(classOf[MockKillWorkerResource]) environment.jersey.register(classOf[HealthCheckResource])
diff --git a/amber/src/main/scala/org/apache/texera/web/resource/SmartFileInferenceResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/SmartFileInferenceResource.scala
new file mode 100644
index 00000000000..27d9706462c
--- /dev/null
+++ b/amber/src/main/scala/org/apache/texera/web/resource/SmartFileInferenceResource.scala
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.web.resource
+
+import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty}
+import org.apache.texera.amber.core.storage.FileResolver
+import org.apache.texera.amber.operator.source.scan.FileDecodingMethod
+import org.apache.texera.amber.operator.source.scan.smart.{
+  InferenceOverrides,
+  SmartFileFormat,
+  SmartFileInferencer
+}
+
+import javax.annotation.security.RolesAllowed
+import javax.ws.rs.core.MediaType
+import javax.ws.rs.{Consumes, POST, Path, Produces}
+import scala.jdk.CollectionConverters._
+
+/**
+  * Request body of `POST /file-inference/preview`. Every field except `fileName`
+  * is an optional override; unknown JSON properties are ignored.
+  */
+@JsonIgnoreProperties(ignoreUnknown = true)
+case class SmartFileInferenceRequest(
+    @JsonProperty("fileName") fileName: String,
+    @JsonProperty("fileEncoding") fileEncoding: Option[String] = None,
+    @JsonProperty("formatOverride") formatOverride: Option[String] = None,
+    @JsonProperty("customDelimiter") customDelimiter: Option[String] = None,
+    @JsonProperty("hasHeader") hasHeader: Option[Boolean] = None,
+    @JsonProperty("sheetName") sheetName: Option[String] = None,
+    @JsonProperty("flatten") flatten: Option[Boolean] = None
+)
+
+/** One inferred column: its name and the string form of its attribute type. */
+case class SmartFileInferenceColumn(name: String, `type`: String)
+
+/**
+  * Inference result returned to the client. Fields typed as `String` or boxed
+  * `java.lang.Boolean` are `null` when the inferencer reported no value for them.
+  */
+case class SmartFileInferenceResponse(
+    detectedFormat: String,
+    schema: java.util.List[SmartFileInferenceColumn],
+    customDelimiter: String,
+    hasHeader: java.lang.Boolean,
+    sheetName: String,
+    availableSheetNames: java.util.List[String],
+    flatten: java.lang.Boolean,
+    isFolder: Boolean,
+    fileCount: Int
+)
+
+@Path("/file-inference")
+@RolesAllowed(Array("REGULAR", "ADMIN"))
+@Consumes(Array(MediaType.APPLICATION_JSON))
+@Produces(Array(MediaType.APPLICATION_JSON))
+class SmartFileInferenceResource {
+
+  /**
+    * Infers format, schema and parsing options for the requested file.
+    *
+    * An encoding that cannot be parsed falls back to UTF-8; a format override
+    * that cannot be parsed is dropped.
+    */
+  @POST
+  @Path("/preview")
+  def preview(request: SmartFileInferenceRequest): SmartFileInferenceResponse = {
+    val uri = FileResolver.resolve(request.fileName)
+    val charset = request.fileEncoding
+      .flatMap(name => tryParseEncoding(name))
+      .getOrElse(FileDecodingMethod.UTF_8.getCharset)
+
+    val overrides = InferenceOverrides(
+      format = request.formatOverride.flatMap(s => tryParseFormat(s)),
+      // Only the first character of the delimiter string is used.
+      delimiter = request.customDelimiter.flatMap(_.headOption),
+      hasHeader = request.hasHeader,
+      sheetName = request.sheetName,
+      flatten = request.flatten
+    )
+
+    val result = SmartFileInferencer.infer(uri, charset, overrides)
+    val columns = result.schema.getAttributes
+      .map(a => SmartFileInferenceColumn(a.getName, a.getType.toString))
+      .asJava
+
+    SmartFileInferenceResponse(
+      detectedFormat = result.format.getLabel,
+      schema = columns,
+      customDelimiter = result.csvDelimiter.orNull,
+      hasHeader = result.csvHasHeader.map(java.lang.Boolean.valueOf).orNull,
+      sheetName = result.sheetName.orNull,
+      availableSheetNames = result.availableSheetNames.asJava,
+      flatten = result.flatten.map(java.lang.Boolean.valueOf).orNull,
+      isFolder = result.isFolder,
+      fileCount = result.fileCount
+    )
+  }
+
+  /**
+    * Parses a format override, accepting both the enum name (CSV, TSV, ...) and the
+    * user-facing label ("Plain text", ...), ignoring case and surrounding spaces.
+    *
+    * @return the matching format, or `None` when `value` matches neither
+    */
+  private def tryParseFormat(value: String): Option[SmartFileFormat] = {
+    val trimmed = value.trim
+    // Locale.ROOT keeps the upper-casing independent of the JVM default locale
+    // (a Turkish locale maps 'i' to a dotted capital that matches no enum name).
+    try Some(SmartFileFormat.valueOf(trimmed.toUpperCase(java.util.Locale.ROOT)))
+    catch {
+      case _: IllegalArgumentException =>
+        SmartFileFormat.values().find(_.getLabel.equalsIgnoreCase(trimmed))
+    }
+  }
+
+  /**
+    * Parses an encoding name into a charset. The lookup is by FileDecodingMethod
+    * constant name; since UTF_8 is spelled with an underscore, the conventional
+    * hyphenated form ("utf-8") is normalized before the lookup.
+    *
+    * @return the charset, or `None` when no constant has that name
+    */
+  private def tryParseEncoding(value: String): Option[java.nio.charset.Charset] = {
+    val enumName = value.trim.toUpperCase(java.util.Locale.ROOT).replace('-', '_')
+    try Some(FileDecodingMethod.valueOf(enumName).getCharset)
+    catch { case _: IllegalArgumentException => None }
+  }
+}
diff --git a/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala b/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala index b335ed0c3c7..5ff51f7f7e2 100644 --- a/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala +++ b/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala @@ -46,6 +46,7 @@ import org.apache.texera.amber.engine.architecture.rpc.controlreturns.WorkflowAg import org.apache.texera.amber.engine.common.AmberRuntime import org.apache.texera.amber.engine.common.client.AmberClient import org.apache.texera.amber.engine.common.executionruntimestate.ExecutionMetadataStore +import org.apache.texera.amber.util.ImageFormatUtils import org.apache.texera.web.SubscriptionManager import org.apache.texera.web.model.websocket.event.{ PaginatedResultEvent, @@ -59,6 +60,7 @@ import org.apache.texera.web.service.WorkflowExecutionService.getLatestExecution import org.apache.texera.web.storage.{ExecutionStateStore, WorkflowStateStore} import java.lang.Byte.{SIZE => BitsPerByte} +import java.util.Base64 import java.util.UUID import scala.collection.mutable import scala.concurrent.duration.DurationInt @@ -76,6 +78,11 @@ object ExecutionResultService { ) .mkString("") + private def bytesToImageDataUrl(bytes: Array[Byte]): Option[String] = + ImageFormatUtils + .detectMimeType(bytes) + .map(mimeType => s"data:$mimeType;base64,${Base64.getEncoder.encodeToString(bytes)}") + /** * Converts a collection of Tuples to a list of JSON ObjectNodes.
* @@ -107,25 +114,27 @@ object ExecutionResultService { case AttributeType.BINARY => value match { case byteArray: Array[Byte] => - val totalSize = byteArray.length - val sizeFormatted = f"$totalSize%,d" - val totalBits = totalSize * BitsPerByte - val preview = - if (totalBits <= binaryPreviewLeadingBits + binaryPreviewTrailingBits) - bytesToBinaryString(byteArray) - else { - val leadingBytesNeeded = - math.ceil(binaryPreviewLeadingBits.toDouble / BitsPerByte).toInt - val trailingBytesNeeded = - math.ceil(binaryPreviewTrailingBits.toDouble / BitsPerByte).toInt - val leading = bytesToBinaryString(byteArray.take(leadingBytesNeeded)) - .take(binaryPreviewLeadingBits) - val trailing = bytesToBinaryString( - byteArray.takeRight(trailingBytesNeeded) - ).takeRight(binaryPreviewTrailingBits) - s"$leading...$trailing" - } - s"" + bytesToImageDataUrl(byteArray).getOrElse { + val totalSize = byteArray.length + val sizeFormatted = f"$totalSize%,d" + val totalBits = totalSize * BitsPerByte + val preview = + if (totalBits <= binaryPreviewLeadingBits + binaryPreviewTrailingBits) + bytesToBinaryString(byteArray) + else { + val leadingBytesNeeded = + math.ceil(binaryPreviewLeadingBits.toDouble / BitsPerByte).toInt + val trailingBytesNeeded = + math.ceil(binaryPreviewTrailingBits.toDouble / BitsPerByte).toInt + val leading = bytesToBinaryString(byteArray.take(leadingBytesNeeded)) + .take(binaryPreviewLeadingBits) + val trailing = bytesToBinaryString( + byteArray.takeRight(trailingBytesNeeded) + ).takeRight(binaryPreviewTrailingBits) + s"$leading...$trailing" + } + s"" + } case _ => throw new RuntimeException( diff --git a/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala b/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala index 0afe31fc099..2d86c47a158 100644 --- a/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala +++ 
b/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala @@ -23,6 +23,10 @@ import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tup import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import java.awt.image.BufferedImage +import java.io.ByteArrayOutputStream +import javax.imageio.ImageIO + class ExecutionResultServiceSpec extends AnyFlatSpec with Matchers { "convertTuplesToJson" should "convert tuples with various field types correctly" in { @@ -181,6 +185,24 @@ class ExecutionResultServiceSpec extends AnyFlatSpec with Matchers { emptyBinaryString should include("size = 0 bytes") } + it should "serialize recognized image binaries as data URLs" in { + val attributes = List( + new Attribute("image", AttributeType.BINARY) + ) + val schema = new Schema(attributes) + val imageBytes = pngBytes(width = 2, height = 2) + + val tuple = Tuple + .builder(schema) + .add("image", AttributeType.BINARY, imageBytes) + .build() + + val result = ExecutionResultService.convertTuplesToJson(List(tuple)) + + result should have size 1 + result.head.get("image").asText() should startWith("data:image/png;base64,") + } + it should "handle binary data with single ByteBuffer" in { val attributes = List( new Attribute("singleBufferBinary", AttributeType.BINARY) @@ -475,4 +497,11 @@ class ExecutionResultServiceSpec extends AnyFlatSpec with Matchers { resultsDefault(2).get("value").asText() shouldBe "medium length" resultsDefault(3).get("value").asText() should endWith("...") } + + private def pngBytes(width: Int, height: Int): Array[Byte] = { + val image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB) + val out = new ByteArrayOutputStream() + ImageIO.write(image, "png", out) + out.toByteArray + } } diff --git a/build.sbt b/build.sbt index b7b6b3cfb20..22dcd24e085 100644 --- a/build.sbt +++ b/build.sbt @@ -50,6 +50,19 @@ lazy val asfLicensingSettingsWithVendored = 
AddMetaInfLicenseFiles.workflowOpera val jacksonVersion = "2.18.6" +// Globally exclude transitive Hadoop landmines that conflict with Texera's +// Dropwizard + Jersey stack. These ride in via Parquet's `parquet-hadoop`, +// added in common/workflow-operator/build.sbt for SmartFileScan. Defining the +// excludes at ThisBuild level ensures they apply to every project that +// transitively pulls Hadoop — most importantly amber. +ThisBuild / excludeDependencies ++= Seq( + ExclusionRule("javax.servlet.jsp", "jsp-api"), + ExclusionRule("javax.servlet", "servlet-api"), + ExclusionRule(organization = "com.sun.jersey"), + ExclusionRule(organization = "com.sun.jersey.contribs"), + ExclusionRule("com.github.pjfanning", "jersey-json") +) + lazy val DAO = (project in file("common/dao")).settings(asfLicensingSettings) lazy val Config = (project in file("common/config")).settings(asfLicensingSettings) lazy val Auth = (project in file("common/auth"))
diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/ImageFormatUtils.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/ImageFormatUtils.scala
new file mode 100644
index 00000000000..27c1d66ef9f
--- /dev/null
+++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/ImageFormatUtils.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.util
+
+/**
+  * Helpers for recognizing common raster image formats (PNG, JPEG, GIF, WebP),
+  * either from the leading magic bytes of a payload or from a file extension.
+  */
+object ImageFormatUtils {
+
+  private val PngMagic = Array[Byte](0x89.toByte, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a)
+  private val JpegMagic = Array[Byte](0xff.toByte, 0xd8.toByte, 0xff.toByte)
+  private val Gif87Magic = "GIF87a".getBytes("US-ASCII")
+  private val Gif89Magic = "GIF89a".getBytes("US-ASCII")
+  private val RiffMagic = "RIFF".getBytes("US-ASCII")
+  private val WebpMagic = "WEBP".getBytes("US-ASCII")
+
+  // In a RIFF container the 4-byte form type ("WEBP") follows the 4-byte "RIFF"
+  // tag and the 4-byte chunk size, i.e. it starts at byte offset 8.
+  private val WebpTagOffset = 8
+
+  /**
+    * Sniffs the image format from the leading bytes of `bytes`.
+    *
+    * @param bytes raw payload; `null` or too-short input yields `None`
+    * @return "png", "jpeg", "gif" or "webp", or `None` when no signature matches
+    */
+  def detectFormat(bytes: Array[Byte]): Option[String] = {
+    if (bytes == null) None
+    else if (startsWith(bytes, PngMagic)) Some("png")
+    else if (startsWith(bytes, JpegMagic)) Some("jpeg")
+    else if (startsWith(bytes, Gif87Magic) || startsWith(bytes, Gif89Magic)) Some("gif")
+    else if (isWebp(bytes)) Some("webp")
+    else None
+  }
+
+  /**
+    * Maps the sniffed format to its MIME type.
+    *
+    * @return e.g. "image/png", or `None` when `bytes` is not a recognized image
+    */
+  def detectMimeType(bytes: Array[Byte]): Option[String] =
+    // Every name returned by detectFormat is also the MIME subtype, so the
+    // mapping stays total even if detectFormat learns a new format.
+    detectFormat(bytes).map(format => s"image/$format")
+
+  /**
+    * Infers the image format from the extension of `path`, ignoring case.
+    *
+    * @return "png", "jpeg" (for both .jpg and .jpeg), "gif" or "webp"; `None`
+    *         when there is no extension or it is not a known image extension
+    */
+  def extensionFormat(path: String): Option[String] = {
+    // Locale.ROOT keeps the result independent of the JVM default locale; under
+    // a Turkish locale "GIF" lower-cases to a dotless-i string that never matches.
+    val lower = path.toLowerCase(java.util.Locale.ROOT)
+    val dot = lower.lastIndexOf('.')
+    if (dot < 0) return None
+    lower.substring(dot + 1) match {
+      case "png" => Some("png")
+      case "jpg" | "jpeg" => Some("jpeg")
+      case "gif" => Some("gif")
+      case "webp" => Some("webp")
+      case _ => None
+    }
+  }
+
+  /** True when `bytes` starts with a RIFF header whose form type is "WEBP". */
+  private def isWebp(bytes: Array[Byte]): Boolean =
+    bytes.length >= WebpTagOffset + WebpMagic.length &&
+      startsWith(bytes, RiffMagic) &&
+      startsWith(bytes, WebpMagic, WebpTagOffset)
+
+  /**
+    * Checks whether `prefix` occurs in `bytes` at `offset`. Comparing in place
+    * avoids copying the payload (a whole image) just to inspect a few bytes.
+    */
+  private def startsWith(bytes: Array[Byte], prefix: Array[Byte], offset: Int = 0): Boolean = {
+    if (bytes.length - offset < prefix.length) return false
+    var index = 0
+    while (index < prefix.length) {
+      if (bytes(offset + index) != prefix(index)) return false
+      index += 1
+    }
+    true
+  }
+}
diff --git a/common/workflow-operator/build.sbt b/common/workflow-operator/build.sbt index 1c082cae96e..a79165d0b64 100644 --- a/common/workflow-operator/build.sbt +++ b/common/workflow-operator/build.sbt @@ -113,4 +113,50 @@ libraryDependencies ++= Seq( "org.apache.lucene" % "lucene-analyzers-common" % "8.11.4" ) +// SmartFileSource: Parquet + Excel support. +// +// Hadoop drags in a LOT of stuff Texera doesn't use, and several of those +// transitive deps conflict head-on with Texera's existing Dropwizard + Jersey-3 +// stack. We exclude all of the known troublemakers here. If you're tempted to +// remove one of these, run TexeraWebApplication and watch it die at startup. +// +// Conflicts being avoided: +// - slf4j-reload4j / reload4j: conflicts with the project's logback setup +// - jsp-api 2.1: ships an ancient `javax.el.ExpressionFactory` (no +// `newInstance()`) that shadows the real `javax.el-3.0.x` Dropwizard's +// Hibernate Validator needs (NoSuchMethodError otherwise) +// - com.sun.jersey.* (Jersey 1.x): collides with the project's Jersey 3 via +// HK2 — JSONRootElementProvider gets instantiated and explodes on init +// - tomcat / jasper: only used by Hadoop's embedded web UIs +// - servlet-api 2.5: ancient javax servlet that conflicts with Jakarta +libraryDependencies ++= Seq( + "org.apache.parquet" % "parquet-hadoop" % "1.13.1", + "org.apache.hadoop" % "hadoop-common" % "3.3.6" + exclude("org.slf4j", "slf4j-reload4j") + exclude("ch.qos.reload4j", "reload4j") + exclude("javax.servlet.jsp", "jsp-api") + exclude("javax.servlet", "servlet-api") + exclude("org.mortbay.jetty", "jetty") + exclude("org.mortbay.jetty", "jetty-util") + exclude("org.mortbay.jetty", "jsp-api-2.1") + exclude("tomcat", "jasper-compiler") + exclude("tomcat", "jasper-runtime") + exclude("com.sun.jersey", "jersey-core") +
exclude("com.sun.jersey", "jersey-server") + exclude("com.sun.jersey", "jersey-json") + exclude("com.sun.jersey", "jersey-servlet") + exclude("com.sun.jersey", "jersey-client") + excludeAll(ExclusionRule(organization = "com.sun.jersey")), + "org.apache.hadoop" % "hadoop-mapreduce-client-core" % "3.3.6" + exclude("org.slf4j", "slf4j-reload4j") + exclude("ch.qos.reload4j", "reload4j") + exclude("javax.servlet.jsp", "jsp-api") + exclude("javax.servlet", "servlet-api") + excludeAll(ExclusionRule(organization = "com.sun.jersey")), + "org.apache.poi" % "poi-ooxml" % "5.2.5" +) +// Global Hadoop transitive-dep blackhole is declared at the top-level +// build.sbt as `ThisBuild / excludeDependencies` so it applies to every +// downstream project (especially amber) that pulls Hadoop through us. + libraryDependencies += "io.github.classgraph" % "classgraph" % "4.8.184" % Test diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala index 4e9d6c6e2cd..26643a4804f 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala @@ -41,6 +41,7 @@ import org.apache.texera.amber.operator.difference.DifferenceOpDesc import org.apache.texera.amber.operator.distinct.DistinctOpDesc import org.apache.texera.amber.operator.dummy.DummyOpDesc import org.apache.texera.amber.operator.filter.SpecializedFilterOpDesc +import org.apache.texera.amber.operator.fileSplit.FileSplitOpDesc import org.apache.texera.amber.operator.hashJoin.HashJoinOpDesc import org.apache.texera.amber.operator.huggingFace.{ HuggingFaceIrisLogisticRegressionOpDesc, @@ -81,6 +82,7 @@ import org.apache.texera.amber.operator.source.scan.arrow.ArrowSourceOpDesc import org.apache.texera.amber.operator.source.scan.csv.CSVScanSourceOpDesc import 
org.apache.texera.amber.operator.source.scan.csvOld.CSVOldScanSourceOpDesc import org.apache.texera.amber.operator.source.scan.json.JSONLScanSourceOpDesc +import org.apache.texera.amber.operator.source.scan.smart.SmartFileSourceOpDesc import org.apache.texera.amber.operator.source.scan.text.TextInputSourceOpDesc import org.apache.texera.amber.operator.source.sql.asterixdb.AsterixDBSourceOpDesc import org.apache.texera.amber.operator.source.sql.mysql.MySQLSourceOpDesc @@ -164,9 +166,11 @@ trait StateTransferFunc @JsonSubTypes( Array( new Type(value = classOf[IfOpDesc], name = "If"), + new Type(value = classOf[FileSplitOpDesc], name = "FileSplit"), new Type(value = classOf[SankeyDiagramOpDesc], name = "SankeyDiagram"), new Type(value = classOf[IcicleChartOpDesc], name = "IcicleChart"), new Type(value = classOf[FileListerSourceOpDesc], name = "FileLister"), + new Type(value = classOf[SmartFileSourceOpDesc], name = "SmartFileScan"), new Type(value = classOf[CSVScanSourceOpDesc], name = "CSVFileScan"), // disabled the ParallelCSVScanSourceOpDesc so that it does not confuse user. it can be re-enabled when doing experiments. // new Type(value = classOf[ParallelCSVScanSourceOpDesc], name = "ParallelCSVFileScan"),
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDesc.scala
new file mode 100644
index 00000000000..4b71e441202
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDesc.scala
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.fileSplit
+
+import com.fasterxml.jackson.annotation.{JsonInclude, JsonProperty, JsonPropertyDescription}
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
+import org.apache.texera.amber.core.executor.OpExecWithClassName
+import org.apache.texera.amber.core.tuple.{AttributeType, Schema}
+import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
+import org.apache.texera.amber.core.workflow._
+import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import org.apache.texera.amber.operator.{LogicalOp, PortDescription}
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+
+/**
+  * Logical descriptor of the "File Split" operator: a single-input operator with
+  * a customizable number of output ports, described as routing rows from the
+  * same file to the same port based on a STRING column that names the file.
+  */
+class FileSplitOpDesc extends LogicalOp {
+
+  // Name of the column holding the file identifier. None (or a blank value)
+  // is treated as "auto-detect"; see resolveFileAttribute.
+  @JsonProperty
+  @JsonSchemaTitle("File Column")
+  @JsonPropertyDescription("leave empty to auto-detect source_file or filename")
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var fileAttribute: Option[String] = None
+
+  /**
+    * Builds the non-parallelizable physical operator backed by FileSplitOpExec.
+    * Every output port propagates the input schema unchanged; the file column is
+    * resolved during schema propagation so an invalid column is rejected there.
+    */
+  override def getPhysicalOp(
+      workflowId: WorkflowIdentity,
+      executionId: ExecutionIdentity
+  ): PhysicalOp =
+    PhysicalOp
+      .oneToOnePhysicalOp(
+        workflowId,
+        executionId,
+        operatorIdentifier,
+        OpExecWithClassName(
+          "org.apache.texera.amber.operator.fileSplit.FileSplitOpExec",
+          objectMapper.writeValueAsString(this)
+        )
+      )
+      .withInputPorts(operatorInfo.inputPorts)
+      .withOutputPorts(operatorInfo.outputPorts)
+      .withParallelizable(false)
+      .withPropagateSchema(
+        SchemaPropagationFunc(inputSchemas => {
+          require(inputSchemas.size == 1, "File Split requires exactly one input")
+          val inputSchema = inputSchemas.values.head
+          // Called only for its validation side effect; the result is not needed here.
+          resolveFileAttribute(inputSchema)
+          operatorInfo.outputPorts.map(port => port.id -> inputSchema).toMap
+        })
+      )
+
+  /**
+    * Describes the operator's ports. Output ports follow the user-customized
+    * `outputPorts` when present; otherwise two default output ports are exposed.
+    */
+  override def operatorInfo: OperatorInfo = {
+    val outputPortInfo =
+      if (outputPorts != null && outputPorts.nonEmpty) {
+        outputPorts.zipWithIndex.map {
+          case (portDesc: PortDescription, idx) =>
+            OutputPort(PortIdentity(idx), displayName = portDesc.displayName)
+        }
+      } else {
+        List(OutputPort(PortIdentity()), OutputPort(PortIdentity(1)))
+      }
+
+    OperatorInfo(
+      userFriendlyName = "File Split",
+      operatorDescription = "Route rows from the same file to the same output port",
+      operatorGroupName = OperatorGroupConstants.UTILITY_GROUP,
+      inputPorts = List(InputPort()),
+      outputPorts = outputPortInfo,
+      dynamicOutputPorts = true,
+      allowPortCustomization = true
+    )
+  }
+
+  /**
+    * Determines which column identifies the source file of a row.
+    *
+    * An explicit `fileAttribute` wins; when it is absent or blank the first of
+    * `source_file` / `filename` present in `schema` is used.
+    *
+    * @param schema input schema to resolve against
+    * @return the name of an existing STRING column
+    * @throws IllegalArgumentException if no candidate column exists, the chosen
+    *                                  column is missing, or it is not a STRING
+    */
+  def resolveFileAttribute(schema: Schema): String = {
+    // A blank value (e.g. a cleared form field) is treated like "not set"
+    // so the documented "leave empty to auto-detect" actually holds.
+    val explicitColumn =
+      Option(fileAttribute).flatten.filter(name => name != null && name.trim.nonEmpty)
+    val attributeName = explicitColumn.getOrElse {
+      List("source_file", "filename")
+        .find(schema.containsAttribute)
+        .getOrElse(
+          throw new IllegalArgumentException(
+            "File Split requires a source_file or filename column, or an explicit File Column"
+          )
+        )
+    }
+    if (!schema.containsAttribute(attributeName)) {
+      throw new IllegalArgumentException(s"File Split column '$attributeName' does not exist")
+    }
+    if (schema.getAttribute(attributeName).getType != AttributeType.STRING) {
+      throw new IllegalArgumentException(s"File Split column '$attributeName' must be a STRING")
+    }
+    attributeName
+  }
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExec.scala new file mode 100644 index 00000000000..9816cf34c17 --- /dev/null +++
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExec.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.fileSplit
+
+import org.apache.texera.amber.core.executor.OperatorExecutor
+import org.apache.texera.amber.core.tuple.{Tuple, TupleLike}
+import org.apache.texera.amber.core.workflow.PortIdentity
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+
+import scala.collection.mutable
+
+/**
+ * Executor of the File Split operator.
+ *
+ * Every distinct value of the file column is pinned to one output port the first time it
+ * is seen; ports are handed out round-robin in order of first appearance, so all rows of
+ * one file leave through the same port.
+ *
+ * @param descString JSON-serialized [[FileSplitOpDesc]]
+ */
+class FileSplitOpExec(descString: String) extends OperatorExecutor {
+  private val opDesc: FileSplitOpDesc =
+    objectMapper.readValue(descString, classOf[FileSplitOpDesc])
+
+  // Port already assigned to each file value seen so far.
+  private val portByFile = mutable.LinkedHashMap.empty[String, PortIdentity]
+
+  // Name of the file column; resolved from the schema of the first tuple.
+  private var fileAttribute: String = _
+
+  // Number of output ports; populated in open().
+  private var portCount: Int = _
+
+  /** Reads the number of output ports from the descriptor and rejects zero ports. */
+  override def open(): Unit = {
+    portCount = opDesc.operatorInfo.outputPorts.length
+    require(portCount > 0, "File Split requires at least one output port")
+  }
+
+  /**
+   * Emits the tuple unchanged on the port assigned to its file value.
+   *
+   * @throws IllegalArgumentException if the file column of the tuple is null
+   */
+  override def processTupleMultiPort(
+      tuple: Tuple,
+      port: Int
+  ): Iterator[(TupleLike, Option[PortIdentity])] = {
+    if (fileAttribute == null) {
+      fileAttribute = opDesc.resolveFileAttribute(tuple.getSchema)
+    }
+
+    val fileKey: String = tuple.getField[String](fileAttribute)
+    if (fileKey == null) {
+      throw new IllegalArgumentException(s"File Split column '$fileAttribute' cannot be null")
+    }
+
+    val target = portByFile.get(fileKey) match {
+      case Some(assigned) => assigned
+      case None =>
+        // The map size before insertion is the zero-based rank of this file.
+        val fresh = PortIdentity(portByFile.size % portCount)
+        portByFile.put(fileKey, fresh)
+        fresh
+    }
+    Iterator.single((tuple, Some(target)))
+  }
+
+  // Left unimplemented; routing is done in processTupleMultiPort.
+  override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = ???
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolver.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolver.scala
new file mode 100644
index 00000000000..e4f022d201d
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolver.scala
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan
+
+import org.apache.texera.amber.core.storage.FileResolver
+import org.apache.texera.amber.core.storage.util.LakeFSStorageClient
+
+import java.net.{URI, URLDecoder, URLEncoder}
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Path, Paths}
+import scala.jdk.CollectionConverters._
+import scala.util.Using
+
+/** One concrete file to read, plus the name to show for it. */
+case class ResolvedInputFile(uri: URI, displayName: String)
+
+/** The files an input URI expands to, and whether the URI denoted a folder. */
+case class ResolvedFolderInput(files: List[ResolvedInputFile], isFolder: Boolean)
+
+/**
+ * Expands an input URI into the list of files it stands for.
+ *
+ * `file` URIs are inspected on the local file system, dataset URIs are looked up among
+ * the objects of the referenced dataset version, and every other URI is returned as a
+ * single, non-folder entry.
+ */
+object FolderInputResolver {
+
+  /** Dispatches on the lower-cased URI scheme; an absent scheme is a plain file. */
+  def resolve(uri: URI): ResolvedFolderInput = {
+    val scheme = Option(uri.getScheme).map(_.toLowerCase)
+    if (scheme.contains("file")) resolveLocalInput(uri)
+    else if (scheme.contains(FileResolver.DATASET_FILE_URI_SCHEME)) resolveDatasetInput(uri)
+    else passThrough(uri)
+  }
+
+  /** Wraps `uri` as a single non-folder input whose display name is the URI itself. */
+  private def passThrough(uri: URI): ResolvedFolderInput =
+    ResolvedFolderInput(List(ResolvedInputFile(uri, uri.toASCIIString)), isFolder = false)
+
+  /**
+   * Local directory: every regular, non-hidden file beneath it (recursively), named by
+   * its path relative to the directory and sorted by that name. Anything that is not a
+   * directory is passed through unchanged.
+   */
+  private def resolveLocalInput(uri: URI): ResolvedFolderInput = {
+    val root = Paths.get(uri)
+    if (!Files.isDirectory(root)) {
+      passThrough(uri)
+    } else {
+      val discovered = Using.resource(Files.walk(root)) { walk =>
+        val entries = for {
+          candidate <- walk.iterator().asScala
+          if Files.isRegularFile(candidate)
+          if !isHiddenPath(candidate)
+        } yield ResolvedInputFile(candidate.toUri, root.relativize(candidate).toString)
+        // Materialize before the stream is closed by Using.resource.
+        entries.toList.sortBy(_.displayName)
+      }
+      ResolvedFolderInput(discovered, isFolder = true)
+    }
+  }
+
+  /**
+   * Dataset URI of the shape `/{repository}/{version}/{relative path...}`.
+   *
+   * If an object with exactly the relative path exists it is returned as a single file.
+   * Otherwise the path is treated as a folder prefix and all non-hidden objects below it
+   * are returned, sorted by path and named relative to the prefix.
+   *
+   * @throws IllegalArgumentException if the URI path has fewer than three segments
+   */
+  private def resolveDatasetInput(uri: URI): ResolvedFolderInput = {
+    val segments = Paths.get(uri.getPath).iterator().asScala.map(_.toString).toList
+
+    segments match {
+      case repositoryName :: encodedVersion :: encodedRelative if encodedRelative.nonEmpty =>
+        val versionHash = URLDecoder.decode(encodedVersion, StandardCharsets.UTF_8)
+        val relativePath = encodedRelative
+          .map(URLDecoder.decode(_, StandardCharsets.UTF_8))
+          .mkString("/")
+
+        val storedPaths =
+          LakeFSStorageClient.retrieveObjectsOfVersion(repositoryName, versionHash).map(_.getPath)
+
+        if (storedPaths.contains(relativePath)) {
+          val exact = ResolvedInputFile(
+            buildDatasetFileUri(repositoryName, versionHash, relativePath),
+            uri.toASCIIString
+          )
+          ResolvedFolderInput(List(exact), isFolder = false)
+        } else {
+          val prefix = if (relativePath.endsWith("/")) relativePath else s"$relativePath/"
+          val members = storedPaths
+            .filter(candidate => candidate.startsWith(prefix) && !isHiddenDatasetPath(candidate))
+            .sorted
+          val files = members.map { member =>
+            ResolvedInputFile(
+              buildDatasetFileUri(repositoryName, versionHash, member),
+              member.stripPrefix(prefix)
+            )
+          }
+          ResolvedFolderInput(files, isFolder = true)
+        }
+
+      case _ =>
+        throw new IllegalArgumentException(s"Dataset URI is missing a relative path: $uri")
+    }
+  }
+
+  /**
+   * Builds a dataset URI for one object. Only the segments of the relative path are
+   * URL-encoded; repository name and version hash are inserted as given.
+   */
+  private def buildDatasetFileUri(repositoryName: String, versionHash: String, relativePath: String): URI = {
+    val encodedRelative = for {
+      segment <- relativePath.split("/").toList
+      if segment.nonEmpty
+    } yield URLEncoder.encode(segment, StandardCharsets.UTF_8)
+    val encodedSegments = repositoryName :: versionHash :: encodedRelative
+    new URI(FileResolver.DATASET_FILE_URI_SCHEME, "", s"/${encodedSegments.mkString("/")}", null)
+  }
+
+  /** True when the last element of a local path starts with a dot. */
+  private def isHiddenPath(path: Path): Boolean = {
+    val leaf = path.getFileName
+    leaf != null && leaf.toString.startsWith(".")
+  }
+
+  /** True when the last `/`-separated element of a dataset path starts with a dot. */
+  private def isHiddenDatasetPath(path: String): Boolean = {
+    val parts = path.split("/")
+    parts.nonEmpty && parts.last.startsWith(".")
+  }
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala
index 82997632d14..b3c3d260723 100644
--- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala
@@ -20,11 +20,7 @@ package org.apache.texera.amber.operator.source.scan.file import
com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty} -import com.kjetland.jackson.jsonSchema.annotations.{ - JsonSchemaInject, - JsonSchemaString, - JsonSchemaTitle -} +import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaString, JsonSchemaTitle} import org.apache.texera.amber.core.executor.OpExecWithClassName import org.apache.texera.amber.core.tuple.{AttributeType, Schema} import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} @@ -53,14 +49,7 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { @JsonProperty(defaultValue = "false") @JsonSchemaTitle("Include Filename") - @JsonSchemaInject( - strings = Array( - new JsonSchemaString(path = HideAnnotation.hideTarget, value = "extract"), - new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), - new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") - ) - ) - val outputFileName: Boolean = false + var outputFileName: Boolean = false fileTypeName = Option("") @@ -92,4 +81,7 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { } schema.add(attributeName, attributeType.getType) } + + override def operatorInfo = + super.operatorInfo.copy(operatorDescription = "Scan data from a file or a folder of files") } diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala index d47cf3681c2..3b71a126437 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala @@ -21,9 +21,11 @@ package org.apache.texera.amber.operator.source.scan.file import 
org.apache.texera.amber.core.executor.SourceOperatorExecutor import org.apache.texera.amber.core.tuple.TupleLike +import org.apache.texera.amber.operator.source.scan.FolderInputResolver import org.apache.texera.amber.util.JSONUtils.objectMapper import java.io.IOException +import java.net.URI class FileScanSourceOpExec private[scan] ( descString: String @@ -33,14 +35,21 @@ class FileScanSourceOpExec private[scan] ( @throws[IOException] override def produceTuple(): Iterator[TupleLike] = { - FileScanUtils.createTuplesFromFile( - fileName = desc.fileName.get, - attributeType = desc.attributeType, - fileEncoding = desc.fileEncoding, - extract = desc.extract, - outputFileName = desc.outputFileName, - fileScanOffset = desc.fileScanOffset, - fileScanLimit = desc.fileScanLimit - ) + FolderInputResolver + .resolve(new URI(desc.fileName.get)) + .files + .iterator + .flatMap(file => + FileScanUtils.createTuplesFromFile( + fileName = file.uri.toASCIIString, + displayFileName = file.displayName, + attributeType = desc.attributeType, + fileEncoding = desc.fileEncoding, + extract = desc.extract, + outputFileName = desc.outputFileName, + fileScanOffset = desc.fileScanOffset, + fileScanLimit = desc.fileScanLimit + ) + ) } } diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala index a7f81b4869c..e022d96e435 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala @@ -110,7 +110,8 @@ private[file] object FileScanUtils { TupleLike(fields.toSeq: _*) } } else { - fileEntries.flatMap(entry => + fileEntries.zipAll(filenameIt, null, null).flatMap { + case (entry, entryFileName) => new BufferedReader(new InputStreamReader(entry, 
fileEncoding.getCharset)) .lines() .iterator() @@ -119,13 +120,14 @@ private[file] object FileScanUtils { fileScanOffset.getOrElse(0), fileScanOffset.getOrElse(0) + fileScanLimit.getOrElse(Int.MaxValue) ) - .map(line => - TupleLike(attributeType match { + .map { line => + val parsed = attributeType match { case FileAttributeType.SINGLE_STRING => line case _ => parseField(line, attributeType.getType) - }) - ) - ) + } + if (outputFileName) TupleLike(entryFileName, parsed) else TupleLike(parsed) + } + } } new AutoClosingIterator(rawIterator, () => closeables.foreach(_.close())) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSniffer.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSniffer.scala new file mode 100644 index 00000000000..fe73bf36f17 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSniffer.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings}
+import org.apache.texera.amber.core.tuple.{AttributeType, AttributeTypeUtils}
+
+import java.io.StringReader
+
+/** A guess at how a CSV-family file should be read. */
+case class CSVDialect(delimiter: Char, hasHeader: Boolean)
+
+/**
+ * Heuristic CSV dialect detector. Given a text sample (first ~64 KB of the file),
+ * it picks the delimiter that produces the most consistent column count across rows,
+ * then decides whether the first row is a header.
+ *
+ * Not perfect — quoted multi-line values can confuse it on very short samples — but
+ * good enough for the common cases the Smart File Source wants to cover.
+ */
+object CSVDialectSniffer {
+
+  // Delimiters that are tried, in tie-breaking order.
+  private val Candidates: Seq[Char] = Seq(',', '\t', ';', '|')
+
+  /**
+   * @param sampleText decoded text sample
+   * @param preferred  an extension-based hint (`,` if `.csv`, `\t` if `.tsv`). When the
+   *                   data is consistent with the preferred delimiter, we keep it even
+   *                   if another delimiter would score marginally higher.
+   * @return the chosen delimiter and whether the first row looks like a header
+   */
+  def sniff(sampleText: String, preferred: Option[Char] = None): CSVDialect = {
+    val scored = Candidates.map(d => d -> scoreDelimiter(sampleText, d)).toMap
+
+    val delimiter = preferred match {
+      // A hint that is not among the candidates scores 0.0 and is therefore ignored.
+      case Some(p) if scored.getOrElse(p, 0.0) >= 0.5 => p
+      case _ =>
+        scored
+          .filter { case (_, score) => score > 0.0 }
+          .toSeq
+          .sortBy { case (_, score) => -score }
+          .headOption
+          .map(_._1)
+          .getOrElse(',') // fall back to comma; downstream parsing will surface a real error
+    }
+
+    val hasHeader = detectHeader(sampleText, delimiter)
+    CSVDialect(delimiter, hasHeader)
+  }
+
+  /**
+   * A delimiter is "consistent" when the per-row column count is stable across rows.
+   * Score is `(rows_with_modal_count - 1) / total_rows`, in [0, 1]. Samples with fewer
+   * than two rows, or whose most common column count is below two, score 0.
+   */
+  private def scoreDelimiter(sample: String, delimiter: Char): Double = {
+    val rows = parseRows(sample, delimiter, headerExtraction = false, maxRows = 30)
+    if (rows.size < 2) return 0.0
+    val counts = rows.map(_.length).filter(_ > 0)
+    if (counts.length < 2) return 0.0
+    val modalCount = counts.groupBy(identity).view.mapValues(_.size).maxBy(_._2)._1
+    if (modalCount < 2) return 0.0 // single-column "matches" don't tell us anything
+    val agreeing = counts.count(_ == modalCount)
+    (agreeing - 1).toDouble / rows.size
+  }
+
+  /**
+   * Header detection: parse the first row, then parse subsequent rows; if at least one
+   * column has a row-1 type of STRING but later rows are numeric/boolean/timestamp, the
+   * first row is probably a header.
+   *
+   * Consequence: a sample with two or more rows whose columns are all STRING is reported
+   * as header-less, because no column shows a type change after the first row.
+   */
+  private def detectHeader(sample: String, delimiter: Char): Boolean = {
+    val rows = parseRows(sample, delimiter, headerExtraction = false, maxRows = 30)
+    if (rows.size < 2) return true // safer default — most CSVs have headers
+    val firstRow = rows.head
+    val laterRows = rows.tail
+    val width = firstRow.length
+    if (width == 0) return true
+
+    // Later rows are padded or truncated to the width of the first row before inference.
+    val laterTypes: Array[AttributeType] = AttributeTypeUtils.inferSchemaFromRows(
+      laterRows.iterator.map(r => r.padTo(width, "").take(width).asInstanceOf[Array[Any]])
+    )
+
+    val firstTypes = firstRow.map { v =>
+      if (v == null || v.trim.isEmpty) AttributeType.STRING
+      else AttributeTypeUtils.inferField(v)
+    }
+
+    val typedColumns = laterTypes.zipWithIndex.collect {
+      case (t, i)
+          if t != AttributeType.STRING && i < firstTypes.length
+            && firstTypes(i) == AttributeType.STRING =>
+        i
+    }
+    typedColumns.nonEmpty
+  }
+
+  /**
+   * Parses at most `maxRows` rows of `sample` with the given delimiter. Null cells are
+   * returned as empty strings and the per-column length limit is disabled.
+   */
+  private def parseRows(
+      sample: String,
+      delimiter: Char,
+      headerExtraction: Boolean,
+      maxRows: Int
+  ): Array[Array[String]] = {
+    val format = new CsvFormat()
+    format.setDelimiter(delimiter)
+    format.setLineSeparator("\n")
+    format.setComment('\u0000')
+    val settings = new CsvParserSettings()
+    settings.setFormat(format)
+    settings.setMaxCharsPerColumn(-1)
+    settings.setHeaderExtractionEnabled(headerExtraction)
+    settings.setNullValue("")
+    val parser = new CsvParser(settings)
+    val reader = new StringReader(sample)
+    try {
+      parser.beginParsing(reader)
+      val buf = scala.collection.mutable.ArrayBuffer.empty[Array[String]]
+      var count = 0
+      var row = parser.parseNext()
+      while (row != null && count < maxRows) {
+        buf += row
+        count += 1
+        row = parser.parseNext()
+      }
+      parser.stopParsing()
+      buf.toArray
+    } finally reader.close()
+  }
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetector.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetector.scala
new file mode 100644
index 00000000000..143b1a9290f
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetector.scala
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import org.apache.texera.amber.util.ImageFormatUtils
+
+import java.nio.charset.Charset
+
+/** Guesses the [[SmartFileFormat]] of a file from a leading byte sample and its name. */
+object FormatDetector {
+
+  // Magic bytes used by the formats we support.
+  private val ParquetMagic: Array[Byte] = "PAR1".getBytes("US-ASCII")
+  private val XlsxMagic: Array[Byte] = Array(0x50, 0x4b, 0x03, 0x04).map(_.toByte) // PK\x03\x04 ZIP container
+  private val OleMagic: Array[Byte] = // legacy .xls (OLE2 compound document)
+    Array(0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1).map(_.toByte)
+  // The Arrow IPC *file* format begins with the magic string "ARROW1" (followed by
+  // padding). The IPC *streaming* format has no such signature, so stream-encoded data
+  // is only recognised through the `.arrow` extension.
+  private val ArrowMagic: Array[Byte] = "ARROW1".getBytes("US-ASCII")
+
+  /**
+   * Cheap detection from a byte sample plus optional filename hint.
+   * Order: magic bytes (most reliable) → extension → content sniff.
+   *
+   * @param fileNameHint file name used for extension-based detection, if known
+   * @param sample       leading bytes of the file
+   * @param charset      charset used to decode `sample` for text sniffing
+   */
+  def detect(
+      fileNameHint: Option[String],
+      sample: Array[Byte],
+      charset: Charset
+  ): SmartFileFormat = {
+    if (startsWith(sample, ParquetMagic)) return SmartFileFormat.PARQUET
+    if (startsWith(sample, OleMagic)) return SmartFileFormat.EXCEL
+    if (startsWith(sample, ArrowMagic)) return SmartFileFormat.ARROW
+    if (ImageFormatUtils.detectFormat(sample).nonEmpty) return SmartFileFormat.IMAGE
+
+    val extensionDetected = fileNameHint.flatMap(extensionFormat)
+    // The ZIP signature alone is ambiguous (any ZIP container has it), so it only counts
+    // as Excel when the extension agrees.
+    if (startsWith(sample, XlsxMagic) && extensionDetected.contains(SmartFileFormat.EXCEL)) {
+      return SmartFileFormat.EXCEL
+    }
+
+    // A known extension wins; otherwise look at the content. Expressed with getOrElse
+    // rather than a `return` inside a closure, which is implemented with exceptions and
+    // deprecated in Scala 3.
+    extensionDetected.getOrElse(sniffText(sample, charset))
+  }
+
+  /** Extension-based detection. Returns None if extension is unknown or absent. */
+  def extensionFormat(fileName: String): Option[SmartFileFormat] = {
+    // Locale.ROOT keeps the mapping locale-independent: with a Turkish default locale
+    // "GIF".toLowerCase yields a dotless i and would not match "gif".
+    val lower = fileName.toLowerCase(java.util.Locale.ROOT)
+    val dot = lower.lastIndexOf('.')
+    if (dot < 0) return None
+    lower.substring(dot + 1) match {
+      case "csv"                   => Some(SmartFileFormat.CSV)
+      case "tsv" | "tab"           => Some(SmartFileFormat.TSV)
+      case "json"                  => Some(SmartFileFormat.JSON)
+      case "jsonl" | "ndjson"      => Some(SmartFileFormat.JSONL)
+      case "arrow"                 => Some(SmartFileFormat.ARROW)
+      case "parquet" | "pq"        => Some(SmartFileFormat.PARQUET)
+      case "xlsx" | "xls" | "xlsm" => Some(SmartFileFormat.EXCEL)
+      case "png" | "jpg" | "jpeg" |
+          "gif" | "webp"           => Some(SmartFileFormat.IMAGE)
+      case "txt" | "log"           => Some(SmartFileFormat.TEXT)
+      case _                       => None
+    }
+  }
+
+  /**
+   * Content-based sniffing for text formats when neither magic bytes nor extension
+   * give a definitive answer. Heuristics:
+   *   - first non-blank char `{` → JSON object → ambiguous JSON vs JSONL → look at how many
+   *     `{` start at the beginning of a line
+   *   - first non-blank char `[` → JSON array
+   *   - lines with consistent tabs but few commas → TSV
+   *   - lines containing commas → CSV
+   *   - otherwise (including an empty or all-whitespace sample) → TEXT
+   */
+  private def sniffText(sample: Array[Byte], charset: Charset): SmartFileFormat = {
+    val text = new String(sample, charset)
+    val trimmed = text.dropWhile(_.isWhitespace)
+    if (trimmed.isEmpty) return SmartFileFormat.TEXT
+
+    trimmed.head match {
+      case '[' => return SmartFileFormat.JSON
+      case '{' =>
+        // Either a single JSON object, JSON array of objects pretty-printed, or JSONL.
+        // JSONL: multiple lines each starting with `{`. Lines are deliberately not
+        // trimmed, so indented braces of pretty-printed JSON are not counted.
+        val objectLineStarts = text.linesIterator
+          .filter(_.nonEmpty)
+          .count(line => line.headOption.contains('{'))
+        return if (objectLineStarts >= 2) SmartFileFormat.JSONL else SmartFileFormat.JSON
+      case _ =>
+    }
+
+    // Delimiter heuristic — only the first ~30 lines.
+    val lines = text.linesIterator.take(30).filter(_.nonEmpty).toList
+    if (lines.isEmpty) return SmartFileFormat.TEXT
+    val tabHits = lines.count(_.contains('\t'))
+    val commaHits = lines.count(_.contains(','))
+    if (tabHits > 0 && tabHits >= commaHits) SmartFileFormat.TSV
+    else if (commaHits > 0) SmartFileFormat.CSV
+    else SmartFileFormat.TEXT
+  }
+
+  /** True when `sample` is at least as long as `prefix` and begins with its bytes. */
+  private def startsWith(sample: Array[Byte], prefix: Array[Byte]): Boolean = {
+    if (sample.length < prefix.length) return false
+    var i = 0
+    while (i < prefix.length) {
+      if (sample(i) != prefix(i)) return false
+      i += 1
+    }
+    true
+  }
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/ParquetUtils.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/ParquetUtils.scala
new file mode 100644
index 00000000000..3954c8cf55b
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/ParquetUtils.scala
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.parquet.example.data.Group
+import org.apache.parquet.example.data.simple.convert.GroupRecordConverter
+import org.apache.parquet.hadoop.ParquetFileReader
+import org.apache.parquet.hadoop.util.HadoopInputFile
+import org.apache.parquet.io.ColumnIOFactory
+import org.apache.parquet.schema.LogicalTypeAnnotation
+import org.apache.parquet.schema.LogicalTypeAnnotation.{
+  DateLogicalTypeAnnotation,
+  StringLogicalTypeAnnotation,
+  TimestampLogicalTypeAnnotation
+}
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
+import org.apache.parquet.schema.{MessageType, PrimitiveType, Type}
+import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema}
+
+import java.io.File
+
+/** Helpers for reading local Parquet files into Texera schemas and values. */
+object ParquetUtils {
+
+  /**
+   * Map a Parquet `MessageType` to a Texera Schema. Skips non-primitive (nested) fields,
+   * so attribute positions in the result can differ from Parquet field positions.
+   */
+  def toTexeraSchema(messageType: MessageType): Schema = {
+    val attrs = for {
+      i <- 0 until messageType.getFieldCount
+      field: Type = messageType.getType(i)
+      if field.isPrimitive
+    } yield new Attribute(field.getName, toAttributeType(field.asPrimitiveType()))
+    Schema(attrs.toList)
+  }
+
+  /**
+   * Texera type for a Parquet primitive. DATE, TIMESTAMP and INT96 columns become
+   * TIMESTAMP; FLOAT widens to DOUBLE; BINARY is STRING only when annotated as string-like.
+   */
+  def toAttributeType(primitive: PrimitiveType): AttributeType = {
+    val logical = primitive.getLogicalTypeAnnotation
+    primitive.getPrimitiveTypeName match {
+      case PrimitiveTypeName.BOOLEAN => AttributeType.BOOLEAN
+      case PrimitiveTypeName.INT32 =>
+        logical match {
+          case _: DateLogicalTypeAnnotation => AttributeType.TIMESTAMP
+          case _                            => AttributeType.INTEGER
+        }
+      case PrimitiveTypeName.INT64 =>
+        logical match {
+          case _: TimestampLogicalTypeAnnotation => AttributeType.TIMESTAMP
+          case _                                 => AttributeType.LONG
+        }
+      case PrimitiveTypeName.FLOAT | PrimitiveTypeName.DOUBLE => AttributeType.DOUBLE
+      case PrimitiveTypeName.INT96                            => AttributeType.TIMESTAMP
+      case PrimitiveTypeName.BINARY =>
+        logical match {
+          case _: StringLogicalTypeAnnotation => AttributeType.STRING
+          case _ if isStringLike(logical)     => AttributeType.STRING
+          case _                              => AttributeType.BINARY
+        }
+      case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => AttributeType.BINARY
+    }
+  }
+
+  /** Null-safe check on the annotation's textual form for string, enum or json. */
+  private def isStringLike(logical: LogicalTypeAnnotation): Boolean = {
+    if (logical == null) return false
+    // EnumLogicalTypeAnnotation / JsonLogicalTypeAnnotation also serialize as text.
+    val name = logical.toString.toLowerCase
+    name.contains("string") || name.contains("enum") || name.contains("json")
+  }
+
+  /** Opens a `ParquetFileReader` on a local file. The caller must close it. */
+  def openReader(file: File): ParquetFileReader = {
+    val conf = newConfiguration()
+    val inputFile = HadoopInputFile.fromPath(new Path(file.toURI), conf)
+    ParquetFileReader.open(inputFile)
+  }
+
+  /**
+   * Read the file into a lazy iterator of `Group` records.
+   * Caller is responsible for closing the returned reader via [[ParquetReadHandle.close]].
+   *
+   * If setting up the iterator fails (reading the footer schema or the first row group),
+   * the reader is closed here before the error is rethrown, because in that case the
+   * caller never receives a handle to close.
+   */
+  def openRecords(file: File): ParquetReadHandle = {
+    val reader = openReader(file)
+    try {
+      val schema = reader.getFooter.getFileMetaData.getSchema
+      val converter = new GroupRecordConverter(schema)
+      val columnIO = new ColumnIOFactory().getColumnIO(schema)
+      val iterator = new Iterator[Group] {
+        // The first row group is read eagerly, while this iterator is being constructed.
+        private var currentPages = reader.readNextRowGroup()
+        private var recordReader =
+          if (currentPages != null) columnIO.getRecordReader(currentPages, converter) else null
+        // Records not yet returned from the current row group.
+        private var remaining: Long = if (currentPages != null) currentPages.getRowCount else 0L
+
+        override def hasNext: Boolean = {
+          if (remaining > 0) return true
+          // Advance to next row group, skipping empty ones.
+          var nextPages = reader.readNextRowGroup()
+          while (nextPages != null && nextPages.getRowCount == 0) nextPages = reader.readNextRowGroup()
+          if (nextPages == null) false
+          else {
+            currentPages = nextPages
+            recordReader = columnIO.getRecordReader(nextPages, converter)
+            remaining = nextPages.getRowCount
+            true
+          }
+        }
+
+        override def next(): Group = {
+          if (!hasNext) throw new NoSuchElementException
+          remaining -= 1
+          recordReader.read().asInstanceOf[Group]
+        }
+      }
+      ParquetReadHandle(schema, iterator, () => reader.close())
+    } catch {
+      case failure: Throwable =>
+        // Do not leak the open file; keep the original failure as the primary error.
+        try reader.close()
+        catch { case closeFailure: Throwable => failure.addSuppressed(closeFailure) }
+        throw failure
+    }
+  }
+
+  /**
+   * Read a primitive field at position `index` of a Parquet `Group`, honoring schema.
+   *
+   * `index` is a position in the Parquet `schema`, not in the Texera schema. Returns null
+   * for a field with no value and for non-primitive fields; only the first value of a
+   * field is read. Timestamps are converted to millisecond precision.
+   */
+  def readField(group: Group, index: Int, schema: MessageType): Any = {
+    if (group.getFieldRepetitionCount(index) == 0) return null
+    val field = schema.getType(index)
+    if (!field.isPrimitive) return null
+    val primitive = field.asPrimitiveType()
+    primitive.getPrimitiveTypeName match {
+      case PrimitiveTypeName.BOOLEAN => group.getBoolean(index, 0)
+      case PrimitiveTypeName.INT32 =>
+        primitive.getLogicalTypeAnnotation match {
+          case _: DateLogicalTypeAnnotation =>
+            // Date stored as days since epoch.
+            val days = group.getInteger(index, 0).toLong
+            new java.sql.Timestamp(days * 86400000L)
+          case _ => Int.box(group.getInteger(index, 0))
+        }
+      case PrimitiveTypeName.INT64 =>
+        primitive.getLogicalTypeAnnotation match {
+          case ts: TimestampLogicalTypeAnnotation =>
+            val raw = group.getLong(index, 0)
+            // floorDiv instead of `/`: truncating division rounds negative (pre-1970)
+            // values toward zero, i.e. to a later millisecond than the stored instant.
+            val millis = ts.getUnit match {
+              case LogicalTypeAnnotation.TimeUnit.MILLIS => raw
+              case LogicalTypeAnnotation.TimeUnit.MICROS => Math.floorDiv(raw, 1000L)
+              case LogicalTypeAnnotation.TimeUnit.NANOS  => Math.floorDiv(raw, 1000000L)
+            }
+            new java.sql.Timestamp(millis)
+          case _ => Long.box(group.getLong(index, 0))
+        }
+      case PrimitiveTypeName.FLOAT  => Double.box(group.getFloat(index, 0).toDouble)
+      case PrimitiveTypeName.DOUBLE => Double.box(group.getDouble(index, 0))
+      case PrimitiveTypeName.INT96 =>
+        // INT96 → 96-bit timestamp, decoded by hand in int96ToTimestamp.
+        val binary = group.getInt96(index, 0)
+        int96ToTimestamp(binary.getBytes)
+      case PrimitiveTypeName.BINARY =>
+        val binary = group.getBinary(index, 0)
+        primitive.getLogicalTypeAnnotation match {
+          case _: StringLogicalTypeAnnotation   => binary.toStringUsingUTF8
+          case logical if isStringLike(logical) => binary.toStringUsingUTF8
+          case _                                => binary.getBytes
+        }
+      case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => group.getBinary(index, 0).getBytes
+    }
+  }
+
+  /** Decodes a 12-byte INT96 value into a millisecond-precision timestamp. */
+  private def int96ToTimestamp(bytes: Array[Byte]): java.sql.Timestamp = {
+    // INT96: 8 bytes little-endian nanoseconds of day, then 4 bytes little-endian Julian day.
+    var nanos: Long = 0L
+    for (i <- 0 until 8) nanos |= (bytes(i).toLong & 0xff) << (8 * i)
+    var julian: Int = 0
+    for (i <- 0 until 4) julian |= (bytes(8 + i).toInt & 0xff) << (8 * i)
+    val daysFromEpoch = julian - 2440588 // Julian day 2440588 = 1970-01-01
+    val millis = daysFromEpoch.toLong * 86400000L + nanos / 1000000L
+    new java.sql.Timestamp(millis)
+  }
+
+  /** Hadoop configuration without default resources, bound to the local file system. */
+  private def newConfiguration(): Configuration = {
+    val conf = new Configuration(false)
+    // Reduce noisy default classpath probing — we only ever look at local files.
+    conf.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")
+    conf
+  }
+
+  /**
+   * An open Parquet read: the file schema, a lazy record iterator, and the action that
+   * releases the underlying reader.
+   */
+  case class ParquetReadHandle(
+      schema: MessageType,
+      records: Iterator[Group],
+      closer: () => Unit
+  ) {
+    def close(): Unit = closer()
+  }
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileFormat.java b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileFormat.java
new file mode 100644
index 00000000000..190b367daec
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileFormat.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonValue; + +public enum SmartFileFormat { + AUTO("Auto-detect"), + CSV("CSV"), + TSV("TSV"), + JSON("JSON"), + JSONL("JSONL"), + ARROW("Arrow"), + PARQUET("Parquet"), + EXCEL("Excel"), + IMAGE("Image"), + TEXT("Plain text"); + + private final String label; + + SmartFileFormat(String label) { + this.label = label; + } + + @JsonValue + public String getLabel() { + return label; + } + + /** Accept either the enum name (e.g. "CSV") or the label (e.g. "Plain text"). 
*/ + @JsonCreator + public static SmartFileFormat fromString(String value) { + if (value == null) { + return null; + } + for (SmartFileFormat format : values()) { + if (format.name().equalsIgnoreCase(value) || format.label.equalsIgnoreCase(value)) { + return format; + } + } + throw new IllegalArgumentException("Unknown SmartFileFormat: " + value); + } + + @Override + public String toString() { + return label; + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileInferencer.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileInferencer.scala new file mode 100644 index 00000000000..0a657639d65 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileInferencer.scala @@ -0,0 +1,476 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/**
  * Overrides supplied by the user. Each `Some(...)` value short-circuits the corresponding
  * detection step; `None` means "let the inferencer decide".
  *
  * @param format    file format to use instead of detection; `Some(SmartFileFormat.AUTO)` is
  *                  treated the same as `None`
  * @param delimiter CSV/TSV column delimiter to use instead of the sniffed one
  * @param hasHeader whether the first row holds column names; replaces the sniffed value for
  *                  CSV/TSV and the default of `true` for Excel
  * @param sheetName Excel sheet to read; the first sheet is used when this is `None` or the
  *                  workbook has no sheet with that name
  * @param flatten   whether nested JSON/JSONL values are flattened; treated as `false` when `None`
  */
case class InferenceOverrides(
    format: Option[SmartFileFormat] = None,
    delimiter: Option[Char] = None,
    hasHeader: Option[Boolean] = None,
    sheetName: Option[String] = None,
    flatten: Option[Boolean] = None
)
/**
  * The full inference result. Carries the inferred schema along with the configuration
  * the runtime executor needs to read the file the same way the inferencer did.
  *
  * @param format              the resolved format (never `AUTO`)
  * @param schema              the inferred output schema
  * @param csvDelimiter        delimiter used for CSV/TSV, as a one-character string
  * @param csvHasHeader        header setting used for CSV/TSV and for Excel
  * @param sheetName           name of the Excel sheet that was read
  * @param availableSheetNames all sheet names of the Excel workbook, in workbook order
  * @param flatten             flatten setting used for JSON/JSONL
  * @param isFolder            `true` when the input URI resolved to a folder
  * @param fileCount           number of files covered by this result (1 for a single file)
  */
case class InferenceResult(
    format: SmartFileFormat,
    schema: Schema,
    csvDelimiter: Option[String] = None,
    csvHasHeader: Option[Boolean] = None,
    sheetName: Option[String] = None,
    availableSheetNames: List[String] = Nil,
    flatten: Option[Boolean] = None,
    isFolder: Boolean = false,
    fileCount: Int = 1
)

/**
  * The single source of truth for "look at this file and decide how to read it."
  * Both the operator descriptor (compile-time schema declaration) and the live
  * preview REST endpoint route through this object so their behavior is identical.
  */
object SmartFileInferencer {

  /** Bytes to read when sniffing format / delimiter / header. */
  private val SampleByteCount = 64 * 1024

  /** Rows to read when inferring types. Matches `ScanSourceOpDesc.INFER_READ_LIMIT`. */
  private val InferRowLimit = 100

  /** Cheap detection that only reads the header bytes. */
  def detect(uri: URI, encoding: Charset): SmartFileFormat = {
    val sample = readSampleBytes(uri)
    FormatDetector.detect(Some(uri.getPath), sample, encoding)
  }

  /**
    * Full inference: format detection + schema.
    *
    * When `uri` resolves to a folder, every file in it is inferred and all files must agree
    * on format and schema.
    *
    * @throws IllegalArgumentException if a folder is empty or its files disagree
    */
  def infer(uri: URI, encoding: Charset, overrides: InferenceOverrides): InferenceResult = {
    val input = FolderInputResolver.resolve(uri)
    if (input.isFolder) {
      inferFolder(uri, input.files.map(_.uri), encoding, overrides)
    } else {
      inferSingle(uri, encoding, overrides)
    }
  }

  /**
    * Infer every file of a folder and require one common format and schema.
    * The returned result is the first file's, marked as a folder with the file count.
    */
  private def inferFolder(
      folderUri: URI,
      files: List[URI],
      encoding: Charset,
      overrides: InferenceOverrides
  ): InferenceResult = {
    if (files.isEmpty) {
      throw new IllegalArgumentException(s"Folder $folderUri does not contain any readable files")
    }

    val inferred = files.map(file => inferSingle(file, encoding, overrides))
    val first = inferred.head
    if (inferred.exists(_.format != first.format)) {
      throw new IllegalArgumentException(
        s"Folder $folderUri must contain files with the same detected format"
      )
    }

    val expectedSchema = schemaSignature(first.schema)
    if (inferred.exists(result => schemaSignature(result.schema) != expectedSchema)) {
      throw new IllegalArgumentException(
        s"Folder $folderUri must contain files with the same inferred schema"
      )
    }

    first.copy(isFolder = true, fileCount = files.size)
  }

  /** Resolve the format (override first, detection otherwise) and dispatch to its inferencer. */
  private def inferSingle(uri: URI, encoding: Charset, overrides: InferenceOverrides): InferenceResult = {
    val format = overrides.format
      .filter(_ != SmartFileFormat.AUTO)
      .getOrElse {
        val sample = readSampleBytes(uri)
        FormatDetector.detect(Some(uri.getPath), sample, encoding)
      }

    format match {
      case SmartFileFormat.CSV | SmartFileFormat.TSV => inferCsv(uri, encoding, format, overrides)
      case SmartFileFormat.JSONL                     => inferJsonl(uri, encoding, overrides)
      case SmartFileFormat.JSON                      => inferJson(uri, encoding, overrides)
      case SmartFileFormat.ARROW                     => inferArrow(uri)
      case SmartFileFormat.PARQUET                   => inferParquet(uri)
      case SmartFileFormat.EXCEL                     => inferExcel(uri, overrides)
      case SmartFileFormat.IMAGE                     => inferImage()
      case SmartFileFormat.TEXT                      => inferText()
      case SmartFileFormat.AUTO =>
        throw new IllegalStateException("AUTO should have been resolved before dispatch")
    }
  }

  /** Ordered (name, type) pairs used to compare schemas across the files of a folder. */
  private def schemaSignature(schema: Schema): List[(String, AttributeType)] =
    schema.getAttributes.map(attribute => attribute.getName -> attribute.getType)

  // ---------------------------------------------------------------------------
  // CSV / TSV
  // ---------------------------------------------------------------------------

  /** Sniff delimiter and header from a sample, apply user overrides, then infer column types. */
  private def inferCsv(
      uri: URI,
      encoding: Charset,
      format: SmartFileFormat,
      overrides: InferenceOverrides
  ): InferenceResult = {
    val sampleText = readSampleText(uri, encoding)
    val preferred = format match {
      case SmartFileFormat.TSV => Some('\t')
      case _                   => Some(',')
    }
    val sniffed = CSVDialectSniffer.sniff(sampleText, preferred)
    val delimiter = overrides.delimiter.getOrElse(sniffed.delimiter)
    val hasHeader = overrides.hasHeader.getOrElse(sniffed.hasHeader)
    val schema = inferCsvSchema(uri, encoding, delimiter, hasHeader)
    InferenceResult(
      format = format,
      schema = schema,
      csvDelimiter = Some(delimiter.toString),
      csvHasHeader = Some(hasHeader)
    )
  }

  /**
    * Parse up to `InferRowLimit` rows and infer one type per column.
    * Columns are named from the header row when present, otherwise `column-1`, `column-2`, ...
    * A header column with no sampled data defaults to STRING.
    */
  private def inferCsvSchema(
      uri: URI,
      encoding: Charset,
      delimiter: Char,
      hasHeader: Boolean
  ): Schema = {
    val csvFormat = new CsvFormat()
    csvFormat.setDelimiter(delimiter)
    csvFormat.setLineSeparator("\n")
    csvFormat.setComment('\u0000')
    val settings = new CsvParserSettings()
    settings.setMaxCharsPerColumn(-1)
    settings.setFormat(csvFormat)
    settings.setHeaderExtractionEnabled(hasHeader)
    settings.setNullValue("")

    val parser = new CsvParser(settings)
    val stream = openStream(uri)
    val reader = new InputStreamReader(stream, encoding)
    try {
      parser.beginParsing(reader)
      val rows = ArrayBuffer.empty[Array[String]]
      var row = parser.parseNext()
      var read = 0
      while (row != null && read < InferRowLimit) {
        rows += row
        read += 1
        row = parser.parseNext()
      }
      parser.stopParsing()
      val attributeTypes = inferSchemaFromRows(rows.iterator.map(_.asInstanceOf[Array[Any]]))
      val header =
        if (hasHeader)
          Option(parser.getContext.headers())
            .getOrElse((1 to attributeTypes.length).map(i => s"column-$i").toArray)
        else
          (1 to attributeTypes.length).map(i => s"column-$i").toArray
      val pairs = header.indices.map { i =>
        val attributeType =
          if (i < attributeTypes.length) attributeTypes(i) else AttributeType.STRING
        (header(i), attributeType)
      }
      pairs.foldLeft(Schema()) { case (s, (name, t)) => s.add(name, t) }
    } finally reader.close()
  }

  // ---------------------------------------------------------------------------
  // JSONL
  // ---------------------------------------------------------------------------

  /**
    * Infer a schema from the first `InferRowLimit` lines. Blank lines and lines whose root
    * is not a JSON object are ignored; field order follows first appearance.
    */
  private def inferJsonl(
      uri: URI,
      encoding: Charset,
      overrides: InferenceOverrides
  ): InferenceResult = {
    val flatten = overrides.flatten.getOrElse(false)
    val stream = openStream(uri)
    val reader = new BufferedReader(new InputStreamReader(stream, encoding))
    try {
      val fieldNames = scala.collection.mutable.LinkedHashSet[String]()
      val rows = ArrayBuffer.empty[Map[String, String]]
      val lines = reader.lines().iterator().asScala.take(InferRowLimit)
      lines.foreach { line =>
        if (line != null && line.trim.nonEmpty) {
          val root: JsonNode = objectMapper.readTree(line)
          if (root.isObject) {
            val fields = JSONToMap(root, flatten = flatten)
            fields.keys.foreach(fieldNames += _)
            rows += fields
          }
        }
      }
      val orderedNames = fieldNames.toList
      val schema = buildJsonSchema(orderedNames, rows.toSeq)
      InferenceResult(
        format = SmartFileFormat.JSONL,
        schema = schema,
        flatten = Some(flatten)
      )
    } finally reader.close()
  }

  // ---------------------------------------------------------------------------
  // JSON (single object or array of objects)
  // ---------------------------------------------------------------------------

  /**
    * Infer a schema from a JSON document that is a single object or an array of objects.
    * The whole document is parsed; at most `InferRowLimit` objects contribute to the schema,
    * and non-object array elements are skipped without counting toward the limit.
    */
  private def inferJson(
      uri: URI,
      encoding: Charset,
      overrides: InferenceOverrides
  ): InferenceResult = {
    val flatten = overrides.flatten.getOrElse(false)
    val stream = openStream(uri)
    val reader = new InputStreamReader(stream, encoding)
    try {
      val root = objectMapper.readTree(reader)
      val rows = ArrayBuffer.empty[Map[String, String]]
      val fieldNames = scala.collection.mutable.LinkedHashSet[String]()

      val objectNodes: Iterator[JsonNode] =
        if (root.isArray) root.elements().asScala
        else if (root.isObject) Iterator.single(root)
        else Iterator.empty

      var count = 0
      while (objectNodes.hasNext && count < InferRowLimit) {
        val node = objectNodes.next()
        if (node.isObject) {
          val fields = JSONToMap(node, flatten = flatten)
          fields.keys.foreach(fieldNames += _)
          rows += fields
          count += 1
        }
      }

      val schema = buildJsonSchema(fieldNames.toList, rows.toSeq)
      InferenceResult(
        format = SmartFileFormat.JSON,
        schema = schema,
        flatten = Some(flatten)
      )
    } finally reader.close()
  }

  /**
    * Build a schema for the given field names, inferring each type from the sampled rows.
    * A field missing from a row is presented to type inference as `null`.
    */
  private def buildJsonSchema(orderedNames: List[String], rows: Seq[Map[String, String]]): Schema = {
    if (orderedNames.isEmpty) return Schema()
    val attributeTypes = inferSchemaFromRows(rows.iterator.map { row =>
      orderedNames.map(name => row.getOrElse(name, null)).toArray[Any]
    })
    val attrs = orderedNames.indices.map { i =>
      val t =
        if (i < attributeTypes.length) attributeTypes(i) else AttributeType.STRING
      new Attribute(orderedNames(i), t)
    }
    Schema(attrs.toList)
  }

  // ---------------------------------------------------------------------------
  // Arrow
  // ---------------------------------------------------------------------------

  /**
    * Read the schema of an Arrow file.
    *
    * @throws RuntimeException wrapping the underlying failure if the schema cannot be read
    */
  private def inferArrow(uri: URI): InferenceResult = {
    val file = DocumentFactory.openReadonlyDocument(uri).asFile()
    val schema = Using
      .Manager { use =>
        // The allocator is registered first so it is released last, after the reader that
        // allocates from it; previously it was never closed at all.
        val allocator = use(new RootAllocator())
        val channel = use(Files.newByteChannel(file.toPath, StandardOpenOption.READ))
        val reader = use(new ArrowFileReader(channel, allocator))
        ArrowUtils.toTexeraSchema(reader.getVectorSchemaRoot.getSchema)
      }
      .fold(
        // Keep the root cause attached instead of discarding it.
        cause => throw new RuntimeException(s"Failed to read Arrow schema from $uri", cause),
        identity
      )
    InferenceResult(format = SmartFileFormat.ARROW, schema = schema)
  }

  // ---------------------------------------------------------------------------
  // Parquet
  // ---------------------------------------------------------------------------

  /** Read the schema from the Parquet footer; no records are read. */
  private def inferParquet(uri: URI): InferenceResult = {
    val file = DocumentFactory.openReadonlyDocument(uri).asFile()
    val reader = ParquetUtils.openReader(file)
    try {
      val parquetSchema = reader.getFooter.getFileMetaData.getSchema
      InferenceResult(format = SmartFileFormat.PARQUET, schema = ParquetUtils.toTexeraSchema(parquetSchema))
    } finally reader.close()
  }

  // ---------------------------------------------------------------------------
  // Excel
  // ---------------------------------------------------------------------------

  /**
    * Infer a schema from one sheet of a workbook.
    *
    * The sheet is the one named by `overrides.sheetName` when it exists, otherwise the first
    * sheet. The header defaults to present. Up to `InferRowLimit + 1` rows are sampled so a
    * header row does not reduce the number of data rows used for type inference.
    */
  private def inferExcel(uri: URI, overrides: InferenceOverrides): InferenceResult = {
    val file = DocumentFactory.openReadonlyDocument(uri).asFile()
    val workbook = WorkbookFactory.create(file, null, true) // read-only
    try {
      val sheetNames = (0 until workbook.getNumberOfSheets).map(workbook.getSheetName).toList
      val targetSheet: Sheet = overrides.sheetName
        .flatMap(name => Option(workbook.getSheet(name)))
        .getOrElse(workbook.getSheetAt(0))
      val hasHeader = overrides.hasHeader.getOrElse(true)

      val rowIter = targetSheet.iterator().asScala
      val sampled = rowIter.take(InferRowLimit + 1).toList
      if (sampled.isEmpty) {
        return InferenceResult(
          format = SmartFileFormat.EXCEL,
          schema = Schema(),
          sheetName = Some(targetSheet.getSheetName),
          availableSheetNames = sheetNames,
          csvHasHeader = Some(hasHeader)
        )
      }

      // Widest sampled row decides the column count; shorter rows are padded with nulls.
      val columnCount = sampled.map(_.getLastCellNum.toInt).max
      val rowsAsStrings: List[Array[String]] = sampled.map { row =>
        (0 until columnCount).map(c => cellToString(row.getCell(c))).toArray
      }

      val header: Array[String] =
        if (hasHeader && rowsAsStrings.nonEmpty)
          rowsAsStrings.head.zipWithIndex.map {
            case (s, i) => if (s == null || s.isEmpty) s"column-${i + 1}" else s
          }
        else (1 to columnCount).map(i => s"column-$i").toArray

      val dataRows = if (hasHeader) rowsAsStrings.drop(1) else rowsAsStrings
      val attributeTypes = inferSchemaFromRows(dataRows.iterator.map(_.asInstanceOf[Array[Any]]))

      val schema = header.indices.foldLeft(Schema()) { (s, i) =>
        val t = if (i < attributeTypes.length) attributeTypes(i) else AttributeType.STRING
        s.add(header(i), t)
      }

      InferenceResult(
        format = SmartFileFormat.EXCEL,
        schema = schema,
        sheetName = Some(targetSheet.getSheetName),
        availableSheetNames = sheetNames,
        csvHasHeader = Some(hasHeader)
      )
    } finally workbook.close()
  }

  /**
    * Render a cell as text for type inference; `null` for missing, blank and error cells.
    *
    * Formula cells are evaluated first. If evaluation fails the cell is still a FORMULA
    * cell, so its cached result is used instead. Recursing on it, as the previous version
    * did, would never terminate.
    */
  private def cellToString(cell: Cell): String = {
    if (cell == null) return null
    cell.getCellType match {
      case CellType.FORMULA =>
        val evaluated = safelyEvaluate(cell)
        if (evaluated == null) null
        else if (evaluated.getCellType == CellType.FORMULA)
          cellValueToString(evaluated, evaluated.getCachedFormulaResultType)
        else cellValueToString(evaluated, evaluated.getCellType)
      case valueType => cellValueToString(cell, valueType)
    }
  }

  /** Render the cell's value, interpreting it as `valueType` (never FORMULA). */
  private def cellValueToString(cell: Cell, valueType: CellType): String =
    valueType match {
      case CellType.STRING  => cell.getStringCellValue
      case CellType.BOOLEAN => String.valueOf(cell.getBooleanCellValue)
      case CellType.NUMERIC =>
        if (DateUtil.isCellDateFormatted(cell))
          new java.sql.Timestamp(cell.getDateCellValue.getTime).toString
        else {
          val d = cell.getNumericCellValue
          // Whole numbers print without a trailing ".0" so they can be inferred as integers.
          if (d == d.toLong.toDouble) d.toLong.toString else d.toString
        }
      case _ => null
    }

  /**
    * Best-effort formula evaluation. Returns the cell converted to its computed value, or
    * the unchanged formula cell when evaluation fails.
    */
  private def safelyEvaluate(cell: Cell): Cell = {
    try {
      val evaluator = cell.getSheet.getWorkbook.getCreationHelper.createFormulaEvaluator()
      evaluator.evaluateInCell(cell)
    } catch {
      // Only recoverable failures; fatal errors (e.g. OutOfMemoryError) must propagate.
      case scala.util.control.NonFatal(_) => cell
    }
  }

  // ---------------------------------------------------------------------------
  // Plain text
  // ---------------------------------------------------------------------------

  /** Plain text has a fixed schema: a single STRING column named `line`. */
  private def inferText(): InferenceResult =
    InferenceResult(
      format = SmartFileFormat.TEXT,
      schema = Schema(List(new Attribute("line", AttributeType.STRING)))
    )

  /** Images have a fixed schema: raw bytes, format name, width and height. */
  private def inferImage(): InferenceResult =
    InferenceResult(
      format = SmartFileFormat.IMAGE,
      schema = Schema()
        .add("image", AttributeType.BINARY)
        .add("format", AttributeType.STRING)
        .add("width", AttributeType.INTEGER)
        .add("height", AttributeType.INTEGER)
    )

  // ---------------------------------------------------------------------------
  // I/O helpers
  // ---------------------------------------------------------------------------

  private def openStream(uri: URI): InputStream =
    DocumentFactory.openReadonlyDocument(uri).asInputStream()

  /** Read up to `SampleByteCount` bytes from the start of the document. */
  private def readSampleBytes(uri: URI): Array[Byte] = {
    val stream = openStream(uri)
    try {
      val buffer = new Array[Byte](SampleByteCount)
      var totalRead = 0
      var lastRead = 0
      // A single read may return fewer bytes than requested, so keep reading until the
      // buffer is full or the stream ends.
      while (totalRead < buffer.length && {
               lastRead = stream.read(buffer, totalRead, buffer.length - totalRead); lastRead
             } > 0) {
        totalRead += lastRead
      }
      if (totalRead == buffer.length) buffer else buffer.take(totalRead)
    } finally stream.close()
  }

  /** Decode the sample bytes with `charset`. */
  private def readSampleText(uri: URI, charset: Charset): String =
    new String(readSampleBytes(uri), charset)
}
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import com.fasterxml.jackson.annotation.{JsonInclude, JsonProperty, JsonPropertyDescription}
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
+import org.apache.texera.amber.core.executor.OpExecWithClassName
+import org.apache.texera.amber.core.tuple.Schema
+import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
+import org.apache.texera.amber.core.workflow.{OutputPort, PhysicalOp, SchemaPropagationFunc}
+import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import org.apache.texera.amber.operator.source.scan.ScanSourceOpDesc
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+
+import java.io.IOException
+import java.net.URI
+
+/**
+  * Descriptor of the "Smart Source" operator: a source with no input ports and a single
+  * output port whose schema comes from [[SmartFileInferencer]]. The optional fields below
+  * are handed to the inferencer as user overrides.
+  */
+class SmartFileSourceOpDesc extends ScanSourceOpDesc {
+
+  // User-selected format; AUTO leaves the choice to the inferencer.
+  @JsonProperty(defaultValue = "AUTO")
+  @JsonSchemaTitle("Format")
+  @JsonPropertyDescription("override automatic format detection")
+  var formatOverride: SmartFileFormat = SmartFileFormat.AUTO
+
+  // Only the first character of this string is forwarded as the delimiter override.
+  @JsonProperty
+  @JsonSchemaTitle("Delimiter")
+  @JsonPropertyDescription("CSV/TSV delimiter (auto-detected if empty)")
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var customDelimiter: Option[String] = None
+
+  @JsonProperty
+  @JsonSchemaTitle("Has Header")
+  @JsonPropertyDescription("first row contains column names (CSV/TSV/Excel)")
+  @JsonDeserialize(contentAs = classOf[java.lang.Boolean])
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var hasHeader: Option[Boolean] = None
+
+  @JsonProperty
+  @JsonSchemaTitle("Excel Sheet Name")
+  @JsonPropertyDescription("for Excel files; leave empty to use the first sheet")
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var sheetName: Option[String] = None
+
+  @JsonProperty
+  @JsonSchemaTitle("Flatten Nested JSON")
+  @JsonPropertyDescription("flatten nested JSON objects and arrays into dot-notation columns")
+  @JsonDeserialize(contentAs = classOf[java.lang.Boolean])
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var flatten: Option[Boolean] = None
+
+  // When set, one extra STRING column named by `sourceFileAttribute` is appended to the schema.
+  @JsonProperty(defaultValue = "false")
+  @JsonSchemaTitle("Include Source File")
+  @JsonPropertyDescription("append a source file column when reading folders")
+  var includeSourceFile: Boolean = false
+
+  @JsonProperty(defaultValue = "source_file")
+  @JsonSchemaTitle("Source File Column")
+  @JsonPropertyDescription("column name used when source file output is enabled")
+  var sourceFileAttribute: String = "source_file"
+
+  fileTypeName = Option("Smart")
+
+  override def operatorInfo: OperatorInfo =
+    OperatorInfo(
+      userFriendlyName = "Smart Source",
+      operatorDescription =
+        "Auto-detects file format and schema for a file or a folder of similar files. Supports CSV, TSV, JSON, JSONL, Arrow, Parquet, Excel, images, and plain text.",
+      operatorGroupName = OperatorGroupConstants.INPUT_GROUP,
+      inputPorts = List.empty,
+      outputPorts = List(OutputPort())
+    )
+
+  /** Builds the source physical operator; the executor receives this descriptor as JSON. */
+  @throws[IOException]
+  override def getPhysicalOp(
+      workflowId: WorkflowIdentity,
+      executionId: ExecutionIdentity
+  ): PhysicalOp = {
+    val executor = OpExecWithClassName(
+      "org.apache.texera.amber.operator.source.scan.smart.SmartFileSourceOpExec",
+      objectMapper.writeValueAsString(this)
+    )
+    // sourceSchema() is evaluated each time the propagation function runs, not here.
+    val propagateSchema =
+      SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
+
+    PhysicalOp
+      .sourcePhysicalOp(workflowId, executionId, operatorIdentifier, executor)
+      .withInputPorts(operatorInfo.inputPorts)
+      .withOutputPorts(operatorInfo.outputPorts)
+      .withPropagateSchema(propagateSchema)
+  }
+
+  /** Inferred output schema, or null while the file has not been resolved. */
+  override def sourceSchema(): Schema =
+    if (fileResolved()) withOptionalSourceFile(runInference().schema)
+    else null
+
+  /** Runs the inferencer on `fileName`, passing this descriptor's fields as overrides. */
+  def runInference(): InferenceResult = {
+    val userOverrides = InferenceOverrides(
+      format = Option(formatOverride),
+      delimiter = customDelimiter.flatMap(_.headOption),
+      hasHeader = hasHeader,
+      sheetName = sheetName,
+      flatten = flatten
+    )
+    val target = new URI(fileName.get)
+    SmartFileInferencer.infer(target, fileEncoding.getCharset, userOverrides)
+  }
+
+  /** Returns `schema` unchanged, or with the source-file STRING column appended when enabled. */
+  def withOptionalSourceFile(schema: Schema): Schema =
+    if (!includeSourceFile) schema
+    else schema.add(sourceFileAttribute, org.apache.texera.amber.core.tuple.AttributeType.STRING)
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExec.scala
new file mode 100644
index 00000000000..b6849bc0cd5
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExec.scala
@@ -0,0 +1,345 @@
+/*
+ * Licensed
to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import com.fasterxml.jackson.databind.JsonNode
+import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings}
+import org.apache.arrow.memory.RootAllocator
+import org.apache.arrow.vector.VectorSchemaRoot
+import org.apache.arrow.vector.ipc.ArrowFileReader
+import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory}
+import org.apache.texera.amber.core.executor.SourceOperatorExecutor
+import org.apache.texera.amber.core.storage.DocumentFactory
+import org.apache.texera.amber.core.tuple.{AttributeTypeUtils, Schema, TupleLike}
+import org.apache.texera.amber.operator.source.scan.FolderInputResolver
+import org.apache.texera.amber.util.{ArrowUtils, ImageFormatUtils, JSONUtils}
+import org.apache.texera.amber.util.JSONUtils.{JSONToMap, objectMapper}
+
+import java.io.{BufferedReader, ByteArrayInputStream, InputStreamReader}
+import java.net.URI
+import java.nio.file.{Files, StandardOpenOption}
+import javax.imageio.ImageIO
+import scala.collection.immutable.ArraySeq
+import scala.jdk.CollectionConverters._
+import scala.util.control.NonFatal
+
+/**
+  * Executor of the Smart Source operator. It deserializes its descriptor from JSON, runs
+  * format/schema inference in `open()`, and then streams tuples from every file resolved
+  * for the descriptor's `fileName` using the reader that matches the inferred format.
+  *
+  * Readers register what they open in `resources`; everything is released in `close()`.
+  * Rows that cannot be parsed are skipped (CSV/JSON) or kept with their raw value
+  * (Parquet/Excel). Only non-fatal errors are treated that way, so fatal VM errors and
+  * interrupts still propagate.
+  */
+class SmartFileSourceOpExec(descString: String) extends SourceOperatorExecutor {
+
+  private val desc: SmartFileSourceOpDesc =
+    objectMapper.readValue(descString, classOf[SmartFileSourceOpDesc])
+
+  private var inference: InferenceResult = _
+  private var schema: Schema = _
+  // Closed in registration order by close(); readers rely on that order.
+  private val resources = scala.collection.mutable.ListBuffer.empty[AutoCloseable]
+  private var tupleSource: Iterator[TupleLike] = Iterator.empty
+
+  /** Adapts a plain cleanup action to AutoCloseable so it can live in `resources`. */
+  private def closeableOf(fn: () => Unit): AutoCloseable =
+    new AutoCloseable { override def close(): Unit = fn() }
+
+  /** Runs inference once and prepares the (lazy) tuple iterator. */
+  override def open(): Unit = {
+    inference = desc.runInference()
+    schema = desc.withOptionalSourceFile(inference.schema)
+    tupleSource = openReader()
+  }
+
+  /** Applies the descriptor's offset and then its limit to the tuple stream. */
+  override def produceTuple(): Iterator[TupleLike] = {
+    var it = tupleSource.drop(desc.offset.getOrElse(0))
+    if (desc.limit.isDefined) it = it.take(desc.limit.get)
+    it
+  }
+
+  /** Closes every registered resource; a failure on one does not stop the others. */
+  override def close(): Unit = {
+    resources.foreach { c =>
+      try c.close()
+      catch { case NonFatal(_) => /* best effort on shutdown */ }
+    }
+    resources.clear()
+  }
+
+  // ---------------------------------------------------------------------------
+  // Per-format readers
+  // ---------------------------------------------------------------------------
+
+  /**
+    * Resolves the input into its files and lazily chains one reader per file, all using
+    * the single format inferred in open(). When `includeSourceFile` is set, each tuple
+    * gets the file's display name appended as its last field.
+    */
+  private def openReader(): Iterator[TupleLike] = {
+    val input = FolderInputResolver.resolve(new URI(desc.fileName.get))
+    input.files.iterator.flatMap { file =>
+      val rows = inference.format match {
+        case SmartFileFormat.CSV | SmartFileFormat.TSV => csvReader(file.uri)
+        case SmartFileFormat.JSONL                     => jsonlReader(file.uri)
+        case SmartFileFormat.JSON                      => jsonReader(file.uri)
+        case SmartFileFormat.ARROW                     => arrowReader(file.uri)
+        case SmartFileFormat.PARQUET                   => parquetReader(file.uri)
+        case SmartFileFormat.EXCEL                     => excelReader(file.uri)
+        case SmartFileFormat.IMAGE                     => imageReader(file.uri)
+        case SmartFileFormat.TEXT                      => textReader(file.uri)
+        case SmartFileFormat.AUTO =>
+          throw new IllegalStateException("AUTO should have been resolved by inferencer")
+      }
+      if (desc.includeSourceFile) rows.map(appendSourceFile(_, file.displayName)) else rows
+    }
+  }
+
+  private def appendSourceFile(tuple: TupleLike, displayName: String): TupleLike =
+    TupleLike(tuple.getFields :+ displayName)
+
+  // CSV / TSV ----------------------------------------------------------------
+
+  /**
+    * Streams rows of a delimited file. The delimiter is the first character of the
+    * inferred delimiter, falling back to tab for TSV and comma otherwise. Rows whose
+    * fields cannot be parsed against the schema are dropped.
+    */
+  private def csvReader(uri: URI): Iterator[TupleLike] = {
+    val delimiter = inference.csvDelimiter
+      .flatMap(_.headOption)
+      .getOrElse(if (inference.format == SmartFileFormat.TSV) '\t' else ',')
+    val hasHeader = inference.csvHasHeader.getOrElse(true)
+    val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream()
+    val reader = new InputStreamReader(stream, desc.fileEncoding.getCharset)
+    resources += reader
+
+    val format = new CsvFormat()
+    format.setDelimiter(delimiter)
+    format.setLineSeparator("\n")
+    format.setComment('\u0000')
+    val settings = new CsvParserSettings()
+    settings.setMaxCharsPerColumn(-1)
+    settings.setFormat(format)
+    settings.setHeaderExtractionEnabled(hasHeader)
+    settings.setNullValue("")
+    val parser = new CsvParser(settings)
+    parser.beginParsing(reader)
+    resources += closeableOf(() => parser.stopParsing())
+
+    new Iterator[TupleLike] {
+      // One row of look-ahead: parseNext() returns null at end of input.
+      private var nextRow: Array[String] = parser.parseNext()
+      override def hasNext: Boolean = nextRow != null
+      override def next(): TupleLike = {
+        val row = nextRow
+        nextRow = parser.parseNext()
+        try {
+          TupleLike(
+            ArraySeq.unsafeWrapArray(
+              AttributeTypeUtils.parseFields(row.asInstanceOf[Array[Any]], schema)
+            ): _*
+          )
+        } catch {
+          // Unparseable row: mark it with null so the filter below drops it.
+          case NonFatal(_) => null
+        }
+      }
+    }.filter(_ != null)
+  }
+
+  // JSONL --------------------------------------------------------------------
+
+  /** Streams one tuple per JSON-object line; blank, non-object and malformed lines are skipped. */
+  private def jsonlReader(uri: URI): Iterator[TupleLike] = {
+    val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream()
+    val br = new BufferedReader(new InputStreamReader(stream, desc.fileEncoding.getCharset))
+    resources += br
+    val flatten = inference.flatten.getOrElse(false)
+    val names = schema.getAttributeNames
+
+    br.lines().iterator().asScala
+      .flatMap { line =>
+        if (line == null || line.trim.isEmpty) None
+        else {
+          try {
+            val node = objectMapper.readTree(line)
+            if (!node.isObject) None
+            else Some(buildTupleFromJsonObject(node, names, flatten))
+          } catch {
+            case NonFatal(_) => None
+          }
+        }
+      }
+  }
+
+  // JSON ---------------------------------------------------------------------
+
+  /**
+    * Parses the whole document up front. An array yields one tuple per object element,
+    * a single object yields one tuple, anything else yields nothing.
+    */
+  private def jsonReader(uri: URI): Iterator[TupleLike] = {
+    val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream()
+    val reader = new InputStreamReader(stream, desc.fileEncoding.getCharset)
+    resources += reader
+    val flatten = inference.flatten.getOrElse(false)
+    val names = schema.getAttributeNames
+
+    val root = objectMapper.readTree(reader)
+    val nodes: Iterator[JsonNode] =
+      if (root.isArray) root.elements().asScala
+      else if (root.isObject) Iterator.single(root)
+      else Iterator.empty
+
+    nodes.flatMap { node =>
+      if (!node.isObject) None
+      else
+        try Some(buildTupleFromJsonObject(node, names, flatten))
+        catch { case NonFatal(_) => None }
+    }
+  }
+
+  /** Builds a tuple in schema order; attributes missing from the object become null. */
+  private def buildTupleFromJsonObject(
+      node: JsonNode,
+      names: List[String],
+      flatten: Boolean
+  ): TupleLike = {
+    val fields = JSONToMap(node, flatten).withDefaultValue(null)
+    val parsed = names.map { name =>
+      AttributeTypeUtils.parseField(fields(name), schema.getAttribute(name).getType)
+    }
+    TupleLike(parsed: _*)
+  }
+
+  // Arrow --------------------------------------------------------------------
+
+  /** Streams the rows of an Arrow file, one record batch at a time. */
+  private def arrowReader(uri: URI): Iterator[TupleLike] = {
+    val file = DocumentFactory.openReadonlyDocument(uri).asFile()
+    val allocator = new RootAllocator()
+    val channel = Files.newByteChannel(file.toPath, StandardOpenOption.READ)
+    val arrowReader = new ArrowFileReader(channel, allocator)
+    val vectorRoot: VectorSchemaRoot = arrowReader.getVectorSchemaRoot
+    resources += vectorRoot
+    resources += arrowReader
+    resources += allocator
+    resources += closeableOf(() => channel.close())
+
+    new Iterator[TupleLike] {
+      private var idx = 0
+      override def hasNext: Boolean = {
+        // Keep loading while the current batch is exhausted. A batch may legitimately be
+        // empty, and stopping at the first empty one would drop every batch after it.
+        while (idx >= vectorRoot.getRowCount && arrowReader.loadNextBatch()) idx = 0
+        idx < vectorRoot.getRowCount
+      }
+      override def next(): TupleLike = {
+        val tuple = ArrowUtils.getTexeraTuple(idx, vectorRoot)
+        idx += 1
+        tuple
+      }
+    }
+  }
+
+  // Parquet ------------------------------------------------------------------
+
+  /**
+    * Streams Parquet records, matching columns to schema attributes by name. Attributes
+    * absent from the file become null; values that fail type parsing are kept raw.
+    */
+  private def parquetReader(uri: URI): Iterator[TupleLike] = {
+    val file = DocumentFactory.openReadonlyDocument(uri).asFile()
+    val handle = ParquetUtils.openRecords(file)
+    resources += closeableOf(() => handle.close())
+
+    val parquetSchema = handle.schema
+    val attributeNames = schema.getAttributeNames
+    val parquetIndex: Map[String, Int] =
+      (0 until parquetSchema.getFieldCount).map(i => parquetSchema.getType(i).getName -> i).toMap
+
+    handle.records.map { group =>
+      val values = attributeNames.map { name =>
+        parquetIndex.get(name) match {
+          case Some(i) =>
+            val raw = ParquetUtils.readField(group, i, parquetSchema)
+            try AttributeTypeUtils.parseField(raw, schema.getAttribute(name).getType)
+            catch { case NonFatal(_) => raw }
+          case None => null
+        }
+      }
+      TupleLike(values: _*)
+    }
+  }
+
+  // Excel --------------------------------------------------------------------
+
+  /**
+    * Streams rows of the inferred sheet (first sheet when the name is absent or unknown),
+    * reading cells by position. The header flag is taken from `inference.csvHasHeader`.
+    */
+  private def excelReader(uri: URI): Iterator[TupleLike] = {
+    val file = DocumentFactory.openReadonlyDocument(uri).asFile()
+    val workbook: Workbook = WorkbookFactory.create(file, null, true)
+    resources += workbook
+    val sheet = inference.sheetName
+      .flatMap(name => Option(workbook.getSheet(name)))
+      .getOrElse(workbook.getSheetAt(0))
+    val hasHeader = inference.csvHasHeader.getOrElse(true)
+    val attributeNames = schema.getAttributeNames
+
+    val rowIter = sheet.iterator().asScala
+    // Consume the header row so that only data rows are emitted.
+    val dataRows = if (hasHeader && rowIter.hasNext) { rowIter.next(); rowIter } else rowIter
+
+    dataRows.map { row =>
+      val values = attributeNames.indices.map { i =>
+        val cell = row.getCell(i)
+        val raw = readExcelCell(cell)
+        try AttributeTypeUtils.parseField(raw, schema.getAttributes(i).getType)
+        catch { case NonFatal(_) => raw }
+      }
+      TupleLike(values: _*)
+    }
+  }
+
+  /** Reports whole-valued doubles as Long and every other value as Double. */
+  private def numericValue(d: Double): Any =
+    if (d == d.toLong.toDouble) java.lang.Long.valueOf(d.toLong)
+    else java.lang.Double.valueOf(d)
+
+  /**
+    * Converts a cell to a plain value: String, Boolean, Timestamp (date-formatted numeric
+    * cells), Long/Double, or null for missing, unsupported and unevaluable cells.
+    * Formula cells are evaluated and their result converted the same way.
+    */
+  private def readExcelCell(cell: org.apache.poi.ss.usermodel.Cell): Any = {
+    import org.apache.poi.ss.usermodel.{CellType, DateUtil}
+    if (cell == null) return null
+    cell.getCellType match {
+      case CellType.STRING  => cell.getStringCellValue
+      case CellType.BOOLEAN => java.lang.Boolean.valueOf(cell.getBooleanCellValue)
+      case CellType.NUMERIC =>
+        if (DateUtil.isCellDateFormatted(cell))
+          new java.sql.Timestamp(cell.getDateCellValue.getTime)
+        else numericValue(cell.getNumericCellValue)
+      case CellType.FORMULA =>
+        try {
+          val evaluator = cell.getSheet.getWorkbook.getCreationHelper.createFormulaEvaluator()
+          val evaluated = evaluator.evaluate(cell)
+          evaluated.getCellType match {
+            case CellType.STRING  => evaluated.getStringValue
+            case CellType.BOOLEAN => java.lang.Boolean.valueOf(evaluated.getBooleanValue)
+            case CellType.NUMERIC => numericValue(evaluated.getNumberValue)
+            case _                => null
+          }
+        } catch {
+          case NonFatal(_) => null
+        }
+      case _ => null
+    }
+  }
+
+  // Images -------------------------------------------------------------------
+
+  /**
+    * Emits a single tuple per image: bytes, format label, width and height. Width and
+    * height are null when ImageIO cannot decode the bytes; the format label falls back
+    * from content detection to the path extension to "unknown".
+    */
+  private def imageReader(uri: URI): Iterator[TupleLike] = {
+    val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream()
+    val bytes =
+      try stream.readAllBytes()
+      finally stream.close()
+    val image = ImageIO.read(new ByteArrayInputStream(bytes))
+    val format = ImageFormatUtils
+      .detectFormat(bytes)
+      .orElse(ImageFormatUtils.extensionFormat(uri.getPath))
+      .getOrElse("unknown")
+    val width = Option(image).map(image => Int.box(image.getWidth)).orNull
+    val height = Option(image).map(image => Int.box(image.getHeight)).orNull
+    Iterator.single(TupleLike(bytes, format, width, height))
+  }
+
+  // Plain text ---------------------------------------------------------------
+
+  /** Emits one single-field tuple per line of the file. */
+  private def textReader(uri: URI): Iterator[TupleLike] = {
+    val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream()
+    val br = new BufferedReader(new InputStreamReader(stream, desc.fileEncoding.getCharset))
+    resources += br
+    br.lines().iterator().asScala.map(line => TupleLike(line))
+  }
+
+  // JSONToMap and objectMapper are imported by name, so nothing else in this class refers
+  // to the JSONUtils object itself; this reference only keeps that import in use.
+  locally(JSONUtils)
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDescSpec.scala
new file mode 100644
index 00000000000..398f4de7729
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDescSpec.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.fileSplit
+
+import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema}
+import org.apache.texera.amber.core.workflow.PortIdentity
+import org.scalatest.flatspec.AnyFlatSpec
+
+/** Schema-propagation checks for the File Split descriptor. */
+class FileSplitOpDescSpec extends AnyFlatSpec {
+
+  "FileSplitOpDesc" should "propagate the input schema to every output port" in {
+    val splitDesc = new FileSplitOpDesc()
+    val sourceFileColumn = new Attribute("source_file", AttributeType.STRING)
+    val valueColumn = new Attribute("value", AttributeType.INTEGER)
+    val withSourceFile = Schema(List(sourceFileColumn, valueColumn))
+
+    val propagated = splitDesc.getExternalOutputSchemas(Map(PortIdentity() -> withSourceFile))
+
+    // Both output ports (ids 0 and 1) must carry the untouched input schema.
+    assert(propagated.keySet == Set(PortIdentity(), PortIdentity(1)))
+    assert(propagated.values.forall(_ == withSourceFile))
+  }
+
+  it should "reject inputs without a file identity column" in {
+    val splitDesc = new FileSplitOpDesc()
+    val valueOnly = Schema(List(new Attribute("value", AttributeType.INTEGER)))
+
+    val thrown = intercept[IllegalArgumentException] {
+      splitDesc.getExternalOutputSchemas(Map(PortIdentity() -> valueOnly))
+    }
+    // The message is expected to name both accepted column names.
+    val message = thrown.getMessage
+    assert(message.contains("source_file"))
+    assert(message.contains("filename"))
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExecSpec.scala
new file mode 100644
index 00000000000..7b04ec961ba
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExecSpec.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.fileSplit
+
+import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple}
+import org.apache.texera.amber.core.workflow.PortIdentity
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+
+class FileSplitOpExecSpec extends AnyFlatSpec {
+
+  "FileSplitOpExec" should "keep rows from the same file on the same output port" in {
+    val desc = new FileSplitOpDesc()
+    val exec = new FileSplitOpExec(objectMapper.writeValueAsString(desc))
+    val schema = Schema(
+      List(
+        new Attribute("source_file", AttributeType.STRING),
+        new Attribute("value", AttributeType.INTEGER)
+      )
+    )
+
+    exec.open()
+    val outputs = List(
+      Tuple(schema, Array[Any]("a.csv", 1)),
+      Tuple(schema, Array[Any]("b.csv", 2)),
+      Tuple(schema, Array[Any]("a.csv", 3)),
+      Tuple(schema, Array[Any]("c.csv", 4))
+    ).flatMap(tuple => exec.processTupleMultiPort(tuple, 0).toList)
+    exec.close()
+
+    assert(outputs.map(_._2.get) == List(PortIdentity(), PortIdentity(1), PortIdentity(), PortIdentity()))
+  }
+
+  it should "auto-detect the filename column used by file scans" in {
+    val desc = new FileSplitOpDesc()
+    val exec = new FileSplitOpExec(objectMapper.writeValueAsString(desc))
+    val schema = Schema(
+      List(
+        new Attribute("filename", AttributeType.STRING),
+        new Attribute("content", AttributeType.BINARY)
+      )
+    )
+
+    exec.open()
+    val output = exec
+      .processTupleMultiPort(Tuple(schema, Array[Any]("cat.png", Array[Byte](1, 2, 3))), 0)
+      .next()
+    exec.close()
+
+    assert(output._2.contains(PortIdentity()))
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala
index 4437c018bd5..b5906e4edfd 100644
--- a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala
@@ -27,6 +27,9 @@ import org.apache.texera.amber.util.JSONUtils.objectMapper
 import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 
+import java.nio.file.Files
+import scala.jdk.CollectionConverters._
+
 class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter {
 
   var fileScanSourceOpDesc: FileScanSourceOpDesc = _
@@ -185,4 +188,64 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter {
     FileScanSourceOpExec.close()
   }
 
+  it should "read a folder of binary files and preserve relative file names" in {
+    val dir = Files.createTempDirectory("file-scan-image-folder-")
+    try {
+      Files.write(dir.resolve("cat.png"), Array[Byte](1, 2, 3))
+      Files.write(dir.resolve("dog.png"), Array[Byte](4, 5, 6))
+
+      fileScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(dir.toString))
+      fileScanSourceOpDesc.attributeType = FileAttributeType.BINARY
+      fileScanSourceOpDesc.outputFileName = true
+
+      val exec = new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc))
+      exec.open()
+      val tuples = exec
+        .produceTuple()
+        .map(_.asInstanceOf[SchemaEnforceable].enforceSchema(fileScanSourceOpDesc.sourceSchema()))
+        .toList
+      exec.close()
+
+      assert(tuples.map(_.getField[String]("filename")) == List("cat.png", "dog.png"))
+      assert(tuples.map(_.getField[Array[Byte]]("line").toList) == List(List[Byte](1, 2, 3), List[Byte](4, 5, 6)))
+    } finally deleteRecursively(dir)
+  }
+
+  it should "preserve relative file names for line-based folder scans" in {
+    val dir = Files.createTempDirectory("file-scan-text-folder-")
+    try {
+      Files.writeString(dir.resolve("a.txt"), "line-a\n")
+      Files.writeString(dir.resolve("b.txt"), "line-b\n")
+
+      fileScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(dir.toString))
+      fileScanSourceOpDesc.attributeType = FileAttributeType.STRING
+      fileScanSourceOpDesc.outputFileName = true
+
+      val exec = new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc))
+      exec.open()
+      val tuples = exec
+        .produceTuple()
+        .map(_.asInstanceOf[SchemaEnforceable].enforceSchema(fileScanSourceOpDesc.sourceSchema()))
+        .toList
+      exec.close()
+
+      assert(tuples.map(_.getField[String]("filename")) == List("a.txt", "b.txt"))
+      assert(tuples.map(_.getField[String]("line")) == List("line-a", "line-b"))
+    } finally deleteRecursively(dir)
+  }
+
+  /** Deletes `path` and everything beneath it, deepest entries first. */
+  private def deleteRecursively(path: java.nio.file.Path): Unit = {
+    // Files.walk holds open directory handles until its stream is closed, so close it explicitly.
+    val walk = Files.walk(path)
+    try {
+      walk
+        .iterator()
+        .asScala
+        .toSeq
+        .sortBy(_.getNameCount)(Ordering.Int.reverse)
+        .foreach(Files.deleteIfExists)
+    } finally walk.close()
+  }
+
 }
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSnifferSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSnifferSpec.scala
new file mode 100644
index 00000000000..82f349d2c2f
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSnifferSpec.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import org.scalatest.flatspec.AnyFlatSpec
+
+/** Delimiter and header detection checks for CSVDialectSniffer on small inline samples. */
+class CSVDialectSnifferSpec extends AnyFlatSpec {
+
+  "CSVDialectSniffer" should "detect comma as delimiter for plain CSV" in {
+    val sniffed = CSVDialectSniffer.sniff("id,name,age\n1,Ada,36\n2,Lin,29\n3,Bob,42\n")
+    assert(sniffed.delimiter == ',')
+    assert(sniffed.hasHeader)
+  }
+
+  it should "detect tab as delimiter for TSV-like content" in {
+    val sniffed = CSVDialectSniffer.sniff("id\tname\tage\n1\tAda\t36\n2\tLin\t29\n3\tBob\t42\n")
+    assert(sniffed.delimiter == '\t')
+    assert(sniffed.hasHeader)
+  }
+
+  it should "detect semicolon as delimiter when commas are absent" in {
+    val sniffed = CSVDialectSniffer.sniff("id;name;age\n1;Ada;36\n2;Lin;29\n3;Bob;42\n")
+    assert(sniffed.delimiter == ';')
+  }
+
+  it should "detect missing header when all rows look like data" in {
+    // Every row, including the first, has the same number/name/number shape, so the
+    // first row gives the sniffer no reason to treat it as a header.
+    val sniffed = CSVDialectSniffer.sniff("1,Ada,36\n2,Lin,29\n3,Bob,42\n4,Eve,55\n")
+    assert(sniffed.delimiter == ',')
+    assert(!sniffed.hasHeader)
+  }
+
+  it should "honor a preferred delimiter when the content is consistent with it" in {
+    val sniffed = CSVDialectSniffer.sniff("a,b,c\n1,2,3\n4,5,6\n", preferred = Some(','))
+    assert(sniffed.delimiter == ',')
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetectorSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetectorSpec.scala
new file mode 100644
index 00000000000..cecc74034f5
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetectorSpec.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import org.scalatest.flatspec.AnyFlatSpec
+
+import java.nio.charset.StandardCharsets
+
+class FormatDetectorSpec extends AnyFlatSpec {
+
+  private val utf8 = StandardCharsets.UTF_8
+
+  "FormatDetector" should "detect Parquet by magic bytes" in {
+    val bytes = "PAR1".getBytes(utf8) ++ Array.fill(20)(0.toByte)
+    assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.PARQUET)
+  }
+
+  it should "detect XLSX by ZIP magic bytes" in {
+    val bytes = Array[Byte](0x50, 0x4b, 0x03, 0x04, 0, 0, 0, 0)
+    assert(FormatDetector.detect(Some("foo.xlsx"), bytes, utf8) == SmartFileFormat.EXCEL)
+  }
+
+  it should "not classify a generic ZIP container as Excel" in {
+    val bytes = Array[Byte](0x50, 0x4b, 0x03, 0x04, 0, 0, 0, 0)
+    assert(FormatDetector.detect(Some("archive.zip"), bytes, utf8) == SmartFileFormat.TEXT)
+  }
+
+  it should "detect Arrow by ARROW1 magic" in {
+    val bytes = "ARROW1\u0000\u0000".getBytes(utf8)
+    assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.ARROW)
+  }
+
+  it should "detect TSV when content contains tabs and extension matches" in {
+    val bytes = "id\tname\tage\n1\tAda\t36\n2\tLin\t29\n".getBytes(utf8)
+    assert(FormatDetector.detect(Some("users.tsv"), bytes, utf8) == SmartFileFormat.TSV)
+  }
+
+  it should "follow the .csv extension even when the content is tab-delimited" in {
+    val bytes = "id\tname\tage\n1\tAda\t36\n2\tLin\t29\n".getBytes(utf8)
+    val detected = FormatDetector.detect(Some("misnamed.csv"), bytes, utf8)
+    // The .csv extension wins over content sniffing — that's the expected ranking.
+    assert(detected == SmartFileFormat.CSV)
+  }
+
+  it should "fall back to content sniffing when extension is unknown" in {
+    val bytes = "id\tname\n1\tAda\n2\tLin\n".getBytes(utf8)
+    assert(FormatDetector.detect(Some("blob.bin"), bytes, utf8) == SmartFileFormat.TSV)
+  }
+
+  it should "detect JSONL when multiple lines start with {" in {
+    val bytes = "{\"a\":1}\n{\"a\":2}\n{\"a\":3}\n".getBytes(utf8)
+    assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.JSONL)
+  }
+
+  it should "detect JSON array when content starts with [" in {
+    val bytes = "[ {\"a\":1}, {\"a\":2} ]".getBytes(utf8)
+    assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.JSON)
+  }
+
+  it should "detect plain text when there are no delimiters" in {
+    val bytes = "hello world\nthis is text\n".getBytes(utf8)
+    assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.TEXT)
+  }
+
+  it should "prefer extension over content sniffing for CSV" in {
+    val bytes = "a,b,c\n1,2,3\n".getBytes(utf8)
+    assert(FormatDetector.detect(Some("data.csv"), bytes, utf8) == SmartFileFormat.CSV)
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDescSpec.scala
new file mode 100644
index 00000000000..ec092c69d22
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDescSpec.scala
@@ -0,0 +1,307 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.example.data.simple.SimpleGroupFactory +import org.apache.parquet.hadoop.ParquetWriter +import org.apache.parquet.hadoop.example.GroupWriteSupport +import org.apache.parquet.schema.{MessageTypeParser, Type} +import org.apache.poi.xssf.usermodel.XSSFWorkbook +import org.apache.texera.amber.core.storage.FileResolver +import org.apache.texera.amber.core.tuple.AttributeType +import org.apache.texera.amber.operator.TestOperators +import org.scalatest.flatspec.AnyFlatSpec + +import java.awt.image.BufferedImage +import java.io.{File, FileOutputStream} +import javax.imageio.ImageIO +import java.nio.file.Files +import java.nio.charset.StandardCharsets +import scala.jdk.CollectionConverters._ + +class SmartFileSourceOpDescSpec extends AnyFlatSpec { + + "SmartFileSourceOpDesc.operatorInfo" should "advertise the broader Smart Source name" in { + val desc = new SmartFileSourceOpDesc() + + assert(desc.operatorInfo.userFriendlyName == "Smart Source") + } + + "SmartFileSourceOpDesc" should "infer CSV format and schema from a CSV file" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.CountrySalesSmallCsvPath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + assert(result.csvDelimiter.contains(",")) + assert(result.csvHasHeader.contains(true)) + 
assert(result.schema.getAttributes.length == 14) + assert(result.schema.getAttribute("Order ID").getType == AttributeType.INTEGER) + } + + it should "infer JSONL format and schema from a JSONL file" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.smallJsonLPath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.JSONL) + assert(result.schema.getAttributes.nonEmpty) + } + + it should "respect a formatOverride from the user" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.CountrySalesSmallCsvPath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + desc.formatOverride = SmartFileFormat.CSV + desc.customDelimiter = Some(",") + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + } + + it should "infer plain text format for a .txt file" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.TestTextFilePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.TEXT) + assert(result.schema.getAttributeNames == List("line")) + assert(result.schema.getAttribute("line").getType == AttributeType.STRING) + } + + it should "infer string columns for a header-only CSV file" in { + val tmp = Files.createTempFile("smartfile-header-only-", ".csv") + try { + Files.writeString(tmp, "id,name,score\n", StandardCharsets.UTF_8) + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(tmp.toFile.getAbsolutePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + assert(result.schema.getAttributeNames == List("id", "name", "score")) + assert(result.schema.getAttributes.forall(_.getType == AttributeType.STRING)) + } finally Files.deleteIfExists(tmp) + 
} + + it should "infer one schema for a folder of similar CSV files" in { + val dir = Files.createTempDirectory("smartfile-folder-") + try { + Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + assert(result.isFolder) + assert(result.fileCount == 2) + assert(result.schema.getAttributeNames == List("id", "name")) + } finally deleteRecursively(dir) + } + + it should "infer image folders as image records" in { + val dir = Files.createTempDirectory("smartfile-image-folder-") + try { + writePng(dir.resolve("cat.png").toFile, width = 3, height = 2) + writePng(dir.resolve("dog.png").toFile, width = 4, height = 5) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.IMAGE) + assert(result.isFolder) + assert(result.fileCount == 2) + assert(result.schema.getAttributeNames == List("image", "format", "width", "height")) + assert(result.schema.getAttribute("image").getType == AttributeType.BINARY) + assert(result.schema.getAttribute("format").getType == AttributeType.STRING) + assert(result.schema.getAttribute("width").getType == AttributeType.INTEGER) + assert(result.schema.getAttribute("height").getType == AttributeType.INTEGER) + } finally deleteRecursively(dir) + } + + it should "append a source file column when folder provenance is enabled" in { + val dir = Files.createTempDirectory("smartfile-folder-source-column-") + try { + Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + 
Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + desc.includeSourceFile = true + + val schema = desc.sourceSchema() + assert(schema.getAttributeNames == List("id", "name", "source_file")) + assert(schema.getAttribute("source_file").getType == AttributeType.STRING) + } finally deleteRecursively(dir) + } + + it should "reject folders that mix file formats" in { + val dir = Files.createTempDirectory("smartfile-mixed-folder-") + try { + Files.writeString(dir.resolve("part.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + Files.writeString(dir.resolve("part.jsonl"), """{"id":2,"name":"Lin"}""" + "\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val err = intercept[IllegalArgumentException](desc.runInference()) + assert(err.getMessage.contains("same detected format")) + } finally deleteRecursively(dir) + } + + it should "reject empty folders" in { + val dir = Files.createTempDirectory("smartfile-empty-folder-") + try { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val err = intercept[IllegalArgumentException](desc.runInference()) + assert(err.getMessage.contains("does not contain any readable files")) + } finally deleteRecursively(dir) + } + + it should "infer schema from a generated Excel file" in { + val tmp = Files.createTempFile("smartfile-test-", ".xlsx").toFile + try { + writeExcel(tmp) + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(tmp.getAbsolutePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.EXCEL) + val attrs = 
result.schema.getAttributes + assert(attrs.length == 3) + assert(attrs.head.getName == "id") + assert(attrs(1).getName == "name") + assert(attrs(2).getName == "score") + assert(attrs.head.getType == AttributeType.INTEGER) + assert(attrs(2).getType == AttributeType.DOUBLE) + } finally tmp.delete() + } + + it should "infer schema from a generated Parquet file" in { + val tmp = Files.createTempFile("smartfile-test-", ".parquet").toFile + tmp.delete() // ParquetWriter wants to create the file itself + try { + writeParquet(tmp) + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(tmp.getAbsolutePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.PARQUET) + val attrs = result.schema.getAttributes + assert(attrs.length == 3) + assert(attrs.exists(_.getName == "id")) + assert(result.schema.getAttribute("id").getType == AttributeType.INTEGER) + assert(result.schema.getAttribute("name").getType == AttributeType.STRING) + assert(result.schema.getAttribute("score").getType == AttributeType.DOUBLE) + } finally tmp.delete() + } + + private def writeExcel(out: File): Unit = { + val workbook = new XSSFWorkbook() + try { + val sheet = workbook.createSheet("Sheet1") + val header = sheet.createRow(0) + header.createCell(0).setCellValue("id") + header.createCell(1).setCellValue("name") + header.createCell(2).setCellValue("score") + + val rows = Seq((1, "Ada", 36.5), (2, "Lin", 29.1), (3, "Bob", 42.0)) + rows.zipWithIndex.foreach { + case ((id, name, score), i) => + val row = sheet.createRow(i + 1) + row.createCell(0).setCellValue(id.toDouble) + row.createCell(1).setCellValue(name) + row.createCell(2).setCellValue(score) + } + val fos = new FileOutputStream(out) + try workbook.write(fos) + finally fos.close() + } finally workbook.close() + } + + private def writePng(out: File, width: Int, height: Int): Unit = { + val image = new BufferedImage(width, height, 
BufferedImage.TYPE_INT_RGB) + ImageIO.write(image, "png", out) + } + + private def writeParquet(out: File): Unit = { + val schemaStr = + """ + |message simple { + | required int32 id; + | required binary name (UTF8); + | required double score; + |} + """.stripMargin + val schema = MessageTypeParser.parseMessageType(schemaStr) + val conf = new Configuration(false) + conf.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem") + GroupWriteSupport.setSchema(schema, conf) + + val factory = new SimpleGroupFactory(schema) + val writer = new ParquetWriter[org.apache.parquet.example.data.Group]( + new Path(out.toURI), + new GroupWriteSupport(), + org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED, + ParquetWriter.DEFAULT_BLOCK_SIZE, + ParquetWriter.DEFAULT_PAGE_SIZE, + ParquetWriter.DEFAULT_PAGE_SIZE, + true, + false, + ParquetWriter.DEFAULT_WRITER_VERSION, + conf + ) + try { + writer.write(factory.newGroup().append("id", 1).append("name", "Ada").append("score", 36.5d)) + writer.write(factory.newGroup().append("id", 2).append("name", "Lin").append("score", 29.1d)) + } finally writer.close() + + // Avoid compiler unused-import warning for Type — keep an explicit reference here so that + // if MessageTypeParser ever changes its return type the compile fails loudly. 
+    val _: Type = schema
+  }
+
+  /** Deletes `path` and everything under it, children before their parent directories. */
+  private def deleteRecursively(path: java.nio.file.Path): Unit = {
+    // Files.walk holds open directory handles until its stream is closed.
+    val walk = Files.walk(path)
+    try {
+      val deepestFirst = walk.iterator().asScala.toSeq.sortBy(_.getNameCount)(Ordering.Int.reverse)
+      deepestFirst.foreach(Files.deleteIfExists)
+    } finally walk.close()
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExecSpec.scala
new file mode 100644
index 00000000000..13cc2d80cd1
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExecSpec.scala
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import org.apache.texera.amber.core.storage.FileResolver
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+
+import java.awt.image.BufferedImage
+import java.io.File
+import javax.imageio.ImageIO
+import java.nio.charset.StandardCharsets
+import java.nio.file.Files
+import scala.jdk.CollectionConverters._
+
+/** Exercises SmartFileSourceOpExec against temporary folders of CSV and PNG files. */
+class SmartFileSourceOpExecSpec extends AnyFlatSpec {
+
+  "SmartFileSourceOpExec" should "read a folder of similar CSV files as one source" in {
+    val dir = Files.createTempDirectory("smartfile-folder-exec-")
+    try {
+      Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8)
+      Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8)
+
+      val desc = new SmartFileSourceOpDesc()
+      desc.fileName = Some(dir.toString)
+      desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get))
+
+      val tuples = readAll(desc)
+
+      assert(tuples.size == 2)
+      assert(tuples.map(_.getFields(0)) == List(1, 2))
+      assert(tuples.map(_.getFields(1)) == List("Ada", "Lin"))
+    } finally deleteRecursively(dir)
+  }
+
+  it should "preserve the originating file for folder rows when enabled" in {
+    val dir = Files.createTempDirectory("smartfile-folder-source-column-exec-")
+    try {
+      Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8)
+      Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8)
+
+      val desc = new SmartFileSourceOpDesc()
+      desc.fileName = Some(dir.toString)
+      desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get))
+      desc.includeSourceFile = true
+
+      val tuples = readAll(desc)
+
+      assert(tuples.map(_.getFields.last) == List("2025-01.csv", "2025-02.csv"))
+    } finally deleteRecursively(dir)
+  }
+
+  it should "read image folders as image records with metadata" in {
+    val dir = Files.createTempDirectory("smartfile-image-folder-exec-")
+    try {
+      writePng(dir.resolve("cat.png").toFile, width = 3, height = 2)
+      writePng(dir.resolve("dog.png").toFile, width = 4, height = 5)
+
+      val desc = new SmartFileSourceOpDesc()
+      desc.fileName = Some(dir.toString)
+      desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get))
+      desc.includeSourceFile = true
+
+      val tuples = readAll(desc)
+
+      assert(tuples.size == 2)
+      assert(tuples.map(_.getFields(0).asInstanceOf[Array[Byte]].nonEmpty) == List(true, true))
+      assert(tuples.map(_.getFields(1)) == List("png", "png"))
+      assert(tuples.map(_.getFields(2)) == List(3, 4))
+      assert(tuples.map(_.getFields(3)) == List(2, 5))
+      assert(tuples.map(_.getFields(4)) == List("cat.png", "dog.png"))
+    } finally deleteRecursively(dir)
+  }
+
+  /** Runs a SmartFileSourceOpExec built from `desc` and returns every tuple it produces. */
+  private def readAll(desc: SmartFileSourceOpDesc) = {
+    val exec = new SmartFileSourceOpExec(objectMapper.writeValueAsString(desc))
+    exec.open()
+    // Materialize inside the try so the executor is closed even when reading throws.
+    try exec.produceTuple().toList
+    finally exec.close()
+  }
+
+  /** Deletes `path` and everything under it, children before their parent directories. */
+  private def deleteRecursively(path: java.nio.file.Path): Unit = {
+    // Files.walk holds open directory handles until its stream is closed.
+    val walk = Files.walk(path)
+    try {
+      val deepestFirst = walk.iterator().asScala.toSeq.sortBy(_.getNameCount)(Ordering.Int.reverse)
+      deepestFirst.foreach(Files.deleteIfExists)
+    } finally walk.close()
+  }
+
+  /** Writes a freshly allocated `width` x `height` RGB image to `out` as a PNG. */
+  private def writePng(out: File, width: Int, height: Int): Unit =
+    ImageIO.write(new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB), "png", out)
+}
diff --git a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts
index 2eda4b53bf6..89ae3fe95ff 100644
--- 
a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts +++ b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts @@ -58,6 +58,9 @@ export class UserDatasetVersionFiletreeComponent implements AfterViewInit { @Input() public isExpandAllAfterViewInit = false; + @Input() + public isDirectorySelectable = false; + @ViewChild("tree") tree: any; @Output() @@ -69,9 +72,13 @@ export class UserDatasetVersionFiletreeComponent implements AfterViewInit { actionMapping: { mouse: { click: (tree: any, node: any, $event: any) => { + const isDirectory = node.data.type === "directory"; + if (isDirectory && this.isDirectorySelectable) { + this.selectedTreeNode.emit(node.data); + } if (node.hasChildren) { TREE_ACTIONS.TOGGLE_EXPANDED(tree, node, $event); - } else { + } else if (!isDirectory) { this.selectedTreeNode.emit(node.data); } }, diff --git a/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts b/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts index 5de61b33860..55bb8450ae5 100644 --- a/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts +++ b/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts @@ -62,6 +62,7 @@ export class DatasetFileSelectorComponent extends FieldType { nzData: { fileMode: true, selectedPath: this.formControl.getRawValue(), + allowDirectorySelection: this.props["allowFolderSelection"] === true, }, nzBodyStyle: { resize: "both", diff --git a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html index f8189ddb3ff..d6b43f767b7 100644 --- 
a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html +++ b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html @@ -55,6 +55,7 @@
diff --git a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts index 7f70792f937..9ec8a0809c4 100644 --- a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts +++ b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts @@ -53,9 +53,10 @@ import { ɵNzTransitionPatchDirective } from "ng-zorro-antd/core/transition-patc ], }) export class DatasetSelectionModalComponent implements OnInit { - private readonly data = inject(NZ_MODAL_DATA) as { + public readonly data = inject(NZ_MODAL_DATA) as { fileMode: boolean; selectedPath?: string | null; + allowDirectorySelection?: boolean; }; datasets: ReadonlyArray = []; diff --git a/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html b/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html index 1f2c2963f29..ab8f203420c 100644 --- a/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html +++ b/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html @@ -88,6 +88,30 @@

{{ operatorDescription }}

+
+ Detecting file format... +
+ +
+ Detected + {{ smartFileInferenceSummary.detectedFormat }} + + Folder: {{ smartFileInferenceSummary.fileCount }} files + + + Delimiter: {{ delimiter }} + + + Header: {{ smartFileInferenceSummary.hasHeader ? "yes" : "no" }} + + Sheet: {{ smartFileInferenceSummary.sheetName }} + {{ smartFileInferenceSummary.schema.length }} columns +
+
= new Subject(); + /** Prevent duplicate inference calls for the same operator/file pair. */ + private smartFileLastInferenceKey: string | undefined; + private smartFileInferenceByOperator = new Map(); + public smartFileInferenceSummary?: SmartFileInferenceResponse; + public smartFileInferenceLoading = false; + constructor( private formlyJsonschema: FormlyJsonschema, private workflowActionService: WorkflowActionService, @@ -173,7 +185,8 @@ export class OperatorPropertyEditFrameComponent implements OnInit, OnChanges, On private changeDetectorRef: ChangeDetectorRef, private workflowVersionService: WorkflowVersionService, private workflowStatusSerivce: WorkflowStatusService, - private config: GuiConfigService + private config: GuiConfigService, + private smartFileInferenceService: SmartFileInferenceService ) {} ngOnChanges(changes: SimpleChanges): void { @@ -243,6 +256,11 @@ export class OperatorPropertyEditFrameComponent implements OnInit, OnChanges, On this.setFormlyFormBinding(this.currentOperatorSchema.jsonSchema); this.formTitle = operator.customDisplayName ?? this.currentOperatorSchema.additionalMetadata.userFriendlyName; this.operatorDescription = this.currentOperatorSchema.additionalMetadata.operatorDescription; + this.smartFileInferenceSummary = + this.currentOperatorSchema.operatorType === SMART_FILE_SCAN_TYPE + ? this.smartFileInferenceByOperator.get(operator.operatorID) + : undefined; + this.smartFileInferenceLoading = false; /** * Important: make a deep copy of the initial property data object. * Prevent the form directly changes the value in the texera graph without going through workflow action service. 
@@ -349,10 +367,120 @@ export class OperatorPropertyEditFrameComponent implements OnInit, OnChanges, On this.typeInferenceOnLambdaFunction(formData); this.workflowActionService.setOperatorProperty(this.currentOperatorId, cloneDeep(formData)); this.listeningToChange = true; + this.runSmartFileInferenceIfNeeded(formData); } }); } + /** + * For `SmartFileScan` operators, when the user picks a new file the backend can sniff the + * format, dialect, and schema and tell us what to prefill. This method only fires once per + * fileName change (so editing other fields doesn't re-trigger it) and silently no-ops for any + * other operator type. + */ + private runSmartFileInferenceIfNeeded(formData: Record): void { + if (!this.currentOperatorId) return; + if (this.currentOperatorSchema?.operatorType !== SMART_FILE_SCAN_TYPE) return; + const fileName = formData?.["fileName"]; + if (typeof fileName !== "string" || fileName.length === 0) return; + const operatorIdAtRequestTime = this.currentOperatorId; + const inferenceKey = `${operatorIdAtRequestTime}:${fileName}`; + if (inferenceKey === this.smartFileLastInferenceKey) return; + this.smartFileLastInferenceKey = inferenceKey; + this.smartFileInferenceByOperator.delete(operatorIdAtRequestTime); + this.smartFileInferenceSummary = undefined; + this.smartFileInferenceLoading = true; + + const formatOverride = formData["formatOverride"]; + const requestFormat = + typeof formatOverride === "string" && formatOverride !== "Auto-detect" && formatOverride !== "AUTO" + ? formatOverride + : undefined; + const customDelimiter = formData["customDelimiter"]; + const hasHeader = formData["hasHeader"]; + const sheetName = formData["sheetName"]; + const flatten = formData["flatten"]; + const fileEncoding = formData["fileEncoding"]; + + this.smartFileInferenceService + .preview({ + fileName, + fileEncoding: typeof fileEncoding === "string" ? 
fileEncoding : undefined, + formatOverride: requestFormat, + customDelimiter: + typeof customDelimiter === "string" && customDelimiter.length > 0 ? customDelimiter : undefined, + hasHeader: typeof hasHeader === "boolean" ? hasHeader : undefined, + sheetName: typeof sheetName === "string" && sheetName.length > 0 ? sheetName : undefined, + flatten: typeof flatten === "boolean" ? flatten : undefined, + }) + .pipe(untilDestroyed(this)) + .subscribe({ + next: response => this.applySmartFileInference(operatorIdAtRequestTime, fileName, response), + error: (err: unknown) => { + if (this.currentOperatorId === operatorIdAtRequestTime) { + this.smartFileInferenceLoading = false; + } + if (this.smartFileLastInferenceKey === inferenceKey) { + this.smartFileLastInferenceKey = undefined; + } + // Surface as a non-blocking warning. Sniffing failure shouldn't break the workflow — + // the operator's own sourceSchema() call will re-attempt at compile time. + this.notificationService.warning(`Could not auto-detect file: ${this.smartFileInferenceErrorMessage(err)}`); + }, + }); + } + + private applySmartFileInference( + operatorIdAtRequestTime: string, + fileNameAtRequestTime: string, + response: SmartFileInferenceResponse + ): void { + const operator = this.workflowActionService.getTexeraGraph().getOperator(operatorIdAtRequestTime); + if (!operator) return; + // Drop stale responses — user may have already changed the file again. 
+    if (operator.operatorProperties["fileName"] !== fileNameAtRequestTime) {
+      // Key unchanged => no newer request was issued (file cleared): unstick spinner and de-dup key.
+      if (this.smartFileLastInferenceKey === `${operatorIdAtRequestTime}:${fileNameAtRequestTime}`) {
+        this.smartFileLastInferenceKey = undefined;
+        if (this.currentOperatorId === operatorIdAtRequestTime) this.smartFileInferenceLoading = false;
+      }
+      return;
+    }
+
+    const merged: Record<string, unknown> = { ...operator.operatorProperties };
+    merged["formatOverride"] = response.detectedFormat;
+    for (const key of ["customDelimiter", "hasHeader", "sheetName", "flatten"] as const) {
+      if (response[key] !== null && response[key] !== undefined) merged[key] = response[key];
+    }
+    const sourceFileColumnExists = response.schema.some(column => column.name.toLowerCase() === "source_file");
+    if (response.isFolder && !sourceFileColumnExists && merged["includeSourceFile"] === undefined) {
+      merged["includeSourceFile"] = true;
+    }
+    this.smartFileInferenceByOperator.set(operatorIdAtRequestTime, response);
+    if (this.currentOperatorId === operatorIdAtRequestTime) {
+      this.smartFileInferenceSummary = response;
+      this.smartFileInferenceLoading = false;
+    }
+    this.workflowActionService.setOperatorProperty(operatorIdAtRequestTime, merged);
+  }
+
+  /** Display name for a delimiter: "tab" / "space" for whitespace, the raw value otherwise. */
+  public formatSmartFileDelimiter(delimiter: string | null): string | undefined {
+    if (delimiter === null) return undefined;
+    if (delimiter === "\t") return "tab";
+    if (delimiter === " ") return "space";
+    return delimiter;
+  }
+
+  /** Extracts `err.error.message`, else `err.message`, else falls back to "unknown error". */
+  private smartFileInferenceErrorMessage(err: unknown): string {
+    if (typeof err !== "object" || err === null) return "unknown error";
+    const maybeError = err as { error?: { message?: unknown }; message?: unknown };
+    if (typeof maybeError.error?.message === "string") return maybeError.error.message;
+    if (typeof maybeError.message === "string") return maybeError.message;
+    return "unknown error";
+  }
+
   typeInferenceOnLambdaFunction(formData: any): void {
     if 
(!this.currentOperatorId?.includes("PythonLambdaFunction")) { return; @@ -468,6 +596,12 @@ export class OperatorPropertyEditFrameComponent implements OnInit, OnChanges, On // if the title is fileName, then change it to custom autocomplete input template if (mappedField.key === "fileName") { mappedField.type = "inputautocomplete"; + mappedField.props = { + ...mappedField.props, + allowFolderSelection: + this.currentOperatorSchema?.operatorType === this.smartFileScanType || + this.currentOperatorSchema?.operatorType === "FileScan", + }; } if (mappedField.key === "datasetVersionPath") { diff --git a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.spec.ts b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.spec.ts new file mode 100644 index 00000000000..1b5428892c7 --- /dev/null +++ b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.spec.ts @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { isImageDataUrl } from "./result-table-cell.utils"; + +describe("isImageDataUrl", () => { + it("should recognize supported image data URLs", () => { + expect(isImageDataUrl("data:image/png;base64,AAAA")).toBe(true); + expect(isImageDataUrl("data:image/jpeg;base64,BBBB")).toBe(true); + expect(isImageDataUrl("data:image/webp;base64,CCCC")).toBe(true); + }); + + it("should reject binary previews and non-image strings", () => { + expect(isImageDataUrl("")).toBe(false); + expect(isImageDataUrl("data:text/plain;base64,AAAA")).toBe(false); + expect(isImageDataUrl(42)).toBe(false); + }); +}); diff --git a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.ts b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.ts new file mode 100644 index 00000000000..830551304b9 --- /dev/null +++ b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.ts @@ -0,0 +1,22 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +export function isImageDataUrl(value: unknown): value is string { + return typeof value === "string" && /^data:image\/(?:png|jpeg|gif|webp);base64,/i.test(value); +} diff --git a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html index 5400d978ee3..6fc0b49dd89 100644 --- a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html +++ b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html @@ -161,7 +161,14 @@
- {{ column.getCell(row) }} + + + {{ column.getCell(row) }} + + +
+
+ +
+ +
+
Visual Journey
+
{{ currentTrace.title }}
+
+ {{ currentTrace.subtitle }} +
+
+ +
+ {{ currentTrace.heroMetric.label }} + {{ currentTrace.heroMetric.value }} +
+
+ +

+ {{ currentTrace.summary }} +

+ + +
+
+
{{ i + 1 }}
+
+ + {{ getKindLabel(step.kind) }} +
+
+
+ +
+
+
+
+
+ +
+ + {{ i + 1 }} +
+ +
+
+ {{ getKindLabel(step.kind) }} + + + {{ getStepLabel(step) }} + +
+ +

{{ step.title }}

+

{{ step.detail }}

+ +
+
+ {{ metric.label }} + {{ metric.value }} +
+
+
+
+
+ + diff --git a/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.scss b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.scss new file mode 100644 index 00000000000..4adb8bade0c --- /dev/null +++ b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.scss @@ -0,0 +1,346 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +:host { + position: fixed; + inset: 0; + z-index: 6; + pointer-events: none; +} + +.trace-panel { + position: absolute; + top: 74px; + right: 14px; + bottom: 14px; + width: min(420px, calc(100vw - 28px)); + background: #fff; + border: 1px solid #dfe5ec; + box-shadow: 0 18px 42px rgba(19, 29, 40, 0.18); + display: flex; + flex-direction: column; + overflow: hidden; + pointer-events: auto; +} + +.trace-header { + padding: 18px 18px 14px; + border-bottom: 1px solid #e8edf3; + background: + linear-gradient(135deg, rgba(255, 244, 214, 0.9), rgba(232, 247, 255, 0.94)), + #fff; +} + +.close-button { + position: absolute; + top: 12px; + right: 12px; + border: 0; + background: #fff; + width: 30px; + height: 30px; + display: grid; + place-items: center; + color: #253040; + cursor: pointer; +} + +.hero { + display: grid; + grid-template-columns: 74px 1fr; + gap: 12px; + align-items: center; + padding-right: 34px; +} + +.hero-media { + width: 74px; + height: 74px; + background: #fff; + border: 1px solid #d9e0e8; + display: grid; + place-items: center; +} + +.hero-media img { + max-width: 64px; + max-height: 64px; + object-fit: contain; + image-rendering: pixelated; +} + +.hero-copy { + min-width: 0; +} + +.hero-kicker { + color: #46617f; + font-size: 11px; + text-transform: uppercase; + letter-spacing: 0; +} + +.hero-title { + color: #17202d; + font-size: 20px; + font-weight: 700; + line-height: 1.2; +} + +.hero-subtitle { + color: #556577; + font-size: 13px; + margin-top: 4px; +} + +.hero-metric { + grid-column: 1 / -1; + justify-self: start; + display: inline-flex; + align-items: baseline; + gap: 8px; + margin-top: 10px; + padding: 7px 10px; + background: #17202d; + color: #fff; +} + +.hero-metric span, +.metric-pill span { + font-size: 11px; + color: inherit; + opacity: 0.74; +} + +.hero-metric strong { + font-size: 18px; +} + +.trace-summary { + margin: 12px 0 0; + color: #334255; + font-size: 13px; + line-height: 1.45; +} + +.filmstrip { + display: flex; + gap: 
10px; + padding: 14px 18px; + overflow-x: auto; + border-bottom: 1px solid #edf1f5; +} + +.film-frame { + position: relative; + flex: 0 0 auto; + width: 72px; +} + +.film-index { + position: absolute; + top: -5px; + left: -5px; + z-index: 1; + width: 20px; + height: 20px; + display: grid; + place-items: center; + background: #17202d; + color: #fff; + font-size: 11px; +} + +.film-image { + width: 72px; + height: 72px; + border: 1px solid #dfe5ec; + background: #fff; + display: grid; + place-items: center; +} + +.film-image img { + width: 100%; + height: 100%; + object-fit: contain; + image-rendering: pixelated; +} + +.film-image--empty { + background: #f3f6f8; + color: #526375; + font-size: 11px; + text-align: center; +} + +.trace-steps { + padding: 18px; + overflow-y: auto; +} + +.trace-step { + position: relative; + display: grid; + grid-template-columns: 16px 68px 1fr; + gap: 12px; + min-height: 92px; + padding-bottom: 18px; +} + +.trace-step:last-child { + padding-bottom: 0; +} + +.trace-line { + position: relative; + display: flex; + justify-content: center; +} + +.trace-line::after { + content: ""; + position: absolute; + top: 18px; + bottom: -18px; + width: 2px; + background: #d9e0e8; +} + +.trace-step:last-child .trace-line::after { + display: none; +} + +.trace-dot { + position: relative; + z-index: 1; + width: 12px; + height: 12px; + margin-top: 6px; + background: #62768d; +} + +.trace-step--source .trace-dot { + background: #1f8a70; +} + +.trace-step--match .trace-dot { + background: #2b6cb0; +} + +.trace-step--compute .trace-dot { + background: #d97706; +} + +.trace-step--render .trace-dot { + background: #b83280; +} + +.step-media { + width: 68px; + height: 68px; + border: 1px solid #dfe5ec; + background: #f7f9fb; + display: grid; + place-items: center; + color: #526375; + font-size: 16px; + font-weight: 700; +} + +.step-media img { + width: 100%; + height: 100%; + object-fit: contain; + image-rendering: pixelated; +} + +.step-body { + min-width: 0; +} 
+ +.step-meta { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-bottom: 6px; +} + +.kind-chip, +.operator-chip { + border: 1px solid #d9e0e8; + background: #fff; + color: #314255; + padding: 3px 7px; + font-size: 11px; + line-height: 1.2; +} + +.operator-chip { + cursor: pointer; +} + +.operator-chip--static { + cursor: default; +} + +.step-body h3 { + margin: 0; + color: #17202d; + font-size: 14px; + font-weight: 700; + line-height: 1.35; +} + +.step-body p { + margin: 5px 0 0; + color: #556577; + font-size: 12px; + line-height: 1.45; +} + +.metric-row { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-top: 8px; +} + +.metric-pill { + display: inline-flex; + align-items: baseline; + gap: 5px; + padding: 4px 7px; + background: #f3f6f8; + color: #253040; +} + +.metric-pill strong { + font-size: 12px; +} + +@media (max-width: 720px) { + .trace-panel { + top: 60px; + right: 8px; + bottom: 8px; + width: calc(100vw - 16px); + } +} + diff --git a/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.spec.ts b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.spec.ts new file mode 100644 index 00000000000..d5ff3f4f6ad --- /dev/null +++ b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.spec.ts @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { ComponentFixture, TestBed } from "@angular/core/testing"; +import { BehaviorSubject } from "rxjs"; +import { VisualTracePanelComponent } from "./visual-trace-panel.component"; +import { VisualTraceService } from "../../service/visual-trace/visual-trace.service"; +import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service"; +import { VisualTrace } from "../../types/visual-trace.interface"; + +describe("VisualTracePanelComponent", () => { + let fixture: ComponentFixture<VisualTracePanelComponent>; + let component: VisualTracePanelComponent; + let traceSubject: BehaviorSubject<VisualTrace | undefined>; // undefined = no trace open + + beforeEach(async () => { + traceSubject = new BehaviorSubject<VisualTrace | undefined>(undefined); + + await TestBed.configureTestingModule({ + imports: [VisualTracePanelComponent], + providers: [ + { + provide: VisualTraceService, + useValue: { + trace$: traceSubject.asObservable(), + closeTrace: vi.fn(), + }, + }, + { + provide: WorkflowActionService, + useValue: { + getTexeraGraph: () => ({ + hasOperator: vi.fn().mockReturnValue(true), + getOperator: vi.fn().mockReturnValue({ + operatorID: "op1", + operatorType: "PythonUDFV2", + customDisplayName: "Battle Logic", + }), + }), + highlightOperators: vi.fn(), + }, + }, + ], + }).compileComponents(); + + fixture = TestBed.createComponent(VisualTracePanelComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it("renders a visual journey with hero media, metrics, and ordered steps", () => { + traceSubject.next({ + title: "Charizard wins", + subtitle: "Fire matchup", + heroImage: 
"data:image/png;base64,abc", + heroMetric: { label: "Advantage", value: "2x" }, + steps: [ + { + title: "Loaded sprite", + operatorId: "op1", + image: "data:image/png;base64,abc", + metrics: [{ label: "Rows", value: "440" }], + }, + { + title: "Rendered result", + kind: "render", + }, + ], + }); + fixture.detectChanges(); + + const native = fixture.nativeElement as HTMLElement; + expect(native.querySelector(".trace-panel")).toBeTruthy(); + expect(native.querySelector(".hero-title")?.textContent).toContain("Charizard wins"); + expect(native.querySelector(".hero-media img")).toBeTruthy(); + expect(native.querySelector(".hero-metric")?.textContent).toContain("2x"); + expect(native.querySelectorAll(".trace-step")).toHaveLength(2); + expect(native.textContent).toContain("Loaded sprite"); + expect(native.textContent).toContain("Rendered result"); + }); +}); + diff --git a/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.ts b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.ts new file mode 100644 index 00000000000..2962229517a --- /dev/null +++ b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.ts @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { NgClass, NgFor, NgIf } from "@angular/common"; +import { Component, OnInit } from "@angular/core"; +import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; +import { NzIconDirective } from "ng-zorro-antd/icon"; +import { VisualTraceService } from "../../service/visual-trace/visual-trace.service"; +import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service"; +import { VisualTrace, VisualTraceStep, VisualTraceStepKind } from "../../types/visual-trace.interface"; + +@UntilDestroy() +@Component({ + selector: "texera-visual-trace-panel", + templateUrl: "./visual-trace-panel.component.html", + styleUrls: ["./visual-trace-panel.component.scss"], + imports: [NgIf, NgFor, NgClass, NzIconDirective], +}) +export class VisualTracePanelComponent implements OnInit { + public trace?: VisualTrace; + + constructor( + private readonly visualTraceService: VisualTraceService, + private readonly workflowActionService: WorkflowActionService + ) {} + + ngOnInit(): void { + this.visualTraceService.trace$.pipe(untilDestroyed(this)).subscribe(trace => { + this.trace = trace; + }); + } + + public close(): void { + this.visualTraceService.closeTrace(); + } + + public focusOperator(step: VisualTraceStep): void { + if (!step.operatorId || !this.workflowActionService.getTexeraGraph().hasOperator(step.operatorId)) { + return; + } + this.workflowActionService.highlightOperators(false, step.operatorId); + } + + public getStepLabel(step: VisualTraceStep): string { + if (step.operatorLabel) { + return step.operatorLabel; + } + if (!step.operatorId || !this.workflowActionService.getTexeraGraph().hasOperator(step.operatorId)) { + return this.getKindLabel(step.kind); + } + const operator = this.workflowActionService.getTexeraGraph().getOperator(step.operatorId); + return operator.customDisplayName ?? 
operator.operatorType; + } + + public getKindLabel(kind?: VisualTraceStepKind): string { + switch (kind) { + case "source": + return "Source"; + case "match": + return "Match"; + case "compute": + return "Compute"; + case "render": + return "Render"; + default: + return "Step"; + } + } +} diff --git a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html index c092a4bf74a..0c759a0af5d 100644 --- a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html +++ b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html @@ -18,5 +18,7 @@ --> diff --git a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts index eb329c1c7f1..4602476e798 100644 --- a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts +++ b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts @@ -17,11 +17,22 @@ * under the License. 
*/ -import { AfterContentInit, Component, Input } from "@angular/core"; +import { AfterContentInit, Component, ElementRef, HostListener, Input, ViewChild } from "@angular/core"; import { DomSanitizer } from "@angular/platform-browser"; import { WorkflowResultService } from "../../service/workflow-result/workflow-result.service"; import { auditTime, filter } from "rxjs/operators"; import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; +import { VisualTraceService } from "../../service/visual-trace/visual-trace.service"; +import { + buildStructuralVisualTrace, + buildVisualTraceBridgeScript, + extractVisualTraceSelectionFromElement, + findVisualTraceElement, + parseVisualTraceMessage, + parseVisualTracePayloadAttribute, + parseVisualTraceSelectionMessage, +} from "../../service/visual-trace/visual-trace.utils"; +import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service"; @UntilDestroy() @Component({ @@ -32,13 +43,17 @@ import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; export class VisualizationFrameContentComponent implements AfterContentInit { // operatorId: string = inject(NZ_MODAL_DATA).operatorId; @Input() operatorId?: string; + @ViewChild("visualizationFrame") visualizationFrame?: ElementRef; // progressive visualization update and redraw interval in milliseconds public static readonly UPDATE_INTERVAL_MS = 2000; htmlData: any = ""; + private removeFrameClickListener?: () => void; constructor( private workflowResultService: WorkflowResultService, - private sanitizer: DomSanitizer + private sanitizer: DomSanitizer, + private visualTraceService: VisualTraceService, + private workflowActionService: WorkflowActionService ) {} ngAfterContentInit() { @@ -79,9 +94,77 @@ export class VisualizationFrameContentComponent implements AfterContentInit { const firstDiv = doc.body.querySelector("div"); if (firstDiv) firstDiv.style.height = "100%"; + const bridgeScript = doc.createElement("script"); + 
bridgeScript.textContent = buildVisualTraceBridgeScript(); + doc.body.appendChild(bridgeScript); + const serializer = new XMLSerializer(); const newHtmlString = serializer.serializeToString(doc); this.htmlData = this.sanitizer.bypassSecurityTrustHtml(newHtmlString); // this line bypasses angular security } + + @HostListener("window:message", ["$event"]) + handleWindowMessage(event: MessageEvent): void { + if (!this.visualizationFrame?.nativeElement.contentWindow || event.source !== this.visualizationFrame.nativeElement.contentWindow) { + return; // fail closed: only trust messages posted by this component's own iframe + } + const trace = parseVisualTraceMessage(event.data); + if (trace) { + this.visualTraceService.openTrace(trace); + return; + } + + const selection = parseVisualTraceSelectionMessage(event.data); + if (!selection || !this.operatorId) { + return; + } + this.openStructuralTrace(selection); + } + + onVisualizationFrameLoad(): void { + this.removeFrameClickListener?.(); + + const frameDocument = this.visualizationFrame?.nativeElement.contentDocument; + if (!frameDocument) { + return; + } + + const handleClick = (event: MouseEvent): void => { + const traceElement = findVisualTraceElement(event.target); + if (!traceElement) { + return; + } + + const trace = parseVisualTracePayloadAttribute(traceElement.getAttribute("data-texera-trace")); + if (trace) { + this.visualTraceService.openTrace(trace); + return; + } + + const selection = extractVisualTraceSelectionFromElement(traceElement); + if (selection && this.operatorId) { + this.openStructuralTrace(selection); + } + }; + + frameDocument.addEventListener("click", handleClick); + this.removeFrameClickListener = () => frameDocument.removeEventListener("click", handleClick); + } + + private openStructuralTrace(selection: { title?: string; image?: string; imageAlt?: string }): void { + if (!this.operatorId) { + return; + } + + const graph = this.workflowActionService.getTexeraGraph(); + const structuralTrace = buildStructuralVisualTrace(selection, this.operatorId, { + hasOperator: 
operatorId => graph.hasOperator(operatorId), + getOperator: operatorId => graph.getOperator(operatorId), + getInputOperatorIds: operatorId => graph.getInputLinksByOperatorId(operatorId).map(link => link.source.operatorID), + }); + if (structuralTrace) { + this.visualTraceService.openTrace(structuralTrace); + } + } } diff --git a/frontend/src/app/workspace/component/workspace.component.html b/frontend/src/app/workspace/component/workspace.component.html index c54446fb318..3d7dd8c9cb3 100644 --- a/frontend/src/app/workspace/component/workspace.component.html +++ b/frontend/src/app/workspace/component/workspace.component.html @@ -37,4 +37,5 @@ *ngIf="copilotEnabled" [agentIdToActivate]="agentIdToActivate"> + diff --git a/frontend/src/app/workspace/component/workspace.component.ts b/frontend/src/app/workspace/component/workspace.component.ts index 9968c26f647..f3af0802ba9 100644 --- a/frontend/src/app/workspace/component/workspace.component.ts +++ b/frontend/src/app/workspace/component/workspace.component.ts @@ -61,6 +61,7 @@ import { LeftPanelComponent } from "./left-panel/left-panel.component"; import { AgentPanelComponent } from "./agent/agent-panel/agent-panel.component"; import { PropertyEditorComponent } from "./property-editor/property-editor.component"; import { FormlyRepeatDndComponent } from "../../common/formly/repeat-dnd/repeat-dnd.component"; +import { VisualTracePanelComponent } from "./visual-trace-panel/visual-trace-panel.component"; export const SAVE_DEBOUNCE_TIME_IN_MS = 5000; @@ -83,6 +84,7 @@ export const SAVE_DEBOUNCE_TIME_IN_MS = 5000; NgIf, AgentPanelComponent, PropertyEditorComponent, + VisualTracePanelComponent, FormlyRepeatDndComponent, ], }) diff --git a/frontend/src/app/workspace/service/smart-file-inference/smart-file-inference.service.ts b/frontend/src/app/workspace/service/smart-file-inference/smart-file-inference.service.ts new file mode 100644 index 00000000000..2c48806aec4 --- /dev/null +++ 
b/frontend/src/app/workspace/service/smart-file-inference/smart-file-inference.service.ts @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { HttpClient } from "@angular/common/http"; +import { Injectable } from "@angular/core"; +import { Observable } from "rxjs"; +import { AppSettings } from "../../../common/app-setting"; + +export interface SmartFileInferenceColumn { + name: string; + type: string; +} + +export interface SmartFileInferenceResponse { + detectedFormat: string; + schema: SmartFileInferenceColumn[]; + customDelimiter: string | null; + hasHeader: boolean | null; + sheetName: string | null; + availableSheetNames: string[]; + flatten: boolean | null; + isFolder: boolean; + fileCount: number; +} + +export interface SmartFileInferenceRequest { + fileName: string; + fileEncoding?: string; + formatOverride?: string; + customDelimiter?: string; + hasHeader?: boolean; + sheetName?: string; + flatten?: boolean; +} + +/** Operator type string registered in LogicalOp.scala. */ +export const SMART_FILE_SCAN_TYPE = "SmartFileScan"; + +/** + * Talks to the backend `POST /api/file-inference/preview` endpoint that backs the + * SmartFileScan operator. 
The endpoint runs the same inference path the operator + * uses at workflow compile time, so what the user sees in the property panel is + * exactly what the workflow will produce for either one file or one folder. + */ +@Injectable({ + providedIn: "root", +}) +export class SmartFileInferenceService { + constructor(private http: HttpClient) {} + + preview(request: SmartFileInferenceRequest): Observable<SmartFileInferenceResponse> { + return this.http.post<SmartFileInferenceResponse>( + `${AppSettings.getApiEndpoint()}/file-inference/preview`, + request + ); + } +} diff --git a/frontend/src/app/workspace/service/visual-trace/visual-trace.service.ts b/frontend/src/app/workspace/service/visual-trace/visual-trace.service.ts new file mode 100644 index 00000000000..6e8a72e2203 --- /dev/null +++ b/frontend/src/app/workspace/service/visual-trace/visual-trace.service.ts @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { Injectable } from "@angular/core"; +import { BehaviorSubject } from "rxjs"; +import { VisualTrace } from "../../types/visual-trace.interface"; + +@Injectable({ + providedIn: "root", +}) +export class VisualTraceService { + private readonly traceSubject = new BehaviorSubject<VisualTrace | undefined>(undefined); // undefined = panel closed + public readonly trace$ = this.traceSubject.asObservable(); + + public openTrace(trace: VisualTrace): void { + this.traceSubject.next(trace); + } + + public closeTrace(): void { + this.traceSubject.next(undefined); + } +} + diff --git a/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.spec.ts b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.spec.ts new file mode 100644 index 00000000000..7005c898018 --- /dev/null +++ b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.spec.ts @@ -0,0 +1,224 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { + buildStructuralVisualTrace, + extractVisualTraceSelectionFromElement, + findVisualTraceElement, + parseVisualTraceMessage, + parseVisualTracePayloadAttribute, + parseVisualTraceSelectionMessage, +} from "./visual-trace.utils"; + +describe("parseVisualTraceMessage", () => { + it("accepts a valid visual trace message", () => { + expect( + parseVisualTraceMessage({ + type: "texera-visual-trace", + payload: { + title: "Charizard wins", + heroImage: "data:image/png;base64,abc", + steps: [ + { + title: "Loaded sprite", + kind: "source", + metrics: [{ label: "Rows", value: "440" }], + }, + ], + }, + }) + ).toEqual({ + title: "Charizard wins", + heroImage: "data:image/png;base64,abc", + steps: [ + { + title: "Loaded sprite", + kind: "source", + metrics: [{ label: "Rows", value: "440" }], + }, + ], + }); + }); + + it("rejects malformed or incomplete trace messages", () => { + expect(parseVisualTraceMessage(undefined)).toBeUndefined(); + expect(parseVisualTraceMessage({ type: "other", payload: {} })).toBeUndefined(); + expect(parseVisualTraceMessage({ type: "texera-visual-trace", payload: { title: "Missing steps" } })).toBeUndefined(); + expect( + parseVisualTraceMessage({ + type: "texera-visual-trace", + payload: { + title: "Bad step", + steps: [{ detail: "No title" }], + }, + }) + ).toBeUndefined(); + }); +}); + +describe("parseVisualTraceSelectionMessage", () => { + it("accepts a valid fallback selection message", () => { + expect( + parseVisualTraceSelectionMessage({ + type: "texera-visual-trace-selection", + payload: { + title: "Charizard", + image: "data:image/png;base64,abc", + imageAlt: "Charizard sprite", + }, + }) + ).toEqual({ + title: "Charizard", + image: "data:image/png;base64,abc", + imageAlt: "Charizard sprite", + }); + }); + + it("rejects malformed selection messages", () => { + expect(parseVisualTraceSelectionMessage(undefined)).toBeUndefined(); + expect(parseVisualTraceSelectionMessage({ type: "other", payload: {} })).toBeUndefined(); + 
expect(parseVisualTraceSelectionMessage({ type: "texera-visual-trace-selection", payload: {} })).toBeUndefined(); + }); +}); + +describe("buildStructuralVisualTrace", () => { + it("builds an upstream workflow journey when a visualization only reports the clicked image", () => { + const operators = { + source: { operatorID: "source", operatorType: "Smart Source", customDisplayName: "Pokemon Images" }, + udf: { operatorID: "udf", operatorType: "Python UDF", customDisplayName: "Map sprites" }, + visualizer: { operatorID: "visualizer", operatorType: "HTML Visualizer" }, + }; + const inputs = { + source: [], + udf: ["source"], + visualizer: ["udf"], + }; + + expect( + buildStructuralVisualTrace( + { title: "Charizard", image: "data:image/png;base64,abc", imageAlt: "Charizard sprite" }, + "visualizer", + { + hasOperator: (operatorId: string) => operatorId in operators, + getOperator: (operatorId: string) => operators[operatorId as keyof typeof operators], + getInputOperatorIds: (operatorId: string) => inputs[operatorId as keyof typeof inputs], + } + ) + ).toEqual({ + title: "Charizard", + subtitle: "Workflow path to HTML Visualizer", + summary: + "Auto-built from the upstream workflow graph. 
Add a trace payload in the visualization for row-level details.", + heroImage: "data:image/png;base64,abc", + heroImageAlt: "Charizard sprite", + heroMetric: { label: "Steps", value: "3" }, + steps: [ + { + title: "Pokemon Images", + operatorId: "source", + operatorLabel: "Pokemon Images", + kind: "source", + }, + { + title: "Map sprites", + operatorId: "udf", + operatorLabel: "Map sprites", + kind: "compute", + }, + { + title: "HTML Visualizer", + operatorId: "visualizer", + operatorLabel: "HTML Visualizer", + kind: "render", + image: "data:image/png;base64,abc", + imageAlt: "Charizard sprite", + }, + ], + }); + }); + + it("returns undefined when the visualizer operator is missing", () => { + expect( + buildStructuralVisualTrace( + { title: "Charizard", image: "data:image/png;base64,abc" }, + "missing", + { + hasOperator: () => false, + getOperator: () => { + throw new Error("should not be called"); + }, + getInputOperatorIds: () => [], + } + ) + ).toBeUndefined(); + }); +}); + +describe("visual trace DOM helpers", () => { + it("reads a rich trace payload from an element attribute", () => { + expect( + parseVisualTracePayloadAttribute( + JSON.stringify({ + title: "Charizard wins", + steps: [{ title: "Rendered card" }], + }) + ) + ).toEqual({ + title: "Charizard wins", + steps: [{ title: "Rendered card" }], + }); + }); + + it("finds an image-bearing ancestor and extracts a fallback selection", () => { + const card = document.createElement("div"); + card.className = "pokemon-side"; + card.innerHTML = ` +
WINNER
+ Charizard +
Charizard
+ `; + const badge = card.querySelector(".winner-badge"); + expect(badge).not.toBeNull(); + const traceElement = findVisualTraceElement(badge); + + expect(traceElement).toBe(card); + expect(extractVisualTraceSelectionFromElement(traceElement as Element)).toEqual({ + title: "Charizard", + image: "data:image/png;base64,abc", + imageAlt: "Charizard", + }); + }); + + it("accepts element-like click targets from iframe documents", () => { + const frame = document.createElement("iframe"); + document.body.appendChild(frame); + const frameDocument = frame.contentDocument as Document; + const card = frameDocument.createElement("div"); + card.innerHTML = ` +
WINNER
+ Charizard + `; + frameDocument.body.appendChild(card); + + const badge = card.querySelector(".winner-badge"); + expect(findVisualTraceElement(badge)).toBe(card); + + frame.remove(); + }); +}); diff --git a/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.ts b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.ts new file mode 100644 index 00000000000..d94bde9723e --- /dev/null +++ b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.ts @@ -0,0 +1,293 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { + VisualTrace, + VisualTraceMetric, + VisualTraceSelection, + VisualTraceStep, + VisualTraceStepKind, +} from "../../types/visual-trace.interface"; + +const TRACE_MESSAGE_TYPE = "texera-visual-trace"; +const TRACE_SELECTION_MESSAGE_TYPE = "texera-visual-trace-selection"; +const VALID_STEP_KINDS = new Set<VisualTraceStepKind>(["source", "match", "compute", "render"]); + +export interface VisualTraceGraphOperator { + operatorID: string; + operatorType: string; + customDisplayName?: string; +} + +export interface VisualTraceGraphReader { + hasOperator(operatorId: string): boolean; + getOperator(operatorId: string): VisualTraceGraphOperator; + getInputOperatorIds(operatorId: string): string[]; +} + +function isRecord(value: unknown): value is Record<string, unknown> { + return typeof value === "object" && value !== null; // any non-null object; callers type-check each field they read +} + +function isElementLike(value: EventTarget | null): value is Element { + return ( + typeof value === "object" && + value !== null && + "nodeType" in value && + value.nodeType === 1 && + "matches" in value && + typeof value.matches === "function" && + "querySelector" in value && + typeof value.querySelector === "function" + ); +} + +function parseMetric(value: unknown): VisualTraceMetric | undefined { + if (!isRecord(value) || typeof value.label !== "string" || typeof value.value !== "string") { + return undefined; + } + return { + label: value.label, + value: value.value, + }; +} + +function parseStep(value: unknown): VisualTraceStep | undefined { + if (!isRecord(value) || typeof value.title !== "string") { + return undefined; + } + + const kind: VisualTraceStepKind | undefined = + typeof value.kind === "string" && VALID_STEP_KINDS.has(value.kind as VisualTraceStepKind) + ? (value.kind as VisualTraceStepKind) + : undefined; + const metrics = Array.isArray(value.metrics) ? value.metrics.map(parseMetric).filter(Boolean) : undefined; + + return { + title: value.title, + detail: typeof value.detail === "string" ? 
value.detail : undefined, + operatorId: typeof value.operatorId === "string" ? value.operatorId : undefined, + operatorLabel: typeof value.operatorLabel === "string" ? value.operatorLabel : undefined, + image: typeof value.image === "string" ? value.image : undefined, + imageAlt: typeof value.imageAlt === "string" ? value.imageAlt : undefined, + kind, + metrics: metrics as VisualTraceMetric[] | undefined, + }; +} + +export function parseVisualTraceMessage(message: unknown): VisualTrace | undefined { + if (!isRecord(message) || message.type !== TRACE_MESSAGE_TYPE || !isRecord(message.payload)) { + return undefined; + } + + const payload = message.payload; + if (typeof payload.title !== "string" || !Array.isArray(payload.steps)) { + return undefined; + } + + const steps = payload.steps.map(parseStep); + if (steps.length === 0 || steps.some(step => step === undefined)) { + return undefined; + } + + return { + title: payload.title, + subtitle: typeof payload.subtitle === "string" ? payload.subtitle : undefined, + summary: typeof payload.summary === "string" ? payload.summary : undefined, + heroImage: typeof payload.heroImage === "string" ? payload.heroImage : undefined, + heroImageAlt: typeof payload.heroImageAlt === "string" ? payload.heroImageAlt : undefined, + heroMetric: parseMetric(payload.heroMetric), + steps: steps as VisualTraceStep[], + }; +} + +export function parseVisualTraceSelectionMessage(message: unknown): VisualTraceSelection | undefined { + if (!isRecord(message) || message.type !== TRACE_SELECTION_MESSAGE_TYPE || !isRecord(message.payload)) { + return undefined; + } + + const payload = message.payload; + const selection = { + title: typeof payload.title === "string" ? payload.title : undefined, + image: typeof payload.image === "string" ? payload.image : undefined, + imageAlt: typeof payload.imageAlt === "string" ? payload.imageAlt : undefined, + }; + + return selection.title || selection.image ? 
selection : undefined; +} + +export function buildStructuralVisualTrace( + selection: VisualTraceSelection, + targetOperatorId: string, + graph: VisualTraceGraphReader +): VisualTrace | undefined { + if (!graph.hasOperator(targetOperatorId)) { + return undefined; + } + + const visited = new Set(); + const operatorIds: string[] = []; + const visit = (operatorId: string): void => { + if (visited.has(operatorId) || !graph.hasOperator(operatorId)) { + return; + } + visited.add(operatorId); + graph.getInputOperatorIds(operatorId).forEach(visit); + operatorIds.push(operatorId); + }; + visit(targetOperatorId); + + const targetOperator = graph.getOperator(targetOperatorId); + const targetLabel = targetOperator.customDisplayName ?? targetOperator.operatorType; + const steps = operatorIds.map(operatorId => { + const operator = graph.getOperator(operatorId); + const operatorLabel = operator.customDisplayName ?? operator.operatorType; + const inputIds = graph.getInputOperatorIds(operatorId); + const kind: VisualTraceStepKind = + operatorId === targetOperatorId ? "render" : inputIds.length === 0 ? "source" : "compute"; + + return { + title: operatorLabel, + operatorId, + operatorLabel, + kind, + image: operatorId === targetOperatorId ? selection.image : undefined, + imageAlt: operatorId === targetOperatorId ? selection.imageAlt : undefined, + }; + }); + + return { + title: selection.title ?? "Selected result", + subtitle: `Workflow path to ${targetLabel}`, + summary: "Auto-built from the upstream workflow graph. 
Add a trace payload in the visualization for row-level details.", + heroImage: selection.image, + heroImageAlt: selection.imageAlt, + heroMetric: { + label: "Steps", + value: String(steps.length), + }, + steps, + }; +} + +export function parseVisualTracePayloadAttribute(value: string | null): VisualTrace | undefined { + if (!value) { + return undefined; + } + try { + return parseVisualTraceMessage({ + type: TRACE_MESSAGE_TYPE, + payload: JSON.parse(value), + }); + } catch { + return undefined; + } +} + +export function findVisualTraceElement(target: EventTarget | null): Element | undefined { + let element = isElementLike(target) ? target : undefined; + while (element && element !== document.body) { + if (element.hasAttribute("data-texera-trace") || element.matches("img") || element.querySelector("img")) { + return element; + } + element = element.parentElement ?? undefined; + } + return undefined; +} + +export function extractVisualTraceSelectionFromElement(element: Element): VisualTraceSelection | undefined { + const image = element.matches("img") ? element : element.querySelector("img"); + if (!image || image.tagName !== "IMG") { + return undefined; + } + const titleElement = element.querySelector("[data-texera-trace-title], .pokemon-name"); + const imageAlt = image.getAttribute("alt") ?? undefined; + const title = titleElement?.textContent?.trim() || imageAlt || undefined; + const selection = { + title, + image: image.getAttribute("src") ?? undefined, + imageAlt: imageAlt || title, + }; + return selection.title || selection.image ? 
selection : undefined; +} + +export function buildVisualTraceBridgeScript(): string { + return ` +(() => { + const TRACE_MESSAGE_TYPE = "texera-visual-trace"; + const TRACE_SELECTION_MESSAGE_TYPE = "texera-visual-trace-selection"; + const emitTrace = payload => window.parent.postMessage({ type: TRACE_MESSAGE_TYPE, payload }, "*"); + const emitSelection = payload => window.parent.postMessage({ type: TRACE_SELECTION_MESSAGE_TYPE, payload }, "*"); + const parseTrace = value => { + try { + return JSON.parse(value); + } catch { + return undefined; + } + }; + + const findFallbackElement = target => { + let element = target instanceof Element ? target : null; + while (element && element !== document.body) { + if (element.hasAttribute("data-texera-trace")) { + return element; + } + if (element.matches("img") || element.querySelector("img")) { + return element; + } + element = element.parentElement; + } + return null; + }; + + const buildFallbackSelection = element => { + const image = element.matches("img") ? 
element : element.querySelector("img"); + if (!image) { + return undefined; + } + const titleElement = element.querySelector("[data-texera-trace-title], .pokemon-name"); + const title = titleElement?.textContent?.trim() || image.getAttribute("alt") || undefined; + return { + title, + image: image.getAttribute("src") || undefined, + imageAlt: image.getAttribute("alt") || title, + }; + }; + + document.addEventListener("click", event => { + const element = findFallbackElement(event.target); + if (!element) { + return; + } + const payload = parseTrace(element.getAttribute("data-texera-trace")); + if (payload) { + emitTrace(payload); + return; + } + const selection = buildFallbackSelection(element); + if (selection) { + emitSelection(selection); + } + }); + + window.texera = window.texera || {}; + window.texera.showTrace = emitTrace; +})(); +`; +} diff --git a/frontend/src/app/workspace/types/visual-trace.interface.ts b/frontend/src/app/workspace/types/visual-trace.interface.ts new file mode 100644 index 00000000000..0af072ec34f --- /dev/null +++ b/frontend/src/app/workspace/types/visual-trace.interface.ts @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +export type VisualTraceStepKind = "source" | "match" | "compute" | "render"; + +export interface VisualTraceMetric { + label: string; + value: string; +} + +export interface VisualTraceStep { + title: string; + detail?: string; + operatorId?: string; + operatorLabel?: string; + image?: string; + imageAlt?: string; + kind?: VisualTraceStepKind; + metrics?: VisualTraceMetric[]; +} + +export interface VisualTrace { + title: string; + subtitle?: string; + summary?: string; + heroImage?: string; + heroImageAlt?: string; + heroMetric?: VisualTraceMetric; + steps: VisualTraceStep[]; +} + +export interface VisualTraceSelection { + title?: string; + image?: string; + imageAlt?: string; +} diff --git a/frontend/src/assets/operator_images/FileSplit.png b/frontend/src/assets/operator_images/FileSplit.png new file mode 100644 index 0000000000000000000000000000000000000000..f6e57404cbe41e3c92c35d4b7dc17458aeefeeeb GIT binary patch literal 1657 zcmeAS@N?(olHy`uVBq!ia0vp^2_VeD1|%QND7OGojKx9jP7LeL$-D$|0#YM9(|mmy zw18|51|~)!24;{FAY^FIWMBca85q=nGy?DdeyTSmVA(5FO z5hW46K32*3xq68y`AMmI6}bf<1q?P7RzPNMYDuC(MQ%=Bu~mhw5?F;5kPQ;nS5g2g zDap1~itr6kaLzAERWQ>t&@)i7<5EyiuqjGOvkG!?gK7uzY?U%fN(!v>^~=l4^~#O) z@{7{-4J|D#^$m>ljf`}GDs+o0^GXscbn}XpA%?)raY-#sF3Kz@$;{7F0GXSZlwVq6 ztE2?72o50bEXhnm*pycc^%l^B`XCv7Lp=k1xY)8}3Pc4?9ei;DviT9Cx#9%!~o#kkSG^S^r}nRN`WAfq@@F4o#Xp%<(-=5brix4QNH!E^ajtx+{5Q0pagFKP>(KT!NX*O@-4$$U%0UV?jW(v%x0uiSZd!f8{X+Q#{R{U4 z=C?oFb?r;n%-DlQC4rnd(*xxf)to)4dnH5q)v6NTk0le*TBK9>7t}8_b^JCpl|41r ziKAZl)#{wy>tVAN^n2Zztx)3^ZW-nqm$!(met{i-dTPG2zQa%dFt3Mv85vTiJ{9b; zs$9sP>m2*x{^MI)d$y)jTI?;E+WztE0rhY{)7YuCt=Z2jg4GoN+^n#hSu}BL1E1sr z=0~jZ0zs!lRqR;i7w+6WH?ne{fOytZCmT69Fv`-Ii+^R9ri~tu@@R= zmrUww;F-|(cEyy`hQJw@?y4{^Nw(HXl}r#>R$_jV;qlE$p(Sw_g=OOBpS+V9f9BEk zO+S|Zkute8|5NaS2T@O+YW-$2dNsYVf9VI--bj!ApUUPf{uJ}(?7!~F{R{^elpmJq zSG#m#p}X+xe9oov=VD7O8Z4RPL$hviOC&8AG(wYW2`|9<;J`O$gj;*&YO8+Z<}?Umnm%v4fwBlo4x+icwuyieSf z&8Dof0-lapf3me6W@%t 
z7k)p@e$^d$XZ}up-ct8k4%PGG8u`VC3?B+_QMs|P^1X?B{d%W2(Uz`D+k0CBAKCTA z`f?v{I~`CL-c;>==Ron>MbFF_majg`b7+OP=^@z*7fhynu#BDRqUO;5?8~L=vsmnn zO1%3%JpJjWT&(e9W@Gtg;Xiu4XO1_`m#i?iKKTBUMuE+5{WQ^pee+en&voiwnlW#G zzxKbz_Xl;>7Cine=Tj)V@h_|Rb;eJB9AiEzPp&i6t}DFlJ5jps%_J2i1ttVCC}sN3 YxNSlC#}C?h*FXiir>mdKI;Vst0HBhi7XSbN literal 0 HcmV?d00001 diff --git a/frontend/src/assets/operator_images/SmartFileScan.png b/frontend/src/assets/operator_images/SmartFileScan.png new file mode 100644 index 0000000000000000000000000000000000000000..b1266bd66561bd2c4c6029daa985f4a60ae1b396 GIT binary patch literal 6977 zcmds6cT|(xmOn`lP!R|ypj5+EP>Lc72!zCIC>OYZpcFwtiYQV9SUJ5N^Xz3HPR zf8K1}8S6CLJUv|%u(%#c0d66Q;Vx~nD|2D(hyD1VkN*!o9U=K{WhmY^cLGq#o<5Ru zQ8{NkCO-4T{Barzben>ogkZ8JO~AP=P+CMcqga_x@QNO@{lvwLK?IQ0x76$`-GolUx=x z)R-{sHDp_>$I`#x|6{YKfsKO{aVLgbfo5_p{)j%|jt=vVqmoJ*8J4rU#92#gOFrpc zKuei6{l1@^=7}Q9*1gX1l>2~LOGg)DS*li64ryb~RSTH`Ts4+L5gO@oxkR-fkL?*I z^fL%lu;VmIrK%*=^~yI*V=JD9z;GO2gjnf$D1nwgHJ?dZC~Pi2zL0k|^vS#c+W#oy zuuF?5V(UH7OEJ1WD9tQKMdoYhEqJ+a_R75H(BZ?4rJuFu2N}^8lztCiX zqFvDR9C32m`H>u|`zY|n%lWq&lEWfrn)ywzP<~=8OkWwPEQaXF{*FuO-G*)%TwI#^ z?3^5GTswjIB#zh`117D5G-`2)TyCMY?7O+CCH*=V=*d+$UQ>UPZ1A-Lg-kKyv*JT$ zkW}(ne$j#omdO`VHxln&``|WL(%>gx%lajG-La z3A*<~>I6VfH#wwDufs9fNhnraJiVPZ5)KeR`( zjYAt+KVPJaC^Ay<=2q?)K)sPzY{wn$t`wv$%X%UwC3zAb=wi<+S<&)5(vb6%51+ko zh9zHJzmK~b3{TP37!dpREWSsB?I~XQGo!ZQ&Q1&Z;Pu}@onr& z_R`t5B`;frYG@{oO2u)573af8$Z1=y^m-$|maI<||3ZstgS(u@_L;W9Fw6FvqXL-W zZ~C<8=>D)AfA8s+s;*DPH4bSFEk`c8I86`IH#|~6D&PLZkLR2_e!rq?u5H2lK#N<+ zVd*7zbeUXudx7CDjKMwQ@cnVit^vV$T{;r&qYmaC4hAR4xhqbK=K@zXH+7g{C=ZoS zfaf>5gr8>}*naCzEk$8trC`>%cfMI(XpQ7CjqPC)ts(KLell@Niu1XdljdDF$}Jwr ziQ4O-@o4Yw-~~~(wmLIty{3N~Iz8sC!RH&9=VYXD=mL4l?`VY3E46JPrS?X8?36$T zl>jd1bs%nr%Hp>5h~T5Bfd9ldqzk2oY2(iZLOL+%{m7hV1b=$;290x@LioSV`#q-_ zHn`dCLl7FU{@=BS51co&M^Z2scTvJc6&0--*YeSJon7t_b zH_)xzG7?izt2M>B5OoQS_!$T6vl_h?2?RU4@eqlL(i|dIq$nr`tR4Y*AIB~r)24>a zxV~$OniD)hSq#uh0rt^(aZ{kg`)H3a*tHwkYmRi0)jgFr7NT`=22ccDN(CuFQgKa7 
zm~K{GT6b^g$Xd!F5i4N~rBwlR?_3Lij&a2b{G1D`m+#n&p_~>6QwfD536ZmlxQu85 zsD@7Ni3@L)WO?-UJ8v$MfGV)Uj+3)jLeRyD1FLO4&R)R*0}6GqP#IeMKCe~%%%E3^ z7FwTJQ+F1P@J;}erZtTI$ntAF!261ziy~y4YY$5q;|51R7IQ?rba22VHkkkg~-dve^I^c^S`Fv;&fEiw{reVnqeE7@*~nJQL-^-7K^f zUB}|-p8chZGUh9iK@=S^s}Lw>@7w=`0PMtp^z*=f7&7N8N&vkXyME0n6^fx~Z=`G? z2K*8l?nF{W0rocT4D{860FW61*B&nPKv!dp_!3gUWK{ph#2Iqb?_!9tT>zL`@Fl<& z1<#jo695Ga0pBMGK!GS?Y$pJr(r7DTB%oZNQt}J$69EN7TDEM zW2x$(0ABvE_=UQ~f=W%7_#MLe+mX{YDdx zv~+QOMLa)rp8KIWu28+H-j$J?vBU@JP6YpXe3*tiMZ<9du`K1Q{yj%0tZ8++4siKm zVgOkE4*NV)6BwwC8v#(>i2z{NZvbRmfs5ot?%!Ccs}qHUSfjd%g_|&6Z6yDZxz<(L>VO7M(UUqt_0|3@eM&m!kCLMG@KcJ^40;)pGREbq6}i z`yR;HA9;9?>W5`JeQ4eHNN&UJ`zg-U=JZJQp?l{a%Uwl^gRM=Uap6O+<<&2=?rXS1 zUc1xjX61sU{2n^iwZM$)IcDzKkjtvc<-YxMDih-?v zZBBCu%Q?MJeI$erT~^l)IHVZVdldA;jGt}!L*psoC$SmcIX zK>11pDHvjk|5FWC&xVHoW%?_pnx`~My6nrFsXHa4QH{iYj53?4JfAbS`PFWqe0@!r z?bT^yY)wO;GPMrKM2l2wv!r0dm9f8^<`<_}%Zz#wFO&lp`!rst`4){XtKEyfxkVIg zeGexZ)~3nCh7659R~q}-1|#Vwdw8_oUMmlln0=wN5`+2OGF;gr%|O1{*+4d@&57D) zq#_|PkZXK7O}aq1aoJvNai$L~utfK6O;1CEt=(!Qo{J&ba;S`(s<5p7r5Lq>Em%kX zcO#Y4vc!rx+GpQxe0%ZGZbnwI8wp!Z>$0ZtNn?QWgYI?Ls&Gd5 zhl8-Devb*%4SPN=@$gWEsOP>9O@xSpi*p0p%FsBYJ-f|dfXoBbBNs(-WY_6BTcQ3`+9J?tm1;+h!Br8aO%-`l(DZVdNH=QVC zuMHD!u6hV_Cz?W<$(Ruqj%wtf=;)(S1adD z&s-O_8~K`FX$VkRy2oQKSIpixriZk~N8Ywp9$|f4zB{m_J7{`I+^sCMeTH-@!}QEM z+f6xI*6|^8bn(yewy-dpVBp3jC9*!hv!OLf*yg%x9B_({Sv@!X<+Nnx)-;e&1T*4V zDPIpx6+_ng*V5PjPC6H~{wLB&xDGNd@^~UAjeVX+QY__;va5~3Ji>vzqfcIUpRPRb z`6Xns+JA(AWt~g7vI}e)>rCFvjQ8VqyiQk}=)6$atSv|GCZ*KCWHjx|k!I%Sq{qEn z`9)w5DSsm!F<6QJAf0rz!b7#9mYf+&Yub^GFlpcJ+O;?DJ59o}q&g&d#djyCJxA`c zngzrStjAO$-=xZ180>a3M4hAz`yLQmNolP-CNV0#3uVt7pfDK#DZE2swp`S&$J|&a z8R(3{CPr4Su}d7}yLw0hkf9>V{My?(6^j~qH1iMEshLmnEd3kn7!eM`b>}SUrNH^6 zmSk7Ep2XjP+cT{bKJGfg@FcZh_t89ozr*qRh(jp#NoA z#CH?|pE*5Hsu;QOHZyeMPYYT}S=GYhRHmb`@XN1vYjPjUf$piM7-IOUxO^;JZ8I~; zvN=~%(zX0!EKC98qf6_xXGb**Y~vtVHoRtQJ1*qR!FmX489L<;zq!po`8TtF^*i$T zpI8uh;nm039{RZ^ZywYoe?h&0|M*^q>q2&J4-IzcmG`*8nq!$1Wy2We9IBPtm#!@g>c<%HyMr}b%h3%%u}x)g4A 
zDJS2-Xv%)3u90h9iDk(c@<{*h9MqvS6RdQj_LA24xTFoOo^syv?wa1wyK6S>$J}9^ zhpu$>$gZ26D;|tVvPgw5Fx2k$(zOMB*u*J7tU&fWyCc8T?dDrpw2#?q0S;-#iH0(5 zB{~l#Ze^$CJb~f;u`1o#sd%&Z>iIc@`gs{?k~3lY1{E7$c!PbC5f*C-@u<%C=4!gC z#>zPN$#MEBJ&y)KIWBpM&o+2r^AO7EwuisR46gl65P+{i{7&~oVhnX%MMLHmy>e7h zzVfF=w;42|ctYgzp2~0EGCLLbgOiHl(U=lug~9Fkil^f87TU-BVIP_ZW}Z`O>EXFK#ry-ZhM^ zm4!x(z!ceZAFly6h#i?b~hA<^l z47_1K0&cg|im{r0^cwF)-$^Q%ocoeGaq2;K+sHU|*_^PLT$O4#(z_f(z4yN4o>!{~ z*>gfsa%ghp2YhRS$;!+;**Q6qLE2h>uVktI?iBRWS#(x)ZITjB6fH6lrmwH7PxqCP zUZXoS99MGn@-AmqPt#lrCdeK~dYwxNv+MPJ1vM3soUV|c#H}uIFQ+mCp4E&e_T!qO z{X%yegpX)4Z^_c*n3>52=H(C164n*k;W5K}?y6bqM2T_M9paNne||vuvzRnm_JXuV z8-Am0AigxT>^{96n@W82tV_`q9upjxSG_;CL$60i!qRZm=IF=dr?Nh&CfKiMFG=#+ zYfS*WV@nsb8$;S6_SOW<+Mv&Dbm@SxT^)9xI^|!&s$|*~vS;tc0q&ZB+_H9Gwj+Bs zp4vLJoGFV|M^>2w&=E2duAG+6E|cPhkC>sE*%$m!*DJG>S~`;lK;cmUZaOWR98}z4 zgVxwq{NXZ1eXyb?@oYn|d=K>HaiD_?5 zi_wp)syE|A4o^<^)ogjbL6Ab|U&xZD=`+y9KK|hY&DQOx{gd-8A%dz(j^6@QEy1Ht zxm7_7dJvrG{oa+Jp5=x0)pWNoq{3ZQ@>Is0E})#~N^zsdQ?Gg$lv(-3M-TS3WZJqPjM{| zICIy4iGm|>mD@^l;0N|6vMkhlgAbps_L4p4C^7|E>#9FzhB;G9h)(9O%Px{T}6EOGRf{V4Dx!6>q2 zF0m8^l7z-&CAmH~fpURL?yZoLxIF@h$%_7u-7%e-FgcUfKJybwnBih-=(LX+K;J<# zU$-vnm8!xGK!-S-u7viWN-D|MEIBrGZ5~euKY-c&YCUV8(I}nv_Ob5F^7!N^bAoj6 z0NAAjv|byv2`{V3QzAt{$~2=C6=x|M{_SZ#C5f21${Yz{`RD-CmOEfgtzE8mcA^Br z2St$+18m#xPF#g_xChVJ2XnV2{sUIl;b78AXCq^}ac(g}0hs;)emX4Px+AGb8x#I% z=_&Hyq`DBunBc=BR}SyyugM*4H&|YVgv2kxlZ!a8+LlETe`j!Mez@L7pjH~a zl0oN${MyI^JZ&T8%Dg^Fe{ZTjO9mv+fXS0PK{eZix!^YiAa-d=9^}3d6#Z{od@FTS zMCDE^-S|#aLNZWJlC%~9MOQXwii?5l+Yy>9KH$gH+_MW|h1opQmZUfw56hb{l>BmC zGlsy2kE!~ZL$a6%<6I;JTDKz;PUc`2$lO6Q@~eqJ#*z2tAmIbkjZ~9n%+W(@q!J}E zuIrYjCZ}_rEi-G&H+#7{Fc2mqH|V(PoDwVc8|r!#HPn)D{ZZP+-jZ>}VNZ0Qw%(gK p?)OG+)si_Q>fi4FIptTlBFD~rMk#BR%k!N3!_b0I@Vm=j{{hmAuu%X2 literal 0 HcmV?d00001