diff --git a/agent-service/src/agent/prompts.test.ts b/agent-service/src/agent/prompts.test.ts new file mode 100644 index 00000000000..b036c76eaf0 --- /dev/null +++ b/agent-service/src/agent/prompts.test.ts @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { describe, expect, test } from "bun:test"; +import { buildSystemPrompt } from "./prompts"; +import { WorkflowSystemMetadata } from "./util/workflow-system-metadata"; + +describe("buildSystemPrompt", () => { + test("includes both operator type and display name", () => { + const metadata = new WorkflowSystemMetadata(); + metadata.loadFromMetadata({ + operators: [ + { + operatorType: "SmartFileScan", + operatorVersion: "1", + jsonSchema: { properties: { fileName: { type: "string" } }, required: ["fileName"] }, + additionalMetadata: { + userFriendlyName: "Smart Source", + operatorGroupName: "Data Input", + operatorDescription: "Auto-detects files and folders.", + inputPorts: [], + outputPorts: [{}], + }, + }, + ], + groups: [], + }); + + const prompt = buildSystemPrompt(metadata, ["SmartFileScan"]); + + expect(prompt).toContain("## SmartFileScan"); + expect(prompt).toContain("Display name: Smart Source"); + expect(prompt).toContain("Description: Auto-detects files and folders."); + }); +}); diff --git a/agent-service/src/agent/prompts.ts b/agent-service/src/agent/prompts.ts index 064eed2e3e5..ca3b542c463 100644 --- a/agent-service/src/agent/prompts.ts +++ b/agent-service/src/agent/prompts.ts @@ -268,10 +268,12 @@ function buildAllowedOperatorSchemas( for (const operatorType of operatorTypes) { const compactSchema = metadataStore.getCompactSchema(operatorType); const description = metadataStore.getDescription(operatorType); + const displayName = metadataStore.getAdditionalMetadata(operatorType)?.userFriendlyName; if (compactSchema) { schemas.push( `## ${operatorType}\n` + + (displayName ? `Display name: ${displayName}\n` : "") + (description ? `Description: ${description}\n` : "") + `Schema:\n\`\`\`json\n${JSON.stringify(compactSchema, null, 2)}\n\`\`\`` ); diff --git a/agent-service/src/types/agent.test.ts b/agent-service/src/types/agent.test.ts new file mode 100644 index 00000000000..abc4e73acf2 --- /dev/null +++ b/agent-service/src/types/agent.test.ts @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { describe, expect, test } from "bun:test"; +import { DEFAULT_AGENT_SETTINGS } from "./agent"; + +describe("DEFAULT_AGENT_SETTINGS", () => { + test("allows the smart source operator by default", () => { + expect(DEFAULT_AGENT_SETTINGS.allowedOperatorTypes).toContain("SmartFileScan"); + }); +}); diff --git a/agent-service/src/types/agent.ts b/agent-service/src/types/agent.ts index 765f5a7cb46..74cb6230c16 100644 --- a/agent-service/src/types/agent.ts +++ b/agent-service/src/types/agent.ts @@ -87,6 +87,7 @@ export const DEFAULT_AGENT_SETTINGS: Omit = { executionTimeoutMs: 240000, maxSteps: 100, allowedOperatorTypes: [ + "SmartFileScan", "CSVFileScan", "Filter", "Projection", diff --git a/amber/build.sbt b/amber/build.sbt index 1f363e73e91..0c31509d73e 100644 --- a/amber/build.sbt +++ b/amber/build.sbt @@ -197,6 +197,10 @@ libraryDependencies ++= Seq( // For ScalaPB 0.11.x: libraryDependencies += "com.thesamet.scalapb" %% "scalapb-json4s" % "0.12.0" +// Used by LLMSourceResource to extract text from PDF samples before prompting the LLM. +// Without this the LLM only sees raw PDF bytes and produces generic catch-all schemas. +libraryDependencies += "org.apache.pdfbox" % "pdfbox" % "3.0.3" + // enable protobuf compilation in Test Test / PB.protoSources += PB.externalSourcePath.value diff --git a/amber/requirements.txt b/amber/requirements.txt index 726310934dd..3e67e32cc25 100644 --- a/amber/requirements.txt +++ b/amber/requirements.txt @@ -48,3 +48,12 @@ SQLAlchemy==2.0.37 pg8000==1.31.5 pympler==1.1 boto3==1.40.53 +# Libraries the LLM File Source operator's generated parsers commonly reach for. +# Without these the worker fails with "No module named 'pdfplumber'" the moment +# the LLM emits PDF / HTML / Excel-handling code. +pdfplumber==0.11.9 +pypdf==6.11.0 +openpyxl==3.1.5 +lxml==5.3.0 +beautifulsoup4==4.13.4 +chardet==5.2.0 diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala index 4ff5ff15ae3..3842777bc59 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/pythonworker/PythonWorkflowWorker.scala @@ -41,9 +41,10 @@ import org.apache.texera.amber.engine.common.ambermessage._ import org.apache.texera.amber.engine.common.{CheckpointState, Utils} import org.apache.texera.amber.config.PythonUtils -import java.nio.file.Path +import java.io.{FileOutputStream, PrintStream} +import java.nio.file.{Files, Path, Paths} import java.util.concurrent.{ExecutorService, Executors} -import scala.sys.process.{BasicIO, Process} +import scala.sys.process.{Process, ProcessLogger} object PythonWorkflowWorker { def props(workerConfig: WorkerConfig): Props = Props(new PythonWorkflowWorker(workerConfig)) @@ -171,6 +172,16 @@ class PythonWorkflowWorker( // Set the Iceberg related arguments based on the catalog type. 
val isPostgres = StorageConfig.icebergCatalogType == "postgres" val isRest = StorageConfig.icebergCatalogType == "rest" + // Redirect the Python subprocess's stdout/stderr to a per-worker log file so Python-side + // exceptions (e.g., from user UDFs or generated LLM parsers) are recoverable. Previously + // these went to the JVM's own stdout/stderr, which deploy-daemon.sh redirects to /dev/null, + // making any Python crash invisible from disk. + val workerLogPath = pythonWorkerLogPath(workerConfig.workerId.name) + val workerLog = new PrintStream(new FileOutputStream(workerLogPath.toFile, true), true) + val logger = ProcessLogger( + line => workerLog.println(line), + line => workerLog.println(line) + ) pythonServerProcess = Process( Seq( PythonUtils.getPythonExecutable, @@ -194,7 +205,16 @@ class PythonWorkflowWorker( StorageConfig.s3Username, StorageConfig.s3Password ) - ).run(BasicIO.standard(false)) + ).run(logger) + } + + /** Choose a stable on-disk path for this worker's stdout/stderr capture. */ + private def pythonWorkerLogPath(workerId: String): Path = { + val logsDir = Paths.get("logs", "python-workers") + Files.createDirectories(logsDir) + // Sanitize worker IDs (they contain '/' separators that would create subdirs). + val safe = workerId.replace('/', '_').replace(':', '_') + logsDir.resolve(s"$safe.log") } override def loadFromCheckpoint(chkpt: CheckpointState): Unit = ??? diff --git a/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala b/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala index 98b7c68c974..a9f72a3036d 100644 --- a/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala +++ b/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala @@ -130,6 +130,8 @@ class TexeraWebApplication environment.servlets.setSessionHandler(new SessionHandler) environment.jersey.register(classOf[SystemMetadataResource]) + environment.jersey.register(classOf[SmartFileInferenceResource]) + environment.jersey.register(classOf[LLMSourceResource]) // environment.jersey().register(classOf[MockKillWorkerResource]) environment.jersey.register(classOf[HealthCheckResource]) diff --git a/amber/src/main/scala/org/apache/texera/web/resource/LLMSourceResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/LLMSourceResource.scala new file mode 100644 index 00000000000..46e3831b71e --- /dev/null +++ b/amber/src/main/scala/org/apache/texera/web/resource/LLMSourceResource.scala @@ -0,0 +1,666 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.web.resource + +import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty} +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.typesafe.scalalogging.LazyLogging +import kong.unirest.Unirest +import org.apache.texera.amber.core.storage.{DocumentFactory, FileResolver} +import org.apache.texera.amber.config.PythonUtils +import org.apache.texera.amber.operator.source.scan.FolderInputResolver +import org.apache.texera.config.LLMConfig +import play.api.libs.json.{JsObject, Json} + +import java.net.URI +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path} +import java.security.MessageDigest +import java.time.Instant +import javax.annotation.security.RolesAllowed +import javax.ws.rs.core.MediaType +import javax.ws.rs.{Consumes, POST, Path => WsPath, Produces, WebApplicationException} +import scala.jdk.CollectionConverters._ +import scala.sys.process._ +import scala.util.Try + +@JsonIgnoreProperties(ignoreUnknown = true) +case class LLMSourceGenerateRequest( + @JsonProperty("fileName") fileName: String, + @JsonProperty("userHint") userHint: Option[String] = None, + @JsonProperty("llmModel") llmModel: Option[String] = None, + // Iterative repair contract — accepted from day one even though MVP UI doesn't use it. + @JsonProperty("previousCode") previousCode: Option[String] = None, + @JsonProperty("previousError") previousError: Option[String] = None +) + +case class LLMSourceColumn(name: String, `type`: String) + +case class LLMSourceTable( + name: String, + description: String, + columns: java.util.List[LLMSourceColumn] +) + +case class LLMSourceGenerateResponse( + generatedCode: String, + tables: java.util.List[LLMSourceTable], + unionColumns: java.util.List[LLMSourceColumn], + llmModel: String, + sampleHash: String, + generatedAt: String, + warnings: java.util.List[String] +) + +@WsPath("/llm-source") +@RolesAllowed(Array("REGULAR", "ADMIN")) +@Consumes(Array(MediaType.APPLICATION_JSON)) +@Produces(Array(MediaType.APPLICATION_JSON)) +class LLMSourceResource extends LazyLogging { + + private val objectMapper = new ObjectMapper() + objectMapper.registerModule(DefaultScalaModule) + + // Must match a model name registered in the LiteLLM proxy's litellm-config.yaml. + // See bin/litellm-config.yaml for the registered list. + private val DefaultModel = "claude-haiku-4.5" + private val SampleByteCount = 64 * 1024 + private val BinarySampleByteCount = 2 * 1024 + // 2000 tokens covers a parser + a handful of table schemas. We measured Claude Haiku 4.5 at + // ~50 tokens/sec end-to-end through the LiteLLM proxy, so this caps the LLM call at ~40s. + private val MaxTokens = 2000 + private val ConnectTimeoutMs = 10_000 + private val SocketTimeoutMs = 90_000 + + // After getting code from the LLM, dry-run it against the real file. If it raises, feed the + // traceback back into the LLM and ask for a fix. Two retries (so up to three LLM calls + // total) keeps the worst-case latency reasonable while letting us self-heal most flaky outputs. + private val MaxDryRunRetries = 2 + private val DryRunSampleRows = 5 + + // Probing for a working python binary spawns 1 process per candidate; cache the result so we + // pay that cost at most once per backend lifetime instead of on every Generate request. 
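+  // None = not probed yet; Some(None) = probed and nothing usable found; Some(Some(bin)) = probed, bin works.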
+ @volatile private var cachedPythonBinary: Option[Option[String]] = None + + @POST + @WsPath("/generate") + def generate(request: LLMSourceGenerateRequest): LLMSourceGenerateResponse = { + if (request.fileName == null || request.fileName.trim.isEmpty) { + throw new WebApplicationException("fileName is required", 400) + } + + val tOverall = System.currentTimeMillis() + val uri = FileResolver.resolve(request.fileName) + val resolved = FolderInputResolver.resolve(uri) + val sampleSource = resolved.files.headOption + .map(_.uri) + .getOrElse(uri) + val displayName = resolved.files.headOption.map(_.displayName).getOrElse(request.fileName) + val extension = extractExtension(displayName) + + val tSample = System.currentTimeMillis() + val (samplePreview, sampleBytes) = buildSample(sampleSource, extension) + val sampleHash = sha256Hex(sampleBytes) + val sampleMs = System.currentTimeMillis() - tSample + + // Resolve once here so the dry-run sees the same local path the workflow will use at run time. + // For folder inputs, this must stay a folder path rather than degrading to the sampled first file. + val resolvedLocalFilePath = Try(resolveRuntimeInputPath(uri).toAbsolutePath.toString) + .getOrElse(request.fileName) + + val model = request.llmModel.filter(_.nonEmpty).getOrElse(DefaultModel) + val warnings = new java.util.ArrayList[String]() + + // ---- LLM call with dry-run retry loop --------------------------------- + var attempt = 0 + var lastCode: String = "" + var lastTables: List[ParsedTable] = Nil + var lastRawResponse: String = "" + var lastSyntaxError: String = "" + var lastDryRunError: String = "" + var lastDryRunRowsSeen: Int = 0 + var llmMsTotal: Long = 0 + var validateMsTotal: Long = 0 + var success = false + + while (!success && attempt <= MaxDryRunRetries) { + val previousCode = if (attempt == 0) request.previousCode.getOrElse("") else lastCode + val previousError = if (attempt == 0) request.previousError.getOrElse("") + else if (lastDryRunError.nonEmpty) lastDryRunError + else lastSyntaxError + + val prompt = buildPrompt( + displayName = displayName, + extension = extension, + isFolder = resolved.isFolder, + fileCount = resolved.files.size, + userHint = request.userHint.getOrElse(""), + samplePreview = samplePreview, + previousCode = previousCode, + previousError = previousError + ) + + val tLlm = System.currentTimeMillis() + val rawLlmContent = callLLM(model, prompt) + llmMsTotal += System.currentTimeMillis() - tLlm + lastRawResponse = rawLlmContent + + val parsed = parseLLMResponse(rawLlmContent) + lastCode = parsed.code + lastTables = parsed.tables + + val tValidate = System.currentTimeMillis() + val syntaxResult = validatePython(lastCode) + lastSyntaxError = syntaxResult.getOrElse("") + validateMsTotal += System.currentTimeMillis() - tValidate + + if (syntaxResult.isDefined) { + logger.warn(s"generate attempt $attempt failed syntax check: ${syntaxResult.get.take(400)}") + attempt += 1 + } else { + val dryRun = dryRunGeneratedCode(lastCode, resolvedLocalFilePath) + if (dryRun.ok) { + lastDryRunError = "" + lastDryRunRowsSeen = dryRun.rowsSeen + success = true + logger.info(s"generate attempt $attempt dry-run OK: rowsSeen=${dryRun.rowsSeen}") + } else { + lastDryRunError = dryRun.error.getOrElse("dry-run failed with no error message") + logger.warn(s"generate attempt $attempt dry-run failed: ${lastDryRunError.take(400)}") + attempt += 1 + } + } + } + + if (!success) { + if (lastSyntaxError.nonEmpty) warnings.add(s"Generated code did not parse: $lastSyntaxError") + if 
(lastDryRunError.nonEmpty) warnings.add(s"Generated code failed dry-run: $lastDryRunError")
+      warnings.add(
+        s"Exhausted ${MaxDryRunRetries + 1} attempts. Returning the last code anyway so you can hand-edit it."
+      )
+    }
+
+    val totalMs = System.currentTimeMillis() - tOverall
+    logger.info(
+      s"generate: total=${totalMs}ms (sample=${sampleMs}ms, llm=${llmMsTotal}ms across ${attempt + (if (success) 1 else 0)} attempts, " +
+        s"validate=${validateMsTotal}ms, model=$model, response_chars_last=${lastRawResponse.length}, " +
+        s"success=$success, rowsSeen=$lastDryRunRowsSeen)"
+    )
+
+    val tableList = new java.util.ArrayList[LLMSourceTable]()
+    val seenColumns = scala.collection.mutable.LinkedHashMap[String, String]()
+    seenColumns.put("__table__", "string")
+    lastTables.foreach { t =>
+      val cols = new java.util.ArrayList[LLMSourceColumn]()
+      t.columns.foreach { c =>
+        cols.add(LLMSourceColumn(c.name, c.`type`))
+        if (!seenColumns.contains(c.name)) {
+          seenColumns.put(c.name, c.`type`)
+        } else if (seenColumns(c.name) != c.`type`) {
+          warnings.add(
+            s"Column '${c.name}' appears with different types across tables (${seenColumns(c.name)} vs ${c.`type`}); using ${seenColumns(c.name)} in the union schema."
+          )
+        }
+      }
+      tableList.add(LLMSourceTable(t.name, t.description, cols))
+    }
+    val union = seenColumns.toList.map { case (n, t) => LLMSourceColumn(n, t) }.asJava
+
+    LLMSourceGenerateResponse(
+      generatedCode = lastCode,
+      tables = tableList,
+      unionColumns = union,
+      llmModel = model,
+      sampleHash = sampleHash,
+      generatedAt = Instant.now().toString,
+      warnings = warnings
+    )
+  }
+
+  // ---------------------------------------------------------------------------
+  // Dry-run of the generated UDFSourceOperator against the real file.
+  // ---------------------------------------------------------------------------
+
+  private case class DryRunResult(ok: Boolean, rowsSeen: Int, samples: Seq[String], error: Option[String])
+
+  private[resource] def resolveRuntimeInputPath(uri: URI): Path =
+    FolderInputResolver.materializeToLocalPath(uri)
+
+  private val pythonSrcPath: String = {
+    val candidates = Seq(
+      "amber/src/main/python",
+      "../amber/src/main/python",
+      "../../amber/src/main/python"
+    )
+    candidates.find(p => Files.exists(java.nio.file.Paths.get(p))).getOrElse(candidates.head)
+  }
+
+  private val dryRunHarnessPath: String = {
+    val candidates = Seq(
+      "amber/src/main/scala/org/apache/texera/web/resource/llm_source_dry_run.py",
+      "../amber/src/main/scala/org/apache/texera/web/resource/llm_source_dry_run.py"
+    )
+    candidates.find(p => Files.exists(java.nio.file.Paths.get(p))).getOrElse(candidates.head)
+  }
+
+  /**
+   * Substitute the file URI token into the code, write it to a temp file, and invoke the dry-run
+   * harness which exec()s the code, instantiates GenerateOperator(), and iterates produce() for
+   * a few rows. The harness reports OK + sample rows or a Python traceback as JSON.
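+   * The harness itself always exits 0; a non-zero exit means the subprocess failed to launch,
+   * while errors raised by the generated code come back as ok=false JSON.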
+ */ + private def dryRunGeneratedCode(code: String, resolvedFileUri: String): DryRunResult = { + val pythonBinary = findPythonBinary().getOrElse { + return DryRunResult(ok = true, rowsSeen = 0, samples = Seq.empty, error = None) + } + val substituted = code.replace("__TEXERA_FILE_URI__", resolvedFileUri) + val tmpFile = Files.createTempFile("texera-llm-source-dryrun-", ".py") + try { + Files.write(tmpFile, substituted.getBytes(StandardCharsets.UTF_8)) + val cmd = Seq( + pythonBinary, + dryRunHarnessPath, + tmpFile.toAbsolutePath.toString, + pythonSrcPath, + DryRunSampleRows.toString + ) + val stdout = new StringBuilder + val stderr = new StringBuilder + val procLogger = ProcessLogger( + line => { stdout.append(line); stdout.append('\n') }, + line => { stderr.append(line); stderr.append('\n') } + ) + val exit = Try(cmd.!(procLogger)).getOrElse(-1) + val rawOut = stdout.toString.trim + if (exit != 0 && rawOut.isEmpty) { + return DryRunResult( + ok = false, + rowsSeen = 0, + samples = Seq.empty, + error = Some(s"dry-run subprocess exited $exit. stderr: ${stderr.toString.trim.take(800)}") + ) + } + // The harness prints a single JSON object on stdout. + val lastJsonLine = rawOut + .split('\n') + .reverseIterator + .find(l => l.startsWith("{")) + .getOrElse(rawOut) + try { + val node = Json.parse(lastJsonLine) + val ok = (node \ "ok").asOpt[Boolean].getOrElse(false) + val rowsSeen = (node \ "rowsSeen").asOpt[Int].getOrElse(0) + val errMsg = (node \ "error").asOpt[String] + val tb = (node \ "traceback").asOpt[String] + val samples = (node \ "samples").asOpt[Seq[JsObject]].map(_.map(_.toString())).getOrElse(Seq.empty) + val combinedError = (errMsg, tb) match { + case (Some(e), Some(t)) => Some(s"$e\n$t".take(4000)) + case (Some(e), None) => Some(e) + case _ => None + } + DryRunResult(ok = ok, rowsSeen = rowsSeen, samples = samples, error = combinedError) + } catch { + case _: Throwable => + DryRunResult( + ok = false, + rowsSeen = 0, + samples = Seq.empty, + error = Some(s"dry-run output was not valid JSON: ${rawOut.take(800)}") + ) + } + } finally { + Try(Files.deleteIfExists(tmpFile)) + } + } + + // --------------------------------------------------------------------------- + // Sample reading + // --------------------------------------------------------------------------- + + private case class SamplePreview(text: String, hex: String, kind: String) + + private def buildSample(uri: URI, extension: String): (SamplePreview, Array[Byte]) = { + val ext = extension.toLowerCase + val isPdf = ext == "pdf" + val textish = Set("csv", "tsv", "json", "jsonl", "ndjson", "log", "txt", "html", + "htm", "xml", "yaml", "yml", "md") + val isText = textish.contains(ext) + + if (isPdf) { + val text = Try(extractPdfText(uri, maxPages = 2)).toOption.getOrElse("") + // For hash + audit, capture the first chunk of raw bytes. 
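+      // (sampleHash in the response is computed over these raw bytes, not the extracted text.)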
+ val bytes = Try(readBytes(uri, BinarySampleByteCount)).getOrElse(Array.emptyByteArray) + val preview = SamplePreview( + text = if (text.nonEmpty) text else lossyDecode(bytes), + hex = hexPreview(bytes), + kind = "pdf" + ) + (preview, bytes) + } else if (isText) { + val bytes = readBytes(uri, SampleByteCount) + (SamplePreview(text = lossyDecode(bytes), hex = "", kind = "text"), bytes) + } else { + val bytes = readBytes(uri, BinarySampleByteCount) + (SamplePreview(text = lossyDecode(bytes), hex = hexPreview(bytes), kind = "binary"), bytes) + } + } + + private def readBytes(uri: URI, maxBytes: Int): Array[Byte] = { + val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream() + try { + val buffer = new Array[Byte](maxBytes) + var totalRead = 0 + var lastRead = 0 + while (totalRead < buffer.length && { + lastRead = stream.read(buffer, totalRead, buffer.length - totalRead) + lastRead + } > 0) { + totalRead += lastRead + } + if (totalRead == buffer.length) buffer else buffer.take(totalRead) + } finally stream.close() + } + + /** Extract text from the first N pages of a PDF using Apache PDFBox 3.x. */ + private def extractPdfText(uri: URI, maxPages: Int): String = { + try { + val file = DocumentFactory.openReadonlyDocument(uri).asFile() + val doc = org.apache.pdfbox.Loader.loadPDF(file) + try { + val stripper = new org.apache.pdfbox.text.PDFTextStripper() + stripper.setStartPage(1) + stripper.setEndPage(math.min(maxPages, doc.getNumberOfPages)) + stripper.getText(doc) + } finally doc.close() + } catch { + case _: Throwable => "" + } + } + + private def lossyDecode(bytes: Array[Byte]): String = + new String(bytes, StandardCharsets.UTF_8) + + private def hexPreview(bytes: Array[Byte]): String = { + val limit = math.min(bytes.length, 256) + bytes.take(limit).map(b => f"${b & 0xff}%02x").mkString(" ") + } + + private def sha256Hex(bytes: Array[Byte]): String = { + val digest = MessageDigest.getInstance("SHA-256") + digest.digest(bytes).map(b => f"${b & 0xff}%02x").mkString + } + + private def extractExtension(name: String): String = { + val idx = name.lastIndexOf('.') + if (idx < 0 || idx == name.length - 1) "" else name.substring(idx + 1) + } + + // --------------------------------------------------------------------------- + // Prompt + LLM + // --------------------------------------------------------------------------- + + private def buildPrompt( + displayName: String, + extension: String, + isFolder: Boolean, + fileCount: Int, + userHint: String, + samplePreview: SamplePreview, + previousCode: String, + previousError: String + ): String = { + val sb = new StringBuilder + sb.append("You generate Python source-operator code for the Texera dataflow system.\n\n") + sb.append("OUTPUT FORMAT — return ONLY a JSON object with this exact shape:\n") + sb.append("{\n") + sb.append(" \"code\": \"\",\n") + sb.append(" \"tables\": [\n") + sb.append(" {\"name\": \"\",\n") + sb.append(" \"description\": \"\",\n") + sb.append(" \"columns\": [{\"name\": \"\", \"type\": \"\"}, ...]}\n") + sb.append(" ]\n") + sb.append("}\n\n") + sb.append("Permitted column types (lowercase): string, integer, long, double, boolean, timestamp, binary.\n\n") + sb.append("TABLE-GROUPING RULES (CRITICAL — read carefully):\n") + sb.append("- DO NOT produce a generic catch-all table (like `pdf_tables` with columns `page,table_index,col_0,col_1,...`). That defeats the purpose of this operator. 
The downstream user wants typed, semantically-named tables they can chart and analyze directly.\n") + sb.append("- Instead, READ the sample text to identify real, business-meaningful tables. Name each table after WHAT IT REPRESENTS (e.g., `revenue`, `headcount`, `expenses`), not after where it appeared (e.g., not `table_on_page_2`).\n") + sb.append("- Name columns after their semantic meaning (e.g., `month`, `region`, `revenue_usd`), NOT positional indices (`col_0`, `col_1`).\n") + sb.append("- MERGE physical tables that share schema and meaning (e.g., same revenue table appearing on multiple pages, quarterly results repeated per quarter, rows continuing across page breaks) into ONE TableSpec with combined rows. Add a discriminator column if the merged source needs disambiguation (e.g., `quarter`, `year`, `region`).\n") + sb.append("- KEEP separate any physically-similar but semantically-different tables (e.g., a 'revenue' table and a 'headcount' table may both have a `category`+`number` shape but mean different things — emit two TableSpecs).\n") + sb.append("- For each TableSpec, write a one-line `description` that says which physical tables you grouped and why.\n") + sb.append("- If the file legitimately has only ONE logical table, return a single-entry list — but still name it semantically, not generically.\n\n") + sb.append("CODE SHAPE — your `code` MUST follow this skeleton EXACTLY:\n") + sb.append("```python\n") + sb.append("from pytexera import *\n") + sb.append("import pdfplumber # or other libs below\n\n") + sb.append("class GenerateOperator(UDFSourceOperator):\n") + sb.append(" @overrides\n") + sb.append(" def produce(self) -> Iterator[Union[TupleLike, TableLike, None]]:\n") + sb.append(" file_uri = \"__TEXERA_FILE_URI__\"\n") + sb.append(" # ... open and parse the file ...\n") + sb.append(" for row in rows_of_table_a:\n") + sb.append(" yield {\"__table__\": \"\", \"col1\": ..., \"col2\": ...}\n") + sb.append(" for row in rows_of_table_b:\n") + sb.append(" yield {\"__table__\": \"\", \"colA\": ..., \"colB\": ...}\n") + sb.append("```\n\n") + sb.append("RULES for the code:\n") + sb.append("- Use the literal token `__TEXERA_FILE_URI__` for the file path. The backend substitutes it with an absolute local path before execution. DO NOT wrap it in any other path manipulation.\n") + sb.append("- Yield **plain Python dicts**, NOT `Tuple(...)` objects. The framework accepts dicts. e.g.: `yield {\"__table__\": \"revenue\", \"month\": \"Jan\", ...}`.\n") + sb.append("- Tag EVERY yielded row with a `__table__` key matching one of the `tables[*].name` entries you declared.\n") + sb.append("- The dict keys (other than `__table__`) MUST match the column names you declared in `tables[*].columns[*].name` exactly. Don't introduce columns not declared in the schema.\n") + sb.append("- Cast numeric values to the declared type: `int(...)` for integer/long, `float(...)` for double. Strip commas from formatted numbers (\"1,234\" → 1234). Wrap each cast in try/except and skip the row on failure rather than crashing the whole operator.\n") + sb.append("- For PDF tables: use `pdfplumber`. 
Iterate `page.extract_tables()` and check the header row to decide which logical table you're looking at — don't rely on page numbers alone (a table might shift between pages across files).\n")
+    sb.append("- Skip rows that are missing required fields (e.g., empty cells, header echoes mid-table) rather than yielding partial rows.\n")
+    sb.append("- Available libraries (already installed): pdfplumber, pypdf, pandas, openpyxl, lxml, beautifulsoup4, json, csv, re. DO NOT use network libraries.\n")
+    if (isFolder) {
+      sb.append("- This is a FOLDER input (")
+        .append(fileCount)
+        .append(" files of the same kind). `file_uri` will be a DIRECTORY path, not a file path. DO NOT call single-file readers such as `pdfplumber.open(file_uri)` directly. Iterate over every file first — for example `for path in pathlib.Path(file_uri).iterdir():` — then open each file and apply the same parser to it.\n")
+    }
+    sb.append("\n")
+    sb.append("---- FILE CONTEXT ----\n")
+    sb.append("File: ").append(displayName).append("\n")
+    if (extension.nonEmpty) sb.append("Extension: ").append(extension).append("\n")
+    sb.append("Is folder: ").append(isFolder).append("\n")
+    if (userHint.nonEmpty) sb.append("User hint: ").append(userHint).append("\n")
+    sb.append("\n")
+    sb.append("---- SAMPLE (").append(samplePreview.kind).append(") ----\n")
+    // Cap the sample text. Claude Haiku 4.5 reads through context linearly, so trimming
+    // here directly cuts generation time. 6KB is enough for a couple of pages of PDF text
+    // or a few hundred rows of CSV/JSON sample.
+    sb.append(truncate(samplePreview.text, 6000)).append("\n")
+    if (samplePreview.hex.nonEmpty) {
+      sb.append("\n---- HEX PREVIEW (first 256 bytes) ----\n")
+      sb.append(samplePreview.hex).append("\n")
+    }
+    if (previousCode.nonEmpty || previousError.nonEmpty) {
+      sb.append("\n---- ITERATIVE REPAIR ----\n")
+      sb.append("Your previous attempt failed. Fix the code based on the error below.\n")
+      if (previousError.nonEmpty) {
+        sb.append("Error:\n").append(truncate(previousError, 4000)).append("\n")
+      }
+      if (previousCode.nonEmpty) {
+        sb.append("Previous code:\n").append(truncate(previousCode, 8000)).append("\n")
+      }
+    }
+    sb.append("\nReturn ONLY the JSON object. No prose, no markdown fences.\n")
+    sb.toString
+  }
+
+  private def truncate(s: String, max: Int): String =
+    if (s.length <= max) s else s.substring(0, max) + s"\n... [truncated ${s.length - max} chars]"
+
+  private def callLLM(model: String, prompt: String): String = {
+    // Note: we don't send response_format={"type":"json_object"} because Claude (the default
+    // on this LiteLLM proxy) doesn't natively support it. We rely on prompt-only JSON
+    // instructions + stripJsonFences in the parser.
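+    // The body below follows the OpenAI-compatible chat-completions request shape that the LiteLLM proxy serves.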
+ val requestBody = Json.obj( + "model" -> model, + "max_tokens" -> MaxTokens, + "messages" -> Json.arr( + Json.obj( + "role" -> "user", + "content" -> prompt + ) + ) + ) + + val response = Unirest + .post(s"${LLMConfig.baseUrl}/chat/completions") + .header("Authorization", s"Bearer ${LLMConfig.masterKey}") + .header("Content-Type", "application/json") + .connectTimeout(ConnectTimeoutMs) + .socketTimeout(SocketTimeoutMs) + .body(Json.stringify(requestBody)) + .asString() + + if (response.getStatus >= 400) { + throw new WebApplicationException( + s"LLM call failed (${response.getStatus}): ${response.getBody}", + response.getStatus + ) + } + + val body = Json.parse(response.getBody) + (body \ "choices" \ 0 \ "message" \ "content").asOpt[String].getOrElse { + throw new WebApplicationException( + s"Unexpected LLM response (no choices[0].message.content): ${response.getBody}", + 502 + ) + } + } + + // --------------------------------------------------------------------------- + // Parsing the LLM output + // --------------------------------------------------------------------------- + + private case class ParsedLlm(code: String, tables: List[ParsedTable]) + private case class ParsedTable(name: String, description: String, columns: List[LLMSourceColumn]) + + private def parseLLMResponse(rawContent: String): ParsedLlm = { + val cleaned = stripJsonFences(rawContent.trim) + val node = try Json.parse(cleaned) + catch { + case _: Throwable => + throw new WebApplicationException( + s"LLM did not return valid JSON. Content: ${truncate(rawContent, 1000)}", + 502 + ) + } + val code = (node \ "code").asOpt[String].getOrElse { + throw new WebApplicationException("LLM response missing 'code' field.", 502) + } + val tables = (node \ "tables").asOpt[Seq[JsObject]].getOrElse(Seq.empty).map { t => + val name = (t \ "name").asOpt[String].getOrElse("table") + val description = (t \ "description").asOpt[String].getOrElse("") + val columns = (t \ "columns").asOpt[Seq[JsObject]].getOrElse(Seq.empty).map { c => + val cname = (c \ "name").asOpt[String].getOrElse("column") + val ctype = (c \ "type").asOpt[String].getOrElse("STRING").toUpperCase + LLMSourceColumn(cname, normalizeType(ctype)) + }.toList + ParsedTable(name, description, columns) + }.toList + if (tables.isEmpty) { + throw new WebApplicationException("LLM response had no tables.", 502) + } + ParsedLlm(code, tables) + } + + // Texera's AttributeType serializes lowercase via @JsonValue (see AttributeType.java), + // so persisted values MUST be lowercase or the property panel's enum dropdown won't match. 
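+  // e.g. normalizeType("Int64") == "long", normalizeType("FLOAT") == "double"; anything unrecognized falls back to "string".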
+ private val AllowedTypes = Set("string", "integer", "long", "double", "boolean", "timestamp", "binary") + private def normalizeType(t: String): String = { + val lower = t.toLowerCase + if (AllowedTypes.contains(lower)) lower + else lower match { + case "str" => "string" + case "int" | "int32" => "integer" + case "int64" | "bigint" => "long" + case "float" | "float64" | "numeric" | "decimal" => "double" + case "bool" => "boolean" + case "datetime" | "date" | "time" => "timestamp" + case "bytes" | "blob" => "binary" + case _ => "string" + } + } + + private def stripJsonFences(s: String): String = { + val fenced = "(?s)^```(?:json)?\\s*(.*?)\\s*```$".r + s match { + case fenced(inner) => inner + case _ => s + } + } + + // --------------------------------------------------------------------------- + // Python syntax validation + // --------------------------------------------------------------------------- + + /** Returns Some(error) if the code fails to parse, None if it's valid OR if no python binary is available. */ + private def validatePython(code: String): Option[String] = { + val tmpFile = Files.createTempFile("texera-llm-source-", ".py") + try { + Files.write(tmpFile, code.getBytes(StandardCharsets.UTF_8)) + val pythonBinary = findPythonBinary().getOrElse(return None) + val cmd = Seq( + pythonBinary, + "-c", + s"import ast,sys; ast.parse(open(${pyStringLiteral(tmpFile)}).read())" + ) + val stderr = new StringBuilder + val logger = ProcessLogger(_ => (), line => stderr.append(line).append('\n')) + val exit = Try(cmd.!(logger)).getOrElse(Int.MinValue) + if (exit == 0) None + else if (exit == Int.MinValue) None // subprocess couldn't run — treat as skipped, not failure + else Some(if (stderr.nonEmpty) stderr.toString.trim else s"python exited with code $exit") + } catch { + case _: java.io.IOException => None + } finally { + Try(Files.deleteIfExists(tmpFile)) + } + } + + /** + * Prefer the interpreter configured for real Python workers so dry-runs exercise the same + * dependency environment as workflow execution. The fallback probes keep local development + * usable when python.path is intentionally left blank or points to a stale binary. + */ + private[resource] def pythonBinaryCandidates: Seq[String] = + (Seq(PythonUtils.getPythonExecutable) ++ + Seq("python3", "python", "/opt/anaconda3/bin/python", "/usr/bin/python3", "/usr/local/bin/python3")) + .map(_.trim) + .filter(_.nonEmpty) + .distinct + + /** Probe for a working python binary. Cached across requests after first call. */ + private def findPythonBinary(): Option[String] = { + cachedPythonBinary match { + case Some(value) => value + case None => + val found = pythonBinaryCandidates.find { bin => + Try(Seq(bin, "-c", "0").!(ProcessLogger(_ => (), _ => ()))).toOption.contains(0) + } + cachedPythonBinary = Some(found) + found + } + } + + private def pyStringLiteral(p: Path): String = "'" + p.toAbsolutePath.toString.replace("'", "\\'") + "'" +} diff --git a/amber/src/main/scala/org/apache/texera/web/resource/SmartFileInferenceResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/SmartFileInferenceResource.scala new file mode 100644 index 00000000000..27d9706462c --- /dev/null +++ b/amber/src/main/scala/org/apache/texera/web/resource/SmartFileInferenceResource.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.web.resource + +import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty} +import org.apache.texera.amber.core.storage.FileResolver +import org.apache.texera.amber.operator.source.scan.FileDecodingMethod +import org.apache.texera.amber.operator.source.scan.smart.{ + InferenceOverrides, + SmartFileFormat, + SmartFileInferencer +} + +import javax.annotation.security.RolesAllowed +import javax.ws.rs.core.MediaType +import javax.ws.rs.{Consumes, POST, Path, Produces} +import scala.jdk.CollectionConverters._ + +@JsonIgnoreProperties(ignoreUnknown = true) +case class SmartFileInferenceRequest( + @JsonProperty("fileName") fileName: String, + @JsonProperty("fileEncoding") fileEncoding: Option[String] = None, + @JsonProperty("formatOverride") formatOverride: Option[String] = None, + @JsonProperty("customDelimiter") customDelimiter: Option[String] = None, + @JsonProperty("hasHeader") hasHeader: Option[Boolean] = None, + @JsonProperty("sheetName") sheetName: Option[String] = None, + @JsonProperty("flatten") flatten: Option[Boolean] = None +) + +case class SmartFileInferenceColumn(name: String, `type`: String) + +case class SmartFileInferenceResponse( + detectedFormat: String, + schema: java.util.List[SmartFileInferenceColumn], + customDelimiter: String, + hasHeader: java.lang.Boolean, + sheetName: String, + availableSheetNames: java.util.List[String], + flatten: java.lang.Boolean, + isFolder: Boolean, + fileCount: Int +) + +@Path("/file-inference") +@RolesAllowed(Array("REGULAR", "ADMIN")) +@Consumes(Array(MediaType.APPLICATION_JSON)) +@Produces(Array(MediaType.APPLICATION_JSON)) +class SmartFileInferenceResource { + + @POST + @Path("/preview") + def preview(request: SmartFileInferenceRequest): SmartFileInferenceResponse = { + val uri = FileResolver.resolve(request.fileName) + val charset = request.fileEncoding + .flatMap(name => tryParseEncoding(name)) + .getOrElse(FileDecodingMethod.UTF_8.getCharset) + + val overrides = InferenceOverrides( + format = request.formatOverride.flatMap(s => tryParseFormat(s)), + delimiter = request.customDelimiter.flatMap(_.headOption), + hasHeader = request.hasHeader, + sheetName = request.sheetName, + flatten = request.flatten + ) + + val result = SmartFileInferencer.infer(uri, charset, overrides) + val columns = result.schema.getAttributes + .map(a => SmartFileInferenceColumn(a.getName, a.getType.toString)) + .asJava + + SmartFileInferenceResponse( + detectedFormat = result.format.getLabel, + schema = columns, + customDelimiter = result.csvDelimiter.orNull, + hasHeader = result.csvHasHeader.map(java.lang.Boolean.valueOf).orNull, + sheetName = result.sheetName.orNull, + availableSheetNames = result.availableSheetNames.asJava, + flatten = result.flatten.map(java.lang.Boolean.valueOf).orNull, + isFolder = result.isFolder, + fileCount = result.fileCount + ) + } + + private def tryParseFormat(value: String): Option[SmartFileFormat] = { + 
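+    // e.g. tryParseFormat("csv") matches the enum name CSV; "Plain text" resolves via the label scan; unknown values yield None.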
val upper = value.toUpperCase + // Accept both the enum name (CSV, TSV, ...) and the user-facing label ("Plain text", ...). + try Some(SmartFileFormat.valueOf(upper)) + catch { + case _: IllegalArgumentException => + SmartFileFormat.values().find(_.getLabel.equalsIgnoreCase(value)) + } + } + + private def tryParseEncoding(value: String): Option[java.nio.charset.Charset] = + try Some(FileDecodingMethod.valueOf(value.toUpperCase).getCharset) + catch { case _: IllegalArgumentException => None } +} diff --git a/amber/src/main/scala/org/apache/texera/web/resource/llm_source_dry_run.py b/amber/src/main/scala/org/apache/texera/web/resource/llm_source_dry_run.py new file mode 100644 index 00000000000..7aeaebf45d9 --- /dev/null +++ b/amber/src/main/scala/org/apache/texera/web/resource/llm_source_dry_run.py @@ -0,0 +1,129 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Dry-run a generated UDFSourceOperator class against a real file so the Generate +endpoint can return useful errors at design time instead of letting the workflow +crash at run time. + +Args (sys.argv): + 1. path to file containing the generated Python code + 2. python-source-path to add to sys.path (so `from pytexera import *` resolves) + 3. max rows to consume before stopping (positive int) + +Output: + Single JSON line on stdout. Keys: + {"ok": true, "rowsSeen": , "samples": []} + or + {"ok": false, "error": "", "traceback": ""} + +Exit code is always 0 — failures are reported via JSON, not via exit codes, so the +caller can distinguish "dry-run subprocess failed to launch" (non-zero) from "user +code threw at runtime" (zero + ok=false). +""" + +import json +import sys +import traceback + + +def main() -> None: + if len(sys.argv) < 4: + print(json.dumps({"ok": False, "error": "dry_run harness called with wrong arg count"})) + return + code_path, python_src_path, max_rows_str = sys.argv[1], sys.argv[2], sys.argv[3] + sys.path.insert(0, python_src_path) + try: + max_rows = max(1, int(max_rows_str)) + except ValueError: + max_rows = 5 + + try: + with open(code_path, "r", encoding="utf-8") as fh: + code = fh.read() + except OSError as e: + print(json.dumps({"ok": False, "error": f"could not read code file: {e}"})) + return + + try: + # Bring pytexera names into the module so the user code's `from pytexera import *` + # works the same way it would in the real worker. + import pytexera # noqa: F401 + + # Give the exec'd module a stable __name__ / __module__; without this the + # `@overrides` decorator's signature compatibility check trips on a None module + # attribute (`callable.__module__.split(...)`). 
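+        # Any stable string works here; nothing else reads the module name.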
+ namespace: dict = {"__name__": "llm_source_generated", "__module__": "llm_source_generated"} + exec(compile(code, "", "exec"), namespace) + op_cls = namespace.get("GenerateOperator") + if op_cls is None: + print(json.dumps({ + "ok": False, + "error": "generated code did not define `GenerateOperator`", + })) + return + + op = op_cls() + if hasattr(op, "open"): + op.open() + rows = [] + try: + for row in op.produce(): + if row is None: + continue + rows.append(row) + if len(rows) >= max_rows: + break + finally: + if hasattr(op, "close"): + op.close() + + # Normalize each row to a plain dict for JSON serialization. + samples = [] + for r in rows[:3]: + try: + if hasattr(r, "as_dict"): + samples.append(r.as_dict()) + elif hasattr(r, "_field_data"): + # pytexera Tuple stores fields in _field_data (OrderedDict) + samples.append({k: _to_json_safe(v) for k, v in r._field_data.items()}) + elif isinstance(r, dict): + samples.append({k: _to_json_safe(v) for k, v in r.items()}) + else: + samples.append({"_repr": repr(r)}) + except Exception as inner: + samples.append({"_repr_error": str(inner)}) + + print(json.dumps({"ok": True, "rowsSeen": len(rows), "samples": samples})) + except Exception as e: + print(json.dumps({ + "ok": False, + "error": f"{type(e).__name__}: {e}", + "traceback": traceback.format_exc(), + })) + + +def _to_json_safe(value): + if isinstance(value, (str, int, float, bool)) or value is None: + return value + try: + return str(value) + except Exception: + return repr(value) + + +if __name__ == "__main__": + main() diff --git a/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala b/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala index b335ed0c3c7..5ff51f7f7e2 100644 --- a/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala +++ b/amber/src/main/scala/org/apache/texera/web/service/ExecutionResultService.scala @@ -46,6 +46,7 @@ import org.apache.texera.amber.engine.architecture.rpc.controlreturns.WorkflowAg import org.apache.texera.amber.engine.common.AmberRuntime import org.apache.texera.amber.engine.common.client.AmberClient import org.apache.texera.amber.engine.common.executionruntimestate.ExecutionMetadataStore +import org.apache.texera.amber.util.ImageFormatUtils import org.apache.texera.web.SubscriptionManager import org.apache.texera.web.model.websocket.event.{ PaginatedResultEvent, @@ -59,6 +60,7 @@ import org.apache.texera.web.service.WorkflowExecutionService.getLatestExecution import org.apache.texera.web.storage.{ExecutionStateStore, WorkflowStateStore} import java.lang.Byte.{SIZE => BitsPerByte} +import java.util.Base64 import java.util.UUID import scala.collection.mutable import scala.concurrent.duration.DurationInt @@ -76,6 +78,11 @@ object ExecutionResultService { ) .mkString("") + private def bytesToImageDataUrl(bytes: Array[Byte]): Option[String] = + ImageFormatUtils + .detectMimeType(bytes) + .map(mimeType => s"data:$mimeType;base64,${Base64.getEncoder.encodeToString(bytes)}") + /** * Converts a collection of Tuples to a list of JSON ObjectNodes. 
* @@ -107,25 +114,27 @@ object ExecutionResultService { case AttributeType.BINARY => value match { case byteArray: Array[Byte] => - val totalSize = byteArray.length - val sizeFormatted = f"$totalSize%,d" - val totalBits = totalSize * BitsPerByte - val preview = - if (totalBits <= binaryPreviewLeadingBits + binaryPreviewTrailingBits) - bytesToBinaryString(byteArray) - else { - val leadingBytesNeeded = - math.ceil(binaryPreviewLeadingBits.toDouble / BitsPerByte).toInt - val trailingBytesNeeded = - math.ceil(binaryPreviewTrailingBits.toDouble / BitsPerByte).toInt - val leading = bytesToBinaryString(byteArray.take(leadingBytesNeeded)) - .take(binaryPreviewLeadingBits) - val trailing = bytesToBinaryString( - byteArray.takeRight(trailingBytesNeeded) - ).takeRight(binaryPreviewTrailingBits) - s"$leading...$trailing" - } - s"" + bytesToImageDataUrl(byteArray).getOrElse { + val totalSize = byteArray.length + val sizeFormatted = f"$totalSize%,d" + val totalBits = totalSize * BitsPerByte + val preview = + if (totalBits <= binaryPreviewLeadingBits + binaryPreviewTrailingBits) + bytesToBinaryString(byteArray) + else { + val leadingBytesNeeded = + math.ceil(binaryPreviewLeadingBits.toDouble / BitsPerByte).toInt + val trailingBytesNeeded = + math.ceil(binaryPreviewTrailingBits.toDouble / BitsPerByte).toInt + val leading = bytesToBinaryString(byteArray.take(leadingBytesNeeded)) + .take(binaryPreviewLeadingBits) + val trailing = bytesToBinaryString( + byteArray.takeRight(trailingBytesNeeded) + ).takeRight(binaryPreviewTrailingBits) + s"$leading...$trailing" + } + s"" + } case _ => throw new RuntimeException( diff --git a/amber/src/test/scala/org/apache/texera/web/resource/LLMSourceResourceSpec.scala b/amber/src/test/scala/org/apache/texera/web/resource/LLMSourceResourceSpec.scala new file mode 100644 index 00000000000..3a3fd3423b8 --- /dev/null +++ b/amber/src/test/scala/org/apache/texera/web/resource/LLMSourceResourceSpec.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.web.resource + +import org.apache.texera.amber.config.PythonUtils +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import java.nio.file.Files + +class LLMSourceResourceSpec extends AnyFlatSpec with Matchers { + + "LLMSourceResource.pythonBinaryCandidates" should "prefer the configured worker interpreter" in { + new LLMSourceResource().pythonBinaryCandidates.head shouldBe PythonUtils.getPythonExecutable + } + + "LLMSourceResource.resolveRuntimeInputPath" should "keep a folder input as a folder for dry-runs" in { + val dir = Files.createTempDirectory("llm-source-folder-dry-run-") + try { + Files.writeString(dir.resolve("a.txt"), "a") + Files.writeString(dir.resolve("b.txt"), "b") + + val resolved = new LLMSourceResource().resolveRuntimeInputPath(dir.toUri) + + Files.isDirectory(resolved) shouldBe true + resolved shouldBe dir + } finally { + Files.deleteIfExists(dir.resolve("a.txt")) + Files.deleteIfExists(dir.resolve("b.txt")) + Files.deleteIfExists(dir) + } + } +} diff --git a/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala b/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala index 0afe31fc099..2d86c47a158 100644 --- a/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala +++ b/amber/src/test/scala/org/apache/texera/web/service/ExecutionResultServiceSpec.scala @@ -23,6 +23,10 @@ import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tup import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import java.awt.image.BufferedImage +import java.io.ByteArrayOutputStream +import javax.imageio.ImageIO + class ExecutionResultServiceSpec extends AnyFlatSpec with Matchers { "convertTuplesToJson" should "convert tuples with various field types correctly" in { @@ -181,6 +185,24 @@ class ExecutionResultServiceSpec extends AnyFlatSpec with Matchers { emptyBinaryString should include("size = 0 bytes") } + it should "serialize recognized image binaries as data URLs" in { + val attributes = List( + new Attribute("image", AttributeType.BINARY) + ) + val schema = new Schema(attributes) + val imageBytes = pngBytes(width = 2, height = 2) + + val tuple = Tuple + .builder(schema) + .add("image", AttributeType.BINARY, imageBytes) + .build() + + val result = ExecutionResultService.convertTuplesToJson(List(tuple)) + + result should have size 1 + result.head.get("image").asText() should startWith("data:image/png;base64,") + } + it should "handle binary data with single ByteBuffer" in { val attributes = List( new Attribute("singleBufferBinary", AttributeType.BINARY) @@ -475,4 +497,11 @@ class ExecutionResultServiceSpec extends AnyFlatSpec with Matchers { resultsDefault(2).get("value").asText() shouldBe "medium length" resultsDefault(3).get("value").asText() should endWith("...") } + + private def pngBytes(width: Int, height: Int): Array[Byte] = { + val image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB) + val out = new ByteArrayOutputStream() + ImageIO.write(image, "png", out) + out.toByteArray + } } diff --git a/build.sbt b/build.sbt index b7b6b3cfb20..22dcd24e085 100644 --- a/build.sbt +++ b/build.sbt @@ -50,6 +50,19 @@ lazy val asfLicensingSettingsWithVendored = AddMetaInfLicenseFiles.workflowOpera val jacksonVersion = "2.18.6" +// Globally exclude transitive Hadoop landmines that conflict with Texera's +// Dropwizard + Jersey stack. 
These ride in via Parquet's `parquet-hadoop`, +// added in common/workflow-operator/build.sbt for SmartFileScan. Defining the +// excludes at ThisBuild level ensures they apply to every project that +// transitively pulls Hadoop — most importantly amber. +ThisBuild / excludeDependencies ++= Seq( + ExclusionRule("javax.servlet.jsp", "jsp-api"), + ExclusionRule("javax.servlet", "servlet-api"), + ExclusionRule(organization = "com.sun.jersey"), + ExclusionRule(organization = "com.sun.jersey.contribs"), + ExclusionRule("com.github.pjfanning", "jersey-json") +) + lazy val DAO = (project in file("common/dao")).settings(asfLicensingSettings) lazy val Config = (project in file("common/config")).settings(asfLicensingSettings) lazy val Auth = (project in file("common/auth")) diff --git a/common/workflow-core/src/main/scala/org/apache/texera/amber/util/ImageFormatUtils.scala b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/ImageFormatUtils.scala new file mode 100644 index 00000000000..27c1d66ef9f --- /dev/null +++ b/common/workflow-core/src/main/scala/org/apache/texera/amber/util/ImageFormatUtils.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.util + +object ImageFormatUtils { + + private val PngMagic = Array[Byte](0x89.toByte, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a) + private val JpegMagic = Array[Byte](0xff.toByte, 0xd8.toByte, 0xff.toByte) + private val Gif87Magic = "GIF87a".getBytes("US-ASCII") + private val Gif89Magic = "GIF89a".getBytes("US-ASCII") + private val RiffMagic = "RIFF".getBytes("US-ASCII") + private val WebpMagic = "WEBP".getBytes("US-ASCII") + + def detectFormat(bytes: Array[Byte]): Option[String] = { + if (startsWith(bytes, PngMagic)) Some("png") + else if (startsWith(bytes, JpegMagic)) Some("jpeg") + else if (startsWith(bytes, Gif87Magic) || startsWith(bytes, Gif89Magic)) Some("gif") + else if (isWebp(bytes)) Some("webp") + else None + } + + def detectMimeType(bytes: Array[Byte]): Option[String] = + detectFormat(bytes).map { + case "png" => "image/png" + case "jpeg" => "image/jpeg" + case "gif" => "image/gif" + case "webp" => "image/webp" + } + + def extensionFormat(path: String): Option[String] = { + val lower = path.toLowerCase + val dot = lower.lastIndexOf('.') + if (dot < 0) return None + lower.substring(dot + 1) match { + case "png" => Some("png") + case "jpg" | "jpeg" => Some("jpeg") + case "gif" => Some("gif") + case "webp" => Some("webp") + case _ => None + } + } + + private def isWebp(bytes: Array[Byte]): Boolean = + bytes.length >= 12 && + startsWith(bytes, RiffMagic) && + startsWith(bytes.drop(8), WebpMagic) + + private def startsWith(bytes: Array[Byte], prefix: Array[Byte]): Boolean = { + if (bytes.length < prefix.length) return false + var index = 0 + while (index < prefix.length) { + if (bytes(index) != prefix(index)) return false + index += 1 + } + true + } +} diff --git a/common/workflow-operator/build.sbt b/common/workflow-operator/build.sbt index 1c082cae96e..a79165d0b64 100644 --- a/common/workflow-operator/build.sbt +++ b/common/workflow-operator/build.sbt @@ -113,4 +113,50 @@ libraryDependencies ++= Seq( "org.apache.lucene" % "lucene-analyzers-common" % "8.11.4" ) +// SmartFileSource: Parquet + Excel support. +// +// Hadoop drags in a LOT of stuff Texera doesn't use, and several of those +// transitive deps conflict head-on with Texera's existing Dropwizard + Jersey-3 +// stack. We exclude all of the known troublemakers here. If you're tempted to +// remove one of these, run TexeraWebApplication and watch it die at startup. 
+// +// Conflicts being avoided: +// - slf4j-reload4j / reload4j: conflicts with the project's logback setup +// - jsp-api 2.1: ships an ancient `javax.el.ExpressionFactory` (no +// `newInstance()`) that shadows the real `javax.el-3.0.x` Dropwizard's +// Hibernate Validator needs (NoSuchMethodError otherwise) +// - com.sun.jersey.* (Jersey 1.x): collides with the project's Jersey 3 via +// HK2 — JSONRootElementProvider gets instantiated and explodes on init +// - tomcat / jasper: only used by Hadoop's embedded web UIs +// - servlet-api 2.5: ancient javax servlet that conflicts with Jakarta +libraryDependencies ++= Seq( + "org.apache.parquet" % "parquet-hadoop" % "1.13.1", + "org.apache.hadoop" % "hadoop-common" % "3.3.6" + exclude("org.slf4j", "slf4j-reload4j") + exclude("ch.qos.reload4j", "reload4j") + exclude("javax.servlet.jsp", "jsp-api") + exclude("javax.servlet", "servlet-api") + exclude("org.mortbay.jetty", "jetty") + exclude("org.mortbay.jetty", "jetty-util") + exclude("org.mortbay.jetty", "jsp-api-2.1") + exclude("tomcat", "jasper-compiler") + exclude("tomcat", "jasper-runtime") + exclude("com.sun.jersey", "jersey-core") + exclude("com.sun.jersey", "jersey-server") + exclude("com.sun.jersey", "jersey-json") + exclude("com.sun.jersey", "jersey-servlet") + exclude("com.sun.jersey", "jersey-client") + excludeAll(ExclusionRule(organization = "com.sun.jersey")), + "org.apache.hadoop" % "hadoop-mapreduce-client-core" % "3.3.6" + exclude("org.slf4j", "slf4j-reload4j") + exclude("ch.qos.reload4j", "reload4j") + exclude("javax.servlet.jsp", "jsp-api") + exclude("javax.servlet", "servlet-api") + excludeAll(ExclusionRule(organization = "com.sun.jersey")), + "org.apache.poi" % "poi-ooxml" % "5.2.5" +) +// Global Hadoop transitive-dep blackhole is declared at the top-level +// build.sbt as `ThisBuild / excludeDependencies` so it applies to every +// downstream project (especially amber) that pulls Hadoop through us. 
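+// A quick, hedged sanity check that the exclusions took effect (project name
+// illustrative; pick whichever module you care about): from the sbt shell run
+//
+//   show amber/Compile/dependencyClasspath
+//
+// and confirm that no com.sun.jersey, jsp-api, or servlet-api 2.5 jars survived.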
+ libraryDependencies += "io.github.classgraph" % "classgraph" % "4.8.184" % Test diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala index 4e9d6c6e2cd..db322c6084f 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/LogicalOp.scala @@ -41,6 +41,7 @@ import org.apache.texera.amber.operator.difference.DifferenceOpDesc import org.apache.texera.amber.operator.distinct.DistinctOpDesc import org.apache.texera.amber.operator.dummy.DummyOpDesc import org.apache.texera.amber.operator.filter.SpecializedFilterOpDesc +import org.apache.texera.amber.operator.fileSplit.FileSplitOpDesc import org.apache.texera.amber.operator.hashJoin.HashJoinOpDesc import org.apache.texera.amber.operator.huggingFace.{ HuggingFaceIrisLogisticRegressionOpDesc, @@ -81,6 +82,8 @@ import org.apache.texera.amber.operator.source.scan.arrow.ArrowSourceOpDesc import org.apache.texera.amber.operator.source.scan.csv.CSVScanSourceOpDesc import org.apache.texera.amber.operator.source.scan.csvOld.CSVOldScanSourceOpDesc import org.apache.texera.amber.operator.source.scan.json.JSONLScanSourceOpDesc +import org.apache.texera.amber.operator.source.scan.smart.SmartFileSourceOpDesc +import org.apache.texera.amber.operator.source.llm.LLMFileSourceOpDesc import org.apache.texera.amber.operator.source.scan.text.TextInputSourceOpDesc import org.apache.texera.amber.operator.source.sql.asterixdb.AsterixDBSourceOpDesc import org.apache.texera.amber.operator.source.sql.mysql.MySQLSourceOpDesc @@ -164,9 +167,12 @@ trait StateTransferFunc @JsonSubTypes( Array( new Type(value = classOf[IfOpDesc], name = "If"), + new Type(value = classOf[FileSplitOpDesc], name = "FileSplit"), new Type(value = classOf[SankeyDiagramOpDesc], name = "SankeyDiagram"), new Type(value = classOf[IcicleChartOpDesc], name = "IcicleChart"), new Type(value = classOf[FileListerSourceOpDesc], name = "FileLister"), + new Type(value = classOf[SmartFileSourceOpDesc], name = "SmartFileScan"), + new Type(value = classOf[LLMFileSourceOpDesc], name = "LLMFileScan"), new Type(value = classOf[CSVScanSourceOpDesc], name = "CSVFileScan"), // disabled the ParallelCSVScanSourceOpDesc so that it does not confuse user. it can be re-enabled when doing experiments. // new Type(value = classOf[ParallelCSVScanSourceOpDesc], name = "ParallelCSVFileScan"), diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDesc.scala new file mode 100644 index 00000000000..4b71e441202 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDesc.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.fileSplit + +import com.fasterxml.jackson.annotation.{JsonInclude, JsonProperty, JsonPropertyDescription} +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import org.apache.texera.amber.core.executor.OpExecWithClassName +import org.apache.texera.amber.core.tuple.{AttributeType, Schema} +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow._ +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import org.apache.texera.amber.operator.{LogicalOp, PortDescription} +import org.apache.texera.amber.util.JSONUtils.objectMapper + +class FileSplitOpDesc extends LogicalOp { + + @JsonProperty + @JsonSchemaTitle("File Column") + @JsonPropertyDescription("leave empty to auto-detect source_file or filename") + @JsonInclude(JsonInclude.Include.NON_ABSENT) + var fileAttribute: Option[String] = None + + override def getPhysicalOp( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ): PhysicalOp = + PhysicalOp + .oneToOnePhysicalOp( + workflowId, + executionId, + operatorIdentifier, + OpExecWithClassName( + "org.apache.texera.amber.operator.fileSplit.FileSplitOpExec", + objectMapper.writeValueAsString(this) + ) + ) + .withInputPorts(operatorInfo.inputPorts) + .withOutputPorts(operatorInfo.outputPorts) + .withParallelizable(false) + .withPropagateSchema( + SchemaPropagationFunc(inputSchemas => { + require(inputSchemas.size == 1, "File Split requires exactly one input") + val inputSchema = inputSchemas.values.head + resolveFileAttribute(inputSchema) + operatorInfo.outputPorts.map(port => port.id -> inputSchema).toMap + }) + ) + + override def operatorInfo: OperatorInfo = { + val outputPortInfo = + if (outputPorts != null && outputPorts.nonEmpty) { + outputPorts.zipWithIndex.map { + case (portDesc: PortDescription, idx) => + OutputPort(PortIdentity(idx), displayName = portDesc.displayName) + } + } else { + List(OutputPort(PortIdentity()), OutputPort(PortIdentity(1))) + } + + OperatorInfo( + userFriendlyName = "File Split", + operatorDescription = "Route rows from the same file to the same output port", + operatorGroupName = OperatorGroupConstants.UTILITY_GROUP, + inputPorts = List(InputPort()), + outputPorts = outputPortInfo, + dynamicOutputPorts = true, + allowPortCustomization = true + ) + } + + def resolveFileAttribute(schema: Schema): String = { + val attributeName = fileAttribute.getOrElse { + List("source_file", "filename") + .find(schema.containsAttribute) + .getOrElse( + throw new IllegalArgumentException( + "File Split requires a source_file or filename column, or an explicit File Column" + ) + ) + } + if (!schema.containsAttribute(attributeName)) { + throw new IllegalArgumentException(s"File Split column '$attributeName' does not exist") + } + if (schema.getAttribute(attributeName).getType != AttributeType.STRING) { + throw new IllegalArgumentException(s"File Split column '$attributeName' must be a STRING") + } + attributeName + } +} diff --git 
a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExec.scala
new file mode 100644
index 00000000000..9816cf34c17
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExec.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.fileSplit
+
+import org.apache.texera.amber.core.executor.OperatorExecutor
+import org.apache.texera.amber.core.tuple.{Tuple, TupleLike}
+import org.apache.texera.amber.core.workflow.PortIdentity
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+
+import scala.collection.mutable
+
+class FileSplitOpExec(descString: String) extends OperatorExecutor {
+  private val desc: FileSplitOpDesc = objectMapper.readValue(descString, classOf[FileSplitOpDesc])
+  // Files are assigned to ports round-robin in first-seen order; the LinkedHashMap keeps
+  // the assignment stable for the lifetime of the executor.
+  private val fileToPort = mutable.LinkedHashMap.empty[String, PortIdentity]
+  private var fileAttribute: String = _
+  private var outputPortCount: Int = _
+
+  override def open(): Unit = {
+    outputPortCount = desc.operatorInfo.outputPorts.length
+    require(outputPortCount > 0, "File Split requires at least one output port")
+  }
+
+  override def processTupleMultiPort(
+      tuple: Tuple,
+      port: Int
+  ): Iterator[(TupleLike, Option[PortIdentity])] = {
+    if (fileAttribute == null) {
+      fileAttribute = desc.resolveFileAttribute(tuple.getSchema)
+    }
+    val sourceFile = Option(tuple.getField[String](fileAttribute)).getOrElse(
+      throw new IllegalArgumentException(s"File Split column '$fileAttribute' cannot be null")
+    )
+    val outputPort = fileToPort.getOrElseUpdate(
+      sourceFile,
+      PortIdentity(fileToPort.size % outputPortCount)
+    )
+    Iterator.single((tuple, Some(outputPort)))
+  }
+
+  // This executor only ever emits through processTupleMultiPort; fail loudly if the
+  // engine somehow routes a tuple through the single-port entry point.
+  override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] =
+    throw new UnsupportedOperationException("File Split emits only via processTupleMultiPort")
+}
diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/llm/LLMFileSourceOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/llm/LLMFileSourceOpDesc.scala
new file mode 100644
index 00000000000..73ffec7ebb9
--- /dev/null
+++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/llm/LLMFileSourceOpDesc.scala
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.llm
+
+import com.fasterxml.jackson.annotation.{JsonInclude, JsonProperty, JsonPropertyDescription}
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
+import com.typesafe.scalalogging.LazyLogging
+import org.apache.texera.amber.core.executor.OpExecWithCode
+import org.apache.texera.amber.core.storage.FileResolver
+import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema}
+import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
+import org.apache.texera.amber.core.workflow.{OutputPort, PhysicalOp, SchemaPropagationFunc}
+import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import org.apache.texera.amber.operator.source.SourceOperatorDescriptor
+import org.apache.texera.amber.operator.source.scan.FolderInputResolver
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+
+import scala.jdk.CollectionConverters._
+import scala.util.{Failure, Success, Try}
+
+/**
+ * Source operator whose Python body is generated by an LLM at design time.
+ *
+ * The descriptor is a thin wrapper around the Python UDF source runtime: at compile time we
+ * build OpExecWithCode(generatedCode, "python"), exactly like PythonUDFSourceOpDescV2.
+ * The LLM is never called at execution time — it runs once via the /llm-source/generate
+ * REST endpoint when the user clicks the "Generate" button in the property panel, and the
+ * resulting code is saved with the workflow.
+ *
+ * Multi-table handling (MVP): the operator emits a single output stream. Each emitted row
+ * carries a synthetic __table__ discriminator column identifying which logical table it
+ * belongs to. The frontend offers an "Add Filter+Projection" helper that creates a
+ * per-table downstream chain. Phase 2 will swap this for true multi-output ports by
+ * extending the Python UDF runtime; the JSON shape here is forward-compatible.
+ *
+ * The generatedCode is allowed to embed the token __TEXERA_FILE_URI__, which getPhysicalOp
+ * substitutes with the resolved file URI before handing the code to the worker. This keeps
+ * dataset paths out of the saved workflow JSON. Phase 2 should replace the token with
+ * proper OpExec param plumbing.
+ */
+class LLMFileSourceOpDesc extends SourceOperatorDescriptor with LazyLogging {
+
+  @JsonProperty
+  @JsonSchemaTitle("File / Folder")
+  @JsonPropertyDescription("File or folder to parse. Pick from your datasets or paste a URI.")
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var fileName: Option[String] = None
+
+  @JsonProperty
+  @JsonSchemaTitle("Hint")
+  @JsonPropertyDescription(
+    "Optional free-text intent for the LLM, e.g. 'extract the monthly revenue table'."
+  )
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var userHint: Option[String] = None
+
+  @JsonProperty(defaultValue = "")
+  @JsonSchemaTitle("Generated parser")
+  @JsonPropertyDescription("Python parser emitted by the Generate action. You can hand-edit it before running.")
+  var generatedCode: String = ""
+
+  @JsonProperty
+  @JsonSchemaTitle("Detected tables")
+  @JsonPropertyDescription("Tables the LLM identified in the file (set by the Generate action).")
+  var tables: java.util.List[LLMTableSpec] = new java.util.ArrayList[LLMTableSpec]()
+
+  @JsonProperty
+  @JsonSchemaTitle("Union schema")
+  @JsonPropertyDescription(
+    "Union of every detected table's columns plus the __table__ discriminator. Drives the operator's output schema. Set by the Generate action."
+  )
+  var unionColumns: java.util.List[Attribute] = new java.util.ArrayList[Attribute]()
+
+  @JsonProperty(defaultValue = "1")
+  @JsonSchemaTitle("Worker count")
+  @JsonPropertyDescription("Specify how many parallel workers to launch.")
+  var workers: Int = 1
+
+  @JsonProperty
+  @JsonSchemaTitle("LLM model")
+  @JsonPropertyDescription("Audit: model that generated the parser.")
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var llmModel: Option[String] = None
+
+  @JsonProperty
+  @JsonSchemaTitle("Sample hash")
+  @JsonPropertyDescription("Audit: SHA-256 of the file bytes the LLM saw.")
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var sampleHash: Option[String] = None
+
+  @JsonProperty
+  @JsonSchemaTitle("Generated at")
+  @JsonPropertyDescription("Audit: ISO-8601 timestamp of generation.")
+  @JsonInclude(JsonInclude.Include.NON_ABSENT)
+  var generatedAt: Option[String] = None
+
+  override def getPhysicalOp(
+      workflowId: WorkflowIdentity,
+      executionId: ExecutionIdentity
+  ): PhysicalOp = {
+    require(workers >= 1, "Need at least 1 worker.")
+    require(
+      generatedCode != null && generatedCode.trim.nonEmpty,
+      "Generated code is empty. Click Generate in the property panel first."
+    )
+    val resolvedCode = withDenseUnionRows(substituteFileUri(generatedCode))
+    val physicalOp = PhysicalOp
+      .sourcePhysicalOp(
+        workflowId,
+        executionId,
+        operatorIdentifier,
+        OpExecWithCode(resolvedCode, "python")
+      )
+      .withInputPorts(operatorInfo.inputPorts)
+      .withOutputPorts(operatorInfo.outputPorts)
+      .withIsOneToManyOp(true)
+      .withPropagateSchema(
+        SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
+      )
+      .withLocationPreference(Option.empty)
+
+    if (workers > 1) {
+      physicalOp
+        .withParallelizable(true)
+        .withSuggestedWorkerNum(workers)
+    } else {
+      physicalOp.withParallelizable(false)
+    }
+  }
+
+  override def operatorInfo: OperatorInfo =
+    OperatorInfo(
+      userFriendlyName = "LLM File Source",
+      operatorDescription =
+        "Reads a sample of a file (or folder of similar files) and uses an LLM to generate a Python parser tailored to it. Great for irregular shapes — PDFs with embedded tables, vendor-specific reports, oddly-structured logs.",
+      operatorGroupName = OperatorGroupConstants.INPUT_GROUP,
+      inputPorts = List.empty,
+      outputPorts = List(OutputPort()),
+      supportReconfiguration = true
+    )
+
+  override def sourceSchema(): Schema = {
+    if (unionColumns != null && !unionColumns.isEmpty) {
+      Schema().add(unionColumns.asScala.toList)
+    } else {
+      // Generate not yet run — return a placeholder schema with just the __table__
+      // discriminator so downstream compile checks don't blow up before the user clicks
+      // Generate. AttributeType.STRING serializes as "string" via @JsonValue, matching
+      // the lowercase casing the property panel's enum dropdown expects.
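+      //
+      // Hedged sketch of the populated branch above (table and column names are
+      // illustrative): after Generate detects revenue(date, amount) and headcount(date, n),
+      // unionColumns holds [__table__, date, amount, n] and sourceSchema() returns
+      //   Schema().add(List(
+      //     new Attribute("__table__", AttributeType.STRING),
+      //     new Attribute("date", AttributeType.STRING),
+      //     new Attribute("amount", AttributeType.DOUBLE),
+      //     new Attribute("n", AttributeType.INTEGER)
+      //   ))
+      // whereas this fallback branch returns only the discriminator column: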
+ Schema().add(new Attribute("__table__", AttributeType.STRING)) + } + } + + /** + * Replace `__TEXERA_FILE_URI__` in the generated code with a path Python can actually open. + * + * The raw `fileName` property is a Texera URI (e.g., `dataset://...` or `/dataset/...`) + * that Python libraries like pdfplumber can't resolve. We pipe the URI through + * `FileResolver` + `DocumentFactory.asFile()` to materialize it as a local filesystem + * file (downloading/caching the dataset blob as a side effect when needed), and embed + * that absolute path into the generated code. + * + * If resolution fails (e.g., URI scheme not recognized), we fall back to the raw + * fileName so users who manually entered a local path still get a usable substitution. + */ + private def substituteFileUri(code: String): String = { + val raw = fileName.getOrElse("") + val resolved = Try { + val uri = FileResolver.resolve(raw) + FolderInputResolver.materializeToLocalPath(uri).toAbsolutePath.toString + } match { + case Success(path) => + logger.info(s"LLMFileSource: resolved '$raw' -> '$path'") + path + case Failure(err) => + logger.warn(s"LLMFileSource: could not resolve '$raw' (${err.getClass.getSimpleName}: ${err.getMessage}); falling back to raw fileName. The generated Python may fail to open the file.") + raw + } + code.replace("__TEXERA_FILE_URI__", resolved) + } + + /** + * LLM-generated multi-table parsers naturally emit sparse dicts: a revenue row only contains + * revenue fields, and a headcount row only contains headcount fields. Texera validates every + * emitted tuple against the single union output schema, so sparse rows must be padded with nulls + * before they reach the Python worker boundary. + */ + private[llm] def withDenseUnionRows(code: String): String = { + val unionColumnNames = sourceSchema().getAttributeNames + if (unionColumnNames.isEmpty) { + return code + } + + val columnsLiteral = objectMapper.writeValueAsString(unionColumnNames.asJava) + s"""$code + | + |# Added by LLMFileSourceOpDesc so sparse per-table rows satisfy Texera's union schema. + |_texera_llm_source_columns = $columnsLiteral + |_texera_llm_source_original_produce = GenerateOperator.produce + | + |def _texera_llm_source_dense_produce(self): + | for row in _texera_llm_source_original_produce(self): + | if isinstance(row, dict): + | yield {column: row.get(column) for column in _texera_llm_source_columns} + | else: + | yield row + | + |GenerateOperator.produce = _texera_llm_source_dense_produce + |""".stripMargin + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/llm/LLMTableSpec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/llm/LLMTableSpec.scala new file mode 100644 index 00000000000..f955043a0a6 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/llm/LLMTableSpec.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.llm + +import com.fasterxml.jackson.annotation.{JsonCreator, JsonProperty} +import org.apache.texera.amber.core.tuple.Attribute + +/** + * A logical table the LLM identified inside a file. Multiple physical occurrences sharing + * schema + meaning are merged into one spec. The `description` field carries the LLM's + * grouping rationale so the user can audit the merge decision. + */ +class LLMTableSpec @JsonCreator() ( + @JsonProperty("name") var name: String, + @JsonProperty("description") var description: String, + @JsonProperty("columns") var columns: java.util.List[Attribute] +) { + def this() = this("", "", new java.util.ArrayList[Attribute]()) +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolver.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolver.scala new file mode 100644 index 00000000000..e54b8faddfb --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolver.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan + +import org.apache.texera.amber.core.storage.{DocumentFactory, FileResolver} +import org.apache.texera.amber.core.storage.util.LakeFSStorageClient + +import java.net.{URI, URLDecoder, URLEncoder} +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path, Paths, StandardCopyOption} +import scala.jdk.CollectionConverters._ +import scala.util.Using + +case class ResolvedInputFile(uri: URI, displayName: String) +case class ResolvedFolderInput(files: List[ResolvedInputFile], isFolder: Boolean) + +object FolderInputResolver { + + def resolve(uri: URI): ResolvedFolderInput = + Option(uri.getScheme).map(_.toLowerCase) match { + case Some("file") => resolveLocalInput(uri) + case Some(FileResolver.DATASET_FILE_URI_SCHEME) => resolveDatasetInput(uri) + case _ => + ResolvedFolderInput(List(ResolvedInputFile(uri, uri.toASCIIString)), isFolder = false) + } + + /** + * Return a real local path that Python libraries can open. 
Local file-system folders are already + * usable as-is; dataset-backed folders need to be materialized into a temporary local directory + * because they only exist as a set of object-store files behind a virtual Texera path. + */ + def materializeToLocalPath(uri: URI): Path = { + if (Option(uri.getScheme).contains("file") && Files.isDirectory(Paths.get(uri))) { + return Paths.get(uri) + } + + val resolved = resolve(uri) + if (!resolved.isFolder) { + DocumentFactory.openReadonlyDocument(uri).asFile().toPath + } else { + val root = Files.createTempDirectory("texera-folder-input-") + resolved.files.foreach { file => + val target = root.resolve(file.displayName) + Option(target.getParent).foreach(parent => Files.createDirectories(parent)) + Using.resource(DocumentFactory.openReadonlyDocument(file.uri).asInputStream()) { in => + Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING) + } + } + root + } + } + + private def resolveLocalInput(uri: URI): ResolvedFolderInput = { + val path = Paths.get(uri) + if (Files.isDirectory(path)) { + val files = Using.resource(Files.walk(path)) { stream => + stream + .iterator() + .asScala + .filter(Files.isRegularFile(_)) + .filterNot(isHiddenPath) + .map(file => ResolvedInputFile(file.toUri, path.relativize(file).toString)) + .toList + .sortBy(_.displayName) + } + ResolvedFolderInput(files, isFolder = true) + } else { + ResolvedFolderInput(List(ResolvedInputFile(uri, uri.toASCIIString)), isFolder = false) + } + } + + private def resolveDatasetInput(uri: URI): ResolvedFolderInput = { + val segments = Paths + .get(uri.getPath) + .iterator() + .asScala + .map(_.toString) + .toList + + if (segments.length < 3) { + throw new IllegalArgumentException(s"Dataset URI is missing a relative path: $uri") + } + + val repositoryName = segments.head + val versionHash = URLDecoder.decode(segments(1), StandardCharsets.UTF_8) + val relativePath = segments + .drop(2) + .map(part => URLDecoder.decode(part, StandardCharsets.UTF_8)) + .mkString("/") + + val objects = LakeFSStorageClient.retrieveObjectsOfVersion(repositoryName, versionHash) + val exactFile = objects.find(_.getPath == relativePath) + exactFile match { + case Some(file) => + ResolvedFolderInput( + List( + ResolvedInputFile( + buildDatasetFileUri(repositoryName, versionHash, file.getPath), + uri.toASCIIString + ) + ), + isFolder = false + ) + case None => + val prefix = if (relativePath.endsWith("/")) relativePath else s"$relativePath/" + val files = objects + .map(_.getPath) + .filter(_.startsWith(prefix)) + .filterNot(isHiddenDatasetPath) + .sorted + .map { path => + ResolvedInputFile( + buildDatasetFileUri(repositoryName, versionHash, path), + path.stripPrefix(prefix) + ) + } + ResolvedFolderInput(files, isFolder = true) + } + } + + private def buildDatasetFileUri(repositoryName: String, versionHash: String, relativePath: String): URI = { + val encodedSegments = + List(repositoryName, versionHash) ++ relativePath + .split("/") + .toList + .filter(_.nonEmpty) + .map(segment => URLEncoder.encode(segment, StandardCharsets.UTF_8)) + new URI(FileResolver.DATASET_FILE_URI_SCHEME, "", s"/${encodedSegments.mkString("/")}", null) + } + + private def isHiddenPath(path: Path): Boolean = + Option(path.getFileName).exists(_.toString.startsWith(".")) + + private def isHiddenDatasetPath(path: String): Boolean = + path.split("/").lastOption.exists(_.startsWith(".")) +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala 
b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala index 82997632d14..b3c3d260723 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDesc.scala @@ -20,11 +20,7 @@ package org.apache.texera.amber.operator.source.scan.file import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty} -import com.kjetland.jackson.jsonSchema.annotations.{ - JsonSchemaInject, - JsonSchemaString, - JsonSchemaTitle -} +import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaString, JsonSchemaTitle} import org.apache.texera.amber.core.executor.OpExecWithClassName import org.apache.texera.amber.core.tuple.{AttributeType, Schema} import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} @@ -53,14 +49,7 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { @JsonProperty(defaultValue = "false") @JsonSchemaTitle("Include Filename") - @JsonSchemaInject( - strings = Array( - new JsonSchemaString(path = HideAnnotation.hideTarget, value = "extract"), - new JsonSchemaString(path = HideAnnotation.hideType, value = HideAnnotation.Type.equals), - new JsonSchemaString(path = HideAnnotation.hideExpectedValue, value = "false") - ) - ) - val outputFileName: Boolean = false + var outputFileName: Boolean = false fileTypeName = Option("") @@ -92,4 +81,7 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { } schema.add(attributeName, attributeType.getType) } + + override def operatorInfo = + super.operatorInfo.copy(operatorDescription = "Scan data from a file or a folder of files") } diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala index d47cf3681c2..3b71a126437 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpExec.scala @@ -21,9 +21,11 @@ package org.apache.texera.amber.operator.source.scan.file import org.apache.texera.amber.core.executor.SourceOperatorExecutor import org.apache.texera.amber.core.tuple.TupleLike +import org.apache.texera.amber.operator.source.scan.FolderInputResolver import org.apache.texera.amber.util.JSONUtils.objectMapper import java.io.IOException +import java.net.URI class FileScanSourceOpExec private[scan] ( descString: String @@ -33,14 +35,21 @@ class FileScanSourceOpExec private[scan] ( @throws[IOException] override def produceTuple(): Iterator[TupleLike] = { - FileScanUtils.createTuplesFromFile( - fileName = desc.fileName.get, - attributeType = desc.attributeType, - fileEncoding = desc.fileEncoding, - extract = desc.extract, - outputFileName = desc.outputFileName, - fileScanOffset = desc.fileScanOffset, - fileScanLimit = desc.fileScanLimit - ) + FolderInputResolver + .resolve(new URI(desc.fileName.get)) + .files + .iterator + .flatMap(file => + FileScanUtils.createTuplesFromFile( + fileName = file.uri.toASCIIString, + displayFileName = file.displayName, + attributeType = desc.attributeType, + fileEncoding = desc.fileEncoding, + extract = 
desc.extract, + outputFileName = desc.outputFileName, + fileScanOffset = desc.fileScanOffset, + fileScanLimit = desc.fileScanLimit + ) + ) } } diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala index a7f81b4869c..e022d96e435 100644 --- a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/file/FileScanUtils.scala @@ -110,7 +110,8 @@ private[file] object FileScanUtils { TupleLike(fields.toSeq: _*) } } else { - fileEntries.flatMap(entry => + fileEntries.zipAll(filenameIt, null, null).flatMap { + case (entry, entryFileName) => new BufferedReader(new InputStreamReader(entry, fileEncoding.getCharset)) .lines() .iterator() @@ -119,13 +120,14 @@ private[file] object FileScanUtils { fileScanOffset.getOrElse(0), fileScanOffset.getOrElse(0) + fileScanLimit.getOrElse(Int.MaxValue) ) - .map(line => - TupleLike(attributeType match { + .map { line => + val parsed = attributeType match { case FileAttributeType.SINGLE_STRING => line case _ => parseField(line, attributeType.getType) - }) - ) - ) + } + if (outputFileName) TupleLike(entryFileName, parsed) else TupleLike(parsed) + } + } } new AutoClosingIterator(rawIterator, () => closeables.foreach(_.close())) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSniffer.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSniffer.scala new file mode 100644 index 00000000000..fe73bf36f17 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSniffer.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart + +import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings} +import org.apache.texera.amber.core.tuple.{AttributeType, AttributeTypeUtils} + +import java.io.StringReader + +/** A guess at how a CSV-family file should be read. */ +case class CSVDialect(delimiter: Char, hasHeader: Boolean) + +/** + * Heuristic CSV dialect detector. Given a text sample (first ~64 KB of the file), + * it picks the delimiter that produces the most consistent column count across rows, + * then decides whether the first row is a header. + * + * Not perfect — quoted multi-line values can confuse it on very short samples — but + * good enough for the common cases the Smart File Source wants to cover. 
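+ *
+ * A hedged usage sketch (sample text illustrative):
+ * {{{
+ * CSVDialectSniffer.sniff("id;name\n1;ada\n2;grace\n", preferred = Some(','))
+ * // == CSVDialect(';', hasHeader = true): ';' splits every row into the same two
+ * // columns, the comma hint scores ~0 and is dropped, and row 1 reads as a header.
+ * }}}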
+ */ +object CSVDialectSniffer { + + private val Candidates: Seq[Char] = Seq(',', '\t', ';', '|') + + /** + * @param sampleText decoded text sample + * @param preferred an extension-based hint (`,` if `.csv`, `\t` if `.tsv`). When the + * data is consistent with the preferred delimiter, we keep it even + * if another delimiter would score marginally higher. + */ + def sniff(sampleText: String, preferred: Option[Char] = None): CSVDialect = { + val scored = Candidates.map(d => d -> scoreDelimiter(sampleText, d)).toMap + + val delimiter = preferred match { + case Some(p) if scored.getOrElse(p, 0.0) >= 0.5 => p + case _ => + scored + .filter { case (_, score) => score > 0.0 } + .toSeq + .sortBy { case (_, score) => -score } + .headOption + .map(_._1) + .getOrElse(',') // fall back to comma; downstream parsing will surface a real error + } + + val hasHeader = detectHeader(sampleText, delimiter) + CSVDialect(delimiter, hasHeader) + } + + /** + * A delimiter is "consistent" when the per-row column count is stable across rows. + * Score is `(rows_with_modal_count - 1) / total_rows`, in [0, 1]. + */ + private def scoreDelimiter(sample: String, delimiter: Char): Double = { + val rows = parseRows(sample, delimiter, headerExtraction = false, maxRows = 30) + if (rows.size < 2) return 0.0 + val counts = rows.map(_.length).filter(_ > 0) + if (counts.length < 2) return 0.0 + val modalCount = counts.groupBy(identity).view.mapValues(_.size).maxBy(_._2)._1 + if (modalCount < 2) return 0.0 // single-column "matches" don't tell us anything + val agreeing = counts.count(_ == modalCount) + (agreeing - 1).toDouble / rows.size + } + + /** + * Header detection: parse the first row, then parse subsequent rows; if at least one + * column has a row-1 type of STRING but later rows are numeric/boolean/timestamp, the + * first row is probably a header. 
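+   * For example (hedged): with delimiter ',' and sample "name,age\nada,36\ngrace,85",
+   * the later rows type column 2 as INTEGER while row 1 holds the string "age", so the
+   * method returns true; a sample that is STRING in every column throughout returns false.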
+ */ + private def detectHeader(sample: String, delimiter: Char): Boolean = { + val rows = parseRows(sample, delimiter, headerExtraction = false, maxRows = 30) + if (rows.size < 2) return true // safer default — most CSVs have headers + val firstRow = rows.head + val laterRows = rows.tail + val width = firstRow.length + if (width == 0) return true + + val laterTypes: Array[AttributeType] = AttributeTypeUtils.inferSchemaFromRows( + laterRows.iterator.map(r => r.padTo(width, "").take(width).asInstanceOf[Array[Any]]) + ) + + val firstTypes = firstRow.map { v => + if (v == null || v.trim.isEmpty) AttributeType.STRING + else AttributeTypeUtils.inferField(v) + } + + val typedColumns = laterTypes.zipWithIndex.collect { + case (t, i) + if t != AttributeType.STRING && i < firstTypes.length + && firstTypes(i) == AttributeType.STRING => + i + } + typedColumns.nonEmpty + } + + private def parseRows( + sample: String, + delimiter: Char, + headerExtraction: Boolean, + maxRows: Int + ): Array[Array[String]] = { + val format = new CsvFormat() + format.setDelimiter(delimiter) + format.setLineSeparator("\n") + format.setComment('\u0000') + val settings = new CsvParserSettings() + settings.setFormat(format) + settings.setMaxCharsPerColumn(-1) + settings.setHeaderExtractionEnabled(headerExtraction) + settings.setNullValue("") + val parser = new CsvParser(settings) + val reader = new StringReader(sample) + try { + parser.beginParsing(reader) + val buf = scala.collection.mutable.ArrayBuffer.empty[Array[String]] + var count = 0 + var row = parser.parseNext() + while (row != null && count < maxRows) { + buf += row + count += 1 + row = parser.parseNext() + } + parser.stopParsing() + buf.toArray + } finally reader.close() + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetector.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetector.scala new file mode 100644 index 00000000000..143b1a9290f --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetector.scala @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart + +import org.apache.texera.amber.util.ImageFormatUtils + +import java.nio.charset.Charset + +object FormatDetector { + + // Magic bytes used by the formats we support. 
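+  // Magic bytes outrank the filename: a PNG uploaded as "data.csv" is still detected
+  // as IMAGE by detect() below, because ImageFormatUtils sees the PNG header before
+  // the extension is ever consulted.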
+  private val ParquetMagic: Array[Byte] = "PAR1".getBytes("US-ASCII")
+  private val XlsxMagic: Array[Byte] = Array(0x50, 0x4b, 0x03, 0x04).map(_.toByte) // PK\x03\x04 ZIP container
+  private val OleMagic: Array[Byte] = // legacy .xls (OLE2 compound document)
+    Array(0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1).map(_.toByte)
+  // The Arrow IPC *file* format begins with "ARROW1\0\0"; the bare streaming format has
+  // no magic and is caught by the extension check instead.
+  private val ArrowMagic: Array[Byte] = "ARROW1".getBytes("US-ASCII")
+
+  /**
+   * Cheap detection from a byte sample plus optional filename hint.
+   * Order: magic bytes (most reliable) → extension → content sniff.
+   */
+  def detect(
+      fileNameHint: Option[String],
+      sample: Array[Byte],
+      charset: Charset
+  ): SmartFileFormat = {
+    if (startsWith(sample, ParquetMagic)) return SmartFileFormat.PARQUET
+    if (startsWith(sample, OleMagic)) return SmartFileFormat.EXCEL
+    if (startsWith(sample, ArrowMagic)) return SmartFileFormat.ARROW
+    if (ImageFormatUtils.detectFormat(sample).nonEmpty) return SmartFileFormat.IMAGE
+
+    val extensionDetected = fileNameHint.flatMap(extensionFormat)
+    if (startsWith(sample, XlsxMagic) && extensionDetected.contains(SmartFileFormat.EXCEL)) {
+      return SmartFileFormat.EXCEL
+    }
+
+    extensionDetected.getOrElse(sniffText(sample, charset))
+  }
+
+  /** Extension-based detection. Returns None if extension is unknown or absent. */
+  def extensionFormat(fileName: String): Option[SmartFileFormat] = {
+    val lower = fileName.toLowerCase
+    val dot = lower.lastIndexOf('.')
+    if (dot < 0) return None
+    lower.substring(dot + 1) match {
+      case "csv"                                   => Some(SmartFileFormat.CSV)
+      case "tsv" | "tab"                           => Some(SmartFileFormat.TSV)
+      case "json"                                  => Some(SmartFileFormat.JSON)
+      case "jsonl" | "ndjson"                      => Some(SmartFileFormat.JSONL)
+      case "arrow"                                 => Some(SmartFileFormat.ARROW)
+      case "parquet" | "pq"                        => Some(SmartFileFormat.PARQUET)
+      case "xlsx" | "xls" | "xlsm"                 => Some(SmartFileFormat.EXCEL)
+      case "png" | "jpg" | "jpeg" | "gif" | "webp" => Some(SmartFileFormat.IMAGE)
+      case "txt" | "log"                           => Some(SmartFileFormat.TEXT)
+      case _                                       => None
+    }
+  }
+
+  /**
+   * Content-based sniffing for text formats when neither magic bytes nor extension
+   * give a definitive answer. Heuristics:
+   *   - first non-blank char `{` → JSON object → ambiguous JSON vs JSONL → look at how many
+   *     `{` start at the beginning of a line
+   *   - first non-blank char `[` → JSON array
+   *   - lines with consistent tabs but few commas → TSV
+   *   - otherwise → CSV (the most common case)
+   */
+  private def sniffText(sample: Array[Byte], charset: Charset): SmartFileFormat = {
+    val text = new String(sample, charset)
+    val trimmed = text.dropWhile(_.isWhitespace)
+    if (trimmed.isEmpty) return SmartFileFormat.TEXT
+
+    trimmed.head match {
+      case '[' => return SmartFileFormat.JSON
+      case '{' =>
+        // Either a single JSON object, JSON array of objects pretty-printed, or JSONL.
+        // JSONL: multiple lines each starting with `{`.
+        val objectLineStarts = text.linesIterator
+          .filter(_.nonEmpty)
+          .count(line => line.headOption.contains('{'))
+        return if (objectLineStarts >= 2) SmartFileFormat.JSONL else SmartFileFormat.JSON
+      case _ =>
+    }
+
+    // Delimiter heuristic — only the first ~30 lines.
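+    // Hedged illustration: "a\tb\nc\td" gives tabHits = 2 >= commaHits = 0 → TSV;
+    // "a,b\nc,d" gives commaHits = 2 → CSV; "plain prose" matches neither → TEXT.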
+ val lines = text.linesIterator.take(30).filter(_.nonEmpty).toList + if (lines.isEmpty) return SmartFileFormat.TEXT + val tabHits = lines.count(_.contains('\t')) + val commaHits = lines.count(_.contains(',')) + if (tabHits > 0 && tabHits >= commaHits) SmartFileFormat.TSV + else if (commaHits > 0) SmartFileFormat.CSV + else SmartFileFormat.TEXT + } + + private def startsWith(sample: Array[Byte], prefix: Array[Byte]): Boolean = { + if (sample.length < prefix.length) return false + var i = 0 + while (i < prefix.length) { + if (sample(i) != prefix(i)) return false + i += 1 + } + true + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/ParquetUtils.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/ParquetUtils.scala new file mode 100644 index 00000000000..3954c8cf55b --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/ParquetUtils.scala @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.example.data.Group +import org.apache.parquet.example.data.simple.convert.GroupRecordConverter +import org.apache.parquet.hadoop.ParquetFileReader +import org.apache.parquet.hadoop.util.HadoopInputFile +import org.apache.parquet.io.ColumnIOFactory +import org.apache.parquet.schema.LogicalTypeAnnotation +import org.apache.parquet.schema.LogicalTypeAnnotation.{ + DateLogicalTypeAnnotation, + StringLogicalTypeAnnotation, + TimestampLogicalTypeAnnotation +} +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName +import org.apache.parquet.schema.{MessageType, PrimitiveType, Type} +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema} + +import java.io.File + +object ParquetUtils { + + /** Map a Parquet `MessageType` to a Texera Schema. Skips non-primitive (nested) fields. 
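+   * For example (hedged): a message type of
+   *   required int64 id; optional binary name (STRING); repeated group tags { ... }
+   * maps to Schema(id: LONG, name: STRING); the nested `tags` group is skipped.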
*/ + def toTexeraSchema(messageType: MessageType): Schema = { + val attrs = scala.collection.mutable.ListBuffer.empty[Attribute] + val fieldCount = messageType.getFieldCount + var i = 0 + while (i < fieldCount) { + val field: Type = messageType.getType(i) + if (field.isPrimitive) { + attrs += new Attribute(field.getName, toAttributeType(field.asPrimitiveType())) + } + i += 1 + } + Schema(attrs.toList) + } + + def toAttributeType(primitive: PrimitiveType): AttributeType = { + val logical = primitive.getLogicalTypeAnnotation + primitive.getPrimitiveTypeName match { + case PrimitiveTypeName.BOOLEAN => AttributeType.BOOLEAN + case PrimitiveTypeName.INT32 => + logical match { + case _: DateLogicalTypeAnnotation => AttributeType.TIMESTAMP + case _ => AttributeType.INTEGER + } + case PrimitiveTypeName.INT64 => + logical match { + case _: TimestampLogicalTypeAnnotation => AttributeType.TIMESTAMP + case _ => AttributeType.LONG + } + case PrimitiveTypeName.FLOAT | PrimitiveTypeName.DOUBLE => AttributeType.DOUBLE + case PrimitiveTypeName.INT96 => AttributeType.TIMESTAMP + case PrimitiveTypeName.BINARY => + logical match { + case _: StringLogicalTypeAnnotation => AttributeType.STRING + case _ if isStringLike(logical) => AttributeType.STRING + case _ => AttributeType.BINARY + } + case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => AttributeType.BINARY + } + } + + private def isStringLike(logical: LogicalTypeAnnotation): Boolean = { + if (logical == null) return false + // EnumLogicalTypeAnnotation / JsonLogicalTypeAnnotation also serialize as text. + val name = logical.toString.toLowerCase + name.contains("string") || name.contains("enum") || name.contains("json") + } + + /** Opens a `ParquetFileReader` on a local file. */ + def openReader(file: File): ParquetFileReader = { + val conf = newConfiguration() + val inputFile = HadoopInputFile.fromPath(new Path(file.toURI), conf) + ParquetFileReader.open(inputFile) + } + + /** + * Read the file into a lazy iterator of `Group` records. + * Caller is responsible for closing the returned reader via [[ParquetReadHandle.close]]. + */ + def openRecords(file: File): ParquetReadHandle = { + val conf = newConfiguration() + val inputFile = HadoopInputFile.fromPath(new Path(file.toURI), conf) + val reader = ParquetFileReader.open(inputFile) + val schema = reader.getFooter.getFileMetaData.getSchema + val converter = new GroupRecordConverter(schema) + val columnIO = new ColumnIOFactory().getColumnIO(schema) + val iterator = new Iterator[Group] { + private var currentPages = reader.readNextRowGroup() + private var recordReader = + if (currentPages != null) columnIO.getRecordReader(currentPages, converter) else null + private var remaining: Long = if (currentPages != null) currentPages.getRowCount else 0L + + override def hasNext: Boolean = { + if (remaining > 0) return true + // Advance to next row group. + var nextPages = reader.readNextRowGroup() + while (nextPages != null && nextPages.getRowCount == 0) nextPages = reader.readNextRowGroup() + if (nextPages == null) false + else { + currentPages = nextPages + recordReader = columnIO.getRecordReader(nextPages, converter) + remaining = nextPages.getRowCount + true + } + } + + override def next(): Group = { + if (!hasNext) throw new NoSuchElementException + remaining -= 1 + recordReader.read().asInstanceOf[Group] + } + } + ParquetReadHandle(schema, iterator, () => reader.close()) + } + + /** Read a primitive field at position `index` of a Parquet `Group`, honoring schema. 
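+   * For example (hedged): an INT32 column annotated DATE holding 19723 (days since the
+   * epoch) comes back as new java.sql.Timestamp(19723 * 86400000L), midnight UTC on
+   * 2024-01-01, while a plain INT32 is returned boxed as java.lang.Integer.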
*/ + def readField(group: Group, index: Int, schema: MessageType): Any = { + if (group.getFieldRepetitionCount(index) == 0) return null + val field = schema.getType(index) + if (!field.isPrimitive) return null + val primitive = field.asPrimitiveType() + primitive.getPrimitiveTypeName match { + case PrimitiveTypeName.BOOLEAN => group.getBoolean(index, 0) + case PrimitiveTypeName.INT32 => + primitive.getLogicalTypeAnnotation match { + case _: DateLogicalTypeAnnotation => + // Date stored as days since epoch. + val days = group.getInteger(index, 0).toLong + new java.sql.Timestamp(days * 86400000L) + case _ => Int.box(group.getInteger(index, 0)) + } + case PrimitiveTypeName.INT64 => + primitive.getLogicalTypeAnnotation match { + case ts: TimestampLogicalTypeAnnotation => + val raw = group.getLong(index, 0) + val millis = ts.getUnit match { + case LogicalTypeAnnotation.TimeUnit.MILLIS => raw + case LogicalTypeAnnotation.TimeUnit.MICROS => raw / 1000L + case LogicalTypeAnnotation.TimeUnit.NANOS => raw / 1000000L + } + new java.sql.Timestamp(millis) + case _ => Long.box(group.getLong(index, 0)) + } + case PrimitiveTypeName.FLOAT => Double.box(group.getFloat(index, 0).toDouble) + case PrimitiveTypeName.DOUBLE => Double.box(group.getDouble(index, 0)) + case PrimitiveTypeName.INT96 => + // INT96 → 96-bit timestamp; convert via Parquet's NanoTime helper. + val binary = group.getInt96(index, 0) + int96ToTimestamp(binary.getBytes) + case PrimitiveTypeName.BINARY => + val binary = group.getBinary(index, 0) + primitive.getLogicalTypeAnnotation match { + case _: StringLogicalTypeAnnotation => binary.toStringUsingUTF8 + case logical if isStringLike(logical) => binary.toStringUsingUTF8 + case _ => binary.getBytes + } + case PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY => group.getBinary(index, 0).getBytes + } + } + + private def int96ToTimestamp(bytes: Array[Byte]): java.sql.Timestamp = { + // INT96: 8 bytes little-endian nanoseconds of day, then 4 bytes little-endian Julian day. + var nanos: Long = 0L + for (i <- 0 until 8) nanos |= (bytes(i).toLong & 0xff) << (8 * i) + var julian: Int = 0 + for (i <- 0 until 4) julian |= (bytes(8 + i).toInt & 0xff) << (8 * i) + val daysFromEpoch = julian - 2440588 // Julian day 2440588 = 1970-01-01 + val millis = daysFromEpoch.toLong * 86400000L + nanos / 1000000L + new java.sql.Timestamp(millis) + } + + private def newConfiguration(): Configuration = { + val conf = new Configuration(false) + // Reduce noisy default classpath probing — we only ever look at local files. + conf.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem") + conf + } + + case class ParquetReadHandle( + schema: MessageType, + records: Iterator[Group], + closer: () => Unit + ) { + def close(): Unit = closer() + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileFormat.java b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileFormat.java new file mode 100644 index 00000000000..190b367daec --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileFormat.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonValue; + +public enum SmartFileFormat { + AUTO("Auto-detect"), + CSV("CSV"), + TSV("TSV"), + JSON("JSON"), + JSONL("JSONL"), + ARROW("Arrow"), + PARQUET("Parquet"), + EXCEL("Excel"), + IMAGE("Image"), + TEXT("Plain text"); + + private final String label; + + SmartFileFormat(String label) { + this.label = label; + } + + @JsonValue + public String getLabel() { + return label; + } + + /** Accept either the enum name (e.g. "CSV") or the label (e.g. "Plain text"). */ + @JsonCreator + public static SmartFileFormat fromString(String value) { + if (value == null) { + return null; + } + for (SmartFileFormat format : values()) { + if (format.name().equalsIgnoreCase(value) || format.label.equalsIgnoreCase(value)) { + return format; + } + } + throw new IllegalArgumentException("Unknown SmartFileFormat: " + value); + } + + @Override + public String toString() { + return label; + } +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileInferencer.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileInferencer.scala new file mode 100644 index 00000000000..0a657639d65 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileInferencer.scala @@ -0,0 +1,476 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.source.scan.smart + +import com.fasterxml.jackson.databind.JsonNode +import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings} +import org.apache.arrow.memory.RootAllocator +import org.apache.arrow.vector.ipc.ArrowFileReader +import org.apache.poi.ss.usermodel.{Cell, CellType, DateUtil, Sheet, WorkbookFactory} +import org.apache.texera.amber.core.storage.DocumentFactory +import org.apache.texera.amber.core.tuple.AttributeTypeUtils.inferSchemaFromRows +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema} +import org.apache.texera.amber.operator.source.scan.FolderInputResolver +import org.apache.texera.amber.util.ArrowUtils +import org.apache.texera.amber.util.JSONUtils.{JSONToMap, objectMapper} + +import java.io.{BufferedReader, InputStream, InputStreamReader} +import java.net.URI +import java.nio.charset.Charset +import java.nio.file.{Files, StandardOpenOption} +import scala.collection.mutable.ArrayBuffer +import scala.jdk.CollectionConverters._ +import scala.util.Using + +/** + * Overrides supplied by the user. Each `Some(...)` value short-circuits the corresponding + * detection step; `None` means "let the inferencer decide". + */ +case class InferenceOverrides( + format: Option[SmartFileFormat] = None, + delimiter: Option[Char] = None, + hasHeader: Option[Boolean] = None, + sheetName: Option[String] = None, + flatten: Option[Boolean] = None +) + +/** + * The full inference result. Carries the inferred schema along with the configuration + * the runtime executor needs to read the file the same way the inferencer did. + */ +case class InferenceResult( + format: SmartFileFormat, + schema: Schema, + csvDelimiter: Option[String] = None, + csvHasHeader: Option[Boolean] = None, + sheetName: Option[String] = None, + availableSheetNames: List[String] = Nil, + flatten: Option[Boolean] = None, + isFolder: Boolean = false, + fileCount: Int = 1 +) + +/** + * The single source of truth for "look at this file and decide how to read it." + * Both the operator descriptor (compile-time schema declaration) and the live + * preview REST endpoint route through this object so their behavior is identical. + */ +object SmartFileInferencer { + + /** Bytes to read when sniffing format / delimiter / header. */ + private val SampleByteCount = 64 * 1024 + + /** Rows to read when inferring types. Matches `ScanSourceOpDesc.INFER_READ_LIMIT`. */ + private val InferRowLimit = 100 + + /** Cheap detection that only reads the header bytes. */ + def detect(uri: URI, encoding: Charset): SmartFileFormat = { + val sample = readSampleBytes(uri) + FormatDetector.detect(Some(uri.getPath), sample, encoding) + } + + /** Full inference: format detection + schema. 
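The override contract above is the crux: every Some(...) pins a decision and every None defers to detection. A minimal sketch, assuming a CSV whose header row the user wants to assert while leaving the delimiter to the sniffer:

```scala
import org.apache.texera.amber.operator.source.scan.smart.{InferenceOverrides, SmartFileFormat}

// Pin the format and the header decision; let the sniffer choose the delimiter
// and leave the Excel/JSON knobs untouched.
val overrides = InferenceOverrides(
  format = Some(SmartFileFormat.CSV),
  delimiter = None,
  hasHeader = Some(true),
  sheetName = None,
  flatten = None
)
```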
*/ + def infer(uri: URI, encoding: Charset, overrides: InferenceOverrides): InferenceResult = { + val input = FolderInputResolver.resolve(uri) + if (input.isFolder) { + inferFolder(uri, input.files.map(_.uri), encoding, overrides) + } else { + inferSingle(uri, encoding, overrides) + } + } + + private def inferFolder( + folderUri: URI, + files: List[URI], + encoding: Charset, + overrides: InferenceOverrides + ): InferenceResult = { + if (files.isEmpty) { + throw new IllegalArgumentException(s"Folder $folderUri does not contain any readable files") + } + + val inferred = files.map(file => inferSingle(file, encoding, overrides)) + val first = inferred.head + val mismatchedFormat = inferred.find(_.format != first.format) + if (mismatchedFormat.nonEmpty) { + throw new IllegalArgumentException( + s"Folder $folderUri must contain files with the same detected format" + ) + } + + val expectedSchema = schemaSignature(first.schema) + val mismatchedSchema = inferred.find(result => schemaSignature(result.schema) != expectedSchema) + if (mismatchedSchema.nonEmpty) { + throw new IllegalArgumentException( + s"Folder $folderUri must contain files with the same inferred schema" + ) + } + + first.copy(isFolder = true, fileCount = files.size) + } + + private def inferSingle(uri: URI, encoding: Charset, overrides: InferenceOverrides): InferenceResult = { + val format = overrides.format + .filter(_ != SmartFileFormat.AUTO) + .getOrElse { + val sample = readSampleBytes(uri) + FormatDetector.detect(Some(uri.getPath), sample, encoding) + } + + format match { + case SmartFileFormat.CSV | SmartFileFormat.TSV => inferCsv(uri, encoding, format, overrides) + case SmartFileFormat.JSONL => inferJsonl(uri, encoding, overrides) + case SmartFileFormat.JSON => inferJson(uri, encoding, overrides) + case SmartFileFormat.ARROW => inferArrow(uri) + case SmartFileFormat.PARQUET => inferParquet(uri) + case SmartFileFormat.EXCEL => inferExcel(uri, overrides) + case SmartFileFormat.IMAGE => inferImage() + case SmartFileFormat.TEXT => inferText() + case SmartFileFormat.AUTO => + throw new IllegalStateException("AUTO should have been resolved before dispatch") + } + } + + private def schemaSignature(schema: Schema): List[(String, AttributeType)] = + schema.getAttributes.map(attribute => attribute.getName -> attribute.getType) + + // --------------------------------------------------------------------------- + // CSV / TSV + // --------------------------------------------------------------------------- + + private def inferCsv( + uri: URI, + encoding: Charset, + format: SmartFileFormat, + overrides: InferenceOverrides + ): InferenceResult = { + val sampleText = readSampleText(uri, encoding) + val preferred = format match { + case SmartFileFormat.TSV => Some('\t') + case _ => Some(',') + } + val sniffed = CSVDialectSniffer.sniff(sampleText, preferred) + val delimiter = overrides.delimiter.getOrElse(sniffed.delimiter) + val hasHeader = overrides.hasHeader.getOrElse(sniffed.hasHeader) + val schema = inferCsvSchema(uri, encoding, delimiter, hasHeader) + InferenceResult( + format = format, + schema = schema, + csvDelimiter = Some(delimiter.toString), + csvHasHeader = Some(hasHeader) + ) + } + + private def inferCsvSchema( + uri: URI, + encoding: Charset, + delimiter: Char, + hasHeader: Boolean + ): Schema = { + val csvFormat = new CsvFormat() + csvFormat.setDelimiter(delimiter) + csvFormat.setLineSeparator("\n") + csvFormat.setComment('\u0000') + val settings = new CsvParserSettings() + settings.setMaxCharsPerColumn(-1) + 
settings.setFormat(csvFormat) + settings.setHeaderExtractionEnabled(hasHeader) + settings.setNullValue("") + + val parser = new CsvParser(settings) + val stream = openStream(uri) + val reader = new InputStreamReader(stream, encoding) + try { + parser.beginParsing(reader) + val rows = ArrayBuffer.empty[Array[String]] + var row = parser.parseNext() + var read = 0 + while (row != null && read < InferRowLimit) { + rows += row + read += 1 + row = parser.parseNext() + } + parser.stopParsing() + val attributeTypes = inferSchemaFromRows(rows.iterator.map(_.asInstanceOf[Array[Any]])) + val header = + if (hasHeader) + Option(parser.getContext.headers()) + .getOrElse((1 to attributeTypes.length).map(i => s"column-$i").toArray) + else + (1 to attributeTypes.length).map(i => s"column-$i").toArray + val pairs = header.indices.map { i => + val attributeType = + if (i < attributeTypes.length) attributeTypes(i) else AttributeType.STRING + (header(i), attributeType) + } + pairs.foldLeft(Schema()) { case (s, (name, t)) => s.add(name, t) } + } finally reader.close() + } + + // --------------------------------------------------------------------------- + // JSONL + // --------------------------------------------------------------------------- + + private def inferJsonl( + uri: URI, + encoding: Charset, + overrides: InferenceOverrides + ): InferenceResult = { + val flatten = overrides.flatten.getOrElse(false) + val stream = openStream(uri) + val reader = new BufferedReader(new InputStreamReader(stream, encoding)) + try { + val fieldNames = scala.collection.mutable.LinkedHashSet[String]() + val rows = ArrayBuffer.empty[Map[String, String]] + val lines = reader.lines().iterator().asScala.take(InferRowLimit) + lines.foreach { line => + if (line != null && line.trim.nonEmpty) { + val root: JsonNode = objectMapper.readTree(line) + if (root.isObject) { + val fields = JSONToMap(root, flatten = flatten) + fields.keys.foreach(fieldNames += _) + rows += fields + } + } + } + val orderedNames = fieldNames.toList + val schema = buildJsonSchema(orderedNames, rows.toSeq) + InferenceResult( + format = SmartFileFormat.JSONL, + schema = schema, + flatten = Some(flatten) + ) + } finally reader.close() + } + + // --------------------------------------------------------------------------- + // JSON (single object or array of objects) + // --------------------------------------------------------------------------- + + private def inferJson( + uri: URI, + encoding: Charset, + overrides: InferenceOverrides + ): InferenceResult = { + val flatten = overrides.flatten.getOrElse(false) + val stream = openStream(uri) + val reader = new InputStreamReader(stream, encoding) + try { + val root = objectMapper.readTree(reader) + val rows = ArrayBuffer.empty[Map[String, String]] + val fieldNames = scala.collection.mutable.LinkedHashSet[String]() + + val objectNodes: Iterator[JsonNode] = + if (root.isArray) root.elements().asScala + else if (root.isObject) Iterator.single(root) + else Iterator.empty + + var count = 0 + while (objectNodes.hasNext && count < InferRowLimit) { + val node = objectNodes.next() + if (node.isObject) { + val fields = JSONToMap(node, flatten = flatten) + fields.keys.foreach(fieldNames += _) + rows += fields + count += 1 + } + } + + val schema = buildJsonSchema(fieldNames.toList, rows.toSeq) + InferenceResult( + format = SmartFileFormat.JSON, + schema = schema, + flatten = Some(flatten) + ) + } finally reader.close() + } + + private def buildJsonSchema(orderedNames: List[String], rows: Seq[Map[String, String]]): Schema = { + 
if (orderedNames.isEmpty) return Schema() + val attributeTypes = inferSchemaFromRows(rows.iterator.map { row => + orderedNames.map(name => row.getOrElse(name, null)).toArray[Any] + }) + val attrs = orderedNames.indices.map { i => + val t = + if (i < attributeTypes.length) attributeTypes(i) else AttributeType.STRING + new Attribute(orderedNames(i), t) + } + Schema(attrs.toList) + } + + // --------------------------------------------------------------------------- + // Arrow + // --------------------------------------------------------------------------- + + private def inferArrow(uri: URI): InferenceResult = { + val file = DocumentFactory.openReadonlyDocument(uri).asFile() + val allocator = new RootAllocator() + val schema = Using + .Manager { use => + val channel = use(Files.newByteChannel(file.toPath, StandardOpenOption.READ)) + val reader = use(new ArrowFileReader(channel, allocator)) + ArrowUtils.toTexeraSchema(reader.getVectorSchemaRoot.getSchema) + } + .getOrElse(throw new RuntimeException(s"Failed to read Arrow schema from $uri")) + InferenceResult(format = SmartFileFormat.ARROW, schema = schema) + } + + // --------------------------------------------------------------------------- + // Parquet + // --------------------------------------------------------------------------- + + private def inferParquet(uri: URI): InferenceResult = { + val file = DocumentFactory.openReadonlyDocument(uri).asFile() + val reader = ParquetUtils.openReader(file) + try { + val parquetSchema = reader.getFooter.getFileMetaData.getSchema + InferenceResult(format = SmartFileFormat.PARQUET, schema = ParquetUtils.toTexeraSchema(parquetSchema)) + } finally reader.close() + } + + // --------------------------------------------------------------------------- + // Excel + // --------------------------------------------------------------------------- + + private def inferExcel(uri: URI, overrides: InferenceOverrides): InferenceResult = { + val file = DocumentFactory.openReadonlyDocument(uri).asFile() + val workbook = WorkbookFactory.create(file, null, true) // read-only + try { + val sheetNames = (0 until workbook.getNumberOfSheets).map(workbook.getSheetName).toList + val targetSheet: Sheet = overrides.sheetName + .flatMap(name => Option(workbook.getSheet(name))) + .getOrElse(workbook.getSheetAt(0)) + val hasHeader = overrides.hasHeader.getOrElse(true) + + val rowIter = targetSheet.iterator().asScala + val sampled = rowIter.take(InferRowLimit + 1).toList + if (sampled.isEmpty) { + return InferenceResult( + format = SmartFileFormat.EXCEL, + schema = Schema(), + sheetName = Some(targetSheet.getSheetName), + availableSheetNames = sheetNames, + csvHasHeader = Some(hasHeader) + ) + } + + val columnCount = sampled.map(_.getLastCellNum.toInt).max + val rowsAsStrings: List[Array[String]] = sampled.map { row => + (0 until columnCount).map(c => cellToString(row.getCell(c))).toArray + } + + val header: Array[String] = + if (hasHeader && rowsAsStrings.nonEmpty) + rowsAsStrings.head.zipWithIndex.map { + case (s, i) => if (s == null || s.isEmpty) s"column-${i + 1}" else s + } + else (1 to columnCount).map(i => s"column-$i").toArray + + val dataRows = if (hasHeader) rowsAsStrings.drop(1) else rowsAsStrings + val attributeTypes = inferSchemaFromRows(dataRows.iterator.map(_.asInstanceOf[Array[Any]])) + + val schema = header.indices.foldLeft(Schema()) { (s, i) => + val t = if (i < attributeTypes.length) attributeTypes(i) else AttributeType.STRING + s.add(header(i), t) + } + + InferenceResult( + format = SmartFileFormat.EXCEL, + schema 
= schema, + sheetName = Some(targetSheet.getSheetName), + availableSheetNames = sheetNames, + csvHasHeader = Some(hasHeader) + ) + } finally workbook.close() + } + + private def cellToString(cell: Cell): String = { + if (cell == null) return null + cell.getCellType match { + case CellType.STRING => cell.getStringCellValue + case CellType.BOOLEAN => String.valueOf(cell.getBooleanCellValue) + case CellType.NUMERIC => + if (DateUtil.isCellDateFormatted(cell)) + new java.sql.Timestamp(cell.getDateCellValue.getTime).toString + else { + val d = cell.getNumericCellValue + if (d == d.toLong.toDouble) d.toLong.toString else d.toString + } + case CellType.FORMULA => + cellToString(safelyEvaluate(cell)) + case CellType.BLANK | CellType._NONE | CellType.ERROR => null + case _ => null + } + } + + private def safelyEvaluate(cell: Cell): Cell = { + try { + val evaluator = cell.getSheet.getWorkbook.getCreationHelper.createFormulaEvaluator() + evaluator.evaluateInCell(cell) + } catch { + case _: Throwable => cell + } + } + + // --------------------------------------------------------------------------- + // Plain text + // --------------------------------------------------------------------------- + + private def inferText(): InferenceResult = + InferenceResult( + format = SmartFileFormat.TEXT, + schema = Schema(List(new Attribute("line", AttributeType.STRING))) + ) + + private def inferImage(): InferenceResult = + InferenceResult( + format = SmartFileFormat.IMAGE, + schema = Schema() + .add("image", AttributeType.BINARY) + .add("format", AttributeType.STRING) + .add("width", AttributeType.INTEGER) + .add("height", AttributeType.INTEGER) + ) + + // --------------------------------------------------------------------------- + // I/O helpers + // --------------------------------------------------------------------------- + + private def openStream(uri: URI): InputStream = + DocumentFactory.openReadonlyDocument(uri).asInputStream() + + private def readSampleBytes(uri: URI): Array[Byte] = { + val stream = openStream(uri) + try { + val buffer = new Array[Byte](SampleByteCount) + var totalRead = 0 + var lastRead = 0 + while (totalRead < buffer.length && { + lastRead = stream.read(buffer, totalRead, buffer.length - totalRead); lastRead + } > 0) { + totalRead += lastRead + } + if (totalRead == buffer.length) buffer else buffer.take(totalRead) + } finally stream.close() + } + + private def readSampleText(uri: URI, charset: Charset): String = + new String(readSampleBytes(uri), charset) +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDesc.scala new file mode 100644 index 00000000000..e2101bd2808 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDesc.scala @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
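One detail of cellToString above is easy to miss: POI reports every numeric cell as a double, so whole values are rendered without a trailing .0, which lets the downstream type inference still classify the column as integer. A sketch of that rendering rule:

```scala
// Mirrors the NUMERIC branch of cellToString: whole doubles render as longs.
def renderNumeric(d: Double): String =
  if (d == d.toLong.toDouble) d.toLong.toString else d.toString

assert(renderNumeric(42.0) == "42")   // infers as an integer column
assert(renderNumeric(3.14) == "3.14") // stays a double column
```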
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart + +import com.fasterxml.jackson.annotation.{JsonInclude, JsonProperty, JsonPropertyDescription} +import com.fasterxml.jackson.databind.annotation.JsonDeserialize +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import org.apache.texera.amber.core.executor.OpExecWithClassName +import org.apache.texera.amber.core.tuple.Schema +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow.{OutputPort, PhysicalOp, SchemaPropagationFunc} +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import org.apache.texera.amber.operator.source.scan.ScanSourceOpDesc +import org.apache.texera.amber.util.JSONUtils.objectMapper + +import java.io.IOException +import java.net.URI + +class SmartFileSourceOpDesc extends ScanSourceOpDesc { + + @JsonProperty(defaultValue = "AUTO") + @JsonSchemaTitle("Format") + @JsonPropertyDescription("override automatic format detection") + var formatOverride: SmartFileFormat = SmartFileFormat.AUTO + + @JsonProperty + @JsonSchemaTitle("Delimiter") + @JsonPropertyDescription("CSV/TSV delimiter (auto-detected if empty)") + @JsonInclude(JsonInclude.Include.NON_ABSENT) + var customDelimiter: Option[String] = None + + @JsonProperty + @JsonSchemaTitle("Has Header") + @JsonPropertyDescription("first row contains column names (CSV/TSV/Excel)") + @JsonDeserialize(contentAs = classOf[java.lang.Boolean]) + @JsonInclude(JsonInclude.Include.NON_ABSENT) + var hasHeader: Option[Boolean] = None + + @JsonProperty + @JsonSchemaTitle("Excel Sheet Name") + @JsonPropertyDescription("for Excel files; leave empty to use the first sheet") + @JsonInclude(JsonInclude.Include.NON_ABSENT) + var sheetName: Option[String] = None + + @JsonProperty + @JsonSchemaTitle("Flatten Nested JSON") + @JsonPropertyDescription("flatten nested JSON objects and arrays into dot-notation columns") + @JsonDeserialize(contentAs = classOf[java.lang.Boolean]) + @JsonInclude(JsonInclude.Include.NON_ABSENT) + var flatten: Option[Boolean] = None + + @JsonProperty(defaultValue = "false") + @JsonSchemaTitle("Include Source File") + @JsonPropertyDescription("append a source file column when reading folders") + var includeSourceFile: Boolean = false + + @JsonProperty(defaultValue = "source_file") + @JsonSchemaTitle("Source File Column") + @JsonPropertyDescription("column name used when source file output is enabled") + var sourceFileAttribute: String = "source_file" + + fileTypeName = Option("Smart") + + override def operatorInfo: OperatorInfo = + OperatorInfo( + userFriendlyName = "Smart Source", + operatorDescription = + "Auto-detects file format and schema for a file or a folder of similar files. 
Supports CSV, TSV, JSON, JSONL, Arrow, Parquet, Excel, images, and plain text.", + operatorGroupName = OperatorGroupConstants.INPUT_GROUP, + inputPorts = List.empty, + outputPorts = List(OutputPort()) + ) + + @throws[IOException] + override def getPhysicalOp( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ): PhysicalOp = { + PhysicalOp + .sourcePhysicalOp( + workflowId, + executionId, + operatorIdentifier, + OpExecWithClassName( + "org.apache.texera.amber.operator.source.scan.smart.SmartFileSourceOpExec", + objectMapper.writeValueAsString(this) + ) + ) + .withInputPorts(operatorInfo.inputPorts) + .withOutputPorts(operatorInfo.outputPorts) + .withPropagateSchema( + SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema())) + ) + } + + override def sourceSchema(): Schema = { + if (!fileResolved()) return null + withOptionalSourceFile(runInference().schema) + } + + /** Run inference using the descriptor's own fields as overrides. */ + def runInference(): InferenceResult = { + val overrides = InferenceOverrides( + format = Option(formatOverride), + delimiter = customDelimiter.flatMap(_.headOption), + hasHeader = hasHeader, + sheetName = sheetName, + flatten = flatten + ) + SmartFileInferencer.infer( + new URI(fileName.get), + fileEncoding.getCharset, + overrides + ) + } + + def withOptionalSourceFile(schema: Schema): Schema = + if (includeSourceFile) schema.add(sourceFileAttribute, org.apache.texera.amber.core.tuple.AttributeType.STRING) + else schema +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExec.scala new file mode 100644 index 00000000000..b6849bc0cd5 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExec.scala @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
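For orientation, a hedged configuration sketch for the descriptor defined above, following the fileName plus setResolvedFileName pattern the specs later in this diff use; the path and sheet name are illustrative:

```scala
import org.apache.texera.amber.core.storage.FileResolver
import org.apache.texera.amber.operator.source.scan.smart.{SmartFileFormat, SmartFileSourceOpDesc}

val desc = new SmartFileSourceOpDesc()
desc.fileName = Some("/data/report.xlsx") // hypothetical path
desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get))
desc.formatOverride = SmartFileFormat.EXCEL // skip detection
desc.sheetName = Some("Q3")                 // hypothetical sheet
desc.includeSourceFile = true               // appends a source_file column

val result = desc.runInference() // carries format, schema, sheet names, ...
```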
+ */ + +package org.apache.texera.amber.operator.source.scan.smart + +import com.fasterxml.jackson.databind.JsonNode +import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings} +import org.apache.arrow.memory.RootAllocator +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowFileReader +import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory} +import org.apache.texera.amber.core.executor.SourceOperatorExecutor +import org.apache.texera.amber.core.storage.DocumentFactory +import org.apache.texera.amber.core.tuple.{AttributeTypeUtils, Schema, TupleLike} +import org.apache.texera.amber.operator.source.scan.FolderInputResolver +import org.apache.texera.amber.util.{ArrowUtils, ImageFormatUtils, JSONUtils} +import org.apache.texera.amber.util.JSONUtils.{JSONToMap, objectMapper} + +import java.io.{BufferedReader, ByteArrayInputStream, InputStreamReader} +import java.net.URI +import java.nio.file.{Files, StandardOpenOption} +import javax.imageio.ImageIO +import scala.collection.immutable.ArraySeq +import scala.jdk.CollectionConverters._ + +class SmartFileSourceOpExec(descString: String) extends SourceOperatorExecutor { + + private val desc: SmartFileSourceOpDesc = + objectMapper.readValue(descString, classOf[SmartFileSourceOpDesc]) + + private var inference: InferenceResult = _ + private var schema: Schema = _ + private val resources = scala.collection.mutable.ListBuffer.empty[AutoCloseable] + private var tupleSource: Iterator[TupleLike] = Iterator.empty + + private def closeableOf(fn: () => Unit): AutoCloseable = + new AutoCloseable { override def close(): Unit = fn() } + + override def open(): Unit = { + inference = desc.runInference() + schema = desc.withOptionalSourceFile(inference.schema) + tupleSource = openReader() + } + + override def produceTuple(): Iterator[TupleLike] = { + var it = tupleSource.drop(desc.offset.getOrElse(0)) + if (desc.limit.isDefined) it = it.take(desc.limit.get) + it + } + + override def close(): Unit = { + resources.foreach { c => + try c.close() + catch { case _: Throwable => /* swallow on shutdown */ } + } + resources.clear() + } + + // --------------------------------------------------------------------------- + // Per-format readers + // --------------------------------------------------------------------------- + + private def openReader(): Iterator[TupleLike] = { + val input = FolderInputResolver.resolve(new URI(desc.fileName.get)) + input.files.iterator.flatMap { file => + val rows = inference.format match { + case SmartFileFormat.CSV | SmartFileFormat.TSV => csvReader(file.uri) + case SmartFileFormat.JSONL => jsonlReader(file.uri) + case SmartFileFormat.JSON => jsonReader(file.uri) + case SmartFileFormat.ARROW => arrowReader(file.uri) + case SmartFileFormat.PARQUET => parquetReader(file.uri) + case SmartFileFormat.EXCEL => excelReader(file.uri) + case SmartFileFormat.IMAGE => imageReader(file.uri) + case SmartFileFormat.TEXT => textReader(file.uri) + case SmartFileFormat.AUTO => + throw new IllegalStateException("AUTO should have been resolved by inferencer") + } + if (desc.includeSourceFile) rows.map(appendSourceFile(_, file.displayName)) else rows + } + } + + private def appendSourceFile(tuple: TupleLike, displayName: String): TupleLike = + TupleLike(tuple.getFields :+ displayName) + + // CSV / TSV ---------------------------------------------------------------- + + private def csvReader(uri: URI): Iterator[TupleLike] = { + val delimiter = inference.csvDelimiter + .flatMap(_.headOption) + 
.getOrElse(if (inference.format == SmartFileFormat.TSV) '\t' else ',') + val hasHeader = inference.csvHasHeader.getOrElse(true) + val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream() + val reader = new InputStreamReader(stream, desc.fileEncoding.getCharset) + resources += reader + + val format = new CsvFormat() + format.setDelimiter(delimiter) + format.setLineSeparator("\n") + format.setComment('\u0000') + val settings = new CsvParserSettings() + settings.setMaxCharsPerColumn(-1) + settings.setFormat(format) + settings.setHeaderExtractionEnabled(hasHeader) + settings.setNullValue("") + val parser = new CsvParser(settings) + parser.beginParsing(reader) + resources += closeableOf(() => parser.stopParsing()) + + new Iterator[TupleLike] { + private var nextRow: Array[String] = parser.parseNext() + override def hasNext: Boolean = nextRow != null + override def next(): TupleLike = { + val row = nextRow + nextRow = parser.parseNext() + try { + TupleLike( + ArraySeq.unsafeWrapArray( + AttributeTypeUtils.parseFields(row.asInstanceOf[Array[Any]], schema) + ): _* + ) + } catch { + case _: Throwable => null + } + } + }.filter(_ != null) + } + + // JSONL -------------------------------------------------------------------- + + private def jsonlReader(uri: URI): Iterator[TupleLike] = { + val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream() + val br = new BufferedReader(new InputStreamReader(stream, desc.fileEncoding.getCharset)) + resources += br + val flatten = inference.flatten.getOrElse(false) + val names = schema.getAttributeNames + + br.lines().iterator().asScala + .flatMap { line => + if (line == null || line.trim.isEmpty) None + else { + try { + val node = objectMapper.readTree(line) + if (!node.isObject) None + else Some(buildTupleFromJsonObject(node, names, flatten)) + } catch { + case _: Throwable => None + } + } + } + } + + // JSON --------------------------------------------------------------------- + + private def jsonReader(uri: URI): Iterator[TupleLike] = { + val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream() + val reader = new InputStreamReader(stream, desc.fileEncoding.getCharset) + resources += reader + val flatten = inference.flatten.getOrElse(false) + val names = schema.getAttributeNames + + val root = objectMapper.readTree(reader) + val nodes: Iterator[JsonNode] = + if (root.isArray) root.elements().asScala + else if (root.isObject) Iterator.single(root) + else Iterator.empty + + nodes.flatMap { node => + if (!node.isObject) None + else + try Some(buildTupleFromJsonObject(node, names, flatten)) + catch { case _: Throwable => None } + } + } + + private def buildTupleFromJsonObject( + node: JsonNode, + names: List[String], + flatten: Boolean + ): TupleLike = { + val fields = JSONToMap(node, flatten).withDefaultValue(null) + val parsed = names.map { name => + AttributeTypeUtils.parseField(fields(name), schema.getAttribute(name).getType) + } + TupleLike(parsed: _*) + } + + // Arrow -------------------------------------------------------------------- + + private def arrowReader(uri: URI): Iterator[TupleLike] = { + val file = DocumentFactory.openReadonlyDocument(uri).asFile() + val allocator = new RootAllocator() + val channel = Files.newByteChannel(file.toPath, StandardOpenOption.READ) + val arrowReader = new ArrowFileReader(channel, allocator) + val vectorRoot: VectorSchemaRoot = arrowReader.getVectorSchemaRoot + resources += vectorRoot + resources += arrowReader + resources += allocator + resources += closeableOf(() => 
channel.close()) + + new Iterator[TupleLike] { + private var idx = 0 + override def hasNext: Boolean = { + if (vectorRoot.getRowCount > idx) true + else if (arrowReader.loadNextBatch()) { idx = 0; vectorRoot.getRowCount > 0 } + else false + } + override def next(): TupleLike = { + val tuple = ArrowUtils.getTexeraTuple(idx, vectorRoot) + idx += 1 + tuple + } + } + } + + // Parquet ------------------------------------------------------------------ + + private def parquetReader(uri: URI): Iterator[TupleLike] = { + val file = DocumentFactory.openReadonlyDocument(uri).asFile() + val handle = ParquetUtils.openRecords(file) + resources += closeableOf(() => handle.close()) + + val parquetSchema = handle.schema + val attributeNames = schema.getAttributeNames + val parquetIndex: Map[String, Int] = + (0 until parquetSchema.getFieldCount).map(i => parquetSchema.getType(i).getName -> i).toMap + + handle.records.map { group => + val values = attributeNames.map { name => + parquetIndex.get(name) match { + case Some(i) => + val raw = ParquetUtils.readField(group, i, parquetSchema) + try AttributeTypeUtils.parseField(raw, schema.getAttribute(name).getType) + catch { case _: Throwable => raw } + case None => null + } + } + TupleLike(values: _*) + } + } + + // Excel -------------------------------------------------------------------- + + private def excelReader(uri: URI): Iterator[TupleLike] = { + val file = DocumentFactory.openReadonlyDocument(uri).asFile() + val workbook: Workbook = WorkbookFactory.create(file, null, true) + resources += workbook + val sheet = inference.sheetName + .flatMap(name => Option(workbook.getSheet(name))) + .getOrElse(workbook.getSheetAt(0)) + val hasHeader = inference.csvHasHeader.getOrElse(true) + val attributeNames = schema.getAttributeNames + + val rowIter = sheet.iterator().asScala + val dataRows = if (hasHeader && rowIter.hasNext) { rowIter.next(); rowIter } else rowIter + + dataRows.map { row => + val values = attributeNames.indices.map { i => + val cell = row.getCell(i) + val raw = readExcelCell(cell) + try AttributeTypeUtils.parseField(raw, schema.getAttributes(i).getType) + catch { case _: Throwable => raw } + } + TupleLike(values: _*) + } + } + + private def readExcelCell(cell: org.apache.poi.ss.usermodel.Cell): Any = { + import org.apache.poi.ss.usermodel.{CellType, DateUtil} + if (cell == null) return null + cell.getCellType match { + case CellType.STRING => cell.getStringCellValue + case CellType.BOOLEAN => java.lang.Boolean.valueOf(cell.getBooleanCellValue) + case CellType.NUMERIC => + if (DateUtil.isCellDateFormatted(cell)) + new java.sql.Timestamp(cell.getDateCellValue.getTime) + else { + val d = cell.getNumericCellValue + if (d == d.toLong.toDouble) java.lang.Long.valueOf(d.toLong) + else java.lang.Double.valueOf(d) + } + case CellType.FORMULA => + try { + val evaluator = cell.getSheet.getWorkbook.getCreationHelper.createFormulaEvaluator() + val evaluated = evaluator.evaluate(cell) + evaluated.getCellType match { + case CellType.STRING => evaluated.getStringValue + case CellType.BOOLEAN => java.lang.Boolean.valueOf(evaluated.getBooleanValue) + case CellType.NUMERIC => + val d = evaluated.getNumberValue + if (d == d.toLong.toDouble) java.lang.Long.valueOf(d.toLong) + else java.lang.Double.valueOf(d) + case _ => null + } + } catch { + case _: Throwable => null + } + case _ => null + } + } + + // Images ------------------------------------------------------------------- + + private def imageReader(uri: URI): Iterator[TupleLike] = { + val stream = 
DocumentFactory.openReadonlyDocument(uri).asInputStream() + val bytes = + try stream.readAllBytes() + finally stream.close() + val image = ImageIO.read(new ByteArrayInputStream(bytes)) + val format = ImageFormatUtils + .detectFormat(bytes) + .orElse(ImageFormatUtils.extensionFormat(uri.getPath)) + .getOrElse("unknown") + val width = Option(image).map(image => Int.box(image.getWidth)).orNull + val height = Option(image).map(image => Int.box(image.getHeight)).orNull + Iterator.single(TupleLike(bytes, format, width, height)) + } + + // Plain text --------------------------------------------------------------- + + private def textReader(uri: URI): Iterator[TupleLike] = { + val stream = DocumentFactory.openReadonlyDocument(uri).asInputStream() + val br = new BufferedReader(new InputStreamReader(stream, desc.fileEncoding.getCharset)) + resources += br + br.lines().iterator().asScala.map(line => TupleLike(line)) + } + + // Keep the JSONUtils import live (used transitively by JSONToMap/objectMapper above). + locally(JSONUtils) +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDescSpec.scala new file mode 100644 index 00000000000..398f4de7729 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpDescSpec.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
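Stepping back to produceTuple in this executor: offset is applied before limit, so the two compose as a plain Iterator pipeline. A sketch of that ordering:

```scala
// Mirrors produceTuple: drop `offset` rows first, then cap at `limit`.
val offset: Option[Int] = Some(1)
val limit: Option[Int] = Some(2)

var it: Iterator[Int] = Iterator(10, 20, 30, 40).drop(offset.getOrElse(0))
if (limit.isDefined) it = it.take(limit.get)

assert(it.toList == List(20, 30))
```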
+ */ + +package org.apache.texera.amber.operator.fileSplit + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema} +import org.apache.texera.amber.core.workflow.PortIdentity +import org.scalatest.flatspec.AnyFlatSpec + +class FileSplitOpDescSpec extends AnyFlatSpec { + + "FileSplitOpDesc" should "propagate the input schema to every output port" in { + val desc = new FileSplitOpDesc() + val inputSchema = Schema( + List( + new Attribute("source_file", AttributeType.STRING), + new Attribute("value", AttributeType.INTEGER) + ) + ) + + val outputSchemas = desc.getExternalOutputSchemas(Map(PortIdentity() -> inputSchema)) + + assert(outputSchemas.keySet == Set(PortIdentity(), PortIdentity(1))) + assert(outputSchemas.values.forall(_ == inputSchema)) + } + + it should "reject inputs without a file identity column" in { + val desc = new FileSplitOpDesc() + val inputSchema = Schema(List(new Attribute("value", AttributeType.INTEGER))) + + val err = intercept[IllegalArgumentException] { + desc.getExternalOutputSchemas(Map(PortIdentity() -> inputSchema)) + } + assert(err.getMessage.contains("source_file")) + assert(err.getMessage.contains("filename")) + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExecSpec.scala new file mode 100644 index 00000000000..7b04ec961ba --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/fileSplit/FileSplitOpExecSpec.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
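FileSplitOpDescSpec above checks a simple propagation contract: both output ports carry the unchanged input schema. A minimal restatement with a placeholder schema value standing in for the real Schema instances:

```scala
import org.apache.texera.amber.core.workflow.PortIdentity

// Shape-only sketch of the contract the spec asserts.
val ports = Set(PortIdentity(), PortIdentity(1))
def propagate[S](input: S): Map[PortIdentity, S] = ports.map(p => p -> input).toMap

val out = propagate("input-schema")
assert(out.keySet == ports)
assert(out.values.forall(_ == "input-schema"))
```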
+ */ + +package org.apache.texera.amber.operator.fileSplit + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.apache.texera.amber.core.workflow.PortIdentity +import org.apache.texera.amber.util.JSONUtils.objectMapper +import org.scalatest.flatspec.AnyFlatSpec + +class FileSplitOpExecSpec extends AnyFlatSpec { + + "FileSplitOpExec" should "keep rows from the same file on the same output port" in { + val desc = new FileSplitOpDesc() + val exec = new FileSplitOpExec(objectMapper.writeValueAsString(desc)) + val schema = Schema( + List( + new Attribute("source_file", AttributeType.STRING), + new Attribute("value", AttributeType.INTEGER) + ) + ) + + exec.open() + val outputs = List( + Tuple(schema, Array[Any]("a.csv", 1)), + Tuple(schema, Array[Any]("b.csv", 2)), + Tuple(schema, Array[Any]("a.csv", 3)), + Tuple(schema, Array[Any]("c.csv", 4)) + ).flatMap(tuple => exec.processTupleMultiPort(tuple, 0).toList) + exec.close() + + assert(outputs.map(_._2.get) == List(PortIdentity(), PortIdentity(1), PortIdentity(), PortIdentity())) + } + + it should "auto-detect the filename column used by file scans" in { + val desc = new FileSplitOpDesc() + val exec = new FileSplitOpExec(objectMapper.writeValueAsString(desc)) + val schema = Schema( + List( + new Attribute("filename", AttributeType.STRING), + new Attribute("content", AttributeType.BINARY) + ) + ) + + exec.open() + val output = exec + .processTupleMultiPort(Tuple(schema, Array[Any]("cat.png", Array[Byte](1, 2, 3))), 0) + .next() + exec.close() + + assert(output._2.contains(PortIdentity())) + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/llm/LLMFileSourceOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/llm/LLMFileSourceOpDescSpec.scala new file mode 100644 index 00000000000..9945f1b9dd9 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/llm/LLMFileSourceOpDescSpec.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
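The two executor specs above imply a lookup rule (prefer an explicit source_file column, else fall back to the filename column file scans emit) and a stability invariant (rows from one file never split across ports). A hedged sketch of both, since the executor internals are not part of this section:

```scala
// Column fallback implied by the specs (an assumption, not the executor's code).
def fileColumn(names: List[String]): Option[String] =
  names.find(_ == "source_file").orElse(names.find(_ == "filename"))

assert(fileColumn(List("filename", "content")).contains("filename"))
assert(fileColumn(List("source_file", "value")).contains("source_file"))

// Stability invariant: rows sharing a file name never split across ports.
val observed = List("a.csv" -> 0, "b.csv" -> 1, "a.csv" -> 0, "c.csv" -> 0)
assert(observed.groupBy(_._1).values.forall(_.map(_._2).distinct.size == 1))
```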
+ */ + +package org.apache.texera.amber.operator.source.llm + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType} +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import scala.jdk.CollectionConverters._ + +class LLMFileSourceOpDescSpec extends AnyFlatSpec with Matchers { + + private val workflowId = WorkflowIdentity(1L) + private val executionId = ExecutionIdentity(1L) + + "LLMFileSourceOpDesc.getPhysicalOp" should "densify sparse per-table rows to the union schema" in { + val desc = new LLMFileSourceOpDesc + desc.generatedCode = + """from pytexera import * + | + |class GenerateOperator(UDFSourceOperator): + | @overrides + | def produce(self): + | yield {"__table__": "revenue_by_region", "month": "January"} + |""".stripMargin + desc.unionColumns = List( + new Attribute("__table__", AttributeType.STRING), + new Attribute("month", AttributeType.STRING), + new Attribute("department", AttributeType.STRING) + ).asJava + + val code = desc.getPhysicalOp(workflowId, executionId).getCode + + code should include("""_texera_llm_source_columns = ["__table__","month","department"]""") + code should include("yield {column: row.get(column) for column in _texera_llm_source_columns}") + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolverSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolverSpec.scala new file mode 100644 index 00000000000..8f538868c73 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/FolderInputResolverSpec.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
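The generated-code assertion above encodes a densification step: each sparse per-table row is projected onto the declared union columns, and absent keys surface as nulls. The same projection written in Scala, for illustration:

```scala
val unionColumns = List("__table__", "month", "department")
val sparseRow = Map("__table__" -> "revenue_by_region", "month" -> "January")

// Mirrors the generated Python: {column: row.get(column) for column in columns}.
val dense = unionColumns.map(column => column -> sparseRow.getOrElse(column, null))

assert(dense.toMap("department") == null)
assert(dense.map(_._1) == unionColumns) // column order is preserved
```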
+ */ + +package org.apache.texera.amber.operator.source.scan + +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import java.nio.file.Files + +class FolderInputResolverSpec extends AnyFlatSpec with Matchers { + + "FolderInputResolver.materializeToLocalPath" should "return a local folder path for folder inputs" in { + val dir = Files.createTempDirectory("folder-input-resolver-") + try { + Files.writeString(dir.resolve("one.txt"), "one") + + FolderInputResolver.materializeToLocalPath(dir.toUri) shouldBe dir + } finally { + Files.deleteIfExists(dir.resolve("one.txt")) + Files.deleteIfExists(dir) + } + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala index 4437c018bd5..b5906e4edfd 100644 --- a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/file/FileScanSourceOpDescSpec.scala @@ -27,6 +27,9 @@ import org.apache.texera.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec +import java.nio.file.Files +import scala.jdk.CollectionConverters._ + class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { var fileScanSourceOpDesc: FileScanSourceOpDesc = _ @@ -185,4 +188,60 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { FileScanSourceOpExec.close() } + it should "read a folder of binary files and preserve relative file names" in { + val dir = Files.createTempDirectory("file-scan-image-folder-") + try { + Files.write(dir.resolve("cat.png"), Array[Byte](1, 2, 3)) + Files.write(dir.resolve("dog.png"), Array[Byte](4, 5, 6)) + + fileScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(dir.toString)) + fileScanSourceOpDesc.attributeType = FileAttributeType.BINARY + fileScanSourceOpDesc.outputFileName = true + + val exec = new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc)) + exec.open() + val tuples = exec + .produceTuple() + .map(_.asInstanceOf[SchemaEnforceable].enforceSchema(fileScanSourceOpDesc.sourceSchema())) + .toList + exec.close() + + assert(tuples.map(_.getField[String]("filename")) == List("cat.png", "dog.png")) + assert(tuples.map(_.getField[Array[Byte]]("line").toList) == List(List[Byte](1, 2, 3), List[Byte](4, 5, 6))) + } finally deleteRecursively(dir) + } + + it should "preserve relative file names for line-based folder scans" in { + val dir = Files.createTempDirectory("file-scan-text-folder-") + try { + Files.writeString(dir.resolve("a.txt"), "line-a\n") + Files.writeString(dir.resolve("b.txt"), "line-b\n") + + fileScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(dir.toString)) + fileScanSourceOpDesc.attributeType = FileAttributeType.STRING + fileScanSourceOpDesc.outputFileName = true + + val exec = new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc)) + exec.open() + val tuples = exec + .produceTuple() + .map(_.asInstanceOf[SchemaEnforceable].enforceSchema(fileScanSourceOpDesc.sourceSchema())) + .toList + exec.close() + + assert(tuples.map(_.getField[String]("filename")) == List("a.txt", "b.txt")) + assert(tuples.map(_.getField[String]("line")) == List("line-a", "line-b")) + } finally deleteRecursively(dir) + } + 
+ private def deleteRecursively(path: java.nio.file.Path): Unit = { + Files + .walk(path) + .iterator() + .asScala + .toSeq + .sortBy(_.getNameCount)(Ordering.Int.reverse) + .foreach(Files.deleteIfExists) + } + } diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSnifferSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSnifferSpec.scala new file mode 100644 index 00000000000..82f349d2c2f --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/CSVDialectSnifferSpec.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart + +import org.scalatest.flatspec.AnyFlatSpec + +class CSVDialectSnifferSpec extends AnyFlatSpec { + + "CSVDialectSniffer" should "detect comma as delimiter for plain CSV" in { + val text = "id,name,age\n1,Ada,36\n2,Lin,29\n3,Bob,42\n" + val dialect = CSVDialectSniffer.sniff(text) + assert(dialect.delimiter == ',') + assert(dialect.hasHeader) + } + + it should "detect tab as delimiter for TSV-like content" in { + val text = "id\tname\tage\n1\tAda\t36\n2\tLin\t29\n3\tBob\t42\n" + val dialect = CSVDialectSniffer.sniff(text) + assert(dialect.delimiter == '\t') + assert(dialect.hasHeader) + } + + it should "detect semicolon as delimiter when commas are absent" in { + val text = "id;name;age\n1;Ada;36\n2;Lin;29\n3;Bob;42\n" + val dialect = CSVDialectSniffer.sniff(text) + assert(dialect.delimiter == ';') + } + + it should "detect missing header when all rows look like data" in { + val text = "1,Ada,36\n2,Lin,29\n3,Bob,42\n4,Eve,55\n" + val dialect = CSVDialectSniffer.sniff(text) + assert(dialect.delimiter == ',') + // First row is purely numeric/string mixed; later rows are the same shape. + // The sniffer defaults to "has header" only when row 1 looks distinct. 
+    assert(!dialect.hasHeader)
+  }
+
+  it should "honor a preferred delimiter when the content is consistent with it" in {
+    val text = "a,b,c\n1,2,3\n4,5,6\n"
+    val dialect = CSVDialectSniffer.sniff(text, preferred = Some(','))
+    assert(dialect.delimiter == ',')
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetectorSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetectorSpec.scala
new file mode 100644
index 00000000000..cecc74034f5
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/FormatDetectorSpec.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.source.scan.smart
+
+import org.scalatest.flatspec.AnyFlatSpec
+
+import java.nio.charset.StandardCharsets
+
+class FormatDetectorSpec extends AnyFlatSpec {
+
+  private val utf8 = StandardCharsets.UTF_8
+
+  "FormatDetector" should "detect Parquet by magic bytes" in {
+    val bytes = "PAR1".getBytes(utf8) ++ Array.fill(20)(0.toByte)
+    assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.PARQUET)
+  }
+
+  it should "detect XLSX by ZIP magic bytes" in {
+    val bytes = Array[Byte](0x50, 0x4b, 0x03, 0x04, 0, 0, 0, 0)
+    assert(FormatDetector.detect(Some("foo.xlsx"), bytes, utf8) == SmartFileFormat.EXCEL)
+  }
+
+  it should "not classify a generic ZIP container as Excel" in {
+    val bytes = Array[Byte](0x50, 0x4b, 0x03, 0x04, 0, 0, 0, 0)
+    assert(FormatDetector.detect(Some("archive.zip"), bytes, utf8) == SmartFileFormat.TEXT)
+  }
+
+  it should "detect Arrow by ARROW1 magic" in {
+    val bytes = "ARROW1\u0000\u0000".getBytes(utf8)
+    assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.ARROW)
+  }
+
+  it should "detect TSV when content contains tabs and extension matches" in {
+    val bytes = "id\tname\tage\n1\tAda\t36\n2\tLin\t29\n".getBytes(utf8)
+    assert(FormatDetector.detect(Some("users.tsv"), bytes, utf8) == SmartFileFormat.TSV)
+  }
+
+  it should "honor the .csv extension even when the content looks like TSV" in {
+    val bytes = "id\tname\tage\n1\tAda\t36\n2\tLin\t29\n".getBytes(utf8)
+    val detected = FormatDetector.detect(Some("misnamed.csv"), bytes, utf8)
+    // The .csv extension wins over content sniffing — that's the expected ranking.
+ assert(detected == SmartFileFormat.CSV) + } + + it should "fall back to content sniffing when extension is unknown" in { + val bytes = "id\tname\n1\tAda\n2\tLin\n".getBytes(utf8) + assert(FormatDetector.detect(Some("blob.bin"), bytes, utf8) == SmartFileFormat.TSV) + } + + it should "detect JSONL when multiple lines start with {" in { + val bytes = "{\"a\":1}\n{\"a\":2}\n{\"a\":3}\n".getBytes(utf8) + assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.JSONL) + } + + it should "detect JSON array when content starts with [" in { + val bytes = "[ {\"a\":1}, {\"a\":2} ]".getBytes(utf8) + assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.JSON) + } + + it should "detect plain text when there are no delimiters" in { + val bytes = "hello world\nthis is text\n".getBytes(utf8) + assert(FormatDetector.detect(None, bytes, utf8) == SmartFileFormat.TEXT) + } + + it should "prefer extension over content sniffing for CSV" in { + val bytes = "a,b,c\n1,2,3\n".getBytes(utf8) + assert(FormatDetector.detect(Some("data.csv"), bytes, utf8) == SmartFileFormat.CSV) + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDescSpec.scala new file mode 100644 index 00000000000..ec092c69d22 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpDescSpec.scala @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
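Taken together, the FormatDetector tests above pin down a precedence: container magic bytes first, then a recognized extension, then content sniffing as the fallback. A compressed sketch of that ranking, as an illustration of the test matrix rather than the detector's implementation:

```scala
import java.nio.charset.StandardCharsets

// Assumed ranking, reconstructed from the specs above.
def detectSketch(name: Option[String], bytes: Array[Byte]): String = {
  val head = new String(bytes.take(6), StandardCharsets.UTF_8)
  if (head.startsWith("PAR1")) "PARQUET"
  else if (head.startsWith("ARROW1")) "ARROW"
  else if (name.exists(_.endsWith(".csv"))) "CSV" // extension beats sniffing
  else if (bytes.contains('\t'.toByte)) "TSV"     // sniffing as the fallback
  else "TEXT"
}

assert(detectSketch(Some("misnamed.csv"), "id\tname\n1\tAda\n".getBytes("UTF-8")) == "CSV")
assert(detectSketch(Some("blob.bin"), "id\tname\n1\tAda\n".getBytes("UTF-8")) == "TSV")
```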
+ */ + +package org.apache.texera.amber.operator.source.scan.smart + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.example.data.simple.SimpleGroupFactory +import org.apache.parquet.hadoop.ParquetWriter +import org.apache.parquet.hadoop.example.GroupWriteSupport +import org.apache.parquet.schema.{MessageTypeParser, Type} +import org.apache.poi.xssf.usermodel.XSSFWorkbook +import org.apache.texera.amber.core.storage.FileResolver +import org.apache.texera.amber.core.tuple.AttributeType +import org.apache.texera.amber.operator.TestOperators +import org.scalatest.flatspec.AnyFlatSpec + +import java.awt.image.BufferedImage +import java.io.{File, FileOutputStream} +import javax.imageio.ImageIO +import java.nio.file.Files +import java.nio.charset.StandardCharsets +import scala.jdk.CollectionConverters._ + +class SmartFileSourceOpDescSpec extends AnyFlatSpec { + + "SmartFileSourceOpDesc.operatorInfo" should "advertise the broader Smart Source name" in { + val desc = new SmartFileSourceOpDesc() + + assert(desc.operatorInfo.userFriendlyName == "Smart Source") + } + + "SmartFileSourceOpDesc" should "infer CSV format and schema from a CSV file" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.CountrySalesSmallCsvPath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + assert(result.csvDelimiter.contains(",")) + assert(result.csvHasHeader.contains(true)) + assert(result.schema.getAttributes.length == 14) + assert(result.schema.getAttribute("Order ID").getType == AttributeType.INTEGER) + } + + it should "infer JSONL format and schema from a JSONL file" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.smallJsonLPath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.JSONL) + assert(result.schema.getAttributes.nonEmpty) + } + + it should "respect a formatOverride from the user" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.CountrySalesSmallCsvPath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + desc.formatOverride = SmartFileFormat.CSV + desc.customDelimiter = Some(",") + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + } + + it should "infer plain text format for a .txt file" in { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(TestOperators.TestTextFilePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.TEXT) + assert(result.schema.getAttributeNames == List("line")) + assert(result.schema.getAttribute("line").getType == AttributeType.STRING) + } + + it should "infer string columns for a header-only CSV file" in { + val tmp = Files.createTempFile("smartfile-header-only-", ".csv") + try { + Files.writeString(tmp, "id,name,score\n", StandardCharsets.UTF_8) + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(tmp.toFile.getAbsolutePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + assert(result.schema.getAttributeNames == List("id", "name", "score")) + assert(result.schema.getAttributes.forall(_.getType == AttributeType.STRING)) + } finally 
Files.deleteIfExists(tmp) + } + + it should "infer one schema for a folder of similar CSV files" in { + val dir = Files.createTempDirectory("smartfile-folder-") + try { + Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.CSV) + assert(result.isFolder) + assert(result.fileCount == 2) + assert(result.schema.getAttributeNames == List("id", "name")) + } finally deleteRecursively(dir) + } + + it should "infer image folders as image records" in { + val dir = Files.createTempDirectory("smartfile-image-folder-") + try { + writePng(dir.resolve("cat.png").toFile, width = 3, height = 2) + writePng(dir.resolve("dog.png").toFile, width = 4, height = 5) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.IMAGE) + assert(result.isFolder) + assert(result.fileCount == 2) + assert(result.schema.getAttributeNames == List("image", "format", "width", "height")) + assert(result.schema.getAttribute("image").getType == AttributeType.BINARY) + assert(result.schema.getAttribute("format").getType == AttributeType.STRING) + assert(result.schema.getAttribute("width").getType == AttributeType.INTEGER) + assert(result.schema.getAttribute("height").getType == AttributeType.INTEGER) + } finally deleteRecursively(dir) + } + + it should "append a source file column when folder provenance is enabled" in { + val dir = Files.createTempDirectory("smartfile-folder-source-column-") + try { + Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + desc.includeSourceFile = true + + val schema = desc.sourceSchema() + assert(schema.getAttributeNames == List("id", "name", "source_file")) + assert(schema.getAttribute("source_file").getType == AttributeType.STRING) + } finally deleteRecursively(dir) + } + + it should "reject folders that mix file formats" in { + val dir = Files.createTempDirectory("smartfile-mixed-folder-") + try { + Files.writeString(dir.resolve("part.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + Files.writeString(dir.resolve("part.jsonl"), """{"id":2,"name":"Lin"}""" + "\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val err = intercept[IllegalArgumentException](desc.runInference()) + assert(err.getMessage.contains("same detected format")) + } finally deleteRecursively(dir) + } + + it should "reject empty folders" in { + val dir = Files.createTempDirectory("smartfile-empty-folder-") + try { + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val err = intercept[IllegalArgumentException](desc.runInference()) + assert(err.getMessage.contains("does not contain any readable files")) + } finally 
deleteRecursively(dir) + } + + it should "infer schema from a generated Excel file" in { + val tmp = Files.createTempFile("smartfile-test-", ".xlsx").toFile + try { + writeExcel(tmp) + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(tmp.getAbsolutePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.EXCEL) + val attrs = result.schema.getAttributes + assert(attrs.length == 3) + assert(attrs.head.getName == "id") + assert(attrs(1).getName == "name") + assert(attrs(2).getName == "score") + assert(attrs.head.getType == AttributeType.INTEGER) + assert(attrs(2).getType == AttributeType.DOUBLE) + } finally tmp.delete() + } + + it should "infer schema from a generated Parquet file" in { + val tmp = Files.createTempFile("smartfile-test-", ".parquet").toFile + tmp.delete() // ParquetWriter wants to create the file itself + try { + writeParquet(tmp) + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(tmp.getAbsolutePath) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val result = desc.runInference() + assert(result.format == SmartFileFormat.PARQUET) + val attrs = result.schema.getAttributes + assert(attrs.length == 3) + assert(attrs.exists(_.getName == "id")) + assert(result.schema.getAttribute("id").getType == AttributeType.INTEGER) + assert(result.schema.getAttribute("name").getType == AttributeType.STRING) + assert(result.schema.getAttribute("score").getType == AttributeType.DOUBLE) + } finally tmp.delete() + } + + private def writeExcel(out: File): Unit = { + val workbook = new XSSFWorkbook() + try { + val sheet = workbook.createSheet("Sheet1") + val header = sheet.createRow(0) + header.createCell(0).setCellValue("id") + header.createCell(1).setCellValue("name") + header.createCell(2).setCellValue("score") + + val rows = Seq((1, "Ada", 36.5), (2, "Lin", 29.1), (3, "Bob", 42.0)) + rows.zipWithIndex.foreach { + case ((id, name, score), i) => + val row = sheet.createRow(i + 1) + row.createCell(0).setCellValue(id.toDouble) + row.createCell(1).setCellValue(name) + row.createCell(2).setCellValue(score) + } + val fos = new FileOutputStream(out) + try workbook.write(fos) + finally fos.close() + } finally workbook.close() + } + + private def writePng(out: File, width: Int, height: Int): Unit = { + val image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB) + ImageIO.write(image, "png", out) + } + + private def writeParquet(out: File): Unit = { + val schemaStr = + """ + |message simple { + | required int32 id; + | required binary name (UTF8); + | required double score; + |} + """.stripMargin + val schema = MessageTypeParser.parseMessageType(schemaStr) + val conf = new Configuration(false) + conf.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem") + GroupWriteSupport.setSchema(schema, conf) + + val factory = new SimpleGroupFactory(schema) + val writer = new ParquetWriter[org.apache.parquet.example.data.Group]( + new Path(out.toURI), + new GroupWriteSupport(), + org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED, + ParquetWriter.DEFAULT_BLOCK_SIZE, + ParquetWriter.DEFAULT_PAGE_SIZE, + ParquetWriter.DEFAULT_PAGE_SIZE, + true, + false, + ParquetWriter.DEFAULT_WRITER_VERSION, + conf + ) + try { + writer.write(factory.newGroup().append("id", 1).append("name", "Ada").append("score", 36.5d)) + writer.write(factory.newGroup().append("id", 2).append("name", "Lin").append("score", 29.1d)) + } finally writer.close() + + // Avoid 
compiler unused-import warning for Type — keep an explicit reference here so that + // if MessageTypeParser ever changes its return type the compile fails loudly. + val _: Type = schema + } + + private def deleteRecursively(path: java.nio.file.Path): Unit = { + Files + .walk(path) + .iterator() + .asScala + .toSeq + .sortBy(_.getNameCount)(Ordering.Int.reverse) + .foreach(Files.deleteIfExists) + } +} diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExecSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExecSpec.scala new file mode 100644 index 00000000000..13cc2d80cd1 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/source/scan/smart/SmartFileSourceOpExecSpec.scala @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.source.scan.smart + +import org.apache.texera.amber.core.storage.FileResolver +import org.apache.texera.amber.util.JSONUtils.objectMapper +import org.scalatest.flatspec.AnyFlatSpec + +import java.awt.image.BufferedImage +import java.io.File +import javax.imageio.ImageIO +import java.nio.charset.StandardCharsets +import java.nio.file.Files +import scala.jdk.CollectionConverters._ + +class SmartFileSourceOpExecSpec extends AnyFlatSpec { + + "SmartFileSourceOpExec" should "read a folder of similar CSV files as one source" in { + val dir = Files.createTempDirectory("smartfile-folder-exec-") + try { + Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + + val exec = new SmartFileSourceOpExec(objectMapper.writeValueAsString(desc)) + exec.open() + val tuples = exec.produceTuple().toList + exec.close() + + assert(tuples.size == 2) + assert(tuples.map(_.getFields(0)) == List(1, 2)) + assert(tuples.map(_.getFields(1)) == List("Ada", "Lin")) + } finally deleteRecursively(dir) + } + + it should "preserve the originating file for folder rows when enabled" in { + val dir = Files.createTempDirectory("smartfile-folder-source-column-exec-") + try { + Files.writeString(dir.resolve("2025-01.csv"), "id,name\n1,Ada\n", StandardCharsets.UTF_8) + Files.writeString(dir.resolve("2025-02.csv"), "id,name\n2,Lin\n", StandardCharsets.UTF_8) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + desc.includeSourceFile = true + + val exec = new 
SmartFileSourceOpExec(objectMapper.writeValueAsString(desc)) + exec.open() + val tuples = exec.produceTuple().toList + exec.close() + + assert(tuples.map(_.getFields.last) == List("2025-01.csv", "2025-02.csv")) + } finally deleteRecursively(dir) + } + + it should "read image folders as image records with metadata" in { + val dir = Files.createTempDirectory("smartfile-image-folder-exec-") + try { + writePng(dir.resolve("cat.png").toFile, width = 3, height = 2) + writePng(dir.resolve("dog.png").toFile, width = 4, height = 5) + + val desc = new SmartFileSourceOpDesc() + desc.fileName = Some(dir.toString) + desc.setResolvedFileName(FileResolver.resolve(desc.fileName.get)) + desc.includeSourceFile = true + + val exec = new SmartFileSourceOpExec(objectMapper.writeValueAsString(desc)) + exec.open() + val tuples = exec.produceTuple().toList + exec.close() + + assert(tuples.size == 2) + assert(tuples.map(_.getFields(0).asInstanceOf[Array[Byte]].nonEmpty) == List(true, true)) + assert(tuples.map(_.getFields(1)) == List("png", "png")) + assert(tuples.map(_.getFields(2)) == List(3, 4)) + assert(tuples.map(_.getFields(3)) == List(2, 5)) + assert(tuples.map(_.getFields(4)) == List("cat.png", "dog.png")) + } finally deleteRecursively(dir) + } + + private def deleteRecursively(path: java.nio.file.Path): Unit = { + Files + .walk(path) + .iterator() + .asScala + .toSeq + .sortBy(_.getNameCount)(Ordering.Int.reverse) + .foreach(Files.deleteIfExists) + } + + private def writePng(out: File, width: Int, height: Int): Unit = { + val image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB) + ImageIO.write(image, "png", out) + } +} diff --git a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts index 2eda4b53bf6..89ae3fe95ff 100644 --- a/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts +++ b/frontend/src/app/dashboard/component/user/user-dataset/user-dataset-explorer/user-dataset-version-filetree/user-dataset-version-filetree.component.ts @@ -58,6 +58,9 @@ export class UserDatasetVersionFiletreeComponent implements AfterViewInit { @Input() public isExpandAllAfterViewInit = false; + @Input() + public isDirectorySelectable = false; + @ViewChild("tree") tree: any; @Output() @@ -69,9 +72,13 @@ export class UserDatasetVersionFiletreeComponent implements AfterViewInit { actionMapping: { mouse: { click: (tree: any, node: any, $event: any) => { + const isDirectory = node.data.type === "directory"; + if (isDirectory && this.isDirectorySelectable) { + this.selectedTreeNode.emit(node.data); + } if (node.hasChildren) { TREE_ACTIONS.TOGGLE_EXPANDED(tree, node, $event); - } else { + } else if (!isDirectory) { this.selectedTreeNode.emit(node.data); } }, diff --git a/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts b/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts index 5de61b33860..55bb8450ae5 100644 --- a/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts +++ b/frontend/src/app/workspace/component/dataset-file-selector/dataset-file-selector.component.ts @@ -62,6 +62,7 @@ export class DatasetFileSelectorComponent extends FieldType { nzData: 
{ fileMode: true, selectedPath: this.formControl.getRawValue(), + allowDirectorySelection: this.props["allowFolderSelection"] === true, }, nzBodyStyle: { resize: "both", diff --git a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html index f8189ddb3ff..d6b43f767b7 100644 --- a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html +++ b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.html @@ -55,6 +55,7 @@
[hunk body garbled in extraction; from the surrounding diffs, the modal template forwards data.allowDirectorySelection to the filetree's isDirectorySelectable input]
diff --git a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts
index 7f70792f937..9ec8a0809c4 100644
--- a/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts
+++ b/frontend/src/app/workspace/component/dataset-selection-modal/dataset-selection-modal.component.ts
@@ -53,9 +53,10 @@ import { ɵNzTransitionPatchDirective } from "ng-zorro-antd/core/transition-patc
   ],
 })
 export class DatasetSelectionModalComponent implements OnInit {
-  private readonly data = inject(NZ_MODAL_DATA) as {
+  public readonly data = inject(NZ_MODAL_DATA) as {
     fileMode: boolean;
     selectedPath?: string | null;
+    allowDirectorySelection?: boolean;
   };

   datasets: ReadonlyArray = [];
diff --git a/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html b/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html
index 1f2c2963f29..d04ec91fcdc 100644
--- a/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html
+++ b/frontend/src/app/workspace/component/property-editor/operator-property-edit-frame/operator-property-edit-frame.component.html
@@ -88,6 +88,114 @@

 {{ operatorDescription }}
[template hunk garbled in extraction; the recoverable additions: (1) a "Detecting file format..." indicator shown while inference runs, (2) an LLM File Source panel headed "{{ summary.tables.length }} table(s) detected" that lists, per table, "{{ table.name }}", "{{ table.description }}", and column chips "{{ col.name }}: {{ col.type }}" with a per-table action button, and (3) a SmartFileScan banner showing "Detected {{ smartFileInferenceSummary.detectedFormat }}", "Folder: {{ smartFileInferenceSummary.fileCount }} files", "Delimiter: {{ delimiter }}", "Header: {{ smartFileInferenceSummary.hasHeader ? 'yes' : 'no' }}", "Sheet: {{ smartFileInferenceSummary.sheetName }}", and "{{ smartFileInferenceSummary.schema.length }} columns"]
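Before the component diff below, a minimal sketch of the round-trip it implements: send the current `fileName` to the preview endpoint and fold the sniffed dialect back into the operator's properties. The service and response types are the ones this PR adds; the `inferencePatch` helper and its import path are illustrative only.

```typescript
// Illustrative helper (not part of this PR): turn one preview response into a
// property patch, mirroring what applySmartFileInference() does in the component.
import { Observable } from "rxjs";
import { map } from "rxjs/operators";
import {
  SmartFileInferenceService,
  SmartFileInferenceResponse,
} from "./smart-file-inference.service"; // path illustrative

export function inferencePatch(
  service: SmartFileInferenceService,
  fileName: string
): Observable<Record<string, unknown>> {
  return service.preview({ fileName }).pipe(
    map((response: SmartFileInferenceResponse) => {
      const patch: Record<string, unknown> = { formatOverride: response.detectedFormat };
      // Nullable response fields mean "not applicable for this format"; copy only real values.
      if (response.customDelimiter !== null) patch["customDelimiter"] = response.customDelimiter;
      if (response.hasHeader !== null) patch["hasHeader"] = response.hasHeader;
      if (response.sheetName !== null) patch["sheetName"] = response.sheetName;
      if (response.flatten !== null) patch["flatten"] = response.flatten;
      return patch;
    })
  );
}
```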
[diff header and opening hunk lines for operator-property-edit-frame.component.ts garbled in extraction; the surviving tail of the first added field is a Subject declaration whose name and type parameter were lost]
+  ... = new Subject();
+  /** Prevent duplicate inference calls for the same operator/file pair. */
+  private smartFileLastInferenceKey: string | undefined;
+  private smartFileInferenceByOperator = new Map<string, SmartFileInferenceResponse>();
+  public smartFileInferenceSummary?: SmartFileInferenceResponse;
+  public smartFileInferenceLoading = false;
+
+  /**
+   * LLM File Source — loading flag. The summary itself is derived on demand from the operator's
+   * persisted properties (see `llmGenerationSummary` getter below) so it survives component
+   * tear-down/recreation that happens when highlight changes.
+   */
+  public llmGenerationLoading = false;
+
   constructor(
     private formlyJsonschema: FormlyJsonschema,
     private workflowActionService: WorkflowActionService,
@@ -173,7 +203,10 @@
     private changeDetectorRef: ChangeDetectorRef,
     private workflowVersionService: WorkflowVersionService,
     private workflowStatusSerivce: WorkflowStatusService,
-    private config: GuiConfigService
+    private config: GuiConfigService,
+    private smartFileInferenceService: SmartFileInferenceService,
+    private llmSourceService: LLMSourceService,
+    private workflowUtilService: WorkflowUtilService
   ) {}

   ngOnChanges(changes: SimpleChanges): void {
@@ -243,6 +276,12 @@
     this.setFormlyFormBinding(this.currentOperatorSchema.jsonSchema);
     this.formTitle = operator.customDisplayName ?? this.currentOperatorSchema.additionalMetadata.userFriendlyName;
     this.operatorDescription = this.currentOperatorSchema.additionalMetadata.operatorDescription;
+    this.smartFileInferenceSummary =
+      this.currentOperatorSchema.operatorType === SMART_FILE_SCAN_TYPE
+        ? this.smartFileInferenceByOperator.get(operator.operatorID)
+        : undefined;
+    this.smartFileInferenceLoading = false;
+    this.llmGenerationLoading = false;
     /**
      * Important: make a deep copy of the initial property data object.
      * Prevent the form directly changes the value in the texera graph without going through workflow action service.
@@ -349,10 +388,312 @@
       this.typeInferenceOnLambdaFunction(formData);
       this.workflowActionService.setOperatorProperty(this.currentOperatorId, cloneDeep(formData));
       this.listeningToChange = true;
+      this.runSmartFileInferenceIfNeeded(formData);
     }
   });
 }
+
+  /**
+   * For `SmartFileScan` operators, when the user picks a new file the backend can sniff the
+   * format, dialect, and schema and tell us what to prefill. This method only fires once per
+   * fileName change (so editing other fields doesn't re-trigger it) and silently no-ops for any
+   * other operator type.
+   */
+  private runSmartFileInferenceIfNeeded(formData: Record<string, unknown>): void {
+    if (!this.currentOperatorId) return;
+    if (this.currentOperatorSchema?.operatorType !== SMART_FILE_SCAN_TYPE) return;
+    const fileName = formData?.["fileName"];
+    if (typeof fileName !== "string" || fileName.length === 0) return;
+    const operatorIdAtRequestTime = this.currentOperatorId;
+    const inferenceKey = `${operatorIdAtRequestTime}:${fileName}`;
+    if (inferenceKey === this.smartFileLastInferenceKey) return;
+    this.smartFileLastInferenceKey = inferenceKey;
+    this.smartFileInferenceByOperator.delete(operatorIdAtRequestTime);
+    this.smartFileInferenceSummary = undefined;
+    this.smartFileInferenceLoading = true;
+
+    const formatOverride = formData["formatOverride"];
+    const requestFormat =
+      typeof formatOverride === "string" && formatOverride !== "Auto-detect" && formatOverride !== "AUTO"
+        ? formatOverride
+        : undefined;
+    const customDelimiter = formData["customDelimiter"];
+    const hasHeader = formData["hasHeader"];
+    const sheetName = formData["sheetName"];
+    const flatten = formData["flatten"];
+    const fileEncoding = formData["fileEncoding"];
+
+    this.smartFileInferenceService
+      .preview({
+        fileName,
+        fileEncoding: typeof fileEncoding === "string" ? fileEncoding : undefined,
+        formatOverride: requestFormat,
+        customDelimiter:
+          typeof customDelimiter === "string" && customDelimiter.length > 0 ? customDelimiter : undefined,
+        hasHeader: typeof hasHeader === "boolean" ? hasHeader : undefined,
+        sheetName: typeof sheetName === "string" && sheetName.length > 0 ? sheetName : undefined,
+        flatten: typeof flatten === "boolean" ? flatten : undefined,
+      })
+      .pipe(untilDestroyed(this))
+      .subscribe({
+        next: response => this.applySmartFileInference(operatorIdAtRequestTime, fileName, response),
+        error: (err: unknown) => {
+          if (this.currentOperatorId === operatorIdAtRequestTime) {
+            this.smartFileInferenceLoading = false;
+          }
+          if (this.smartFileLastInferenceKey === inferenceKey) {
+            this.smartFileLastInferenceKey = undefined;
+          }
+          // Surface as a non-blocking warning. Sniffing failure shouldn't break the workflow —
+          // the operator's own sourceSchema() call will re-attempt at compile time.
+          this.notificationService.warning(`Could not auto-detect file: ${this.smartFileInferenceErrorMessage(err)}`);
+        },
+      });
+  }
+
+  private applySmartFileInference(
+    operatorIdAtRequestTime: string,
+    fileNameAtRequestTime: string,
+    response: SmartFileInferenceResponse
+  ): void {
+    const operator = this.workflowActionService.getTexeraGraph().getOperator(operatorIdAtRequestTime);
+    if (!operator) return;
+    // Drop stale responses — user may have already changed the file again.
+    if (operator.operatorProperties["fileName"] !== fileNameAtRequestTime) return;
+
+    const merged: Record<string, unknown> = { ...operator.operatorProperties };
+    merged["formatOverride"] = response.detectedFormat;
+    if (response.customDelimiter !== null && response.customDelimiter !== undefined) {
+      merged["customDelimiter"] = response.customDelimiter;
+    }
+    if (response.hasHeader !== null && response.hasHeader !== undefined) {
+      merged["hasHeader"] = response.hasHeader;
+    }
+    if (response.sheetName !== null && response.sheetName !== undefined) {
+      merged["sheetName"] = response.sheetName;
+    }
+    if (response.flatten !== null && response.flatten !== undefined) {
+      merged["flatten"] = response.flatten;
+    }
+    const sourceFileColumnExists = response.schema.some(column => column.name.toLowerCase() === "source_file");
+    if (response.isFolder && !sourceFileColumnExists && merged["includeSourceFile"] === undefined) {
+      merged["includeSourceFile"] = true;
+    }
+    this.smartFileInferenceByOperator.set(operatorIdAtRequestTime, response);
+    if (this.currentOperatorId === operatorIdAtRequestTime) {
+      this.smartFileInferenceSummary = response;
+      this.smartFileInferenceLoading = false;
+    }
+    this.workflowActionService.setOperatorProperty(operatorIdAtRequestTime, merged);
+  }
+
+  public formatSmartFileDelimiter(delimiter: string | null): string | undefined {
+    if (delimiter === null) return undefined;
+    if (delimiter === "\t") return "tab";
+    if (delimiter === " ") return "space";
+    return delimiter;
+  }
+
+  /**
+   * Reconstruct the "tables detected" summary from the operator's persisted JSON properties.
+   * We can't keep an in-memory cache because the parent property editor tears down and recreates
+   * this component every time the highlight changes — which happens when our "Filter+Projection"
+   * helper adds new operators and we re-highlight the source.
+   */
+  public get llmGenerationSummary(): LLMSourceGenerateResponse | undefined {
+    if (!this.currentOperatorId) return undefined;
+    if (this.currentOperatorSchema?.operatorType !== LLM_FILE_SCAN_TYPE) return undefined;
+    const op = this.workflowActionService.getTexeraGraph().getOperator(this.currentOperatorId);
+    if (!op) return undefined;
+    const props = op.operatorProperties as Record<string, unknown>;
+    const tables = props["tables"];
+    if (!Array.isArray(tables) || tables.length === 0) return undefined;
+    const reverseColumns = (cols: unknown): LLMSourceColumn[] =>
+      Array.isArray(cols)
+        ? cols.map(c => {
+            const attr = c as Record<string, unknown>;
+            return {
+              name: (attr["attributeName"] as string) ?? (attr["name"] as string) ?? "",
+              type: (attr["attributeType"] as string) ?? (attr["type"] as string) ?? "string",
+            };
+          })
+        : [];
+    return {
+      generatedCode: (props["generatedCode"] as string) ?? "",
+      tables: tables.map(raw => {
+        const t = raw as Record<string, unknown>;
+        return {
+          name: (t["name"] as string) ?? "",
+          description: (t["description"] as string) ?? "",
+          columns: reverseColumns(t["columns"]),
+        };
+      }),
+      unionColumns: reverseColumns(props["unionColumns"]),
+      llmModel: (props["llmModel"] as string) ?? "",
+      sampleHash: (props["sampleHash"] as string) ?? "",
+      generatedAt: (props["generatedAt"] as string) ?? "",
+      warnings: [],
+    };
+  }
+
+  /** ngFor trackBy for the tables list — keeps DOM nodes stable across regenerates. */
+  public trackLLMTable = (_idx: number, table: LLMSourceTable): string => table?.name ?? String(_idx);
+  public trackLLMColumn = (_idx: number, col: LLMSourceColumn): string => col?.name ?? String(_idx);
+
+  /**
+   * Triggered by the "Generate" button in the property panel when an `LLMFileScan` operator is
+   * selected. Sends the current `fileName` + `userHint` to the backend, which samples the file,
+   * asks the LLM for a parser + table schemas, and returns the result. On success we persist the
+   * generated code, schema, and audit metadata onto the operator's properties.
+   */
+  public triggerLLMGeneration(): void {
+    if (!this.currentOperatorId) return;
+    const operatorIdAtRequestTime = this.currentOperatorId;
+    const fileName = this.formData?.["fileName"];
+    if (typeof fileName !== "string" || fileName.length === 0) {
+      this.notificationService.warning("Pick a file first.");
+      return;
+    }
+    const userHint = this.formData?.["userHint"];
+    const llmModel = this.formData?.["llmModel"];
+
+    this.llmGenerationLoading = true;
+    this.llmSourceService
+      .generate({
+        fileName,
+        userHint: typeof userHint === "string" && userHint.length > 0 ? userHint : undefined,
+        llmModel: typeof llmModel === "string" && llmModel.length > 0 ? llmModel : undefined,
+      })
+      .pipe(untilDestroyed(this))
+      .subscribe({
+        next: response => this.applyLLMGeneration(operatorIdAtRequestTime, response),
+        error: (err: unknown) => {
+          if (this.currentOperatorId === operatorIdAtRequestTime) {
+            this.llmGenerationLoading = false;
+            this.changeDetectorRef.detectChanges();
+          }
+          // eslint-disable-next-line no-console
+          console.error("[LLMSource] generate failed", err);
+          this.notificationService.error(`LLM generation failed: ${this.smartFileInferenceErrorMessage(err)}`);
+        },
+      });
+  }
+
+  private applyLLMGeneration(operatorIdAtRequestTime: string, response: LLMSourceGenerateResponse): void {
+    // Reset the loading flag FIRST so the spinner never gets stuck if anything below throws.
+    if (this.currentOperatorId === operatorIdAtRequestTime) {
+      this.llmGenerationLoading = false;
+    }
+    try {
+      const operator = this.workflowActionService.getTexeraGraph().getOperator(operatorIdAtRequestTime);
+      if (!operator) {
+        // eslint-disable-next-line no-console
+        console.warn("[LLMSource] operator no longer exists when response arrived", operatorIdAtRequestTime);
+        return;
+      }
+      const tables = Array.isArray(response.tables) ? response.tables : [];
+      const unionColumns = Array.isArray(response.unionColumns) ? response.unionColumns : [];
+      const merged: Record<string, unknown> = { ...operator.operatorProperties };
+      merged["generatedCode"] = response.generatedCode ?? "";
+      merged["tables"] = tables.map(t => ({
+        name: t?.name ?? "",
+        description: t?.description ?? "",
+        columns: Array.isArray(t?.columns)
+          ? t.columns.map(c => ({ attributeName: c?.name ?? "", attributeType: c?.type ?? "string" }))
+          : [],
+      }));
+      merged["unionColumns"] = unionColumns.map(c => ({
+        attributeName: c?.name ?? "",
+        attributeType: c?.type ?? "string",
+      }));
+      merged["llmModel"] = response.llmModel ?? "";
+      merged["sampleHash"] = response.sampleHash ?? "";
+      merged["generatedAt"] = response.generatedAt ?? "";
+      this.workflowActionService.setOperatorProperty(operatorIdAtRequestTime, merged);
+      // Force change detection so the getter-backed `llmGenerationSummary` re-evaluates immediately
+      // — without this, in some edge cases the property panel doesn't refresh until the next user action.
+      this.changeDetectorRef.detectChanges();
+      if (response.warnings && response.warnings.length > 0) {
+        this.notificationService.warning(`LLM warnings: ${response.warnings.join("; ")}`);
+      }
+    } catch (err) {
+      // eslint-disable-next-line no-console
+      console.error("[LLMSource] applyLLMGeneration failed", err);
+      this.notificationService.error(`Could not apply LLM response: ${(err as Error)?.message ?? err}`);
+    }
+  }
+
+  /**
+   * One-click "split-out this table" helper. Adds a Filter (__table__ == name) + Projection
+   * (drops __table__ and columns belonging to other tables) downstream of the LLM source.
+   * Phase 2 will replace this with true multi-output ports on the source itself.
+   */
+  public addFilterProjectionForTable(table: LLMSourceTable): void {
+    if (!this.currentOperatorId) return;
+    const sourceId = this.currentOperatorId;
+    const sourceOp = this.workflowActionService.getTexeraGraph().getOperator(sourceId);
+    if (!sourceOp) return;
+    const summary = this.llmGenerationSummary;
+    if (!summary) return;
+
+    const sourcePos = this.workflowActionService.getJointGraphWrapper().getElementPosition(sourceId);
+    // Offset each table's chain vertically so subsequent clicks don't stack on top of each other.
+    const tableIndex = summary.tables.findIndex(t => t.name === table.name);
+    const yOffset = (tableIndex >= 0 ? tableIndex : 0) * 120;
+    const filterPos: Point = { x: sourcePos.x + 200, y: sourcePos.y + yOffset };
+    const projectionPos: Point = { x: sourcePos.x + 400, y: sourcePos.y + yOffset };
+
+    const filter = this.workflowUtilService.getNewOperatorPredicate("Filter", `Filter: ${table.name}`);
+    const projection = this.workflowUtilService.getNewOperatorPredicate("Projection", `Project: ${table.name}`);
+
+    // Predicate: __table__ == table.name
+    (filter.operatorProperties as Record<string, unknown>)["predicates"] = [
+      { attribute: "__table__", condition: "=", value: table.name },
+    ];
+
+    // Projection: drop __table__ + any union columns that don't belong to this table.
+    const keepNames = new Set(table.columns.map((c: LLMSourceColumn) => c.name));
+    const toDrop = summary.unionColumns
+      .map((c: LLMSourceColumn) => c.name)
+      .filter(name => name === "__table__" || !keepNames.has(name));
+    (projection.operatorProperties as Record<string, unknown>)["isDrop"] = true;
+    (projection.operatorProperties as Record<string, unknown>)["attributes"] = toDrop.map(name => ({
+      originalAttribute: name,
+      alias: "",
+    }));
+
+    this.workflowActionService.addOperator(filter, filterPos);
+    this.workflowActionService.addOperator(projection, projectionPos);
+    const sourceOutputPort = sourceOp.outputPorts[0];
+    const filterInputPort = filter.inputPorts[0];
+    const filterOutputPort = filter.outputPorts[0];
+    const projectionInputPort = projection.inputPorts[0];
+    this.workflowActionService.addLink({
+      linkID: this.workflowUtilService.getLinkRandomUUID(),
+      source: { operatorID: sourceId, portID: sourceOutputPort.portID },
+      target: { operatorID: filter.operatorID, portID: filterInputPort.portID },
+    });
+    this.workflowActionService.addLink({
+      linkID: this.workflowUtilService.getLinkRandomUUID(),
+      source: { operatorID: filter.operatorID, portID: filterOutputPort.portID },
+      target: { operatorID: projection.operatorID, portID: projectionInputPort.portID },
+    });
+    // Keep the LLM source operator selected so the user can keep clicking "Filter+Projection"
+    // for the other tables without manually re-clicking the source on the canvas.
+ const jointWrapper = this.workflowActionService.getJointGraphWrapper(); + jointWrapper.unhighlightOperators(...jointWrapper.getCurrentHighlightedOperatorIDs()); + jointWrapper.highlightOperators(sourceId); + this.notificationService.success(`Added Filter + Projection for '${table.name}'.`); + } + + private smartFileInferenceErrorMessage(err: unknown): string { + if (typeof err !== "object" || err === null) return "unknown error"; + const maybeError = err as { error?: { message?: unknown }; message?: unknown }; + if (typeof maybeError.error?.message === "string") return maybeError.error.message; + if (typeof maybeError.message === "string") return maybeError.message; + return "unknown error"; + } + typeInferenceOnLambdaFunction(formData: any): void { if (!this.currentOperatorId?.includes("PythonLambdaFunction")) { return; @@ -468,6 +809,13 @@ export class OperatorPropertyEditFrameComponent implements OnInit, OnChanges, On // if the title is fileName, then change it to custom autocomplete input template if (mappedField.key === "fileName") { mappedField.type = "inputautocomplete"; + mappedField.props = { + ...mappedField.props, + allowFolderSelection: + this.currentOperatorSchema?.operatorType === this.smartFileScanType || + this.currentOperatorSchema?.operatorType === this.llmFileScanType || + this.currentOperatorSchema?.operatorType === "FileScan", + }; } if (mappedField.key === "datasetVersionPath") { diff --git a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.spec.ts b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.spec.ts new file mode 100644 index 00000000000..1b5428892c7 --- /dev/null +++ b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.spec.ts @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { isImageDataUrl } from "./result-table-cell.utils"; + +describe("isImageDataUrl", () => { + it("should recognize supported image data URLs", () => { + expect(isImageDataUrl("data:image/png;base64,AAAA")).toBe(true); + expect(isImageDataUrl("data:image/jpeg;base64,BBBB")).toBe(true); + expect(isImageDataUrl("data:image/webp;base64,CCCC")).toBe(true); + }); + + it("should reject binary previews and non-image strings", () => { + expect(isImageDataUrl("")).toBe(false); + expect(isImageDataUrl("data:text/plain;base64,AAAA")).toBe(false); + expect(isImageDataUrl(42)).toBe(false); + }); +}); diff --git a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.ts b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.ts new file mode 100644 index 00000000000..830551304b9 --- /dev/null +++ b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-cell.utils.ts @@ -0,0 +1,22 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +export function isImageDataUrl(value: unknown): value is string { + return typeof value === "string" && /^data:image\/(?:png|jpeg|gif|webp);base64,/i.test(value); +} diff --git a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html index 5400d978ee3..6fc0b49dd89 100644 --- a/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html +++ b/frontend/src/app/workspace/component/result-panel/result-table-frame/result-table-frame.component.html @@ -161,7 +161,14 @@
-    {{ column.getCell(row) }}
[hunk additions garbled in extraction; recoverable content: the cell now renders an inline image preview when isImageDataUrl(column.getCell(row)) matches, and falls back to the plain "{{ column.getCell(row) }}" text otherwise]
[new file visual-trace-panel.component.html also garbled in extraction; recoverable content: a "Visual Journey" panel with a hero block ("{{ currentTrace.title }}", "{{ currentTrace.subtitle }}", hero media, and a hero metric "{{ currentTrace.heroMetric.label }}" / "{{ currentTrace.heroMetric.value }}"), a "{{ currentTrace.summary }}" line, a numbered filmstrip of step thumbnails, and an ordered step list showing "{{ getKindLabel(step.kind) }}" and "{{ getStepLabel(step) }}" chips, "{{ step.title }}", "{{ step.detail }}", and per-step "{{ metric.label }}" / "{{ metric.value }}" pills]
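`visual-trace.interface.ts` is referenced throughout but not included in this diff. Pieced together from the template fields above, the component, and the spec below, the payload is shaped roughly like the following sketch (field optionality is an assumption):

```typescript
// Inferred shape of the visual-trace payload (a sketch, not the committed interface).
export type VisualTraceStepKind = "source" | "match" | "compute" | "render";

export interface VisualTraceMetric {
  label: string;
  value: string;
}

export interface VisualTraceStep {
  title: string;
  detail?: string; // secondary text under the step title
  kind?: VisualTraceStepKind; // drives the kind chip and dot color
  operatorId?: string; // links the step back to a canvas operator
  operatorLabel?: string; // overrides the operator chip text
  image?: string; // data URL thumbnail for the filmstrip
  metrics?: VisualTraceMetric[]; // rendered as metric pills
}

export interface VisualTrace {
  title: string;
  subtitle?: string;
  summary?: string;
  heroImage?: string; // data URL for the hero media box
  heroMetric?: VisualTraceMetric;
  steps: VisualTraceStep[];
}
```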
+ + diff --git a/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.scss b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.scss new file mode 100644 index 00000000000..4adb8bade0c --- /dev/null +++ b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.scss @@ -0,0 +1,346 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +:host { + position: fixed; + inset: 0; + z-index: 6; + pointer-events: none; +} + +.trace-panel { + position: absolute; + top: 74px; + right: 14px; + bottom: 14px; + width: min(420px, calc(100vw - 28px)); + background: #fff; + border: 1px solid #dfe5ec; + box-shadow: 0 18px 42px rgba(19, 29, 40, 0.18); + display: flex; + flex-direction: column; + overflow: hidden; + pointer-events: auto; +} + +.trace-header { + padding: 18px 18px 14px; + border-bottom: 1px solid #e8edf3; + background: + linear-gradient(135deg, rgba(255, 244, 214, 0.9), rgba(232, 247, 255, 0.94)), + #fff; +} + +.close-button { + position: absolute; + top: 12px; + right: 12px; + border: 0; + background: #fff; + width: 30px; + height: 30px; + display: grid; + place-items: center; + color: #253040; + cursor: pointer; +} + +.hero { + display: grid; + grid-template-columns: 74px 1fr; + gap: 12px; + align-items: center; + padding-right: 34px; +} + +.hero-media { + width: 74px; + height: 74px; + background: #fff; + border: 1px solid #d9e0e8; + display: grid; + place-items: center; +} + +.hero-media img { + max-width: 64px; + max-height: 64px; + object-fit: contain; + image-rendering: pixelated; +} + +.hero-copy { + min-width: 0; +} + +.hero-kicker { + color: #46617f; + font-size: 11px; + text-transform: uppercase; + letter-spacing: 0; +} + +.hero-title { + color: #17202d; + font-size: 20px; + font-weight: 700; + line-height: 1.2; +} + +.hero-subtitle { + color: #556577; + font-size: 13px; + margin-top: 4px; +} + +.hero-metric { + grid-column: 1 / -1; + justify-self: start; + display: inline-flex; + align-items: baseline; + gap: 8px; + margin-top: 10px; + padding: 7px 10px; + background: #17202d; + color: #fff; +} + +.hero-metric span, +.metric-pill span { + font-size: 11px; + color: inherit; + opacity: 0.74; +} + +.hero-metric strong { + font-size: 18px; +} + +.trace-summary { + margin: 12px 0 0; + color: #334255; + font-size: 13px; + line-height: 1.45; +} + +.filmstrip { + display: flex; + gap: 10px; + padding: 14px 18px; + overflow-x: auto; + border-bottom: 1px solid #edf1f5; +} + +.film-frame { + position: relative; + flex: 0 0 auto; + width: 72px; +} + +.film-index { + position: absolute; + top: -5px; + left: -5px; + z-index: 1; + width: 20px; + height: 20px; + display: grid; + place-items: center; + background: #17202d; + color: #fff; + 
font-size: 11px; +} + +.film-image { + width: 72px; + height: 72px; + border: 1px solid #dfe5ec; + background: #fff; + display: grid; + place-items: center; +} + +.film-image img { + width: 100%; + height: 100%; + object-fit: contain; + image-rendering: pixelated; +} + +.film-image--empty { + background: #f3f6f8; + color: #526375; + font-size: 11px; + text-align: center; +} + +.trace-steps { + padding: 18px; + overflow-y: auto; +} + +.trace-step { + position: relative; + display: grid; + grid-template-columns: 16px 68px 1fr; + gap: 12px; + min-height: 92px; + padding-bottom: 18px; +} + +.trace-step:last-child { + padding-bottom: 0; +} + +.trace-line { + position: relative; + display: flex; + justify-content: center; +} + +.trace-line::after { + content: ""; + position: absolute; + top: 18px; + bottom: -18px; + width: 2px; + background: #d9e0e8; +} + +.trace-step:last-child .trace-line::after { + display: none; +} + +.trace-dot { + position: relative; + z-index: 1; + width: 12px; + height: 12px; + margin-top: 6px; + background: #62768d; +} + +.trace-step--source .trace-dot { + background: #1f8a70; +} + +.trace-step--match .trace-dot { + background: #2b6cb0; +} + +.trace-step--compute .trace-dot { + background: #d97706; +} + +.trace-step--render .trace-dot { + background: #b83280; +} + +.step-media { + width: 68px; + height: 68px; + border: 1px solid #dfe5ec; + background: #f7f9fb; + display: grid; + place-items: center; + color: #526375; + font-size: 16px; + font-weight: 700; +} + +.step-media img { + width: 100%; + height: 100%; + object-fit: contain; + image-rendering: pixelated; +} + +.step-body { + min-width: 0; +} + +.step-meta { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-bottom: 6px; +} + +.kind-chip, +.operator-chip { + border: 1px solid #d9e0e8; + background: #fff; + color: #314255; + padding: 3px 7px; + font-size: 11px; + line-height: 1.2; +} + +.operator-chip { + cursor: pointer; +} + +.operator-chip--static { + cursor: default; +} + +.step-body h3 { + margin: 0; + color: #17202d; + font-size: 14px; + font-weight: 700; + line-height: 1.35; +} + +.step-body p { + margin: 5px 0 0; + color: #556577; + font-size: 12px; + line-height: 1.45; +} + +.metric-row { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-top: 8px; +} + +.metric-pill { + display: inline-flex; + align-items: baseline; + gap: 5px; + padding: 4px 7px; + background: #f3f6f8; + color: #253040; +} + +.metric-pill strong { + font-size: 12px; +} + +@media (max-width: 720px) { + .trace-panel { + top: 60px; + right: 8px; + bottom: 8px; + width: calc(100vw - 16px); + } +} + diff --git a/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.spec.ts b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.spec.ts new file mode 100644 index 00000000000..d5ff3f4f6ad --- /dev/null +++ b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.spec.ts @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { ComponentFixture, TestBed } from "@angular/core/testing";
+import { BehaviorSubject } from "rxjs";
+import { VisualTracePanelComponent } from "./visual-trace-panel.component";
+import { VisualTraceService } from "../../service/visual-trace/visual-trace.service";
+import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service";
+import { VisualTrace } from "../../types/visual-trace.interface";
+
+describe("VisualTracePanelComponent", () => {
+  let fixture: ComponentFixture<VisualTracePanelComponent>;
+  let component: VisualTracePanelComponent;
+  let traceSubject: BehaviorSubject<VisualTrace | undefined>;
+
+  beforeEach(async () => {
+    traceSubject = new BehaviorSubject<VisualTrace | undefined>(undefined);
+
+    await TestBed.configureTestingModule({
+      imports: [VisualTracePanelComponent],
+      providers: [
+        {
+          provide: VisualTraceService,
+          useValue: {
+            trace$: traceSubject.asObservable(),
+            closeTrace: vi.fn(),
+          },
+        },
+        {
+          provide: WorkflowActionService,
+          useValue: {
+            getTexeraGraph: () => ({
+              hasOperator: vi.fn().mockReturnValue(true),
+              getOperator: vi.fn().mockReturnValue({
+                operatorID: "op1",
+                operatorType: "PythonUDFV2",
+                customDisplayName: "Battle Logic",
+              }),
+            }),
+            highlightOperators: vi.fn(),
+          },
+        },
+      ],
+    }).compileComponents();
+
+    fixture = TestBed.createComponent(VisualTracePanelComponent);
+    component = fixture.componentInstance;
+    fixture.detectChanges();
+  });
+
+  it("renders a visual journey with hero media, metrics, and ordered steps", () => {
+    traceSubject.next({
+      title: "Charizard wins",
+      subtitle: "Fire matchup",
+      heroImage: "data:image/png;base64,abc",
+      heroMetric: { label: "Advantage", value: "2x" },
+      steps: [
+        {
+          title: "Loaded sprite",
+          operatorId: "op1",
+          image: "data:image/png;base64,abc",
+          metrics: [{ label: "Rows", value: "440" }],
+        },
+        {
+          title: "Rendered result",
+          kind: "render",
+        },
+      ],
+    });
+    fixture.detectChanges();
+
+    const native = fixture.nativeElement as HTMLElement;
+    expect(native.querySelector(".trace-panel")).toBeTruthy();
+    expect(native.querySelector(".hero-title")?.textContent).toContain("Charizard wins");
+    expect(native.querySelector(".hero-media img")).toBeTruthy();
+    expect(native.querySelector(".hero-metric")?.textContent).toContain("2x");
+    expect(native.querySelectorAll(".trace-step")).toHaveLength(2);
+    expect(native.textContent).toContain("Loaded sprite");
+    expect(native.textContent).toContain("Rendered result");
+  });
+});
diff --git a/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.ts b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.ts
new file mode 100644
index 00000000000..2962229517a
--- /dev/null
+++ b/frontend/src/app/workspace/component/visual-trace-panel/visual-trace-panel.component.ts
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { NgClass, NgFor, NgIf } from "@angular/common"; +import { Component, OnInit } from "@angular/core"; +import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; +import { NzIconDirective } from "ng-zorro-antd/icon"; +import { VisualTraceService } from "../../service/visual-trace/visual-trace.service"; +import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service"; +import { VisualTrace, VisualTraceStep, VisualTraceStepKind } from "../../types/visual-trace.interface"; + +@UntilDestroy() +@Component({ + selector: "texera-visual-trace-panel", + templateUrl: "./visual-trace-panel.component.html", + styleUrls: ["./visual-trace-panel.component.scss"], + imports: [NgIf, NgFor, NgClass, NzIconDirective], +}) +export class VisualTracePanelComponent implements OnInit { + public trace?: VisualTrace; + + constructor( + private readonly visualTraceService: VisualTraceService, + private readonly workflowActionService: WorkflowActionService + ) {} + + ngOnInit(): void { + this.visualTraceService.trace$.pipe(untilDestroyed(this)).subscribe(trace => { + this.trace = trace; + }); + } + + public close(): void { + this.visualTraceService.closeTrace(); + } + + public focusOperator(step: VisualTraceStep): void { + if (!step.operatorId || !this.workflowActionService.getTexeraGraph().hasOperator(step.operatorId)) { + return; + } + this.workflowActionService.highlightOperators(false, step.operatorId); + } + + public getStepLabel(step: VisualTraceStep): string { + if (step.operatorLabel) { + return step.operatorLabel; + } + if (!step.operatorId || !this.workflowActionService.getTexeraGraph().hasOperator(step.operatorId)) { + return this.getKindLabel(step.kind); + } + const operator = this.workflowActionService.getTexeraGraph().getOperator(step.operatorId); + return operator.customDisplayName ?? 
operator.operatorType;
+  }
+
+  public getKindLabel(kind?: VisualTraceStepKind): string {
+    switch (kind) {
+      case "source":
+        return "Source";
+      case "match":
+        return "Match";
+      case "compute":
+        return "Compute";
+      case "render":
+        return "Render";
+      default:
+        return "Step";
+    }
+  }
+}
diff --git a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html
index c092a4bf74a..0c759a0af5d 100644
--- a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html
+++ b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.html
@@ -18,5 +18,7 @@
 -->
[hunk additions garbled in extraction; from the component changes below, the template's iframe gains a #visualizationFrame template reference and a (load) binding to onVisualizationFrameLoad()]
diff --git a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts
index eb329c1c7f1..4602476e798 100644
--- a/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts
+++ b/frontend/src/app/workspace/component/visualization-panel-content/visualization-frame-content.component.ts
@@ -17,11 +17,22 @@
  * under the License.
  */

-import { AfterContentInit, Component, Input } from "@angular/core";
+import { AfterContentInit, Component, ElementRef, HostListener, Input, ViewChild } from "@angular/core";
 import { DomSanitizer } from "@angular/platform-browser";
 import { WorkflowResultService } from "../../service/workflow-result/workflow-result.service";
 import { auditTime, filter } from "rxjs/operators";
 import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy";
+import { VisualTraceService } from "../../service/visual-trace/visual-trace.service";
+import {
+  buildStructuralVisualTrace,
+  buildVisualTraceBridgeScript,
+  extractVisualTraceSelectionFromElement,
+  findVisualTraceElement,
+  parseVisualTraceMessage,
+  parseVisualTracePayloadAttribute,
+  parseVisualTraceSelectionMessage,
+} from "../../service/visual-trace/visual-trace.utils";
+import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service";

 @UntilDestroy()
 @Component({
@@ -32,13 +43,17 @@
 export class VisualizationFrameContentComponent implements AfterContentInit {
   // operatorId: string = inject(NZ_MODAL_DATA).operatorId;
   @Input() operatorId?: string;
+  @ViewChild("visualizationFrame") visualizationFrame?: ElementRef<HTMLIFrameElement>;

   // progressive visualization update and redraw interval in milliseconds
   public static readonly UPDATE_INTERVAL_MS = 2000;
   htmlData: any = "";
+  private removeFrameClickListener?: () => void;

   constructor(
     private workflowResultService: WorkflowResultService,
-    private sanitizer: DomSanitizer
+    private sanitizer: DomSanitizer,
+    private visualTraceService: VisualTraceService,
+    private workflowActionService: WorkflowActionService
   ) {}

   ngAfterContentInit() {
@@ -79,9 +94,77 @@
     const firstDiv = doc.body.querySelector("div");
     if (firstDiv) firstDiv.style.height = "100%";

+    const bridgeScript = doc.createElement("script");
+    bridgeScript.textContent = buildVisualTraceBridgeScript();
+    doc.body.appendChild(bridgeScript);
+
     const serializer = new XMLSerializer();
     const newHtmlString = serializer.serializeToString(doc);
    this.htmlData = this.sanitizer.bypassSecurityTrustHtml(newHtmlString); // this line bypasses angular security
   }
+
+  @HostListener("window:message", ["$event"])
+  handleWindowMessage(event: MessageEvent): void {
+    if (this.visualizationFrame?.nativeElement.contentWindow && event.source !== this.visualizationFrame.nativeElement.contentWindow) {
+      return;
+    }
+    const trace = parseVisualTraceMessage(event.data);
+    if (trace) {
+      this.visualTraceService.openTrace(trace);
+      return;
+    }
+
+    const selection = parseVisualTraceSelectionMessage(event.data);
+    if (!selection || !this.operatorId) {
+      return;
+    }
+    this.openStructuralTrace(selection);
+  }
+
+  onVisualizationFrameLoad(): void {
+    this.removeFrameClickListener?.();
+
+    const frameDocument = this.visualizationFrame?.nativeElement.contentDocument;
+    if (!frameDocument) {
+      return;
+    }
+
+    const handleClick = (event: MouseEvent): void => {
+      const traceElement = findVisualTraceElement(event.target);
+      if (!traceElement) {
+        return;
+      }
+
+      const trace = parseVisualTracePayloadAttribute(traceElement.getAttribute("data-texera-trace"));
+      if (trace) {
+        this.visualTraceService.openTrace(trace);
+        return;
+      }
+
+      const selection = extractVisualTraceSelectionFromElement(traceElement);
+      if (selection && this.operatorId) {
+        this.openStructuralTrace(selection);
+      }
+    };
+
+    frameDocument.addEventListener("click", handleClick);
+    this.removeFrameClickListener = () => frameDocument.removeEventListener("click", handleClick);
+  }
+
+  private openStructuralTrace(selection: { title?: string; image?: string; imageAlt?: string }): void {
+    if (!this.operatorId) {
+      return;
+    }
+
+    const graph = this.workflowActionService.getTexeraGraph();
+    const structuralTrace = buildStructuralVisualTrace(selection, this.operatorId, {
+      hasOperator: operatorId => graph.hasOperator(operatorId),
+      getOperator: operatorId => graph.getOperator(operatorId),
+      getInputOperatorIds: operatorId => graph.getInputLinksByOperatorId(operatorId).map(link => link.source.operatorID),
+    });
+    if (structuralTrace) {
+      this.visualTraceService.openTrace(structuralTrace);
+    }
+  }
 }
diff --git a/frontend/src/app/workspace/component/workspace.component.html b/frontend/src/app/workspace/component/workspace.component.html
index c54446fb318..3d7dd8c9cb3 100644
--- a/frontend/src/app/workspace/component/workspace.component.html
+++ b/frontend/src/app/workspace/component/workspace.component.html
@@ -37,4 +37,5 @@
     *ngIf="copilotEnabled"
     [agentIdToActivate]="agentIdToActivate">
+  <texera-visual-trace-panel></texera-visual-trace-panel>
diff --git a/frontend/src/app/workspace/component/workspace.component.ts b/frontend/src/app/workspace/component/workspace.component.ts
index 9968c26f647..f3af0802ba9 100644
--- a/frontend/src/app/workspace/component/workspace.component.ts
+++ b/frontend/src/app/workspace/component/workspace.component.ts
@@ -61,6 +61,7 @@ import { LeftPanelComponent } from "./left-panel/left-panel.component";
 import { AgentPanelComponent } from "./agent/agent-panel/agent-panel.component";
 import { PropertyEditorComponent } from "./property-editor/property-editor.component";
 import { FormlyRepeatDndComponent } from "../../common/formly/repeat-dnd/repeat-dnd.component";
+import { VisualTracePanelComponent } from "./visual-trace-panel/visual-trace-panel.component";

 export const SAVE_DEBOUNCE_TIME_IN_MS = 5000;

@@ -83,6 +84,7 @@ export const SAVE_DEBOUNCE_TIME_IN_MS = 5000;
     NgIf,
     AgentPanelComponent,
     PropertyEditorComponent,
+    VisualTracePanelComponent,
     FormlyRepeatDndComponent,
   ],
 })
a/frontend/src/app/workspace/service/llm-source/llm-source.service.ts b/frontend/src/app/workspace/service/llm-source/llm-source.service.ts new file mode 100644 index 00000000000..c50647cc265 --- /dev/null +++ b/frontend/src/app/workspace/service/llm-source/llm-source.service.ts @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { HttpClient } from "@angular/common/http"; +import { Injectable } from "@angular/core"; +import { Observable } from "rxjs"; +import { AppSettings } from "../../../common/app-setting"; + +export interface LLMSourceColumn { + name: string; + type: string; +} + +export interface LLMSourceTable { + name: string; + description: string; + columns: LLMSourceColumn[]; +} + +export interface LLMSourceGenerateRequest { + fileName: string; + userHint?: string; + llmModel?: string; + previousCode?: string; + previousError?: string; +} + +export interface LLMSourceGenerateResponse { + generatedCode: string; + tables: LLMSourceTable[]; + unionColumns: LLMSourceColumn[]; + llmModel: string; + sampleHash: string; + generatedAt: string; + warnings: string[]; +} + +/** Operator type string registered in LogicalOp.scala. */ +export const LLM_FILE_SCAN_TYPE = "LLMFileScan"; + +/** + * Talks to the backend `POST /api/llm-source/generate` endpoint that powers the + * LLM-generated source operator. The endpoint reads a sample of the user's file, + * asks the LLM to write a Python parser and declare per-table schemas, validates + * the result, and returns it. Generation is design-time only — the workflow itself + * never calls the LLM at execution time. + */ +@Injectable({ + providedIn: "root", +}) +export class LLMSourceService { + constructor(private http: HttpClient) {} + + generate(request: LLMSourceGenerateRequest): Observable<LLMSourceGenerateResponse> { + return this.http.post<LLMSourceGenerateResponse>( + `${AppSettings.getApiEndpoint()}/llm-source/generate`, + request + ); + } +} diff --git a/frontend/src/app/workspace/service/smart-file-inference/smart-file-inference.service.ts b/frontend/src/app/workspace/service/smart-file-inference/smart-file-inference.service.ts new file mode 100644 index 00000000000..2c48806aec4 --- /dev/null +++ b/frontend/src/app/workspace/service/smart-file-inference/smart-file-inference.service.ts @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { HttpClient } from "@angular/common/http"; +import { Injectable } from "@angular/core"; +import { Observable } from "rxjs"; +import { AppSettings } from "../../../common/app-setting"; + +export interface SmartFileInferenceColumn { + name: string; + type: string; +} + +export interface SmartFileInferenceResponse { + detectedFormat: string; + schema: SmartFileInferenceColumn[]; + customDelimiter: string | null; + hasHeader: boolean | null; + sheetName: string | null; + availableSheetNames: string[]; + flatten: boolean | null; + isFolder: boolean; + fileCount: number; +} + +export interface SmartFileInferenceRequest { + fileName: string; + fileEncoding?: string; + formatOverride?: string; + customDelimiter?: string; + hasHeader?: boolean; + sheetName?: string; + flatten?: boolean; +} + +/** Operator type string registered in LogicalOp.scala. */ +export const SMART_FILE_SCAN_TYPE = "SmartFileScan"; + +/** + * Talks to the backend `POST /api/file-inference/preview` endpoint that backs the + * SmartFileScan operator. The endpoint runs the same inference path the operator + * uses at workflow compile time, so what the user sees in the property panel is + * exactly what the workflow will produce for either one file or one folder. + */ +@Injectable({ + providedIn: "root", +}) +export class SmartFileInferenceService { + constructor(private http: HttpClient) {} + + preview(request: SmartFileInferenceRequest): Observable<SmartFileInferenceResponse> { + return this.http.post<SmartFileInferenceResponse>( + `${AppSettings.getApiEndpoint()}/file-inference/preview`, + request + ); + } +} diff --git a/frontend/src/app/workspace/service/visual-trace/visual-trace.service.ts b/frontend/src/app/workspace/service/visual-trace/visual-trace.service.ts new file mode 100644 index 00000000000..6e8a72e2203 --- /dev/null +++ b/frontend/src/app/workspace/service/visual-trace/visual-trace.service.ts @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { Injectable } from "@angular/core"; +import { BehaviorSubject } from "rxjs"; +import { VisualTrace } from "../../types/visual-trace.interface"; + +@Injectable({ + providedIn: "root", +}) +export class VisualTraceService { + private readonly traceSubject = new BehaviorSubject<VisualTrace | undefined>(undefined); + public readonly trace$ = this.traceSubject.asObservable(); + + public openTrace(trace: VisualTrace): void { + this.traceSubject.next(trace); + } + + public closeTrace(): void { + this.traceSubject.next(undefined); + } +} + diff --git a/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.spec.ts b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.spec.ts new file mode 100644 index 00000000000..7005c898018 --- /dev/null +++ b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.spec.ts @@ -0,0 +1,224 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { + buildStructuralVisualTrace, + extractVisualTraceSelectionFromElement, + findVisualTraceElement, + parseVisualTraceMessage, + parseVisualTracePayloadAttribute, + parseVisualTraceSelectionMessage, +} from "./visual-trace.utils"; + +describe("parseVisualTraceMessage", () => { + it("accepts a valid visual trace message", () => { + expect( + parseVisualTraceMessage({ + type: "texera-visual-trace", + payload: { + title: "Charizard wins", + heroImage: "data:image/png;base64,abc", + steps: [ + { + title: "Loaded sprite", + kind: "source", + metrics: [{ label: "Rows", value: "440" }], + }, + ], + }, + }) + ).toEqual({ + title: "Charizard wins", + heroImage: "data:image/png;base64,abc", + steps: [ + { + title: "Loaded sprite", + kind: "source", + metrics: [{ label: "Rows", value: "440" }], + }, + ], + }); + }); + + it("rejects malformed or incomplete trace messages", () => { + expect(parseVisualTraceMessage(undefined)).toBeUndefined(); + expect(parseVisualTraceMessage({ type: "other", payload: {} })).toBeUndefined(); + expect(parseVisualTraceMessage({ type: "texera-visual-trace", payload: { title: "Missing steps" } })).toBeUndefined(); + expect( + parseVisualTraceMessage({ + type: "texera-visual-trace", + payload: { + title: "Bad step", + steps: [{ detail: "No title" }], + }, + }) + ).toBeUndefined(); + }); +}); + +describe("parseVisualTraceSelectionMessage", () => { + it("accepts a valid fallback selection message", () => { + expect( + parseVisualTraceSelectionMessage({ + type: "texera-visual-trace-selection", + payload: { + title: "Charizard", + image: "data:image/png;base64,abc", + imageAlt: "Charizard sprite", + }, + }) + ).toEqual({ + title: "Charizard", + image: "data:image/png;base64,abc", + imageAlt: "Charizard sprite", + }); + }); + + it("rejects malformed selection messages", () => { 
expect(parseVisualTraceSelectionMessage(undefined)).toBeUndefined(); + expect(parseVisualTraceSelectionMessage({ type: "other", payload: {} })).toBeUndefined(); + expect(parseVisualTraceSelectionMessage({ type: "texera-visual-trace-selection", payload: {} })).toBeUndefined(); + }); +}); + +describe("buildStructuralVisualTrace", () => { + it("builds an upstream workflow journey when a visualization only reports the clicked image", () => { + const operators = { + source: { operatorID: "source", operatorType: "Smart Source", customDisplayName: "Pokemon Images" }, + udf: { operatorID: "udf", operatorType: "Python UDF", customDisplayName: "Map sprites" }, + visualizer: { operatorID: "visualizer", operatorType: "HTML Visualizer" }, + }; + const inputs = { + source: [], + udf: ["source"], + visualizer: ["udf"], + }; + + expect( + buildStructuralVisualTrace( + { title: "Charizard", image: "data:image/png;base64,abc", imageAlt: "Charizard sprite" }, + "visualizer", + { + hasOperator: (operatorId: string) => operatorId in operators, + getOperator: (operatorId: string) => operators[operatorId as keyof typeof operators], + getInputOperatorIds: (operatorId: string) => inputs[operatorId as keyof typeof inputs], + } + ) + ).toEqual({ + title: "Charizard", + subtitle: "Workflow path to HTML Visualizer", + summary: + "Auto-built from the upstream workflow graph. Add a trace payload in the visualization for row-level details.", + heroImage: "data:image/png;base64,abc", + heroImageAlt: "Charizard sprite", + heroMetric: { label: "Steps", value: "3" }, + steps: [ + { + title: "Pokemon Images", + operatorId: "source", + operatorLabel: "Pokemon Images", + kind: "source", + }, + { + title: "Map sprites", + operatorId: "udf", + operatorLabel: "Map sprites", + kind: "compute", + }, + { + title: "HTML Visualizer", + operatorId: "visualizer", + operatorLabel: "HTML Visualizer", + kind: "render", + image: "data:image/png;base64,abc", + imageAlt: "Charizard sprite", + }, + ], + }); + }); + + it("returns undefined when the visualizer operator is missing", () => { + expect( + buildStructuralVisualTrace( + { title: "Charizard", image: "data:image/png;base64,abc" }, + "missing", + { + hasOperator: () => false, + getOperator: () => { + throw new Error("should not be called"); + }, + getInputOperatorIds: () => [], + } + ) + ).toBeUndefined(); + }); +}); + +describe("visual trace DOM helpers", () => { + it("reads a rich trace payload from an element attribute", () => { + expect( + parseVisualTracePayloadAttribute( + JSON.stringify({ + title: "Charizard wins", + steps: [{ title: "Rendered card" }], + }) + ) + ).toEqual({ + title: "Charizard wins", + steps: [{ title: "Rendered card" }], + }); + }); + + it("finds an image-bearing ancestor and extracts a fallback selection", () => { + const card = document.createElement("div"); + card.className = "pokemon-side"; + card.innerHTML = ` +
<div class="winner-badge">WINNER</div>
+ <img src="data:image/png;base64,abc" alt="Charizard" />
+ <div class="pokemon-name">Charizard</div>
+ `; + const badge = card.querySelector(".winner-badge"); + expect(badge).not.toBeNull(); + const traceElement = findVisualTraceElement(badge); + + expect(traceElement).toBe(card); + expect(extractVisualTraceSelectionFromElement(traceElement as Element)).toEqual({ + title: "Charizard", + image: "data:image/png;base64,abc", + imageAlt: "Charizard", + }); + }); + + it("accepts element-like click targets from iframe documents", () => { + const frame = document.createElement("iframe"); + document.body.appendChild(frame); + const frameDocument = frame.contentDocument as Document; + const card = frameDocument.createElement("div"); + card.innerHTML = ` +
<div class="winner-badge">WINNER</div>
+ <img src="data:image/png;base64,abc" alt="Charizard" /> + `; + frameDocument.body.appendChild(card); + + const badge = card.querySelector(".winner-badge"); + expect(findVisualTraceElement(badge)).toBe(card); + + frame.remove(); + }); +}); diff --git a/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.ts b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.ts new file mode 100644 index 00000000000..d94bde9723e --- /dev/null +++ b/frontend/src/app/workspace/service/visual-trace/visual-trace.utils.ts @@ -0,0 +1,293 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { + VisualTrace, + VisualTraceMetric, + VisualTraceSelection, + VisualTraceStep, + VisualTraceStepKind, +} from "../../types/visual-trace.interface"; + +const TRACE_MESSAGE_TYPE = "texera-visual-trace"; +const TRACE_SELECTION_MESSAGE_TYPE = "texera-visual-trace-selection"; +const VALID_STEP_KINDS = new Set<VisualTraceStepKind>(["source", "match", "compute", "render"]); + +export interface VisualTraceGraphOperator { + operatorID: string; + operatorType: string; + customDisplayName?: string; +} + +export interface VisualTraceGraphReader { + hasOperator(operatorId: string): boolean; + getOperator(operatorId: string): VisualTraceGraphOperator; + getInputOperatorIds(operatorId: string): string[]; +} + +function isRecord(value: unknown): value is Record<string, unknown> { + return typeof value === "object" && value !== null; +} + +function isElementLike(value: EventTarget | null): value is Element { + return ( + typeof value === "object" && + value !== null && + "nodeType" in value && + value.nodeType === 1 && + "matches" in value && + typeof value.matches === "function" && + "querySelector" in value && + typeof value.querySelector === "function" + ); +} + +function parseMetric(value: unknown): VisualTraceMetric | undefined { + if (!isRecord(value) || typeof value.label !== "string" || typeof value.value !== "string") { + return undefined; + } + return { + label: value.label, + value: value.value, + }; +} + +function parseStep(value: unknown): VisualTraceStep | undefined { + if (!isRecord(value) || typeof value.title !== "string") { + return undefined; + } + + const kind: VisualTraceStepKind | undefined = + typeof value.kind === "string" && VALID_STEP_KINDS.has(value.kind as VisualTraceStepKind) + ? (value.kind as VisualTraceStepKind) + : undefined; + const metrics = Array.isArray(value.metrics) ? value.metrics.map(parseMetric).filter(Boolean) : undefined; + + return { + title: value.title, + detail: typeof value.detail === "string" ? value.detail : undefined, + operatorId: typeof value.operatorId === "string" ? value.operatorId : undefined, + operatorLabel: typeof value.operatorLabel === "string" ? value.operatorLabel : undefined, + image: typeof value.image === "string" ? 
value.image : undefined, + imageAlt: typeof value.imageAlt === "string" ? value.imageAlt : undefined, + kind, + metrics: metrics as VisualTraceMetric[] | undefined, + }; +} + +export function parseVisualTraceMessage(message: unknown): VisualTrace | undefined { + if (!isRecord(message) || message.type !== TRACE_MESSAGE_TYPE || !isRecord(message.payload)) { + return undefined; + } + + const payload = message.payload; + if (typeof payload.title !== "string" || !Array.isArray(payload.steps)) { + return undefined; + } + + const steps = payload.steps.map(parseStep); + if (steps.length === 0 || steps.some(step => step === undefined)) { + return undefined; + } + + return { + title: payload.title, + subtitle: typeof payload.subtitle === "string" ? payload.subtitle : undefined, + summary: typeof payload.summary === "string" ? payload.summary : undefined, + heroImage: typeof payload.heroImage === "string" ? payload.heroImage : undefined, + heroImageAlt: typeof payload.heroImageAlt === "string" ? payload.heroImageAlt : undefined, + heroMetric: parseMetric(payload.heroMetric), + steps: steps as VisualTraceStep[], + }; +} + +export function parseVisualTraceSelectionMessage(message: unknown): VisualTraceSelection | undefined { + if (!isRecord(message) || message.type !== TRACE_SELECTION_MESSAGE_TYPE || !isRecord(message.payload)) { + return undefined; + } + + const payload = message.payload; + const selection = { + title: typeof payload.title === "string" ? payload.title : undefined, + image: typeof payload.image === "string" ? payload.image : undefined, + imageAlt: typeof payload.imageAlt === "string" ? payload.imageAlt : undefined, + }; + + return selection.title || selection.image ? selection : undefined; +} + +export function buildStructuralVisualTrace( + selection: VisualTraceSelection, + targetOperatorId: string, + graph: VisualTraceGraphReader +): VisualTrace | undefined { + if (!graph.hasOperator(targetOperatorId)) { + return undefined; + } + + const visited = new Set(); + const operatorIds: string[] = []; + const visit = (operatorId: string): void => { + if (visited.has(operatorId) || !graph.hasOperator(operatorId)) { + return; + } + visited.add(operatorId); + graph.getInputOperatorIds(operatorId).forEach(visit); + operatorIds.push(operatorId); + }; + visit(targetOperatorId); + + const targetOperator = graph.getOperator(targetOperatorId); + const targetLabel = targetOperator.customDisplayName ?? targetOperator.operatorType; + const steps = operatorIds.map(operatorId => { + const operator = graph.getOperator(operatorId); + const operatorLabel = operator.customDisplayName ?? operator.operatorType; + const inputIds = graph.getInputOperatorIds(operatorId); + const kind: VisualTraceStepKind = + operatorId === targetOperatorId ? "render" : inputIds.length === 0 ? "source" : "compute"; + + return { + title: operatorLabel, + operatorId, + operatorLabel, + kind, + image: operatorId === targetOperatorId ? selection.image : undefined, + imageAlt: operatorId === targetOperatorId ? selection.imageAlt : undefined, + }; + }); + + return { + title: selection.title ?? "Selected result", + subtitle: `Workflow path to ${targetLabel}`, + summary: "Auto-built from the upstream workflow graph. 
Add a trace payload in the visualization for row-level details.", + heroImage: selection.image, + heroImageAlt: selection.imageAlt, + heroMetric: { + label: "Steps", + value: String(steps.length), + }, + steps, + }; +} + +export function parseVisualTracePayloadAttribute(value: string | null): VisualTrace | undefined { + if (!value) { + return undefined; + } + try { + return parseVisualTraceMessage({ + type: TRACE_MESSAGE_TYPE, + payload: JSON.parse(value), + }); + } catch { + return undefined; + } +} + +export function findVisualTraceElement(target: EventTarget | null): Element | undefined { + let element = isElementLike(target) ? target : undefined; + while (element && element !== document.body) { + if (element.hasAttribute("data-texera-trace") || element.matches("img") || element.querySelector("img")) { + return element; + } + element = element.parentElement ?? undefined; + } + return undefined; +} + +export function extractVisualTraceSelectionFromElement(element: Element): VisualTraceSelection | undefined { + const image = element.matches("img") ? element : element.querySelector("img"); + if (!image || image.tagName !== "IMG") { + return undefined; + } + const titleElement = element.querySelector("[data-texera-trace-title], .pokemon-name"); + const imageAlt = image.getAttribute("alt") ?? undefined; + const title = titleElement?.textContent?.trim() || imageAlt || undefined; + const selection = { + title, + image: image.getAttribute("src") ?? undefined, + imageAlt: imageAlt || title, + }; + return selection.title || selection.image ? selection : undefined; +} + +export function buildVisualTraceBridgeScript(): string { + return ` +(() => { + const TRACE_MESSAGE_TYPE = "texera-visual-trace"; + const TRACE_SELECTION_MESSAGE_TYPE = "texera-visual-trace-selection"; + const emitTrace = payload => window.parent.postMessage({ type: TRACE_MESSAGE_TYPE, payload }, "*"); + const emitSelection = payload => window.parent.postMessage({ type: TRACE_SELECTION_MESSAGE_TYPE, payload }, "*"); + const parseTrace = value => { + try { + return JSON.parse(value); + } catch { + return undefined; + } + }; + + const findFallbackElement = target => { + let element = target instanceof Element ? target : null; + while (element && element !== document.body) { + if (element.hasAttribute("data-texera-trace")) { + return element; + } + if (element.matches("img") || element.querySelector("img")) { + return element; + } + element = element.parentElement; + } + return null; + }; + + const buildFallbackSelection = element => { + const image = element.matches("img") ? 
element : element.querySelector("img"); + if (!image) { + return undefined; + } + const titleElement = element.querySelector("[data-texera-trace-title], .pokemon-name"); + const title = titleElement?.textContent?.trim() || image.getAttribute("alt") || undefined; + return { + title, + image: image.getAttribute("src") || undefined, + imageAlt: image.getAttribute("alt") || title, + }; + }; + + document.addEventListener("click", event => { + const element = findFallbackElement(event.target); + if (!element) { + return; + } + const payload = parseTrace(element.getAttribute("data-texera-trace")); + if (payload) { + emitTrace(payload); + return; + } + const selection = buildFallbackSelection(element); + if (selection) { + emitSelection(selection); + } + }); + + window.texera = window.texera || {}; + window.texera.showTrace = emitTrace; +})(); +`; +} diff --git a/frontend/src/app/workspace/types/visual-trace.interface.ts b/frontend/src/app/workspace/types/visual-trace.interface.ts new file mode 100644 index 00000000000..0af072ec34f --- /dev/null +++ b/frontend/src/app/workspace/types/visual-trace.interface.ts @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +export type VisualTraceStepKind = "source" | "match" | "compute" | "render"; + +export interface VisualTraceMetric { + label: string; + value: string; +} + +export interface VisualTraceStep { + title: string; + detail?: string; + operatorId?: string; + operatorLabel?: string; + image?: string; + imageAlt?: string; + kind?: VisualTraceStepKind; + metrics?: VisualTraceMetric[]; +} + +export interface VisualTrace { + title: string; + subtitle?: string; + summary?: string; + heroImage?: string; + heroImageAlt?: string; + heroMetric?: VisualTraceMetric; + steps: VisualTraceStep[]; +} + +export interface VisualTraceSelection { + title?: string; + image?: string; + imageAlt?: string; +} diff --git a/frontend/src/assets/operator_images/FileSplit.png b/frontend/src/assets/operator_images/FileSplit.png new file mode 100644 index 00000000000..f6e57404cbe Binary files /dev/null and b/frontend/src/assets/operator_images/FileSplit.png differ diff --git a/frontend/src/assets/operator_images/LLMFileScan.png b/frontend/src/assets/operator_images/LLMFileScan.png new file mode 100644 index 00000000000..b1266bd6656 Binary files /dev/null and b/frontend/src/assets/operator_images/LLMFileScan.png differ diff --git a/frontend/src/assets/operator_images/SmartFileScan.png b/frontend/src/assets/operator_images/SmartFileScan.png new file mode 100644 index 00000000000..b1266bd6656 Binary files /dev/null and b/frontend/src/assets/operator_images/SmartFileScan.png differ
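
Note: the bridge script injected into each visualization iframe (buildVisualTraceBridgeScript above) gives generated HTML two ways to opt into rich traces. A minimal sketch, assuming the payload shape declared in visual-trace.interface.ts; the ".pokemon-side" selector and all concrete values below are illustrative, not part of this change:

// Runs inside the visualization iframe, after the injected bridge script.
const trace = {
  title: "Charizard wins",
  heroImage: "data:image/png;base64,abc",
  heroMetric: { label: "Rows", value: "440" },
  steps: [
    { title: "Loaded sprite", kind: "source", metrics: [{ label: "Rows", value: "440" }] },
    { title: "Rendered card", kind: "render" },
  ],
};

// Option 1: attach the payload to a clickable element. The bridge's click
// handler parses data-texera-trace and posts a "texera-visual-trace" message
// to the parent frame, which VisualizationFrameContentComponent picks up.
document.querySelector(".pokemon-side")?.setAttribute("data-texera-trace", JSON.stringify(trace));

// Option 2: open the trace panel programmatically via the helper the bridge
// installs on window.
(window as any).texera?.showTrace(trace);

Elements that carry no payload still work: clicking any image-bearing element emits a "texera-visual-trace-selection" fallback message, which the parent converts into a structural trace with buildStructuralVisualTrace.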
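
A property-panel caller of the new preview endpoint would look roughly as follows. This is a hypothetical demo component shown only to illustrate the request/response flow; only SmartFileInferenceService and its types come from this diff:

import { Component } from "@angular/core";
import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy";
import { SmartFileInferenceService } from "./smart-file-inference.service";

// Hypothetical component; selector and behavior are illustrative only.
@UntilDestroy()
@Component({ selector: "texera-smart-file-preview-demo", template: "" })
export class SmartFilePreviewDemoComponent {
  constructor(private smartFileInference: SmartFileInferenceService) {}

  previewFile(fileName: string): void {
    this.smartFileInference
      .preview({ fileName })
      .pipe(untilDestroyed(this))
      .subscribe(response => {
        // The preview runs the same inference path SmartFileScan uses at
        // compile time, so these values match what the workflow will produce.
        console.log(response.detectedFormat, response.schema, response.availableSheetNames);
      });
  }
}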
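
LLMSourceService's optional previousCode/previousError request fields support a regenerate loop: when a generated parser fails validation or execution, the failing code and error can be sent back so the LLM repairs its own output. A sketch of the request plumbing, with the helper name and error wiring as illustrative assumptions:

import { LLMSourceGenerateRequest, LLMSourceGenerateResponse } from "./llm-source.service";

// Build a follow-up request that carries the failed attempt back to the LLM.
function buildRetryRequest(
  original: LLMSourceGenerateRequest,
  failed: LLMSourceGenerateResponse,
  error: string
): LLMSourceGenerateRequest {
  return {
    ...original,
    previousCode: failed.generatedCode,
    previousError: error,
  };
}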