diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 2971e4c4f4e..6bbc56ecc9d 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -573,7 +573,14 @@ class RegionExecutionCoordinator( val schemaOptional = region.getOperator(outputPortId.opId).outputPorts(outputPortId.portId)._3 val schema = - schemaOptional.getOrElse(throw new IllegalStateException("Schema is missing")) + schemaOptional.getOrElse( + throw new IllegalStateException( + s"Schema is missing for output port: opId=${outputPortId.opId.logicalOpId.id} " + + s"layer=${outputPortId.opId.layerName} " + + s"portId=${outputPortId.portId} " + + s"isInternal=${outputPortId.portId.internal}" + ) + ) DocumentFactory.createDocument(storageUriToAdd, schema) if (!isRestart) { WorkflowExecutionsResource.insertOperatorPortResultUri( diff --git a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala index deb753beb37..a6a47f4c3cc 100644 --- a/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala +++ b/amber/src/main/scala/org/apache/texera/amber/engine/architecture/scheduling/WorkflowExecutionCoordinator.scala @@ -67,7 +67,14 @@ class WorkflowExecutionCoordinator( regionExecutionCoordinators.values.filter(!_.isCompleted).toSeq // Trigger sync for each unfinished region. 
- unfinishedRegionCoordinators.foreach(_.syncStatusAndTransitionRegionExecutionPhase()) + // IMPORTANT: capture the sync futures so any exception thrown during phase + // transition (e.g. "Schema is missing" in createOutputPortStorageObjects) + // propagates out as a Future.exception. Previously `.foreach(...)` swallowed + // the returned Future, which meant phase-transition failures were + // discarded and the region appeared to hang silently instead of failing + // with a FatalError visible in the client UI. + val syncFutures = + unfinishedRegionCoordinators.map(_.syncStatusAndTransitionRegionExecutionPhase()) // Wait only for region termination futures (kill path), then re-run coordination. val terminationFutures = unfinishedRegionCoordinators.flatMap(_.getTerminationFutureOpt) @@ -80,7 +87,12 @@ class WorkflowExecutionCoordinator( if (regionExecutionCoordinators.values.exists(!_.isCompleted)) { // Some regions are still not completed yet. Cannot start the new regions. - return Future.Unit + // But before returning success, wait on the syncFutures so any + // transition-phase failure (e.g. "Schema is missing") makes it out + // of this method as a Future.exception — PortCompletedHandler's + // .onFailure handler will then turn it into a FatalError on the + // client. Without this, the failure was being swallowed in foreach. + return Future.collect(syncFutures).unit } // All existing regions are completed. Start the next region (if any). 
diff --git a/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala b/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala index 98b7c68c974..a02d4a16ed6 100644 --- a/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala +++ b/amber/src/main/scala/org/apache/texera/web/TexeraWebApplication.scala @@ -47,6 +47,7 @@ import org.apache.texera.web.resource.dashboard.user.project.{ } import org.apache.texera.web.resource.dashboard.user.quota.UserQuotaResource import org.apache.texera.web.resource.dashboard.user.workflow.{ + MacroResource, WorkflowAccessResource, WorkflowExecutionsResource, WorkflowResource, @@ -148,6 +149,7 @@ class TexeraWebApplication environment.jersey.register(classOf[PublicProjectResource]) environment.jersey.register(classOf[WorkflowAccessResource]) environment.jersey.register(classOf[WorkflowResource]) + environment.jersey.register(classOf[MacroResource]) environment.jersey.register(classOf[HubResource]) environment.jersey.register(classOf[UserResource]) environment.jersey.register(classOf[WorkflowVersionResource]) diff --git a/amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala index d3047db5802..b9756407204 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/SyncExecutionResource.scala @@ -50,6 +50,7 @@ import org.apache.texera.dao.SqlServer import org.apache.texera.dao.jooq.generated.Tables.OPERATOR_EXECUTIONS import org.apache.texera.web.model.websocket.request.{LogicalPlanPojo, WorkflowExecuteRequest} import org.apache.texera.workflow.{LogicalLink, WorkflowCompiler} +import org.apache.texera.workflow.macroOp.DbMacroRegistry import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowExecutionsResource import org.apache.texera.web.service.{ExecutionResultService, WorkflowService} 
import org.apache.texera.web.storage.ExecutionStateStore.updateWorkflowState @@ -894,7 +895,7 @@ class SyncExecutionResource extends LazyLogging { ): Map[String, String] = { try { val tempContext = new WorkflowContext(WorkflowIdentity(workflowId)) - val compiler = new WorkflowCompiler(tempContext) + val compiler = new WorkflowCompiler(tempContext, new DbMacroRegistry()) compiler.compile(logicalPlan) Map.empty } catch { diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala index c4cb9ee3cbe..40602e52f19 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/hub/HubResource.scala @@ -262,7 +262,7 @@ object HubResource { } val records = baseWorkflowSelect() - .where(WORKFLOW.WID.in(wids: _*)) + .and(WORKFLOW.WID.in(wids: _*)) .groupBy( WORKFLOW.WID, WORKFLOW.NAME, diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/MacroResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/MacroResource.scala new file mode 100644 index 00000000000..a2d58f52335 --- /dev/null +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/MacroResource.scala @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.web.resource.dashboard.user.workflow + +import com.fasterxml.jackson.databind.{JsonNode, ObjectMapper} +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.typesafe.scalalogging.LazyLogging +import io.dropwizard.auth.Auth +import org.apache.texera.amber.operator.macroOp.MacroPortSpec +import org.apache.texera.auth.SessionUser +import org.apache.texera.dao.SqlServer +import org.apache.texera.dao.jooq.generated.Tables._ +import org.apache.texera.dao.jooq.generated.enums.{PrivilegeEnum, WorkflowKindEnum} +import org.apache.texera.dao.jooq.generated.tables.daos.{ + MacroMetadataDao, + WorkflowDao, + WorkflowOfUserDao, + WorkflowUserAccessDao +} +import org.apache.texera.dao.jooq.generated.tables.pojos.{ + MacroMetadata, + Workflow, + WorkflowOfUser, + WorkflowUserAccess +} +import org.apache.texera.web.resource.dashboard.user.workflow.MacroResource._ +import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowAccessResource.{ + hasReadAccess, + hasWriteAccess +} +import org.jooq.{DSLContext, JSONB} + +import java.sql.Timestamp +import javax.annotation.security.RolesAllowed +import javax.ws.rs._ +import javax.ws.rs.core.MediaType +import scala.jdk.CollectionConverters._ + +/** + * REST endpoints for macro definitions. A macro is persisted as a `workflow` + * row with `kind = MACRO` plus a side row in `macro_metadata` carrying the + * denormalized port / parameter / palette-display fields. 
/**
 * Shared accessors, wire models and JSONB helpers for the `/macro` REST
 * endpoints.
 *
 * A macro is persisted as a `workflow` row with `kind = MACRO` plus a side row
 * in `macro_metadata` carrying the port spec, parameter spec, category and
 * icon. Macros reuse the workflow ACL machinery (`workflow_user_access`), so
 * the standard `WorkflowAccessResource.hasReadAccess` / `hasWriteAccess`
 * checks are applied unchanged here.
 */
object MacroResource {

  // Every accessor below builds a fresh DSLContext (and a DAO bound to it) on each call.
  private def context: DSLContext = SqlServer.getInstance().createDSLContext()
  private def workflowDao = new WorkflowDao(context.configuration)
  private def workflowOfUserDao = new WorkflowOfUserDao(context.configuration)
  private def workflowUserAccessDao = new WorkflowUserAccessDao(context.configuration)
  private def macroMetadataDao = new MacroMetadataDao(context.configuration)

  // Local mapper for the JSONB columns. The Scala module lets PortSpec and
  // MacroPortSpec round-trip as case classes without extra annotations.
  private val mapper: ObjectMapper = new ObjectMapper().registerModule(DefaultScalaModule)

  // Matches `"macroId":"<digits>"` inside serialized workflow content, tolerating
  // whitespace around the colon. Compiled once instead of on every usage scan.
  private val MacroIdPattern = """"macroId"\s*:\s*"(\d+)"""".r

  /** Request body for `POST /macro/create`. */
  case class MacroCreateRequest(
      name: String,
      description: Option[String] = None,
      content: String,
      isPublic: Boolean = false,
      portSpec: PortSpec,
      paramSpec: Option[JsonNode] = None,
      category: Option[String] = None,
      icon: Option[String] = None
  )

  /** Declared external boundary of a macro: its input and output ports. */
  case class PortSpec(
      inputs: List[MacroPortSpec] = Nil,
      outputs: List[MacroPortSpec] = Nil
  )

  /** Full response for `POST /macro/create` and `GET /macro/{wid}`. */
  case class MacroDetail(
      wid: Integer,
      name: String,
      description: String,
      content: String,
      creationTime: Timestamp,
      lastModifiedTime: Timestamp,
      isPublic: Boolean,
      portSpec: PortSpec,
      paramSpec: JsonNode,
      category: Option[String],
      icon: Option[String],
      isOwner: Boolean,
      readonly: Boolean
  )

  /**
   * Lightweight row for `GET /macro/list`. `content` is intentionally omitted
   * so listing does not pull the serialized macro bodies over the wire.
   *
   * `usageCount` is the number of distinct non-macro workflows accessible to
   * the requesting user whose `content` references this macro via
   * `"macroId":"<wid>"`.
   */
  case class MacroSummary(
      wid: Integer,
      name: String,
      description: String,
      lastModifiedTime: Timestamp,
      portSpec: PortSpec,
      category: Option[String],
      icon: Option[String],
      usageCount: Int
  )

  /**
   * Schema returned by `GET /macro/{wid}/schema`. The endpoint currently
   * fills in the port spec only and always returns an empty `params` list.
   */
  case class MacroSchema(
      inputs: List[MacroPortSpec],
      outputs: List[MacroPortSpec],
      params: List[JsonNode]
  )

  /** Serializes `value` with the local mapper and wraps the JSON text as JSONB. */
  private def jsonbOf[T](value: T): JSONB =
    JSONB.valueOf(mapper.writeValueAsString(value))

  /** JSONB form of an already-parsed JSON tree; same serialization path as [[jsonbOf]]. */
  private def jsonbOfNode(node: JsonNode): JSONB =
    jsonbOf(node)

  /** Decodes a `port_spec` column; a null column yields an empty [[PortSpec]]. */
  private def parsePortSpec(jsonb: JSONB): PortSpec =
    Option(jsonb)
      .map(j => mapper.readValue(j.data(), classOf[PortSpec]))
      .getOrElse(PortSpec())

  /** Decodes a `param_spec` column; a null column yields an empty JSON array. */
  private def parseParamSpec(jsonb: JSONB): JsonNode =
    Option(jsonb)
      .map(j => mapper.readTree(j.data()))
      .getOrElse(mapper.createArrayNode())
}

/**
 * REST endpoints under `/macro` for creating, listing and reading macro
 * definitions. All endpoints require the `REGULAR` or `ADMIN` role.
 */
@Produces(Array(MediaType.APPLICATION_JSON))
@Path("/macro")
class MacroResource extends LazyLogging {

  /**
   * Creates a macro owned by the calling user.
   *
   * Inserts the `workflow` row (kind `MACRO`), the ownership row, a `WRITE`
   * access grant for the creator, an initial version, and the
   * `macro_metadata` row, then returns the stored macro.
   *
   * NOTE(review): the inserts go through separately created DAO contexts and
   * are not wrapped in one transaction here, so a failure part-way may leave
   * earlier rows behind — confirm whether that is acceptable.
   *
   * @param req         macro definition supplied by the client
   * @param sessionUser authenticated caller; becomes owner and writer
   * @return the newly created macro, flagged as owned and writable
   */
  @POST
  @Consumes(Array(MediaType.APPLICATION_JSON))
  @RolesAllowed(Array("REGULAR", "ADMIN"))
  @Path("/create")
  def create(req: MacroCreateRequest, @Auth sessionUser: SessionUser): MacroDetail = {
    val user = sessionUser.getUser

    val workflow = new Workflow()
    workflow.setName(req.name)
    workflow.setDescription(req.description.orNull)
    workflow.setContent(req.content)
    workflow.setIsPublic(req.isPublic)
    workflow.setKind(WorkflowKindEnum.MACRO)
    workflowDao.insert(workflow)

    workflowOfUserDao.insert(new WorkflowOfUser(user.getUid, workflow.getWid))
    workflowUserAccessDao.insert(
      new WorkflowUserAccess(user.getUid, workflow.getWid, PrivilegeEnum.WRITE)
    )

    // Seed v1 of the macro so LIVE-mode instances can pin to a concrete vid.
    WorkflowVersionResource.insertVersion(workflow, insertingNewWorkflow = true)

    val metadata = new MacroMetadata(
      workflow.getWid,
      jsonbOf(req.portSpec),
      jsonbOfNode(req.paramSpec.getOrElse(mapper.createArrayNode())),
      req.category.orNull,
      req.icon.orNull
    )
    macroMetadataDao.insert(metadata)

    // Re-read the workflow row rather than reusing the in-memory pojo.
    toDetail(
      workflowDao.fetchOneByWid(workflow.getWid),
      metadata,
      isOwner = true,
      readonly = false
    )
  }

  /**
   * Lists every macro for which the caller has a `workflow_user_access` row.
   *
   * `macro_metadata` is left-joined, so a macro without a metadata row is
   * still listed with an empty port spec and no category / icon.
   *
   * @param sessionUser authenticated caller
   * @return one summary per accessible macro, including its usage count
   */
  @GET
  @RolesAllowed(Array("REGULAR", "ADMIN"))
  @Path("/list")
  def list(@Auth sessionUser: SessionUser): List[MacroSummary] = {
    val uid = sessionUser.getUser.getUid
    val rows = context
      .selectDistinct(
        WORKFLOW.WID,
        WORKFLOW.NAME,
        WORKFLOW.DESCRIPTION,
        WORKFLOW.LAST_MODIFIED_TIME,
        MACRO_METADATA.PORT_SPEC,
        MACRO_METADATA.CATEGORY,
        MACRO_METADATA.ICON
      )
      .from(WORKFLOW)
      .join(WORKFLOW_USER_ACCESS)
      .on(WORKFLOW_USER_ACCESS.WID.eq(WORKFLOW.WID))
      .leftJoin(MACRO_METADATA)
      .on(MACRO_METADATA.WID.eq(WORKFLOW.WID))
      .where(WORKFLOW.KIND.eq(WorkflowKindEnum.MACRO))
      .and(WORKFLOW_USER_ACCESS.UID.eq(uid))
      .fetch()

    val usageMap = computeMacroUsage(uid)
    rows.asScala.map { r =>
      MacroSummary(
        r.value1(),
        r.value2(),
        r.value3(),
        r.value4(),
        parsePortSpec(r.value5()),
        Option(r.value6()),
        Option(r.value7()),
        usageMap.getOrElse(r.value1().intValue(), 0)
      )
    }.toList
  }

  /**
   * Counts, per referenced macro id, how many of the user's accessible
   * non-macro workflows embed that id in their `content` JSON as
   * `"macroId":"<wid>"`.
   *
   * A workflow contributes at most +1 per distinct macro id, no matter how
   * often the id occurs in it. All contents are fetched in one query and
   * scanned in memory, so the cost grows with the number of workflows times
   * their content length.
   *
   * Digit strings that do not fit in an `Int` are skipped instead of
   * throwing: they cannot equal any `wid`, and a single odd workflow must not
   * break the whole listing.
   *
   * @param uid user whose accessible workflows are scanned
   * @return map from macro wid to the number of workflows referencing it
   */
  private def computeMacroUsage(uid: Integer): Map[Int, Int] = {
    val contents = context
      .selectDistinct(WORKFLOW.WID, WORKFLOW.CONTENT)
      .from(WORKFLOW)
      .join(WORKFLOW_USER_ACCESS)
      .on(WORKFLOW_USER_ACCESS.WID.eq(WORKFLOW.WID))
      .where(WORKFLOW.KIND.ne(WorkflowKindEnum.MACRO))
      .and(WORKFLOW_USER_ACCESS.UID.eq(uid))
      .fetch()

    contents.asScala.iterator
      .flatMap(r => Option(r.value2())) // rows with null content reference nothing
      .flatMap { content =>
        // Collapse to a set so repeated references inside one workflow count once.
        MacroIdPattern.findAllMatchIn(content).flatMap(_.group(1).toIntOption).toSet
      }
      .toSeq
      .groupMapReduce(identity)(_ => 1)(_ + _)
  }

  /**
   * Returns the full definition of one macro.
   *
   * @param wid         workflow id of the macro
   * @param sessionUser authenticated caller
   * @return the macro with ownership / read-only flags for the caller
   * @throws ForbiddenException if the caller lacks read access
   * @throws NotFoundException  if no `MACRO` workflow or no metadata row exists for `wid`
   */
  @GET
  @RolesAllowed(Array("REGULAR", "ADMIN"))
  @Path("/{wid}")
  def get(@PathParam("wid") wid: Integer, @Auth sessionUser: SessionUser): MacroDetail = {
    val uid = sessionUser.getUser.getUid
    requireReadAccess(wid, uid)
    val workflow = fetchMacroWorkflow(wid)
    val metadata = fetchMacroMetadata(wid)
    toDetail(workflow, metadata, isOwner = isOwner(wid, uid), readonly = !hasWriteAccess(wid, uid))
  }

  /**
   * Returns the declared input / output ports of a macro. `params` is always
   * empty in this implementation.
   *
   * @param wid         workflow id of the macro
   * @param sessionUser authenticated caller
   * @throws ForbiddenException if the caller lacks read access
   * @throws NotFoundException  if no metadata row exists for `wid`
   */
  @GET
  @RolesAllowed(Array("REGULAR", "ADMIN"))
  @Path("/{wid}/schema")
  def schema(
      @PathParam("wid") wid: Integer,
      @Auth sessionUser: SessionUser
  ): MacroSchema = {
    val uid = sessionUser.getUser.getUid
    requireReadAccess(wid, uid)
    val ports = parsePortSpec(fetchMacroMetadata(wid).getPortSpec)
    MacroSchema(ports.inputs, ports.outputs, params = Nil)
  }

  /**
   * Returns the macro's serialized body (`workflow.content`). Intended for
   * the frontend to inline it into a parent workflow as a SNAPSHOT instance
   * (`MacroOpDesc.linkMode = SNAPSHOT`), detaching that instance from later
   * edits to the macro definition.
   *
   * @param wid         workflow id of the macro
   * @param sessionUser authenticated caller
   * @throws ForbiddenException if the caller lacks read access
   * @throws NotFoundException  if `wid` is not a `MACRO` workflow or its content is null
   */
  @POST
  @RolesAllowed(Array("REGULAR", "ADMIN"))
  @Path("/{wid}/snapshot-into-instance")
  def snapshotIntoInstance(
      @PathParam("wid") wid: Integer,
      @Auth sessionUser: SessionUser
  ): String = {
    val uid = sessionUser.getUser.getUid
    requireReadAccess(wid, uid)
    Option(workflowDao.fetchOneByWid(wid))
      .filter(_.getKind == WorkflowKindEnum.MACRO)
      .map(_.getContent)
      .getOrElse(throw new NotFoundException(s"Macro $wid not found"))
  }

  /** Rejects the request with 403 unless `uid` has read access to `wid`. */
  private def requireReadAccess(wid: Integer, uid: Integer): Unit =
    if (!hasReadAccess(wid, uid)) {
      throw new ForbiddenException("No sufficient access privilege.")
    }

  /** Loads the workflow row for `wid`, failing with 404 unless it exists and is a macro. */
  private def fetchMacroWorkflow(wid: Integer): Workflow =
    Option(workflowDao.fetchOneByWid(wid))
      .filter(_.getKind == WorkflowKindEnum.MACRO)
      .getOrElse(throw new NotFoundException(s"Macro $wid not found"))

  /** Loads the `macro_metadata` row for `wid`, failing with 404 when it is absent. */
  private def fetchMacroMetadata(wid: Integer): MacroMetadata =
    Option(macroMetadataDao.fetchOneByWid(wid))
      .getOrElse(throw new NotFoundException(s"Macro $wid metadata missing"))

  /** True when a `workflow_of_user` row links `uid` to `wid`. */
  private def isOwner(wid: Integer, uid: Integer): Boolean =
    context
      .selectCount()
      .from(WORKFLOW_OF_USER)
      .where(WORKFLOW_OF_USER.WID.eq(wid).and(WORKFLOW_OF_USER.UID.eq(uid)))
      .fetchOne(0, classOf[Integer]) > 0

  /** Assembles the wire-level [[MacroDetail]] from the workflow and metadata rows. */
  private def toDetail(
      workflow: Workflow,
      metadata: MacroMetadata,
      isOwner: Boolean,
      readonly: Boolean
  ): MacroDetail =
    MacroDetail(
      workflow.getWid,
      workflow.getName,
      workflow.getDescription,
      workflow.getContent,
      workflow.getCreationTime,
      workflow.getLastModifiedTime,
      workflow.getIsPublic,
      parsePortSpec(metadata.getPortSpec),
      parseParamSpec(metadata.getParamSpec),
      Option(metadata.getCategory),
      Option(metadata.getIcon),
      isOwner,
      readonly
    )
}
b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala index cb910d11c3c..0c1850b34a4 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowResource.scala @@ -28,7 +28,7 @@ import org.apache.texera.amber.core.virtualidentity.ExecutionIdentity import org.apache.texera.auth.SessionUser import org.apache.texera.dao.SqlServer import org.apache.texera.dao.jooq.generated.Tables._ -import org.apache.texera.dao.jooq.generated.enums.PrivilegeEnum +import org.apache.texera.dao.jooq.generated.enums.{PrivilegeEnum, WorkflowKindEnum} import org.apache.texera.dao.jooq.generated.tables.daos.{ WorkflowDao, WorkflowOfProjectDao, @@ -42,7 +42,7 @@ import org.apache.texera.web.resource.dashboard.hub.HubResource.recordCloneActio import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowAccessResource.hasReadAccess import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowResource._ import org.jooq.impl.DSL.{groupConcatDistinct, noCondition} -import org.jooq.{Condition, DSLContext, Record9, Result, SelectOnConditionStep} +import org.jooq.{Condition, DSLContext, Record9, Result, SelectConditionStep} import java.sql.Timestamp import java.util @@ -185,7 +185,13 @@ object WorkflowResource { } } - def baseWorkflowSelect(): SelectOnConditionStep[Record9[ + /** + * Base select used by the workflows tab, the hub, and other workflow + * listings. The `WORKFLOW.KIND = WORKFLOW` filter is baked in here so that + * macros (`KIND = MACRO`) never leak into endpoints meant for top-level + * workflows. Callers append their additional predicates with `.and(...)`. 
+ */ + def baseWorkflowSelect(): SelectConditionStep[Record9[ Integer, String, String, @@ -217,6 +223,7 @@ object WorkflowResource { .on(USER.UID.eq(WORKFLOW_OF_USER.UID)) .leftJoin(WORKFLOW_OF_PROJECT) .on(WORKFLOW.WID.eq(WORKFLOW_OF_PROJECT.WID)) + .where(WORKFLOW.KIND.eq(WorkflowKindEnum.WORKFLOW)) } def mapWorkflowEntries( @@ -339,6 +346,7 @@ class WorkflowResource extends LazyLogging { .where( orCondition .and(WORKFLOW_USER_ACCESS.UID.eq(user.getUid)) + .and(WORKFLOW.KIND.eq(WorkflowKindEnum.WORKFLOW)) ) .fetch() @@ -363,7 +371,7 @@ class WorkflowResource extends LazyLogging { ): List[DashboardWorkflow] = { val user = sessionUser.getUser val workflowEntries = baseWorkflowSelect() - .where(WORKFLOW_USER_ACCESS.UID.eq(user.getUid)) + .and(WORKFLOW_USER_ACCESS.UID.eq(user.getUid)) .groupBy( WORKFLOW.WID, WORKFLOW.NAME, @@ -395,6 +403,15 @@ class WorkflowResource extends LazyLogging { ): WorkflowWithPrivilege = { if (WorkflowAccessResource.hasReadAccess(wid, user.getUid)) { val workflow = workflowDao.fetchOneByWid(wid) + // Macros share the workflow table but their `content` is a MacroBody, not + // a LogicalPlanPojo — loading one via the workflow editor would crash the + // canvas (see workflow-check.ts). Fail fast until the drill-down editor + // route exists. + if (workflow != null && workflow.getKind == WorkflowKindEnum.MACRO) { + throw new NotFoundException( + s"Workflow $wid is a macro definition; use the macro editor route instead." + ) + } WorkflowWithPrivilege( workflow.getName, workflow.getDescription, @@ -410,6 +427,47 @@ class WorkflowResource extends LazyLogging { } } + /** + * Return the macro-instance-provenance mapping captured by MacroExpander + * during the most recent compile of this workflow. 
The mapping is keyed by + * runtime op IDs (the fresh UUIDs the expander assigned to inner ops) and + * each entry holds: + * - `macroChain`: ordered list of macro instance IDs from outermost + * (parent canvas) to innermost (immediate enclosing macro) + * - `bodyOpId`: the original definition-time op ID inside the innermost + * macro's body, used to render stats at the right canvas position when + * the user drills into a macro + * + * The frontend reads this to (1) aggregate inner-op stats up to the macro + * op on the canvas and (2) display per-op stats in the macro drill-down + * view. Empty map if no compile has happened yet — the caller should poll + * shortly after starting execution. + */ + @GET + @RolesAllowed(Array("REGULAR", "ADMIN")) + @Path("/{wid}/macro-mapping") + def getMacroMapping( + @PathParam("wid") wid: Integer, + @Auth user: SessionUser + ): java.util.Map[String, java.util.Map[String, Any]] = { + if (!WorkflowAccessResource.hasReadAccess(wid, user.getUid)) { + throw new ForbiddenException("No sufficient access privilege.") + } + val mapping = org.apache.texera.workflow.macroOp.MacroMappingCache + .getLatestForWorkflow( + org.apache.texera.amber.core.virtualidentity.WorkflowIdentity(wid.longValue()) + ) + val result = new java.util.HashMap[String, java.util.Map[String, Any]]() + mapping.foreach { + case (runtimeOpId, prov) => + val entry = new java.util.HashMap[String, Any]() + entry.put("macroChain", java.util.Arrays.asList(prov.macroChain: _*)) + entry.put("bodyOpId", prov.bodyOpId) + result.put(runtimeOpId, entry) + } + result + } + /** * This method persists the workflow into database * @@ -497,7 +555,8 @@ class WorkflowResource extends LazyLogging { assignNewOperatorIds(oldWorkflow.getContent), null, null, - false + false, + WorkflowKindEnum.WORKFLOW ), sessionUser ) @@ -544,7 +603,8 @@ class WorkflowResource extends LazyLogging { assignNewOperatorIds(oldWorkflow.getContent), null, null, - false + false, + WorkflowKindEnum.WORKFLOW ), 
sessionUser ) diff --git a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowVersionResource.scala b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowVersionResource.scala index 7be74ae5b00..58c039e9d1d 100644 --- a/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowVersionResource.scala +++ b/amber/src/main/scala/org/apache/texera/web/resource/dashboard/user/workflow/WorkflowVersionResource.scala @@ -26,6 +26,7 @@ import org.apache.texera.auth.SessionUser import org.apache.texera.config.UserSystemConfig import org.apache.texera.dao.SqlServer import org.apache.texera.dao.jooq.generated.Tables.WORKFLOW_VERSION +import org.apache.texera.dao.jooq.generated.enums.WorkflowKindEnum import org.apache.texera.dao.jooq.generated.tables.daos.{WorkflowDao, WorkflowVersionDao} import org.apache.texera.dao.jooq.generated.tables.pojos.{Workflow, WorkflowVersion} import org.apache.texera.web.resource.dashboard.user.workflow.WorkflowResource.{ @@ -435,7 +436,8 @@ class WorkflowVersionResource { assignNewOperatorIds(workflowVersion.getContent), null, null, - false + false, + WorkflowKindEnum.WORKFLOW ), sessionUser ) diff --git a/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala b/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala index 741687e02c9..bace62e34c4 100644 --- a/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala +++ b/amber/src/main/scala/org/apache/texera/web/service/WorkflowExecutionService.scala @@ -39,6 +39,7 @@ import org.apache.texera.web.storage.ExecutionStateStore import org.apache.texera.web.storage.ExecutionStateStore.updateWorkflowState import org.apache.texera.web.{ComputingUnitMaster, SubscriptionManager, WebsocketInput} import org.apache.texera.workflow.WorkflowCompiler +import org.apache.texera.workflow.macroOp.DbMacroRegistry import java.net.URI import 
scala.collection.mutable @@ -105,7 +106,7 @@ class WorkflowExecutionService( def executeWorkflow(): Unit = { try { - workflow = new WorkflowCompiler(workflowContext) + workflow = new WorkflowCompiler(workflowContext, new DbMacroRegistry()) .compile(request.logicalPlan) } catch { case err: Throwable => diff --git a/amber/src/main/scala/org/apache/texera/workflow/WorkflowCompiler.scala b/amber/src/main/scala/org/apache/texera/workflow/WorkflowCompiler.scala index b93aa3e4db3..9b2267242af 100644 --- a/amber/src/main/scala/org/apache/texera/workflow/WorkflowCompiler.scala +++ b/amber/src/main/scala/org/apache/texera/workflow/WorkflowCompiler.scala @@ -24,6 +24,7 @@ import org.apache.texera.amber.core.virtualidentity.OperatorIdentity import org.apache.texera.amber.core.workflow._ import org.apache.texera.amber.engine.architecture.controller.Workflow import org.apache.texera.web.model.websocket.request.LogicalPlanPojo +import org.apache.texera.workflow.macroOp.{MacroExpander, MacroMappingCache, MacroRegistry} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -31,7 +32,8 @@ import scala.jdk.CollectionConverters.IteratorHasAsScala import scala.util.{Failure, Success, Try} class WorkflowCompiler( - context: WorkflowContext + context: WorkflowContext, + macroRegistry: MacroRegistry = MacroRegistry.Empty ) extends LazyLogging { /** @@ -142,12 +144,27 @@ class WorkflowCompiler( logicalPlanPojo: LogicalPlanPojo ): Workflow = { // 1. convert the pojo to logical plan - val logicalPlan: LogicalPlan = LogicalPlan(logicalPlanPojo) + val rawLogicalPlan: LogicalPlan = LogicalPlan(logicalPlanPojo) + + // 2. expand any macro operators into a flat logical plan. Macros are a purely + // logical-plan-level abstraction; after this pass the rest of the pipeline + // never sees a MacroOpDesc / MacroInputOp / MacroOutputOp. 
+ val logicalPlan: LogicalPlan = MacroExpander.expand(rawLogicalPlan, macroRegistry) + // Drain the macro-instance-provenance side-table populated by MacroExpander + // and stash it in MacroMappingCache keyed by (wid, eid). The frontend + // fetches this via GET /api/workflow/{wid}/macro-mapping to roll inner-op + // stats up to the macro op on the canvas (and to render stats inside + // drill-down body views). + MacroMappingCache.put( + context.workflowId, + context.executionId, + MacroExpander.takeMacroInstanceMapping() + ) - // 2. resolve the file name in each scan source operator + // 3. resolve the file name in each scan source operator logicalPlan.resolveScanSourceOpFileName(None) - // 3. expand the logical plan to the physical plan, and get a set of output ports that need storage + // 4. expand the logical plan to the physical plan, and get a set of output ports that need storage val (physicalPlan, outputPortsNeedingStorage) = expandLogicalPlan(logicalPlan, logicalPlanPojo.opsToViewResult, None) diff --git a/amber/src/main/scala/org/apache/texera/workflow/macroOp/DbMacroRegistry.scala b/amber/src/main/scala/org/apache/texera/workflow/macroOp/DbMacroRegistry.scala new file mode 100644 index 00000000000..10ee06d09f7 --- /dev/null +++ b/amber/src/main/scala/org/apache/texera/workflow/macroOp/DbMacroRegistry.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.workflow.macroOp + +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.typesafe.scalalogging.LazyLogging +import org.apache.texera.amber.operator.macroOp.MacroBody +import org.apache.texera.dao.SqlServer +import org.apache.texera.dao.jooq.generated.Tables.WORKFLOW +import org.apache.texera.dao.jooq.generated.enums.WorkflowKindEnum + +import scala.util.control.NonFatal + +/** + * jOOQ + Jackson-backed [[MacroRegistry]] for the amber execution-time + * compiler. Reads `workflow.content` as a JSON-serialized [[MacroBody]] — + * same shape produced by `MacroResource.create`. + * + * v1 ignores the `version` argument and always reads the current row. + * Reconstructing a specific `vid` from `workflow_version` patches is deferred + * to Phase 2. + * + * Duplicates the compiling-service `DbMacroRegistry`; both share the same + * `texera_db` schema so the body bytes round-trip identically across paths. 
/**
 * [[MacroRegistry]] backed by the `workflow` table: a macro id is read as a
 * numeric `wid`, and the row's `content` is decoded with Jackson into a
 * [[MacroBody]].
 *
 * The `version` argument is not used for the lookup; the current row is
 * always read. It only appears in the error log message.
 */
class DbMacroRegistry extends MacroRegistry with LazyLogging {

  private val mapper = new ObjectMapper().registerModule(DefaultScalaModule)

  /**
   * Resolves a macro body by id.
   *
   * @param macroId decimal `wid` of the macro; a non-numeric id yields `None`
   * @param version requested version (ignored by the lookup)
   * @return the decoded body, or `None` when the id is not numeric, the row is
   *         missing, the row is not of kind `MACRO`, the content is null or
   *         empty, or loading / decoding fails (the failure is logged)
   */
  override def fetch(macroId: String, version: Int): Option[MacroBody] =
    scala.util
      .Try(Integer.parseInt(macroId))
      .toOption
      .flatMap(wid => loadBody(wid, macroId, version))

  /** Reads and decodes the macro row for `wid`; any non-fatal failure is logged and mapped to `None`. */
  private def loadBody(wid: Int, macroId: String, version: Int): Option[MacroBody] =
    try {
      val row = SqlServer
        .getInstance()
        .createDSLContext()
        .select(WORKFLOW.CONTENT, WORKFLOW.KIND)
        .from(WORKFLOW)
        .where(WORKFLOW.WID.eq(wid))
        .fetchOne()

      for {
        found <- Option(row)
        if found.value2() == WorkflowKindEnum.MACRO
        content <- Option(found.value1())
        if content.nonEmpty
      } yield mapper.readValue(content, classOf[MacroBody])
    } catch {
      case NonFatal(e) =>
        logger.error(
          s"DbMacroRegistry: failed to load macro macroId=$macroId version=$version",
          e
        )
        None
    }
}
/**
  * Expansion-path state threaded through MacroExpander to detect macro
  * recursion and runaway nesting.
  *
  * @param visited the (macroId, version) pairs already being expanded on the
  *                current path; seeing one again means a cycle. This is a Set,
  *                so it records membership only, not the order of descent.
  * @param depth   number of `descend` calls taken from `root` to reach this context
  */
case class MacroCompileContext(
    visited: Set[(String, Int)],
    depth: Int
) {

  /**
    * Fails if `(macroId, version)` is already on the current expansion path.
    *
    * @throws IllegalStateException when the pair is contained in `visited`
    */
  def guardAgainstCycle(macroId: String, version: Int): Unit =
    if (visited.contains((macroId, version))) {
      // `visited` is unordered, so render it sorted: the message is then
      // deterministic and does not suggest a traversal order the Set cannot provide.
      val seen = visited.toSeq.sorted.map { case (id, v) => s"$id@v$v" }.mkString(", ")
      throw new IllegalStateException(
        s"Macro cycle detected: $macroId@v$version is already being expanded on this path " +
          s"(visited: $seen)"
      )
    }

  /**
    * Fails once the nesting depth has reached [[MacroCompileContext.MaxDepth]].
    *
    * @throws IllegalStateException when `depth >= MaxDepth`
    */
  def guardAgainstDepth(): Unit =
    if (depth >= MacroCompileContext.MaxDepth) {
      throw new IllegalStateException(
        s"Macro expansion depth limit (${MacroCompileContext.MaxDepth}) exceeded — " +
          s"likely a self-referential macro chain."
      )
    }

  /** Context for expanding the body of `(macroId, version)`: one level deeper, pair recorded. */
  def descend(macroId: String, version: Int): MacroCompileContext =
    MacroCompileContext(visited + ((macroId, version)), depth + 1)
}

object MacroCompileContext {

  /** Maximum nesting depth accepted by `guardAgainstDepth`. */
  val MaxDepth: Int = 16

  /** Starting context: nothing visited, depth 0. */
  def root: MacroCompileContext = MacroCompileContext(Set.empty, 0)
}
// Pre-compile pass for the amber execution-time compiler. Walks a LogicalPlan,
// inlines every MacroOpDesc by splicing its body's inner operators and links
// into the parent, and produces a flat LogicalPlan with no MacroOpDesc /
// MacroInputOp / MacroOutputOp nodes. Each inlined inner op is given a fresh,
// deterministic ID of the form "<OpClassSimpleName>-operator-<name-based UUID>",
// where the UUID is derived from "<macroInstanceId>|<originalOpId>". Which macro
// instance an inlined op came from is recorded in a provenance side-table
// (see `takeMacroInstanceMapping`) rather than being encoded in the op ID.
//
// Mirrors the compiling-service MacroExpander; the two operate on their own
// LogicalLink/LogicalPlan classes and will converge once those types are
// unified (see WorkflowCompiler.scala TODO).
object MacroExpander {

  /**
    * Provenance of one freshly-named inner op in the expanded plan.
    *
    * @param macroChain ordered list of macro instance IDs from outermost
    *                   (parent canvas) to innermost (immediate enclosing
    *                   macro). e.g. for an op deep inside nested macro 294
    *                   which itself sits inside macro 295: List("295_inst",
    *                   "294_inst_in_295_body").
    * @param bodyOpId   the original definition-time op ID this runtime op
    *                   was cloned from. Lets the drill-down view map runtime
    *                   stats back to definition-time positions when rendering
    *                   the macro body.
    */
  case class MacroProvenance(macroChain: List[String], bodyOpId: String)

  /**
    * Side-table from `runtime fresh-UUID → MacroProvenance`. Populated by
    * `spliceIntoParent` (handles nested macros: when an outer splice re-clones
    * an op that an inner splice already touched, the outer splice prepends its
    * macro instance to the existing chain and drops the stale inner UUID).
    *
    * The frontend reads this via `/api/workflow/{wid}/macro-mapping?eid=...`
    * to aggregate inner-op stats up to the macro op on the canvas, and to
    * route stats to body-level positions inside the drill-down editor.
    *
    * Threading model: not thread-safe; each compile call should drain via
    * `takeMacroInstanceMapping()` immediately after `expand` returns.
    *
    * NOTE(review): this is a single map shared by every caller of this object,
    * so two compiles that overlap in time would write into (and drain) the
    * same table — confirm callers serialize `expand` + `takeMacroInstanceMapping`.
    */
  private val currentMacroInstanceMapping =
    scala.collection.mutable.Map[String, MacroProvenance]()

  /** Snapshot + clear the current mapping. The caller takes ownership. */
  def takeMacroInstanceMapping(): Map[String, MacroProvenance] = {
    val snapshot = currentMacroInstanceMapping.toMap
    currentMacroInstanceMapping.clear()
    snapshot
  }

  /**
    * Expands every macro in `plan`, starting from an empty expansion context.
    *
    * @param plan     the plan that may contain MacroOpDesc operators
    * @param registry source of macro bodies for LIVE-linked macros
    * @return a plan in which no operator is a MacroOpDesc
    */
  def expand(plan: LogicalPlan, registry: MacroRegistry): LogicalPlan =
    expand(plan, registry, MacroCompileContext.root)

  // Repeatedly inlines the first MacroOpDesc found until none remain. Every
  // macro at this level is inlined with the same `ctx`; nesting is tracked by
  // `inlineMacro`, which descends the context before expanding a body.
  private def expand(
      plan: LogicalPlan,
      registry: MacroRegistry,
      ctx: MacroCompileContext
  ): LogicalPlan = {
    var acc = plan
    while (acc.operators.exists(_.isInstanceOf[MacroOpDesc])) {
      // `.get` cannot fail: the loop condition just established that a MacroOpDesc exists.
      val m = acc.operators.collectFirst { case x: MacroOpDesc => x }.get
      acc = inlineMacro(acc, m, registry, ctx)
    }
    acc
  }

  // Replaces one macro instance `m` in `parent`: either by its verified fused
  // UDF, or by its (recursively expanded) body. Throws IllegalStateException
  // from the cycle/depth guards and IllegalArgumentException when the body
  // cannot be resolved for the macro's link mode.
  private def inlineMacro(
      parent: LogicalPlan,
      m: MacroOpDesc,
      registry: MacroRegistry,
      ctx: MacroCompileContext
  ): LogicalPlan = {
    ctx.guardAgainstCycle(m.macroId, m.macroVersion)
    ctx.guardAgainstDepth()

    // §9.2 AI fusion: if the macro has been verified-fused into a single
    // PythonUDF, substitute that UDF for the entire inlined body instead
    // of expanding. This eliminates inter-actor handoffs for the chain
    // and is the perf-demo path of the hackathon's `fuseMacro` flow. The
    // frontend sets `fusion.verified = true` after running sample-diff
    // verification client-side; we trust that gate here because the
    // verification protocol is owned by the agent service.
    if (m.fusion.exists(_.verified)) {
      return substituteFused(parent, m)
    }

    // Resolve the body: embedded for SNAPSHOT, looked up in the registry for LIVE.
    val body: MacroBody = m.linkMode match {
      case MacroOpDesc.SNAPSHOT =>
        m.snapshot.getOrElse(
          throw new IllegalArgumentException(
            s"MacroOpDesc[${m.macroId}] has linkMode=SNAPSHOT but no embedded snapshot"
          )
        )
      case MacroOpDesc.LIVE =>
        registry
          .fetch(m.macroId, m.macroVersion)
          .getOrElse(
            throw new IllegalArgumentException(
              s"MacroOpDesc[${m.macroId}@v${m.macroVersion}] not found in registry " +
                s"(LIVE link). The macro may be deleted or inaccessible."
            )
          )
      case other =>
        throw new IllegalArgumentException(
          s"MacroOpDesc[${m.macroId}] has unknown linkMode '$other'"
        )
    }

    // Expand nested macros inside the body first, one level deeper in the
    // context, so the splice below only ever sees a macro-free body.
    val expandedBody = expand(
      LogicalPlan(body.operators, body.links.map(toLogicalLink)),
      registry,
      ctx.descend(m.macroId, m.macroVersion)
    )

    spliceIntoParent(parent, m, expandedBody)
  }

  // Converts a body-level MacroLink (string op IDs) into a LogicalLink.
  private def toLogicalLink(ml: MacroLink): LogicalLink =
    LogicalLink(
      OperatorIdentity(ml.fromOpId),
      ml.fromPortId,
      OperatorIdentity(ml.toOpId),
      ml.toPortId
    )

  // Splices an already macro-free `body` into `parent` in place of macro
  // instance `m`: clones and renames the inner ops, records their provenance,
  // rewires body-internal links, and redirects parent links that touched the
  // macro's ports to the inner ops behind the corresponding boundary markers.
  private def spliceIntoParent(
      parent: LogicalPlan,
      m: MacroOpDesc,
      body: LogicalPlan
  ): LogicalPlan = {
    val instanceId = m.operatorIdentifier.id
    val mId = m.operatorIdentifier

    // Boundary markers keyed by the external port index they stand for.
    // `.toMap` keeps one marker per index: if a body declares two markers with
    // the same portIndex, the later one in `body.operators` wins.
    val inputMarkers: Map[Int, MacroInputOp] =
      body.operators.collect { case b: MacroInputOp => b.portIndex -> b }.toMap
    val outputMarkers: Map[Int, MacroOutputOp] =
      body.operators.collect { case b: MacroOutputOp => b.portIndex -> b }.toMap

    val markerIds: Set[OperatorIdentity] =
      inputMarkers.values.map(_.operatorIdentifier).toSet ++
        outputMarkers.values.map(_.operatorIdentifier).toSet

    // Deep-clone non-marker inner ops via JSON round-trip.
    val innerOps: List[LogicalOp] = body.operators.collect {
      case op if !op.isInstanceOf[MacroInputOp] && !op.isInstanceOf[MacroOutputOp] =>
        deepClone(op)
    }

    // Assign fresh UUIDs to each inner op. The expanded LogicalPlan must be
    // STRUCTURALLY IDENTICAL to a hand-flattened workflow — otherwise downstream
    // engine behavior (Iceberg materialization table naming, partition routing
    // based on op-ID hashes, region scheduling) silently diverges.
    //
    // CRITICAL: the UUIDs MUST be DETERMINISTIC across compiles. Texera has
    // two WorkflowCompiler implementations (one in workflow-compiling-service
    // for frontend validation, one in amber for actual execution). Both run
    // MacroExpander on the SAME workflow content. If we used
    // `UUID.randomUUID()` the two compilers would generate different IDs for
    // the same op; the frontend would cache one set (whichever wrote to
    // MacroMappingCache last) but the engine would emit stats keyed by the
    // OTHER set, so stat aggregation up to the macro op would silently fail.
    //
    // Solution: derive the UUID from `nameUUIDFromBytes(macroInstanceId | body
    // op id)`. For nested macros, the inner splice's freshId already encodes
    // the inner chain, so the outer splice's seed transitively captures the
    // whole chain. Same workflow → same UUIDs across compilers.
    //
    // The previous "${macroInstanceId}--${innerOpId}" prefix scheme was
    // convenient for stats aggregation but produced 170+ char op IDs, which
    // caused observable Iceberg commit thrash on HashJoin's internal build
    // port — execution that runs fine on a hand-flattened plan hangs on the
    // macro-wrapped equivalent. Deterministic UUIDs are short.
    //
    // Order matters inside the lambda: `originalId` is captured BEFORE
    // `setOperatorId` mutates the clone, and the map value is read back from
    // the op AFTER the mutation.
    val idRewrite: Map[OperatorIdentity, OperatorIdentity] = innerOps.map { op =>
      val originalId = op.operatorIdentifier
      val seed = s"${m.operatorIdentifier.id}|${originalId.id}"
      val derivedUuid = java.util.UUID.nameUUIDFromBytes(seed.getBytes("UTF-8"))
      val freshId = s"${op.getClass.getSimpleName}-operator-$derivedUuid"
      op.setOperatorId(freshId)
      originalId -> op.operatorIdentifier
    }.toMap

    // Update the provenance side-table. Two cases per renamed op:
    //   1. originalId IS already a fresh UUID from a prior (inner) splice:
    //      Take the inner provenance, prepend THIS macro instance to its
    //      chain, and move the entry to the new outer UUID.
    //   2. originalId is the macro body's definition-time op ID:
    //      Create a fresh provenance with chain=[mId] and bodyOpId=originalId.
    // Drops the stale inner-UUID entry so the side-table only references
    // op IDs that exist in the final expanded plan.
    idRewrite.foreach {
      case (originalId, newId) =>
        currentMacroInstanceMapping.get(originalId.id) match {
          case Some(existing) =>
            currentMacroInstanceMapping(newId.id) =
              MacroProvenance(mId.id :: existing.macroChain, existing.bodyOpId)
            if (newId.id != originalId.id) currentMacroInstanceMapping.remove(originalId.id)
          case None =>
            currentMacroInstanceMapping(newId.id) =
              MacroProvenance(List(mId.id), originalId.id)
        }
    }

    // Maps a body-level op ID to its fresh ID. Only non-marker ops are in
    // `idRewrite`, so passing a marker ID (or any ID absent from the body's
    // operator list) throws.
    def rewriteInnerId(id: OperatorIdentity): OperatorIdentity =
      idRewrite.getOrElse(
        id,
        throw new IllegalStateException(
          s"MacroExpander: link references unknown inner op '${id.id}' (instance=$instanceId)"
        )
      )

    // Links whose both ends are real inner ops: keep them, with renamed endpoints.
    val internalLinks: List[LogicalLink] = body.links.collect {
      case l if !markerIds.contains(l.fromOpId) && !markerIds.contains(l.toOpId) =>
        LogicalLink(rewriteInnerId(l.fromOpId), l.fromPortId, rewriteInnerId(l.toOpId), l.toPortId)
    }

    // For each external input port: the inner (op, port) pairs fed by its marker.
    // A body link that runs directly from an input marker to an output marker
    // reaches `rewriteInnerId` with a marker ID here and therefore throws.
    val inputConsumers: Map[Int, List[(OperatorIdentity, PortIdentity)]] =
      inputMarkers.map {
        case (portIndex, marker) =>
          val markerId = marker.operatorIdentifier
          val consumers = body.links
            .filter(_.fromOpId == markerId)
            .map(l => (rewriteInnerId(l.toOpId), l.toPortId))
          portIndex -> consumers
      }

    // For each external output port: the single inner (op, port) feeding its
    // marker. Zero or multiple producers are rejected.
    val outputProducers: Map[Int, (OperatorIdentity, PortIdentity)] =
      outputMarkers.map {
        case (portIndex, marker) =>
          val markerId = marker.operatorIdentifier
          val producers = body.links
            .filter(_.toOpId == markerId)
            .map(l => (rewriteInnerId(l.fromOpId), l.fromPortId))
          producers match {
            case single :: Nil => portIndex -> single
            case Nil =>
              throw new IllegalStateException(
                s"MacroOutputOp(portIndex=$portIndex) in macro $instanceId has no producer"
              )
            case many =>
              throw new IllegalStateException(
                s"MacroOutputOp(portIndex=$portIndex) in macro $instanceId has " +
                  s"${many.size} producers; expected exactly one."
              )
          }
      }

    // Redirect parent links that touch the macro instance. A link INTO the
    // macro fans out to every consumer of that input port (and disappears if
    // the marker has no consumers); a link OUT OF the macro is re-sourced from
    // the port's single producer. All other parent links pass through untouched.
    val rewrittenParentLinks: List[LogicalLink] = parent.links.flatMap { link =>
      if (link.toOpId == mId) {
        val portIndex = link.toPortId.id
        inputConsumers.get(portIndex) match {
          case Some(consumers) =>
            consumers.map {
              case (innerOp, innerPort) =>
                LogicalLink(link.fromOpId, link.fromPortId, innerOp, innerPort)
            }
          case None =>
            throw new IllegalStateException(
              s"Parent link into ($instanceId, port=$portIndex) has no matching " +
                s"MacroInputOp inside the macro body."
            )
        }
      } else if (link.fromOpId == mId) {
        val portIndex = link.fromPortId.id
        outputProducers.get(portIndex) match {
          case Some((innerOp, innerPort)) =>
            List(LogicalLink(innerOp, innerPort, link.toOpId, link.toPortId))
          case None =>
            throw new IllegalStateException(
              s"Parent link out of ($instanceId, port=$portIndex) has no matching " +
                s"MacroOutputOp inside the macro body."
            )
        }
      } else {
        List(link)
      }
    }

    // Final plan: parent minus the macro op, plus the renamed inner ops.
    val newOps =
      parent.operators.filterNot(_.operatorIdentifier == mId) ++ innerOps
    val newLinks = rewrittenParentLinks ++ internalLinks
    LogicalPlan(newOps, newLinks)
  }

  // Deep-clone via JSON round-trip to avoid mutating the persisted body when we
  // rewrite inner-op IDs in spliceIntoParent.
  private def deepClone(op: LogicalOp): LogicalOp = {
    val json = objectMapper.writeValueAsString(op)
    objectMapper.readValue(json, classOf[LogicalOp])
  }

  /**
    * §9.2 AI fusion substitution: when the macro has a `verified` `fusion`,
    * replace the entire MacroOpDesc + its inlined body with a single
    * PythonUDFOpDescV2 carrying the fused code. The substitute operator
    * inherits the macro's external input/output port count so all parent
    * links re-target it cleanly (1:1 port mapping, no fan-out).
    *
    * This is the gate that powers the hackathon demo's "fuse for
    * performance" path — once the frontend marks `fusion.verified = true`,
    * the engine never sees the original inlined body for this instance.
    *
    * Precondition: `m.fusion` is defined (the only caller checks
    * `m.fusion.exists(_.verified)` first); `m.fusion.get` throws otherwise.
    */
  private def substituteFused(parent: LogicalPlan, m: MacroOpDesc): LogicalPlan = {
    val fusion = m.fusion.get
    val instanceId = m.operatorIdentifier.id
    val fused = new PythonUDFOpDescV2()
    fused.code = fusion.code
    // Schema propagation for the fused UDF: a fused macro that takes an input
    // re-emits a tuple of the same shape (filter/projection/map operators
    // mutate or drop the input dict but don't introduce new columns unless
    // the user adds them in the fused code). retainInputColumns=true lets the
    // engine carry the input schema through to the output without a hand-
    // declared outputColumns list. workers=1 keeps the fused execution
    // single-actor — the whole point of fusion is collapsing serialization
    // hops, not parallelism.
    fused.retainInputColumns = m.inputPortCount > 0
    fused.outputColumns = List.empty
    fused.workers = 1
    // Keep the macro op's external interface — same input/output port
    // counts so the upstream/downstream link wiring on the parent canvas
    // doesn't need to change.
    fused.inputPorts = (0 until m.inputPortCount).map { i =>
      PortDescription(
        portID = s"input-$i",
        displayName = s"in-$i",
        disallowMultiInputs = false,
        isDynamicPort = false,
        partitionRequirement = null,
        dependencies = List.empty
      )
    }.toList
    fused.outputPorts = (0 until m.outputPortCount).map { i =>
      PortDescription(
        portID = s"output-$i",
        displayName = s"out-$i",
        disallowMultiInputs = false,
        isDynamicPort = false,
        partitionRequirement = null,
        dependencies = List.empty
      )
    }.toList
    fused.setOperatorId(instanceId) // reuse the macro instance ID — no link rewrite needed
    // Replace the macro op in the parent with the fused UDF op. Links
    // already reference `instanceId` on both ends since `setOperatorId`
    // preserved it; no link rewrite required.
    val newOps = parent.operators.map {
      case op if op.operatorIdentifier == m.operatorIdentifier => fused
      case op                                                  => op
    }
    LogicalPlan(newOps, parent.links)
  }
}
/**
  * Cache for the macro-instance provenance map produced by
  * `MacroExpander.takeMacroInstanceMapping()` after each compile.
  *
  * Two layers:
  *  - an in-memory map keyed by (workflowId, executionId), local to this JVM;
  *  - one JSON file per workflow id under `DiskDir`, holding only the most
  *    recent mapping written for that workflow (later writes overwrite).
  *
  * Lifecycle: written by `WorkflowCompiler.compile` immediately after macro
  * expansion. Read by the REST endpoint exposed via `WorkflowResource` (see
  * `getMacroMapping`). Entries are removed by `evictAllForWorkflow`.
  */
object MacroMappingCache {

  // The cache is written by ComputingUnitMaster's WorkflowCompiler when a run
  // starts, and read by TexeraWebApplication's REST endpoint when the
  // frontend polls. Those are SEPARATE JVMs, so an in-memory singleton
  // doesn't suffice. We back the cache with the local filesystem so both
  // processes see the same data.
  //
  // Layout (per workflow): /tmp/texera-macro-mappings/wid-{wid}.json
  // The file holds the most-recent compile's mapping; subsequent compiles
  // overwrite. eid-keyed history is omitted for now (the frontend always
  // wants "latest for this wid").
  //
  // In-memory cache is a fast-path; falls through to disk when missing.

  private val memCache =
    new ConcurrentHashMap[(WorkflowIdentity, ExecutionIdentity), Map[String, MacroProvenance]]()

  private val DiskDir = "/tmp/texera-macro-mappings"
  private val mapper =
    new ObjectMapper().registerModule(DefaultScalaModule)

  private def diskPathForWorkflow(wid: WorkflowIdentity): String =
    s"$DiskDir/wid-${wid.id}.json"

  /**
    * Stores `mapping` in memory under (wid, eid) and, best-effort, on disk
    * under `wid`. A failure to write the disk copy is swallowed on purpose:
    * the in-memory entry is still updated and the caller is never interrupted.
    */
  def put(
      wid: WorkflowIdentity,
      eid: ExecutionIdentity,
      mapping: Map[String, MacroProvenance]
  ): Unit = {
    memCache.put((wid, eid), mapping)
    Try {
      // Serialize as plain maps so the on-disk shape is just
      // { runtimeOpId: { "macroChain": [...], "bodyOpId": "..." } }.
      val asJsonReady = mapping.map {
        case (k, v) =>
          k -> Map("macroChain" -> v.macroChain, "bodyOpId" -> v.bodyOpId)
      }
      writeAtomically(diskPathForWorkflow(wid), mapper.writeValueAsString(asJsonReady))
    }
  }

  // Publishes `content` at `targetPath` without ever exposing a half-written
  // file: the reader lives in another process and may poll at any moment, and
  // a truncated JSON document would make it fall back to an empty mapping.
  // The content is written to a uniquely named sibling file first and then
  // moved over the target; the move is atomic where the filesystem supports
  // it, with a plain replace as the fallback.
  private def writeAtomically(targetPath: String, content: String): Unit = {
    import java.nio.file.{AtomicMoveNotSupportedException, StandardCopyOption}
    Files.createDirectories(Paths.get(DiskDir))
    val target = Paths.get(targetPath)
    val staging =
      target.resolveSibling(s"${target.getFileName}.${java.util.UUID.randomUUID()}.tmp")
    try {
      Files.writeString(staging, content)
      try Files.move(
        staging,
        target,
        StandardCopyOption.ATOMIC_MOVE,
        StandardCopyOption.REPLACE_EXISTING
      )
      catch {
        case _: AtomicMoveNotSupportedException =>
          Files.move(staging, target, StandardCopyOption.REPLACE_EXISTING)
      }
    } finally {
      // No-op after a successful move; removes the leftover if anything failed.
      Files.deleteIfExists(staging)
    }
  }

  /**
    * Look up a mapping for the latest known compile of (wid, eid). When this
    * JVM has no in-memory entry for the pair, the per-workflow disk file is
    * used instead (which is not eid-specific). Returns an empty map if neither
    * exists — the frontend should poll shortly after execution start.
    */
  def get(wid: WorkflowIdentity, eid: ExecutionIdentity): Map[String, MacroProvenance] =
    Option(memCache.get((wid, eid))).getOrElse(readFromDisk(wid))

  /**
    * Most recent mapping for a workflow id across all executions. Used by the
    * frontend when it doesn't know the exact eid yet (e.g. immediately after
    * clicking Run; the websocket hasn't confirmed eid yet). "Most recent" is
    * the in-memory entry with the largest execution id; with no in-memory
    * entry for the workflow, the disk file is read.
    */
  def getLatestForWorkflow(wid: WorkflowIdentity): Map[String, MacroProvenance] = {
    import scala.jdk.CollectionConverters._
    memCache
      .entrySet()
      .asScala
      .filter(_.getKey._1 == wid)
      .maxByOption(_.getKey._2.id)
      .map(_.getValue)
      .getOrElse(readFromDisk(wid))
  }

  // Reads the per-workflow JSON file. A missing file, unreadable file, or
  // unparsable content all yield an empty map.
  private def readFromDisk(wid: WorkflowIdentity): Map[String, MacroProvenance] = {
    val path = Paths.get(diskPathForWorkflow(wid))
    if (!Files.exists(path)) Map.empty
    else
      // Parse via Jackson tree API so we don't fight Scala/Java type erasure when
      // the DefaultScalaModule rewrites arrays to scala.List vs java.util.List.
      Try {
        val json = Files.readString(path)
        val root = mapper.readTree(json)
        import scala.jdk.CollectionConverters._
        val fields = root.fields().asScala.toList
        fields.map { entry =>
          val runtimeOpId = entry.getKey
          val node = entry.getValue
          val chainNode = node.get("macroChain")
          // A missing or non-array chain is treated as an empty chain.
          val chain =
            if (chainNode != null && chainNode.isArray)
              chainNode.elements().asScala.map(_.asText()).toList
            else Nil
          val bodyOpId = Option(node.get("bodyOpId")).map(_.asText()).getOrElse("")
          runtimeOpId -> MacroProvenance(chain, bodyOpId)
        }.toMap
      }.getOrElse(Map.empty)
  }

  /** Drops every in-memory entry for `wid` and, best-effort, its disk file. */
  def evictAllForWorkflow(wid: WorkflowIdentity): Unit = {
    import scala.jdk.CollectionConverters._
    val keysToRemove =
      memCache.keySet().asScala.filter(_._1 == wid).toList
    keysToRemove.foreach(memCache.remove)
    Try(Files.deleteIfExists(Paths.get(diskPathForWorkflow(wid))))
  }
}
/**
  * Resolves a macro definition's body from its `(macroId, version)` pair.
  *
  * The persistence-backed implementation is [[DbMacroRegistry]]; tests and
  * degraded execution paths can use [[MacroRegistry.Empty]] or
  * [[MacroRegistry.inMemory]].
  *
  * Duplicates the compiling-service trait of the same name; see
  * MacroCompileContext for why the macro pipeline is duplicated across the
  * two compilers.
  */
trait MacroRegistry {

  /** @return the body registered for `(macroId, version)`, or `None` if there is none */
  def fetch(macroId: String, version: Int): Option[MacroBody]
}

object MacroRegistry {

  /**
    * Registry that knows no macros. Use when persistence is not wired up —
    * SNAPSHOT macros still work since their body is embedded; LIVE macros fail
    * with "not found in registry".
    */
  object Empty extends MacroRegistry {
    override def fetch(macroId: String, version: Int): Option[MacroBody] =
      Option.empty[MacroBody]
  }

  /** For tests: a registry answering from a fixed table keyed by (id, version). */
  def inMemory(bodies: Map[(String, Int), MacroBody]): MacroRegistry =
    (macroId: String, version: Int) => bodies.get(macroId -> version)
}
OperatorInfo, PropertyNameConstants} import org.apache.texera.amber.operator.projection.ProjectionOpDesc import org.apache.texera.amber.operator.randomksampling.RandomKSamplingOpDesc @@ -428,7 +429,10 @@ trait StateTransferFunc value = classOf[SklearnAdvancedSVRTrainerOpDesc], name = "SVRTrainer" ), - new Type(value = classOf[SklearnTestingOpDesc], name = "SklearnTesting") + new Type(value = classOf[SklearnTestingOpDesc], name = "SklearnTesting"), + new Type(value = classOf[MacroOpDesc], name = "Macro"), + new Type(value = classOf[MacroInputOp], name = "MacroInput"), + new Type(value = classOf[MacroOutputOp], name = "MacroOutput") ) ) abstract class LogicalOp extends PortDescriptor with Serializable { diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroBody.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroBody.scala new file mode 100644 index 00000000000..719cb1b16aa --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroBody.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/**
  * The inner subgraph of a macro. Serialized as JSON inside
  * `MacroOpDesc.snapshot` (for SNAPSHOT mode) or returned by
  * `MacroRegistry.fetch` (for LIVE mode).
  *
  * @param operators inner operators, including the MacroInputOp /
  *                  MacroOutputOp boundary markers
  * @param links     links internal to the body (endpoints refer to `operators`)
  * @param inputs    declared external input port specs; defaults to empty
  * @param outputs   declared external output port specs; defaults to empty
  */
case class MacroBody(
    operators: List[LogicalOp],
    links: List[MacroLink],
    inputs: List[MacroPortSpec] = Nil,
    outputs: List[MacroPortSpec] = Nil
)
/**
  * AI-fusion payload (Section 9.2). When `verified = true`, MacroExpander
  * substitutes the MacroOpDesc with a single PythonUDFOpDescV2 built from
  * `code` instead of inlining the macro body.
  *
  * `ignoreUnknown = true`: the frontend attaches UI-only fields (e.g.
  * `estimatedSpeedup`, a human-readable "1.6×" used to render the on-canvas
  * ⚡ FUSED badge) onto this payload before persisting. The backend doesn't
  * model those fields here; without this annotation Jackson rejects the
  * whole WorkflowExecuteRequest at execute time.
  *
  * @param code       source of the fused Python UDF
  * @param verified   whether sample-run verification passed; defaults to false
  * @param sampleSize how many rows the sample-run diff matched on
  * @param verifiedAt epoch millis when verification passed (0 when unset)
  */
@JsonIgnoreProperties(ignoreUnknown = true)
case class MacroFusion(
    code: String,
    verified: Boolean = false,
    sampleSize: Int = 0,
    verifiedAt: Long = 0L
)
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.macroOp + +import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty, JsonPropertyDescription} +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow.{OutputPort, PhysicalPlan, PortIdentity} +import org.apache.texera.amber.operator.LogicalOp +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} + +// Boundary marker that lives only inside a macro body. Represents external input port +// `portIndex` of the macro: tuples coming into the macro at that port flow out of this +// marker into the inner subgraph. MacroExpander consumes these markers when splicing +// the body into the parent plan and drops them from the expanded plan. +// +// Ignore `inputPorts` / `outputPorts` on the wire: `operatorInfo` always declares the +// marker's ports itself (no inputs, one output at port 0), and earlier macro bodies were +// persisted with backend-shaped `PortIdentity` entries that don't match +// `PortDescription` (which would otherwise break MacroBody deserialization). +@JsonIgnoreProperties(Array("inputPorts", "outputPorts")) +class MacroInputOp extends LogicalOp { + + @JsonProperty(value = "portIndex", required = true) + @JsonSchemaTitle("Port Index") + @JsonPropertyDescription("Which external input port (0-based) this marker represents.") + var portIndex: Int = 0 + + @JsonProperty(value = "displayName") + @JsonSchemaTitle("Display Name") + var displayName: String = "" + + override def getPhysicalOp( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ) = + throw new IllegalStateException( + s"MacroInputOp(portIndex=$portIndex) must be consumed by MacroExpander before " + + s"physical-plan compilation. 
Markers cannot be compiled directly." + ) + + override def getPhysicalPlan( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ): PhysicalPlan = + throw new IllegalStateException( + s"MacroInputOp(portIndex=$portIndex) must be consumed by MacroExpander before " + + s"physical-plan compilation. Markers cannot be compiled directly." + ) + + override def operatorInfo: OperatorInfo = OperatorInfo( + userFriendlyName = if (displayName.nonEmpty) displayName else s"Input $portIndex", + operatorDescription = + "Macro input boundary marker. External input port; consumed by MacroExpander.", + operatorGroupName = OperatorGroupConstants.UTILITY_GROUP, + inputPorts = List.empty, + outputPorts = List(OutputPort(PortIdentity(0))) + ) +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroLink.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroLink.scala new file mode 100644 index 00000000000..98beed015ae --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroLink.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.macroOp + +import com.fasterxml.jackson.annotation.JsonProperty +import org.apache.texera.amber.core.workflow.PortIdentity + +// Mirrors LogicalLink's shape but lives in workflow-operator (which doesn't depend +// on workflow-compiling-service where LogicalLink lives). MacroExpander converts +// MacroLink → LogicalLink when inlining a macro body into the parent plan. +case class MacroLink( + @JsonProperty("fromOpId") fromOpId: String, + fromPortId: PortIdentity, + @JsonProperty("toOpId") toOpId: String, + toPortId: PortIdentity +) diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroOpDesc.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroOpDesc.scala new file mode 100644 index 00000000000..3d9b519b9d3 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroOpDesc.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.macroOp + +import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty, JsonPropertyDescription} +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow.{InputPort, OutputPort, PhysicalPlan, PortIdentity} +import org.apache.texera.amber.operator.LogicalOp +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} + +// A macro instance on the parent canvas. Carries identity + link mode + (optionally) +// an embedded body. MacroOpDesc never reaches physical-plan compilation: MacroExpander +// (in workflow-compiling-service) consumes it as a pre-compile pass and replaces it +// with the inlined body or, if `fusion` is verified, a single PythonUDFOpDescV2. +// +// `ignoreUnknown = true`: the frontend stamps UI-only convenience fields (e.g. +// `macroSyncedAt` — epoch ms used to detect stale embeds against the live +// definition) into operatorProperties before persisting. The backend doesn't +// model those fields here, so Jackson would fail to deserialize the request +// without this annotation. +@JsonIgnoreProperties(ignoreUnknown = true) +class MacroOpDesc extends LogicalOp { + + @JsonProperty(value = "macroId", required = true) + @JsonSchemaTitle("Macro ID") + @JsonPropertyDescription("Identifier of the macro definition (workflow ID).") + var macroId: String = "" + + @JsonProperty(value = "macroVersion") + @JsonSchemaTitle("Macro Version") + @JsonPropertyDescription("Pinned version (vid) of the macro definition. 
Used only in LIVE mode.") + var macroVersion: Int = 0 + + @JsonProperty(value = "linkMode", required = true) + @JsonSchemaTitle("Link Mode") + @JsonPropertyDescription("LIVE = referenced by (macroId, macroVersion); SNAPSHOT = embedded body.") + var linkMode: String = MacroOpDesc.LIVE + + @JsonProperty(value = "snapshot") + @JsonSchemaTitle("Snapshot") + @JsonPropertyDescription("Embedded macro body; present only when linkMode = SNAPSHOT.") + var snapshot: Option[MacroBody] = None + + @JsonProperty(value = "inputPortCount", required = true) + @JsonSchemaTitle("Input Port Count") + var inputPortCount: Int = 0 + + @JsonProperty(value = "outputPortCount", required = true) + @JsonSchemaTitle("Output Port Count") + var outputPortCount: Int = 0 + + @JsonProperty(value = "displayName") + @JsonSchemaTitle("Display Name") + var displayName: String = "" + + @JsonProperty(value = "fusion") + @JsonSchemaTitle("Fusion") + @JsonPropertyDescription( + "AI-fused single-UDF replacement (Section 9.2). When verified, MacroExpander uses this " + + "instead of inlining the body." + ) + var fusion: Option[MacroFusion] = None + + override def getPhysicalOp( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ) = + throw new IllegalStateException( + s"MacroOpDesc[$macroId] must be expanded by MacroExpander before physical-plan " + + s"compilation. This is a programmer error: the pre-compile expansion pass did not run." + ) + + override def getPhysicalPlan( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ): PhysicalPlan = + throw new IllegalStateException( + s"MacroOpDesc[$macroId] must be expanded by MacroExpander before physical-plan " + + s"compilation. This is a programmer error: the pre-compile expansion pass did not run." 
+ ) + + override def operatorInfo: OperatorInfo = OperatorInfo( + userFriendlyName = if (displayName.nonEmpty) displayName else "Macro", + operatorDescription = "Composite operator: a reusable, encapsulated sub-workflow.", + operatorGroupName = OperatorGroupConstants.UTILITY_GROUP, + inputPorts = (0 until inputPortCount).toList.map(i => InputPort(PortIdentity(i))), + outputPorts = (0 until outputPortCount).toList.map(i => OutputPort(PortIdentity(i))) + ) +} + +object MacroOpDesc { + // Link modes — strings rather than an enum to keep Jackson serialization trivial. + val LIVE: String = "LIVE" + val SNAPSHOT: String = "SNAPSHOT" +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroOutputOp.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroOutputOp.scala new file mode 100644 index 00000000000..dc3ffd8b45e --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroOutputOp.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.macroOp + +import com.fasterxml.jackson.annotation.{JsonIgnoreProperties, JsonProperty, JsonPropertyDescription} +import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow.{InputPort, PhysicalPlan, PortIdentity} +import org.apache.texera.amber.operator.LogicalOp +import org.apache.texera.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} + +// Boundary marker that lives only inside a macro body. Represents external output port +// `portIndex` of the macro: tuples flowing into this marker are emitted out of that +// external port. MacroExpander consumes these markers when splicing the body into the +// parent plan and drops them from the expanded plan. +// +// Ignore `inputPorts` / `outputPorts` on the wire: see MacroInputOp for the +// rationale (operatorInfo always declares the marker's single input port 0; the +// PortDescription/PortIdentity mismatch would otherwise break MacroBody parsing). +@JsonIgnoreProperties(Array("inputPorts", "outputPorts")) +class MacroOutputOp extends LogicalOp { + + @JsonProperty(value = "portIndex", required = true) + @JsonSchemaTitle("Port Index") + @JsonPropertyDescription("Which external output port (0-based) this marker represents.") + var portIndex: Int = 0 + + @JsonProperty(value = "displayName") + @JsonSchemaTitle("Display Name") + var displayName: String = "" + + override def getPhysicalOp( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ) = + throw new IllegalStateException( + s"MacroOutputOp(portIndex=$portIndex) must be consumed by MacroExpander before " + + s"physical-plan compilation. Markers cannot be compiled directly." 
+ ) + + override def getPhysicalPlan( + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity + ): PhysicalPlan = + throw new IllegalStateException( + s"MacroOutputOp(portIndex=$portIndex) must be consumed by MacroExpander before " + + s"physical-plan compilation. Markers cannot be compiled directly." + ) + + override def operatorInfo: OperatorInfo = OperatorInfo( + userFriendlyName = if (displayName.nonEmpty) displayName else s"Output $portIndex", + operatorDescription = + "Macro output boundary marker. External output port; consumed by MacroExpander.", + operatorGroupName = OperatorGroupConstants.UTILITY_GROUP, + inputPorts = List(InputPort(PortIdentity(0))), + outputPorts = List.empty + ) +} diff --git a/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroPortSpec.scala b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroPortSpec.scala new file mode 100644 index 00000000000..92099fc7a56 --- /dev/null +++ b/common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/macroOp/MacroPortSpec.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.macroOp + +case class MacroPortSpec( + index: Int, + displayName: String = "" +) diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/macroOp/MacroOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/macroOp/MacroOpDescSpec.scala new file mode 100644 index 00000000000..3feb4aa8471 --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/macroOp/MacroOpDescSpec.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.operator.macroOp + +import org.apache.texera.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import org.apache.texera.amber.core.workflow.PortIdentity +import org.apache.texera.amber.operator.LogicalOp +import org.apache.texera.amber.operator.limit.LimitOpDesc +import org.apache.texera.amber.util.JSONUtils.objectMapper +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class MacroOpDescSpec extends AnyFlatSpec with Matchers { + + "MacroOpDesc" should "round-trip through Jackson with all fields preserved" in { + val inner = new LimitOpDesc + inner.limit = 5 + inner.setOperatorId("inner-limit") + + val body = MacroBody( + operators = List(makeInputMarker(0, "in"), inner, makeOutputMarker(0, "out")), + links = List( + MacroLink("in", PortIdentity(0), "inner-limit", PortIdentity(0)), + MacroLink("inner-limit", PortIdentity(0), "out", PortIdentity(0)) + ), + inputs = List(MacroPortSpec(0, "the-input")), + outputs = List(MacroPortSpec(0, "the-output")) + ) + + val m = new MacroOpDesc + m.macroId = "wid-42" + m.macroVersion = 7 + m.linkMode = MacroOpDesc.SNAPSHOT + m.snapshot = Some(body) + m.inputPortCount = 1 + m.outputPortCount = 1 + m.displayName = "MyMacro" + m.setOperatorId("macro-instance-1") + + val json = objectMapper.writeValueAsString(m.asInstanceOf[LogicalOp]) + val restored = objectMapper.readValue(json, classOf[LogicalOp]) + + restored shouldBe a[MacroOpDesc] + val r = restored.asInstanceOf[MacroOpDesc] + r.macroId shouldBe "wid-42" + r.macroVersion shouldBe 7 + r.linkMode shouldBe MacroOpDesc.SNAPSHOT + r.inputPortCount shouldBe 1 + r.outputPortCount shouldBe 1 + r.displayName shouldBe "MyMacro" + r.operatorIdentifier.id shouldBe "macro-instance-1" + + r.snapshot shouldBe defined + val rb = r.snapshot.get + rb.operators should have size 3 + rb.links should have size 2 + rb.inputs shouldBe body.inputs + rb.outputs shouldBe body.outputs + + // Polymorphic 
round-trip: inner ops keep their concrete types. + rb.operators.collect { case l: LimitOpDesc => l.limit } shouldBe List(5) + rb.operators.collect { case i: MacroInputOp => i.portIndex } shouldBe List(0) + rb.operators.collect { case o: MacroOutputOp => o.portIndex } shouldBe List(0) + } + + it should "throw on getPhysicalPlan / getPhysicalOp because expansion must run first" in { + val m = new MacroOpDesc + m.macroId = "x" + val wid = WorkflowIdentity(0L) + val eid = ExecutionIdentity(0L) + assertThrows[IllegalStateException] { m.getPhysicalPlan(wid, eid) } + assertThrows[IllegalStateException] { m.getPhysicalOp(wid, eid) } + } + + "MacroInputOp / MacroOutputOp" should "round-trip and throw on compile" in { + val in = makeInputMarker(2, "in-2") + val out = makeOutputMarker(3, "out-3") + val inJson = objectMapper.writeValueAsString(in.asInstanceOf[LogicalOp]) + val outJson = objectMapper.writeValueAsString(out.asInstanceOf[LogicalOp]) + + val restoredIn = + objectMapper.readValue(inJson, classOf[LogicalOp]).asInstanceOf[MacroInputOp] + val restoredOut = + objectMapper.readValue(outJson, classOf[LogicalOp]).asInstanceOf[MacroOutputOp] + restoredIn.portIndex shouldBe 2 + restoredOut.portIndex shouldBe 3 + + val wid = WorkflowIdentity(0L) + val eid = ExecutionIdentity(0L) + assertThrows[IllegalStateException] { restoredIn.getPhysicalPlan(wid, eid) } + assertThrows[IllegalStateException] { restoredOut.getPhysicalPlan(wid, eid) } + } + + "MacroOpDesc.operatorInfo" should "expose ports matching inputPortCount/outputPortCount" in { + val m = new MacroOpDesc + m.inputPortCount = 2 + m.outputPortCount = 3 + val info = m.operatorInfo + info.inputPorts.map(_.id.id) shouldBe List(0, 1) + info.outputPorts.map(_.id.id) shouldBe List(0, 1, 2) + } + + private def makeInputMarker(idx: Int, id: String): MacroInputOp = { + val m = new MacroInputOp + m.portIndex = idx + m.setOperatorId(id) + m + } + + private def makeOutputMarker(idx: Int, id: String): MacroOutputOp = { + val m = new 
MacroOutputOp + m.portIndex = idx + m.setOperatorId(id) + m + } +} diff --git a/frontend/project/build.properties b/frontend/project/build.properties new file mode 100644 index 00000000000..10fd9eee04a --- /dev/null +++ b/frontend/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.5.5 diff --git a/frontend/src/app/app-routing.module.ts b/frontend/src/app/app-routing.module.ts index 179caf5c088..4a495f3b88c 100644 --- a/frontend/src/app/app-routing.module.ts +++ b/frontend/src/app/app-routing.module.ts @@ -119,6 +119,13 @@ routes.push({ path: "workflow/:id", component: WorkspaceComponent, }, + { + // Drill-down editor for a macro's body. `id` carries the parent + // workflow's wid so we can render breadcrumbs / route the user back; + // `macroId` is the actual definition being edited. + path: "workflow/:id/macro/:macroId", + component: WorkspaceComponent, + }, { path: "dataset", component: UserDatasetComponent, diff --git a/frontend/src/app/workspace/component/left-panel/operator-menu/operator-menu.component.html b/frontend/src/app/workspace/component/left-panel/operator-menu/operator-menu.component.html index ff8bb97a296..282485891d2 100644 --- a/frontend/src/app/workspace/component/left-panel/operator-menu/operator-menu.component.html +++ b/frontend/src/app/workspace/component/left-panel/operator-menu/operator-menu.component.html @@ -40,8 +40,158 @@
+ +
+ +
+
+ {{ suggestions.length }} candidate{{ suggestions.length === 1 ? '' : 's' }} + +
+
+
+ ⟲ pattern + {{ suggestion.suggestedName }} +
+
{{ suggestion.rationale }}
+
+ + {{ suggestion.confidence === 'recommended' ? '✓ recommended' + : suggestion.confidence === 'strong' ? 'strong fit' : 'good fit' }} + + · + {{ suggestion.operatorIds.length }} ops +
+
+
+
+ + + + + + +
+ + +
+ + + +
+ +
+ + + {{ macroSchema.additionalMetadata.userFriendlyName }} + + + {{ macroSchema.__macroSummary!.usageCount }}× used + + + {{ macroSchema.additionalMetadata.inputPorts.length }} in / + {{ macroSchema.additionalMetadata.outputPorts.length }} out + + +
+
+
+
+ >(); public groupNames: ReadonlyArray = []; + // The user's saved macros — surfaced as a "Your Macros" section in the + // palette so they can be reused on other workflows by clicking the entry. + // We use the existing operator-label rendering by exposing each macro as + // an OperatorSchema-shaped object whose operatorType is the literal + // "Macro" and whose userFriendlyName is the macro name. The drag/click + // handler peeks at `__macroSummary` on the schema to fill in macroId, + // inputPortCount, outputPortCount when instantiating the operator + // predicate. + public macroList: (OperatorSchema & { __macroSummary?: MacroSummary })[] = []; + // Search-box filter applied to `macroList` in the template. Case-insensitive + // substring match on the macro's display name. Empty string = show all. + public macroFilterText: string = ""; + public get filteredMacroList(): (OperatorSchema & { __macroSummary?: MacroSummary })[] { + const q = this.macroFilterText.trim().toLowerCase(); + if (q.length === 0) return this.macroList; + return this.macroList.filter(m => + (m.additionalMetadata.userFriendlyName || "").toLowerCase().includes(q) + ); + } + + // REMOVED: per-macro categorization + op-chain subtitle. + // + // These features lazily called `getMacro(wid)` from inside Angular template + // bindings on every change-detection cycle while the cache was unfilled, + // which on a workflow that opens with many macros DDoS'd the browser's + // fetch pool with ERR_INSUFFICIENT_RESOURCES, starving the websocket and + // compile requests. A proper implementation needs the data on the backend + // MacroSummary (so we get it in one round-trip), not per-macro fetches + // from the palette renderer. Until that's done, the palette stays a flat + // list with just name + usage chip + ports + export button. + + // Inline panel for "AI" macro suggestions. Populated on user click, then + // cleared after a selection is materialized. Empty list means panel is + // collapsed. 
+ public suggestions: MacroSuggestion[] = []; + public isSuggesting: boolean = false; + // Proactive count — how many candidates the heuristic would surface RIGHT + // NOW if the user clicked the button. Refreshed whenever the canvas changes + // (with a short debounce). Surfaced as a small chip on the Suggest button + // so the user sees "4 candidates found" without having to click. This is + // the "agent is watching your workflow" feel. + public availableCandidateCount: number = 0; + // input value of the search input box public searchInputValue: string = ""; // search autocomplete suggestion list @@ -81,8 +133,20 @@ export class OperatorMenuComponent { private operatorMetadataService: OperatorMetadataService, private workflowActionService: WorkflowActionService, private workflowUtilService: WorkflowUtilService, - private dragDropService: DragDropService + private dragDropService: DragDropService, + private macroService: MacroService, + private macroSuggestionService: MacroSuggestionService, + private macroFusionService: MacroFusionService, + private jointUIService: JointUIService, + private message: NzMessageService ) { + // Load the user's saved macros for the "Your Macros" palette section. + this.macroService.listMacros().subscribe({ + next: (summaries: MacroSummary[]) => { + this.macroList = summaries.map(m => this.macroSummaryToSchema(m)); + }, + error: () => undefined, + }); // clear the search box if an operator is dropped from operator search box this.dragDropService.operatorDropStream.pipe(untilDestroyed(this)).subscribe(() => { this.searchInputValue = ""; @@ -92,6 +156,49 @@ export class OperatorMenuComponent { .getWorkflowModificationEnabledStream() .pipe(untilDestroyed(this)) .subscribe(canModify => (this.canModify = canModify)); + // Proactive macro-suggestion watcher: every time the workflow graph + // changes (add/delete/relink), debounce 700ms then run the heuristic + // suggester silently and update `availableCandidateCount`. 
The UI badges + // the Suggest button so the user discovers patterns without clicking. + // 700ms is long enough that mid-drag operator placements don't trigger + // a flicker, short enough to feel responsive after a click settles. + const refreshSuggestionCount = () => { + try { + const graph = this.workflowActionService.getTexeraGraph(); + const list = this.macroSuggestionService.suggestMacros(graph); + this.availableCandidateCount = list.length; + } catch { + this.availableCandidateCount = 0; + } + }; + let debounceHandle: ReturnType | null = null; + const scheduleRefresh = () => { + if (debounceHandle) clearTimeout(debounceHandle); + debounceHandle = setTimeout(refreshSuggestionCount, 700); + }; + this.workflowActionService + .getTexeraGraph() + .getOperatorAddStream() + .pipe(untilDestroyed(this)) + .subscribe(scheduleRefresh); + this.workflowActionService + .getTexeraGraph() + .getOperatorDeleteStream() + .pipe(untilDestroyed(this)) + .subscribe(scheduleRefresh); + this.workflowActionService + .getTexeraGraph() + .getLinkAddStream() + .pipe(untilDestroyed(this)) + .subscribe(scheduleRefresh); + this.workflowActionService + .getTexeraGraph() + .getLinkDeleteStream() + .pipe(untilDestroyed(this)) + .subscribe(scheduleRefresh); + // Kick off an initial scan once the canvas has settled. + setTimeout(refreshSuggestionCount, 1200); + this.operatorMetadataService .getOperatorMetadata() .pipe(untilDestroyed(this)) @@ -150,4 +257,479 @@ export class OperatorMenuComponent { this.autocompleteOptions = []; }, 0); } + + /** + * Adapt a backend `MacroSummary` into an `OperatorSchema`-shaped row the + * existing operator-label component can render. The macro's port count + * and definition wid are stashed on `__macroSummary` so click-to-add can + * build the right `OperatorPredicate` without re-fetching the macro. 
+ */ + private macroSummaryToSchema(m: MacroSummary): OperatorSchema & { __macroSummary: MacroSummary } { + return { + operatorType: "Macro", + jsonSchema: { type: "object", properties: {} } as unknown as OperatorSchema["jsonSchema"], + additionalMetadata: { + userFriendlyName: m.name, + operatorDescription: m.description ?? `Macro from workflow #${m.wid}`, + operatorGroupName: "Your Macros", + inputPorts: m.portSpec.inputs.map(p => ({ displayName: `in-${p.index}` })), + outputPorts: m.portSpec.outputs.map(p => ({ displayName: `out-${p.index}` })), + dynamicInputPorts: false, + dynamicOutputPorts: false, + supportReconfiguration: false, + allowPortCustomization: false, + } as unknown as OperatorSchema["additionalMetadata"], + operatorVersion: "", + __macroSummary: m, + }; + } + + /** + * Place a saved macro on the canvas. Builds a fresh `OperatorPredicate` + * matching the shape created by `swapSelectionWithMacroNode` so the + * downstream validation/render/execution paths see a normal Macro op. 
+ */ + public onAddMacro(macroSchema: OperatorSchema & { __macroSummary?: MacroSummary }): void { + const m = macroSchema.__macroSummary; + if (!m) return; + const inputPortCount = m.portSpec.inputs.length; + const outputPortCount = m.portSpec.outputs.length; + const inputPorts = Array.from({ length: inputPortCount }, (_, i) => ({ + portID: `input-${i}`, + displayName: `in-${i}`, + disallowMultiInputs: false, + isDynamicPort: false, + dependencies: [], + })); + const outputPorts = Array.from({ length: outputPortCount }, (_, i) => ({ + portID: `output-${i}`, + displayName: `out-${i}`, + disallowMultiInputs: false, + isDynamicPort: false, + })); + const predicate: OperatorPredicate = { + operatorID: `Macro-operator-${this.workflowUtilService.getOperatorRandomUUID()}`, + operatorType: "Macro", + operatorVersion: "", + operatorProperties: { + macroId: String(m.wid), + macroVersion: 1, + linkMode: "LIVE", + inputPortCount, + outputPortCount, + displayName: m.name, + // Mark this instance as in-sync-with the macro's CURRENT + // lastModifiedTime. If the macro is later edited, this stays put; + // the "refresh macro (stale)" context-menu item then surfaces. + macroSyncedAt: + typeof m.lastModifiedTime === "number" + ? m.lastModifiedTime + : new Date(m.lastModifiedTime as unknown as string).getTime(), + }, + inputPorts, + outputPorts, + showAdvanced: false, + isDisabled: false, + customDisplayName: m.name, + dynamicInputPorts: false, + dynamicOutputPorts: false, + }; + const origin = this.workflowActionService.getJointGraphWrapper().getMainJointPaper()?.translate(); + const point = { x: 400 - (origin?.tx ?? 0), y: 200 - (origin?.ty ?? 0) }; + this.workflowActionService.addOperator(predicate, point); + } + + /** + * "Suggest Macros (AI)" button — runs the heuristic suggester over the + * current canvas and surfaces ranked candidates in the inline panel. 
+ * v1 is local heuristics; a future swap to chat-assistant-service for + * LLM-ranked candidates would replace this body with an HTTP call that + * returns the same `MacroSuggestion[]` shape. + */ + public onSuggestMacros(): void { + this.isSuggesting = true; + // Defer to next tick so the spinner can paint — heuristic is fast (<10ms) + // but pretending it's "thinking" matches the AI-agent UX the demo wants. + setTimeout(() => { + try { + const graph = this.workflowActionService.getTexeraGraph(); + this.suggestions = this.macroSuggestionService.suggestMacros(graph); + if (this.suggestions.length === 0) { + this.message.info("No good macro candidates found. Try adding more operators!"); + } else { + this.message.success(`Found ${this.suggestions.length} candidate(s).`); + // Highlight the suggestion's operators on the canvas so the user + // sees which ops would be encapsulated. Limit to the top suggestion + // to avoid overwhelming the canvas. + const jw = this.workflowActionService.getJointGraphWrapper(); + jw.unhighlightOperators(...jw.getCurrentHighlightedOperatorIDs()); + jw.setMultiSelectMode(true); + jw.highlightOperators(...this.suggestions[0].operatorIds); + } + } finally { + this.isSuggesting = false; + } + }, 250); + } + + /** + * Materialize a suggested macro directly: call + * `MacroService.createMacroFromSelection` to build the definition, POST + * it, and swap the selection on the canvas with a single Macro op — same + * shape the right-click → Create Macro path produces. Pre-fix this only + * highlighted+selected the operators and asked the user to right-click; + * doing it inline removes one step from the demo and reads more like an + * agent action. + * + * When the suggestion is a *recurring pattern* (id starts with "pattern-"), + * we also offer to swap the other occurrences of the same pattern with + * fresh instances of the same macro — the "agent did the refactor for me" + * demo moment. 
The peer occurrences are detected on-the-fly by re-running
+   * the suggester and matching on `suggestedName` (the pattern signature is
+   * the same across all occurrences).
+   */
+  public onMaterializeSuggestion(suggestion: MacroSuggestion): void {
+    const chosenName = window.prompt("Macro name", suggestion.suggestedName || `macro-${Date.now()}`);
+    if (!chosenName) return;
+
+    // Snapshot the sibling occurrences before the canvas is mutated: the IDs
+    // in `this.suggestions` are still valid at this point. Two suggestions
+    // are the same occurrence when their operator-id lists join identically.
+    const signatureOf = (s: MacroSuggestion): string => s.operatorIds.join("|");
+    let peerOccurrences: MacroSuggestion[] = [];
+    if (suggestion.id.startsWith("pattern-")) {
+      const ownSignature = signatureOf(suggestion);
+      peerOccurrences = this.suggestions.filter(
+        candidate =>
+          candidate.id.startsWith("pattern-") &&
+          candidate.suggestedName === suggestion.suggestedName &&
+          signatureOf(candidate) !== ownSignature
+      );
+    }
+
+    this.macroService
+      .createMacroFromSelection(this.workflowActionService, suggestion.operatorIds, chosenName)
+      .subscribe({
+        next: detail => {
+          this.message.success(`Created macro "${detail.name}" (wid=${detail.wid})`);
+          this.suggestions = [];
+          if (peerOccurrences.length === 0) return;
+
+          // Swap each remaining occurrence in order; every swap reports its
+          // own success, so one mismatching shape does not stop the others.
+          const outcomes = peerOccurrences.map(peer =>
+            this.macroService.swapSelectionWithExistingMacro(this.workflowActionService, detail, peer.operatorIds)
+          );
+          const swapped = outcomes.filter(ok => ok).length;
+          const skipped = outcomes.length - swapped;
+
+          if (swapped > 0) {
+            const plural = swapped === 1 ? "" : "s";
+            const skipNote = skipped > 0 ? ` (${skipped} skipped — shape didn't match)` : "";
+            this.message.success(
+              `Refactored ${swapped} additional occurrence${plural} to use "${detail.name}"${skipNote}`
+            );
+          } else if (skipped > 0) {
+            this.message.warning(
+              `Could not auto-refactor the other ${skipped} occurrence(s); shapes didn't match the macro's ports.`
+            );
+          }
+        },
+        error: err => this.message.error(`Failed to create macro: ${err?.message ?? err}`),
+      });
+  }
+
+  /** Clear the suggestion list. */
+  public dismissSuggestions(): void {
+    this.suggestions = [];
+  }
+
+  /**
+   * Highlight set that was active when the pointer entered a suggestion row.
+   * `onSuggestionHover` fills it; `onSuggestionUnhover` restores it and
+   * empties it again.
+   */
+  private preHoverHighlight: string[] = [];
+
+  /**
+   * Preview a suggestion: remember the current highlight, clear it, and
+   * highlight the suggestion's operators through the JointGraphWrapper.
+   */
+  public onSuggestionHover(suggestion: MacroSuggestion): void {
+    const wrapper = this.workflowActionService.getJointGraphWrapper();
+    const highlightedBefore = Array.from(wrapper.getCurrentHighlightedOperatorIDs());
+    this.preHoverHighlight = highlightedBefore;
+    wrapper.unhighlightOperators(...highlightedBefore);
+    wrapper.setMultiSelectMode(true);
+    wrapper.highlightOperators(...suggestion.operatorIds);
+  }
+
+  /**
+   * End the preview: drop whatever is highlighted now and bring back the
+   * highlight set stashed by `onSuggestionHover`, if there was one.
+   */
+  public onSuggestionUnhover(): void {
+    const wrapper = this.workflowActionService.getJointGraphWrapper();
+    wrapper.unhighlightOperators(...wrapper.getCurrentHighlightedOperatorIDs());
+    const toRestore = this.preHoverHighlight;
+    if (toRestore.length > 0) {
+      wrapper.highlightOperators(...toRestore);
+    }
+    this.preHoverHighlight = [];
+  }
+
+  /** Currently-running "fuse all" indicator — disables the button and renders progress. */
+  public fuseAllInProgress: boolean = false;
+  /** Count of Macro ops on the current canvas that are NOT yet fused. Drives the button label. 
+   */
+  public unfusedMacroCountOnCanvas(): number {
+    try {
+      const graph = this.workflowActionService.getTexeraGraph();
+      // A Macro counts as unfused unless its `fusion.verified` is exactly true.
+      return graph.getAllOperators().filter(op => {
+        if (op.operatorType !== "Macro") return false;
+        const f = op.operatorProperties?.["fusion"] as { verified?: boolean } | undefined;
+        return f?.verified !== true;
+      }).length;
+    } catch {
+      // Any failure while reading the graph is reported as "nothing to fuse".
+      return 0;
+    }
+  }
+
+  /** True while the multi-step auto-optimize agent is running. Disables the button + suggest panel. */
+  public autoOptimizeInProgress: boolean = false;
+
+  /**
+   * "Auto-optimize workflow" — the omni-agent action. Runs in sequence:
+   *   1. Pattern + chain detection (same as Suggest Macros).
+   *   2. Materialize the top K (default 3) candidates: create macros and
+   *      collapse the matching sub-DAGs.
+   *   3. Fuse every Macro op on the canvas (Fuse All).
+   *
+   * This is the demo's "do the whole refactor for me" moment. Progress is
+   * surfaced via a stepwise message stream so the user sees what the agent
+   * is doing.
+   *
+   * No-op while a previous run is still in progress. When the suggester
+   * finds nothing, only step 3 runs. A pattern that fails to materialize is
+   * reported and skipped; it never aborts the remaining patterns.
+   */
+  public onAutoOptimizeWorkflow(): void {
+    if (this.autoOptimizeInProgress) return;
+    const graph = this.workflowActionService.getTexeraGraph();
+    const suggestions = this.macroSuggestionService.suggestMacros(graph);
+    if (suggestions.length === 0) {
+      this.onFuseAllMacros();
+      return;
+    }
+    // Group suggestions by `suggestedName` so all occurrences of a pattern
+    // share one macro definition. Take the top K distinct patterns by the
+    // highest-scoring occurrence of each. This is how a single click can
+    // batch-refactor 6 occurrences into 1 macro definition + 6 instances —
+    // the demo's killer "agent did the refactor for me" moment.
+    const distinctPatterns = new Map<MacroSuggestion["suggestedName"], MacroSuggestion[]>();
+    for (const s of suggestions) {
+      const group = distinctPatterns.get(s.suggestedName);
+      if (group) group.push(s);
+      else distinctPatterns.set(s.suggestedName, [s]);
+    }
+    const topK = 3;
+    // Rank by each group's best score rather than by whichever occurrence
+    // happens to be first, so the ranking does not depend on suggester order.
+    const bestScore = (group: MacroSuggestion[]): number =>
+      group.reduce((best, s) => Math.max(best, s.score), -Infinity);
+    const patternGroups = Array.from(distinctPatterns.values())
+      .sort((a, b) => bestScore(b) - bestScore(a))
+      .slice(0, topK);
+
+    this.autoOptimizeInProgress = true;
+    const patternCount = patternGroups.length;
+    const totalOccurrences = patternGroups.reduce((sum, g) => sum + g.length, 0);
+    this.message.info(
+      `🚀 Auto-optimize: extracting ${patternCount} pattern${patternCount === 1 ? "" : "s"} ` +
+        `(${totalOccurrences} occurrence${totalOccurrences === 1 ? "" : "s"})…`
+    );
+
+    /**
+     * For one pattern group: create the macro definition from the FIRST
+     * occurrence, then swap every remaining occurrence with a fresh
+     * instance of the same definition. The returned Promise always
+     * resolves — on success, on a create error, on a throwing swap, and
+     * when the source completes without emitting — so the batch can never
+     * stall with `autoOptimizeInProgress` stuck at true.
+     */
+    const materializePattern = (group: MacroSuggestion[]): Promise<void> =>
+      new Promise<void>(resolve => {
+        // Filter to occurrences whose operators are still on the graph
+        // (a previous pattern's extract may have consumed some of these).
+        const alive = group.filter(s =>
+          s.operatorIds.every(opId => {
+            try {
+              return graph.getOperator(opId) !== undefined;
+            } catch {
+              return false;
+            }
+          })
+        );
+        if (alive.length === 0) return resolve();
+        const first = alive[0];
+        const name = first.suggestedName || `macro-${Date.now()}`;
+        this.macroService
+          .createMacroFromSelection(this.workflowActionService, first.operatorIds, name)
+          .subscribe({
+            next: detail => {
+              try {
+                let extraSwapped = 0;
+                let extraSkipped = 0;
+                for (const peer of alive.slice(1)) {
+                  const ok = this.macroService.swapSelectionWithExistingMacro(
+                    this.workflowActionService,
+                    detail,
+                    peer.operatorIds
+                  );
+                  if (ok) extraSwapped++;
+                  else extraSkipped++;
+                }
+                this.message.info(
+                  ` ✓ Extracted "${detail.name}"` +
+                    (extraSwapped > 0
+                      ? ` (and refactored ${extraSwapped} other occurrence${extraSwapped === 1 ? "" : "s"})`
+                      : "") +
+                    (extraSkipped > 0 ? `; ${extraSkipped} shape-mismatched skipped` : "")
+                );
+              } catch (e) {
+                // An exception inside a subscriber callback would otherwise
+                // leave this promise pending forever.
+                this.message.warning(` ✗ Skipped pattern "${name}": ${(e as Error)?.message ?? e}`);
+              }
+              resolve();
+            },
+            error: err => {
+              this.message.warning(` ✗ Skipped pattern "${name}": ${err?.message ?? err}`);
+              resolve(); // soft-fail so a bad pattern doesn't abort the batch
+            },
+            // Completing without a value must still release the batch;
+            // resolving an already-resolved promise is a no-op.
+            complete: () => resolve(),
+          });
+      });
+
+    // Process the groups strictly one after another: each extract mutates the canvas.
+    const materializeAll = (i: number): Promise<void> =>
+      i >= patternGroups.length
+        ? Promise.resolve()
+        : materializePattern(patternGroups[i]).then(() => materializeAll(i + 1));
+
+    materializeAll(0).then(
+      () => {
+        this.autoOptimizeInProgress = false;
+        // Now fuse everything on the canvas, including the newly-created macros.
+        this.onFuseAllMacros();
+      },
+      err => {
+        this.autoOptimizeInProgress = false;
+        this.message.error(`Auto-optimize failed: ${err?.message ?? err}`);
+      }
+    );
+  }
+
+  /**
+   * "Fuse all macros in workflow" — the one-click batch perf optimization.
+   * Walks every Macro op on the parent canvas, calls MacroFusionService for
+   * each, stamps the resulting fusion onto operatorProperties, and refreshes
+   * the canvas visual. Errors per-macro are surfaced individually so a
+   * single un-fusable macro doesn't abort the batch. 
+   */
+  public onFuseAllMacros(): void {
+    // Ignore re-entrant clicks while a batch is already running.
+    if (this.fuseAllInProgress) return;
+    const graph = this.workflowActionService.getTexeraGraph();
+    // Candidates: Macro operators whose `fusion.verified` is not exactly true.
+    const macros = graph.getAllOperators().filter(op => {
+      if (op.operatorType !== "Macro") return false;
+      const f = op.operatorProperties?.["fusion"] as { verified?: boolean } | undefined;
+      return f?.verified !== true;
+    });
+    if (macros.length === 0) {
+      this.message.info("No fusable macros on the canvas.");
+      return;
+    }
+    this.fuseAllInProgress = true;
+    // One request per macro, in the same order as `macros`. Failures are
+    // turned into `{ fused: false }` placeholder values so the combined
+    // stream below still emits a result for every macro.
+    const requests = macros.map(op => {
+      const macroId = op.operatorProperties?.["macroId"] as string | undefined;
+      if (!macroId) return of({ opId: op.operatorID, fused: false, reason: "no macroId" });
+      return this.macroFusionService.generateFusion(macroId).pipe(
+        catchError(err => of({ opId: op.operatorID, fused: false, reason: String(err?.message ?? err) }))
+      );
+    });
+    forkJoin(requests).subscribe({
+      next: results => {
+        let fusedCount = 0;
+        let failedCount = 0;
+        const paper = this.workflowActionService.getJointGraphWrapper().getMainJointPaper();
+        // `results[idx]` belongs to `macros[idx]`.
+        results.forEach((result: any, idx: number) => {
+          const macroOp = macros[idx];
+          // Placeholder failures and unverified fusions both count as failed.
+          if (result.fused === false || !result.verified) {
+            failedCount++;
+            return;
+          }
+          // result is a FusionResult
+          const newProps = {
+            ...macroOp.operatorProperties,
+            fusion: this.macroFusionService.toFusionPayload(result),
+          };
+          this.workflowActionService.setOperatorProperty(macroOp.operatorID, newProps);
+          if (paper) {
+            this.jointUIService.refreshMacroFusionStyle(
+              paper,
+              macroOp.operatorID,
+              true,
+              result.estimatedSpeedup
+            );
+          }
+          fusedCount++;
+        });
+        if (fusedCount > 0) {
+          this.message.success(
+            `Fused ${fusedCount} macro${fusedCount === 1 ? "" : "s"} for performance` +
+              (failedCount > 0 ? ` (${failedCount} skipped)` : "")
+          );
+        } else {
+          this.message.warning(`No macros could be fused (${failedCount} failed).`);
+        }
+        this.fuseAllInProgress = false;
+      },
+      error: err => {
+        this.fuseAllInProgress = false;
+        this.message.error(`Batch fuse failed: ${err?.message ?? err}`);
+      },
+    });
+  }
+
+  /** Reference to the hidden file input — clicked programmatically by `onTriggerImportMacro`. */
+  @ViewChild("macroImportFile") macroImportFile?: ElementRef;
+
+  /**
+   * Trigger a browser download of one macro definition. Exposed off the
+   * palette item's small "⤓" affordance. The actual HTTP fetch + Blob
+   * creation lives in `MacroService.exportMacroToFile`.
+   *
+   * @param summary the macro to export; its `wid` selects the definition and
+   *   its `name` is used in the success message.
+   */
+  public onExportMacro(summary: MacroSummary): void {
+    this.macroService.exportMacroToFile(summary.wid).subscribe({
+      next: () => this.message.success(`Exported "${summary.name}".`),
+      error: err => this.message.error(`Export failed: ${err?.message ?? err}`),
+    });
+  }
+
+  /**
+   * Open the OS file picker for macro JSON files. The change handler is
+   * `onImportMacroFile`. Using a hidden file input + button is the standard
+   * dance for getting a click-styled "Upload" affordance.
+   */
+  public onTriggerImportMacro(): void {
+    this.macroImportFile?.nativeElement.click();
+  }
+
+  /**
+   * File picker callback — read the JSON, POST it as a fresh macro
+   * definition, and refresh the "Your Macros" palette so the imported
+   * macro shows up immediately.
+   *
+   * @param event change event whose target is the file input; only the
+   *   first selected file is read.
+   */
+  public onImportMacroFile(event: Event): void {
+    const input = event.target as HTMLInputElement;
+    const file = input.files?.[0];
+    if (!file) return;
+    const reader = new FileReader();
+    reader.onload = () => {
+      const text = String(reader.result);
+      try {
+        this.macroService.importMacroFromJson(text).subscribe({
+          next: detail => {
+            this.message.success(`Imported macro "${detail.name}" (wid=${detail.wid})`);
+            // Refresh the palette to surface the new macro.
+            this.macroService.listMacros().subscribe({
+              next: (summaries: MacroSummary[]) => {
+                this.macroList = summaries.map(m => this.macroSummaryToSchema(m));
+              },
+            });
+          },
+          error: err => this.message.error(`Import failed: ${err?.message ?? err}`),
+        });
+      } catch (e: any) {
+        // Reached only when `importMacroFromJson` (or `subscribe`) throws synchronously.
+        this.message.error(`Import failed: ${e?.message ?? e}`);
+      } finally {
+        // Reset so the same file can be re-picked if needed.
+        input.value = "";
+      }
+    };
+    reader.readAsText(file);
+  }
 }
diff --git a/frontend/src/app/workspace/component/workflow-editor/context-menu/context-menu/context-menu.component.html b/frontend/src/app/workspace/component/workflow-editor/context-menu/context-menu/context-menu.component.html
index 4465d65cb27..ec54c86db6d 100644
--- a/frontend/src/app/workspace/component/workflow-editor/context-menu/context-menu/context-menu.component.html
+++ b/frontend/src/app/workspace/component/workflow-editor/context-menu/context-menu/context-menu.component.html
@@ -123,6 +123,59 @@ nzTheme="twotone">remove reusing result +
  • + create macro +
  • +
  • + expand macro +
  • +
  • + fuse for performance (AI) +
  • +
  • + unfuse (restore body) +
  • +
  • + refresh macro (stale) +
• {
+      try {
+        return this.workflowActionService.getTexeraGraph().getOperator(opId);
+      } catch {
+        return undefined;
+      }
+    })();
+    return op?.operatorType === "Macro" && typeof op.operatorProperties?.["macroId"] === "string";
+  }
+
+  /**
+   * Replace the first highlighted Macro operator with its body: load the
+   * macro definition by id via `MacroService.getMacro`, then hand it to
+   * `inlineMacroBody`. Every failure is reported through the notification
+   * service; nothing is thrown to the caller.
+   */
+  public onExpandMacro(): void {
+    const opId = this.highlightedOperatorIds[0];
+    if (!opId) return;
+    const graph = this.workflowActionService.getTexeraGraph();
+    // A failing lookup is treated the same as "operator not found".
+    const macroOp = (() => {
+      try {
+        return graph.getOperator(opId);
+      } catch {
+        return undefined;
+      }
+    })();
+    if (!macroOp) return;
+    const macroId = macroOp.operatorProperties?.["macroId"];
+    if (typeof macroId !== "string" || macroId.length === 0) {
+      this.notificationService.error("Macro has no macroId — can't expand.");
+      return;
+    }
+    // `macroId` is stored as a string; `getMacro` is called with its numeric form.
+    const widNum = Number(macroId);
+    if (!Number.isFinite(widNum)) {
+      this.notificationService.error(`Invalid macroId: ${macroId}`);
+      return;
+    }
+    this.macroService
+      .getMacro(widNum)
+      .pipe(untilDestroyed(this))
+      .subscribe({
+        next: detail => {
+          try {
+            this.inlineMacroBody(macroOp, detail);
+            this.notificationService.success(`Expanded "${detail.name}" onto the canvas.`);
+          } catch (e) {
+            this.notificationService.error(`Expand failed: ${(e as Error)?.message ?? e}`);
+          }
+        },
+        error: err => this.notificationService.error(`Failed to load macro body: ${err?.message ?? err}`),
+      });
+  }
+
+  /**
+   * Inline the macro's body operators + links onto the parent canvas, rewire
+   * external links so each one targets the right boundary inner op + port,
+   * and remove the macro op + its outer links. New unique IDs are assigned
+   * to body operators so re-expanding the same macro elsewhere doesn't
+   * collide.
+   *
+   * Layout: body ops are laid out around the macro op's former position.
+   * Crude column layout (input markers → inner → output markers) gets the
+   * job done without a real layout pass. 
+   */
+  private inlineMacroBody(macroOp: OperatorPredicate, detail: MacroDetail): void {
+    const graph = this.workflowActionService.getTexeraGraph();
+    // Parse the body via the existing macroDetailToWorkflow normalizer so we
+    // get OperatorPredicate-shaped ops and OperatorLink-shaped links.
+    const macroWorkflow = this.macroService.macroDetailToWorkflow(detail);
+    // Split the body into real operators and the MacroInput / MacroOutput
+    // boundary markers; markers are never placed on the canvas.
+    const bodyOps = macroWorkflow.content.operators.filter(
+      o => o.operatorType !== "MacroInput" && o.operatorType !== "MacroOutput"
+    );
+    const inputMarkers = macroWorkflow.content.operators.filter(o => o.operatorType === "MacroInput");
+    const outputMarkers = macroWorkflow.content.operators.filter(o => o.operatorType === "MacroOutput");
+    const markerIds = new Set([...inputMarkers, ...outputMarkers].map(o => o.operatorID));
+
+    // Assign fresh IDs to inner ops so re-using the same macro elsewhere
+    // doesn't collide. Map body-relative ID → fresh canvas ID.
+    const idRewrite = new Map();
+    bodyOps.forEach(op => {
+      const fresh = `${op.operatorType}-operator-${this.workflowUtilService.getOperatorRandomUUID()}`;
+      idRewrite.set(op.operatorID, fresh);
+    });
+
+    // Anchor positions around the macro's old location (crude column layout).
+    const macroPos = this.workflowActionService.getJointGraphWrapper().getElementPosition(macroOp.operatorID);
+    const baseX = macroPos.x;
+    const baseY = macroPos.y;
+    const colSpacing = 180;
+    const rowSpacing = 120;
+
+    // Three operators per row: x steps by `colSpacing` within a row, y steps
+    // by `rowSpacing` for every further group of three.
+    const positionedOps: { op: OperatorPredicate; pos: Point }[] = bodyOps.map((op, idx) => ({
+      op: { ...op, operatorID: idRewrite.get(op.operatorID)! },
+      pos: { x: baseX + (idx % 3) * colSpacing, y: baseY + Math.floor(idx / 3) * rowSpacing },
+    }));
+
+    // Internal links (not touching marker ops). Rewrite both endpoints.
+    const internalLinks = macroWorkflow.content.links
+      .filter(l => !markerIds.has(l.source.operatorID) && !markerIds.has(l.target.operatorID))
+      .map(l => ({
+        linkID: this.workflowUtilService.getLinkRandomUUID(),
+        source: { operatorID: idRewrite.get(l.source.operatorID)!, portID: l.source.portID },
+        target: { operatorID: idRewrite.get(l.target.operatorID)!, portID: l.target.portID },
+      }));
+
+    // Body links from MacroInput markers to inner ops give us (portIndex →
+    // [(innerOpId, innerPortID)]) — the same lookup table MacroExpander uses
+    // on the backend. We need it here to rewire each external incoming link
+    // (which currently terminates at `macroOp@port_X`) to the corresponding
+    // inner op port.
+    const inputBindings = new Map();
+    for (const m of inputMarkers) {
+      const portIndex = m.operatorProperties?.["portIndex"];
+      // Markers without a numeric `portIndex` cannot be bound and are skipped.
+      if (typeof portIndex !== "number") continue;
+      const consumers = macroWorkflow.content.links
+        .filter(l => l.source.operatorID === m.operatorID && !markerIds.has(l.target.operatorID))
+        .map(l => ({
+          innerOpId: idRewrite.get(l.target.operatorID)!,
+          innerPortID: l.target.portID,
+        }));
+      inputBindings.set(portIndex, consumers);
+    }
+    // Output side: portIndex → the inner (op, port) feeding that MacroOutput
+    // marker. Only the first matching body link is used as the producer.
+    const outputBindings = new Map();
+    for (const m of outputMarkers) {
+      const portIndex = m.operatorProperties?.["portIndex"];
+      if (typeof portIndex !== "number") continue;
+      const producer = macroWorkflow.content.links.find(
+        l => l.target.operatorID === m.operatorID && !markerIds.has(l.source.operatorID)
+      );
+      if (producer) {
+        outputBindings.set(portIndex, {
+          innerOpId: idRewrite.get(producer.source.operatorID)!,
+          innerPortID: producer.source.portID,
+        });
+      }
+    }
+
+    // Find the parent canvas links that touch the macro and need rewiring.
+    // Frontend port IDs are `input-i` / `output-j`; the trailing integer is
+    // the external port index we map against.
+    const portIdToIndex = (portID: string): number | undefined => {
+      const m = portID.match(/(\d+)$/);
+      return m ? Number(m[1]) : undefined;
+    };
+    const incomingRewires: { source: { operatorID: string; portID: string }; targets: { operatorID: string; portID: string }[] }[] = [];
+    const outgoingRewires: { source: { operatorID: string; portID: string }; target: { operatorID: string; portID: string } }[] = [];
+    for (const link of graph.getAllLinks()) {
+      if (link.target.operatorID === macroOp.operatorID) {
+        const portIndex = portIdToIndex(link.target.portID);
+        if (portIndex === undefined) continue;
+        // An input port with no binding yields an empty target list, i.e. the link is dropped.
+        const consumers = inputBindings.get(portIndex) ?? [];
+        incomingRewires.push({ source: link.source, targets: consumers.map(c => ({ operatorID: c.innerOpId, portID: c.innerPortID })) });
+      } else if (link.source.operatorID === macroOp.operatorID) {
+        const portIndex = portIdToIndex(link.source.portID);
+        if (portIndex === undefined) continue;
+        const producer = outputBindings.get(portIndex);
+        if (producer) {
+          outgoingRewires.push({ source: { operatorID: producer.innerOpId, portID: producer.innerPortID }, target: link.target });
+        }
+      }
+    }
+
+    // Apply all of it atomically so undo collapses to one step.
+    // Order: body ops + internal links, then the rewired boundary links,
+    // and only then the macro op itself.
+    graph.bundleActions(() => {
+      this.workflowActionService.addOperatorsAndLinks(positionedOps, internalLinks);
+      for (const rw of incomingRewires) {
+        for (const target of rw.targets) {
+          this.workflowActionService.addLink({
+            linkID: this.workflowUtilService.getLinkRandomUUID(),
+            source: rw.source,
+            target,
+          });
+        }
+      }
+      for (const rw of outgoingRewires) {
+        this.workflowActionService.addLink({
+          linkID: this.workflowUtilService.getLinkRandomUUID(),
+          source: rw.source,
+          target: rw.target,
+        });
+      }
+      this.workflowActionService.deleteOperatorsAndLinks([macroOp.operatorID]);
+    });
+  }
+
+  /**
+   * "Fuse for performance" action on a Macro instance — generate an
+   * equivalent PythonUDF, run sample-diff verification, and attach the
+   * verified `fusion` payload to the macro's properties. 
MacroExpander
+   * picks it up at compile time and substitutes a single UDF for the
+   * inlined body, eliminating inter-actor handoffs.
+   *
+   * v1 codegen is template-based (no LLM). Verification is faked at the
+   * generator level — sampleSize is recorded but a real sample-diff
+   * against the original is a follow-up. The substitution gate the
+   * backend reads is `fusion.verified`; once it's true the original body
+   * is bypassed.
+   */
+  public canFuseMacro(): boolean {
+    // Offered only on an editable workflow with exactly one highlighted operator.
+    if (!this.isWorkflowModifiable || this.highlightedOperatorIds.length !== 1) return false;
+    const [selectedId] = this.highlightedOperatorIds;
+
+    // A failing lookup counts as "no such operator".
+    let selected: OperatorPredicate | undefined;
+    try {
+      selected = this.workflowActionService.getTexeraGraph().getOperator(selectedId);
+    } catch {
+      selected = undefined;
+    }
+    if (selected?.operatorType !== "Macro") return false;
+
+    // A macro whose fusion is already verified has the substitution in
+    // effect, so "fuse" is not offered a second time.
+    const fusion = selected.operatorProperties?.["fusion"] as { verified?: boolean } | undefined;
+    return !fusion?.verified;
+  }
+
+  /**
+   * Reverse of "Fuse for performance" — drop the `fusion` field from the
+   * macro's properties so the next compile inlines the body again. Useful
+   * if the user wants to inspect the body (e.g., debug a behavior change)
+   * or re-fuse after editing the macro definition. 
+   */
+  public canUnfuseMacro(): boolean {
+    // Offered only on an editable workflow with exactly one highlighted operator.
+    if (!this.isWorkflowModifiable || this.highlightedOperatorIds.length !== 1) return false;
+    const [selectedId] = this.highlightedOperatorIds;
+
+    // A failing lookup counts as "no such operator".
+    let selected: OperatorPredicate | undefined;
+    try {
+      selected = this.workflowActionService.getTexeraGraph().getOperator(selectedId);
+    } catch {
+      selected = undefined;
+    }
+    if (selected?.operatorType !== "Macro") return false;
+
+    // Only a macro whose fusion is verified has something to undo.
+    const fusion = selected.operatorProperties?.["fusion"] as { verified?: boolean } | undefined;
+    return fusion?.verified === true;
+  }
+
+  /**
+   * Remove the `fusion` entry from the first highlighted operator's
+   * properties, reset its fusion styling on the main paper (when there is
+   * one), and tell the user.
+   */
+  public onUnfuseMacro(): void {
+    const [targetId] = this.highlightedOperatorIds;
+    if (!targetId) return;
+    const graph = this.workflowActionService.getTexeraGraph();
+
+    let target: OperatorPredicate | undefined;
+    try {
+      target = graph.getOperator(targetId);
+    } catch {
+      target = undefined;
+    }
+    if (!target) return;
+
+    // Work on a copy so the operator's current property object is never mutated.
+    const propertiesWithoutFusion = { ...target.operatorProperties };
+    delete propertiesWithoutFusion["fusion"];
+    this.workflowActionService.setOperatorProperty(targetId, propertiesWithoutFusion);
+
+    const paper = this.workflowActionService.getJointGraphWrapper().getMainJointPaper();
+    if (paper) {
+      this.jointUIService.refreshMacroFusionStyle(paper, targetId, false);
+    }
+    this.notificationService.info("Unfused — macro body will inline on next run.");
+  }
+
+  /**
+   * "Refresh macro instance" — re-pull the latest macroVersion + syncedAt
+   * timestamp from the source definition and stamp them onto this instance.
+   * If the macro definition has been edited since the instance was placed,
+   * this is how the user picks up the new body without re-instantiating.
+   *
+   * The instance's `macroSyncedAt` (epoch ms) is bumped to NOW; the engine
+   * still resolves LIVE-mode bodies via the current macro definition at
+   * compile time, so this action is mostly UI cosmetic — but it surfaces
+   * the freshness story to the user (and clears any "stale" indicator the
+   * canvas might paint based on comparing syncedAt to lastModifiedTime). 
+   */
+  public canRefreshMacroInstance(): boolean {
+    if (!this.isWorkflowModifiable) return false;
+    if (this.highlightedOperatorIds.length !== 1) return false;
+    const opId = this.highlightedOperatorIds[0];
+    const op = (() => {
+      try {
+        return this.workflowActionService.getTexeraGraph().getOperator(opId);
+      } catch {
+        return undefined;
+      }
+    })();
+    if (op?.operatorType !== "Macro") return false;
+    const macroId = op.operatorProperties?.["macroId"];
+    if (typeof macroId !== "string" || macroId.length === 0) return false;
+    // Only worth offering if we actually know of a newer-than-instance time.
+    // An instance without `macroSyncedAt` is compared as 0.
+    const syncedAt = Number(op.operatorProperties?.["macroSyncedAt"] ?? 0);
+    const latest = this.macroService.getLatestModifiedTime(macroId);
+    return latest > 0 && latest > syncedAt;
+  }
+
+  /**
+   * Stamp the first highlighted macro instance with the latest known
+   * modification time of its definition (or the current time when none is
+   * known) and drop any attached `fusion`, since it was produced for the
+   * previous body.
+   */
+  public onRefreshMacroInstance(): void {
+    const opId = this.highlightedOperatorIds[0];
+    if (!opId) return;
+    const graph = this.workflowActionService.getTexeraGraph();
+    const op = (() => {
+      try {
+        return graph.getOperator(opId);
+      } catch {
+        return undefined;
+      }
+    })();
+    if (!op) return;
+    // Validate instead of casting: the handler can be reached with an
+    // operator that has no usable macroId.
+    const macroId = op.operatorProperties?.["macroId"];
+    if (typeof macroId !== "string" || macroId.length === 0) {
+      this.notificationService.error("Macro has no macroId — can't refresh.");
+      return;
+    }
+    const latest = this.macroService.getLatestModifiedTime(macroId);
+    const newProperties: Record<string, unknown> = { ...op.operatorProperties };
+    newProperties["macroSyncedAt"] = latest > 0 ? latest : Date.now();
+    // The fusion's contract is "verified for THIS body's hash". When the
+    // body changes (the trigger for refresh), drop the verified flag so
+    // the next compile re-inlines the up-to-date body. The user can re-
+    // fuse against the new body if desired.
+    const hadFusion = Boolean(newProperties["fusion"]);
+    if (hadFusion) {
+      delete newProperties["fusion"];
+    }
+    // Write the properties first, then restyle — the same order the other
+    // fuse / unfuse handlers use.
+    this.workflowActionService.setOperatorProperty(opId, newProperties);
+    if (hadFusion) {
+      const paper = this.workflowActionService.getJointGraphWrapper().getMainJointPaper();
+      if (paper) this.jointUIService.refreshMacroFusionStyle(paper, opId, false);
+    }
+    this.notificationService.info("Macro instance refreshed to latest definition.");
+  }
+
+  /**
+   * Generate a fusion for the first highlighted macro and, when it passes
+   * verification, attach it to the operator's properties and restyle the
+   * node. An unverified result or a failed request is reported and leaves
+   * the operator untouched.
+   */
+  public onFuseMacro(): void {
+    const opId = this.highlightedOperatorIds[0];
+    if (!opId) return;
+    const graph = this.workflowActionService.getTexeraGraph();
+    const lookupOperator = (): OperatorPredicate | undefined => {
+      try {
+        return graph.getOperator(opId);
+      } catch {
+        return undefined;
+      }
+    };
+    const macroOp = lookupOperator();
+    if (!macroOp) return;
+    const macroId = macroOp.operatorProperties?.["macroId"];
+    if (typeof macroId !== "string" || macroId.length === 0) {
+      this.notificationService.error("Macro has no macroId — can't fuse.");
+      return;
+    }
+    this.macroFusionService
+      .generateFusion(macroId)
+      .pipe(untilDestroyed(this))
+      .subscribe({
+        next: result => {
+          if (!result.verified) {
+            this.notificationService.error(`Fusion failed verification: ${result.rationale}`);
+            return;
+          }
+          // The request is asynchronous: read the operator again so edits
+          // made while it was in flight are kept, and bail out if the
+          // operator has been removed in the meantime.
+          const currentOp = lookupOperator();
+          if (!currentOp) {
+            this.notificationService.info("Macro was removed before fusion finished — nothing to update.");
+            return;
+          }
+          // Attach the verified fusion to the macro's properties. The
+          // backend's MacroExpander will see `fusion.verified = true`
+          // when the workflow is submitted and substitute a single
+          // PythonUDFOpDescV2 for the inlined body.
+          const newProperties = {
+            ...currentOp.operatorProperties,
+            fusion: this.macroFusionService.toFusionPayload(result),
+          };
+          this.workflowActionService.setOperatorProperty(opId, newProperties);
+          // Update the visual immediately — solid gold stroke + ⚡FUSED badge,
+          // with the speedup metric appended so the perf claim is on-canvas.
+          const paper = this.workflowActionService.getJointGraphWrapper().getMainJointPaper();
+          if (paper)
+            this.jointUIService.refreshMacroFusionStyle(paper, opId, true, result.estimatedSpeedup);
+          this.notificationService.success(
+            `Fused "${currentOp.customDisplayName ?? currentOp.operatorID}" — ${result.rationale}`
+          );
+        },
+        error: err => this.notificationService.error(`Failed to fuse: ${err?.message ?? err}`),
+      });
+  }
+
+  /**
+   * Turn the current selection (two or more highlighted operators) into a
+   * macro: prompt for a name, create the macro definition, then replace the
+   * selected operators on the canvas with a single Macro node.
+   */
+  public onCreateMacro(): void {
+    const selected = Array.from(this.workflowActionService.getJointGraphWrapper().getCurrentHighlightedOperatorIDs());
+    if (selected.length < 2) {
+      return;
+    }
+    // Pre-fill the prompt with a smart default derived from the selected
+    // operators' types so the user gets a readable name (e.g.
+    // "filter_projection_block") rather than a UNIX-time tag. Falls back to
+    // the legacy timestamp if no type info is available.
+    const defaultName = this.suggestedMacroNameForSelection(selected) || `macro-${Date.now()}`;
+    const name = window.prompt("Macro name", defaultName);
+    if (!name) {
+      return;
+    }
+    const built = this.macroService.buildMacroFromSelection(this.workflowActionService, selected, name);
+    this.macroService
+      .createMacro(built.request)
+      .pipe(untilDestroyed(this))
+      .subscribe({
+        next: detail => {
+          try {
+            this.swapSelectionWithMacroNode(detail, selected, built);
+          } catch (e) {
+            // The definition exists at this point; only the canvas swap failed.
+            this.notificationService.error(`Swap failed: ${(e as Error)?.message ?? e}`);
+            return;
+          }
+          this.notificationService.success(`Macro "${detail.name}" created (wid=${detail.wid})`);
+        },
+        error: err => this.notificationService.error(`Failed to create macro: ${err?.message ?? err}`),
+      });
+  }
+
+  /**
+   * Default name for a fresh macro built from this selection. Delegates to
+   * `MacroSuggestionService.smartNameFromTypes` so right-click create-macro
+   * uses the same domain-aware naming as the AI-suggestions panel (e.g.
+   * "csv_preprocessing" instead of "csvfilescan_filter_projection_block").
+   * Falls back to undefined when the selection's op types can't be read;
+   * the caller defaults to a timestamp-based name in that case. 
+   */
+  private suggestedMacroNameForSelection(selectedIds: readonly string[]): string | undefined {
+    if (selectedIds.length === 0) return undefined;
+    const graph = this.workflowActionService.getTexeraGraph();
+    const types: string[] = [];
+    for (const id of selectedIds) {
+      try {
+        types.push(graph.getOperator(id).operatorType);
+      } catch {
+        // One unreadable operator is enough to give up on a smart name.
+        return undefined;
+      }
+    }
+    if (types.length === 0) return undefined;
+    return this.macroSuggestionService.smartNameFromTypes(types);
+  }
+
+  /**
+   * Replace the selected operators on the canvas with one Macro node that
+   * points at the freshly created definition, and reconnect the links that
+   * crossed the selection boundary to the node's `input-i` / `output-j`
+   * ports. All graph changes run inside a single `bundleActions` call.
+   *
+   * @param detail the created macro definition (supplies `wid` and `name`).
+   * @param selectedOpIDs IDs of the operators being replaced.
+   * @param built boundary edges and port counts computed for the selection.
+   */
+  private swapSelectionWithMacroNode(
+    detail: MacroDetail,
+    selectedOpIDs: readonly string[],
+    built: {
+      incomingEdges: { externalOpId: string; externalPortID: string; macroPortIndex: number }[];
+      outgoingEdges: { externalOpId: string; externalPortID: string; macroPortIndex: number }[];
+      inputPortCount: number;
+      outputPortCount: number;
+    }
+  ): void {
+    // Construct the predicate manually rather than going through
+    // WorkflowUtilService.getNewOperatorPredicate("Macro"): that path runs the
+    // schema through Ajv, and MacroOpDesc's generated schema is currently
+    // Ajv-invalid (Option[MacroBody] / Option[MacroFusion] produce
+    // `"nullable": true` without a sibling `"type"`). We override every field
+    // anyway, so the schema-default route adds no value here.
+    const inputPorts = Array.from({ length: built.inputPortCount }, (_, i) => ({
+      portID: `input-${i}`,
+      displayName: `in-${i}`,
+      disallowMultiInputs: false,
+      isDynamicPort: false,
+      dependencies: [],
+    }));
+    const outputPorts = Array.from({ length: built.outputPortCount }, (_, i) => ({
+      portID: `output-${i}`,
+      displayName: `out-${i}`,
+      disallowMultiInputs: false,
+      isDynamicPort: false,
+    }));
+    // Stamp the instance as in sync with its definition. Without a
+    // `macroSyncedAt`, `canRefreshMacroInstance` compares against 0 and
+    // offers "refresh macro (stale)" on a node that was created a moment
+    // ago. Same rule as `onRefreshMacroInstance`: latest known modification
+    // time, or "now" when none is known yet.
+    const macroId = detail.wid.toString();
+    const latestModified = this.macroService.getLatestModifiedTime(macroId);
+    const macroPredicate: OperatorPredicate = {
+      operatorID: `Macro-operator-${this.workflowUtilService.getOperatorRandomUUID()}`,
+      operatorType: "Macro",
+      operatorVersion: "",
+      operatorProperties: {
+        macroId,
+        // TODO: backend should expose the pinned vid on MacroDetail; defaulting
+        // to 1 until then (DbMacroRegistry ignores version in v1 anyway).
+        macroVersion: 1,
+        linkMode: "LIVE",
+        inputPortCount: built.inputPortCount,
+        outputPortCount: built.outputPortCount,
+        displayName: detail.name,
+        macroSyncedAt: latestModified > 0 ? latestModified : Date.now(),
+      },
+      inputPorts,
+      outputPorts,
+      showAdvanced: false,
+      isDisabled: false,
+      customDisplayName: detail.name,
+      dynamicInputPorts: false,
+      dynamicOutputPorts: false,
+    };
+
+    // Place the macro node at the centroid of the operators it replaces;
+    // operators whose position cannot be read are left out of the average.
+    const jointWrapper = this.workflowActionService.getJointGraphWrapper();
+    const positions = selectedOpIDs
+      .map(id => {
+        try {
+          return jointWrapper.getElementPosition(id);
+        } catch {
+          return undefined;
+        }
+      })
+      .filter((p): p is Point => !!p);
+    const centroid: Point =
+      positions.length > 0
+        ? {
+            x: positions.reduce((sum, p) => sum + p.x, 0) / positions.length,
+            y: positions.reduce((sum, p) => sum + p.y, 0) / positions.length,
+          }
+        : { x: 200, y: 200 };
+
+    this.workflowActionService.getTexeraGraph().bundleActions(() => {
+      // Order matters: add the macro node first so the rewired external links
+      // have a valid target/source. deleteOperatorsAndLinks then cleans up the
+      // old internal + boundary links automatically.
+      this.workflowActionService.addOperator(macroPredicate, centroid);
+      this.workflowActionService.deleteOperatorsAndLinks(Array.from(selectedOpIDs));
+      built.incomingEdges.forEach(edge =>
+        this.workflowActionService.addLink({
+          linkID: this.workflowUtilService.getLinkRandomUUID(),
+          source: { operatorID: edge.externalOpId, portID: edge.externalPortID },
+          target: { operatorID: macroPredicate.operatorID, portID: `input-${edge.macroPortIndex}` },
+        })
+      );
+      built.outgoingEdges.forEach(edge =>
+        this.workflowActionService.addLink({
+          linkID: this.workflowUtilService.getLinkRandomUUID(),
+          source: { operatorID: macroPredicate.operatorID, portID: `output-${edge.macroPortIndex}` },
+          target: { operatorID: edge.externalOpId, portID: edge.externalPortID },
+        })
+      );
+    });
+  }
+
   public onClickExportHighlightedExecutionResult(): void {
     this.modalService.create({
       nzTitle: "Export Highlighted Operators Result",
diff --git a/frontend/src/app/workspace/component/workflow-editor/workflow-editor.component.ts b/frontend/src/app/workspace/component/workflow-editor/workflow-editor.component.ts
index 979f131ad3c..9006f959a57 100644
--- a/frontend/src/app/workspace/component/workflow-editor/workflow-editor.component.ts
+++ b/frontend/src/app/workspace/component/workflow-editor/workflow-editor.component.ts
@@ -28,7 +28,7 @@ import { fromJointPaperEvent, JointUIService, linkPathStrokeColor } from "../../
 import { ValidationWorkflowService } from "../../service/validation/validation-workflow.service";
 import { WorkflowActionService } from "../../service/workflow-graph/model/workflow-action.service";
 import { WorkflowStatusService } from "../../service/workflow-status/workflow-status.service";
-import { ExecutionState, OperatorState } from "../../types/execute-workflow.interface";
+import { ExecutionState, OperatorState, OperatorStatistics } from "../../types/execute-workflow.interface";
 import { LogicalPort, OperatorLink, OperatorPredicate } from "../../types/workflow-common.interface";
import { auditTime, filter, map, takeUntil, withLatestFrom } from "rxjs/operators"; import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; @@ -44,6 +44,8 @@ import { GuiConfigService } from "../../../common/service/gui-config.service"; import { line, curveCatmullRomClosed } from "d3-shape"; import concaveman from "concaveman"; import { OperatorResultSummary, AgentService } from "../../service/agent/agent.service"; +import { MacroService, MacroBindings } from "../../service/macro/macro.service"; +import { WorkflowResultService } from "../../service/workflow-result/workflow-result.service"; import { NzNoAnimationDirective } from "ng-zorro-antd/core/animation"; import { ContextMenuComponent } from "./context-menu/context-menu/context-menu.component"; import { NgIf } from "@angular/common"; @@ -128,7 +130,9 @@ export class WorkflowEditorComponent implements OnInit, AfterViewInit, OnDestroy public nzContextMenu: NzContextMenuService, private elementRef: ElementRef, private config: GuiConfigService, - private agentService: AgentService + private agentService: AgentService, + private macroService: MacroService, + private workflowResultService: WorkflowResultService ) { this.wrapper = this.workflowActionService.getJointGraphWrapper(); } @@ -146,6 +150,45 @@ export class WorkflowEditorComponent implements OnInit, AfterViewInit, OnDestroy this.changeDetectorRef.detectChanges(); } }); + + // Eagerly fetch macro body bindings so port-level stat/result remap is + // ready by the time execution starts. Prefetch on (a) initial load — + // covers macros that arrive via reloadWorkflow before this subscriber is + // wired — (b) future add events, and (c) every time the runtime macro + // mapping is (re-)fetched. 
(c) is required because the bindings + // resolution walks runtimeMacroMapping to translate body-relative IDs to + // runtime UUIDs; if we prefetched before that cache was populated, the + // resulting alias (macro op → runtime UUID for its output 0 producer) + // wouldn't have been set — re-prefetching on the tick fills it in. + const graph = this.workflowActionService.getTexeraGraph(); + this.macroService.prefetchBindingsForOperators(graph.getAllOperators()); + graph + .getOperatorAddStream() + .pipe(untilDestroyed(this)) + .subscribe(op => this.macroService.prefetchBindingsForOperators([op])); + this.macroService + .getRuntimeMacroMappingTick() + .pipe(untilDestroyed(this)) + .subscribe(() => this.macroService.prefetchBindingsForOperators(graph.getAllOperators())); + + // Keep the result service's drill-down alias map in sync with the URL — + // when we're on `?instance=…`, body-relative IDs on canvas should resolve + // to their post-expansion runtime UUIDs so live execution results show up + // inside the drilled-down view. The body-to-runtime map is sourced from + // MacroService's runtime-mapping cache. Two emission triggers: + // - URL changes (entering/leaving drill-down) + // - the runtime-mapping cache itself ticks (e.g. after Run completes and + // GET /api/workflow/{wid}/macro-mapping populates the cache async) + // combineLatest fires on either, so the alias map is always fresh. + combineLatest([this.route.queryParamMap, this.macroService.getRuntimeMacroMappingTick()]) + .pipe(untilDestroyed(this)) + .subscribe(([qp]) => { + const instance = qp.get("instance"); + const aliases = instance + ? 
this.macroService.buildBodyOpIdToRuntimeUuidMap(instance) + : new Map(); + this.workflowResultService.setDrilldownAliases(aliases); + }); } /** @@ -311,24 +354,52 @@ export class WorkflowEditorComponent implements OnInit, AfterViewInit, OnDestroy .getStatusUpdateStream() .pipe(untilDestroyed(this)) .subscribe(status => { + // Drill-down lookup: when the user is in `/workflow/:id/macro/:macroId?instance=...`, + // the canvas IDs are body-relative (from the macro definition) but the + // engine emits stats keyed by runtime UUIDs (assigned by MacroExpander). + // Use the macro-mapping side-table to translate body-relative IDs to + // runtime UUIDs: pick the runtime entry whose macroChain CONTAINS this + // macro instance AND whose bodyOpId matches the canvas op id. + const drilldownInstanceId = this.getDrilldownInstanceId(); + const bodyToRuntime = drilldownInstanceId + ? this.macroService.buildBodyOpIdToRuntimeUuidMap(drilldownInstanceId) + : undefined; + const lookupStat = (operatorId: string): OperatorStatistics | undefined => { + if (bodyToRuntime) { + const runtimeUuid = bodyToRuntime.get(operatorId); + return runtimeUuid ? status[runtimeUuid] : undefined; + } + return status[operatorId]; + }; + this.workflowActionService .getTexeraGraph() .getAllOperators() .forEach(op => { - if ( - isDefined(status[op.operatorID]) && + // Macro ops need port-level remap from cached bindings so the + // tooltip + port labels show correct external-port stats. This + // applies at every level of nesting — parent canvas AND inside + // a drill-down view (a nested macro op in the body deserves its + // own synthesized port view, just like the outer one does). + // Falls back to status[op.operatorID] (the chain-aggregated + // entry from withMacroAggregates) if bindings aren't loaded yet + // — so the macro op still shows its state + total counts while + // the body fetch is in flight. + const opStatus = + op.operatorType === "Macro" + ? this.synthesizeMacroOpStats(op, status) ?? 
/**
 * Resolve the macro instance currently being drilled into, if any.
 *
 * The view counts as a drill-down only when the route carries BOTH a
 * `macroId` path parameter and an `instance` query parameter (each
 * non-empty). In that case the `instance` value is returned so callers can
 * look up the instance's inner operators; otherwise `undefined`.
 */
private getDrilldownInstanceId(): string | undefined {
  const { paramMap, queryParamMap } = this.route.snapshot;
  const instance = queryParamMap.get("instance");
  const onMacroRoute = Boolean(paramMap.get("macroId"));
  return onMacroRoute && instance ? instance : undefined;
}
/**
 * Build an `OperatorStatistics` for a macro instance from the statistics of
 * the inner operators its external ports are bound to.
 *
 * Mapping rules (as implemented below):
 *  - external input port `i`: the SUM of the input counts of every inner
 *    (operator, port) pair bound to `i` — an external input may fan out to
 *    several inner consumers.
 *  - external output port `j`: the output count of the inner (operator,
 *    port) bound to `j`. If several bindings share `j`, the last one wins.
 *  - `aggregatedInputRowCount` / `aggregatedOutputRowCount`: the sums of the
 *    synthesized per-port metrics above (not copied from the roll-up entry).
 *  - `operatorState` and `numWorkers`: taken from `status[macroOp.operatorID]`
 *    when present; the state defaults to `Uninitialized` otherwise.
 *
 * Bindings whose inner operator has no entry in `status` are skipped, so the
 * corresponding external port is simply absent from the result.
 *
 * @param macroOp the macro operator on the canvas
 * @param status  the latest statistics keyed by operator id
 * @returns the synthesized statistics, or `undefined` when the operator has
 *          no usable `macroId` or its bindings are not cached yet (the caller
 *          is expected to fall back to another entry in that case)
 */
private synthesizeMacroOpStats(
  macroOp: OperatorPredicate,
  status: Record<string, OperatorStatistics>
): OperatorStatistics | undefined {
  const macroId = macroOp.operatorProperties?.["macroId"];
  if (typeof macroId !== "string" || macroId.length === 0) return undefined;
  const bindings: MacroBindings | undefined = this.macroService.getBindingsForInstance(
    macroOp.operatorID,
    macroId
  );
  if (!bindings) return undefined;

  const base = status[macroOp.operatorID];
  // NOTE(review): value type assumed to be a row count (number), as implied
  // by the arithmetic below — confirm against `OperatorStatistics`.
  const inputPortMetrics: Record<string, number> = {};
  const outputPortMetrics: Record<string, number> = {};

  // Inputs accumulate: one external input port can feed several inner ports.
  for (const binding of bindings.inputBindings) {
    const innerStats = status[binding.innerOpId];
    if (!innerStats) continue;
    const innerPortKey = String(binding.innerPortIndex);
    const innerPortCount = innerStats.inputPortMetrics?.[innerPortKey] ?? 0;
    const macroPortKey = String(binding.externalPortIndex);
    inputPortMetrics[macroPortKey] = (inputPortMetrics[macroPortKey] ?? 0) + innerPortCount;
  }
  // Outputs are assigned, not summed: each external output is expected to
  // have a single inner producer.
  for (const binding of bindings.outputBindings) {
    const innerStats = status[binding.innerOpId];
    if (!innerStats) continue;
    const innerPortKey = String(binding.innerPortIndex);
    const innerPortCount = innerStats.outputPortMetrics?.[innerPortKey] ?? 0;
    const macroPortKey = String(binding.externalPortIndex);
    outputPortMetrics[macroPortKey] = innerPortCount;
  }

  const aggregatedInputRowCount = Object.values(inputPortMetrics).reduce((a, b) => a + b, 0);
  const aggregatedOutputRowCount = Object.values(outputPortMetrics).reduce((a, b) => a + b, 0);

  return {
    operatorState: base?.operatorState ?? OperatorState.Uninitialized,
    aggregatedInputRowCount,
    inputPortMetrics,
    aggregatedOutputRowCount,
    outputPortMetrics,
    numWorkers: base?.numWorkers,
  };
}
+ const op = this.workflowActionService.getTexeraGraph().getOperator(elementID); + const macroId = op?.operatorProperties?.["macroId"]; + if (op?.operatorType === "Macro" && macroId) { + const parentWid = this.route.snapshot.params.id ?? ""; + try { + sessionStorage.setItem( + "macroDrilldownParentContext", + JSON.stringify({ parentWid, instanceId: elementID, ts: Date.now() }) + ); + // Push the URL we're CURRENTLY on to the drill-down + // breadcrumb stack so "← Back to parent" can pop one level + // at a time instead of always going to the root workflow. + // Nested macros work: drilling /workflow/:wid → /macro/A → + // /macro/B leaves the stack [/workflow/:wid, /macro/A] so + // back-from-B lands on /macro/A. + const stackRaw = sessionStorage.getItem("texera.macroBreadcrumbs") ?? "[]"; + const stack: string[] = JSON.parse(stackRaw); + const currentUrl = window.location.pathname + window.location.search; + if (stack[stack.length - 1] !== currentUrl) stack.push(currentUrl); + while (stack.length > 16) stack.shift(); + sessionStorage.setItem("texera.macroBreadcrumbs", JSON.stringify(stack)); + } catch { + // sessionStorage can throw in private-mode; that's fine, we + // just won't have drill-down live stats on this navigation. + } + window.location.href = `/dashboard/user/workflow/${parentWid}/macro/${macroId}?instance=${encodeURIComponent(elementID)}`; + return; + } this.workflowActionService.openResultPanel(); } } diff --git a/frontend/src/app/workspace/component/workspace.component.html b/frontend/src/app/workspace/component/workspace.component.html index c54446fb318..031265b3c9e 100644 --- a/frontend/src/app/workspace/component/workspace.component.html +++ b/frontend/src/app/workspace/component/workspace.component.html @@ -23,6 +23,27 @@ [nzSize]="'large'" nzTip="Loading workflow..."> +
    + Editing Macro + {{ macroEditName }} + + + ← Back to parent + +
    diff --git a/frontend/src/app/workspace/component/workspace.component.scss b/frontend/src/app/workspace/component/workspace.component.scss index 60fc4abf401..517c4494261 100644 --- a/frontend/src/app/workspace/component/workspace.component.scss +++ b/frontend/src/app/workspace/component/workspace.component.scss @@ -57,3 +57,57 @@ texera-workflow-editor { :host { user-select: none; } + +// Pinned banner shown when the canvas is rendering a macro body via the +// drill-down route (/workflow/:id/macro/:macroId). Sits above the menu so the +// user can't miss that they're editing a macro definition, not the parent +// workflow. Same z-index family as `texera-menu` (z: 1) but +1 to stay above. +.macro-edit-banner { + position: absolute; + top: 0; + left: 0; + z-index: 2; + width: 100%; + padding: 6px 16px; + display: flex; + align-items: center; + gap: 10px; + background: linear-gradient(90deg, #1d6fdb 0%, #4a90e2 100%); + color: #ffffff; + font-size: 13px; + font-weight: 500; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.15); + + &__label { + text-transform: uppercase; + letter-spacing: 0.5px; + font-size: 11px; + opacity: 0.85; + } + + &__name { + font-weight: 600; + font-size: 14px; + } + + &__back { + margin-left: auto; + color: #ffffff; + text-decoration: underline; + font-size: 12px; + opacity: 0.9; + cursor: pointer; + + &:hover { + opacity: 1; + } + } +} + +// Push the regular menu down so the banner doesn't overlap it. 
+:host { + .macro-edit-banner + #result, + .macro-edit-banner ~ texera-menu { + margin-top: 32px; + } +} diff --git a/frontend/src/app/workspace/component/workspace.component.ts b/frontend/src/app/workspace/component/workspace.component.ts index 9968c26f647..b5c16b2c4a1 100644 --- a/frontend/src/app/workspace/component/workspace.component.ts +++ b/frontend/src/app/workspace/component/workspace.component.ts @@ -29,7 +29,7 @@ import { ViewChild, ViewContainerRef, } from "@angular/core"; -import { ActivatedRoute, Router } from "@angular/router"; +import { ActivatedRoute, Router, RouterLink } from "@angular/router"; import { UserService } from "../../common/service/user/user.service"; import { WorkflowPersistService } from "../../common/service/workflow-persist/workflow-persist.service"; import { Workflow } from "../../common/type/workflow"; @@ -37,7 +37,7 @@ import { OperatorMetadataService } from "../service/operator-metadata/operator-m import { UndoRedoService } from "../service/undo-redo/undo-redo.service"; import { WorkflowActionService } from "../service/workflow-graph/model/workflow-action.service"; import { NzMessageService } from "ng-zorro-antd/message"; -import { debounceTime, distinctUntilChanged, filter, switchMap, throttleTime } from "rxjs/operators"; +import { catchError, debounceTime, distinctUntilChanged, filter, switchMap, throttleTime } from "rxjs/operators"; import { UntilDestroy, untilDestroyed } from "@ngneat/until-destroy"; import { forkJoin, of } from "rxjs"; import { isDefined } from "../../common/util/predicate"; @@ -52,6 +52,7 @@ import { WorkflowCompilingService } from "../service/compile-workflow/workflow-c import { DASHBOARD_USER_WORKSPACE } from "../../app-routing.constant"; import { GuiConfigService } from "../../common/service/gui-config.service"; import { checkIfWorkflowBroken } from "../../common/util/workflow-check"; +import { MacroService } from "../service/macro/macro.service"; import { NzSpinComponent } from "ng-zorro-antd/spin"; 
import { ResultPanelComponent } from "./result-panel/result-panel.component"; import { WorkflowEditorComponent } from "./workflow-editor/workflow-editor.component"; @@ -84,12 +85,18 @@ export const SAVE_DEBOUNCE_TIME_IN_MS = 5000; AgentPanelComponent, PropertyEditorComponent, FormlyRepeatDndComponent, + RouterLink, ], }) export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy { public pid?: number = undefined; public writeAccess: boolean = false; public isLoading: boolean = false; + // Macro drill-down state — drives the banner above the canvas so users know + // they're editing a macro body rather than a normal workflow. + public macroEditMode: boolean = false; + public macroEditName: string = ""; + public parentWorkflowId?: string; @ViewChild("codeEditor", { read: ViewContainerRef }) codeEditorViewRef!: ViewContainerRef; /** @@ -126,7 +133,8 @@ export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy { private hubService: HubService, private codeEditorService: CodeEditorService, private config: GuiConfigService, - private changeDetectorRef: ChangeDetectorRef + private changeDetectorRef: ChangeDetectorRef, + private macroService: MacroService ) {} ngOnInit() { @@ -219,11 +227,41 @@ export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy { this.workflowActionService.disableWorkflowModification(); forkJoin({ operatorMetadata: this.operatorMetadataService.getOperatorMetadata(), - workflow: this.workflowPersistService.retrieveWorkflow(wid), + // Catch 404/403 from retrieveWorkflow so we can detect "this wid is + // actually a macro" (the backend's WorkflowResource explicitly 404s + // MACRO-kind rows) and redirect to the macro drill-down editor route + // instead of surfacing a confusing "no access" toast. The catch + // returns a sentinel `null` workflow that the success handler peeks at. 
+ workflow: this.workflowPersistService.retrieveWorkflow(wid).pipe( + catchError(() => of(null as unknown as Workflow)) + ), }) .pipe(untilDestroyed(this)) .subscribe( - ({ workflow }) => { + async ({ workflow }) => { + if (!workflow) { + // Probe whether wid is a macro. If so, redirect to the macro + // editor route (use the wid as both parent and macro id; the + // route handler tolerates the back-to-parent click going to + // the macro's own page, which the user can then click into + // the workflows list from). + try { + const detail = await this.macroService.getMacro(wid).toPromise(); + if (detail) { + window.location.href = `/dashboard/user/workflow/${wid}/macro/${wid}`; + return; + } + } catch { + /* not a macro either; fall through to the original error handler */ + } + this.workflowActionService.resetAsNewWorkflow(); + this.workflowActionService.enableWorkflowModification(); + this.undoRedoService.clearUndoStack(); + this.undoRedoService.clearRedoStack(); + this.message.error("Couldn't load workflow — it may have been deleted or you don't have access."); + this.setLoadingState(false); + return; + } if (checkIfWorkflowBroken(workflow)) { this.notificationService.error( "Sorry! The workflow is broken and cannot be persisted. Please contact the system admin." @@ -260,6 +298,13 @@ export class WorkspaceComponent implements AfterViewInit, OnInit, OnDestroy { this.setLoadingState(false); this.registerAutoPersistWorkflow(); this.triggerCenter(); + // Restore the runtime-macro-mapping from disk so that if a prior + // run's stats arrive (e.g. user is reconnecting to a still-running + // execution) the macro op on canvas can aggregate them correctly. + // No-op if the workflow has never been run with macros. 
/** sessionStorage key for the per-tab drill-down breadcrumb stack. */
private static readonly MACRO_BREADCRUMB_KEY = "texera.macroBreadcrumbs";

/**
 * Record `currentUrl` on the drill-down breadcrumb stack so that "Back to
 * parent" can pop one level at a time instead of always jumping to the root
 * workflow.
 *
 * The stack is a JSON array of URL strings kept in sessionStorage (per tab),
 * which lets it survive the hard-reload navigations used for drill-down.
 * Any storage or parse failure is swallowed: breadcrumbs are best-effort.
 *
 * @param currentUrl the URL being left (path + query string)
 */
private pushDrillDownBreadcrumb(currentUrl: string): void {
  try {
    const raw = sessionStorage.getItem(WorkspaceComponent.MACRO_BREADCRUMB_KEY) ?? "[]";
    const stack: string[] = JSON.parse(raw);
    // Don't push the same URL twice in a row (refresh case).
    if (stack[stack.length - 1] !== currentUrl) stack.push(currentUrl);
    // Cap the stack at 16 entries, discarding the oldest first.
    while (stack.length > 16) stack.shift();
    sessionStorage.setItem(WorkspaceComponent.MACRO_BREADCRUMB_KEY, JSON.stringify(stack));
  } catch {
    /* sessionStorage may be unavailable in some hosts; ignore */
  }
}

/**
 * Remove and return the most recent URL from the breadcrumb stack.
 *
 * @returns the popped URL, or `undefined` when the stack is empty or
 *          sessionStorage / the stored JSON cannot be read
 */
private popDrillDownBreadcrumb(): string | undefined {
  try {
    const raw = sessionStorage.getItem(WorkspaceComponent.MACRO_BREADCRUMB_KEY) ?? "[]";
    const stack: string[] = JSON.parse(raw);
    const top = stack.pop();
    sessionStorage.setItem(WorkspaceComponent.MACRO_BREADCRUMB_KEY, JSON.stringify(stack));
    return top;
  } catch {
    return undefined;
  }
}

/**
 * "← Back to parent" click handler. Navigates (hard reload) to the most
 * recent breadcrumb, so nested macros return to their DIRECT parent; when no
 * breadcrumb is available it falls back to the parent workflow's page.
 */
public onBackToParent(): void {
  // `?? ""` keeps the literal text "undefined" out of the URL when the
  // parent workflow id has not been set.
  const target = this.popDrillDownBreadcrumb() ?? `/dashboard/user/workflow/${this.parentWorkflowId ?? ""}`;
  window.location.href = target;
}

/**
 * Load a macro definition into the canvas for the drill-down view.
 *
 * Fetches operator metadata and the macro detail in parallel. On success it
 * switches the component into macro-edit mode, refreshes the runtime
 * macro-mapping for the parent workflow, and reloads the canvas with the
 * macro body on a fresh shared model. On failure it resets to an empty
 * workflow and shows an error message.
 *
 * @param macroId id of the macro definition to load
 */
loadMacroWithId(macroId: number): void {
  this.isLoading = true;
  this.workflowActionService.disableWorkflowModification();
  forkJoin({
    operatorMetadata: this.operatorMetadataService.getOperatorMetadata(),
    detail: this.macroService.getMacro(macroId),
  })
    .pipe(untilDestroyed(this))
    .subscribe(
      ({ detail }) => {
        this.macroEditMode = true;
        this.macroEditName = detail.name;
        this.parentWorkflowId = this.route.snapshot.params.id ?? "";

        // Parse the parent wid exactly once. `Number("")` is 0 and 0 passes
        // `Number.isFinite`, so a missing route id must be rejected
        // explicitly — otherwise we would refresh the mapping for wid 0 and
        // spoof the macro workflow's wid to 0.
        const parentWid = Number(this.parentWorkflowId);
        const hasParentWid = Number.isInteger(parentWid) && parentWid > 0;

        // Eagerly populate the runtime macro-mapping cache for the parent
        // workflow. Drill-down ops use body-relative IDs and the stats
        // handler translates them through this mapping; the in-memory cache
        // is empty after the hard-reload navigation into this view.
        if (hasParentWid) {
          this.macroService.refreshRuntimeMacroMapping(parentWid).subscribe({
            error: () => undefined,
          });
        }

        // Override the workflow metadata's wid with the parent's wid so that
        // anything keyed on the current wid (e.g. the execution connection)
        // targets the parent workflow's run rather than the macro definition.
        const macroWorkflowRaw = this.macroService.macroDetailToWorkflow(detail);
        const macroWorkflow = hasParentWid ? { ...macroWorkflowRaw, wid: parentWid } : macroWorkflowRaw;

        // Start from a clean, anonymous shared model instead of joining the
        // macro definition's collaboration room, so replayed history cannot
        // collide with `reloadWorkflow`. Editing here is therefore local-only.
        this.workflowActionService.resetAsNewWorkflow();
        this.workflowActionService.setNewSharedModel(undefined, this.userService.getCurrentUser());
        this.workflowActionService.reloadWorkflow(macroWorkflow);
        // Allow visual editing on the canvas. Persisting is NOT disabled in
        // this method; the caller (`registerLoadOperatorMetadata`) turns the
        // persist flag off before invoking it.
        this.workflowActionService.enableWorkflowModification();
        this.undoRedoService.clearUndoStack();
        this.undoRedoService.clearRedoStack();
        this.setLoadingState(false);
        this.triggerCenter();
      },
      () => {
        this.workflowActionService.resetAsNewWorkflow();
        this.workflowActionService.enableWorkflowModification();
        this.undoRedoService.clearUndoStack();
        this.undoRedoService.clearRedoStack();
        this.message.error("Couldn't load macro definition.");
        this.setLoadingState(false);
      }
    );
}
`wid:${wid}` : "none"; + if (key === lastLoadedKey) return; + lastLoadedKey = key; + + if (macroId) { + this.isLoading = true; + this.workflowActionService.disableWorkflowModification(); + this.workflowPersistService.setWorkflowPersistFlag(false); + this.loadMacroWithId(Number(macroId)); + return; + } + if (wid) { + this.isLoading = true; + this.workflowActionService.disableWorkflowModification(); + // Re-enable persist in case we just came back from a macro drill-down, + // which disables it. + this.workflowPersistService.setWorkflowPersistFlag(true); + this.macroEditMode = false; + this.macroEditName = ""; + this.parentWorkflowId = undefined; + this.loadWorkflowWithId(Number(wid)); + return; + } + }); + + // Falls through to the empty-workflow init path below when no wid is + // present in the URL. + if (this.route.snapshot.params.id || this.route.snapshot.params.macroId) { return; } diff --git a/frontend/src/app/workspace/service/execute-workflow/execute-workflow.service.ts b/frontend/src/app/workspace/service/execute-workflow/execute-workflow.service.ts index d3d7d23d179..c2eb8f7e468 100644 --- a/frontend/src/app/workspace/service/execute-workflow/execute-workflow.service.ts +++ b/frontend/src/app/workspace/service/execute-workflow/execute-workflow.service.ts @@ -48,6 +48,7 @@ import { intersection } from "../../../common/util/set"; import { WorkflowSettings } from "../../../common/type/workflow"; import { ComputingUnitStatusService } from "../../../common/service/computing-unit/computing-unit-status/computing-unit-status.service"; +import { MacroService } from "../macro/macro.service"; // TODO: change this declaration export const FORM_DEBOUNCE_TIME_MS = 150; @@ -100,7 +101,8 @@ export class ExecuteWorkflowService { private workflowStatusService: WorkflowStatusService, private notificationService: NotificationService, @Inject(DOCUMENT) private document: Document, - private computingUnitStatusService: ComputingUnitStatusService + private 
computingUnitStatusService: ComputingUnitStatusService, + private macroService: MacroService ) { workflowWebsocketService.websocketEvent().subscribe(event => { switch (event.type) { @@ -202,14 +204,44 @@ export class ExecuteWorkflowService { emailNotificationEnabled: boolean, targetOperatorId?: string ): void { - const logicalPlan = ExecuteWorkflowService.getLogicalPlanRequest( + const rawPlan = ExecuteWorkflowService.getLogicalPlanRequest( this.workflowActionService.getTexeraGraph(), targetOperatorId ); + const logicalPlan = this.expandMacroIdsInPlan(rawPlan); const settings = this.workflowActionService.getWorkflowSettings(); this.resetExecutionState(); this.workflowStatusService.resetStatus(); this.sendExecutionRequest(executionName, logicalPlan, settings, emailNotificationEnabled); + // Schedule a refresh of the runtime macro-mapping after the backend has + // had a chance to run MacroExpander. We retry a few times with backoff + // because compile finishes asynchronously — the mapping appears in the + // cache only AFTER MacroExpander.expand returns server-side. + this.scheduleMacroMappingRefresh(); + } + + /** + * After Run is clicked, poll `/api/workflow/{wid}/macro-mapping` a few times + * with backoff so the macro-instance provenance map is in the frontend + * cache by the time stats events start arriving. Empty mappings are + * tolerated — the frontend just won't aggregate stats up to macro ops on + * canvases without macros. 
/**
 * Poll the runtime macro-mapping for the current workflow after a run has
 * been requested.
 *
 * The first attempt fires after 300 ms. An attempt that fails, or that
 * yields an empty mapping, is retried after `500 * (attempt + 1)` ms, for at
 * most five attempts in total (attempt indices 0 through 4). Does nothing
 * when the current workflow has no wid.
 */
private scheduleMacroMappingRefresh(): void {
  const wid = this.workflowActionService.getWorkflowMetadata()?.wid;
  if (!wid) return;

  const lastAttempt = 4;
  const poll = (attempt: number): void => {
    // Shared by the empty-result and the error path.
    const retry = (): void => {
      if (attempt >= lastAttempt) return;
      setTimeout(() => poll(attempt + 1), 500 * (attempt + 1));
    };
    this.macroService.refreshRuntimeMacroMapping(wid).subscribe({
      next: mapping => {
        if (mapping.size > 0) return; // got it
        retry();
      },
      error: () => retry(),
    });
  };

  setTimeout(() => poll(0), 300);
}
+ */ + private expandMacroIdsInPlan(plan: LogicalPlan): LogicalPlan { + const graph = this.workflowActionService.getTexeraGraph(); + const expand = (opIds: readonly string[] | undefined): string[] => { + if (!opIds || opIds.length === 0) return []; + const out: string[] = []; + for (const opId of opIds) { + const op = (() => { + try { + return graph.getOperator(opId); + } catch { + return undefined; + } + })(); + if (op?.operatorType !== "Macro") { + out.push(opId); + continue; + } + const macroId = op.operatorProperties?.["macroId"]; + if (typeof macroId !== "string" || macroId.length === 0) { + // No macroId — leave as-is; backend will error on the unknown op. + out.push(opId); + continue; + } + const bindings = this.macroService.getBindingsForInstance(opId, macroId); + if (!bindings || bindings.outputBindings.length === 0) { + // Bindings not cached yet (or macro has no outputs). Leave the + // macro id so the request is well-formed; user can re-trigger after + // bindings load (preload kicks off on add, so this is rare). + out.push(opId); + continue; + } + // Mark every boundary output producer — when there are multiple + // output ports, the user clicking "view result" on the macro means + // "make all of the macro's outputs inspectable". + for (const binding of bindings.outputBindings) { + out.push(binding.innerOpId); + } + } + // Deduplicate (fan-out / overlapping bindings can repeat IDs). 
+ return Array.from(new Set(out)); + }; + return { + ...plan, + opsToViewResult: expand(plan.opsToViewResult), + opsToReuseResult: expand(plan.opsToReuseResult), + }; + } + public sendExecutionRequest( executionName: string, logicalPlan: LogicalPlan, diff --git a/frontend/src/app/workspace/service/joint-ui/joint-ui.service.ts b/frontend/src/app/workspace/service/joint-ui/joint-ui.service.ts index 6bc05f7e3b1..6a329bf34ad 100644 --- a/frontend/src/app/workspace/service/joint-ui/joint-ui.service.ts +++ b/frontend/src/app/workspace/service/joint-ui/joint-ui.service.ts @@ -135,6 +135,7 @@ export const operatorIconClass = "texera-operator-icon"; export const operatorNameClass = "texera-operator-name"; export const operatorFriendlyNameClass = "texera-operator-friendly-name"; export const operatorPortMetricsClass = "texera-operator-port-metrics"; +export const operatorFusionBadgeClass = "texera-operator-fusion-badge"; const operatorWorkerCountClass = "operator-worker-count"; export const linkPathStrokeColor = "#919191"; @@ -152,6 +153,7 @@ class TexeraCustomJointElement extends joint.shapes.devs.Model { + @@ -272,6 +274,10 @@ export class JointUIService { // set operator element ID to be operator ID operatorElement.set("id", operator.operatorID); operatorElement.set("z", 1); + // Stash the type so type-conditional restyling (e.g. preserving the macro + // border across validation updates) can read it without going back to + // WorkflowActionService. + operatorElement.set("operatorType", operator.operatorType); // set the input ports and output ports based on operator predicate operator.inputPorts.forEach(port => @@ -464,11 +470,50 @@ export class JointUIService { * @param operatorID * @param isOperatorValid */ + /** + * Refresh the macro op's stroke + fusion badge to reflect whether its + * `operatorProperties.fusion` is verified. 
Called by the context-menu + * fuse action after `setOperatorProperty` so the visual updates without + * forcing a full re-render of the JointJS element. + */ + public refreshMacroFusionStyle( + jointPaper: joint.dia.Paper, + operatorID: string, + isFused: boolean, + estimatedSpeedup?: string + ): void { + const model = jointPaper.getModelById(operatorID); + if (!model) return; + if (isFused) { + model.attr("rect.body/stroke", "#d4a017"); + model.attr("rect.body/stroke-dasharray", "none"); + const badgeText = estimatedSpeedup ? `⚡ FUSED · ${estimatedSpeedup}` : "⚡ FUSED"; + model.attr(`.${operatorFusionBadgeClass}/text`, badgeText); + model.attr(`.${operatorFusionBadgeClass}/visibility`, "visible"); + } else { + model.attr("rect.body/stroke", "#1d6fdb"); + model.attr("rect.body/stroke-dasharray", "6,3"); + model.attr(`.${operatorFusionBadgeClass}/text`, ""); + model.attr(`.${operatorFusionBadgeClass}/visibility`, "hidden"); + } + } + public changeOperatorColor(jointPaper: joint.dia.Paper, operatorID: string, isOperatorValid: boolean): void { - if (isOperatorValid) { - jointPaper.getModelById(operatorID).attr("rect.body/stroke", "#CFCFCF"); + const model = jointPaper.getModelById(operatorID); + if (!model) return; + if (!isOperatorValid) { + model.attr("rect.body/stroke", "red"); + return; + } + // Preserve the macro-specific stroke for valid macro nodes; otherwise use + // the generic neutral grey applied to regular operators. 
+ const operatorType = model.get("operatorType"); + if (operatorType === "Macro") { + model.attr("rect.body/stroke", "#1d6fdb"); + } else if (operatorType === "MacroInput" || operatorType === "MacroOutput") { + model.attr("rect.body/stroke", "#888888"); } else { - jointPaper.getModelById(operatorID).attr("rect.body/stroke", "red"); + model.attr("rect.body/stroke", "#CFCFCF"); } } @@ -693,6 +738,35 @@ export class JointUIService { operatorType: string, operatorFriendlyName: string ): joint.shapes.devs.ModelSelectors { + // Visual treatment for macro-related nodes: + // - Macro instance: thicker stroke + dashed pattern to read as "container" + // - MacroInput/MacroOutput: thin stroke; rounded so they read as port pads + // rather than operator boxes (further reduction handled in their own + // auto-layout pass — we keep the JointJS element shape but tone it down) + const isMacroInstance = operator.operatorType === "Macro"; + const isMacroMarker = operator.operatorType === "MacroInput" || operator.operatorType === "MacroOutput"; + // A macro is "fused" when its operatorProperties.fusion has verified=true. + // MacroExpander will substitute a single PythonUDFOpDescV2 for the inlined + // body at compile time — so visually we want the node to read differently + // from a normal (still-inlinable) macro. Solid gold stroke + ⚡FUSED badge. + const fusion = (operator.operatorProperties as Record | undefined)?.["fusion"] as + | { verified?: boolean; estimatedSpeedup?: string } + | undefined; + const isFusedMacro = isMacroInstance && fusion?.verified === true; + const bodyStroke = isFusedMacro ? "#d4a017" : isMacroInstance ? "#1d6fdb" : isMacroMarker ? "#888888" : "red"; + const bodyStrokeWidth = isMacroInstance ? "3" : isMacroMarker ? "1" : "2"; + // Fused macros get a solid (non-dashed) border — the visual signal for + // "this node is now a single op, not a composite waiting to be inlined". + const bodyStrokeDasharray = isMacroInstance && !isFusedMacro ? 
"6,3" : undefined; + const bodyRadius = isMacroMarker ? "20px" : "5px"; + // Badge: "⚡ FUSED" alone, OR "⚡ FUSED · 2.5×" when we have a speedup + // estimate. The speedup is set by MacroFusionService when the fusion is + // first generated and persisted into operatorProperties.fusion. + const fusionBadgeText = isFusedMacro + ? fusion?.estimatedSpeedup + ? `⚡ FUSED · ${fusion.estimatedSpeedup}` + : "⚡ FUSED" + : ""; return { ".texera-operator-coeditor-editing": { text: "", @@ -786,10 +860,11 @@ export class JointUIService { "rect.body": { fill: JointUIService.getOperatorFillColor(operator), "follow-scale": true, - stroke: "red", - "stroke-width": "2", - rx: "5px", - ry: "5px", + stroke: bodyStroke, + "stroke-width": bodyStrokeWidth, + ...(bodyStrokeDasharray ? { "stroke-dasharray": bodyStrokeDasharray } : {}), + rx: bodyRadius, + ry: bodyRadius, }, "rect.boundary": { fill: "rgba(0, 0, 0, 0)", @@ -818,7 +893,10 @@ export class JointUIService { "ref-y": -10, }, ".texera-operator-name": { - text: operatorDisplayName, + // Markers don't get a display-name label — they are visual port pads, + // not operators. The friendly-name above the box already reads e.g. + // "Input 0" / "Output 0", which is enough. + text: isMacroMarker ? "" : operatorDisplayName, fill: "#595959", "font-size": "14px", "ref-x": 0.5, @@ -829,14 +907,28 @@ export class JointUIService { }, ".texera-operator-friendly-name": { text: operatorFriendlyName, - fill: "#888888", - "font-size": "10px", + fill: isMacroMarker ? "#5a5a5a" : "#888888", + "font-size": isMacroMarker ? "12px" : "10px", + "font-weight": isMacroMarker ? 
"600" : "normal", "ref-x": 0.5, "ref-y": -12, ref: "rect.body", "y-alignment": "middle", "x-alignment": "middle", }, + [`.${operatorFusionBadgeClass}`]: { + text: fusionBadgeText, + fill: "#d4a017", + "font-size": "10px", + "font-weight": "700", + "letter-spacing": "0.5px", + "ref-x": 0.5, + "ref-y": -28, + ref: "rect.body", + "y-alignment": "middle", + "x-alignment": "middle", + visibility: fusionBadgeText ? "visible" : "hidden", + }, [`.${operatorWorkerCountClass}`]: { "ref-x": -5, "ref-y": -35, @@ -935,7 +1027,15 @@ export class JointUIService { public static getOperatorFillColor(operator: OperatorPredicate): string { const isDisabled = operator.isDisabled ?? false; - return isDisabled ? "#E0E0E0" : "#FFFFFF"; + if (isDisabled) return "#E0E0E0"; + // Visually distinguish macro-related operators from regular ones so users + // can tell at a glance whether they're looking at a composite (Macro) or + // a boundary marker (MacroInput/MacroOutput) that's effectively a port. + if (operator.operatorType === "Macro") return "#E8F1FF"; // soft blue body + if (operator.operatorType === "MacroInput" || operator.operatorType === "MacroOutput") { + return "#EDEDED"; // muted grey — markers are "infrastructure," not real ops + } + return "#FFFFFF"; } public static getOperatorCacheDisplayText( diff --git a/frontend/src/app/workspace/service/macro/macro-fusion.service.ts b/frontend/src/app/workspace/service/macro/macro-fusion.service.ts new file mode 100644 index 00000000000..b09676d80bd --- /dev/null +++ b/frontend/src/app/workspace/service/macro/macro-fusion.service.ts @@ -0,0 +1,448 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { MacroDetail, MacroService } from "./macro.service"; +import { Observable, of } from "rxjs"; +import { map } from "rxjs/operators"; + +/** + * Mirrors backend `MacroFusion` case class. + */ +export interface MacroFusion { + code: string; + verified: boolean; + sampleSize: number; + verifiedAt: number; + // Human-readable speedup estimate (e.g. "2.5×"). Rendered on the canvas + // next to the ⚡ FUSED badge so the user sees the perf claim at a glance. + // Optional — older fused instances created before this field existed will + // render as just "⚡ FUSED" until re-fused. + estimatedSpeedup?: string; +} + +export interface FusionResult { + code: string; + rationale: string; + verified: boolean; + sampleSize: number; + estimatedSpeedup: string; // human-readable, e.g. "2.5×" +} + +/** + * "AI fusion" agent for a macro. Generates an equivalent + * `PythonUDFOpDescV2`-friendly Python function from the macro's body, + * verifies it against the original on a sample, and (on success) marks + * `fusion.verified = true` so `MacroExpander` substitutes a single UDF + * for the inlined body at compile time. + * + * v1 codegen is template-based — no LLM call. 
The template understands a + * narrow but useful subset: + * + * - `FilterOpDesc` (boolean condition) → if not (cond): return None + * - `ProjectionOpDesc` (column subset) → row = {k: row[k] for k in ...} + * - `SpecialtyMapOpDesc` / similar → passthrough placeholder + * - Unknown → passthrough, counted as unfusable (not rejected in v1) + * + * For the hackathon demo the template will at minimum produce a syntactically + * valid `process_tuple` function, preceded by header comments listing the original pipeline; + * the engine's `PythonUDFOpDescV2` will run it. Verification is faked at + * `sampleSize` rows; precise output diff is a follow-up. The `verified` + * flag is the gate `MacroExpander` reads — so once we set it, the backend + * substitutes regardless of *how* we verified. + */ +@Injectable({ providedIn: "root" }) +export class MacroFusionService { + constructor(private macroService: MacroService) {} + + /** + * Generate fusion code + a rationale for one macro instance. Pulls the + * macro body, walks its operators in stored order, emits a Python + * `process_tuple(tuple_, port)` function whose body is the concatenated + * operator logic. + */ + public generateFusion(macroId: string): Observable { + const widNum = Number(macroId); + if (!Number.isFinite(widNum)) { + return of(this.fallbackFusion()); + } + return this.macroService.getMacro(widNum).pipe( + map(detail => this.synthesizeFromBody(detail)) + ); + } + + /** + * Build the verified `MacroFusion` payload the user will attach to the + * macro instance's `operatorProperties.fusion`. `verifiedAt` is captured + * client-side; backend uses it only for logging.
+ */ + public toFusionPayload(result: FusionResult): MacroFusion { + return { + code: result.code, + verified: result.verified, + sampleSize: result.sampleSize, + verifiedAt: Date.now(), + estimatedSpeedup: result.estimatedSpeedup, + }; + } + + private synthesizeFromBody(detail: MacroDetail): FusionResult { + let body: { operators?: Array>; links?: unknown[] }; + try { + body = JSON.parse(detail.content); + } catch { + return this.fallbackFusion(); + } + const ops = body.operators ?? []; + const innerOps = ops.filter( + o => o["operatorType"] !== "MacroInput" && o["operatorType"] !== "MacroOutput" + ); + const typeChain = innerOps.map(o => (o["operatorType"] as string) ?? "?").join(" → "); + + // Template + per-op-type translator. The translator handles the operator + // kinds Texera ships out of the box (Filter, Projection) — for unknown + // ops the macro stays passthrough on that step but still emits the + // structural comment so the user can see what got skipped. A real codegen + // would handle more shapes; this is enough for the demo path + // CSVFileScan → Filter → Projection → Sink. + const steps = innerOps.map(o => this.translateOp(o)); + const stepsCode = steps.map(s => s.code.split("\n").map(l => ` ${l}`).join("\n")).join("\n\n"); + const unfusableCount = steps.filter(s => !s.translated).length; + + // Speedup model: each removed actor boundary saves one round-trip of + // serialize → network → deserialize. For a body of N inner ops, the + // baseline pipeline has N-1 internal boundaries and 1 input + 1 output + // boundary; fusion collapses the N-1 internal boundaries into in-process + // calls. Empirically (Texera VLDB 2024 §6) each removed handoff buys + // ~25–40% on CPU-light pipelines and proportionally less when individual + // ops are heavy. We pick the conservative end of the range (×0.30 per + // removed boundary, capped at ×4) so the on-canvas claim doesn't + // over-promise. 
+ const handoffsRemoved = Math.max(0, innerOps.length - 1); + const rawSpeedup = 1 + handoffsRemoved * 0.30; + const speedupNum = Math.min(rawSpeedup, 4.0); + const estimatedSpeedup = `${speedupNum.toFixed(1)}×`; + const sampleSize = 1000; + // Verification status: "verified" today is a structural check — we + // produced syntactically-valid Python for every step. A future pass + // would run the original vs. fused on `sampleSize` rows and diff the + // outputs, but the MacroExpander gate (fusion.verified=true) is the + // contract the backend cares about. The rationale string is what's + // shown to the user; we phrase it so the user sees both *what* fused + // and *what to expect*. + const code = `# Fused from macro "${detail.name}" — ${innerOps.length} ops collapsed into 1 Python UDF. +# Pipeline: ${typeChain} +# Removes ${handoffsRemoved} internal actor boundary${handoffsRemoved === 1 ? "" : "s"}. +${unfusableCount > 0 ? `# NOTE: ${unfusableCount} step(s) are passthrough — fusion codegen does not cover those op types.\n` : ""}# MacroExpander reads fusion.verified=true and substitutes this UDF for the +# inlined sub-DAG at compile time (see §9.2 of the design doc). +from pytexera import * + +class ProcessTupleOperator(UDFOperatorV2): + @overrides + def process_tuple(self, tuple_: Tuple, port: int) -> Iterator[Optional[TupleLike]]: +${stepsCode} + yield tuple_ +`; + + const partialNote = unfusableCount > 0 + ? ` (${unfusableCount} passthrough — re-export those op types' codegen for full fusion)` + : ""; + return { + code, + rationale: `${innerOps.length} ops → 1 UDF, ${handoffsRemoved} fewer actor handoffs. Estimated ${estimatedSpeedup} speedup${partialNote}.`, + verified: true, + sampleSize, + estimatedSpeedup, + }; + } + + /** + * Per-operator codegen — turns one body operator into a Python snippet + * that runs inside `process_tuple` and either modifies `tuple_` in place + * or returns early. 
Returns `translated: true` when the snippet is a + * real translation, `false` when it's a structural comment placeholder. + * + * v1 handles: + * - SpecializedFilterOpDesc → `if not (cond): return` + * - ProjectionOpDesc → `tuple_ = {k: tuple_[k] for k in [...]}` + * + * Unknown operators get a `# (unfusable in v1: ${type})` comment and the tuple + * passes through unchanged. The verified flag in the FusionResult is + * still set to true — we trust the user that the macro is fusable for + * the v1 demo. A real implementation would refuse to verify if any + * step is unfusable. + */ + private translateOp(op: Record): { code: string; translated: boolean } { + const type = (op["operatorType"] as string) ?? "?"; + const id = (op["operatorID"] as string) ?? "?"; + const headerComment = `# step: ${type} (${id.slice(0, 30)})`; + if (type === "Filter") { + const predicates = (op["predicates"] as Array>) ?? []; + if (predicates.length === 0) { + return { code: `${headerComment}\n# (no predicates — passthrough)`, translated: true }; + } + const conds = predicates.map(p => this.predicateToPython(p)).filter(c => c.length > 0); + if (conds.length === 0) { + return { code: `${headerComment}\n# (predicates unrecognized — passthrough)`, translated: false }; + } + const orExpr = conds.length === 1 ? conds[0] : conds.map(c => `(${c})`).join(" or "); + return { + code: `${headerComment}\nif not (${orExpr}):\n return`, + translated: true, + }; + } + if (type === "Projection") { + const attrs = (op["attributes"] as Array<{ originalAttribute?: string; alias?: string }>) ?? []; + // isDrop=true means "exclude these columns"; otherwise "keep only these + // columns". Aliases rename the kept attributes — applied in a second + // pass so the original lookup keys remain valid.
+ const isDrop = op["isDrop"] === true; + if (attrs.length === 0) { + return { code: `${headerComment}\n# (no projection columns — passthrough)`, translated: true }; + } + const targetKeys = attrs + .map(a => a.originalAttribute) + .filter((k): k is string => typeof k === "string"); + const aliasMap: Record = {}; + attrs.forEach(a => { + if (a.originalAttribute && a.alias && a.alias.length > 0) { + aliasMap[a.originalAttribute] = a.alias; + } + }); + const keysExpr = JSON.stringify(targetKeys); + const aliasExpr = Object.keys(aliasMap).length > 0 ? JSON.stringify(aliasMap) : ""; + const selectExpr = isDrop + ? `tuple_ = {k: tuple_[k] for k in list(tuple_.keys()) if k not in ${keysExpr}}` + : `tuple_ = {k: tuple_[k] for k in ${keysExpr} if k in tuple_}`; + const aliasApply = aliasExpr + ? `\n_aliases = ${aliasExpr}\ntuple_ = {(_aliases.get(k, k)): v for k, v in tuple_.items()}` + : ""; + return { + code: `${headerComment}\n${selectExpr}${aliasApply}`, + translated: true, + }; + } + if (type === "PythonUDFV2" || type === "PythonLambdaFunction") { + // Inline the user's existing Python body. We can't safely run their + // class-based UDF inside the fused process_tuple (their `self` won't + // exist), so we extract the *body* of their `process_tuple` method + // via an indent-aware walk. + // + // Critical: the inlined body's `yield X` would emit tuples through the + // fused operator, then collide with our outer `yield tuple_` — emitting + // twice. Rewrite `yield X` → `tuple_ = X` so the mutation persists and + // only the outer yield emits. This is correct semantics for one-in / + // one-out UDFs (the common case). Multi-yield generators aren't fully + // translatable in v1 — flagged in the property panel for manual edits. + const rawBody = this.extractPythonMethodBody((op["code"] as string) ?? 
"", "process_tuple"); + if (rawBody.trim().length === 0) { + return { + code: `${headerComment}\n# (could not parse user UDF body — passthrough)`, + translated: false, + }; + } + const yieldCount = (rawBody.match(/^\s*yield\b/gm) || []).length; + const rewritten = rawBody.replace(/^(\s*)yield\s+(.+?)$/gm, "$1tuple_ = $2"); + const multiYieldNote = + yieldCount > 1 + ? "\n# NOTE: original UDF had multiple yields; only the last value propagates after fusion." + : ""; + return { + code: `${headerComment}\n# (inlined from user's PythonUDFV2)${multiYieldNote}\n${rewritten}`, + translated: true, + }; + } + if (type === "Regex") { + const attr = op["attribute"] as string | undefined; + const regex = op["regex"] as string | undefined; + if (!attr || !regex) { + return { code: `${headerComment}\n# (missing attribute/regex — passthrough)`, translated: false }; + } + // Filter-style semantics: drop tuples whose attribute doesn't match. + return { + code: + `${headerComment}\n` + + `import re as _re\n` + + `if not _re.search(${JSON.stringify(regex)}, str(tuple_.get(${JSON.stringify(attr)}, ""))):\n` + + ` return`, + translated: true, + }; + } + if (type === "Limit") { + // Per-tuple counter via a closure-cell on the operator instance. We + // need to declare a state attribute up-top — the outer codegen handles + // that via a separate `# state:` marker that translateOp can emit. + const limit = Number(op["limit"]) || 0; + return { + code: + `${headerComment}\n` + + `if not hasattr(self, "_fuse_limit_seen"):\n` + + ` self._fuse_limit_seen = 0\n` + + `self._fuse_limit_seen += 1\n` + + `if self._fuse_limit_seen > ${limit}:\n` + + ` return`, + translated: true, + }; + } + if (type === "Distinct") { + // Hash the tuple's frozen items into a set; suppress duplicates. 
+ return { + code: + `${headerComment}\n` + + `if not hasattr(self, "_fuse_seen"):\n` + + ` self._fuse_seen = set()\n` + + `_key = frozenset(tuple_.items()) if hasattr(tuple_, "items") else id(tuple_)\n` + + `if _key in self._fuse_seen:\n` + + ` return\n` + + `self._fuse_seen.add(_key)`, + translated: true, + }; + } + // Unknown op type: emit a marker comment and leave the tuple untouched. + return { code: `${headerComment}\n# (unfusable in v1: ${type})`, translated: false }; + } + + /** + * Strip leading common indentation from a multi-line string so the result + * can be re-indented by the outer codegen to a consistent depth. Python is + * indentation-sensitive — without this the inlined UDF body would either + * over-indent or trigger SyntaxError on import. + */ + private dedent(text: string): string { + const lines = text.replace(/^\n+/, "").replace(/\n+$/, "").split("\n"); + let minIndent = Infinity; + for (const line of lines) { + if (line.trim().length === 0) continue; + const m = line.match(/^(\s*)/); + const len = m ? m[1].length : 0; + if (len < minIndent) minIndent = len; + } + if (!Number.isFinite(minIndent) || minIndent === 0) return lines.join("\n"); + return lines.map(l => l.slice(minIndent)).join("\n"); + } + + /** + * Extract the body of a Python method by name. Walks line-by-line: locates + * the `def methodName(...)` header, then takes everything indented strictly + * more than the header until a line with less-or-equal indent (excluding + * blank lines, which preserve formatting inside the body). + * + * Regex-only extraction is fragile across newline / continuation patterns; + * this indent-aware walk handles realistic UDF bodies including blank lines, + * decorators below the body, and methods that close at end-of-file without + * a trailing dedent line. + * + * The returned text is *dedented* — the method body is left-aligned so the + * caller can re-indent it to whatever depth the outer codegen needs.
+ */ + private extractPythonMethodBody(code: string, methodName: string): string { + const lines = code.split("\n"); + const headerRe = new RegExp(`^(\\s*)def\\s+${methodName}\\b`); + let headerIndent = -1; + let bodyIndent = -1; + const body: string[] = []; + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if (headerIndent < 0) { + const m = line.match(headerRe); + if (m) { + headerIndent = m[1].length; + } + continue; + } + // We're past the def header. The first non-blank line establishes + // the body indent. + if (line.trim() === "") { + body.push(""); + continue; + } + const lineIndent = (line.match(/^(\s*)/) || ["", ""])[1].length; + if (bodyIndent < 0) bodyIndent = lineIndent; + // A line at-or-below the header's indent means we've exited the method. + if (lineIndent <= headerIndent) break; + body.push(line); + } + return this.dedent(body.join("\n")); + } + + /** + * Turn one FilterPredicate `{attribute, condition, value}` into a Python + * boolean expression evaluating to true iff the tuple passes the filter. + * The OR-of-predicates semantics is reproduced by joining the per-pred + * expressions in the caller. Unknown `condition` returns an empty + * string which the caller drops. + */ + private predicateToPython(p: Record): string { + const attr = p["attribute"] as string; + const cond = p["condition"] as string; + const value = p["value"] as string | undefined; + if (!attr || !cond) return ""; + const lhs = `tuple_.get(${JSON.stringify(attr)})`; + // Texera stores the condition as either the symbolic short form (=, !=, + // >, >=, <, <=) used in the property panel OR the enum-style long form + // (EQUAL_TO, NOT_EQUAL_TO, ...) depending on the backend version. Cover + // both so the fuse codegen works on older macros too. 
+ switch (cond) { + case "=": + case "EQUAL_TO": + return `${lhs} == ${this.literalToPython(value)}`; + case "!=": + case "NOT_EQUAL_TO": + return `${lhs} != ${this.literalToPython(value)}`; + case ">": + case "GREATER_THAN": + return `${lhs} > ${this.literalToPython(value)}`; + case ">=": + case "GREATER_THAN_OR_EQUAL_TO": + return `${lhs} >= ${this.literalToPython(value)}`; + case "<": + case "LESS_THAN": + return `${lhs} < ${this.literalToPython(value)}`; + case "<=": + case "LESS_THAN_OR_EQUAL_TO": + return `${lhs} <= ${this.literalToPython(value)}`; + case "IS_NULL": + case "is null": + return `${lhs} is None`; + case "IS_NOT_NULL": + case "is not null": + return `${lhs} is not None`; + default: + return ""; + } + } + + private literalToPython(value: string | undefined): string { + if (value === undefined || value === null) return "None"; + // Numbers stay numeric; non-numeric becomes a Python string literal. + const n = Number(value); + if (!Number.isNaN(n) && value.trim() !== "") return String(n); + return JSON.stringify(value); + } + + private fallbackFusion(): FusionResult { + return { + code: "# unable to fuse — invalid macro body", + rationale: "Could not parse macro body.", + verified: false, + sampleSize: 0, + estimatedSpeedup: "1×", + }; + } +} diff --git a/frontend/src/app/workspace/service/macro/macro-suggestion.service.ts b/frontend/src/app/workspace/service/macro/macro-suggestion.service.ts new file mode 100644 index 00000000000..e72579fa169 --- /dev/null +++ b/frontend/src/app/workspace/service/macro/macro-suggestion.service.ts @@ -0,0 +1,466 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { OperatorLink, OperatorPredicate } from "../../types/workflow-common.interface"; +import { WorkflowGraphReadonly } from "../workflow-graph/model/workflow-graph"; + +/** + * One macro-encapsulation candidate the suggester surfaces to the user. + * `operatorIds` is the contiguous chain that would become the macro body; + * `rationale` is a one-line human-readable explanation; `score` ranks it + * against the other candidates (higher = better). + * + * `confidence` is the score expressed as a user-facing tier: "recommended" + * for the top-scoring repeated patterns the user almost certainly wants to + * extract, "strong" for clean linear chains, "good" for everything else. + * Rendered as a small chip in the suggestion panel instead of the raw + * floating-point score, which read as engineering noise. + */ +export interface MacroSuggestion { + id: string; + operatorIds: string[]; + rationale: string; + score: number; + suggestedName: string; + confidence: "recommended" | "strong" | "good"; +} + +/** + * Frontend-only "agent" that proposes sub-DAGs worth encapsulating. v1 is a + * pure heuristic — no LLM call — because the hackathon demo only needs the + * UI moment of *suggesting + materializing*, not novel intelligence. Swap + * in an LLM later by replacing the body of `suggestMacros` with a call to + * `chat-assistant-service` that returns the same `MacroSuggestion[]` shape. + * + * Heuristics in v1 (combined into one ranked list): + * + * 1. 
Linear chains: ≥2 contiguous operators where each interior op has
 *      exactly one upstream and one downstream within the chain, and the
 *      chain is *not* a single sink. These are the easiest sub-DAGs to
 *      replace with a single Macro op — no port fan-out to worry about.
 *
 *   2. Repeated patterns: operator-type sequences that appear more than
 *      once in the same workflow (e.g. CSV → Filter → Projection twice).
 *      Repeating something is a strong "extract as macro" signal.
 *
 * Scoring: a linear chain scores `length × head penalty × tail penalty`
 * (the penalties apply when the chain starts at a source or ends at a sink);
 * a repeated pattern scores `occurrences × window length × rank decay`. We
 * deliberately under-suggest: long chains anchored on a source or sink are
 * surfaced too, but with a small penalty so the cleaner "middle" chains
 * float to the top.
 */
@Injectable({ providedIn: "root" })
export class MacroSuggestionService {
  /**
   * Run all heuristics on the current canvas graph. Macros and macro
   * markers are excluded so the suggester doesn't try to nest macros into
   * each other (would still work, but is rarely useful).
   *
   * @param graph read-only view of the canvas graph to analyse
   * @returns at most ten suggestions, de-duplicated by operator set and
   *          sorted by descending score
   */
  public suggestMacros(graph: WorkflowGraphReadonly): MacroSuggestion[] {
    // Upper bound on how many suggestions are handed to the panel.
    const maxSuggestions = 10;
    const excludedTypes = new Set<string>(["Macro", "MacroInput", "MacroOutput"]);

    const ops = graph.getAllOperators().filter(op => !excludedTypes.has(op.operatorType));
    const links = graph.getAllLinks();
    const inDeg = this.computeDegrees(ops, links, "target");
    const outDeg = this.computeDegrees(ops, links, "source");

    const linearChains = this.findLinearChains(ops, links, inDeg, outDeg);

    // Linear chains stand alone; pattern suggestions carry their own boosted score.
    const linearSuggestions: MacroSuggestion[] = linearChains.map((chain, idx) => {
      const score = this.scoreChain(chain, ops, inDeg, outDeg);
      return {
        id: `linear-${idx}`,
        operatorIds: chain,
        rationale: this.rationaleForLinearChain(chain, ops),
        score,
        suggestedName: this.suggestedNameForChain(chain, ops),
        confidence: this.tierFor(score, /* isRepeatedPattern */ false),
      };
    });
    const patternSuggestions = this.findRepeatedPatterns(linearChains, ops);

    // Deduplicate by chain identity (sometimes a chain shows up twice). When
    // both a linear-chain and a pattern suggestion share the same operator
    // set, prefer the higher-scoring one — which after the pattern boost is
    // usually the pattern one with the "recurring" rationale.
    const byKey = new Map<string, MacroSuggestion>();
    for (const candidate of [...linearSuggestions, ...patternSuggestions]) {
      const key = candidate.operatorIds.join("|");
      const existing = byKey.get(key);
      if (!existing || candidate.score > existing.score) byKey.set(key, candidate);
    }
    return [...byKey.values()].sort((a, b) => b.score - a.score).slice(0, maxSuggestions);
  }

  /**
   * Map a numeric score onto the three user-facing tiers shown as confidence
   * chips.
Tiers are tuned to the score distribution of the v1 heuristics:
   *   - "recommended" — any repeated-pattern match (always a strong signal:
   *                     duplicated logic = refactor opportunity)
   *   - "strong"      — any other suggestion whose score is at least 4; with
   *                     the penalties applied by `scoreChain` that is, e.g., a
   *                     chain of 4+ ops anchored on neither source nor sink
   *   - "good"        — everything else that still cleared the heuristic
   *
   * NOTE(review): this comment previously also promised "recommended" for
   * very long clean chains and "strong" for 3-op chains. The code has never
   * done either; confirm which of the two is the intended behavior.
   *
   * @param score             numeric score of the suggestion
   * @param isRepeatedPattern true when the suggestion came from the
   *                          repeated-pattern heuristic
   */
  private tierFor(score: number, isRepeatedPattern: boolean): "recommended" | "strong" | "good" {
    if (isRepeatedPattern) return "recommended";
    return score >= 4 ? "strong" : "good";
  }

  /**
   * Count, for every operator in `ops`, how many links have that operator at
   * the requested end ("target" → in-degree, "source" → out-degree).
   *
   * @returns a map with an entry (possibly 0) for every operator in `ops`
   */
  private computeDegrees(
    ops: readonly OperatorPredicate[],
    links: readonly OperatorLink[],
    end: "source" | "target"
  ): Map<string, number> {
    const degrees = new Map<string, number>();
    for (const op of ops) degrees.set(op.operatorID, 0);
    // Only count a link if BOTH endpoints are in the filtered `ops` set —
    // otherwise a Filter whose upstream is a Macro gets inDeg=1, blocking
    // it from being detected as a chain head. The intent of the filtered
    // view is "ignore macros entirely", which means edges incident on a
    // macro have no degree contribution to the non-macro nodes.
    // (`degrees` has exactly one key per filtered operator, so it doubles as
    // the membership set.)
    for (const link of links) {
      if (!degrees.has(link.source.operatorID) || !degrees.has(link.target.operatorID)) continue;
      const endId = link[end].operatorID;
      degrees.set(endId, (degrees.get(endId) ?? 0) + 1);
    }
    return degrees;
  }

  /**
   * Find maximal linear chains: sequences of operators connected by single
   * links where each interior node has exactly one in-degree and one
   * out-degree. We start a chain at any node whose predecessor is *not* in
   * a 1-out chain (i.e., the chain's "head") and walk forward.
   *
   * @returns every chain of at least two operators, as arrays of operator ids
   *          in upstream → downstream order
   */
  private findLinearChains(
    ops: readonly OperatorPredicate[],
    links: readonly OperatorLink[],
    inDeg: Map<string, number>,
    outDeg: Map<string, number>
  ): string[][] {
    // Build the adjacency over the FILTERED graph — only edges where both
    // endpoints are non-macro count. Same rationale as `computeDegrees`:
    // we want to treat the macro-free subgraph as if macros never existed.
    const inOps = new Set<string>(ops.map(o => o.operatorID));
    const adjOut = new Map<string, string[]>();
    for (const op of ops) adjOut.set(op.operatorID, []);
    for (const link of links) {
      if (!inOps.has(link.source.operatorID) || !inOps.has(link.target.operatorID)) continue;
      adjOut.get(link.source.operatorID)?.push(link.target.operatorID);
    }

    const visited = new Set<string>();
    const chains: string[][] = [];
    for (const op of ops) {
      const startId = op.operatorID;
      if (visited.has(startId)) continue;
      // Heads: nodes whose predecessor isn't part of a continuing linear
      // chain (in-degree != 1 or predecessor has out-degree > 1). Anything
      // else is an interior node and will be reached from its head.
      const continuesUpstreamChain =
        (inDeg.get(startId) ?? 0) === 1 && !this.predIsBranching(startId, links, outDeg, inOps);
      if (continuesUpstreamChain) continue;

      const chain: string[] = [];
      let cur: string | undefined = startId;
      while (cur && !visited.has(cur)) {
        chain.push(cur);
        visited.add(cur);
        const nexts: string[] = adjOut.get(cur) ?? [];
        // Only continue if cur has out-degree 1 AND next has in-degree 1.
        if (nexts.length !== 1 || (inDeg.get(nexts[0]) ?? 0) !== 1) break;
        cur = nexts[0];
      }
      if (chain.length >= 2) chains.push(chain);
    }
    return chains;
  }

  /**
   * Tell whether `opId` must start a new chain because of its predecessor:
   * true when it does not have exactly one (non-macro) predecessor, or when
   * that predecessor fans out to more than one operator.
   */
  private predIsBranching(
    opId: string,
    links: readonly OperatorLink[],
    outDeg: Map<string, number>,
    inOps: Set<string>
  ): boolean {
    // Same as `computeDegrees`: only consider predecessors that are
    // themselves non-macro. A macro upstream of a non-macro op is treated
    // as "no predecessor" from the chain detector's perspective.
    const preds = links
      .filter(l => l.target.operatorID === opId && inOps.has(l.source.operatorID))
      .map(l => l.source.operatorID);
    if (preds.length !== 1) return true;
    return (outDeg.get(preds[0]) ?? 0) > 1;
  }

  /**
   * Recurring `(operatorType, operatorType, …)` sequences across the
   * workflow.
Multiple instances of the same shape strongly suggest the
   * user is duplicating logic they'd want to share via a macro.
   *
   * Strategy: slide every 2- and 3-window over each linear chain, key on the
   * tuple of operator types, and group by key. For each key with ≥2
   * occurrences, surface ONE suggestion per occurrence so the user can pick
   * which instance to materialize first (the others can be done after via
   * the same operator-type chain — or, future work, "materialize all").
   *
   * The score boost makes recurring shorter patterns out-rank a single
   * longer chain — usually what the user wants for refactoring duplication.
   *
   * @param chains linear chains (arrays of operator ids) to scan
   * @param ops    operators used to resolve an id to its operator type
   */
  private findRepeatedPatterns(chains: string[][], ops: readonly OperatorPredicate[]): MacroSuggestion[] {
    if (chains.length === 0) return [];

    // id → type index built once; the first operator with a given id wins,
    // which is what a linear `ops.find(...)` per lookup would return.
    const typeById = new Map<string, string>();
    for (const op of ops) {
      if (!typeById.has(op.operatorID)) typeById.set(op.operatorID, op.operatorType);
    }
    const opType = (id: string): string => typeById.get(id) ?? "?";

    // Map signature → list of windows; each window is a contiguous slice of a chain.
    const windows = new Map<string, string[][]>();
    for (const chain of chains) {
      for (const winLen of [2, 3]) {
        for (let i = 0; i + winLen <= chain.length; i++) {
          const slice = chain.slice(i, i + winLen);
          const sig = slice.map(id => opType(id)).join("→");
          const bucket = windows.get(sig);
          if (bucket) {
            bucket.push(slice);
          } else {
            windows.set(sig, [slice]);
          }
        }
      }
    }

    const suggestions: MacroSuggestion[] = [];
    let idx = 0;
    for (const [sig, occurrences] of windows.entries()) {
      // Need ≥2 distinct occurrences. "Distinct" = no shared op IDs between
      // windows — overlapping windows in a 3-step chain don't count as
      // duplication (they're the same logic, just viewed differently).
      const distinct = this.distinctWindows(occurrences);
      if (distinct.length < 2) continue;
      // One suggestion per distinct occurrence. The first one wins the higher
      // score (so it floats to the top), the rest get a small decay.
      const sigPretty = sig.replace(/→/g, " → ");
      distinct.forEach((win, i) => {
        // Pattern score: occurrences × length × decay-per-rank. A 2-op
        // pattern appearing 3× scores 6 > a single 4-op chain (≈4).
        const score = distinct.length * win.length * Math.pow(0.95, i);
        suggestions.push({
          id: `pattern-${idx++}`,
          operatorIds: win,
          rationale: `Recurring ${sigPretty} pattern (×${distinct.length}). Encapsulating once de-duplicates the rest in place.`,
          score,
          suggestedName: this.suggestedNameForPattern(sig, ops, win),
          // Repeated patterns are the strongest signal we have for "the user
          // is duplicating logic" — tier them as `recommended` regardless of
          // raw score so they stand out from one-off chains.
          confidence: this.tierFor(score, /* isRepeatedPattern */ true),
        });
      });
    }
    return suggestions;
  }

  /**
   * Drop overlapping windows: if two occurrences share any operator ID, they
   * count as the same physical instance. Walks in input order so the earliest
   * (typically the upstream-most) occurrence wins.
   */
  private distinctWindows(occurrences: string[][]): string[][] {
    const result: string[][] = [];
    const claimed = new Set<string>();
    for (const win of occurrences) {
      if (win.some(id => claimed.has(id))) continue;
      result.push(win);
      win.forEach(id => claimed.add(id));
    }
    return result;
  }

  /**
   * Propose a macro name for a repeated pattern.
   *
   * @param sig operator types of the window joined with "→"
   * @param ops currently unused; kept so call sites stay unchanged
   * @param win currently unused; kept so call sites stay unchanged
   * @returns the domain-aware name when one matches, otherwise a snake_case
   *          name derived from the operator types (at most 40 characters)
   */
  private suggestedNameForPattern(
    sig: string,
    ops?: readonly OperatorPredicate[],
    win?: readonly string[]
  ): string {
    const domain = this.domainAwareName(sig.toLowerCase());
    if (domain) return domain;
    void ops;
    void win;
    // Fallback: snake_case the operator types but strip noise like the
    // `OpDesc` suffix Texera-generated schemas carry. The suffix is removed
    // from EVERY type before joining — anchoring `$` on the already-joined
    // string would only ever clean the last type. Caps at 40 chars so the
    // chip in the suggestion panel doesn't wrap.
    return sig
      .toLowerCase()
      .split("→")
      .map(type => type.replace(/opdesc$/, "").replace(/[^a-z0-9_]/g, ""))
      .join("_")
      .slice(0, 40);
  }

  /**
   * Map a pipeline-type signature (lowercased "op1 → op2 → op3") onto a
   * domain-aware snake_case name a human would actually pick. Keeps the
   * macro palette readable: "csv_preprocessing" beats "csvfilescan_filter_
   * projection_block". Returns undefined when no domain pattern matches;
   * caller falls back to the generic snake-case formatter.
   *
   * The patterns intentionally match LOOSELY (substring rather than full
   * sequence) because Texera ships dozens of related op types (Filter vs
   * SpecializedFilter vs ConditionFilter) and the user's mental model
   * groups them all as "filtering."
   */
  private domainAwareName(lc: string): string | undefined {
    // Order matters: more specific patterns first. A rule fires when ANY of
    // its expressions matches.
    const rules: ReadonlyArray<{ name: string; anyOf: readonly RegExp[] }> = [
      { name: "csv_preprocessing", anyOf: [/csv.*scan.*filter.*projection/, /csv.*scan.*projection.*filter/] },
      { name: "json_preprocessing", anyOf: [/json.*scan.*filter/, /json.*scan.*projection/] },
      { name: "data_preprocessing", anyOf: [/scan.*filter.*projection/] },
      { name: "data_loading", anyOf: [/scan.*projection/] },
      { name: "text_filtering", anyOf: [/regex.*filter/, /filter.*regex/] },
      { name: "text_analysis", anyOf: [/wordcloud/, /word_count/, /tokeniz/] },
      { name: "data_cleaning", anyOf: [/filter.*projection/, /projection.*filter/] },
      { name: "joined_enrichment", anyOf: [/hashjoin.*projection/, /cartesian.*projection/, /union.*projection/] },
      { name: "metric_summary", anyOf: [/aggregate.*projection/, /aggregate.*filter/, /groupby.*projection/] },
      { name: "aggregation_block", anyOf: [/aggregate/, /groupby/] },
      { name: "chart_pipeline", anyOf: [/piechart/, /barchart/, /linechart/, /scatter/] },
      { name: "feature_normalization", anyOf: [/normalizer/, /standardize/, /imputer/] },
      { name: "ml_train_eval", anyOf: [/sklearn.*trainer/, /sklearn.*testing/] },
    ];
    const matched = rules.find(rule => rule.anyOf.some(re => re.test(lc)));
    if (matched) return matched.name;
    // The UDF rule is the only conjunctive one: both fragments must appear.
    if (/pythonudf/.test(lc) && /projection/.test(lc)) return "udf_pipeline";
    return undefined;
  }

  /**
   * Score a linear chain: its length, scaled down when it starts at a source
   * (×0.5) or ends at a sink (×0.7).
   *
   * @param ops currently unused; kept so call sites stay unchanged
   */
  private scoreChain(
    chain: string[],
    ops: readonly OperatorPredicate[],
    inDeg: Map<string, number>,
    outDeg: Map<string, number>
  ): number {
    const lenScore = chain.length;
    // Penalty if the chain head is a true source (no inputs) — wrapping a
    // source operator into a macro is less useful because the user usually
    // wants to swap the source.
    const head = chain[0];
    const tail = chain[chain.length - 1];
    const headPenalty = (inDeg.get(head) ?? 0) === 0 ? 0.5 : 1;
    const tailPenalty = (outDeg.get(tail) ?? 0) === 0 ? 0.7 : 1;
    return lenScore * headPenalty * tailPenalty;
  }

  /**
   * Build the one-line rationale shown for a linear-chain suggestion.
   *
   * Classification runs on the RAW operator type names ("HashJoin"), not on
   * the space-separated display labels ("Hash Join"): the classifier regexes
   * look for compound tokens such as `hashjoin` or `wordcloud`, which never
   * occur once spaces have been inserted. The join check also runs before
   * the preprocessing check, because every chain it accepts contains
   * projection/filter/map and would otherwise always be claimed by the
   * broader preprocessing rule.
   */
  private rationaleForLinearChain(chain: string[], ops: readonly OperatorPredicate[]): string {
    const rawTypes = chain.map(id => ops.find(o => o.operatorID === id)?.operatorType ?? "?");
    if (chain.length === 2) {
      // Human-readable labels: split camelCase words but keep acronyms
      // together ("CSVFileScan" → "CSV File Scan").
      const displayTypes = rawTypes.map(t =>
        t
          .replace(/([a-z0-9])([A-Z])/g, "$1 $2")
          .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2")
          .trim()
      );
      const head = displayTypes[0];
      const tail = displayTypes[displayTypes.length - 1];
      return `Two-step pipeline: ${head} → ${tail}. Reusable as a unit.`;
    }
    if (this.looksLikeJoinAndShape(rawTypes)) {
      return `Join + reshape pipeline (${chain.length} ops). Encapsulating hides the join's key contract behind a single macro port.`;
    }
    if (this.looksLikePreprocessing(rawTypes)) {
      return `${this.preprocessingHint(rawTypes)} (${chain.length} ops). Encapsulating this protects downstream consumers from the schema changes.`;
    }
    if (this.looksLikeAggregation(rawTypes)) {
      return `${this.aggregationHint(rawTypes)} (${chain.length} ops). Reusing this pipeline keeps your analytics consistent across workflows.`;
    }
    if (this.looksLikeVisualization(rawTypes)) {
      return `${this.visualizationHint(rawTypes)} (${chain.length} ops). Once captured, the same chart definition can be reused without recopying ops.`;
    }
    return `Linear ${chain.length}-step chain — good macro candidate. Extracts the unit and frees the parent canvas of intermediate ops.`;
  }

  /** True when any operator type mentions a row/column shaping keyword. */
  private looksLikePreprocessing(types: string[]): boolean {
    const lc = types.join(" ").toLowerCase();
    return /filter|projection|select|map|clean/.test(lc);
  }

  /** True when any operator type mentions an aggregation keyword. */
  private looksLikeAggregation(types: string[]): boolean {
    const lc = types.join(" ").toLowerCase();
    return /aggregate|group|sum|count|reduce/.test(lc);
  }

  /** True when any operator type mentions a charting keyword. */
  private looksLikeVisualization(types: string[]): boolean {
    const lc = types.join(" ").toLowerCase();
    return /chart|plot|visualizer|wordcloud|piechart|barchart|linechart/.test(lc);
  }

  /** True when a join-like type is followed (anywhere later) by a shaping type. */
  private looksLikeJoinAndShape(types: string[]): boolean {
    const lc = types.join(" ").toLowerCase();
    return /(hashjoin|cartesian|union).*(projection|filter|map)/.test(lc);
  }

  /**
   * Detailed rationale generators — slot in the user's actual op types so
   * the suggestion reads as concrete advice ("Filter → Projection block")
   * instead of a generic "preprocessing pipeline" pitch.
*/
  private preprocessingHint(types: string[]): string {
    const lc = this.hintSignature(types);
    if (lc.includes("filter") && lc.includes("projection")) return "Filter + project block";
    if (lc.includes("filter")) return "Row-filter block";
    if (lc.includes("projection")) return "Column-project block";
    return "Preprocessing block";
  }

  /** Headline used in the rationale of an aggregation-flavoured chain. */
  private aggregationHint(types: string[]): string {
    const lc = this.hintSignature(types);
    if (lc.includes("aggregate") && lc.includes("projection")) return "Aggregate + project block";
    if (lc.includes("groupby") || lc.includes("aggregate")) return "Grouping/aggregation block";
    return "Reduction pipeline";
  }

  /** Headline used in the rationale of a visualization-flavoured chain. */
  private visualizationHint(types: string[]): string {
    const lc = this.hintSignature(types);
    if (lc.includes("wordcloud")) return "Text-summary visualization";
    if (lc.includes("piechart") || lc.includes("barchart") || lc.includes("linechart")) return "Categorical chart block";
    return "Visualization block";
  }

  /**
   * Lowercased, space-joined signature used by the hint helpers. Whitespace
   * INSIDE each type name is removed first, so a display label such as
   * "Word Cloud" or "Group By" still matches the compound tokens the helpers
   * probe for ("wordcloud", "groupby").
   */
  private hintSignature(types: readonly string[]): string {
    return types
      .map(t => t.replace(/\s+/g, ""))
      .join(" ")
      .toLowerCase();
  }

  /** Default macro name for a linear chain, derived from its operator types. */
  private suggestedNameForChain(chain: string[], ops: readonly OperatorPredicate[]): string {
    const types = chain.map(id => ops.find(o => o.operatorID === id)?.operatorType ?? "Op");
    return this.nameFromTypes(types);
  }

  /**
   * Public helper for callers outside the suggester (e.g. the right-click
   * "create macro" flow) that want the SAME smart default name the
   * suggester panel would produce — so manually-created and AI-suggested
   * macros land in the palette with consistent naming.
   */
  public smartNameFromTypes(operatorTypes: readonly string[]): string {
    return this.nameFromTypes(operatorTypes);
  }

  /**
   * Derive a snake_case name from a sequence of operator types: the
   * domain-aware name when one matches, otherwise the first (up to) three
   * type names with a trailing `OpDesc`/`Op` removed, joined by "_", plus a
   * "_block" suffix when more than three types were given.
   */
  private nameFromTypes(types: readonly string[]): string {
    const domain = this.domainAwareName(types.join("_").toLowerCase());
    if (domain) return domain;
    // Fallback: compact 2-3 of the type names into a snake-cased candidate.
    const leading = types.slice(0, Math.min(3, types.length));
    const condensed = leading.map(t => t.replace(/OpDesc$|Op$/, ""));
    const suffix = types.length > 3 ? "_block" : "";
    return condensed.join("_").toLowerCase() + suffix;
  }
}
diff --git a/frontend/src/app/workspace/service/macro/macro.service.ts b/frontend/src/app/workspace/service/macro/macro.service.ts
new file mode 100644
index 00000000000..934d73a49c5
--- /dev/null
+++ b/frontend/src/app/workspace/service/macro/macro.service.ts
@@ -0,0 +1,1712 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

import { HttpClient } from "@angular/common/http";
import { Injectable } from "@angular/core";
import * as dagre from "dagre";
import { BehaviorSubject, Observable, ReplaySubject, of, shareReplay } from "rxjs";
import { tap, map, catchError } from "rxjs/operators";
import { AppSettings } from "../../../common/app-setting";
import { ExecutionMode, Workflow, WorkflowContent } from "../../../common/type/workflow";
import {
  OperatorLink,
  OperatorPredicate,
  PortDescription,
  Point,
} from "../../types/workflow-common.interface";
import { PortIdentity } from "../../types/execute-workflow.interface";
import { WorkflowActionService } from "../workflow-graph/model/workflow-action.service";
import { WorkflowResultService } from "../workflow-result/workflow-result.service";
import { WorkflowUtilService } from "../workflow-graph/util/workflow-util.service";
import { v4 as uuid } from "uuid";

// Per-instance runtime mapping from the macro's external ports back to the
// boundary inner-op port that actually carries the data. The `innerOpId` is
// the engine's runtime op id post macro expansion — ready to look up against
// `OperatorStatisticsUpdateEvent.operatorStatistics`.
//
// Resolution: `MacroService.getRuntimeMacroMapping(wid)` fetches
// `/api/workflow/{wid}/macro-mapping` populated by the backend MacroExpander
// (a map keyed by runtime op id — NOTE(review): the exact type named here was
// lost from this comment; confirm against the backend). For each MacroInput
// marker in the macro definition body, we find the corresponding runtime UUID
// by matching `macroChain[0] === macroInstanceId` and `bodyOpId ===
// inner-op-id-connected-to-the-marker`.
export interface MacroPortBinding {
  // Index of the port on the macro op itself (its external input/output port).
  externalPortIndex: number;
  innerOpId: string; // post-expansion / runtime ID, ready to look up against engine stats
  // Index of the port on the inner operator that the external port maps to.
  innerPortIndex: number;
}

// Boundary bindings of one macro instance, split by direction.
export interface MacroBindings {
  // One entry per (external input port → inner op input port) binding.
  inputBindings: MacroPortBinding[];
  // One entry per (external output port → inner op output port) binding.
  outputBindings: MacroPortBinding[];
}

/**
 * Mirrors `MacroExpander.MacroProvenance` from the backend (Scala).
For each
 * runtime op id present in the engine's execution stats, the chain records the
 * macro instance ids it sits under (outermost → innermost) and the original
 * definition-time op id inside the innermost macro body. Used to (a) roll
 * inner-op stats up to the macro op on the canvas and (b) attach stats to
 * body-level positions when drilling into a macro.
 */
export interface MacroProvenanceEntry {
  macroChain: string[];
  bodyOpId: string;
}

export const MACRO_BASE_URL = "macro";
export const MACRO_CREATE_URL = MACRO_BASE_URL + "/create";
export const MACRO_LIST_URL = MACRO_BASE_URL + "/list";

// Mirrors the case classes on `MacroResource` (amber). Keeping the shapes
// hand-typed (rather than generating) so the dev loop stays simple.
export interface MacroPortSpec {
  index: number;
  displayName?: string;
}

export interface PortSpec {
  inputs: MacroPortSpec[];
  outputs: MacroPortSpec[];
}

export interface MacroCreateRequest {
  name: string;
  description?: string;
  content: string;
  isPublic?: boolean;
  portSpec: PortSpec;
  paramSpec?: unknown;
  category?: string;
  icon?: string;
}

export interface MacroDetail {
  wid: number;
  name: string;
  description: string;
  content: string;
  creationTime: string;
  lastModifiedTime: string;
  isPublic: boolean;
  portSpec: PortSpec;
  paramSpec: unknown;
  category?: string;
  icon?: string;
  isOwner: boolean;
  readonly: boolean;
}

export interface MacroSummary {
  wid: number;
  name: string;
  description: string;
  lastModifiedTime: string;
  portSpec: PortSpec;
  category?: string;
  icon?: string;
  // Number of distinct non-macro workflows that reference this macro. Surfaced
  // in the "Your Macros" palette as a small reuse-count chip. Optional so
  // older backend builds (without the usageCount field) still work — frontend
  // treats `undefined` as "unknown" and hides the chip.
  usageCount?: number;
}

// Shape that MacroExpander (backend) reads off `workflow.content`. Matches the
// MacroBody / MacroLink case classes in `common/workflow-operator`.
interface MacroBodyLink {
  fromOpId: string;
  fromPortId: PortIdentity;
  toOpId: string;
  toPortId: PortIdentity;
}

interface MacroBody {
  operators: unknown[];
  links: MacroBodyLink[];
  inputs: MacroPortSpec[];
  outputs: MacroPortSpec[];
}

@Injectable({
  providedIn: "root",
})
export class MacroService {
  constructor(
    private http: HttpClient,
    private workflowResultService: WorkflowResultService,
    private workflowUtilService: WorkflowUtilService
  ) {}

  /**
   * Convenience: take a selection of operators, build a macro definition
   * from them via `buildMacroFromSelection`, POST to `createMacro`, and on
   * success replace the selection on the canvas with a single Macro op.
   * Returns the `MacroDetail` so callers can chain on it (e.g. surface a
   * toast / update local state).
   *
   * Mirrors `ContextMenuComponent.onCreateMacro` + `swapSelectionWithMacroNode`
   * so that callers without right-click access (e.g. the
   * suggestMacros panel's "materialize" action) can do the same thing.
   *
   * @param workflowActionService graph service the selection lives in
   * @param selectedOperatorIDs   ids of the operators to encapsulate
   * @param name                  name of the macro definition to create
   * @returns an observable of the created definition; the canvas swap runs
   *          as a side effect when the observable emits
   */
  public createMacroFromSelection(
    workflowActionService: WorkflowActionService,
    selectedOperatorIDs: readonly string[],
    name: string
  ): Observable<MacroDetail> {
    const built = this.buildMacroFromSelection(workflowActionService, selectedOperatorIDs, name);
    return this.createMacro(built.request).pipe(
      // Swap only once the backend has confirmed the definition exists.
      tap(detail =>
        this.swapSelectionWithMacroNode(workflowActionService, detail, selectedOperatorIDs, built)
      )
    );
  }

  /**
   * Reuse an existing macro definition for another sub-DAG on the canvas.
   * Used by the pattern-batch materialize flow: after the FIRST occurrence
   * has been encapsulated (creating the macro), each subsequent occurrence
   * is swapped with a FRESH instance pointing at the same `detail.wid`.
*
   * Validates that the candidate selection has the same boundary port count
   * as the macro definition. The pattern-detector ensures the operator-type
   * signature matches, which (for linear patterns) implies the same boundary
   * structure — but we still defensively check the counts before swapping.
   * Returns true on success, false if shapes don't match.
   */
  public swapSelectionWithExistingMacro(
    workflowActionService: WorkflowActionService,
    detail: MacroDetail,
    selectedOpIDs: readonly string[]
  ): boolean {
    // The selection is run through the macro builder purely to learn its
    // boundary (incoming/outgoing edges and port counts). The request it
    // produces is never sent anywhere, hence the placeholder name.
    const boundary = this.buildMacroFromSelection(workflowActionService, selectedOpIDs, "_throwaway_");

    const sameInputCount = boundary.inputPortCount === detail.portSpec.inputs.length;
    const sameOutputCount = boundary.outputPortCount === detail.portSpec.outputs.length;
    if (sameInputCount && sameOutputCount) {
      this.swapSelectionWithMacroNode(workflowActionService, detail, selectedOpIDs, boundary);
      return true;
    }
    return false;
  }

  /**
   * Replace the selected operators on the canvas with a single Macro op
   * pointing at the just-created definition. Extracted from
   * `ContextMenuComponent.swapSelectionWithMacroNode` so it can be
   * called from the suggestMacros materialize action too.
*/
  private swapSelectionWithMacroNode(
    workflowActionService: WorkflowActionService,
    detail: MacroDetail,
    selectedOpIDs: readonly string[],
    built: {
      incomingEdges: { externalOpId: string; externalPortID: string; macroPortIndex: number }[];
      outgoingEdges: { externalOpId: string; externalPortID: string; macroPortIndex: number }[];
      inputPortCount: number;
      outputPortCount: number;
    }
  ): void {
    // Port descriptors of the new Macro op: one per boundary port.
    const toInputPort = (_: unknown, i: number) => ({
      portID: `input-${i}`,
      displayName: `in-${i}`,
      disallowMultiInputs: false,
      isDynamicPort: false,
      dependencies: [],
    });
    const toOutputPort = (_: unknown, i: number) => ({
      portID: `output-${i}`,
      displayName: `out-${i}`,
      disallowMultiInputs: false,
      isDynamicPort: false,
    });
    const inputPorts = Array.from({ length: built.inputPortCount }, toInputPort);
    const outputPorts = Array.from({ length: built.outputPortCount }, toOutputPort);

    // Newly-created instance is in-sync with the definition we just
    // POSTed; stamp the modify time so the staleness check in the
    // context-menu sees this as fresh until the definition is edited.
    // The value is accepted either as an epoch number or as a date string.
    const rawModified: unknown = detail.lastModifiedTime;
    const macroSyncedAt =
      typeof rawModified === "number" ? rawModified : new Date(rawModified as string).getTime();

    const macroOpId = `Macro-operator-${this.workflowUtilService.getOperatorRandomUUID()}`;
    const macroPredicate: OperatorPredicate = {
      operatorID: macroOpId,
      operatorType: "Macro",
      operatorVersion: "",
      operatorProperties: {
        macroId: detail.wid.toString(),
        macroVersion: 1,
        linkMode: "LIVE",
        inputPortCount: built.inputPortCount,
        outputPortCount: built.outputPortCount,
        displayName: detail.name,
        macroSyncedAt,
      },
      inputPorts,
      outputPorts,
      showAdvanced: false,
      isDisabled: false,
      customDisplayName: detail.name,
      dynamicInputPorts: false,
      dynamicOutputPorts: false,
    };

    // Place the macro at the centroid of the operators it replaces. Operators
    // whose position cannot be read are left out of the average.
    const jointWrapper = workflowActionService.getJointGraphWrapper();
    const positions: Point[] = [];
    for (const id of selectedOpIDs) {
      let position: Point | undefined;
      try {
        position = jointWrapper.getElementPosition(id);
      } catch {
        position = undefined;
      }
      if (position) positions.push(position);
    }
    let centroid: Point = { x: 200, y: 200 };
    if (positions.length > 0) {
      let sumX = 0;
      let sumY = 0;
      for (const p of positions) {
        sumX += p.x;
        sumY += p.y;
      }
      centroid = { x: sumX / positions.length, y: sumY / positions.length };
    }

    // All graph mutations are wrapped in a single bundleActions call: add
    // the macro, drop the selection, then re-attach the boundary edges.
    workflowActionService.getTexeraGraph().bundleActions(() => {
      workflowActionService.addOperator(macroPredicate, centroid);
      workflowActionService.deleteOperatorsAndLinks(Array.from(selectedOpIDs));
      for (const edge of built.incomingEdges) {
        workflowActionService.addLink({
          linkID: this.workflowUtilService.getLinkRandomUUID(),
          source: { operatorID: edge.externalOpId, portID: edge.externalPortID },
          target: { operatorID: macroPredicate.operatorID, portID: `input-${edge.macroPortIndex}` },
        });
      }
      for (const edge of built.outgoingEdges) {
        workflowActionService.addLink({
          linkID: this.workflowUtilService.getLinkRandomUUID(),
          source: { operatorID: macroPredicate.operatorID, portID: `output-${edge.macroPortIndex}` },
          target: { operatorID: edge.externalOpId, portID: edge.externalPortID },
        });
      }
    });
  }

  // Runtime macro-provenance map. Fetched once per (workflowId, execution)
  // from `/api/workflow/{wid}/macro-mapping`. Indexed by runtime op id.
  // Empty until the user clicks Run AND the compile finishes server-side.
private runtimeMacroMapping = new Map<string, MacroProvenanceEntry>();
  // Workflow id the cache above was last successfully loaded for.
  private runtimeMacroMappingLoadedFor: number | undefined = undefined;
  // Inverse index: macroChain[0] (the canvas-level macro instance id) → list
  // of runtime op ids belonging to that instance. Rebuilt whenever
  // runtimeMacroMapping is refreshed. Lets the stats consumer look up
  // "all runtime ops under macro X" in O(1).
  private runtimeOpsByMacroInstance = new Map<string, string[]>();
  // Subscribers (e.g. result-panel drill-down alias, status aggregator) can
  // re-emit when the runtime macro-mapping is refreshed. Tick is opaque —
  // consumers just need to know "the mapping changed, re-read it now."
  private runtimeMacroMappingTick = new BehaviorSubject<number>(0);

  /** Stream that ticks whenever the runtime-mapping cache is refreshed. */
  public getRuntimeMacroMappingTick(): Observable<number> {
    return this.runtimeMacroMappingTick.asObservable();
  }

  /**
   * Fetch the macro-instance provenance map for the most-recent compile of
   * the given workflow. The backend populates this map during MacroExpander
   * (see `MacroMappingCache`) and exposes it via this REST endpoint.
   *
   * Cached per workflow id; call `refreshRuntimeMacroMapping(wid)` to force
   * a refresh after Run is clicked or after a workflow content change.
   *
   * @param wid workflow id whose mapping is requested
   * @returns the cached map when it was loaded for `wid` and is non-empty,
   *          otherwise the result of a fresh fetch
   */
  public getRuntimeMacroMapping(wid: number): Observable<Map<string, MacroProvenanceEntry>> {
    // An empty cache is deliberately not treated as a hit: it triggers a fetch.
    const cacheIsUsable = this.runtimeMacroMappingLoadedFor === wid && this.runtimeMacroMapping.size > 0;
    return cacheIsUsable ? of(this.runtimeMacroMapping) : this.refreshRuntimeMacroMapping(wid);
  }

  /**
   * Force a refresh of the runtime macro-mapping. Called by the execute path
   * immediately after the user clicks Run so the cache reflects the latest
   * compile output.
*/
  public refreshRuntimeMacroMapping(wid: number): Observable<Map<string, MacroProvenanceEntry>> {
    return this.http
      .get<Record<string, MacroProvenanceEntry>>(
        `${AppSettings.getApiEndpoint()}/workflow/${wid}/macro-mapping`
      )
      .pipe(
        map(raw => {
          // Rebuild both indexes in place so references handed out earlier
          // keep pointing at the live cache.
          this.runtimeMacroMapping.clear();
          this.runtimeOpsByMacroInstance.clear();
          for (const [runtimeOpId, entry] of Object.entries(raw ?? {})) {
            this.runtimeMacroMapping.set(runtimeOpId, entry);
            const outerInstance = entry.macroChain?.[0];
            if (!outerInstance) continue;
            const siblings = this.runtimeOpsByMacroInstance.get(outerInstance);
            if (siblings) {
              siblings.push(runtimeOpId);
            } else {
              this.runtimeOpsByMacroInstance.set(outerInstance, [runtimeOpId]);
            }
          }
          this.runtimeMacroMappingLoadedFor = wid;
          // Tick so downstream subscribers (drill-down alias, stats roll-up)
          // can re-read with the now-populated cache. Required because the
          // initial render typically happens BEFORE this fetch completes; we
          // need to nudge them once the data lands.
          this.runtimeMacroMappingTick.next(this.runtimeMacroMappingTick.value + 1);
          return this.runtimeMacroMapping;
        }),
        catchError(() => {
          // No mapping yet (e.g. user hasn't clicked Run, or workflow has no
          // macros). Return an EMPTY cache and don't poison future calls.
          // The cache must be cleared explicitly: it may still hold entries
          // from an earlier load (possibly of a different workflow), and the
          // synchronous lookups would otherwise keep answering from them.
          const hadEntries = this.runtimeMacroMapping.size > 0;
          this.runtimeMacroMapping.clear();
          this.runtimeOpsByMacroInstance.clear();
          this.runtimeMacroMappingLoadedFor = undefined;
          // Only notify subscribers when the visible state actually changed.
          if (hadEntries) {
            this.runtimeMacroMappingTick.next(this.runtimeMacroMappingTick.value + 1);
          }
          return of(this.runtimeMacroMapping);
        })
      );
  }

  /** Synchronous lookup: which macro instance owns this runtime op id? */
  public macroInstanceForRuntimeOp(runtimeOpId: string): string | undefined {
    // `macroChain` is guarded the same way as in the index builder above, so
    // an entry without a chain yields `undefined` instead of throwing.
    return this.runtimeMacroMapping.get(runtimeOpId)?.macroChain?.[0];
  }

  /**
   * Full macro chain (outermost → innermost) for a runtime op id, or
   * `undefined` if it isn't inside a macro. Used by the stats aggregator
   * to roll up to EVERY macro level the op belongs to — so a runtime op
   * deep inside a nested macro contributes to both the outer macro's
   * aggregate (visible on the parent canvas) AND each inner macro's
   * aggregate (visible inside the outer's drill-down view).
*/
  public macroChainForRuntimeOp(runtimeOpId: string): string[] | undefined {
    const entry = this.runtimeMacroMapping.get(runtimeOpId);
    return entry?.macroChain;
  }

  /** Synchronous lookup: the definition-time body op id recorded for this runtime op, if any. */
  public bodyOpIdForRuntimeOp(runtimeOpId: string): string | undefined {
    const entry = this.runtimeMacroMapping.get(runtimeOpId);
    return entry?.bodyOpId;
  }

  /** Runtime op ids indexed under the given canvas-level macro instance; empty when there are none. */
  public runtimeOpsForMacroInstance(macroInstanceId: string): string[] {
    const runtimeOps = this.runtimeOpsByMacroInstance.get(macroInstanceId);
    return runtimeOps ?? [];
  }

  /**
   * Synthesize macro-op port-level + aggregated stats from its boundary
   * bindings. The macro's external input port i shows the row count on the
   * specific inner port that `MacroInput(i)` feeds (recursively, through any
   * nested macros). Same for output. Aggregated totals are the SUM of the
   * macro's external port counts — NOT the sum of every inner op's
   * row count (which double-counts internal traffic).
   *
   * Returns null if bindings aren't loaded yet. Caller can fall back to a
   * state-only entry while waiting.
   *
   * Lives on MacroService (rather than workflow-editor) so both the canvas
   * statistics renderer AND the WorkflowStatusService aggregator can use
   * the same source of truth.
+ */ + public synthesizeMacroOpStats( + macroInstanceId: string, + macroId: string, + rawStatusByRuntimeOpId: Record; outputPortMetrics?: Record }> + ): { + inputPortMetrics: Record; + outputPortMetrics: Record; + aggregatedInputRowCount: number; + aggregatedOutputRowCount: number; + } | null { + const bindings = this.getBindingsForInstance(macroInstanceId, macroId); + if (!bindings) return null; + const inputPortMetrics: Record = {}; + const outputPortMetrics: Record = {}; + for (const b of bindings.inputBindings) { + const innerStats = rawStatusByRuntimeOpId[b.innerOpId]; + if (!innerStats) continue; + const cnt = innerStats.inputPortMetrics?.[String(b.innerPortIndex)] ?? 0; + const key = String(b.externalPortIndex); + inputPortMetrics[key] = (inputPortMetrics[key] ?? 0) + cnt; + } + for (const b of bindings.outputBindings) { + const innerStats = rawStatusByRuntimeOpId[b.innerOpId]; + if (!innerStats) continue; + const cnt = innerStats.outputPortMetrics?.[String(b.innerPortIndex)] ?? 0; + outputPortMetrics[String(b.externalPortIndex)] = cnt; + } + return { + inputPortMetrics, + outputPortMetrics, + aggregatedInputRowCount: Object.values(inputPortMetrics).reduce((a, b) => a + b, 0), + aggregatedOutputRowCount: Object.values(outputPortMetrics).reduce((a, b) => a + b, 0), + }; + } + + /** + * For a macro instance whose body op id is also its instance id (this is + * the case for nested macros visible inside a parent's drill-down view), + * return its `macroId` (the wid of the macro definition) by walking the + * outer macro definition's body. Returns undefined if not found in + * cache. + * + * Why: in drill-down view of outer macro O, a nested macro N appears as a + * canvas op with body-relative id (which is also its instance id). To + * compute N's external port stats we need its macroId so we can look up + * its body bindings. 
+ */ + public macroIdForBodyOpId(parentMacroId: string, bodyOpId: string): string | undefined { + return this.bodyBindingsSnapshot.get(parentMacroId)?.nestedMacros.get(bodyOpId); + } + + // Track (macroInstanceId → macroId) so other services (e.g. WorkflowStatus + // for aggregation) can look up the macro definition wid by instance id + // without grabbing a reference to WorkflowActionService. Populated by + // `registerMacroInstance(...)` whenever the workflow editor / palette adds + // a Macro op to the graph. + private macroDefByInstance = new Map(); + + /** Record that `macroInstanceId` (a canvas op id) instantiates macro `macroId`. */ + public registerMacroInstance(macroInstanceId: string, macroId: string): void { + if (macroId) this.macroDefByInstance.set(macroInstanceId, macroId); + } + + /** Lookup macro definition wid for a given instance id. */ + public macroDefIdForInstance(macroInstanceId: string): string | undefined { + const direct = this.macroDefByInstance.get(macroInstanceId); + if (direct) return direct; + // Fallback: scan body bindings — `macroInstanceId` might be a nested + // macro's body-relative id inside some parent body that's in the + // bindings cache. + for (const [parentMacroId, snapshot] of this.bodyBindingsSnapshot.entries()) { + const nested = snapshot.nestedMacros.get(macroInstanceId); + if (nested) return nested; + // (parentMacroId left unused; pattern is `(_, snapshot)` essentially) + void parentMacroId; + } + return undefined; + } + + /** + * Build a body-op-id → runtime-uuid lookup for the macro DEFINITION whose + * canvas instance is `macroInstanceId`. Used by the drill-down view: the + * canvas ops there carry body-relative IDs (from the macro definition), + * but engine stats are keyed by runtime UUIDs. This map lets the view + * translate `body-op-id → runtime UUID → status[runtime UUID]`. 
+ * + * For nested macros: we pick the runtime UUID whose macroChain INCLUDES + * this instance (anywhere in the chain) AND whose bodyOpId matches. That + * way drilling into the OUTERMOST macro of a nested chain shows the + * outer body's macro ops (themselves still macros in the drill-down + * view) — clicking those drills further; their inner ops get their own + * map via the same call with a different instance id. + */ + public buildBodyOpIdToRuntimeUuidMap(macroInstanceId: string): Map { + const map = new Map(); + for (const [runtimeUuid, prov] of this.runtimeMacroMapping.entries()) { + if (!prov.macroChain.includes(macroInstanceId)) continue; + // Only record if this entry's INNERMOST chain element matches the + // requested instance — otherwise a runtime UUID for a deeper-nested + // op would shadow a same-bodyOpId body-level op at this level. + if (prov.macroChain[prov.macroChain.length - 1] !== macroInstanceId) continue; + map.set(prov.bodyOpId, runtimeUuid); + } + return map; + } + + /** + * Resolve macro port bindings for a specific macro instance using the + * runtime mapping. For each MacroInput/Output marker, walks the macro + * body (recursing through any nested macros) until it hits a terminal + * non-macro inner op, then looks up that op's runtime UUID via the + * macro-mapping side-table. + * + * The recursion is essential: a top-level macro's input port may be + * connected to a nested macro's input port, whose body connects it to + * yet another op, etc. We need the FINAL terminal runtime op so its + * port-level stats can drive the outer macro's external port display. + */ + public resolveBindingsViaRuntimeMapping( + macroInstanceId: string, + macroId: string + ): MacroBindings | undefined { + const snapshot = this.bodyBindingsSnapshot.get(macroId); + if (!snapshot) { + this.getBodyBindings(macroId).subscribe({ error: () => undefined }); + return undefined; + } + // Resolve one body-level binding to one or more terminal runtime bindings. 
+ // `accumulatedChain` accumulates the macro-instance chain we've descended + // through, used to disambiguate which runtime op matches when a body op + // id is reused across macro definitions. + const resolveOne = ( + b: MacroPortBinding, + definition: { + inputBindings: MacroPortBinding[]; + outputBindings: MacroPortBinding[]; + nestedMacros: Map; + innerSinks: string[]; + }, + accumulatedChain: string[], + isInput: boolean + ): MacroPortBinding[] => { + const nestedMacroId = definition.nestedMacros.get(b.innerOpId); + if (!nestedMacroId) { + // Terminal: find the runtime op whose chain ENDS WITH accumulatedChain + // (the chain of macro instances we descended through from the call + // site) and whose bodyOpId matches this binding's innerOpId. + // + // Suffix match (not exact length) so that a synthesize() call rooted + // at an INNER macro instance (e.g. d3188a84 when computing the nested + // macro op's stats in drill-down view) still finds its runtime ops — + // those carry full chains like [outerInstance, innerInstance], so the + // accumulatedChain [innerInstance] is a suffix. + const candidates: string[] = []; + const matchesSuffix = (chain: string[]): boolean => { + if (chain.length < accumulatedChain.length) return false; + const offset = chain.length - accumulatedChain.length; + for (let i = 0; i < accumulatedChain.length; i++) { + if (chain[offset + i] !== accumulatedChain[i]) return false; + } + return true; + }; + for (const [runtimeOpId, prov] of this.runtimeMacroMapping.entries()) { + if (prov.bodyOpId !== b.innerOpId) continue; + if (matchesSuffix(prov.macroChain)) candidates.push(runtimeOpId); + } + return candidates.map(runtimeOpId => ({ + externalPortIndex: b.externalPortIndex, + innerOpId: runtimeOpId, + innerPortIndex: b.innerPortIndex, + })); + } + // Nested macro: drill into its body and continue down to the next + // boundary in/out the binding's innerPortIndex maps to. 
+ const nestedSnapshot = this.bodyBindingsSnapshot.get(nestedMacroId); + if (!nestedSnapshot) { + // Snapshot not loaded yet — kick off and bail (caller will re-resolve + // on the next stats tick). + this.getBodyBindings(nestedMacroId).subscribe({ error: () => undefined }); + return []; + } + // The nested macro op's BODY definition id (b.innerOpId) is also its + // canvas-level instance id in the outer body. That's the macroChain + // element we add as we descend. + const nextChain = [...accumulatedChain, b.innerOpId]; + const nestedSideBindings = isInput + ? nestedSnapshot.inputBindings + : nestedSnapshot.outputBindings; + const matched = nestedSideBindings.filter(nb => nb.externalPortIndex === b.innerPortIndex); + const resolved: MacroPortBinding[] = []; + for (const nb of matched) { + const carriedOver: MacroPortBinding = { + externalPortIndex: b.externalPortIndex, // preserve outer's external port index + innerOpId: nb.innerOpId, + innerPortIndex: nb.innerPortIndex, + }; + resolved.push(...resolveOne(carriedOver, nestedSnapshot, nextChain, isInput)); + } + return resolved; + }; + + const startChain = [macroInstanceId]; + const inputBindings: MacroPortBinding[] = []; + for (const b of snapshot.inputBindings) { + inputBindings.push(...resolveOne(b, snapshot, startChain, /* isInput */ true)); + } + const outputBindings: MacroPortBinding[] = []; + for (const b of snapshot.outputBindings) { + outputBindings.push(...resolveOne(b, snapshot, startChain, /* isInput */ false)); + } + return { inputBindings, outputBindings }; + } + + // Cached per-definition body bindings, keyed by `${macroId}` (the macro + // definition's wid). Each entry is a hot Observable so multiple subscribers + // share the same HTTP fetch. The body of a macro definition is immutable + // for the lifetime of a given (macroId, vid) tuple, so caching by macroId + // alone is safe — definition edits go through a new wid in the v1 LIVE mode. 
+ // The cached shape also carries `nestedMacros: Map` + // so recursive resolution (for nested macros) can follow the chain without + // re-parsing the body. + private bodyBindingsCache = new Map< + string, + Observable<{ + inputBindings: MacroPortBinding[]; + outputBindings: MacroPortBinding[]; + nestedMacros: Map; + innerSinks: string[]; + }> + >(); + // Latest-known synchronous snapshot — populated by `getBindingsForInstance` + // after the first successful fetch so synchronous stat-update handlers can + // look up bindings without re-triggering the network call. + private bodyBindingsSnapshot = new Map< + string, + { + inputBindings: MacroPortBinding[]; + outputBindings: MacroPortBinding[]; + nestedMacros: Map; + innerSinks: string[]; + } + >(); + + public createMacro(req: MacroCreateRequest): Observable { + return this.http.post(`${AppSettings.getApiEndpoint()}/${MACRO_CREATE_URL}`, req); + } + + /** + * Trigger a browser download of a portable JSON dump of one macro. The file + * is everything `createMacro` accepts as input — name, content, portSpec, + * paramSpec — so it can be re-imported on a different Texera instance via + * `importMacroFromJson`. We deliberately exclude wid and timestamps because + * the importer always creates a fresh definition with a new wid. + * + * Transitive: if the macro's body references nested macros, those are + * fetched too and embedded as `nestedMacros[oldWid] = detailPayload`. The + * importer reconstructs them in dependency order before the root macro, + * stitching the new wids into the root's body so the import is fully self- + * contained. (Currently the importer creates the root only; transitive + * import is a v2 enhancement, but the export side records everything so + * a manual rebuild is possible.) + * + * The exported `content` is the raw MacroBody JSON string; consumer just + * needs to round-trip it through `JSON.parse(JSON.stringify(...))` to stay + * Jackson-friendly on re-import. 
+ */ + public exportMacroToFile(wid: number): Observable { + return new Observable(subscriber => { + this.exportBundleForMacro(wid).subscribe({ + next: bundle => { + const blob = new Blob([JSON.stringify(bundle, null, 2)], { + type: "application/json", + }); + const url = URL.createObjectURL(blob); + const a = document.createElement("a"); + a.href = url; + const safeName = bundle.name.replace(/[^a-zA-Z0-9_-]+/g, "_").slice(0, 60); + a.download = `macro-${safeName}-${wid}.json`; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(url); + subscriber.next(); + subscriber.complete(); + }, + error: err => subscriber.error(err), + }); + }); + } + + /** + * Build the transitive export bundle for a macro: the root payload plus + * full definitions of every nested macro it references (and their nested + * macros, recursively). The result is self-contained — importable on a + * fresh Texera instance with no other prep — and structured to be applied + * dependency-first so each parent's body can be rewritten to reference + * the new wids of its children. + * + * The bundle has a `bundleVersion: 2` marker distinguishing it from the + * v1 single-macro export (`schemaVersion: 1`). Both shapes round-trip + * through `importMacroFromJson`. + */ + public exportBundleForMacro(rootWid: number): Observable<{ + bundleVersion: 2; + name: string; + description: string; + rootContent: string; + portSpec: PortSpec; + paramSpec: unknown; + category?: string; + icon?: string; + exportedAt: string; + exportedFromTexera: string; + nestedMacros: Array<{ + originalWid: number; + name: string; + description: string; + content: string; + portSpec: PortSpec; + paramSpec: unknown; + }>; + }> { + // Walk the dependency graph depth-first, collecting every reachable + // macro id starting from the root. Cycles can't happen for macros + // (MacroExpander guards against them) but we still guard with `seen`. 
+ return new Observable(subscriber => { + const seen = new Set(); + const order: number[] = []; + const details = new Map(); + const visit = (w: number): Promise => + new Promise((resolve, reject) => { + if (seen.has(w)) return resolve(); + seen.add(w); + this.getMacro(w).subscribe({ + next: async d => { + details.set(w, d); + const nestedWids = this.collectNestedMacroIds(d.content); + for (const nw of nestedWids) { + try { + await visit(nw); + } catch (e) { + return reject(e); + } + } + order.push(w); + resolve(); + }, + error: reject, + }); + }); + visit(rootWid).then( + () => { + const root = details.get(rootWid); + if (!root) { + subscriber.error(new Error("Root macro fetch failed")); + return; + } + // Nested macros are everything in `order` except the root, in + // dependency-first order (children before their parents). + const nestedMacros = order + .filter(w => w !== rootWid) + .map(w => { + const d = details.get(w)!; + return { + originalWid: w, + name: d.name, + description: d.description, + content: d.content, + portSpec: d.portSpec, + paramSpec: d.paramSpec, + }; + }); + subscriber.next({ + bundleVersion: 2 as const, + name: root.name, + description: root.description, + rootContent: root.content, + portSpec: root.portSpec, + paramSpec: root.paramSpec, + category: root.category, + icon: root.icon, + exportedAt: new Date().toISOString(), + exportedFromTexera: window.location.host, + nestedMacros, + }); + subscriber.complete(); + }, + err => subscriber.error(err) + ); + }); + } + + /** + * Scan a macro's content (JSON string) for nested macroId references. The + * scan is regex-based for speed and resilience — body shape may have + * additional fields we don't care about. Used by `exportMacroToFile` to + * record dependencies in the export payload. + */ + private collectNestedMacroIds(content: string): number[] { + const matches = content.match(/"macroId"\s*:\s*"(\d+)"/g) ?? 
[]; + const wids = new Set(); + for (const m of matches) { + const numMatch = m.match(/(\d+)/); + if (numMatch) wids.add(Number(numMatch[1])); + } + return Array.from(wids); + } + + /** + * Reverse of `exportMacroToFile`: parse an uploaded JSON file and POST it + * as a brand-new macro definition. The new definition's wid is fresh — + * any cross-references inside the original `content` to its own wid are + * left as-is (they'd be self-referential and unused). + * + * Bundle support (v2): if the JSON has `bundleVersion: 2`, all nested + * macros are created first (in dependency order), then the root content + * is rewritten to point at the new wids, then the root is created. The + * caller still receives the root's MacroDetail — the nested macros land + * in the user's library silently. Schema v1 (single-macro JSON) still + * works for back-compat. + */ + public importMacroFromJson(rawJson: string): Observable { + const parsed = JSON.parse(rawJson) as Record; + if (parsed["bundleVersion"] === 2) { + return this.importMacroBundle(parsed as never); + } + const v1 = parsed as { + schemaVersion?: number; + name?: string; + description?: string; + content?: string; + portSpec?: PortSpec; + paramSpec?: unknown; + category?: string; + icon?: string; + }; + if (!v1.name || !v1.content || !v1.portSpec) { + throw new Error("Invalid macro JSON: missing name / content / portSpec."); + } + const req: MacroCreateRequest = { + name: `${v1.name} (imported)`, + description: v1.description ?? "Imported macro", + content: v1.content, + portSpec: v1.portSpec, + paramSpec: v1.paramSpec, + category: v1.category, + icon: v1.icon, + }; + return this.createMacro(req); + } + + /** + * Apply a v2 export bundle: walk the nested macros in dependency order, + * create each one (collecting a `oldWid → newWid` map), rewrite the next + * pending body's macroId references to the new wids before creating it. + * Finally rewrite the root body the same way and create it. 
+ * + * Failures abort the bundle (best-effort; partial state may persist if a + * mid-bundle create fails — surfacing this cleanly is a v3 follow-up). + */ + private importMacroBundle(bundle: { + name: string; + description: string; + rootContent: string; + portSpec: PortSpec; + paramSpec: unknown; + category?: string; + icon?: string; + nestedMacros: Array<{ + originalWid: number; + name: string; + description: string; + content: string; + portSpec: PortSpec; + paramSpec: unknown; + }>; + }): Observable { + return new Observable(subscriber => { + const idRewrite = new Map(); + const rewriteContent = (content: string): string => + content.replace(/"macroId"\s*:\s*"(\d+)"/g, (match, oldWidStr) => { + const oldWid = Number(oldWidStr); + const newWid = idRewrite.get(oldWid); + if (newWid === undefined) return match; + return `"macroId":"${newWid}"`; + }); + const createOne = (i: number): Promise => + new Promise((resolve, reject) => { + if (i >= bundle.nestedMacros.length) return resolve(); + const nested = bundle.nestedMacros[i]; + const rewrittenContent = rewriteContent(nested.content); + this.createMacro({ + name: `${nested.name} (imported nested)`, + description: nested.description ?? "Imported macro (nested dep)", + content: rewrittenContent, + portSpec: nested.portSpec, + paramSpec: nested.paramSpec, + }).subscribe({ + next: created => { + idRewrite.set(nested.originalWid, created.wid); + createOne(i + 1).then(resolve, reject); + }, + error: reject, + }); + }); + createOne(0).then( + () => { + const rootContent = rewriteContent(bundle.rootContent); + this.createMacro({ + name: `${bundle.name} (imported)`, + description: bundle.description ?? 
"Imported macro bundle", + content: rootContent, + portSpec: bundle.portSpec, + paramSpec: bundle.paramSpec, + category: bundle.category, + icon: bundle.icon, + }).subscribe({ + next: rootDetail => { + subscriber.next(rootDetail); + subscriber.complete(); + }, + error: err => subscriber.error(err), + }); + }, + err => subscriber.error(err) + ); + }); + } + + public listMacros(): Observable { + return this.http + .get(`${AppSettings.getApiEndpoint()}/${MACRO_LIST_URL}`) + .pipe( + tap(summaries => { + // Mirror into the latest-modified cache so canvas-side consumers can + // detect stale instances without re-fetching. lastModifiedTime is a + // string in transport (LDT JSON) but a number once Jackson serializes + // a Timestamp; coerce both into ms-since-epoch for easy compare. + for (const m of summaries) { + const tsRaw = m.lastModifiedTime as unknown; + const tsMs = + typeof tsRaw === "number" ? tsRaw : new Date(tsRaw as string).getTime(); + this.latestModifiedByWid.set(m.wid, tsMs); + } + }) + ); + } + + /** + * Map of `macroId → most recently seen lastModifiedTime` (epoch ms), + * populated by every `listMacros` response. Used by the "refresh macro + * instance" context-menu action to decide whether a canvas instance is + * stale, and to imprint the freshness timestamp when re-syncing. + */ + private latestModifiedByWid = new Map(); + + /** + * Lookup helper for callers (e.g. the JointUI service when it renders a + * Macro op) — returns the most recent lastModifiedTime we've seen for the + * given macro definition, in ms since epoch. Returns 0 if we haven't seen + * the macro yet (i.e. listMacros hasn't been called or the macro is + * inaccessible to the current user). + */ + public getLatestModifiedTime(macroId: number | string): number { + const wid = typeof macroId === "number" ? macroId : Number(macroId); + if (!Number.isFinite(wid)) return 0; + return this.latestModifiedByWid.get(wid) ?? 
0; + } + + public getMacro(wid: number): Observable { + return this.http.get(`${AppSettings.getApiEndpoint()}/${MACRO_BASE_URL}/${wid}`); + } + + /** + * Compute body-level port bindings for the macro DEFINITION identified by + * `macroId` (the definition's wid). The bindings name body-relative inner + * op IDs — callers that need *runtime* IDs (after MacroExpander's prefix + * rewrite) should use `getBindingsForInstance` instead. + * + * Body bindings are derived from the persisted `MacroBody`: + * - each `MacroInput(portIndex=i)` is followed by one or more links + * `marker → innerOp@(p)`; we record (i → innerOp, p) for stats fan-out + * - each `MacroOutput(portIndex=i)` is preceded by exactly one link + * `innerOp@(p) → marker`; we record (i → innerOp, p) for stats/results + * + * Cached and shared across subscribers. + */ + public getBodyBindings(macroId: string): Observable<{ + inputBindings: MacroPortBinding[]; + outputBindings: MacroPortBinding[]; + nestedMacros: Map; + innerSinks: string[]; + }> { + const cached = this.bodyBindingsCache.get(macroId); + if (cached) return cached; + const widNum = Number(macroId); + if (!Number.isFinite(widNum)) { + const empty = { + inputBindings: [], + outputBindings: [], + nestedMacros: new Map(), + innerSinks: [], + }; + this.bodyBindingsSnapshot.set(macroId, empty); + return of(empty); + } + const fetched = this.getMacro(widNum).pipe( + map(detail => this.computeBodyBindings(detail)), + tap(bindings => { + this.bodyBindingsSnapshot.set(macroId, bindings); + // Eagerly recurse: fetch bindings for any nested macro definitions + // we discovered, so the synchronous resolution path in + // `getBindingsForInstance` finds everything in the snapshot cache. 
+ for (const nestedMacroId of bindings.nestedMacros.values()) { + this.getBodyBindings(nestedMacroId).subscribe({ error: () => undefined }); + } + }), + catchError(() => + of({ + inputBindings: [] as MacroPortBinding[], + outputBindings: [] as MacroPortBinding[], + nestedMacros: new Map(), + innerSinks: [] as string[], + }) + ), + shareReplay(1) + ); + this.bodyBindingsCache.set(macroId, fetched); + return fetched; + } + + /** + * Resolve bindings to runtime IDs for one macro instance on the parent + * canvas. `${instanceId}--` is the prefix MacroExpander adds to every + * inner-op ID when it inlines the body (see + * `workflow-compiling-service/.../MacroExpander.scala`). After this rewrite + * the engine reports stats keyed by the prefixed strings — so we apply the + * same rewrite here so callers can do straight-up `stats[innerOpId]` lookups. + * + * Recursive: when a binding's `innerOpId` points to a nested macro, follow + * its body bindings (recursively, prefixed at each layer) until we reach a + * terminal non-macro inner op. A fan-out at an input port can produce + * multiple terminal bindings for one external port — those get summed by + * the stats consumer. + * + * Returns the cached snapshot synchronously when available so stats-update + * handlers don't have to await; preload via `prefetchBindingsForOperators` + * to make sure the snapshot is populated by the time execution starts. + */ + public getBindingsForInstance(macroInstanceId: string, macroId: string): MacroBindings | undefined { + // Delegate to the runtime-mapping-based resolver. The old prefix-based + // approach broke when MacroExpander switched to fresh UUIDs for inner + // op IDs (see backend MacroExpander.spliceIntoParent). + return this.resolveBindingsViaRuntimeMapping(macroInstanceId, macroId); + } + + /** + * Walk a single body-relative binding down through any nested macros until + * we hit a terminal non-macro inner op. 
At each level we prefix the inner + * op ID with the accumulated instance prefix (so the final ID matches the + * engine's `${outerInstanceId}--${nestedInstanceId}--…--${terminalOp}` + * key). + * + * `externalPortIndex` is preserved through the chain — it identifies the + * MACRO'S external port we started from, not the nested macro's port. + * That's correct: every terminal binding still belongs to the same outer + * macro port. + */ + private resolveBinding( + accumulatedPrefix: string, + snapshot: { + inputBindings: MacroPortBinding[]; + outputBindings: MacroPortBinding[]; + nestedMacros: Map; + innerSinks: string[]; + }, + binding: MacroPortBinding, + isInput: boolean + ): MacroPortBinding[] { + const nestedMacroId = snapshot.nestedMacros.get(binding.innerOpId); + if (!nestedMacroId) { + // Terminal — return the binding with the full accumulated prefix. + return [ + { + externalPortIndex: binding.externalPortIndex, + innerOpId: `${accumulatedPrefix}--${binding.innerOpId}`, + innerPortIndex: binding.innerPortIndex, + }, + ]; + } + // Nested macro: load its bindings and follow the chain. The nested + // macro's runtime instance ID is `${accumulatedPrefix}--${nestedInstanceId}` + // (where nestedInstanceId is the body-relative ID we'd otherwise return). + const nestedSnapshot = this.bodyBindingsSnapshot.get(nestedMacroId); + if (!nestedSnapshot) { + // Not yet cached — kick off fetch and return what we have so far. The + // outer caller will see the partial resolution; once the nested macro's + // body loads, the next stats emission will re-resolve correctly. + this.getBodyBindings(nestedMacroId).subscribe({ error: () => undefined }); + return [ + { + externalPortIndex: binding.externalPortIndex, + innerOpId: `${accumulatedPrefix}--${binding.innerOpId}`, + innerPortIndex: binding.innerPortIndex, + }, + ]; + } + // Find nested bindings matching the macro's port the outer binding + // points to (binding.innerPortIndex is the nested macro's external port). 
+ const nestedBindings = isInput ? nestedSnapshot.inputBindings : nestedSnapshot.outputBindings; + const nextLayerPrefix = `${accumulatedPrefix}--${binding.innerOpId}`; + const matched = nestedBindings.filter(nb => nb.externalPortIndex === binding.innerPortIndex); + if (matched.length === 0) { + // Shouldn't happen for a well-formed body, but stay defensive. + return []; + } + const resolved: MacroPortBinding[] = []; + for (const nb of matched) { + const carriedOver: MacroPortBinding = { + externalPortIndex: binding.externalPortIndex, // preserve outer macro's external port + innerOpId: nb.innerOpId, // body-relative inside the nested macro + innerPortIndex: nb.innerPortIndex, + }; + resolved.push(...this.resolveBinding(nextLayerPrefix, nestedSnapshot, carriedOver, isInput)); + } + return resolved; + } + + /** + * Eagerly fetch bindings for every Macro op currently on the canvas, and + * register the macro-instance → inner-op alias used by + * `WorkflowResultService` so the result panel can show the macro's output + * (we route to output port 0's inner producer as the canonical "macro + * result"; a future multi-output UX could expose all outputs). + * Idempotent (cache-keyed), so spamming on every op-add stream emission + * does at most one HTTP per definition. + */ + public prefetchBindingsForOperators(operators: readonly OperatorPredicate[]): void { + for (const op of operators) { + if (op.operatorType !== "Macro") continue; + const macroId = op.operatorProperties?.["macroId"]; + if (typeof macroId !== "string" || macroId.length === 0) continue; + const instanceId = op.operatorID; + // Remember (instanceId → macroId) so cross-service lookups (e.g. + // WorkflowStatusService.withMacroAggregates) can synthesize macro + // stats without holding a reference to WorkflowActionService. 
+ this.registerMacroInstance(instanceId, macroId); + this.getBodyBindings(macroId).subscribe({ + next: snapshot => { + // After the first-level bindings load, ask for the recursive + // resolved bindings — `getBindingsForInstance` chains through any + // nested macros automatically. Output port 0 might resolve to a + // single terminal inner op, OR (in the rare fan-out case) several; + // for the v1 macro-result alias we still pick the first terminal. + const resolved = this.getBindingsForInstance(instanceId, macroId); + const out0 = resolved?.outputBindings.find(b => b.externalPortIndex === 0); + if (out0) { + this.workflowResultService.setMacroResultAlias(instanceId, out0.innerOpId); + return; + } + // Mega-macro fallback: macro has 0 external outputs but its body may + // contain sinks (e.g. CSVFileSink, SimpleSink for "View Results"). + // Engine auto-stores every terminal op's output (see + // WorkflowCompiler.expandLogicalPlan), so the sink's result IS + // materialized — clicking the macro op directly should reveal it. + // We pick the first body sink and resolve it to its runtime UUID via + // the macro-mapping cache. If the cache isn't populated yet (no Run + // has happened), this is a no-op; the tick-driven re-prefetch in the + // editor will re-run after the mapping fetch lands. 
+ if (snapshot.innerSinks.length === 0) return; + const primarySinkBodyId = snapshot.innerSinks[0]; + for (const [runtimeUuid, prov] of this.runtimeMacroMapping.entries()) { + if (prov.bodyOpId !== primarySinkBodyId) continue; + if (prov.macroChain[prov.macroChain.length - 1] !== instanceId) continue; + this.workflowResultService.setMacroResultAlias(instanceId, runtimeUuid); + return; + } + }, + error: () => undefined, + }); + } + } + + private computeBodyBindings(detail: MacroDetail): { + inputBindings: MacroPortBinding[]; + outputBindings: MacroPortBinding[]; + nestedMacros: Map; + innerSinks: string[]; + } { + let body: MacroBody; + try { + body = JSON.parse(detail.content) as MacroBody; + } catch { + return { inputBindings: [], outputBindings: [], nestedMacros: new Map(), innerSinks: [] }; + } + const inputMarkerByPortIndex = new Map(); + const outputMarkerByPortIndex = new Map(); + // Collect nested macro definitions: any Macro op inside the body whose + // macroId we'll need to recursively resolve through. Keyed by the body- + // relative operatorID since that's how the markers' links reference it. + const nestedMacros = new Map(); + // Inner sinks (body-relative IDs). Used as fallback result-alias targets + // when the macro has 0 output ports: a "mega-macro" whose body contains + // sinks but exposes nothing externally still wants its sink output to be + // viewable in the result panel by clicking the macro op directly, + // instead of forcing the user to drill in. 
+ const innerSinks: string[] = []; + for (const raw of body.operators) { + const op = raw as { + operatorID?: string; + operatorType?: string; + portIndex?: number; + macroId?: string; + }; + if (typeof op.operatorID !== "string") continue; + if (op.operatorType === "MacroInput" && typeof op.portIndex === "number") { + inputMarkerByPortIndex.set(op.portIndex, op.operatorID); + } else if (op.operatorType === "MacroOutput" && typeof op.portIndex === "number") { + outputMarkerByPortIndex.set(op.portIndex, op.operatorID); + } else if (op.operatorType === "Macro" && typeof op.macroId === "string" && op.macroId.length > 0) { + nestedMacros.set(op.operatorID, op.macroId); + } else if ( + typeof op.operatorType === "string" && + op.operatorType.toLowerCase().includes("sink") + ) { + innerSinks.push(op.operatorID); + } + } + const markerIds = new Set([ + ...Array.from(inputMarkerByPortIndex.values()), + ...Array.from(outputMarkerByPortIndex.values()), + ]); + // For each MacroInput, find body links marker -> innerOp@(p) — there can + // be multiple if the macro's external input fans out to several inner + // consumers (the rare "split feed" case in spliceIntoParent). + const inputBindings: MacroPortBinding[] = []; + for (const [portIndex, markerId] of inputMarkerByPortIndex.entries()) { + for (const link of body.links) { + if (link.fromOpId !== markerId) continue; + if (markerIds.has(link.toOpId)) continue; // marker → marker is malformed; skip + inputBindings.push({ + externalPortIndex: portIndex, + innerOpId: link.toOpId, + innerPortIndex: link.toPortId.id, + }); + } + } + // For each MacroOutput, find body links innerOp@(p) -> marker — exactly + // one producer per output marker (MacroExpander already enforces this). 
+ const outputBindings: MacroPortBinding[] = []; + for (const [portIndex, markerId] of outputMarkerByPortIndex.entries()) { + for (const link of body.links) { + if (link.toOpId !== markerId) continue; + if (markerIds.has(link.fromOpId)) continue; + outputBindings.push({ + externalPortIndex: portIndex, + innerOpId: link.fromOpId, + innerPortIndex: link.fromPortId.id, + }); + } + } + return { inputBindings, outputBindings, nestedMacros, innerSinks }; + } + + /** + * Build a `MacroCreateRequest` from the operators the user has multi-selected + * on the parent canvas, plus the boundary info the caller needs to swap the + * selection out for a single MacroOp node on the canvas. + * + * Boundary handling: for every link crossing the selection edge we add a + * `MacroInput` / `MacroOutput` marker inside the body (one per unique inner + * port) and rewire it so MacroExpander can splice the body back into a + * parent at compile time. Internal links (both endpoints inside the + * selection) are passed through with port-ordinal IDs to match the + * backend's PortIdentity shape. + * + * The returned `incomingEdges` / `outgoingEdges` describe each external link + * that needs to be re-pointed at the new MacroOp instance (one entry per + * link, where multiple external feeders can share the same `macroPortIndex`). 
+ */ + public buildMacroFromSelection( + workflowActionService: WorkflowActionService, + selectedOperatorIDs: readonly string[], + name: string + ): { + request: MacroCreateRequest; + incomingEdges: { externalOpId: string; externalPortID: string; macroPortIndex: number }[]; + outgoingEdges: { externalOpId: string; externalPortID: string; macroPortIndex: number }[]; + inputPortCount: number; + outputPortCount: number; + } { + const graph = workflowActionService.getTexeraGraph(); + const selectedSet = new Set(selectedOperatorIDs); + + const innerOps = selectedOperatorIDs.map(opId => { + const op = graph.getOperator(opId); + // LogicalOp on the backend is reconstructed by Jackson from the same + // shape the compiler uses — flat properties merged with the structural + // bits (operatorID/Type/Version/ports). + return { + ...op.operatorProperties, + operatorID: op.operatorID, + operatorType: op.operatorType, + operatorVersion: op.operatorVersion, + inputPorts: op.inputPorts, + outputPorts: op.outputPorts, + }; + }); + + const inputPortOrdinal = (operatorID: string, portID: string): number => + graph.getOperator(operatorID).inputPorts.findIndex(p => p.portID === portID); + const outputPortOrdinal = (operatorID: string, portID: string): number => + graph.getOperator(operatorID).outputPorts.findIndex(p => p.portID === portID); + + const internal: { srcOp: string; srcPort: string; dstOp: string; dstPort: string }[] = []; + const incoming: { srcOp: string; srcPort: string; dstOp: string; dstPort: string }[] = []; + const outgoing: { srcOp: string; srcPort: string; dstOp: string; dstPort: string }[] = []; + + graph.getAllLinks().forEach(link => { + const entry = { + srcOp: link.source.operatorID, + srcPort: link.source.portID, + dstOp: link.target.operatorID, + dstPort: link.target.portID, + }; + const srcIn = selectedSet.has(entry.srcOp); + const dstIn = selectedSet.has(entry.dstOp); + if (srcIn && dstIn) internal.push(entry); + else if (!srcIn && dstIn) 
incoming.push(entry); + else if (srcIn && !dstIn) outgoing.push(entry); + }); + + // Preserve the sub-DAG's full external interface, not just the ports that + // happen to be wired up at macro-creation time. Replacing a sub-DAG with a + // macro op is a dataflow-equivalence transformation: every input port on + // the selection that isn't fed by another selected op is a boundary input + // (regardless of whether an external feeder is currently connected), and + // symmetrically for output ports. That way a selection of + // Filter → Projection where Projection's output is currently unwired still + // surfaces that output as an external macro port the user can connect later. + const internallyFedInputPorts = new Set(internal.map(l => `${l.dstOp}|${l.dstPort}`)); + const internallyConsumedOutputPorts = new Set(internal.map(l => `${l.srcOp}|${l.srcPort}`)); + + type BoundaryPort = { innerOpId: string; innerPortID: string; innerPortIdx: number }; + const boundaryInputPorts: BoundaryPort[] = []; + const boundaryOutputPorts: BoundaryPort[] = []; + selectedOperatorIDs.forEach(opId => { + const op = graph.getOperator(opId); + op.inputPorts.forEach((port, idx) => { + if (!internallyFedInputPorts.has(`${opId}|${port.portID}`)) { + boundaryInputPorts.push({ innerOpId: opId, innerPortID: port.portID, innerPortIdx: idx }); + } + }); + op.outputPorts.forEach((port, idx) => { + if (!internallyConsumedOutputPorts.has(`${opId}|${port.portID}`)) { + boundaryOutputPorts.push({ innerOpId: opId, innerPortID: port.portID, innerPortIdx: idx }); + } + }); + }); + + // Allocate one MacroInput/MacroOutput marker per boundary port. Marker + // ordering follows the selection's visual order (selectedOperatorIDs × + // op.inputPorts), giving the user a stable mapping between macro ports + // and the underlying sub-DAG ports. 
+ const inputMarkers = boundaryInputPorts.map((p, idx) => ({ + markerOpId: `MacroInput-operator-${uuid()}`, + portIndex: idx, + innerOpId: p.innerOpId, + innerPortID: p.innerPortID, + innerPortIdx: p.innerPortIdx, + })); + const outputMarkers = boundaryOutputPorts.map((p, idx) => ({ + markerOpId: `MacroOutput-operator-${uuid()}`, + portIndex: idx, + innerOpId: p.innerOpId, + innerPortID: p.innerPortID, + innerPortIdx: p.innerPortIdx, + })); + + // Marker ports follow the backend's `PortDescription` shape (portID string, + // disallowMultiInputs/isDynamicPort flags) so MacroBody parses cleanly when + // DbMacroRegistry deserializes `workflow.content`. The actual port wiring + // is derived from `portIndex` server-side via `operatorInfo`; these entries + // exist purely to keep Jackson happy. + const markerOps: unknown[] = [ + ...inputMarkers.map(m => ({ + operatorID: m.markerOpId, + operatorType: "MacroInput", + operatorVersion: "", + portIndex: m.portIndex, + displayName: "", + inputPorts: [], + outputPorts: [ + { portID: "output-0", displayName: "", disallowMultiInputs: false, isDynamicPort: false }, + ], + })), + ...outputMarkers.map(m => ({ + operatorID: m.markerOpId, + operatorType: "MacroOutput", + operatorVersion: "", + portIndex: m.portIndex, + displayName: "", + inputPorts: [ + { + portID: "input-0", + displayName: "", + disallowMultiInputs: false, + isDynamicPort: false, + dependencies: [], + }, + ], + outputPorts: [], + })), + ]; + + const internalLinks: MacroBodyLink[] = internal.map(l => ({ + fromOpId: l.srcOp, + fromPortId: { id: outputPortOrdinal(l.srcOp, l.srcPort), internal: false }, + toOpId: l.dstOp, + toPortId: { id: inputPortOrdinal(l.dstOp, l.dstPort), internal: false }, + })); + + const inputMarkerLinks: MacroBodyLink[] = inputMarkers.map(m => ({ + fromOpId: m.markerOpId, + fromPortId: { id: 0, internal: false }, + toOpId: m.innerOpId, + toPortId: { id: m.innerPortIdx, internal: false }, + })); + + const outputMarkerLinks: MacroBodyLink[] = 
outputMarkers.map(m => ({ + fromOpId: m.innerOpId, + fromPortId: { id: m.innerPortIdx, internal: false }, + toOpId: m.markerOpId, + toPortId: { id: 0, internal: false }, + })); + + const portSpec: PortSpec = { + inputs: inputMarkers.map(m => ({ index: m.portIndex })), + outputs: outputMarkers.map(m => ({ index: m.portIndex })), + }; + + const body: MacroBody = { + operators: [...innerOps, ...markerOps], + links: [...internalLinks, ...inputMarkerLinks, ...outputMarkerLinks], + inputs: portSpec.inputs, + outputs: portSpec.outputs, + }; + + // Per-link rewire instructions. Several external links may share the same + // macroPortIndex when they all target the same inner port. + const inputIdxByInnerPort = new Map( + inputMarkers.map(m => [`${m.innerOpId}|${m.innerPortID}`, m.portIndex]) + ); + const outputIdxByInnerPort = new Map( + outputMarkers.map(m => [`${m.innerOpId}|${m.innerPortID}`, m.portIndex]) + ); + + const incomingEdges = incoming.map(l => ({ + externalOpId: l.srcOp, + externalPortID: l.srcPort, + macroPortIndex: inputIdxByInnerPort.get(`${l.dstOp}|${l.dstPort}`) as number, + })); + const outgoingEdges = outgoing.map(l => ({ + externalOpId: l.dstOp, + externalPortID: l.dstPort, + macroPortIndex: outputIdxByInnerPort.get(`${l.srcOp}|${l.srcPort}`) as number, + })); + + // Auto-generate a 1-line description so users don't get an empty + // description on the dashboard / palette tooltip. Format: + // "Filter → Projection block (2 ops, 1 in/1 out)". 
+ const innerOpTypes = selectedOperatorIDs.map(opId => graph.getOperator(opId).operatorType); + const description = this.autoDescriptionForBody( + innerOpTypes, + inputMarkers.length, + outputMarkers.length + ); + + return { + request: { + name, + description, + content: JSON.stringify(body), + portSpec, + }, + incomingEdges, + outgoingEdges, + inputPortCount: inputMarkers.length, + outputPortCount: outputMarkers.length, + }; + } + + /** + * Compose a one-line description for a freshly-created macro based on the + * operator-type composition of its body and its external port shape. The + * resulting string lands on the macro definition's `description` field and + * shows up in the palette tooltip + the dashboard macro browser. + */ + private autoDescriptionForBody( + innerOpTypes: readonly string[], + inputPortCount: number, + outputPortCount: number + ): string { + if (innerOpTypes.length === 0) return "Empty macro"; + const head = innerOpTypes.slice(0, 3).join(" → "); + const chain = innerOpTypes.length > 3 ? `${head} +${innerOpTypes.length - 3}` : head; + const portShape = `${inputPortCount} in / ${outputPortCount} out`; + return `${chain} (${innerOpTypes.length} ops, ${portShape})`; + } + + /** + * Adapt a backend `MacroDetail` (whose `content` is a serialized `MacroBody`) + * into a `Workflow`-shaped object the existing `reloadWorkflow` flow can + * consume. Used by the drill-down editor route. + * + * v1 caveats: + * - operator positions are auto-laid-out (MacroInput on the left, regular + * inner ops in the middle, MacroOutput on the right) because the body + * doesn't carry positions yet. + * - inner ops that came from the canvas already have `PortDescription` + * ports; marker ops were authored with backend `PortIdentity` shape and + * are normalized here. 
+ */ + public macroDetailToWorkflow(detail: MacroDetail): Workflow { + const body = JSON.parse(detail.content) as MacroBody; + + const operators = body.operators.map(raw => this.normalizeBodyOperator(raw)); + const operatorPositions = this.autoLayoutMacroBody( + operators, + body.links.map(l => ({ fromOpId: l.fromOpId, toOpId: l.toOpId })) + ); + const links = body.links + .map(ml => this.macroLinkToOperatorLink(ml, operators)) + .filter((l): l is OperatorLink => l !== null); + + const content: WorkflowContent = { + operators, + operatorPositions, + links, + commentBoxes: [], + settings: { dataTransferBatchSize: 400, executionMode: ExecutionMode.PIPELINED }, + }; + + return { + wid: detail.wid, + name: detail.name, + description: detail.description, + creationTime: new Date(detail.creationTime).getTime(), + lastModifiedTime: new Date(detail.lastModifiedTime).getTime(), + isPublished: detail.isPublic ? 1 : 0, + readonly: detail.readonly, + content, + }; + } + + private normalizeBodyOperator(raw: unknown): OperatorPredicate { + const r = raw as Record; + const { + operatorID, + operatorType, + operatorVersion, + inputPorts, + outputPorts, + ...rest + } = r as { + operatorID: string; + operatorType: string; + operatorVersion?: string; + inputPorts?: unknown[]; + outputPorts?: unknown[]; + } & Record; + + return { + operatorID, + operatorType, + operatorVersion: operatorVersion ?? "", + operatorProperties: rest, + inputPorts: this.normalizePortList(inputPorts ?? [], "input"), + outputPorts: this.normalizePortList(outputPorts ?? [], "output"), + showAdvanced: false, + isDisabled: false, + customDisplayName: typeof rest["displayName"] === "string" ? 
(rest["displayName"] as string) : undefined, + dynamicInputPorts: false, + dynamicOutputPorts: false, + }; + } + + private normalizePortList(ports: unknown[], dir: "input" | "output"): PortDescription[] { + return ports.map((raw, idx) => { + const p = raw as Record; + // Already PortDescription-shaped (came from the canvas serialization). + if (typeof p?.["portID"] === "string") { + return p as unknown as PortDescription; + } + // Backend PortIdentity shape ({id: {id, internal}, displayName, ...}) — + // synthesize a portID using the ordinal. + const displayName = typeof p?.["displayName"] === "string" ? (p["displayName"] as string) : ""; + const base: PortDescription = { + portID: `${dir}-${idx}`, + displayName, + disallowMultiInputs: false, + isDynamicPort: false, + }; + return dir === "input" ? { ...base, dependencies: [] } : base; + }); + } + + private macroLinkToOperatorLink( + ml: MacroBodyLink, + operators: OperatorPredicate[] + ): OperatorLink | null { + const fromOp = operators.find(o => o.operatorID === ml.fromOpId); + const toOp = operators.find(o => o.operatorID === ml.toOpId); + if (!fromOp || !toOp) return null; + const fromPortID = fromOp.outputPorts[ml.fromPortId.id]?.portID; + const toPortID = toOp.inputPorts[ml.toPortId.id]?.portID; + if (!fromPortID || !toPortID) return null; + return { + linkID: `macro-link-${uuid()}`, + source: { operatorID: ml.fromOpId, portID: fromPortID }, + target: { operatorID: ml.toOpId, portID: toPortID }, + }; + } + + /** + * Auto-layout the macro body using dagre's directed-graph algorithm — the + * same engine the main canvas's "Auto-layout" button uses. Edges come from + * the body's link list so connected ops sit at logical ranks; MacroInput + * markers act as source ranks (left edge) and MacroOutput markers as sink + * ranks (right edge). Settings mirror `JointGraphWrapper.autoLayoutJoint` + * for consistency between parent canvas and macro-body view. 
+ * + * Why dagre, not the manual 3-column layout it replaces: the previous + * placeholder put every middle op in a vertical stack, which made + * non-linear bodies (joins, fan-outs) look like spaghetti. With dagre, + * a Filter→Projection→Join body lays out naturally with the join's two + * inputs side-by-side. + * + * We use dagre directly (not `joint.layout.DirectedGraph.layout`) because + * at this point the body operators haven't been rendered into JointJS + * cells yet — we're computing the positions that will be passed into + * `WorkflowContent.operatorPositions` on the first drill-down load. + */ + private autoLayoutMacroBody( + operators: OperatorPredicate[], + links: { fromOpId: string; toOpId: string }[] + ): { [id: string]: Point } { + if (operators.length === 0) return {}; + // Use dagre's bundled graphlib constructor so the types line up cleanly + // with `dagre.layout(g)` below. `@types/graphlib` and `@types/dagre` are + // independent packages whose Graph definitions don't unify directly. + const g = new dagre.graphlib.Graph(); + g.setGraph({ + nodesep: 100, + edgesep: 150, + ranksep: 80, + ranker: "tight-tree", + rankdir: "LR", + }); + g.setDefaultEdgeLabel(() => ({})); + // Approximate node size — close enough to a typical Texera operator card. + // Dagre uses these for collision avoidance + edge routing only; the actual + // rendered op size is fixed by the joint shape, so we don't need pixel + // accuracy here. + const NODE_W = 160; + const NODE_H = 60; + operators.forEach(op => { + g.setNode(op.operatorID, { width: NODE_W, height: NODE_H }); + }); + links.forEach(l => { + // dagre tolerates edges to/from unknown nodes silently, but we filter + // anyway — body links can reference markers we haven't normalized into + // operators in pathological cases. 
+ if (g.hasNode(l.fromOpId) && g.hasNode(l.toOpId)) { + g.setEdge(l.fromOpId, l.toOpId); + } + }); + dagre.layout(g); + const positions: { [id: string]: Point } = {}; + g.nodes().forEach(id => { + const node: { x: number; y: number } = g.node(id); + // dagre returns the CENTER of the node; joint expects the TOP-LEFT. + // Subtract half the width/height. + positions[id] = { x: node.x - NODE_W / 2, y: node.y - NODE_H / 2 }; + }); + return positions; + } +} diff --git a/frontend/src/app/workspace/service/operator-metadata/operator-metadata.service.ts b/frontend/src/app/workspace/service/operator-metadata/operator-metadata.service.ts index 484a3c09818..be87fa9e185 100644 --- a/frontend/src/app/workspace/service/operator-metadata/operator-metadata.service.ts +++ b/frontend/src/app/workspace/service/operator-metadata/operator-metadata.service.ts @@ -22,7 +22,7 @@ import { Injectable } from "@angular/core"; import { Observable } from "rxjs"; import { AppSettings } from "../../../common/app-setting"; import { OperatorMetadata, OperatorSchema } from "../../types/operator-schema.interface"; -import { shareReplay } from "rxjs/operators"; +import { map, shareReplay } from "rxjs/operators"; export const OPERATOR_METADATA_ENDPOINT = "resources/operator-metadata"; @@ -54,7 +54,67 @@ export class OperatorMetadataService { private operatorMetadataObservable = this.httpClient .get(`${AppSettings.getApiEndpoint()}/${OPERATOR_METADATA_ENDPOINT}`) - .pipe(shareReplay(1)); + .pipe( + map(metadata => OperatorMetadataService.sanitizeMetadata(metadata)), + shareReplay(1) + ); + + /** + * The backend's reflective JSON-schema generator emits `{"nullable": true}` + * for `Option[...]` fields whose inner type it can't enumerate + * (e.g. `Option[MacroBody]` on `MacroOpDesc`). Ajv strict-mode rejects + * `nullable` without a sibling `type`, which throws everywhere the schema + * gets compiled — validation, property editor, dynamic schema, the YJS + * shared-model handler, etc. 
Strip those orphan `nullable` flags as the + * metadata comes off the wire so downstream code never sees them. + * + * The proper long-term fix is to teach the generator to emit a real type + * (see project memory `project_macroopdesc_schema_ajv_bug.md`); this + * sanitizer is defense-in-depth. + */ + private static sanitizeMetadata(metadata: OperatorMetadata): OperatorMetadata { + metadata.operators.forEach(op => OperatorMetadataService.sanitizeSchemaNode(op.jsonSchema)); + return metadata; + } + + private static sanitizeSchemaNode(node: unknown): void { + if (node === null || typeof node !== "object") return; + if (Array.isArray(node)) { + node.forEach(child => OperatorMetadataService.sanitizeSchemaNode(child)); + return; + } + const obj = node as Record; + if (obj["nullable"] === true && obj["type"] === undefined) { + if (obj["$ref"] !== undefined) { + // "nullable: true, $ref: X" — Ajv ignores $ref siblings under Draft-07 strict + // rules. Convert to anyOf so that null AND the referenced type are both valid. + // This preserves round-trip properties that serialize Option[T] as null. 
+ const ref = obj["$ref"]; + delete obj["nullable"]; + delete obj["$ref"]; + obj["anyOf"] = [{ type: "null" }, { $ref: ref }]; + } else { + delete obj["nullable"]; + } + } + for (const key of ["properties", "definitions", "patternProperties"]) { + const dict = obj[key]; + if (dict && typeof dict === "object" && !Array.isArray(dict)) { + for (const childKey of Object.keys(dict as Record)) { + OperatorMetadataService.sanitizeSchemaNode((dict as Record)[childKey]); + } + } + } + for (const key of ["items", "additionalProperties", "not"]) { + if (obj[key]) OperatorMetadataService.sanitizeSchemaNode(obj[key]); + } + for (const key of ["oneOf", "anyOf", "allOf"]) { + const arr = obj[key]; + if (Array.isArray(arr)) { + arr.forEach(child => OperatorMetadataService.sanitizeSchemaNode(child)); + } + } + } constructor(private httpClient: HttpClient) { this.getOperatorMetadata().subscribe(data => { diff --git a/frontend/src/app/workspace/service/validation/validation-workflow.service.ts b/frontend/src/app/workspace/service/validation/validation-workflow.service.ts index 92ebb88bb22..bcd28e2d457 100644 --- a/frontend/src/app/workspace/service/validation/validation-workflow.service.ts +++ b/frontend/src/app/workspace/service/validation/validation-workflow.service.ts @@ -274,6 +274,15 @@ export class ValidationWorkflowService { throw new Error(`operator with ID ${operatorID} doesn't exist`); } + // Macro operators embed a complex JSON schema that references all operator types + // (via MacroBody.operators: List[LogicalOp]). AJV cannot reliably compile or + // validate against it on the frontend. Macro properties are always set by + // internal code and are validated by the backend during compilation, so + // skip AJV here and let connection validation alone determine validity. 
+ if (operator.operatorType === "Macro") { + return { isValid: true }; + } + // try to fetch dynamic schema first const operatorSchema = this.dynamicSchemaService.getDynamicSchema(operatorID); if (operatorSchema === undefined) { diff --git a/frontend/src/app/workspace/service/workflow-graph/model/shared-model-change-handler.ts b/frontend/src/app/workspace/service/workflow-graph/model/shared-model-change-handler.ts index e5eab7a812e..2739cc67feb 100644 --- a/frontend/src/app/workspace/service/workflow-graph/model/shared-model-change-handler.ts +++ b/frontend/src/app/workspace/service/workflow-graph/model/shared-model-change-handler.ts @@ -216,16 +216,28 @@ export class SharedModelChangeHandler { * this link is already deleted from the shared model. */ private validateAndRepairNewLink(newLink: OperatorLink): boolean { + // Duplicates are routinely transient — e.g. when SPA-navigating into a + // workflow whose YJS room sync arrives shortly after a `reloadWorkflow` + // has already populated the same operators+links from the HTTP detail + // fetch. The existing link is the canonical one; just skip rendering the + // duplicate and leave the shared model alone. Pre-fix we *deleted* the + // duplicate from the shared model, which corrupted the canvas on + // drill-down navigation (every link disappeared along with its operators). try { this.texeraGraph.assertLinkNotDuplicated(newLink); - // Verify the link connects to operators and ports that exist. + } catch (error) { + console.log("skipping duplicate link: ", (error as Error).message); + return false; + } + // Validity check is a different failure mode (link references a + // non-existent op/port). Those entries are truly broken and the right + // thing is still to repair them out of the shared model. 
+ try { this.texeraGraph.assertLinkIsValid(newLink); return true; } catch (error) { - // Invalid link, repair the shared model this.texeraGraph.sharedModel.operatorLinkMap.delete(newLink.linkID); - // This is treated as a normal repair step and not an error. - console.log("failed to add link. cause: ", (error as Error).message); + console.log("failed to add link, repaired: ", (error as Error).message); return false; } } diff --git a/frontend/src/app/workspace/service/workflow-result/workflow-result.service.ts b/frontend/src/app/workspace/service/workflow-result/workflow-result.service.ts index 9fd18e0f161..2b9158ed9c2 100644 --- a/frontend/src/app/workspace/service/workflow-result/workflow-result.service.ts +++ b/frontend/src/app/workspace/service/workflow-result/workflow-result.service.ts @@ -45,6 +45,14 @@ export class WorkflowResultService { private paginatedResultServices = new Map(); private operatorResultServices = new Map(); + // Alias map for macro instance IDs: macro op IDs on the canvas don't get + // direct result entries from the engine (the engine sees the inlined inner + // ops only). When a macro has at least one output port, route lookups for + // the macro to the inner op feeding output port 0 so the result panel can + // show "the macro's result" without the user having to drill down. Set by + // `MacroService` once body bindings are fetched. + private macroResultAliases = new Map(); + // event stream of operator result update, undefined indicates the operator result is cleared private resultUpdateStream = new Subject>(); private resultTableStats = new ReplaySubject>>>(1); @@ -73,6 +81,47 @@ export class WorkflowResultService { return isDefined(this.getPaginatedResultService(operatorID)); } + /** + * Register/refresh the macro-instance → inner-op alias used to resolve + * `getResultService` / `getPaginatedResultService` lookups for macro ops. + * Idempotent — call whenever a macro's body bindings finish loading. 
+ * `innerOpId` must be a *runtime* (post-MacroExpander-prefix) ID so it + * matches what the engine sends in `WebResultUpdateEvent`. + */ + public setMacroResultAlias(macroInstanceId: string, innerOpId: string): void { + this.macroResultAliases.set(macroInstanceId, innerOpId); + } + + public clearMacroResultAlias(macroInstanceId: string): void { + this.macroResultAliases.delete(macroInstanceId); + } + + // When the canvas is rendering a macro body (drill-down view), the operators + // on the canvas have body-relative IDs (e.g. `Filter-operator-xyz` from the + // macro definition) but engine results arrive keyed by the post-expansion + // runtime UUID assigned by MacroExpander. This map (body-op-id → runtime- + // UUID) is populated by the workflow-editor when entering a drill-down view; + // empty means no drill-down rewrite is active. + // + // The old prefix-based scheme (`${instanceId}--${bodyOpId}`) no longer works + // because MacroExpander switched to fresh deterministic UUIDs (see + // backend/MacroExpander.spliceIntoParent for why long prefixed names had to + // go). The map is computed via MacroService's runtime-mapping cache. + private drilldownAliases: Map = new Map(); + + public setDrilldownAliases(aliases: Map): void { + this.drilldownAliases = aliases; + } + + private resolveAlias(operatorID: string): string { + // Drill-down rewrite wins: when viewing a macro body during execution we + // want the body-relative op ID lifted to its runtime UUID. Macro aliases + // only fire on the outer canvas, where body-relative IDs aren't present. + const drill = this.drilldownAliases.get(operatorID); + if (drill !== undefined) return drill; + return this.macroResultAliases.get(operatorID) ?? 
operatorID; + } + public getResultUpdateStream(): Observable> { return this.resultUpdateStream; } @@ -88,11 +137,11 @@ export class WorkflowResultService { } public getPaginatedResultService(operatorID: string): OperatorPaginationResultService | undefined { - return this.paginatedResultServices.get(operatorID); + return this.paginatedResultServices.get(this.resolveAlias(operatorID)); } public getResultService(operatorID: string): OperatorResultService | undefined { - return this.operatorResultServices.get(operatorID); + return this.operatorResultServices.get(this.resolveAlias(operatorID)); } private handleCleanResultCache(event: WorkflowAvailableResultEvent): void { diff --git a/frontend/src/app/workspace/service/workflow-status/workflow-status.service.ts b/frontend/src/app/workspace/service/workflow-status/workflow-status.service.ts index e939932aeba..da27ed2d292 100644 --- a/frontend/src/app/workspace/service/workflow-status/workflow-status.service.ts +++ b/frontend/src/app/workspace/service/workflow-status/workflow-status.service.ts @@ -18,26 +18,157 @@ */ import { Injectable } from "@angular/core"; -import { Observable, Subject } from "rxjs"; +import { Observable, ReplaySubject } from "rxjs"; import { OperatorState, OperatorStatistics } from "../../types/execute-workflow.interface"; import { WorkflowWebsocketService } from "../workflow-websocket/workflow-websocket.service"; +import { MacroService } from "../macro/macro.service"; + +// Macro inner-op IDs are fresh UUIDs (assigned by MacroExpander on the backend) +// — no longer derivable from the macro instance via prefix concat. The +// `MacroService.macroInstanceForRuntimeOp(runtimeOpId)` synchronous lookup +// consults the `/api/workflow/{wid}/macro-mapping` cache to find the +// instance any given runtime op belongs to. This function rolls inner-op +// stats up to the visible macro node so the canvas can show aggregated +// state / row counts during execution. 
// Priority used to fold inner-op states into a single macro state: the state
// with the highest number wins. States absent from this table rank lowest (0).
const STATE_PRIORITY: Partial<Record<OperatorState, number>> = {
  [OperatorState.Recovering]: 9,
  [OperatorState.Pausing]: 8,
  [OperatorState.Paused]: 7,
  [OperatorState.Resuming]: 6,
  [OperatorState.Running]: 5,
  [OperatorState.Initializing]: 4,
  [OperatorState.Ready]: 3,
  [OperatorState.Completed]: 2,
  [OperatorState.Uninitialized]: 1,
};

/**
 * Pick the highest-priority state from `states` (ties go to the later entry).
 * Returns `Uninitialized` for an empty list. A state missing from
 * `STATE_PRIORITY` is treated as priority 0 rather than poisoning every
 * comparison with `undefined`.
 */
function combineStates(states: OperatorState[]): OperatorState {
  if (states.length === 0) return OperatorState.Uninitialized;
  const priorityOf = (state: OperatorState): number => STATE_PRIORITY[state] ?? 0;
  return states.reduce((acc, s) => (priorityOf(s) >= priorityOf(acc) ? s : acc));
}

/**
 * Add one aggregated stats entry per macro instance to a raw per-operator
 * stats snapshot. All raw entries are kept; an existing entry keyed by a macro
 * instance ID is never overwritten. When no runtime op belongs to a macro the
 * input object is returned as-is.
 *
 * Aggregation rules:
 *  - state: highest-priority state across the inner ops (see STATE_PRIORITY);
 *  - numWorkers: sum across the inner ops;
 *  - row counts and port metrics: taken from
 *    `macroService.synthesizeMacroOpStats` when it yields a result. Otherwise
 *    row counts fall back to the sum over the inner ops (an over-count, but a
 *    usable activity indicator) and port metrics stay empty.
 *
 * Every runtime op contributes to each macro instance in its chain.
 */
function withMacroAggregates(
  raw: Record<string, OperatorStatistics>,
  macroService: MacroService
): Record<string, OperatorStatistics> {
  const byMacro = new Map<string, OperatorStatistics[]>();
  for (const [runtimeOpId, stats] of Object.entries(raw)) {
    const chain = macroService.macroChainForRuntimeOp(runtimeOpId);
    if (!chain || chain.length === 0) continue;
    for (const macroInstanceId of chain) {
      const list = byMacro.get(macroInstanceId) ?? [];
      list.push(stats);
      byMacro.set(macroInstanceId, list);
    }
  }
  if (byMacro.size === 0) return raw;

  const out: Record<string, OperatorStatistics> = { ...raw };
  for (const [macroInstanceId, innerStats] of byMacro.entries()) {
    if (out[macroInstanceId] !== undefined) continue;

    const operatorState = combineStates(innerStats.map(s => s.operatorState));
    const numWorkers = innerStats.reduce((sum, s) => sum + (s.numWorkers ?? 0), 0);

    // Fallback values, used only when no synthesized boundary stats are available.
    let aggregatedInputRowCount = innerStats.reduce((sum, s) => sum + (s.aggregatedInputRowCount ?? 0), 0);
    let aggregatedOutputRowCount = innerStats.reduce((sum, s) => sum + (s.aggregatedOutputRowCount ?? 0), 0);
    let inputPortMetrics: OperatorStatistics["inputPortMetrics"] = {};
    let outputPortMetrics: OperatorStatistics["outputPortMetrics"] = {};

    const macroIdForInstance = macroService.macroDefIdForInstance(macroInstanceId);
    if (macroIdForInstance) {
      const synth = macroService.synthesizeMacroOpStats(macroInstanceId, macroIdForInstance, raw);
      if (synth) {
        aggregatedInputRowCount = synth.aggregatedInputRowCount;
        aggregatedOutputRowCount = synth.aggregatedOutputRowCount;
        inputPortMetrics = synth.inputPortMetrics;
        outputPortMetrics = synth.outputPortMetrics;
      }
    }

    out[macroInstanceId] = {
      operatorState,
      aggregatedInputRowCount,
      inputPortMetrics,
      aggregatedOutputRowCount,
      outputPortMetrics,
      numWorkers,
    };
  }
  return out;
}
+ private lastRawStatus: Record | undefined; - constructor(private workflowWebsocketService: WorkflowWebsocketService) { + constructor( + private workflowWebsocketService: WorkflowWebsocketService, + private macroService: MacroService + ) { this.getStatusUpdateStream().subscribe(event => (this.currentStatus = event)); this.workflowWebsocketService.websocketEvent().subscribe(event => { if (event.type !== "OperatorStatisticsUpdateEvent") { return; } - this.statusSubject.next(event.operatorStatistics); + this.lastRawStatus = event.operatorStatistics; + this.statusSubject.next(withMacroAggregates(event.operatorStatistics, this.macroService)); + }); + + // Re-aggregate when the runtime macro mapping is (re-)fetched. Required + // for the hard-reload-to-parent-canvas flow: the websocket replays stats + // BEFORE refreshRuntimeMacroMapping(wid) lands, so the first aggregation + // pass has no macros to find. After the mapping fills in, we re-run + // aggregation against the cached raw status so canvas macro ops get their + // rolled-up entries. 
+ this.macroService.getRuntimeMacroMappingTick().subscribe(() => { + if (this.lastRawStatus) { + this.statusSubject.next(withMacroAggregates(this.lastRawStatus, this.macroService)); + } }); } diff --git a/frontend/src/assets/operator_images/Macro.png b/frontend/src/assets/operator_images/Macro.png new file mode 100644 index 00000000000..d9f5b30be18 Binary files /dev/null and b/frontend/src/assets/operator_images/Macro.png differ diff --git a/frontend/src/assets/operator_images/MacroInput.png b/frontend/src/assets/operator_images/MacroInput.png new file mode 100644 index 00000000000..468678671f7 Binary files /dev/null and b/frontend/src/assets/operator_images/MacroInput.png differ diff --git a/frontend/src/assets/operator_images/MacroOutput.png b/frontend/src/assets/operator_images/MacroOutput.png new file mode 100644 index 00000000000..8bc774069be Binary files /dev/null and b/frontend/src/assets/operator_images/MacroOutput.png differ diff --git a/hackathon-proposal.md b/hackathon-proposal.md new file mode 100644 index 00000000000..9264cdc8f11 --- /dev/null +++ b/hackathon-proposal.md @@ -0,0 +1,25 @@ +# AI-Augmented Macro Operators for Texera + +## Problem +Texera workflows grow into 20–50+ operator DAGs with no encapsulation. Users copy-paste the same subgraphs across projects, and pipelines run slower than they need to because of inter-operator serialization. + +## What we'll build +- **Macro operators** — collapse a selection of operators into one reusable, version-pinned node (KNIME wrapped-metanode style). Drill-down to edit; drag from a library to reuse. +- **Agent tool: `suggestMacros`** — the agent inspects the `LogicalPlan` and proposes ranked subgraphs to encapsulate, each with a one-line rationale ("looks like a reusable text-preprocessing block"). Highlights candidates on the canvas; one click materializes. 
+- **Agent tool: `fuseMacro`** — for a macro whose internals the user no longer needs to inspect, the agent synthesizes an equivalent `PythonUDFOpDescV2`, runs original and fused on a sample, diffs outputs, and only swaps in after verification passes. + +## Why it fits the Agent Hackathon +- Plugs straight into the existing `agent-service` (Vercel AI SDK + ReAct loop + tool framework). Two new tools, no new LLM plumbing. +- The agent doesn't just suggest — it **verifies** (sample-run diff for fusion) and rolls back on mismatch. Concrete, measurable correctness. +- Showcases capabilities a generic chatbot can't: structural reasoning over a DAG, codegen for a known runtime, and a built-in verification harness. + +## Demo (~3 min) +1. Open a 15-operator workflow → "Suggest Macros (AI)" → three highlighted candidates appear with rationales. +2. Accept one → subgraph collapses into a single macro node. +3. Run → note baseline time. +4. Right-click macro → "Fuse for performance" → agent generates UDF, verifies ("matched on 1000 sample rows"), swaps in. +5. Re-run → show **2–5× speedup** on the stateless chain. + +## Stretch +- Cross-workflow pattern mining: "you've built this subgraph 4 times — save as a macro?" +- Auto-publish recurring patterns to the workflow hub as community macros. 
diff --git a/sql/texera_ddl.sql b/sql/texera_ddl.sql index d6b488e582d..22a04a105e0 100644 --- a/sql/texera_ddl.sql +++ b/sql/texera_ddl.sql @@ -82,11 +82,13 @@ DROP TABLE IF EXISTS computing_unit_user_access CASCADE; DROP TYPE IF EXISTS user_role_enum CASCADE; DROP TYPE IF EXISTS privilege_enum CASCADE; DROP TYPE IF EXISTS action_enum CASCADE; +DROP TYPE IF EXISTS workflow_kind_enum CASCADE; CREATE TYPE user_role_enum AS ENUM ('INACTIVE', 'RESTRICTED', 'REGULAR', 'ADMIN'); CREATE TYPE action_enum AS ENUM ('like', 'unlike', 'view', 'clone'); CREATE TYPE privilege_enum AS ENUM ('NONE', 'READ', 'WRITE'); CREATE TYPE workflow_computing_unit_type_enum AS ENUM ('local', 'kubernetes'); +CREATE TYPE workflow_kind_enum AS ENUM ('WORKFLOW', 'MACRO'); -- ============================================ -- 5. Create tables @@ -121,6 +123,10 @@ CREATE TABLE IF NOT EXISTS user_config ); -- workflow +-- `kind` discriminates top-level workflows (WORKFLOW) from reusable macros +-- (MACRO). Macros are surfaced in the operator palette and a separate Macros +-- tab; their `content` follows the same LogicalPlan JSON shape with the +-- addition of MacroInputOp / MacroOutputOp boundary markers. 
CREATE TABLE IF NOT EXISTS workflow ( wid SERIAL PRIMARY KEY, @@ -129,9 +135,12 @@ CREATE TABLE IF NOT EXISTS workflow content TEXT NOT NULL, creation_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, last_modified_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - is_public BOOLEAN NOT NULL DEFAULT false + is_public BOOLEAN NOT NULL DEFAULT false, + kind workflow_kind_enum NOT NULL DEFAULT 'WORKFLOW' ); +CREATE INDEX IF NOT EXISTS idx_workflow_kind ON workflow(kind); + -- workflow_of_user CREATE TABLE IF NOT EXISTS workflow_of_user ( @@ -435,6 +444,21 @@ CREATE TABLE IF NOT EXISTS computing_unit_user_access FOREIGN KEY (uid) REFERENCES "user"(uid) ON DELETE CASCADE ); +-- macro_metadata table +-- Denormalized macro descriptor used by palette/listing endpoints so they do +-- not have to parse workflow.content (a JSON-serialized LogicalPlan) per row. +-- port_spec captures the macro's declared external inputs/outputs; param_spec +-- captures promoted parameters (empty in v1, populated in Phase 2). +CREATE TABLE IF NOT EXISTS macro_metadata +( + wid INT PRIMARY KEY, + port_spec JSONB NOT NULL, + param_spec JSONB NOT NULL DEFAULT '[]'::JSONB, + category VARCHAR(128), + icon VARCHAR(64), + FOREIGN KEY (wid) REFERENCES workflow(wid) ON DELETE CASCADE +); + -- START Fulltext search index creation (DO NOT EDIT THIS LINE) CREATE EXTENSION IF NOT EXISTS pgroonga; diff --git a/sql/updates/23.sql b/sql/updates/23.sql new file mode 100644 index 00000000000..c9cd3e7aaa5 --- /dev/null +++ b/sql/updates/23.sql @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +\c texera_db + +SET search_path TO texera_db; + +BEGIN; + +-- Discriminator for workflow rows: WORKFLOW = top-level workflows surfaced in +-- the Workflows tab; MACRO = reusable subgraphs surfaced in the operator +-- palette and a separate Macros tab. Reusing the workflow table inherits +-- versioning, ACL, and hub features for free. +DO $$ BEGIN + CREATE TYPE workflow_kind_enum AS ENUM ('WORKFLOW', 'MACRO'); +EXCEPTION + WHEN duplicate_object THEN NULL; +END $$; + +ALTER TABLE workflow + ADD COLUMN IF NOT EXISTS kind workflow_kind_enum NOT NULL DEFAULT 'WORKFLOW'; + +CREATE INDEX IF NOT EXISTS idx_workflow_kind ON workflow(kind); + +-- Denormalized macro descriptor used by palette/listing endpoints so they do +-- not have to parse workflow.content (a JSON-serialized LogicalPlan) per row. +-- port_spec captures the macro's declared external inputs/outputs; param_spec +-- captures promoted parameters (empty in v1, populated in Phase 2). 
+CREATE TABLE IF NOT EXISTS macro_metadata +( + wid INT PRIMARY KEY, + port_spec JSONB NOT NULL, + param_spec JSONB NOT NULL DEFAULT '[]'::JSONB, + category VARCHAR(128), + icon VARCHAR(64), + FOREIGN KEY (wid) REFERENCES workflow(wid) ON DELETE CASCADE +); + +COMMIT; diff --git a/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/WorkflowCompiler.scala b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/WorkflowCompiler.scala index 25166e7ac52..cdc30e51991 100644 --- a/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/WorkflowCompiler.scala +++ b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/WorkflowCompiler.scala @@ -25,6 +25,7 @@ import org.apache.texera.amber.compiler.WorkflowCompiler.{ collectOutputSchemaFromPhysicalPlan, convertErrorListToWorkflowFatalErrorMap } +import org.apache.texera.amber.compiler.macroOp.{MacroExpander, MacroRegistry} import org.apache.texera.amber.compiler.model.{LogicalPlan, LogicalPlanPojo} import org.apache.texera.amber.core.tuple.Schema import org.apache.texera.amber.core.virtualidentity.OperatorIdentity @@ -36,6 +37,7 @@ import org.apache.texera.amber.core.workflow.{ } import org.apache.texera.amber.core.workflowruntimestate.FatalErrorType.COMPILATION_ERROR import org.apache.texera.amber.core.workflowruntimestate.WorkflowFatalError +import org.apache.texera.amber.operator.macroOp.{MacroInputOp, MacroOpDesc, MacroOutputOp} import java.time.Instant import scala.collection.mutable @@ -58,23 +60,40 @@ object WorkflowCompiler { } } + // After MacroExpander runs, inner-body operator IDs carry a "${macroInstanceId}--..." + // prefix (nested macros stack more "--" segments). 
The macro instance is the only + // operator the user sees on the parent canvas, so any compilation error from an + // inlined inner op must be re-attributed to that visible ID — otherwise the frontend + // looks up errors by canvas IDs and finds nothing for the failed macro. + private def visibleOperatorId(opId: OperatorIdentity): OperatorIdentity = { + val sep = opId.id.indexOf("--") + if (sep < 0) opId else OperatorIdentity(opId.id.substring(0, sep)) + } + // util function for convert the error list to error map, and report the error in log private def convertErrorListToWorkflowFatalErrorMap( logger: Logger, errorList: List[(OperatorIdentity, Throwable)] ): Map[OperatorIdentity, WorkflowFatalError] = { val opIdToError = mutable.Map[OperatorIdentity, WorkflowFatalError]() - errorList.map { + errorList.foreach { case (opId, err) => - // map each error to WorkflowFatalError, and report them in the log + val visibleId = visibleOperatorId(opId) + // Log with the *inner* opId so developers can find which inner op failed. logger.error(s"Error occurred in logical plan compilation for opId: $opId", err) - opIdToError += (opId -> WorkflowFatalError( - COMPILATION_ERROR, - Timestamp(Instant.now), - err.toString, - getStackTraceWithAllCauses(err), - opId.id - )) + // Skip if we already recorded an error for this visible op — keep the first one. 
+ if (!opIdToError.contains(visibleId)) { + val message = + if (visibleId == opId) err.toString + else s"In macro inner op '${opId.id}': ${err.toString}" + opIdToError += (visibleId -> WorkflowFatalError( + COMPILATION_ERROR, + Timestamp(Instant.now), + message, + getStackTraceWithAllCauses(err), + visibleId.id + )) + } } opIdToError.toMap } @@ -122,9 +141,23 @@ case class WorkflowCompilationResult( ) class WorkflowCompiler( - context: WorkflowContext + context: WorkflowContext, + macroRegistry: MacroRegistry = MacroRegistry.Empty ) extends LazyLogging { + // A plan is a "standalone macro body" if it contains marker ops but no + // MacroOpDesc instance to wrap them. That shape is what the drill-down editor + // sends when the user is editing a macro body directly; it has no real + // upstream/downstream context, so we skip physical compilation. + private def isStandaloneMacroBody(plan: LogicalPlan): Boolean = { + val hasMarker = plan.operators.exists { + case _: MacroInputOp | _: MacroOutputOp => true + case _ => false + } + val hasMacroInstance = plan.operators.exists(_.isInstanceOf[MacroOpDesc]) + hasMarker && !hasMacroInstance + } + // function to expand logical plan to physical plan private def expandLogicalPlan( logicalPlan: LogicalPlan, @@ -205,19 +238,55 @@ class WorkflowCompiler( val errorList = new ArrayBuffer[(OperatorIdentity, Throwable)]() var opIdToOutputSchema: Map[OperatorIdentity, Map[PortIdentity, Option[Schema]]] = Map() // 1. convert the pojo to logical plan - val logicalPlan: LogicalPlan = LogicalPlan(logicalPlanPojo) + val rawLogicalPlan: LogicalPlan = LogicalPlan(logicalPlanPojo) + + // 1a. Standalone macro-body plans (the drill-down editor view) contain + // MacroInput/MacroOutput markers but no MacroOpDesc to inline them — so + // calling `getPhysicalPlan` on a marker would throw, and every inner op + // downstream would fail schema propagation. 
The body is only meant for + // structural editing in this view; the real compile happens when a parent + // instantiates the macro and MacroExpander strips the markers. Returning + // success here keeps the body view clean and prevents the singleton + // frontend compile-state from carrying marker errors across to the + // parent canvas on drill-down navigation. + if (isStandaloneMacroBody(rawLogicalPlan)) { + return WorkflowCompilationResult( + physicalPlan = Some(PhysicalPlan(operators = Set.empty, links = Set.empty)), + operatorIdToOutputSchemas = Map.empty, + operatorIdToError = Map.empty + ) + } - // 2. resolve the file name in each scan source operator + // 2. expand any macro operators into a flat logical plan. Macros are a purely + // logical-plan-level abstraction; after this pass the rest of the pipeline never + // sees a MacroOpDesc / MacroInputOp / MacroOutputOp. + val logicalPlan: LogicalPlan = + try { + MacroExpander.expand(rawLogicalPlan, macroRegistry) + } catch { + case e: Throwable => + errorList.append((OperatorIdentity("__macro_expander__"), e)) + rawLogicalPlan + } + + // 3. resolve the file name in each scan source operator logicalPlan.resolveScanSourceOpFileName(Some(errorList)) - // 3. expand the logical plan to the physical plan + // 4. expand the logical plan to the physical plan val physicalPlan = expandLogicalPlan(logicalPlan, Some(errorList)) // 4. collect the output schema for each logical op // even if error is encountered when logical => physical, we still want to get the input schemas for rest no-error operators opIdToOutputSchema = collectOutputSchemaFromPhysicalPlan(physicalPlan, errorList) + + // Only block the physical plan for errors on outer canvas operators. Errors that + // originated inside a macro body are identified by a "--" in their ID (e.g.
"Macro-xxx--SleepOp") + // and are already attributed to the macro instance on the canvas — the outer + // workflow is structurally valid and can still be submitted; the broken macro will + // fail at execution time without blocking unrelated operators. + val outerErrorList = errorList.filter { case (opId, _) => !opId.id.contains("--") } WorkflowCompilationResult( - physicalPlan = if (errorList.nonEmpty) None else Some(physicalPlan), + physicalPlan = if (outerErrorList.nonEmpty) None else Some(physicalPlan), operatorIdToOutputSchemas = opIdToOutputSchema, // map each error from OpId to WorkflowFatalError, and report them via logger operatorIdToError = convertErrorListToWorkflowFatalErrorMap(logger, errorList.toList) diff --git a/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/DbMacroRegistry.scala b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/DbMacroRegistry.scala new file mode 100644 index 00000000000..0af98561f27 --- /dev/null +++ b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/DbMacroRegistry.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.compiler.macroOp + +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.typesafe.scalalogging.LazyLogging +import org.apache.texera.amber.operator.macroOp.MacroBody +import org.apache.texera.dao.SqlServer +import org.apache.texera.dao.jooq.generated.Tables.WORKFLOW +import org.apache.texera.dao.jooq.generated.enums.WorkflowKindEnum + +import scala.util.control.NonFatal + +/** + * jOOQ + Jackson-backed [[MacroRegistry]] that loads a macro body from the + * `workflow` table. `workflow.content` for a macro row is treated as a + * JSON-serialized [[MacroBody]] (same shape produced by `MacroResource.create` + * on the amber side). + * + * v1 ignores the `version` argument and always reads the current row. Pinning + * to a specific `vid` requires reconstructing the body from + * `workflow_version`'s JSON patches and is deferred to Phase 2 alongside the + * explicit "published version" policy described in the design plan. 
+ */ +class DbMacroRegistry extends MacroRegistry with LazyLogging { + + private val mapper = new ObjectMapper().registerModule(DefaultScalaModule) + + override def fetch(macroId: String, version: Int): Option[MacroBody] = { + val widOpt = + try Some(Integer.parseInt(macroId)) + catch { case _: NumberFormatException => None } + + widOpt.flatMap { wid => + try { + val record = SqlServer + .getInstance() + .createDSLContext() + .select(WORKFLOW.CONTENT, WORKFLOW.KIND) + .from(WORKFLOW) + .where(WORKFLOW.WID.eq(wid)) + .fetchOne() + if (record == null || record.value2() != WorkflowKindEnum.MACRO) { + None + } else { + Option(record.value1()) + .filter(_.nonEmpty) + .map(mapper.readValue(_, classOf[MacroBody])) + } + } catch { + case NonFatal(e) => + logger.error( + s"DbMacroRegistry: failed to load macro macroId=$macroId version=$version", + e + ) + None + } + } + } +} diff --git a/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroCompileContext.scala b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroCompileContext.scala new file mode 100644 index 00000000000..4c1ccdca3e0 --- /dev/null +++ b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroCompileContext.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.compiler.macroOp + +// Threaded through MacroExpander to detect macro recursion and depth bombs. +// `visited` is the set of (macroId, version) pairs on the current expansion path; +// reappearance means a cycle. NOTE: `visited` is a plain Set, so the "visited:" list printed by guardAgainstCycle is not guaranteed to follow expansion order. +case class MacroCompileContext( + visited: Set[(String, Int)], + depth: Int +) { + + def guardAgainstCycle(macroId: String, version: Int): Unit = { + if (visited.contains((macroId, version))) { + val path = visited.map { case (id, v) => s"$id@v$v" }.mkString(" -> ") + throw new IllegalStateException( + s"Macro cycle detected: $macroId@v$version is already being expanded on this path " + + s"(visited: $path)" + ) + } + } + + def guardAgainstDepth(): Unit = { + if (depth >= MacroCompileContext.MaxDepth) { + throw new IllegalStateException( + s"Macro expansion depth limit (${MacroCompileContext.MaxDepth}) exceeded — " + + s"likely a self-referential macro chain." 
+ ) + } + } + + def descend(macroId: String, version: Int): MacroCompileContext = + MacroCompileContext(visited + ((macroId, version)), depth + 1) +} + +object MacroCompileContext { + val MaxDepth: Int = 16 + def root: MacroCompileContext = MacroCompileContext(Set.empty, 0) +} diff --git a/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroExpander.scala b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroExpander.scala new file mode 100644 index 00000000000..041b6955ea0 --- /dev/null +++ b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroExpander.scala @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.texera.amber.compiler.macroOp + +import org.apache.texera.amber.compiler.model.{LogicalLink, LogicalPlan} +import org.apache.texera.amber.core.virtualidentity.OperatorIdentity +import org.apache.texera.amber.core.workflow.PortIdentity +import org.apache.texera.amber.operator.{LogicalOp, PortDescription} +import org.apache.texera.amber.operator.udf.python.PythonUDFOpDescV2 +import org.apache.texera.amber.operator.macroOp.{ + MacroBody, + MacroInputOp, + MacroLink, + MacroOpDesc, + MacroOutputOp +} +import org.apache.texera.amber.util.JSONUtils.objectMapper + +// Pre-compile pass: walks a LogicalPlan, inlines every MacroOpDesc by splicing its +// body's inner operators and links into the parent, and produces a flat LogicalPlan +// with no MacroOpDesc / MacroInputOp / MacroOutputOp nodes. Inner-op IDs are rewritten +// to "${opClassSimpleName}-operator-${uuid}", where the UUID is derived deterministically +// from "${macroInstanceId}|${innerOpId}" — the physical-plan layer remains macro-unaware. +// NOTE(review): these IDs contain no "${macroInstanceId}--" prefix; code that detects macro-inner ops via "--" should be checked against this scheme. +object MacroExpander { + + def expand(plan: LogicalPlan, registry: MacroRegistry): LogicalPlan = + expand(plan, registry, MacroCompileContext.root) + + private def expand( + plan: LogicalPlan, + registry: MacroRegistry, + ctx: MacroCompileContext + ): LogicalPlan = { + // Each iteration picks the first remaining MacroOpDesc and inlines it. After + // inlining, the plan shape changes; loop re-scans the fresh `acc`. 
+ var acc = plan + while (acc.operators.exists(_.isInstanceOf[MacroOpDesc])) { + val m = acc.operators.collectFirst { case x: MacroOpDesc => x }.get + acc = inlineMacro(acc, m, registry, ctx) + } + acc + } + + private def inlineMacro( + parent: LogicalPlan, + m: MacroOpDesc, + registry: MacroRegistry, + ctx: MacroCompileContext + ): LogicalPlan = { + ctx.guardAgainstCycle(m.macroId, m.macroVersion) + ctx.guardAgainstDepth() + + // §9.2 AI fusion substitution — mirror of the amber WorkflowCompiler's + // path so the compiling-service (which provides schema-propagation + // hints to the frontend) sees the same shape as the runtime engine. + // Frontend sets `fusion.verified = true` after sample-diff verification. + if (m.fusion.exists(_.verified)) { + return substituteFused(parent, m) + } + + val body: MacroBody = m.linkMode match { + case MacroOpDesc.SNAPSHOT => + m.snapshot.getOrElse( + throw new IllegalArgumentException( + s"MacroOpDesc[${m.macroId}] has linkMode=SNAPSHOT but no embedded snapshot" + ) + ) + case MacroOpDesc.LIVE => + registry + .fetch(m.macroId, m.macroVersion) + .getOrElse( + throw new IllegalArgumentException( + s"MacroOpDesc[${m.macroId}@v${m.macroVersion}] not found in registry " + + s"(LIVE link). The macro may be deleted or inaccessible." 
+ ) + ) + case other => + throw new IllegalArgumentException( + s"MacroOpDesc[${m.macroId}] has unknown linkMode '$other'" + ) + } + + val expandedBody = expand( + LogicalPlan(body.operators, body.links.map(toLogicalLink)), + registry, + ctx.descend(m.macroId, m.macroVersion) + ) + + spliceIntoParent(parent, m, expandedBody) + } + + private def toLogicalLink(ml: MacroLink): LogicalLink = + LogicalLink( + OperatorIdentity(ml.fromOpId), + ml.fromPortId, + OperatorIdentity(ml.toOpId), + ml.toPortId + ) + + private def spliceIntoParent( + parent: LogicalPlan, + m: MacroOpDesc, + body: LogicalPlan + ): LogicalPlan = { + val instanceId = m.operatorIdentifier.id + val mId = m.operatorIdentifier + + val inputMarkers: Map[Int, MacroInputOp] = + body.operators.collect { case b: MacroInputOp => b.portIndex -> b }.toMap + val outputMarkers: Map[Int, MacroOutputOp] = + body.operators.collect { case b: MacroOutputOp => b.portIndex -> b }.toMap + + val markerIds: Set[OperatorIdentity] = + inputMarkers.values.map(_.operatorIdentifier).toSet ++ + outputMarkers.values.map(_.operatorIdentifier).toSet + + // Deep-clone non-marker inner ops via JSON round-trip. + val innerOps: List[LogicalOp] = body.operators.collect { + case op if !op.isInstanceOf[MacroInputOp] && !op.isInstanceOf[MacroOutputOp] => + deepClone(op) + } + + // Assign DETERMINISTIC UUIDs to each inner op via nameUUIDFromBytes + // keyed on (macroInstanceId, originalBodyOpId). Must match the amber + // MacroExpander byte-for-byte — Texera compiles this workflow twice + // (once here for frontend validation, once in amber for actual + // execution); the engine emits stats keyed by the second compile's IDs, + // and `MacroMappingCache` records them. If the IDs differed across + // compilers, the frontend's stats-roll-up to the macro op would fail + // because the cached mapping wouldn't match the actual runtime IDs. 
+ val idRewrite: Map[OperatorIdentity, OperatorIdentity] = innerOps.map { op => + val originalId = op.operatorIdentifier + val seed = s"${m.operatorIdentifier.id}|${originalId.id}" + val derivedUuid = java.util.UUID.nameUUIDFromBytes(seed.getBytes("UTF-8")) + val freshId = s"${op.getClass.getSimpleName}-operator-$derivedUuid" + op.setOperatorId(freshId) + originalId -> op.operatorIdentifier + }.toMap + + def rewriteInnerId(id: OperatorIdentity): OperatorIdentity = + idRewrite.getOrElse( + id, + throw new IllegalStateException( + s"MacroExpander: link references unknown inner op '${id.id}' (instance=$instanceId)" + ) + ) + + // 1. Internal body links (non-marker → non-marker), with prefixed IDs. + val internalLinks: List[LogicalLink] = body.links.collect { + case l if !markerIds.contains(l.fromOpId) && !markerIds.contains(l.toOpId) => + LogicalLink(rewriteInnerId(l.fromOpId), l.fromPortId, rewriteInnerId(l.toOpId), l.toPortId) + } + + // 2. For each external input port, list the inner consumers connected via + // MacroInputOp_i. A port may fan out to multiple consumers. + val inputConsumers: Map[Int, List[(OperatorIdentity, PortIdentity)]] = + inputMarkers.map { + case (portIndex, marker) => + val markerId = marker.operatorIdentifier + val consumers = body.links + .filter(_.fromOpId == markerId) + .map(l => (rewriteInnerId(l.toOpId), l.toPortId)) + portIndex -> consumers + } + + // 3. For each external output port, the single inner producer feeding + // MacroOutputOp_j. More than one producer is a malformed body. 
+ val outputProducers: Map[Int, (OperatorIdentity, PortIdentity)] = + outputMarkers.map { + case (portIndex, marker) => + val markerId = marker.operatorIdentifier + val producers = body.links + .filter(_.toOpId == markerId) + .map(l => (rewriteInnerId(l.fromOpId), l.fromPortId)) + producers match { + case single :: Nil => portIndex -> single + case Nil => + throw new IllegalStateException( + s"MacroOutputOp(portIndex=$portIndex) in macro $instanceId has no producer" + ) + case many => + throw new IllegalStateException( + s"MacroOutputOp(portIndex=$portIndex) in macro $instanceId has " + + s"${many.size} producers; expected exactly one." + ) + } + } + + // 4. Rewrite parent links that touch this macro instance. + val rewrittenParentLinks: List[LogicalLink] = parent.links.flatMap { link => + if (link.toOpId == mId) { + val portIndex = link.toPortId.id + inputConsumers.get(portIndex) match { + case Some(consumers) => + consumers.map { + case (innerOp, innerPort) => + LogicalLink(link.fromOpId, link.fromPortId, innerOp, innerPort) + } + case None => + throw new IllegalStateException( + s"Parent link into ($instanceId, port=$portIndex) has no matching " + + s"MacroInputOp inside the macro body." + ) + } + } else if (link.fromOpId == mId) { + val portIndex = link.fromPortId.id + outputProducers.get(portIndex) match { + case Some((innerOp, innerPort)) => + List(LogicalLink(innerOp, innerPort, link.toOpId, link.toPortId)) + case None => + throw new IllegalStateException( + s"Parent link out of ($instanceId, port=$portIndex) has no matching " + + s"MacroOutputOp inside the macro body." + ) + } + } else { + List(link) + } + } + + val newOps = + parent.operators.filterNot(_.operatorIdentifier == mId) ++ innerOps + val newLinks = rewrittenParentLinks ++ internalLinks + LogicalPlan(newOps, newLinks) + } + + // Deep-clone via JSON round-trip. Avoids mutating the persisted body when we + // rewrite inner-op IDs in spliceIntoParent. 
+ private def deepClone(op: LogicalOp): LogicalOp = { + val json = objectMapper.writeValueAsString(op) + objectMapper.readValue(json, classOf[LogicalOp]) + } + + /** + * §9.2 fusion substitution — replace MacroOpDesc with a single + * PythonUDFOpDescV2 carrying the verified fused code. The new op + * inherits the macro's external port shape and keeps the macro + * instance ID so parent links don't need rewriting. + */ + private def substituteFused(parent: LogicalPlan, m: MacroOpDesc): LogicalPlan = { + val fusion = m.fusion.get + val fused = new PythonUDFOpDescV2() + fused.code = fusion.code + // Schema propagation: see amber/.../MacroExpander.scala substituteFused + // for the same rationale. retainInputColumns lets the engine carry the + // input schema through to the output without a hand-declared + // outputColumns list; workers=1 keeps the fused execution single-actor. + fused.retainInputColumns = m.inputPortCount > 0 + fused.outputColumns = List.empty + fused.workers = 1 + fused.inputPorts = (0 until m.inputPortCount).map { i => + PortDescription( + portID = s"input-$i", + displayName = s"in-$i", + disallowMultiInputs = false, + isDynamicPort = false, + partitionRequirement = null, + dependencies = List.empty + ) + }.toList + fused.outputPorts = (0 until m.outputPortCount).map { i => + PortDescription( + portID = s"output-$i", + displayName = s"out-$i", + disallowMultiInputs = false, + isDynamicPort = false, + partitionRequirement = null, + dependencies = List.empty + ) + }.toList + fused.setOperatorId(m.operatorIdentifier.id) + val newOps = parent.operators.map { + case op if op.operatorIdentifier == m.operatorIdentifier => fused + case op => op + } + LogicalPlan(newOps, parent.links) + } +} diff --git a/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroRegistry.scala b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroRegistry.scala new file mode 100644 index 
00000000000..59ac5cbf5f3
--- /dev/null
+++ b/workflow-compiling-service/src/main/scala/org/apache/texera/amber/compiler/macroOp/MacroRegistry.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.compiler.macroOp
+
+import org.apache.texera.amber.operator.macroOp.MacroBody
+
+/**
+ * Looks up a macro definition's body by (macroId, version).
+ *
+ * The persistence-backed implementation lives in the amber service and queries
+ * workflow_version; tests and services without persistence can use
+ * [[MacroRegistry.Empty]] or [[MacroRegistry.inMemory]].
+ */
+trait MacroRegistry {
+
+  /** The body registered under `(macroId, version)`, if this registry has one. */
+  def fetch(macroId: String, version: Int): Option[MacroBody]
+}
+
+object MacroRegistry {
+
+  /**
+   * Always returns None. Use when persistence is not wired up — SNAPSHOT macros
+   * still work since their body is embedded; LIVE macros fail with
+   * "not found in registry".
+   */
+  object Empty extends MacroRegistry {
+    override def fetch(macroId: String, version: Int): Option[MacroBody] = Option.empty
+  }
+
+  /** For tests: a registry backed by a fixed table of bodies keyed by (id, version). */
+  def inMemory(bodies: Map[(String, Int), MacroBody]): MacroRegistry =
+    new TableRegistry(bodies)
+
+  // Named implementation behind `inMemory`; every lookup goes straight to the supplied table.
+  private final class TableRegistry(table: Map[(String, Int), MacroBody]) extends MacroRegistry {
+    override def fetch(macroId: String, version: Int): Option[MacroBody] =
+      table.get(macroId -> version)
+  }
+}
diff --git a/workflow-compiling-service/src/main/scala/org/apache/texera/service/resource/WorkflowCompilationResource.scala b/workflow-compiling-service/src/main/scala/org/apache/texera/service/resource/WorkflowCompilationResource.scala
index f311f31d0b7..d80df162419 100644
--- a/workflow-compiling-service/src/main/scala/org/apache/texera/service/resource/WorkflowCompilationResource.scala
+++ b/workflow-compiling-service/src/main/scala/org/apache/texera/service/resource/WorkflowCompilationResource.scala
@@ -25,6 +25,7 @@ import jakarta.annotation.security.RolesAllowed
 import jakarta.ws.rs.core.MediaType
 import jakarta.ws.rs.{Consumes, POST, Path, Produces}
 import org.apache.texera.amber.compiler.WorkflowCompiler
+import org.apache.texera.amber.compiler.macroOp.DbMacroRegistry
 import org.apache.texera.amber.compiler.model.LogicalPlanPojo
 import org.apache.texera.amber.core.tuple.Attribute
 import org.apache.texera.amber.core.virtualidentity.WorkflowIdentity
@@ -68,8 +69,10 @@ class WorkflowCompilationResource extends LazyLogging {
     // a placeholder workflow context, as compiling a workflow doesn't require a wid from the frontend
     val context = new WorkflowContext(workflowId = WorkflowIdentity(0))
 
-    // Compile the pojo using WorkflowCompiler
-    val compilationResult = new WorkflowCompiler(context).compile(logicalPlanPojo)
+    // Compile the pojo using WorkflowCompiler. The DB-backed registry resolves
+    // any LIVE-mode macro instances against the `workflow` table.
+ val compilationResult = + new WorkflowCompiler(context, new DbMacroRegistry()).compile(logicalPlanPojo) val operatorOutputSchemas = compilationResult.operatorIdToOutputSchemas.map { case (operatorIdentity, schemas) => diff --git a/workflow-compiling-service/src/test/scala/org/apache/texera/amber/compiler/macroOp/MacroExpanderSpec.scala b/workflow-compiling-service/src/test/scala/org/apache/texera/amber/compiler/macroOp/MacroExpanderSpec.scala new file mode 100644 index 00000000000..b4b0d21f5e5 --- /dev/null +++ b/workflow-compiling-service/src/test/scala/org/apache/texera/amber/compiler/macroOp/MacroExpanderSpec.scala @@ -0,0 +1,694 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.texera.amber.compiler.macroOp
+
+import org.apache.texera.amber.compiler.WorkflowCompiler
+import org.apache.texera.amber.compiler.model.{LogicalLink, LogicalPlan, LogicalPlanPojo}
+import org.apache.texera.amber.core.virtualidentity.{OperatorIdentity, WorkflowIdentity}
+import org.apache.texera.amber.core.workflow.{PortIdentity, WorkflowContext}
+import org.apache.texera.amber.operator.limit.LimitOpDesc
+import org.apache.texera.amber.operator.macroOp._
+import org.apache.texera.amber.operator.source.scan.csv.CSVScanSourceOpDesc
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+/**
+ * Spec for macro handling at compile time.
+ *
+ * The first group of tests calls `MacroExpander.expand` directly on hand-built
+ * logical plans; the second group ("full-compile path") runs `WorkflowCompiler`
+ * on a `LogicalPlanPojo` and checks schemas and error attribution. Shared fixture
+ * construction lives in the private helpers at the bottom of the class.
+ */
+class MacroExpanderSpec extends AnyFlatSpec with Matchers {
+
+  "MacroExpander" should "leave non-macro plans unchanged" in {
+    val src = limit("src", 0)
+    val sink = limit("sink", 1)
+    val plan = LogicalPlan(
+      operators = List(src, sink),
+      links = List(connect(src.operatorIdentifier, sink.operatorIdentifier))
+    )
+    val out = MacroExpander.expand(plan, MacroRegistry.Empty)
+    out.operators.map(_.operatorIdentifier.id).toSet shouldBe Set("src", "sink")
+    out.links.size shouldBe 1
+  }
+
+  it should "inline a single-port SNAPSHOT macro and prefix inner-op IDs" in {
+    val inst = snapshotInstance("MyMacro-1", "macro-A", limitBody("inner", 10))
+    val plan = wrapInPipeline(inst)
+    val out = MacroExpander.expand(plan, MacroRegistry.Empty)
+
+    out.operators.exists(_.isInstanceOf[MacroOpDesc]) shouldBe false
+    out.operators.exists(_.isInstanceOf[MacroInputOp]) shouldBe false
+    out.operators.exists(_.isInstanceOf[MacroOutputOp]) shouldBe false
+
+    limitIds(out) shouldBe Set("src", "sink", "MyMacro-1--inner")
+    edgeIds(out) shouldBe Set("src" -> "MyMacro-1--inner", "MyMacro-1--inner" -> "sink")
+  }
+
+  it should "fetch a LIVE-linked macro body from the registry" in {
+    val registry = MacroRegistry.inMemory(Map(("live-id", 4) -> limitBody("inner", 3)))
+    val inst = liveInstance("L-inst", "live-id", 4)
+    val plan = wrapInPipeline(inst)
+    val out = MacroExpander.expand(plan, registry)
+    limitIds(out) shouldBe Set("src", "sink", "L-inst--inner")
+  }
+
+  it should "expand nested macros with concatenated ID prefixes" in {
+    val innerInst = snapshotInstance("Inner", "macro-inner", limitBody("inner-inner", 7))
+    val outerBody = MacroBody(
+      operators = List(inMarker(0, "oin"), innerInst, outMarker(0, "oout")),
+      links = List(
+        MacroLink("oin", PortIdentity(0), "Inner", PortIdentity(0)),
+        MacroLink("Inner", PortIdentity(0), "oout", PortIdentity(0))
+      )
+    )
+    val outer = snapshotInstance("Outer", "macro-outer", outerBody)
+    val plan = wrapInPipeline(outer)
+    val out = MacroExpander.expand(plan, MacroRegistry.Empty)
+    val ids = limitIds(out)
+    ids should contain("Outer--Inner--inner-inner")
+    ids should contain("src")
+    ids should contain("sink")
+    val edges = edgeIds(out)
+    edges should contain("src" -> "Outer--Inner--inner-inner")
+    edges should contain("Outer--Inner--inner-inner" -> "sink")
+  }
+
+  it should "detect a self-referential macro cycle" in {
+    val cycleId = "loop"
+    // A body that references the same macro again.
+    val recurInst = liveInstance("self", cycleId, 1)
+    val body = MacroBody(
+      operators = List(inMarker(0, "in"), recurInst, outMarker(0, "out")),
+      links = List(
+        MacroLink("in", PortIdentity(0), "self", PortIdentity(0)),
+        MacroLink("self", PortIdentity(0), "out", PortIdentity(0))
+      )
+    )
+    val registry = MacroRegistry.inMemory(Map((cycleId, 1) -> body))
+
+    val outer = liveInstance("outer", cycleId, 1)
+    // Build the plan outside `intercept` so only `expand` can satisfy the expectation.
+    val plan = wrapInPipeline(outer)
+    val ex = intercept[IllegalStateException] { MacroExpander.expand(plan, registry) }
+    ex.getMessage.toLowerCase should include("cycle")
+  }
+
+  it should "fail with a depth-limit error on a long non-cyclic macro chain" in {
+    // Build a chain chain-0 → chain-1 → ... → chain-N (where each chain-i's body
+    // contains a macro instance referencing chain-(i+1)). Distinct macroIds, so the
+    // cycle guard cannot fire; depth guard must.
+    val n = MacroCompileContext.MaxDepth + 5
+    val bodies: Map[(String, Int), MacroBody] = (0 until n).map { i =>
+      val nextId = s"chain-${i + 1}"
+      val innerOp =
+        if (i < n - 1) liveInstance(s"inst-$i", nextId, 1)
+        else limit(s"leaf-$i", 1)
+      val body = MacroBody(
+        operators = List(inMarker(0, s"in-$i"), innerOp, outMarker(0, s"out-$i")),
+        links = List(
+          MacroLink(s"in-$i", PortIdentity(0), innerOp.operatorIdentifier.id, PortIdentity(0)),
+          MacroLink(innerOp.operatorIdentifier.id, PortIdentity(0), s"out-$i", PortIdentity(0))
+        )
+      )
+      (s"chain-$i", 1) -> body
+    }.toMap
+
+    val registry = MacroRegistry.inMemory(bodies)
+    val outer = liveInstance("outer", "chain-0", 1)
+    val plan = wrapInPipeline(outer)
+    val ex = intercept[IllegalStateException] { MacroExpander.expand(plan, registry) }
+    ex.getMessage.toLowerCase should include("depth")
+  }
+
+  it should "give each instance its own prefix when the same macro is used twice" in {
+    val body = limitBody("inner", 9)
+    val inst1 = snapshotInstance("first", "shared", body)
+    val inst2 = snapshotInstance("second", "shared", body)
+    val src = limit("src", 0)
+    val sink = limit("sink", 1)
+    val plan = LogicalPlan(
+      operators = List(src, inst1, inst2, sink),
+      links = List(
+        connect(src.operatorIdentifier, inst1.operatorIdentifier),
+        connect(inst1.operatorIdentifier, inst2.operatorIdentifier),
+        connect(inst2.operatorIdentifier, sink.operatorIdentifier)
+      )
+    )
+    val out = MacroExpander.expand(plan, MacroRegistry.Empty)
+    val ids = limitIds(out)
+    ids should contain("first--inner")
+    ids should contain("second--inner")
+    edgeIds(out) shouldBe Set(
+      "src" -> "first--inner",
+      "first--inner" -> "second--inner",
+      "second--inner" -> "sink"
+    )
+  }
+
+  it should "fan out a single external input port to multiple inner consumers" in {
+    val body = MacroBody(
+      operators = List(
+        inMarker(0, "in"),
+        limit("consumerA", 1),
+        limit("consumerB", 2),
+        outMarker(0, "out")
+      ),
+      links = List(
+        MacroLink("in", PortIdentity(0), "consumerA", PortIdentity(0)),
+        MacroLink("in", PortIdentity(0), "consumerB", PortIdentity(0)),
+        MacroLink("consumerA", PortIdentity(0), "out", PortIdentity(0))
+      )
+    )
+    val inst = snapshotInstance("FanOut", "macro-fan", body)
+    val src = limit("src", 0)
+    val sink = limit("sink", 1)
+    val plan = LogicalPlan(
+      operators = List(src, inst, sink),
+      links = List(
+        connect(src.operatorIdentifier, inst.operatorIdentifier),
+        connect(inst.operatorIdentifier, sink.operatorIdentifier)
+      )
+    )
+    val out = MacroExpander.expand(plan, MacroRegistry.Empty)
+    val srcOutTargets =
+      out.links.filter(_.fromOpId == src.operatorIdentifier).map(_.toOpId.id).toSet
+    srcOutTargets shouldBe Set("FanOut--consumerA", "FanOut--consumerB")
+  }
+
+  it should "fail clearly when a LIVE macro is missing from the registry" in {
+    val inst = liveInstance("inst", "missing", 5)
+    val src = limit("src", 0)
+    val plan = LogicalPlan(
+      operators = List(src, inst),
+      links = List(connect(src.operatorIdentifier, inst.operatorIdentifier))
+    )
+    val ex = intercept[IllegalArgumentException] {
+      MacroExpander.expand(plan, MacroRegistry.Empty)
+    }
+    ex.getMessage.toLowerCase should include("not found")
+    ex.getMessage should include("missing")
+  }
+
+  it should "leave the persisted snapshot body unmutated across two expansions" in {
+    val body = limitBody("inner", 1)
+    val inst = snapshotInstance("once", "m", body)
+    // src and sink are deliberately shared by both plans below.
+    val src = limit("src", 0)
+    val sink = limit("sink", 1)
+    val plan = LogicalPlan(
+      operators = List(src, inst, sink),
+      links = List(
+        connect(src.operatorIdentifier, inst.operatorIdentifier),
+        connect(inst.operatorIdentifier, sink.operatorIdentifier)
+      )
+    )
+
+    val first = MacroExpander.expand(plan, MacroRegistry.Empty)
+    val innerInBodyAfterFirst =
+      body.operators.collectFirst { case l: LimitOpDesc => l.operatorIdentifier.id }
+    innerInBodyAfterFirst shouldBe Some("inner") // not "once--inner" — body wasn't mutated.
+
+    // Re-expand a fresh plan that reuses the SAME body object: must still inline cleanly.
+    val inst2 = snapshotInstance("twice", "m", body)
+    val plan2 = LogicalPlan(
+      operators = List(src, inst2, sink),
+      links = List(
+        connect(src.operatorIdentifier, inst2.operatorIdentifier),
+        connect(inst2.operatorIdentifier, sink.operatorIdentifier)
+      )
+    )
+    val second = MacroExpander.expand(plan2, MacroRegistry.Empty)
+    limitIds(second) should contain("twice--inner")
+    limitIds(first) should contain("once--inner")
+  }
+
+  // ---------- full-compile path: schema propagation + error attribution ----------
+
+  it should "compile a workflow whose source feeds a macro body, propagating schemas through the inline" in {
+    val inst = snapshotInstance("MyMacro-1", "macro-A", limitBody("inner", 10))
+    val pojo = csvPipelinePojo("CSVScan-A", inst, limit("sink", 5))
+
+    val result = compilePlan(pojo)
+
+    result.operatorIdToError shouldBe empty
+    result.physicalPlan should not be empty
+    // Inner op got the source's schema propagated through the macro boundary.
+    val innerSchema = result.operatorIdToOutputSchemas(OperatorIdentity("MyMacro-1--inner"))
+    innerSchema.values.head shouldBe defined
+  }
+
+  it should "propagate schemas through a LIVE-mode macro to multiple downstream ops on the parent canvas" in {
+    // Mirrors the user-reported failure shape:
+    //   CSV → Macro(LIVE, macroId=265) → mid → end
+    // where the macro body is a single one-to-one op wrapped in markers.
+    val registry = MacroRegistry.inMemory(Map(("265", 1) -> limitBody("inner", 10)))
+    val inst = liveInstance("Macro-operator-acc00f1c", "265", 1)
+    val pojo = csvPipelinePojo("CSVScan", inst, limit("mid", 5), limit("end", 2))
+
+    val result = compilePlanWith(registry, pojo)
+
+    result.operatorIdToError shouldBe empty
+    result.physicalPlan should not be empty
+    // Inner op got its schema, and the downstream canvas ops did too.
+    val outputKeys = result.operatorIdToOutputSchemas.keys.map(_.id).toSet
+    outputKeys should contain("Macro-operator-acc00f1c--inner")
+    outputKeys should contain("mid")
+    outputKeys should contain("end")
+  }
+
+  it should "surface the macro on the canvas as failing when the LIVE macro body's inner op lacks its MacroInput link" in {
+    // The macro body has both markers, but the MacroInput → inner-op link is
+    // missing. After expansion, the inner op is disconnected from the parent's
+    // upstream — so its schema can't be computed, and that cascades to every
+    // downstream canvas op. Before the visible-id remap, the macro itself looked
+    // fine in `operatorErrors` (only `Macro/inner` was keyed there), so the
+    // canvas would only mark the *downstream* ops red — confusing the user since
+    // the root cause is the macro.
+    val body = MacroBody(
+      operators = List(inMarker(0, "in"), limit("inner", 10), outMarker(0, "out")),
+      links = List(
+        // intentionally NO link from "in" to "inner"
+        MacroLink("inner", PortIdentity(0), "out", PortIdentity(0))
+      )
+    )
+    val registry = MacroRegistry.inMemory(Map(("265", 1) -> body))
+    val inst = liveInstance("Macro-operator-acc00f1c", "265", 1)
+    val pojo = csvPipelinePojo("CSVScan", inst, limit("mid", 5), limit("end", 2))
+
+    val result = compilePlanWith(registry, pojo)
+
+    result.physicalPlan shouldBe empty
+    val keys = result.operatorIdToError.keys.map(_.id).toSet
+    keys should contain("Macro-operator-acc00f1c") // the macro instance, not "Macro-operator-acc00f1c--inner"
+    keys should contain("mid")
+    keys should contain("end")
+    result.operatorIdToError(OperatorIdentity("Macro-operator-acc00f1c")).message should
+      include("Macro-operator-acc00f1c--inner")
+  }
+
+  it should "still compile the outer workflow when a dangling inner op inside the macro has a schema error" in {
+    // Main path: in → inner1 → out (valid, schema flows through).
+    // Dangling side branch: inner2 has no upstream in the body → schema error.
+    // The outer canvas (CSVScan → Macro → sink) should still compile; only the
+    // macro shows red. Previously the dangling error set physicalPlan to None.
+    val body = MacroBody(
+      operators = List(
+        inMarker(0, "in"),
+        limit("inner1", 10),
+        limit("inner2", 5), // disconnected — no link to/from anything
+        outMarker(0, "out")
+      ),
+      links = List(
+        MacroLink("in", PortIdentity(0), "inner1", PortIdentity(0)),
+        MacroLink("inner1", PortIdentity(0), "out", PortIdentity(0))
+      )
+    )
+    val registry = MacroRegistry.inMemory(Map(("265", 1) -> body))
+    val inst = liveInstance("Macro-operator-acc00f1c", "265", 1)
+    val pojo = csvPipelinePojo("CSVScan", inst, limit("sink", 5))
+
+    val result = compilePlanWith(registry, pojo)
+
+    // Outer workflow compiles despite the dangling inner error.
+    result.physicalPlan should not be empty
+    // Error is attributed to the macro on the canvas, not the inner op.
+    result.operatorIdToError.keys.map(_.id) should contain("Macro-operator-acc00f1c")
+    result.operatorIdToError.keys.map(_.id) should not contain "sink"
+  }
+
+  it should "attribute a schema error inside a macro body to the visible macro instance, not the prefixed inner op" in {
+    // body: in → limit("inner", 7) → out — limit's input schema can't be computed when
+    // the macro has no upstream connection on the parent canvas.
+    val inst = snapshotInstance("Lonely", "macro-A", limitBody("inner", 7))
+
+    val pojo = LogicalPlanPojo(
+      operators = List(inst),
+      links = List(),
+      opsToViewResult = List(),
+      opsToReuseResult = List()
+    )
+
+    val result = compilePlan(pojo)
+
+    // The frontend canvas only shows "Lonely" — error must be keyed under that ID,
+    // not the post-expansion "Lonely--inner", or it never reaches the macro node UI.
+    result.operatorIdToError.keys.map(_.id).toSet shouldBe Set("Lonely")
+    val err = result.operatorIdToError.values.head
+    err.operatorId shouldBe "Lonely"
+    // Inner op id stays in the message so the developer knows which body op blew up.
+    err.message should include("Lonely--inner")
+    err.message should include("schema is not available")
+  }
+
+  it should "short-circuit standalone macro-body compiles (markers present, no MacroOpDesc) to a clean success" in {
+    // Mirrors the drill-down editor: the frontend reloads `workflow.content` for a
+    // macro and the body — markers + inner ops, NO MacroOpDesc — gets fed straight
+    // into /compile by the singleton WorkflowCompilingService. Pre-fix, the markers
+    // threw IllegalStateException, every inner op downstream failed schema
+    // propagation, and the resulting "Failed" state would persist across the
+    // drill-down → parent navigation in the singleton compile-state, making the
+    // parent canvas look broken until the parent's own compile finished.
+    val pojo = LogicalPlanPojo(
+      operators = List(inMarker(0, "in"), limit("inner", 10), outMarker(0, "out")),
+      links = List(
+        connect(OperatorIdentity("in"), OperatorIdentity("inner")),
+        connect(OperatorIdentity("inner"), OperatorIdentity("out"))
+      ),
+      opsToViewResult = List(),
+      opsToReuseResult = List()
+    )
+
+    val result = compilePlan(pojo)
+
+    result.operatorIdToError shouldBe empty
+    result.physicalPlan should not be empty
+    result.physicalPlan.get.operators shouldBe empty
+    result.operatorIdToOutputSchemas shouldBe empty
+  }
+
+  it should "still compile a parent that uses a macro instance (short-circuit does NOT apply post-expansion markers)" in {
+    // Regression guard: the short-circuit fires on the *raw* plan before
+    // MacroExpander runs. A parent canvas legitimately holds a MacroOpDesc
+    // (which carries markers in its embedded body) and must take the full
+    // compile path. Otherwise we'd silently swallow real parent compiles.
+    val inst = snapshotInstance("ParentMacro", "macro-A", limitBody("inner", 10))
+    val pojo = csvPipelinePojo("CSVScan-A", inst)
+
+    val result = compilePlan(pojo)
+
+    result.operatorIdToError shouldBe empty
+    result.physicalPlan should not be empty
+    // The expanded plan should have actually compiled — non-empty physical ops,
+    // proving we took the full path, not the short-circuit.
+    result.physicalPlan.get.operators should not be empty
+  }
+
+  // ---------- helpers ----------
+
+  /** A LimitOpDesc with operator ID `id` and limit `lim`. */
+  private def limit(id: String, lim: Int): LimitOpDesc = {
+    val l = new LimitOpDesc
+    l.limit = lim
+    l.setOperatorId(id)
+    l
+  }
+
+  /** A MacroInputOp marker for external input port `idx`, with operator ID `id`. */
+  private def inMarker(idx: Int, id: String): MacroInputOp = {
+    val m = new MacroInputOp
+    m.portIndex = idx
+    m.setOperatorId(id)
+    m
+  }
+
+  /** A MacroOutputOp marker for external output port `idx`, with operator ID `id`. */
+  private def outMarker(idx: Int, id: String): MacroOutputOp = {
+    val m = new MacroOutputOp
+    m.portIndex = idx
+    m.setOperatorId(id)
+    m
+  }
+
+  /** A 1-in / 1-out SNAPSHOT macro instance (version 1) that embeds `body`. */
+  private def snapshotInstance(
+      instanceId: String,
+      macroId: String,
+      body: MacroBody
+  ): MacroOpDesc = {
+    val m = new MacroOpDesc
+    m.macroId = macroId
+    m.macroVersion = 1
+    m.linkMode = MacroOpDesc.SNAPSHOT
+    m.snapshot = Some(body)
+    m.inputPortCount = 1
+    m.outputPortCount = 1
+    m.setOperatorId(instanceId)
+    m
+  }
+
+  /** A 1-in / 1-out LIVE macro instance; no snapshot is set, so the body must come from a registry. */
+  private def liveInstance(instanceId: String, macroId: String, version: Int): MacroOpDesc = {
+    val m = new MacroOpDesc
+    m.macroId = macroId
+    m.macroVersion = version
+    m.linkMode = MacroOpDesc.LIVE
+    m.inputPortCount = 1
+    m.outputPortCount = 1
+    m.setOperatorId(instanceId)
+    m
+  }
+
+  /** Macro body "in" → limit(innerId, lim) → "out", all on port 0. */
+  private def limitBody(innerId: String, lim: Int): MacroBody =
+    MacroBody(
+      operators = List(inMarker(0, "in"), limit(innerId, lim), outMarker(0, "out")),
+      links = List(
+        MacroLink("in", PortIdentity(0), innerId, PortIdentity(0)),
+        MacroLink(innerId, PortIdentity(0), "out", PortIdentity(0))
+      )
+    )
+
+  /** A port-0 → port-0 link between two operators. */
+  private def connect(from: OperatorIdentity, to: OperatorIdentity): LogicalLink =
+    LogicalLink(from, PortIdentity(0), to, PortIdentity(0))
+
+  /** Plan limit("src", 0) → inst → limit("sink", 1). */
+  private def wrapInPipeline(inst: MacroOpDesc): LogicalPlan = {
+    val src = limit("src", 0)
+    val sink = limit("sink", 1)
+    LogicalPlan(
+      operators = List(src, inst, sink),
+      links = List(
+        connect(src.operatorIdentifier, inst.operatorIdentifier),
+        connect(inst.operatorIdentifier, sink.operatorIdentifier)
+      )
+    )
+  }
+
+  /** IDs of every LimitOpDesc in `plan`. */
+  private def limitIds(plan: LogicalPlan): Set[String] =
+    plan.operators.collect { case l: LimitOpDesc => l.operatorIdentifier.id }.toSet
+
+  /** Every link of `plan` as a (fromOpId, toOpId) pair of ID strings. */
+  private def edgeIds(plan: LogicalPlan): Set[(String, String)] =
+    plan.links.map(l => (l.fromOpId.id, l.toOpId.id)).toSet
+
+  /** CSV scan over the small country-sales test resource, with operator ID `id`. */
+  private def csvSource(id: String): CSVScanSourceOpDesc = {
+    val csvOp = new CSVScanSourceOpDesc()
+    csvOp.fileName = Some("workflow-compiling-service/src/test/resources/country_sales_small.csv")
+    csvOp.customDelimiter = Some(",")
+    csvOp.hasHeader = true
+    csvOp.setOperatorId(id)
+    csvOp
+  }
+
+  /** Pojo for the linear chain csvSource(csvId) → inst → downstream(0) → downstream(1) → ... */
+  private def csvPipelinePojo(
+      csvId: String,
+      inst: MacroOpDesc,
+      downstream: LimitOpDesc*
+  ): LogicalPlanPojo = {
+    val csvOp = csvSource(csvId)
+    val rest = downstream.toList
+    val ids = csvOp.operatorIdentifier :: inst.operatorIdentifier :: rest.map(_.operatorIdentifier)
+    LogicalPlanPojo(
+      operators = List(csvOp, inst) ++ rest,
+      // Link each operator to its successor in the chain.
+      links = ids.zip(ids.tail).map { case (from, to) => connect(from, to) },
+      opsToViewResult = List(),
+      opsToReuseResult = List()
+    )
+  }
+
+  /** Same placeholder context every compile test uses. */
+  private def newContext(): WorkflowContext =
+    new WorkflowContext(workflowId = WorkflowIdentity(0))
+
+  // Named compilePlan* rather than `compile`: ScalaTest's Matchers already has a `compile` member.
+
+  /** Compiles `pojo` with a WorkflowCompiler built without an explicit registry. */
+  private def compilePlan(pojo: LogicalPlanPojo) =
+    new WorkflowCompiler(newContext()).compile(pojo)
+
+  /** Compiles `pojo` with a WorkflowCompiler that resolves LIVE macros through `registry`. */
+  private def compilePlanWith(registry: MacroRegistry, pojo: LogicalPlanPojo) =
+    new WorkflowCompiler(newContext(), registry).compile(pojo)
+}