From d36fc8addaeffc9d798fc8e504825c1cd2da44e0 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 15:39:00 +0100 Subject: [PATCH 01/33] feat: adding project-discovery-worker Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 97 +++++++++++++++++++ .../automatic-project-discovery-worker.yaml | 64 ++++++++++++ ...kerfile.automatic_project_discovery_worker | 23 +++++ .../package.json | 34 +++++++ .../src/activities.ts | 1 + .../src/activities/activities.ts | 7 ++ .../src/main.ts | 36 +++++++ .../src/schedules/scheduleProjectDiscovery.ts | 42 ++++++++ .../src/workflows.ts | 3 + .../src/workflows/discoverProjects.ts | 11 +++ .../tsconfig.json | 4 + 11 files changed, 322 insertions(+) create mode 100644 scripts/services/automatic-project-discovery-worker.yaml create mode 100644 scripts/services/docker/Dockerfile.automatic_project_discovery_worker create mode 100644 services/apps/automatic_project_discovery_worker/package.json create mode 100644 services/apps/automatic_project_discovery_worker/src/activities.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/activities/activities.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/main.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/workflows.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts create mode 100644 services/apps/automatic_project_discovery_worker/tsconfig.json diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 052cfd6ade..0904e17f6a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -476,6 +476,58 @@ importers: specifier: ^3.3.3 version: 3.3.3 + services/apps/automatic_project_discovery_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: 
workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../libs/common_services + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/redis': + specifier: workspace:* + version: link:../../libs/redis + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@crowd/types': + specifier: workspace:* + version: link:../../libs/types + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/cache_worker: dependencies: '@crowd/archetype-standard': @@ -10619,6 +10671,51 @@ snapshots: - '@aws-sdk/client-sts' - aws-crt + '@aws-sdk/client-sso-oidc@3.687.0(@aws-sdk/client-sts@3.687.0)': + dependencies: + '@aws-crypto/sha256-browser': 5.2.0 + '@aws-crypto/sha256-js': 5.2.0 + '@aws-sdk/client-sts': 3.687.0 + '@aws-sdk/core': 3.686.0 + '@aws-sdk/credential-provider-node': 3.687.0(@aws-sdk/client-sso-oidc@3.687.0(@aws-sdk/client-sts@3.687.0))(@aws-sdk/client-sts@3.687.0) + '@aws-sdk/middleware-host-header': 3.686.0 + '@aws-sdk/middleware-logger': 3.686.0 + '@aws-sdk/middleware-recursion-detection': 3.686.0 + '@aws-sdk/middleware-user-agent': 3.687.0 + '@aws-sdk/region-config-resolver': 3.686.0 + '@aws-sdk/types': 3.686.0 + '@aws-sdk/util-endpoints': 3.686.0 + '@aws-sdk/util-user-agent-browser': 3.686.0 + '@aws-sdk/util-user-agent-node': 3.687.0 + '@smithy/config-resolver': 3.0.10 + '@smithy/core': 2.5.1 + 
'@smithy/fetch-http-handler': 4.0.0 + '@smithy/hash-node': 3.0.8 + '@smithy/invalid-dependency': 3.0.8 + '@smithy/middleware-content-length': 3.0.10 + '@smithy/middleware-endpoint': 3.2.1 + '@smithy/middleware-retry': 3.0.25 + '@smithy/middleware-serde': 3.0.8 + '@smithy/middleware-stack': 3.0.8 + '@smithy/node-config-provider': 3.1.9 + '@smithy/node-http-handler': 3.2.5 + '@smithy/protocol-http': 4.1.5 + '@smithy/smithy-client': 3.4.2 + '@smithy/types': 3.6.0 + '@smithy/url-parser': 3.0.8 + '@smithy/util-base64': 3.0.0 + '@smithy/util-body-length-browser': 3.0.0 + '@smithy/util-body-length-node': 3.0.0 + '@smithy/util-defaults-mode-browser': 3.0.25 + '@smithy/util-defaults-mode-node': 3.0.25 + '@smithy/util-endpoints': 2.1.4 + '@smithy/util-middleware': 3.0.8 + '@smithy/util-retry': 3.0.8 + '@smithy/util-utf8': 3.0.0 + tslib: 2.6.2 + transitivePeerDependencies: + - aws-crt + '@aws-sdk/client-sso@3.556.0': dependencies: '@aws-crypto/sha256-browser': 3.0.0 diff --git a/scripts/services/automatic-project-discovery-worker.yaml b/scripts/services/automatic-project-discovery-worker.yaml new file mode 100644 index 0000000000..89f2e2abb3 --- /dev/null +++ b/scripts/services/automatic-project-discovery-worker.yaml @@ -0,0 +1,64 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: automatic-project-discovery-worker + CROWD_TEMPORAL_TASKQUEUE: automatic-project-discovery + SHELL: /bin/sh + +services: + automatic-project-discovery-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + command: 'pnpm run start' + working_dir: /usr/crowd/app/services/apps/automatic_project_discovery_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + automatic-project-discovery-worker-dev: + 
build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + command: 'pnpm run dev' + working_dir: /usr/crowd/app/services/apps/automatic_project_discovery_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: automatic-project-discovery-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/automatic_project_discovery_worker/src:/usr/crowd/app/services/apps/automatic_project_discovery_worker/src + +networks: + crowd-bridge: + external: true diff --git a/scripts/services/docker/Dockerfile.automatic_project_discovery_worker 
b/scripts/services/docker/Dockerfile.automatic_project_discovery_worker new file mode 100644 index 0000000000..9492597b72 --- /dev/null +++ b/scripts/services/docker/Dockerfile.automatic_project_discovery_worker @@ -0,0 +1,23 @@ +FROM node:20-alpine as builder + +RUN apk add --no-cache python3 make g++ + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate + +COPY ./pnpm-workspace.yaml ./pnpm-lock.yaml ./ +RUN pnpm fetch + +COPY ./services ./services +RUN pnpm i --frozen-lockfile + +FROM node:20-bookworm-slim as runner + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate && apt update && apt install -y ca-certificates --no-install-recommends && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/crowd/app/node_modules ./node_modules +COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json +COPY --from=builder /usr/crowd/app/services/libs ./services/libs +COPY --from=builder /usr/crowd/app/services/archetypes/ ./services/archetypes +COPY --from=builder /usr/crowd/app/services/apps/automatic_project_discovery_worker/ ./services/apps/automatic_project_discovery_worker diff --git a/services/apps/automatic_project_discovery_worker/package.json b/services/apps/automatic_project_discovery_worker/package.json new file mode 100644 index 0000000000..a0df2ff8c0 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/package.json @@ -0,0 +1,34 @@ +{ + "name": "@crowd/automatic-project-discovery-worker", + "scripts": { + "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker tsx src/main.ts", + "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", + "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", + "lint": "npx eslint --ext .ts src --max-warnings=0", + "format": "npx prettier --write \"src/**/*.ts\"", + "format-check": "npx prettier --check .", + "tsc-check": "tsc --noEmit" + }, + "dependencies": { + "@crowd/archetype-standard": "workspace:*", + "@crowd/archetype-worker": "workspace:*", + "@crowd/common": "workspace:*", + "@crowd/common_services": "workspace:*", + "@crowd/data-access-layer": "workspace:*", + "@crowd/logging": "workspace:*", + "@crowd/redis": "workspace:*", + "@crowd/temporal": "workspace:*", + "@crowd/types": "workspace:*", + "@temporalio/activity": "~1.11.8", + "@temporalio/client": "~1.11.8", + "@temporalio/workflow": "~1.11.8", + "tsx": "^4.7.1", + "typescript": "^5.6.3" + }, + "devDependencies": { + "@types/node": "^20.8.2", + "nodemon": "^3.0.1" + } +} diff --git a/services/apps/automatic_project_discovery_worker/src/activities.ts b/services/apps/automatic_project_discovery_worker/src/activities.ts new file mode 100644 index 0000000000..3662234550 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/activities.ts @@ -0,0 +1 @@ +export * from './activities/activities' diff --git a/services/apps/automatic_project_discovery_worker/src/activities/activities.ts b/services/apps/automatic_project_discovery_worker/src/activities/activities.ts new file mode 100644 index 0000000000..806f5e5087 --- /dev/null +++ 
b/services/apps/automatic_project_discovery_worker/src/activities/activities.ts @@ -0,0 +1,7 @@ +import { getServiceLogger } from '@crowd/logging' + +const log = getServiceLogger() + +export async function logDiscoveryRun(): Promise { + log.info('Automatic project discovery workflow executed successfully.') +} diff --git a/services/apps/automatic_project_discovery_worker/src/main.ts b/services/apps/automatic_project_discovery_worker/src/main.ts new file mode 100644 index 0000000000..44f3182720 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/main.ts @@ -0,0 +1,36 @@ +import { Config } from '@crowd/archetype-standard' +import { Options, ServiceWorker } from '@crowd/archetype-worker' + +import { scheduleProjectDiscovery } from './schedules/scheduleProjectDiscovery' + +const config: Config = { + envvars: [], + producer: { + enabled: false, + }, + temporal: { + enabled: true, + }, + redis: { + enabled: false, + }, +} + +const options: Options = { + postgres: { + enabled: false, + }, + opensearch: { + enabled: false, + }, +} + +export const svc = new ServiceWorker(config, options) + +setImmediate(async () => { + await svc.init() + + await scheduleProjectDiscovery() + + await svc.start() +}) diff --git a/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts b/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts new file mode 100644 index 0000000000..74e0636b56 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts @@ -0,0 +1,42 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../main' +import { discoverProjects } from '../workflows' + +const DEFAULT_CRON = '0 2 * * *' // Daily at 2:00 AM + +export const scheduleProjectDiscovery = async () => { + const cronExpression = process.env.CROWD_AUTOMATIC_PROJECT_DISCOVERY_CRON || DEFAULT_CRON + + 
svc.log.info(`Scheduling project discovery with cron: ${cronExpression}`) + + try { + await svc.temporal.schedule.create({ + scheduleId: 'automaticProjectDiscovery', + spec: { + cronExpressions: [cronExpression], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 minute', + }, + action: { + type: 'startWorkflow', + workflowType: discoverProjects, + taskQueue: 'automatic-project-discovery', + retry: { + initialInterval: '15 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule already registered in Temporal.') + svc.log.info('Configuration may have changed since. Please make sure they are in sync.') + } else { + throw new Error(err) + } + } +} diff --git a/services/apps/automatic_project_discovery_worker/src/workflows.ts b/services/apps/automatic_project_discovery_worker/src/workflows.ts new file mode 100644 index 0000000000..07b00cee6f --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/workflows.ts @@ -0,0 +1,3 @@ +import { discoverProjects } from './workflows/discoverProjects' + +export { discoverProjects } diff --git a/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts new file mode 100644 index 0000000000..f43a9b5a12 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts @@ -0,0 +1,11 @@ +import { proxyActivities } from '@temporalio/workflow' + +import type * as activities from '../activities' + +const activity = proxyActivities({ + startToCloseTimeout: '1 minutes', +}) + +export async function discoverProjects(): Promise { + await activity.logDiscoveryRun() +} diff --git a/services/apps/automatic_project_discovery_worker/tsconfig.json b/services/apps/automatic_project_discovery_worker/tsconfig.json new file mode 100644 index 
0000000000..bf7f183850 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../base.tsconfig.json", + "include": ["src/**/*"] +} From a736f47773ced1b10ad047c5722cd7a2e21c853b Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 16:12:44 +0100 Subject: [PATCH 02/33] feat: add builder Signed-off-by: Umberto Sgueglia --- scripts/builders/automatic-project-discovery-worker.env | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 scripts/builders/automatic-project-discovery-worker.env diff --git a/scripts/builders/automatic-project-discovery-worker.env b/scripts/builders/automatic-project-discovery-worker.env new file mode 100644 index 0000000000..1294c29254 --- /dev/null +++ b/scripts/builders/automatic-project-discovery-worker.env @@ -0,0 +1,4 @@ +DOCKERFILE="./services/docker/Dockerfile.automatic_project_discovery_worker" +CONTEXT="../" +REPO="sjc.ocir.io/axbydjxa5zuh/automatic-project-discovery-worker" +SERVICES="automatic-project-discovery-worker" From 1af6e976bece751596301bce10d5ac1b0a19a50e Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 16:28:39 +0100 Subject: [PATCH 03/33] refactor: rename service Signed-off-by: Umberto Sgueglia --- .../automatic-project-discovery-worker.env | 4 ---- .../automatic-projects-discovery-worker.env | 4 ++++ ... 
automatic-projects-discovery-worker.yaml} | 20 +++++++++---------- ...rfile.automatic_projects_discovery_worker} | 2 +- .../package.json | 8 ++++---- .../src/activities.ts | 0 .../src/activities/activities.ts | 2 +- .../src/main.ts | 4 ++-- .../schedules/scheduleProjectsDiscovery.ts} | 10 +++++----- .../src/workflows.ts | 0 .../src/workflows/discoverProjects.ts | 0 .../tsconfig.json | 0 12 files changed, 27 insertions(+), 27 deletions(-) delete mode 100644 scripts/builders/automatic-project-discovery-worker.env create mode 100644 scripts/builders/automatic-projects-discovery-worker.env rename scripts/services/{automatic-project-discovery-worker.yaml => automatic-projects-discovery-worker.yaml} (78%) rename scripts/services/docker/{Dockerfile.automatic_project_discovery_worker => Dockerfile.automatic_projects_discovery_worker} (92%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/package.json (69%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/activities.ts (100%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/activities/activities.ts (65%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/main.ts (81%) rename services/apps/{automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts => automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts} (74%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/workflows.ts (100%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/workflows/discoverProjects.ts (100%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/tsconfig.json (100%) diff --git a/scripts/builders/automatic-project-discovery-worker.env 
b/scripts/builders/automatic-project-discovery-worker.env deleted file mode 100644 index 1294c29254..0000000000 --- a/scripts/builders/automatic-project-discovery-worker.env +++ /dev/null @@ -1,4 +0,0 @@ -DOCKERFILE="./services/docker/Dockerfile.automatic_project_discovery_worker" -CONTEXT="../" -REPO="sjc.ocir.io/axbydjxa5zuh/automatic-project-discovery-worker" -SERVICES="automatic-project-discovery-worker" diff --git a/scripts/builders/automatic-projects-discovery-worker.env b/scripts/builders/automatic-projects-discovery-worker.env new file mode 100644 index 0000000000..8416386449 --- /dev/null +++ b/scripts/builders/automatic-projects-discovery-worker.env @@ -0,0 +1,4 @@ +DOCKERFILE="./services/docker/Dockerfile.automatic_projects_discovery_worker" +CONTEXT="../" +REPO="sjc.ocir.io/axbydjxa5zuh/automatic-projects-discovery-worker" +SERVICES="automatic-projects-discovery-worker" diff --git a/scripts/services/automatic-project-discovery-worker.yaml b/scripts/services/automatic-projects-discovery-worker.yaml similarity index 78% rename from scripts/services/automatic-project-discovery-worker.yaml rename to scripts/services/automatic-projects-discovery-worker.yaml index 89f2e2abb3..5f3732b7c2 100644 --- a/scripts/services/automatic-project-discovery-worker.yaml +++ b/scripts/services/automatic-projects-discovery-worker.yaml @@ -3,17 +3,17 @@ version: '3.1' x-env-args: &env-args DOCKER_BUILDKIT: 1 NODE_ENV: docker - SERVICE: automatic-project-discovery-worker - CROWD_TEMPORAL_TASKQUEUE: automatic-project-discovery + SERVICE: automatic-projects-discovery-worker + CROWD_TEMPORAL_TASKQUEUE: automatic-projects-discovery SHELL: /bin/sh services: - automatic-project-discovery-worker: + automatic-projects-discovery-worker: build: context: ../../ - dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + dockerfile: ./scripts/services/docker/Dockerfile.automatic_projects_discovery_worker command: 'pnpm run start' - working_dir: 
/usr/crowd/app/services/apps/automatic_project_discovery_worker + working_dir: /usr/crowd/app/services/apps/automatic_projects_discovery_worker env_file: - ../../backend/.env.dist.local - ../../backend/.env.dist.composed @@ -25,12 +25,12 @@ services: networks: - crowd-bridge - automatic-project-discovery-worker-dev: + automatic-projects-discovery-worker-dev: build: context: ../../ - dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + dockerfile: ./scripts/services/docker/Dockerfile.automatic_projects_discovery_worker command: 'pnpm run dev' - working_dir: /usr/crowd/app/services/apps/automatic_project_discovery_worker + working_dir: /usr/crowd/app/services/apps/automatic_projects_discovery_worker env_file: - ../../backend/.env.dist.local - ../../backend/.env.dist.composed @@ -38,7 +38,7 @@ services: - ../../backend/.env.override.composed environment: <<: *env-args - hostname: automatic-project-discovery-worker + hostname: automatic-projects-discovery-worker networks: - crowd-bridge volumes: @@ -57,7 +57,7 @@ services: - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src - - ../../services/apps/automatic_project_discovery_worker/src:/usr/crowd/app/services/apps/automatic_project_discovery_worker/src + - ../../services/apps/automatic_projects_discovery_worker/src:/usr/crowd/app/services/apps/automatic_projects_discovery_worker/src networks: crowd-bridge: diff --git a/scripts/services/docker/Dockerfile.automatic_project_discovery_worker b/scripts/services/docker/Dockerfile.automatic_projects_discovery_worker similarity index 92% rename from scripts/services/docker/Dockerfile.automatic_project_discovery_worker rename to scripts/services/docker/Dockerfile.automatic_projects_discovery_worker index 9492597b72..860af6601e 100644 --- 
a/scripts/services/docker/Dockerfile.automatic_project_discovery_worker +++ b/scripts/services/docker/Dockerfile.automatic_projects_discovery_worker @@ -20,4 +20,4 @@ COPY --from=builder /usr/crowd/app/node_modules ./node_modules COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json COPY --from=builder /usr/crowd/app/services/libs ./services/libs COPY --from=builder /usr/crowd/app/services/archetypes/ ./services/archetypes -COPY --from=builder /usr/crowd/app/services/apps/automatic_project_discovery_worker/ ./services/apps/automatic_project_discovery_worker +COPY --from=builder /usr/crowd/app/services/apps/automatic_projects_discovery_worker/ ./services/apps/automatic_projects_discovery_worker diff --git a/services/apps/automatic_project_discovery_worker/package.json b/services/apps/automatic_projects_discovery_worker/package.json similarity index 69% rename from services/apps/automatic_project_discovery_worker/package.json rename to services/apps/automatic_projects_discovery_worker/package.json index a0df2ff8c0..1c79505f89 100644 --- a/services/apps/automatic_project_discovery_worker/package.json +++ b/services/apps/automatic_projects_discovery_worker/package.json @@ -1,9 +1,9 @@ { - "name": "@crowd/automatic-project-discovery-worker", + "name": "@crowd/automatic-projects-discovery-worker", "scripts": { - "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker tsx src/main.ts", - "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", - "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx src/main.ts", + "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", "lint": "npx eslint --ext .ts src --max-warnings=0", diff --git a/services/apps/automatic_project_discovery_worker/src/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities.ts similarity index 100% rename from services/apps/automatic_project_discovery_worker/src/activities.ts rename to services/apps/automatic_projects_discovery_worker/src/activities.ts diff --git a/services/apps/automatic_project_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts similarity index 65% rename from services/apps/automatic_project_discovery_worker/src/activities/activities.ts rename to services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index 806f5e5087..3aea7f8200 100644 --- 
a/services/apps/automatic_project_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -3,5 +3,5 @@ import { getServiceLogger } from '@crowd/logging' const log = getServiceLogger() export async function logDiscoveryRun(): Promise { - log.info('Automatic project discovery workflow executed successfully.') + log.info('Automatic projects discovery workflow executed successfully.') } diff --git a/services/apps/automatic_project_discovery_worker/src/main.ts b/services/apps/automatic_projects_discovery_worker/src/main.ts similarity index 81% rename from services/apps/automatic_project_discovery_worker/src/main.ts rename to services/apps/automatic_projects_discovery_worker/src/main.ts index 44f3182720..326c3a361a 100644 --- a/services/apps/automatic_project_discovery_worker/src/main.ts +++ b/services/apps/automatic_projects_discovery_worker/src/main.ts @@ -1,7 +1,7 @@ import { Config } from '@crowd/archetype-standard' import { Options, ServiceWorker } from '@crowd/archetype-worker' -import { scheduleProjectDiscovery } from './schedules/scheduleProjectDiscovery' +import { scheduleProjectsDiscovery } from './schedules/scheduleProjectsDiscovery' const config: Config = { envvars: [], @@ -30,7 +30,7 @@ export const svc = new ServiceWorker(config, options) setImmediate(async () => { await svc.init() - await scheduleProjectDiscovery() + await scheduleProjectsDiscovery() await svc.start() }) diff --git a/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts similarity index 74% rename from services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts rename to services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts index 74e0636b56..847c2e4ce9 100644 --- 
a/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -5,14 +5,14 @@ import { discoverProjects } from '../workflows' const DEFAULT_CRON = '0 2 * * *' // Daily at 2:00 AM -export const scheduleProjectDiscovery = async () => { - const cronExpression = process.env.CROWD_AUTOMATIC_PROJECT_DISCOVERY_CRON || DEFAULT_CRON +export const scheduleProjectsDiscovery = async () => { + const cronExpression = process.env.CROWD_AUTOMATIC_PROJECTS_DISCOVERY_CRON || DEFAULT_CRON - svc.log.info(`Scheduling project discovery with cron: ${cronExpression}`) + svc.log.info(`Scheduling projects discovery with cron: ${cronExpression}`) try { await svc.temporal.schedule.create({ - scheduleId: 'automaticProjectDiscovery', + scheduleId: 'automaticProjectsDiscovery', spec: { cronExpressions: [cronExpression], }, @@ -23,7 +23,7 @@ export const scheduleProjectDiscovery = async () => { action: { type: 'startWorkflow', workflowType: discoverProjects, - taskQueue: 'automatic-project-discovery', + taskQueue: 'automatic-projects-discovery', retry: { initialInterval: '15 seconds', backoffCoefficient: 2, diff --git a/services/apps/automatic_project_discovery_worker/src/workflows.ts b/services/apps/automatic_projects_discovery_worker/src/workflows.ts similarity index 100% rename from services/apps/automatic_project_discovery_worker/src/workflows.ts rename to services/apps/automatic_projects_discovery_worker/src/workflows.ts diff --git a/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts similarity index 100% rename from services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts rename to services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts diff --git 
a/services/apps/automatic_project_discovery_worker/tsconfig.json b/services/apps/automatic_projects_discovery_worker/tsconfig.json similarity index 100% rename from services/apps/automatic_project_discovery_worker/tsconfig.json rename to services/apps/automatic_projects_discovery_worker/tsconfig.json From 339371850e6b4c9f4ebf03984b5e04f0956b8368 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 16:35:53 +0100 Subject: [PATCH 04/33] fix: push lock file Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 138 +++++++++++++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 51 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0904e17f6a..495c278fba 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -528,6 +528,58 @@ importers: specifier: ^3.0.1 version: 3.1.0 + services/apps/automatic_projects_discovery_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../libs/common_services + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/redis': + specifier: workspace:* + version: link:../../libs/redis + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@crowd/types': + specifier: workspace:* + version: link:../../libs/types + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + 
specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/cache_worker: dependencies: '@crowd/archetype-standard': @@ -5627,10 +5679,6 @@ packages: brace-expansion@2.0.1: resolution: {integrity: sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==} - braces@3.0.2: - resolution: {integrity: sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==} - engines: {node: '>=8'} - braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} engines: {node: '>=8'} @@ -6805,10 +6853,6 @@ packages: file-uri-to-path@1.0.0: resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==} - fill-range@7.0.1: - resolution: {integrity: sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==} - engines: {node: '>=8'} - fill-range@7.1.1: resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==} engines: {node: '>=8'} @@ -11885,7 +11929,7 @@ snapshots: '@babel/traverse': 7.24.1 '@babel/types': 7.24.0 convert-source-map: 2.0.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -11946,7 +11990,7 @@ snapshots: '@babel/core': 7.24.4 '@babel/helper-compilation-targets': 7.23.6 '@babel/helper-plugin-utils': 7.24.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) lodash.debounce: 4.0.8 resolve: 1.22.8 transitivePeerDependencies: @@ -12613,7 +12657,7 @@ snapshots: '@babel/helper-split-export-declaration': 7.22.6 '@babel/parser': 7.24.4 '@babel/types': 7.24.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -12925,7 +12969,7 @@ snapshots: '@eslint/eslintrc@2.1.4': dependencies: ajv: 6.12.6 - debug: 
4.3.4(supports-color@5.5.0) + debug: 4.3.4 espree: 9.6.1 globals: 13.24.0 ignore: 5.3.1 @@ -13055,7 +13099,7 @@ snapshots: '@humanwhocodes/config-array@0.11.14': dependencies: '@humanwhocodes/object-schema': 2.0.3 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 minimatch: 3.1.2 transitivePeerDependencies: - supports-color @@ -13457,7 +13501,7 @@ snapshots: '@opensearch-project/opensearch@2.11.0': dependencies: aws4: 1.12.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 hpagent: 1.2.0 json11: 1.1.2 ms: 2.1.3 @@ -14292,7 +14336,7 @@ snapshots: '@superfaceai/parser': 1.2.0 abort-controller: 3.0.0 cross-fetch: 3.1.8(encoding@0.1.13) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 isomorphic-form-data: 2.0.0 vm2: 3.9.19 transitivePeerDependencies: @@ -14303,7 +14347,7 @@ snapshots: dependencies: '@superfaceai/ast': 1.2.0 '@types/debug': 4.1.12 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) typescript: 4.9.5 transitivePeerDependencies: - supports-color @@ -14664,7 +14708,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/type-utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 graphemer: 1.4.0 ignore: 5.3.1 @@ -14684,7 +14728,7 @@ snapshots: '@typescript-eslint/type-utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 graphemer: 1.4.0 ignore: 5.3.1 @@ -14701,7 +14745,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/types': 5.62.0 '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 optionalDependencies: typescript: 5.6.3 @@ -14714,7 +14758,7 @@ snapshots: '@typescript-eslint/types': 6.21.0 
'@typescript-eslint/typescript-estree': 6.21.0(typescript@5.6.3) '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 optionalDependencies: typescript: 5.6.3 @@ -14735,7 +14779,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) eslint: 8.57.0 tsutils: 3.21.0(typescript@5.6.3) optionalDependencies: @@ -14747,7 +14791,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 6.21.0(typescript@5.6.3) '@typescript-eslint/utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 ts-api-utils: 1.3.0(typescript@5.6.3) optionalDependencies: @@ -14763,7 +14807,7 @@ snapshots: dependencies: '@typescript-eslint/types': 5.62.0 '@typescript-eslint/visitor-keys': 5.62.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) globby: 11.1.0 is-glob: 4.0.3 semver: 7.6.0 @@ -14777,7 +14821,7 @@ snapshots: dependencies: '@typescript-eslint/types': 6.21.0 '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 globby: 11.1.0 is-glob: 4.0.3 minimatch: 9.0.3 @@ -14966,13 +15010,13 @@ snapshots: agent-base@6.0.2: dependencies: - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color agent-base@7.1.1: dependencies: - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -15422,10 +15466,6 @@ snapshots: dependencies: balanced-match: 1.0.2 - braces@3.0.2: - dependencies: - fill-range: 7.0.1 - braces@3.0.3: dependencies: fill-range: 7.1.1 @@ -15572,7 +15612,7 @@ snapshots: chokidar@3.6.0: dependencies: anymatch: 3.1.3 - braces: 3.0.2 + braces: 3.0.3 glob-parent: 5.1.2 is-binary-path: 2.1.0 is-glob: 4.0.3 @@ -15983,19 +16023,19 @@ snapshots: optionalDependencies: supports-color: 5.5.0 - 
debug@4.3.4(supports-color@5.5.0): + debug@4.3.4: dependencies: ms: 2.1.2 - optionalDependencies: - supports-color: 5.5.0 debug@4.3.7: dependencies: ms: 2.1.3 - debug@4.4.0: + debug@4.4.0(supports-color@5.5.0): dependencies: ms: 2.1.3 + optionalDependencies: + supports-color: 5.5.0 decamelize@1.2.0: {} @@ -16557,7 +16597,7 @@ snapshots: ajv: 6.12.6 chalk: 4.1.2 cross-spawn: 7.0.3 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 doctrine: 3.0.0 escape-string-regexp: 4.0.0 eslint-scope: 7.2.2 @@ -16847,10 +16887,6 @@ snapshots: file-uri-to-path@1.0.0: {} - fill-range@7.0.1: - dependencies: - to-regex-range: 5.0.1 - fill-range@7.1.1: dependencies: to-regex-range: 5.0.1 @@ -17455,7 +17491,7 @@ snapshots: dependencies: '@tootallnate/once': 2.0.0 agent-base: 6.0.2 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -17471,14 +17507,14 @@ snapshots: https-proxy-agent@5.0.1: dependencies: agent-base: 6.0.2 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color https-proxy-agent@7.0.4: dependencies: agent-base: 7.1.1 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -17888,7 +17924,7 @@ snapshots: dependencies: '@types/express': 4.17.21 '@types/jsonwebtoken': 9.0.6 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 jose: 4.15.5 limiter: 1.1.5 lru-memoizer: 2.2.0 @@ -17948,7 +17984,7 @@ snapshots: dependencies: chalk: 5.4.1 commander: 13.1.0 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) execa: 8.0.1 lilconfig: 3.1.3 listr2: 8.2.5 @@ -18424,7 +18460,7 @@ snapshots: nodemon@3.1.0: dependencies: chokidar: 3.6.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.4.0(supports-color@5.5.0) ignore-by-default: 1.0.1 minimatch: 3.1.2 pstree.remy: 1.1.8 @@ -19136,7 +19172,7 @@ snapshots: command-line-usage: 6.1.3 config: 3.3.11 configstore: 5.0.1 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 editor: 1.0.0 enquirer: 2.4.1 form-data: 
4.0.0 @@ -19302,7 +19338,7 @@ snapshots: retry-request@4.2.2: dependencies: - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) extend: 3.0.2 transitivePeerDependencies: - supports-color @@ -19492,7 +19528,7 @@ snapshots: dependencies: '@types/debug': 4.1.12 '@types/validator': 13.11.9 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 dottie: 2.0.6 inflection: 1.13.4 lodash: 4.17.21 @@ -19740,7 +19776,7 @@ snapshots: accepts: 1.3.8 base64id: 2.0.0 cors: 2.8.5 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 engine.io: 6.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-adapter: 2.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-parser: 4.2.4 @@ -19898,7 +19934,7 @@ snapshots: dependencies: component-emitter: 1.3.1 cookiejar: 2.1.4 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 fast-safe-stringify: 2.1.1 form-data: 4.0.0 formidable: 2.1.2 From 637d42ca693c14c1bf390a6f1b27d778900c44d9 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 11 Feb 2026 10:40:07 +0100 Subject: [PATCH 05/33] feat: add migrations (CM-950) (#3835) Signed-off-by: Umberto Sgueglia --- ...dd-automatic_projects_discovery-tables.sql | 10 +++++ ...dd-automatic_projects_discovery-tables.sql | 40 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql create mode 100644 backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql diff --git a/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql new file mode 100644 index 0000000000..879b28e1a2 --- /dev/null +++ b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql @@ -0,0 +1,10 @@ +DROP INDEX IF EXISTS "ix_evaluatedProjects_onboarded"; +DROP INDEX IF EXISTS "ix_evaluatedProjects_evaluationScore"; +DROP INDEX IF EXISTS 
"ix_evaluatedProjects_evaluationStatus"; +DROP INDEX IF EXISTS "uix_evaluatedProjects_projectCatalogId"; +DROP TABLE IF EXISTS "evaluatedProjects"; + +DROP INDEX IF EXISTS "ix_projectCatalog_syncedAt"; +DROP INDEX IF EXISTS "ix_projectCatalog_criticalityScore"; +DROP INDEX IF EXISTS "uix_projectCatalog_repoUrl"; +DROP TABLE IF EXISTS "projectCatalog"; diff --git a/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql new file mode 100644 index 0000000000..53697ce5ce --- /dev/null +++ b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql @@ -0,0 +1,40 @@ +-- Project Catalog: candidate projects discovered from OSSF Criticality Score and other sources +CREATE TABLE IF NOT EXISTS "projectCatalog" ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + "projectSlug" VARCHAR(255) NOT NULL, + "repoName" VARCHAR(255) NOT NULL, + "repoUrl" VARCHAR(1024) NOT NULL, + "criticalityScore" DOUBLE PRECISION, + "syncedAt" TIMESTAMP WITH TIME ZONE DEFAULT NULL, + "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE UNIQUE INDEX "uix_projectCatalog_repoUrl" ON "projectCatalog" ("repoUrl"); +CREATE INDEX "ix_projectCatalog_criticalityScore" ON "projectCatalog" ("criticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_syncedAt" ON "projectCatalog" ("syncedAt"); + +-- Evaluated Projects: AI evaluation results linked to catalog entries +CREATE TABLE IF NOT EXISTS "evaluatedProjects" ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + "projectCatalogId" UUID NOT NULL REFERENCES "projectCatalog"(id) ON DELETE CASCADE, + "evaluationStatus" VARCHAR(50) NOT NULL DEFAULT 'pending', + "evaluationScore" DOUBLE PRECISION, + "evaluation" JSONB, + "evaluationReason" TEXT, + "evaluatedAt" TIMESTAMP WITH TIME ZONE, + "starsCount" INTEGER, + 
"forksCount" INTEGER, + "commitsCount" INTEGER, + "pullRequestsCount" INTEGER, + "issuesCount" INTEGER, + "onboarded" BOOLEAN NOT NULL DEFAULT FALSE, + "onboardedAt" TIMESTAMP WITH TIME ZONE, + "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE UNIQUE INDEX "uix_evaluatedProjects_projectCatalogId" ON "evaluatedProjects" ("projectCatalogId"); +CREATE INDEX "ix_evaluatedProjects_evaluationStatus" ON "evaluatedProjects" ("evaluationStatus"); +CREATE INDEX "ix_evaluatedProjects_evaluationScore" ON "evaluatedProjects" ("evaluationScore" DESC NULLS LAST); +CREATE INDEX "ix_evaluatedProjects_onboarded" ON "evaluatedProjects" ("onboarded"); From 3ea08f48bfcf290efce9d2567a60094879bfc672 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Fri, 20 Feb 2026 15:56:45 +0100 Subject: [PATCH 06/33] fix: add lf-criticality-score Signed-off-by: Umberto Sgueglia --- ...U1770653666__add-automatic_projects_discovery-tables.sql | 3 ++- ...V1770653666__add-automatic_projects_discovery-tables.sql | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql index 879b28e1a2..a32dbe9a91 100644 --- a/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql +++ b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql @@ -5,6 +5,7 @@ DROP INDEX IF EXISTS "uix_evaluatedProjects_projectCatalogId"; DROP TABLE IF EXISTS "evaluatedProjects"; DROP INDEX IF EXISTS "ix_projectCatalog_syncedAt"; -DROP INDEX IF EXISTS "ix_projectCatalog_criticalityScore"; +DROP INDEX IF EXISTS "ix_projectCatalog_lfCriticalityScore"; +DROP INDEX IF EXISTS "ix_projectCatalog_ossfCriticalityScore"; DROP INDEX IF EXISTS "uix_projectCatalog_repoUrl"; DROP TABLE IF EXISTS 
"projectCatalog"; diff --git a/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql index 53697ce5ce..c2add79aae 100644 --- a/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql +++ b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql @@ -4,14 +4,16 @@ CREATE TABLE IF NOT EXISTS "projectCatalog" ( "projectSlug" VARCHAR(255) NOT NULL, "repoName" VARCHAR(255) NOT NULL, "repoUrl" VARCHAR(1024) NOT NULL, - "criticalityScore" DOUBLE PRECISION, + "ossfCriticalityScore" DOUBLE PRECISION, + "lfCriticalityScore" DOUBLE PRECISION, "syncedAt" TIMESTAMP WITH TIME ZONE DEFAULT NULL, "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP ); CREATE UNIQUE INDEX "uix_projectCatalog_repoUrl" ON "projectCatalog" ("repoUrl"); -CREATE INDEX "ix_projectCatalog_criticalityScore" ON "projectCatalog" ("criticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_ossfCriticalityScore" ON "projectCatalog" ("ossfCriticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_lfCriticalityScore" ON "projectCatalog" ("lfCriticalityScore" DESC NULLS LAST); CREATE INDEX "ix_projectCatalog_syncedAt" ON "projectCatalog" ("syncedAt"); -- Evaluated Projects: AI evaluation results linked to catalog entries From 7e6dc1877e23ccfb75c5d30289902d2fdab576c5 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 24 Mar 2026 11:26:19 +0100 Subject: [PATCH 07/33] fix: add dependencies Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 495c278fba..ef062f9f07 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -16031,6 +16031,10 @@ snapshots: dependencies: ms: 2.1.3 + debug@4.4.0: + dependencies: + ms: 2.1.3 + 
debug@4.4.0(supports-color@5.5.0): dependencies: ms: 2.1.3 From 82f29d9eb29635578ab64f6ae1f4bc8a5dc4d70c Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 10 Feb 2026 11:27:28 +0100 Subject: [PATCH 08/33] feat: add DAL Signed-off-by: Umberto Sgueglia --- .../evaluated-projects/evaluatedProjects.ts | 397 ++++++++++++++++++ .../src/evaluated-projects/index.ts | 2 + .../src/evaluated-projects/types.ts | 69 +++ services/libs/data-access-layer/src/index.ts | 2 + .../src/project-catalog/index.ts | 2 + .../src/project-catalog/projectCatalog.ts | 315 ++++++++++++++ .../src/project-catalog/types.ts | 23 + 7 files changed, 810 insertions(+) create mode 100644 services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts create mode 100644 services/libs/data-access-layer/src/evaluated-projects/index.ts create mode 100644 services/libs/data-access-layer/src/evaluated-projects/types.ts create mode 100644 services/libs/data-access-layer/src/project-catalog/index.ts create mode 100644 services/libs/data-access-layer/src/project-catalog/projectCatalog.ts create mode 100644 services/libs/data-access-layer/src/project-catalog/types.ts diff --git a/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts new file mode 100644 index 0000000000..caec6a72a1 --- /dev/null +++ b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts @@ -0,0 +1,397 @@ +import { QueryExecutor } from '../queryExecutor' +import { prepareSelectColumns } from '../utils' + +import { + EvaluationStatus, + IDbEvaluatedProject, + IDbEvaluatedProjectCreate, + IDbEvaluatedProjectUpdate, +} from './types' + +const EVALUATED_PROJECT_COLUMNS = [ + 'id', + 'projectCatalogId', + 'evaluationStatus', + 'evaluationScore', + 'evaluation', + 'evaluationReason', + 'evaluatedAt', + 'starsCount', + 'forksCount', + 'commitsCount', + 'pullRequestsCount', + 'issuesCount', + 'onboarded', + 
'onboardedAt', + 'createdAt', + 'updatedAt', +] + +export async function findEvaluatedProjectById( + qx: QueryExecutor, + id: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + WHERE id = $(id) + `, + { id }, + ) +} + +export async function findEvaluatedProjectByProjectCatalogId( + qx: QueryExecutor, + projectCatalogId: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + WHERE "projectCatalogId" = $(projectCatalogId) + `, + { projectCatalogId }, + ) +} + +export async function findEvaluatedProjectsByStatus( + qx: QueryExecutor, + evaluationStatus: EvaluationStatus, + options: { limit?: number; offset?: number } = {}, +): Promise { + const { limit, offset } = options + + return qx.select( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + WHERE "evaluationStatus" = $(evaluationStatus) + ORDER BY "createdAt" ASC + ${limit !== undefined ? 'LIMIT $(limit)' : ''} + ${offset !== undefined ? 'OFFSET $(offset)' : ''} + `, + { evaluationStatus, limit, offset }, + ) +} + +export async function findAllEvaluatedProjects( + qx: QueryExecutor, + options: { limit?: number; offset?: number } = {}, +): Promise { + const { limit, offset } = options + + return qx.select( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + ORDER BY "createdAt" DESC + ${limit !== undefined ? 'LIMIT $(limit)' : ''} + ${offset !== undefined ? 'OFFSET $(offset)' : ''} + `, + { limit, offset }, + ) +} + +export async function countEvaluatedProjects( + qx: QueryExecutor, + evaluationStatus?: EvaluationStatus, +): Promise { + const statusFilter = evaluationStatus ? 
'WHERE "evaluationStatus" = $(evaluationStatus)' : '' + + const result = await qx.selectOne( + ` + SELECT COUNT(*) AS count + FROM "evaluatedProjects" + ${statusFilter} + `, + { evaluationStatus }, + ) + return parseInt(result.count, 10) +} + +export async function insertEvaluatedProject( + qx: QueryExecutor, + data: IDbEvaluatedProjectCreate, +): Promise { + return qx.selectOne( + ` + INSERT INTO "evaluatedProjects" ( + "projectCatalogId", + "evaluationStatus", + "evaluationScore", + evaluation, + "evaluationReason", + "starsCount", + "forksCount", + "commitsCount", + "pullRequestsCount", + "issuesCount", + "createdAt", + "updatedAt" + ) + VALUES ( + $(projectCatalogId), + $(evaluationStatus), + $(evaluationScore), + $(evaluation), + $(evaluationReason), + $(starsCount), + $(forksCount), + $(commitsCount), + $(pullRequestsCount), + $(issuesCount), + NOW(), + NOW() + ) + RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + `, + { + projectCatalogId: data.projectCatalogId, + evaluationStatus: data.evaluationStatus ?? 'pending', + evaluationScore: data.evaluationScore ?? null, + evaluation: data.evaluation ? JSON.stringify(data.evaluation) : null, + evaluationReason: data.evaluationReason ?? null, + starsCount: data.starsCount ?? null, + forksCount: data.forksCount ?? null, + commitsCount: data.commitsCount ?? null, + pullRequestsCount: data.pullRequestsCount ?? null, + issuesCount: data.issuesCount ?? null, + }, + ) +} + +export async function bulkInsertEvaluatedProjects( + qx: QueryExecutor, + items: IDbEvaluatedProjectCreate[], +): Promise { + if (items.length === 0) { + return + } + + const values = items.map((item) => ({ + projectCatalogId: item.projectCatalogId, + evaluationStatus: item.evaluationStatus ?? 'pending', + evaluationScore: item.evaluationScore ?? null, + evaluation: item.evaluation ? JSON.stringify(item.evaluation) : null, + evaluationReason: item.evaluationReason ?? null, + starsCount: item.starsCount ?? 
null, + forksCount: item.forksCount ?? null, + commitsCount: item.commitsCount ?? null, + pullRequestsCount: item.pullRequestsCount ?? null, + issuesCount: item.issuesCount ?? null, + })) + + await qx.result( + ` + INSERT INTO "evaluatedProjects" ( + "projectCatalogId", + "evaluationStatus", + "evaluationScore", + evaluation, + "evaluationReason", + "starsCount", + "forksCount", + "commitsCount", + "pullRequestsCount", + "issuesCount", + "createdAt", + "updatedAt" + ) + SELECT + v."projectCatalogId"::uuid, + v."evaluationStatus", + v."evaluationScore"::double precision, + v.evaluation::jsonb, + v."evaluationReason", + v."starsCount"::integer, + v."forksCount"::integer, + v."commitsCount"::integer, + v."pullRequestsCount"::integer, + v."issuesCount"::integer, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectCatalogId" text, + "evaluationStatus" text, + "evaluationScore" double precision, + evaluation jsonb, + "evaluationReason" text, + "starsCount" integer, + "forksCount" integer, + "commitsCount" integer, + "pullRequestsCount" integer, + "issuesCount" integer + ) + `, + { values: JSON.stringify(values) }, + ) +} + +export async function updateEvaluatedProject( + qx: QueryExecutor, + id: string, + data: IDbEvaluatedProjectUpdate, +): Promise { + const setClauses: string[] = [] + const params: Record = { id } + + if (data.evaluationStatus !== undefined) { + setClauses.push('"evaluationStatus" = $(evaluationStatus)') + params.evaluationStatus = data.evaluationStatus + } + if (data.evaluationScore !== undefined) { + setClauses.push('"evaluationScore" = $(evaluationScore)') + params.evaluationScore = data.evaluationScore + } + if (data.evaluation !== undefined) { + setClauses.push('evaluation = $(evaluation)') + params.evaluation = data.evaluation ? 
JSON.stringify(data.evaluation) : null + } + if (data.evaluationReason !== undefined) { + setClauses.push('"evaluationReason" = $(evaluationReason)') + params.evaluationReason = data.evaluationReason + } + if (data.evaluatedAt !== undefined) { + setClauses.push('"evaluatedAt" = $(evaluatedAt)') + params.evaluatedAt = data.evaluatedAt + } + if (data.starsCount !== undefined) { + setClauses.push('"starsCount" = $(starsCount)') + params.starsCount = data.starsCount + } + if (data.forksCount !== undefined) { + setClauses.push('"forksCount" = $(forksCount)') + params.forksCount = data.forksCount + } + if (data.commitsCount !== undefined) { + setClauses.push('"commitsCount" = $(commitsCount)') + params.commitsCount = data.commitsCount + } + if (data.pullRequestsCount !== undefined) { + setClauses.push('"pullRequestsCount" = $(pullRequestsCount)') + params.pullRequestsCount = data.pullRequestsCount + } + if (data.issuesCount !== undefined) { + setClauses.push('"issuesCount" = $(issuesCount)') + params.issuesCount = data.issuesCount + } + if (data.onboarded !== undefined) { + setClauses.push('onboarded = $(onboarded)') + params.onboarded = data.onboarded + } + if (data.onboardedAt !== undefined) { + setClauses.push('"onboardedAt" = $(onboardedAt)') + params.onboardedAt = data.onboardedAt + } + + if (setClauses.length === 0) { + return findEvaluatedProjectById(qx, id) + } + + return qx.selectOneOrNone( + ` + UPDATE "evaluatedProjects" + SET + ${setClauses.join(',\n ')}, + "updatedAt" = NOW() + WHERE id = $(id) + RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + `, + params, + ) +} + +export async function markEvaluatedProjectAsEvaluated( + qx: QueryExecutor, + id: string, + data: { + evaluationScore: number + evaluation: Record + evaluationReason?: string + }, +): Promise { + return qx.selectOneOrNone( + ` + UPDATE "evaluatedProjects" + SET + "evaluationStatus" = 'evaluated', + "evaluationScore" = $(evaluationScore), + evaluation = $(evaluation), + 
"evaluationReason" = $(evaluationReason), + "evaluatedAt" = NOW(), + "updatedAt" = NOW() + WHERE id = $(id) + RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + `, + { + id, + evaluationScore: data.evaluationScore, + evaluation: JSON.stringify(data.evaluation), + evaluationReason: data.evaluationReason ?? null, + }, + ) +} + +export async function markEvaluatedProjectAsOnboarded( + qx: QueryExecutor, + id: string, +): Promise { + await qx.selectNone( + ` + UPDATE "evaluatedProjects" + SET + onboarded = true, + "onboardedAt" = NOW(), + "updatedAt" = NOW() + WHERE id = $(id) + `, + { id }, + ) +} + +export async function deleteEvaluatedProject(qx: QueryExecutor, id: string): Promise { + return qx.result( + ` + DELETE FROM "evaluatedProjects" + WHERE id = $(id) + `, + { id }, + ) +} + +export async function deleteEvaluatedProjectByProjectCatalogId( + qx: QueryExecutor, + projectCatalogId: string, +): Promise { + return qx.result( + ` + DELETE FROM "evaluatedProjects" + WHERE "projectCatalogId" = $(projectCatalogId) + `, + { projectCatalogId }, + ) +} + +export async function findPendingEvaluatedProjectsWithCatalog( + qx: QueryExecutor, + options: { limit?: number } = {}, +): Promise<(IDbEvaluatedProject & { projectSlug: string; repoName: string; repoUrl: string })[]> { + const { limit } = options + + return qx.select( + ` + SELECT + ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS, 'ep')}, + pc."projectSlug", + pc."repoName", + pc."repoUrl" + FROM "evaluatedProjects" ep + JOIN "projectCatalog" pc ON pc.id = ep."projectCatalogId" + WHERE ep."evaluationStatus" = 'pending' + ORDER BY ep."createdAt" ASC + ${limit !== undefined ? 
'LIMIT $(limit)' : ''} + `, + { limit }, + ) +} diff --git a/services/libs/data-access-layer/src/evaluated-projects/index.ts b/services/libs/data-access-layer/src/evaluated-projects/index.ts new file mode 100644 index 0000000000..7a4064eec2 --- /dev/null +++ b/services/libs/data-access-layer/src/evaluated-projects/index.ts @@ -0,0 +1,2 @@ +export * from './types' +export * from './evaluatedProjects' diff --git a/services/libs/data-access-layer/src/evaluated-projects/types.ts b/services/libs/data-access-layer/src/evaluated-projects/types.ts new file mode 100644 index 0000000000..f8661f47a2 --- /dev/null +++ b/services/libs/data-access-layer/src/evaluated-projects/types.ts @@ -0,0 +1,69 @@ +export type EvaluationStatus = 'pending' | 'evaluating' | 'evaluated' | 'failed' + +export interface IDbEvaluatedProject { + id: string + projectCatalogId: string + evaluationStatus: EvaluationStatus + evaluationScore: number | null + evaluation: Record | null + evaluationReason: string | null + evaluatedAt: string | null + starsCount: number | null + forksCount: number | null + commitsCount: number | null + pullRequestsCount: number | null + issuesCount: number | null + onboarded: boolean + onboardedAt: string | null + createdAt: string | null + updatedAt: string | null +} + +type EvaluatedProjectWritable = Pick< + IDbEvaluatedProject, + | 'projectCatalogId' + | 'evaluationStatus' + | 'evaluationScore' + | 'evaluation' + | 'evaluationReason' + | 'evaluatedAt' + | 'starsCount' + | 'forksCount' + | 'commitsCount' + | 'pullRequestsCount' + | 'issuesCount' + | 'onboarded' + | 'onboardedAt' +> + +export type IDbEvaluatedProjectCreate = Omit & { + projectCatalogId: string +} & { + evaluationStatus?: EvaluationStatus + evaluationScore?: number + evaluation?: Record + evaluationReason?: string + evaluatedAt?: string + starsCount?: number + forksCount?: number + commitsCount?: number + pullRequestsCount?: number + issuesCount?: number + onboarded?: boolean + onboardedAt?: string +} + 
+export type IDbEvaluatedProjectUpdate = Partial<{ + evaluationStatus: EvaluationStatus + evaluationScore: number + evaluation: Record + evaluationReason: string + evaluatedAt: string + starsCount: number + forksCount: number + commitsCount: number + pullRequestsCount: number + issuesCount: number + onboarded: boolean + onboardedAt: string +}> diff --git a/services/libs/data-access-layer/src/index.ts b/services/libs/data-access-layer/src/index.ts index 639f0547b8..5ef4749d79 100644 --- a/services/libs/data-access-layer/src/index.ts +++ b/services/libs/data-access-layer/src/index.ts @@ -13,3 +13,5 @@ export * from './systemSettings' export * from './integrations' export * from './auditLogs' export * from './maintainers' +export * from './project-catalog' +export * from './evaluated-projects' diff --git a/services/libs/data-access-layer/src/project-catalog/index.ts b/services/libs/data-access-layer/src/project-catalog/index.ts new file mode 100644 index 0000000000..af7ef7faa1 --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/index.ts @@ -0,0 +1,2 @@ +export * from './types' +export * from './projectCatalog' diff --git a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts new file mode 100644 index 0000000000..2e3b409579 --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts @@ -0,0 +1,315 @@ +import { QueryExecutor } from '../queryExecutor' +import { prepareSelectColumns } from '../utils' + +import { IDbProjectCatalog, IDbProjectCatalogCreate, IDbProjectCatalogUpdate } from './types' + +const PROJECT_CATALOG_COLUMNS = [ + 'id', + 'projectSlug', + 'repoName', + 'repoUrl', + 'criticalityScore', + 'syncedAt', + 'createdAt', + 'updatedAt', +] + +export async function findProjectCatalogById( + qx: QueryExecutor, + id: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT 
${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + WHERE id = $(id) + `, + { id }, + ) +} + +export async function findProjectCatalogByRepoUrl( + qx: QueryExecutor, + repoUrl: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + WHERE "repoUrl" = $(repoUrl) + `, + { repoUrl }, + ) +} + +export async function findProjectCatalogBySlug( + qx: QueryExecutor, + projectSlug: string, +): Promise { + return qx.select( + ` + SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + WHERE "projectSlug" = $(projectSlug) + ORDER BY "createdAt" DESC + `, + { projectSlug }, + ) +} + +export async function findAllProjectCatalog( + qx: QueryExecutor, + options: { limit?: number; offset?: number } = {}, +): Promise { + const { limit, offset } = options + + return qx.select( + ` + SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + ORDER BY "createdAt" DESC + ${limit !== undefined ? 'LIMIT $(limit)' : ''} + ${offset !== undefined ? 'OFFSET $(offset)' : ''} + `, + { limit, offset }, + ) +} + +export async function countProjectCatalog(qx: QueryExecutor): Promise { + const result = await qx.selectOne( + ` + SELECT COUNT(*) AS count + FROM "projectCatalog" + `, + ) + return parseInt(result.count, 10) +} + +export async function insertProjectCatalog( + qx: QueryExecutor, + data: IDbProjectCatalogCreate, +): Promise { + return qx.selectOne( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + VALUES ( + $(projectSlug), + $(repoName), + $(repoUrl), + $(criticalityScore), + NOW(), + NOW() + ) + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + { + projectSlug: data.projectSlug, + repoName: data.repoName, + repoUrl: data.repoUrl, + criticalityScore: data.criticalityScore ?? 
null, + }, + ) +} + +export async function bulkInsertProjectCatalog( + qx: QueryExecutor, + items: IDbProjectCatalogCreate[], +): Promise { + if (items.length === 0) { + return + } + + const values = items.map((item) => ({ + projectSlug: item.projectSlug, + repoName: item.repoName, + repoUrl: item.repoUrl, + criticalityScore: item.criticalityScore ?? null, + })) + + await qx.result( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + SELECT + v."projectSlug", + v."repoName", + v."repoUrl", + v."criticalityScore"::double precision, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectSlug" text, + "repoName" text, + "repoUrl" text, + "criticalityScore" double precision + ) + `, + { values: JSON.stringify(values) }, + ) +} + +export async function upsertProjectCatalog( + qx: QueryExecutor, + data: IDbProjectCatalogCreate, +): Promise { + return qx.selectOne( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + VALUES ( + $(projectSlug), + $(repoName), + $(repoUrl), + $(criticalityScore), + NOW(), + NOW() + ) + ON CONFLICT ("repoUrl") DO UPDATE SET + "projectSlug" = EXCLUDED."projectSlug", + "repoName" = EXCLUDED."repoName", + "criticalityScore" = EXCLUDED."criticalityScore", + "updatedAt" = NOW() + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + { + projectSlug: data.projectSlug, + repoName: data.repoName, + repoUrl: data.repoUrl, + criticalityScore: data.criticalityScore ?? null, + }, + ) +} + +export async function bulkUpsertProjectCatalog( + qx: QueryExecutor, + items: IDbProjectCatalogCreate[], +): Promise { + if (items.length === 0) { + return + } + + const values = items.map((item) => ({ + projectSlug: item.projectSlug, + repoName: item.repoName, + repoUrl: item.repoUrl, + criticalityScore: item.criticalityScore ?? 
null, + })) + + await qx.result( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + SELECT + v."projectSlug", + v."repoName", + v."repoUrl", + v."criticalityScore"::double precision, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectSlug" text, + "repoName" text, + "repoUrl" text, + "criticalityScore" double precision + ) + ON CONFLICT ("repoUrl") DO UPDATE SET + "projectSlug" = EXCLUDED."projectSlug", + "repoName" = EXCLUDED."repoName", + "criticalityScore" = EXCLUDED."criticalityScore", + "updatedAt" = NOW() + `, + { values: JSON.stringify(values) }, + ) +} + +export async function updateProjectCatalog( + qx: QueryExecutor, + id: string, + data: IDbProjectCatalogUpdate, +): Promise { + const setClauses: string[] = [] + const params: Record = { id } + + if (data.projectSlug !== undefined) { + setClauses.push('"projectSlug" = $(projectSlug)') + params.projectSlug = data.projectSlug + } + if (data.repoName !== undefined) { + setClauses.push('"repoName" = $(repoName)') + params.repoName = data.repoName + } + if (data.repoUrl !== undefined) { + setClauses.push('"repoUrl" = $(repoUrl)') + params.repoUrl = data.repoUrl + } + if (data.criticalityScore !== undefined) { + setClauses.push('"criticalityScore" = $(criticalityScore)') + params.criticalityScore = data.criticalityScore + } + if (data.syncedAt !== undefined) { + setClauses.push('"syncedAt" = $(syncedAt)') + params.syncedAt = data.syncedAt + } + + if (setClauses.length === 0) { + return findProjectCatalogById(qx, id) + } + + return qx.selectOneOrNone( + ` + UPDATE "projectCatalog" + SET + ${setClauses.join(',\n ')}, + "updatedAt" = NOW() + WHERE id = $(id) + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + params, + ) +} + +export async function updateProjectCatalogSyncedAt(qx: QueryExecutor, id: string): Promise { + await qx.selectNone( + ` + UPDATE "projectCatalog" + SET "syncedAt" = 
NOW(), "updatedAt" = NOW() + WHERE id = $(id) + `, + { id }, + ) +} + +export async function deleteProjectCatalog(qx: QueryExecutor, id: string): Promise { + return qx.result( + ` + DELETE FROM "projectCatalog" + WHERE id = $(id) + `, + { id }, + ) +} diff --git a/services/libs/data-access-layer/src/project-catalog/types.ts b/services/libs/data-access-layer/src/project-catalog/types.ts new file mode 100644 index 0000000000..382527f57f --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/types.ts @@ -0,0 +1,23 @@ +export interface IDbProjectCatalog { + id: string + projectSlug: string + repoName: string + repoUrl: string + criticalityScore: number | null + syncedAt: string | null + createdAt: string | null + updatedAt: string | null +} + +type ProjectCatalogWritable = Pick< + IDbProjectCatalog, + 'projectSlug' | 'repoName' | 'repoUrl' | 'criticalityScore' +> + +export type IDbProjectCatalogCreate = Omit & { + criticalityScore?: number +} + +export type IDbProjectCatalogUpdate = Partial & { + syncedAt?: string +} From 69c9ab37462accd87d55230e86a99ea4415d2ab3 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 26 Mar 2026 09:25:39 +0100 Subject: [PATCH 09/33] fix: updated the types as the current db Signed-off-by: Umberto Sgueglia --- .../evaluated-projects/evaluatedProjects.ts | 2 +- .../src/evaluated-projects/types.ts | 45 ++++---------- .../src/project-catalog/projectCatalog.ts | 61 +++++++++++++------ .../src/project-catalog/types.ts | 13 ++-- 4 files changed, 64 insertions(+), 57 deletions(-) diff --git a/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts index caec6a72a1..2e078beb0d 100644 --- a/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts +++ b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts @@ -173,7 +173,7 @@ export async function bulkInsertEvaluatedProjects( 
projectCatalogId: item.projectCatalogId, evaluationStatus: item.evaluationStatus ?? 'pending', evaluationScore: item.evaluationScore ?? null, - evaluation: item.evaluation ? JSON.stringify(item.evaluation) : null, + evaluation: item.evaluation ?? null, evaluationReason: item.evaluationReason ?? null, starsCount: item.starsCount ?? null, forksCount: item.forksCount ?? null, diff --git a/services/libs/data-access-layer/src/evaluated-projects/types.ts b/services/libs/data-access-layer/src/evaluated-projects/types.ts index f8661f47a2..bb11eb5d65 100644 --- a/services/libs/data-access-layer/src/evaluated-projects/types.ts +++ b/services/libs/data-access-layer/src/evaluated-projects/types.ts @@ -19,51 +19,32 @@ export interface IDbEvaluatedProject { updatedAt: string | null } -type EvaluatedProjectWritable = Pick< - IDbEvaluatedProject, - | 'projectCatalogId' - | 'evaluationStatus' - | 'evaluationScore' - | 'evaluation' - | 'evaluationReason' - | 'evaluatedAt' - | 'starsCount' - | 'forksCount' - | 'commitsCount' - | 'pullRequestsCount' - | 'issuesCount' - | 'onboarded' - | 'onboardedAt' -> - -export type IDbEvaluatedProjectCreate = Omit & { +// onboarded/onboardedAt/evaluatedAt are excluded: they are managed by dedicated helpers +// (markEvaluatedProjectAsEvaluated, markEvaluatedProjectAsOnboarded) and never written on insert. 
+export type IDbEvaluatedProjectCreate = { projectCatalogId: string -} & { evaluationStatus?: EvaluationStatus evaluationScore?: number evaluation?: Record evaluationReason?: string - evaluatedAt?: string starsCount?: number forksCount?: number commitsCount?: number pullRequestsCount?: number issuesCount?: number - onboarded?: boolean - onboardedAt?: string } export type IDbEvaluatedProjectUpdate = Partial<{ evaluationStatus: EvaluationStatus - evaluationScore: number - evaluation: Record - evaluationReason: string - evaluatedAt: string - starsCount: number - forksCount: number - commitsCount: number - pullRequestsCount: number - issuesCount: number + evaluationScore: number | null + evaluation: Record | null + evaluationReason: string | null + evaluatedAt: string | null + starsCount: number | null + forksCount: number | null + commitsCount: number | null + pullRequestsCount: number | null + issuesCount: number | null onboarded: boolean - onboardedAt: string + onboardedAt: string | null }> diff --git a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts index 2e3b409579..5b94e2b8bc 100644 --- a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts +++ b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts @@ -8,7 +8,8 @@ const PROJECT_CATALOG_COLUMNS = [ 'projectSlug', 'repoName', 'repoUrl', - 'criticalityScore', + 'ossfCriticalityScore', + 'lfCriticalityScore', 'syncedAt', 'createdAt', 'updatedAt', @@ -95,7 +96,8 @@ export async function insertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -103,7 +105,8 @@ export async function insertProjectCatalog( $(projectSlug), $(repoName), $(repoUrl), - $(criticalityScore), + $(ossfCriticalityScore), + $(lfCriticalityScore), NOW(), NOW() ) @@ -113,7 +116,8 @@ export async function 
insertProjectCatalog( projectSlug: data.projectSlug, repoName: data.repoName, repoUrl: data.repoUrl, - criticalityScore: data.criticalityScore ?? null, + ossfCriticalityScore: data.ossfCriticalityScore ?? null, + lfCriticalityScore: data.lfCriticalityScore ?? null, }, ) } @@ -130,7 +134,8 @@ export async function bulkInsertProjectCatalog( projectSlug: item.projectSlug, repoName: item.repoName, repoUrl: item.repoUrl, - criticalityScore: item.criticalityScore ?? null, + ossfCriticalityScore: item.ossfCriticalityScore ?? null, + lfCriticalityScore: item.lfCriticalityScore ?? null, })) await qx.result( @@ -139,7 +144,8 @@ export async function bulkInsertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -147,14 +153,16 @@ export async function bulkInsertProjectCatalog( v."projectSlug", v."repoName", v."repoUrl", - v."criticalityScore"::double precision, + v."ossfCriticalityScore"::double precision, + v."lfCriticalityScore"::double precision, NOW(), NOW() FROM jsonb_to_recordset($(values)::jsonb) AS v( "projectSlug" text, "repoName" text, "repoUrl" text, - "criticalityScore" double precision + "ossfCriticalityScore" double precision, + "lfCriticalityScore" double precision ) `, { values: JSON.stringify(values) }, @@ -171,7 +179,8 @@ export async function upsertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -179,14 +188,16 @@ export async function upsertProjectCatalog( $(projectSlug), $(repoName), $(repoUrl), - $(criticalityScore), + $(ossfCriticalityScore), + $(lfCriticalityScore), NOW(), NOW() ) ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "criticalityScore" = EXCLUDED."criticalityScore", + "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", + "lfCriticalityScore" = 
EXCLUDED."lfCriticalityScore", "updatedAt" = NOW() RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} `, @@ -194,7 +205,8 @@ export async function upsertProjectCatalog( projectSlug: data.projectSlug, repoName: data.repoName, repoUrl: data.repoUrl, - criticalityScore: data.criticalityScore ?? null, + ossfCriticalityScore: data.ossfCriticalityScore ?? null, + lfCriticalityScore: data.lfCriticalityScore ?? null, }, ) } @@ -211,7 +223,8 @@ export async function bulkUpsertProjectCatalog( projectSlug: item.projectSlug, repoName: item.repoName, repoUrl: item.repoUrl, - criticalityScore: item.criticalityScore ?? null, + ossfCriticalityScore: item.ossfCriticalityScore ?? null, + lfCriticalityScore: item.lfCriticalityScore ?? null, })) await qx.result( @@ -220,7 +233,8 @@ export async function bulkUpsertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -228,19 +242,22 @@ export async function bulkUpsertProjectCatalog( v."projectSlug", v."repoName", v."repoUrl", - v."criticalityScore"::double precision, + v."ossfCriticalityScore"::double precision, + v."lfCriticalityScore"::double precision, NOW(), NOW() FROM jsonb_to_recordset($(values)::jsonb) AS v( "projectSlug" text, "repoName" text, "repoUrl" text, - "criticalityScore" double precision + "ossfCriticalityScore" double precision, + "lfCriticalityScore" double precision ) ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "criticalityScore" = EXCLUDED."criticalityScore", + "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", + "lfCriticalityScore" = EXCLUDED."lfCriticalityScore", "updatedAt" = NOW() `, { values: JSON.stringify(values) }, @@ -267,9 +284,13 @@ export async function updateProjectCatalog( setClauses.push('"repoUrl" = $(repoUrl)') params.repoUrl = data.repoUrl } - if (data.criticalityScore !== undefined) { - 
setClauses.push('"criticalityScore" = $(criticalityScore)') - params.criticalityScore = data.criticalityScore + if (data.ossfCriticalityScore !== undefined) { + setClauses.push('"ossfCriticalityScore" = $(ossfCriticalityScore)') + params.ossfCriticalityScore = data.ossfCriticalityScore + } + if (data.lfCriticalityScore !== undefined) { + setClauses.push('"lfCriticalityScore" = $(lfCriticalityScore)') + params.lfCriticalityScore = data.lfCriticalityScore } if (data.syncedAt !== undefined) { setClauses.push('"syncedAt" = $(syncedAt)') diff --git a/services/libs/data-access-layer/src/project-catalog/types.ts b/services/libs/data-access-layer/src/project-catalog/types.ts index 382527f57f..8cbb39a310 100644 --- a/services/libs/data-access-layer/src/project-catalog/types.ts +++ b/services/libs/data-access-layer/src/project-catalog/types.ts @@ -3,7 +3,8 @@ export interface IDbProjectCatalog { projectSlug: string repoName: string repoUrl: string - criticalityScore: number | null + ossfCriticalityScore: number | null + lfCriticalityScore: number | null syncedAt: string | null createdAt: string | null updatedAt: string | null @@ -11,11 +12,15 @@ export interface IDbProjectCatalog { type ProjectCatalogWritable = Pick< IDbProjectCatalog, - 'projectSlug' | 'repoName' | 'repoUrl' | 'criticalityScore' + 'projectSlug' | 'repoName' | 'repoUrl' | 'ossfCriticalityScore' | 'lfCriticalityScore' > -export type IDbProjectCatalogCreate = Omit & { - criticalityScore?: number +export type IDbProjectCatalogCreate = Omit< + ProjectCatalogWritable, + 'ossfCriticalityScore' | 'lfCriticalityScore' +> & { + ossfCriticalityScore?: number + lfCriticalityScore?: number } export type IDbProjectCatalogUpdate = Partial & { From 0ee8ef1dc254b254d8c24743df618ce51ef6b3e1 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 16:35:53 +0100 Subject: [PATCH 10/33] fix: push lock file Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) 
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ef062f9f07..1236281d03 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -16031,9 +16031,11 @@ snapshots: dependencies: ms: 2.1.3 - debug@4.4.0: + debug@4.4.0(supports-color@5.5.0): dependencies: ms: 2.1.3 + optionalDependencies: + supports-color: 5.5.0 debug@4.4.0(supports-color@5.5.0): dependencies: From 66019ea908e42bdab25146ce70f47535f0db25bf Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 10 Feb 2026 17:17:34 +0100 Subject: [PATCH 11/33] feat: schedule structure Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 3 + .../README.md | 73 ++++++++++++ .../package.json | 3 +- .../src/activities.ts | 4 +- .../src/activities/activities.ts | 108 +++++++++++++++++- .../src/main.ts | 2 +- .../schedules/scheduleProjectsDiscovery.ts | 9 +- .../ossf-criticality-score/bucketClient.ts | 86 ++++++++++++++ .../sources/ossf-criticality-score/source.ts | 75 ++++++++++++ .../src/sources/registry.ts | 19 +++ .../src/sources/types.ts | 21 ++++ .../src/workflows/discoverProjects.ts | 50 +++++++- 12 files changed, 438 insertions(+), 15 deletions(-) create mode 100644 services/apps/automatic_projects_discovery_worker/README.md create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/registry.ts create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/types.ts diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1236281d03..11a8cc872c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -566,6 +566,9 @@ importers: '@temporalio/workflow': specifier: ~1.11.8 version: 1.11.8 + csv-parse: + specifier: ^5.5.6 + version: 5.5.6 tsx: specifier: ^4.7.1 version: 4.7.3 diff --git a/services/apps/automatic_projects_discovery_worker/README.md 
b/services/apps/automatic_projects_discovery_worker/README.md new file mode 100644 index 0000000000..85162f9974 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/README.md @@ -0,0 +1,73 @@ +# Automatic Projects Discovery Worker + +Temporal worker that discovers open-source projects from external data sources and writes them to the `projectCatalog` table. + +## Architecture + +### Source abstraction + +Every data source implements the `IDiscoverySource` interface (`src/sources/types.ts`): + +| Method | Purpose | +|--------|---------| +| `listAvailableDatasets()` | Returns available dataset snapshots, sorted newest-first | +| `fetchDatasetStream(dataset)` | Returns a readable stream for the dataset (e.g. HTTP response) | +| `parseRow(rawRow)` | Converts a raw CSV/JSON row into a `IDiscoverySourceRow`, or `null` to skip | + +Sources are registered in `src/sources/registry.ts` as a simple name → factory map. + +**To add a new source:** create a class implementing `IDiscoverySource`, then add one line to the registry. + +### Current sources + +| Name | Folder | Description | +|------|--------|-------------| +| `ossf-criticality-score` | `src/sources/ossf-criticality-score/` | OSSF Criticality Score snapshots from a public GCS bucket (~750K repos per snapshot) | + +### Workflow + +``` +discoverProjects({ mode: 'incremental' | 'full' }) + │ + ├─ Activity: listDatasets(sourceName) + │ → returns dataset descriptors sorted newest-first + │ + ├─ Selection: incremental → latest only, full → all datasets + │ + └─ For each dataset: + └─ Activity: processDataset(sourceName, dataset) + → HTTP stream → csv-parse → batches of 5000 → bulkUpsertProjectCatalog +``` + +### Timeouts + +| Activity | startToCloseTimeout | retries | +|----------|-------------------|---------| +| `listDatasets` | 2 min | 3 | +| `processDataset` | 30 min | 3 | +| Workflow execution | 2 hours | 3 | + +### Schedule + +Runs daily via Temporal cron. 
The cron expression is currently hardcoded to `55 14 * * *` in `scheduleProjectsDiscovery.ts` (the former `CROWD_AUTOMATIC_PROJECTS_DISCOVERY_CRON` env-var override was removed).
+
+## File structure
+
+```
+src/
+├── main.ts                      # Service bootstrap (postgres enabled)
+├── activities.ts                # Barrel re-export
+├── workflows.ts                 # Barrel re-export
+├── activities/
+│   └── activities.ts            # listDatasets, processDataset
+├── workflows/
+│   └── discoverProjects.ts      # Orchestration with mode selection
+├── schedules/
+│   └── scheduleProjectsDiscovery.ts  # Temporal cron schedule
+└── sources/
+    ├── types.ts                 # IDiscoverySource, IDatasetDescriptor
+    ├── registry.ts              # Source factory map
+    └── ossf-criticality-score/
+        ├── source.ts            # IDiscoverySource implementation
+        └── bucketClient.ts      # GCS public bucket HTTP client
+```
diff --git a/services/apps/automatic_projects_discovery_worker/package.json b/services/apps/automatic_projects_discovery_worker/package.json
index 1c79505f89..022c1a6297 100644
--- a/services/apps/automatic_projects_discovery_worker/package.json
+++ b/services/apps/automatic_projects_discovery_worker/package.json
@@ -2,7 +2,7 @@
   "name": "@crowd/automatic-projects-discovery-worker",
   "scripts": {
     "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx src/main.ts",
-    "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts",
+    "start:debug:local": "set -a && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx --inspect=0.0.0.0:9232 src/main.ts", "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", @@ -24,6 +24,7 @@ "@temporalio/activity": "~1.11.8", "@temporalio/client": "~1.11.8", "@temporalio/workflow": "~1.11.8", + "csv-parse": "^5.5.6", "tsx": "^4.7.1", "typescript": "^5.6.3" }, diff --git a/services/apps/automatic_projects_discovery_worker/src/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities.ts index 3662234550..1718218b3e 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities.ts @@ -1 +1,3 @@ -export * from './activities/activities' +import { listDatasets, processDataset } from './activities/activities' + +export { listDatasets, processDataset } diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index 3aea7f8200..bc337a3516 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -1,7 +1,111 @@ +import { parse } from 'csv-parse' + +import { bulkUpsertProjectCatalog } from '@crowd/data-access-layer' +import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' +import { IDbProjectCatalogCreate } from '@crowd/data-access-layer/src/project-catalog/types' import { getServiceLogger } from '@crowd/logging' +import { svc } from '../main' +import { 
getSource } from '../sources/registry' +import { IDatasetDescriptor, IDiscoverySourceRow } from '../sources/types' + const log = getServiceLogger() -export async function logDiscoveryRun(): Promise { - log.info('Automatic projects discovery workflow executed successfully.') +const BATCH_SIZE = 5000 + +export async function listDatasets(sourceName: string): Promise { + const source = getSource(sourceName) + const datasets = await source.listAvailableDatasets() + + log.info({ sourceName, count: datasets.length, newest: datasets[0]?.id }, 'Datasets listed.') + + return datasets +} + +export async function processDataset( + sourceName: string, + dataset: IDatasetDescriptor, +): Promise { + const source = getSource(sourceName) + const qx = pgpQx(svc.postgres.writer.connection()) + const startTime = Date.now() + + log.info({ sourceName, datasetId: dataset.id, url: dataset.url }, 'Processing dataset...') + + // We use streaming (not full download) because each CSV is ~119MB / ~750K rows. + // Streaming keeps memory usage low (only one batch in memory at a time) and leverages + // Node.js backpressure: if DB writes are slow, the HTTP stream pauses automatically. + const httpStream = await source.fetchDatasetStream(dataset) + + // Pipe the raw HTTP response directly into csv-parse. 
+  // Data flows as: HTTP response → csv-parse → for-await → batch → DB
+  const parser = httpStream.pipe(
+    parse({
+      columns: true,
+      skip_empty_lines: true,
+      trim: true,
+    }),
+  )
+
+  parser.on('error', (err) => {
+    log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.')
+  })
+
+  httpStream.on('error', (err: Error) => {
+    log.error({ datasetId: dataset.id, error: err.message }, 'HTTP stream error.')
+  })
+
+  let batch: IDbProjectCatalogCreate[] = []
+  let totalProcessed = 0
+  let totalSkipped = 0
+  let batchNumber = 0
+  let totalRows = 0
+
+  for await (const rawRow of parser) {
+    totalRows++
+
+    const parsed: IDiscoverySourceRow | null = source.parseRow(rawRow)
+    if (!parsed) {
+      totalSkipped++
+      continue
+    }
+
+    batch.push({
+      projectSlug: parsed.projectSlug,
+      repoName: parsed.repoName,
+      repoUrl: parsed.repoUrl,
+      ossfCriticalityScore: parsed.criticalityScore,
+    })
+
+    if (batch.length >= BATCH_SIZE) {
+      batchNumber++
+      await bulkUpsertProjectCatalog(qx, batch)
+      totalProcessed += batch.length
+      batch = []
+
+      log.info({ totalProcessed, batchNumber, datasetId: dataset.id }, 'Batch upserted.')
+    }
+  }
+
+  // Flush remaining rows that didn't fill a complete batch
+  if (batch.length > 0) {
+    batchNumber++
+    await bulkUpsertProjectCatalog(qx, batch)
+    totalProcessed += batch.length
+  }
+
+  const elapsedSeconds = ((Date.now() - startTime) / 1000).toFixed(1)
+
+  log.info(
+    {
+      sourceName,
+      datasetId: dataset.id,
+      totalRows,
+      totalProcessed,
+      totalSkipped,
+      totalBatches: batchNumber,
+      elapsedSeconds,
+    },
+    'Dataset processing complete.',
+  )
 }
diff --git a/services/apps/automatic_projects_discovery_worker/src/main.ts b/services/apps/automatic_projects_discovery_worker/src/main.ts
index 326c3a361a..0345c420f8 100644
--- a/services/apps/automatic_projects_discovery_worker/src/main.ts
+++ b/services/apps/automatic_projects_discovery_worker/src/main.ts
@@ -18,7 +18,7 @@ const config: Config = {
 const options: Options = {
   postgres: {
-    enabled: 
false, + enabled: true, }, opensearch: { enabled: false, diff --git a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts index 847c2e4ce9..3366470d75 100644 --- a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -3,18 +3,15 @@ import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/clien import { svc } from '../main' import { discoverProjects } from '../workflows' -const DEFAULT_CRON = '0 2 * * *' // Daily at 2:00 AM - export const scheduleProjectsDiscovery = async () => { - const cronExpression = process.env.CROWD_AUTOMATIC_PROJECTS_DISCOVERY_CRON || DEFAULT_CRON - svc.log.info(`Scheduling projects discovery with cron: ${cronExpression}`) + svc.log.info(`Scheduling projects discovery`) try { await svc.temporal.schedule.create({ scheduleId: 'automaticProjectsDiscovery', spec: { - cronExpressions: [cronExpression], + cronExpressions: ['55 14 * * *'], }, policies: { overlap: ScheduleOverlapPolicy.SKIP, @@ -24,6 +21,8 @@ export const scheduleProjectsDiscovery = async () => { type: 'startWorkflow', workflowType: discoverProjects, taskQueue: 'automatic-projects-discovery', + args: [{ mode: 'full' as const }], + workflowExecutionTimeout: '2 hours', retry: { initialInterval: '15 seconds', backoffCoefficient: 2, diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts new file mode 100644 index 0000000000..7d6cb7f561 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts @@ -0,0 +1,86 @@ +import https from 'https' + +const BUCKET_URL = 
'https://commondatastorage.googleapis.com/ossf-criticality-score' + +function httpsGet(url: string): Promise { + return new Promise((resolve, reject) => { + https + .get(url, (res) => { + if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { + httpsGet(res.headers.location).then(resolve, reject) + return + } + + if (res.statusCode && (res.statusCode < 200 || res.statusCode >= 300)) { + reject(new Error(`HTTP ${res.statusCode} for ${url}`)) + return + } + + const chunks: Uint8Array[] = [] + res.on('data', (chunk: Uint8Array) => chunks.push(chunk)) + res.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8'))) + res.on('error', reject) + }) + .on('error', reject) + }) +} + +function extractPrefixes(xml: string): string[] { + const prefixes: string[] = [] + const regex = /([^<]+)<\/Prefix>/g + let match: RegExpExecArray | null + + while ((match = regex.exec(xml)) !== null) { + prefixes.push(match[1]) + } + + return prefixes +} + +/** + * List all date prefixes in the OSSF Criticality Score bucket. + * Returns prefixes like ['2024.07.01/', '2024.07.08/', ...] + */ +export async function listDatePrefixes(): Promise { + const xml = await httpsGet(`${BUCKET_URL}?delimiter=/`) + return extractPrefixes(xml).filter((p) => /^\d{4}\.\d{2}\.\d{2}\/$/.test(p)) +} + +/** + * List time sub-prefixes for a given date prefix. + * E.g., for '2024.07.01/' returns ['2024.07.01/060102/', ...] + */ +export async function listTimePrefixes(datePrefix: string): Promise { + const xml = await httpsGet(`${BUCKET_URL}?prefix=${encodeURIComponent(datePrefix)}&delimiter=/`) + return extractPrefixes(xml).filter((p) => p !== datePrefix) +} + +/** + * Build the full URL for the all.csv file within a given dataset prefix. + */ +export function buildDatasetUrl(prefix: string): string { + return `${BUCKET_URL}/${prefix}all.csv` +} + +/** + * Get an HTTPS readable stream for a given URL. 
+ */ +export function getHttpsStream(url: string): Promise { + return new Promise((resolve, reject) => { + https + .get(url, (res) => { + if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { + getHttpsStream(res.headers.location).then(resolve, reject) + return + } + + if (res.statusCode && (res.statusCode < 200 || res.statusCode >= 300)) { + reject(new Error(`HTTP ${res.statusCode} for ${url}`)) + return + } + + resolve(res) + }) + .on('error', reject) + }) +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts new file mode 100644 index 0000000000..9b3338b867 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts @@ -0,0 +1,75 @@ +import { Readable } from 'stream' + +import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types' + +import { buildDatasetUrl, getHttpsStream, listDatePrefixes, listTimePrefixes } from './bucketClient' + +export class OssfCriticalityScoreSource implements IDiscoverySource { + public readonly name = 'ossf-criticality-score' + + async listAvailableDatasets(): Promise { + const datePrefixes = await listDatePrefixes() + + const datasets: IDatasetDescriptor[] = [] + + for (const datePrefix of datePrefixes) { + const timePrefixes = await listTimePrefixes(datePrefix) + + for (const timePrefix of timePrefixes) { + const date = datePrefix.replace(/\/$/, '') + const url = buildDatasetUrl(timePrefix) + + datasets.push({ + id: timePrefix.replace(/\/$/, ''), + date, + url, + }) + } + } + + // Sort newest-first by date + datasets.sort((a, b) => b.date.localeCompare(a.date)) + + return datasets + } + + async fetchDatasetStream(dataset: IDatasetDescriptor): Promise { + const stream = await getHttpsStream(dataset.url) + return stream as Readable + } + + // CSV columns use dot 
notation (e.g. "repo.url", "default_score") + parseRow(rawRow: Record): IDiscoverySourceRow | null { + const repoUrl = rawRow['repo.url'] + if (!repoUrl) { + return null + } + + let repoName = '' + let projectSlug = '' + + try { + const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '') + projectSlug = urlPath + repoName = urlPath.split('/').pop() || '' + } catch { + const parts = repoUrl.replace(/\/$/, '').split('/') + projectSlug = parts.slice(-2).join('/') + repoName = parts.pop() || '' + } + + if (!projectSlug || !repoName) { + return null + } + + const criticalityScoreRaw = rawRow['default_score'] + const criticalityScore = criticalityScoreRaw ? parseFloat(criticalityScoreRaw) : undefined + + return { + projectSlug, + repoName, + repoUrl, + criticalityScore: Number.isNaN(criticalityScore) ? undefined : criticalityScore, + } + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts new file mode 100644 index 0000000000..0d05783c71 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts @@ -0,0 +1,19 @@ +import { IDiscoverySource } from './types' +import { OssfCriticalityScoreSource } from './ossf-criticality-score/source' + +// To add a new source: instantiate it here. +const sources: IDiscoverySource[] = [ + new OssfCriticalityScoreSource(), +] + +export function getSource(name: string): IDiscoverySource { + const source = sources.find((s) => s.name === name) + if (!source) { + throw new Error(`Unknown source: ${name}. 
Available: ${sources.map((s) => s.name).join(', ')}`) + } + return source +} + +export function getAvailableSourceNames(): string[] { + return sources.map((s) => s.name) +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/types.ts b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts new file mode 100644 index 0000000000..c2b30afa83 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts @@ -0,0 +1,21 @@ +import { Readable } from 'stream' + +export interface IDatasetDescriptor { + id: string + date: string + url: string +} + +export interface IDiscoverySource { + name: string + listAvailableDatasets(): Promise + fetchDatasetStream(dataset: IDatasetDescriptor): Promise + parseRow(rawRow: Record): IDiscoverySourceRow | null +} + +export interface IDiscoverySourceRow { + projectSlug: string + repoName: string + repoUrl: string + criticalityScore?: number +} diff --git a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts index f43a9b5a12..17b8706e89 100644 --- a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts +++ b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts @@ -1,11 +1,51 @@ -import { proxyActivities } from '@temporalio/workflow' +import { log, proxyActivities } from '@temporalio/workflow' import type * as activities from '../activities' -const activity = proxyActivities({ - startToCloseTimeout: '1 minutes', +const listActivities = proxyActivities({ + startToCloseTimeout: '2 minutes', + retry: { + maximumAttempts: 3, + }, }) -export async function discoverProjects(): Promise { - await activity.logDiscoveryRun() +// processDataset is long-running (10-20 min for ~119MB / ~750K rows). 
+const processActivities = proxyActivities({ + startToCloseTimeout: '30 minutes', + retry: { + maximumAttempts: 3, + }, +}) + +export interface DiscoverProjectsInput { + mode: 'incremental' | 'full' +} + +export async function discoverProjects( + input: DiscoverProjectsInput = { mode: 'incremental' }, +): Promise { + const sourceName = 'ossf-criticality-score' + const { mode } = input + + const allDatasets = await listActivities.listDatasets(sourceName) + + if (allDatasets.length === 0) { + log.warn('No datasets found. Nothing to process.') + return + } + + // allDatasets is sorted newest-first. + // Incremental: process only the latest snapshot. + // Full: process oldest-first so the newest data wins the final upsert. + const datasets = mode === 'incremental' ? [allDatasets[0]] : [...allDatasets].reverse() + + log.info(`mode=${mode}, ${datasets.length}/${allDatasets.length} datasets to process.`) + + for (let i = 0; i < datasets.length; i++) { + const dataset = datasets[i] + log.info(`Processing dataset ${i + 1}/${datasets.length}: ${dataset.id}`) + await processActivities.processDataset(sourceName, dataset) + } + + log.info(`Done. 
Processed ${datasets.length} dataset(s).`) } From 99e6320ed6bc491db1a0d3c41db0f06ab731cbe9 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 10 Feb 2026 17:23:21 +0100 Subject: [PATCH 12/33] fix: lint Signed-off-by: Umberto Sgueglia --- .../src/activities/activities.ts | 2 +- .../src/schedules/scheduleProjectsDiscovery.ts | 1 - .../sources/ossf-criticality-score/bucketClient.ts | 14 ++++++++++++-- .../src/sources/registry.ts | 6 ++---- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index bc337a3516..98176c1745 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -1,8 +1,8 @@ import { parse } from 'csv-parse' import { bulkUpsertProjectCatalog } from '@crowd/data-access-layer' -import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' import { IDbProjectCatalogCreate } from '@crowd/data-access-layer/src/project-catalog/types' +import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceLogger } from '@crowd/logging' import { svc } from '../main' diff --git a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts index 3366470d75..11a3801ef6 100644 --- a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -4,7 +4,6 @@ import { svc } from '../main' import { discoverProjects } from '../workflows' export const scheduleProjectsDiscovery = async () => { - svc.log.info(`Scheduling projects discovery`) try { diff --git 
a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts index 7d6cb7f561..71b2066ae7 100644 --- a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts @@ -6,7 +6,12 @@ function httpsGet(url: string): Promise { return new Promise((resolve, reject) => { https .get(url, (res) => { - if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { + if ( + res.statusCode && + res.statusCode >= 300 && + res.statusCode < 400 && + res.headers.location + ) { httpsGet(res.headers.location).then(resolve, reject) return } @@ -69,7 +74,12 @@ export function getHttpsStream(url: string): Promise { return new Promise((resolve, reject) => { https .get(url, (res) => { - if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) { + if ( + res.statusCode && + res.statusCode >= 300 && + res.statusCode < 400 && + res.headers.location + ) { getHttpsStream(res.headers.location).then(resolve, reject) return } diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts index 0d05783c71..7c8796094f 100644 --- a/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts +++ b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts @@ -1,10 +1,8 @@ -import { IDiscoverySource } from './types' import { OssfCriticalityScoreSource } from './ossf-criticality-score/source' +import { IDiscoverySource } from './types' // To add a new source: instantiate it here. 
-const sources: IDiscoverySource[] = [ - new OssfCriticalityScoreSource(), -] +const sources: IDiscoverySource[] = [new OssfCriticalityScoreSource()] export function getSource(name: string): IDiscoverySource { const source = sources.find((s) => s.name === name) From e2c144f158e67b84a7c2dc40492e17d5b2fa0bb4 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 10 Feb 2026 17:52:51 +0100 Subject: [PATCH 13/33] fix: lint Signed-off-by: Umberto Sgueglia --- .../README.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/services/apps/automatic_projects_discovery_worker/README.md b/services/apps/automatic_projects_discovery_worker/README.md index 85162f9974..ff4ce16302 100644 --- a/services/apps/automatic_projects_discovery_worker/README.md +++ b/services/apps/automatic_projects_discovery_worker/README.md @@ -8,11 +8,11 @@ Temporal worker that discovers open-source projects from external data sources a Every data source implements the `IDiscoverySource` interface (`src/sources/types.ts`): -| Method | Purpose | -|--------|---------| -| `listAvailableDatasets()` | Returns available dataset snapshots, sorted newest-first | -| `fetchDatasetStream(dataset)` | Returns a readable stream for the dataset (e.g. HTTP response) | -| `parseRow(rawRow)` | Converts a raw CSV/JSON row into a `IDiscoverySourceRow`, or `null` to skip | +| Method | Purpose | +| ----------------------------- | --------------------------------------------------------------------------- | +| `listAvailableDatasets()` | Returns available dataset snapshots, sorted newest-first | +| `fetchDatasetStream(dataset)` | Returns a readable stream for the dataset (e.g. HTTP response) | +| `parseRow(rawRow)` | Converts a raw CSV/JSON row into a `IDiscoverySourceRow`, or `null` to skip | Sources are registered in `src/sources/registry.ts` as a simple name → factory map. 
@@ -20,8 +20,8 @@ Sources are registered in `src/sources/registry.ts` as a simple name → factory ### Current sources -| Name | Folder | Description | -|------|--------|-------------| +| Name | Folder | Description | +| ------------------------ | ------------------------------------- | ------------------------------------------------------------------------------------ | | `ossf-criticality-score` | `src/sources/ossf-criticality-score/` | OSSF Criticality Score snapshots from a public GCS bucket (~750K repos per snapshot) | ### Workflow @@ -41,11 +41,11 @@ discoverProjects({ mode: 'incremental' | 'full' }) ### Timeouts -| Activity | startToCloseTimeout | retries | -|----------|-------------------|---------| -| `listDatasets` | 2 min | 3 | -| `processDataset` | 30 min | 3 | -| Workflow execution | 2 hours | 3 | +| Activity | startToCloseTimeout | retries | +| ------------------ | ------------------- | ------- | +| `listDatasets` | 2 min | 3 | +| `processDataset` | 30 min | 3 | +| Workflow execution | 2 hours | 3 | ### Schedule From f0123d616e39ca9949470fbb01363b35abd8c51b Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 11 Feb 2026 10:37:40 +0100 Subject: [PATCH 14/33] fix: format Signed-off-by: Umberto Sgueglia --- .../src/activities/activities.ts | 14 ++++++++------ .../src/workflows/discoverProjects.ts | 17 +++++------------ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index 98176c1745..fc82cadea5 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -26,17 +26,22 @@ export async function processDataset( sourceName: string, dataset: IDatasetDescriptor, ): Promise { - const source = getSource(sourceName) const qx = pgpQx(svc.postgres.writer.connection()) 
const startTime = Date.now() log.info({ sourceName, datasetId: dataset.id, url: dataset.url }, 'Processing dataset...') + const source = getSource(sourceName) + // We use streaming (not full download) because each CSV is ~119MB / ~750K rows. // Streaming keeps memory usage low (only one batch in memory at a time) and leverages // Node.js backpressure: if DB writes are slow, the HTTP stream pauses automatically. const httpStream = await source.fetchDatasetStream(dataset) + httpStream.on('error', (err: Error) => { + log.error({ datasetId: dataset.id, error: err.message }, 'HTTP stream error.') + }) + // Pipe the raw HTTP response directly into csv-parse. // Data flows as: HTTP response → csv-parse → for-await → batch → DB const parser = httpStream.pipe( @@ -51,10 +56,6 @@ export async function processDataset( log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.') }) - httpStream.on('error', (err: Error) => { - log.error({ datasetId: dataset.id, error: err.message }, 'HTTP stream error.') - }) - let batch: IDbProjectCatalogCreate[] = [] let totalProcessed = 0 let totalSkipped = 0 @@ -64,7 +65,7 @@ export async function processDataset( for await (const rawRow of parser) { totalRows++ - const parsed: IDiscoverySourceRow | null = source.parseRow(rawRow) + const parsed = source.parseRow(rawRow) if (!parsed) { totalSkipped++ continue @@ -79,6 +80,7 @@ export async function processDataset( if (batch.length >= BATCH_SIZE) { batchNumber++ + await bulkUpsertProjectCatalog(qx, batch) totalProcessed += batch.length batch = [] diff --git a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts index 17b8706e89..6e9893949b 100644 --- a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts +++ b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts @@ -4,27 +4,20 @@ import type * as 
activities from '../activities' const listActivities = proxyActivities({ startToCloseTimeout: '2 minutes', - retry: { - maximumAttempts: 3, - }, + retry: { maximumAttempts: 3 }, }) // processDataset is long-running (10-20 min for ~119MB / ~750K rows). const processActivities = proxyActivities({ startToCloseTimeout: '30 minutes', - retry: { - maximumAttempts: 3, - }, + retry: { maximumAttempts: 3 }, }) -export interface DiscoverProjectsInput { - mode: 'incremental' | 'full' -} - export async function discoverProjects( - input: DiscoverProjectsInput = { mode: 'incremental' }, + input: { mode: 'incremental' | 'full' } = { mode: 'incremental' }, ): Promise { const sourceName = 'ossf-criticality-score' + const { mode } = input const allDatasets = await listActivities.listDatasets(sourceName) @@ -34,7 +27,7 @@ export async function discoverProjects( return } - // allDatasets is sorted newest-first. + // allDatasets is sorted newest-first, that is the reason we need the .reverse(). // Incremental: process only the latest snapshot. // Full: process oldest-first so the newest data wins the final upsert. const datasets = mode === 'incremental' ? 
[allDatasets[0]] : [...allDatasets].reverse() From ad1e4cf0eb775f81c14b612160578f6ff1e11e20 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 11 Feb 2026 10:42:20 +0100 Subject: [PATCH 15/33] fix: lint Signed-off-by: Umberto Sgueglia --- .../src/activities/activities.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index fc82cadea5..54ce9feb64 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -7,7 +7,7 @@ import { getServiceLogger } from '@crowd/logging' import { svc } from '../main' import { getSource } from '../sources/registry' -import { IDatasetDescriptor, IDiscoverySourceRow } from '../sources/types' +import { IDatasetDescriptor } from '../sources/types' const log = getServiceLogger() From 41c7a11b260a860aca3a45e8e45c20df34f8a933 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 11 Feb 2026 10:49:30 +0100 Subject: [PATCH 16/33] fix: update cron expression Signed-off-by: Umberto Sgueglia --- .../src/schedules/scheduleProjectsDiscovery.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts index 11a3801ef6..f3b29a2d7a 100644 --- a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -10,7 +10,8 @@ export const scheduleProjectsDiscovery = async () => { await svc.temporal.schedule.create({ scheduleId: 'automaticProjectsDiscovery', spec: { - cronExpressions: ['55 14 * * *'], + // Run every day at 
midnight + cronExpressions: ['0 0 * * *'], }, policies: { overlap: ScheduleOverlapPolicy.SKIP, From 151e228ea579d35e5605bf40d722c6047ae2e03e Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 11 Feb 2026 11:32:41 +0100 Subject: [PATCH 17/33] feat: mode incremental Signed-off-by: Umberto Sgueglia --- .../src/schedules/scheduleProjectsDiscovery.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts index f3b29a2d7a..b173126a78 100644 --- a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -21,7 +21,7 @@ export const scheduleProjectsDiscovery = async () => { type: 'startWorkflow', workflowType: discoverProjects, taskQueue: 'automatic-projects-discovery', - args: [{ mode: 'full' as const }], + args: [{ mode: 'incremental' as const }], workflowExecutionTimeout: '2 hours', retry: { initialInterval: '15 seconds', From 6a79d0e608576f38f664e3ca64a1e670550de05e Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 11 Feb 2026 12:52:36 +0100 Subject: [PATCH 18/33] fix: update readme Signed-off-by: Umberto Sgueglia --- services/apps/automatic_projects_discovery_worker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/automatic_projects_discovery_worker/README.md b/services/apps/automatic_projects_discovery_worker/README.md index ff4ce16302..77623513cc 100644 --- a/services/apps/automatic_projects_discovery_worker/README.md +++ b/services/apps/automatic_projects_discovery_worker/README.md @@ -49,7 +49,7 @@ discoverProjects({ mode: 'incremental' | 'full' }) ### Schedule -Runs daily via Temporal cron. 
The cron expression can be overridden with the `CROWD_AUTOMATIC_PROJECTS_DISCOVERY_CRON` env var. +Runs daily at midnight via Temporal cron (`0 0 * * *`). ## File structure From 30313f85e2a3f0c872dceaba612af545d69ed493 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Fri, 20 Feb 2026 16:31:56 +0100 Subject: [PATCH 19/33] fix: add new source Signed-off-by: Umberto Sgueglia --- .../src/activities.ts | 4 +- .../src/activities/activities.ts | 54 ++--- .../sources/lf-criticality-score/source.ts | 197 ++++++++++++++++++ .../sources/ossf-criticality-score/source.ts | 10 +- .../src/sources/registry.ts | 6 +- .../src/sources/types.ts | 10 +- .../src/workflows/discoverProjects.ts | 40 ++-- .../src/project-catalog/projectCatalog.ts | 8 +- 8 files changed, 273 insertions(+), 56 deletions(-) create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts diff --git a/services/apps/automatic_projects_discovery_worker/src/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities.ts index 1718218b3e..a2c8cf9935 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities.ts @@ -1,3 +1,3 @@ -import { listDatasets, processDataset } from './activities/activities' +import { listDatasets, listSources, processDataset } from './activities/activities' -export { listDatasets, processDataset } +export { listDatasets, listSources, processDataset } diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index 54ce9feb64..d4a939fe0c 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -6,13 +6,17 @@ import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceLogger 
} from '@crowd/logging' import { svc } from '../main' -import { getSource } from '../sources/registry' +import { getAvailableSourceNames, getSource } from '../sources/registry' import { IDatasetDescriptor } from '../sources/types' const log = getServiceLogger() const BATCH_SIZE = 5000 +export async function listSources(): Promise { + return getAvailableSourceNames() +} + export async function listDatasets(sourceName: string): Promise { const source = getSource(sourceName) const datasets = await source.listAvailableDatasets() @@ -32,29 +36,30 @@ export async function processDataset( log.info({ sourceName, datasetId: dataset.id, url: dataset.url }, 'Processing dataset...') const source = getSource(sourceName) + const stream = await source.fetchDatasetStream(dataset) - // We use streaming (not full download) because each CSV is ~119MB / ~750K rows. - // Streaming keeps memory usage low (only one batch in memory at a time) and leverages - // Node.js backpressure: if DB writes are slow, the HTTP stream pauses automatically. - const httpStream = await source.fetchDatasetStream(dataset) - - httpStream.on('error', (err: Error) => { - log.error({ datasetId: dataset.id, error: err.message }, 'HTTP stream error.') + stream.on('error', (err: Error) => { + log.error({ datasetId: dataset.id, error: err.message }, 'Stream error.') }) - // Pipe the raw HTTP response directly into csv-parse. - // Data flows as: HTTP response → csv-parse → for-await → batch → DB - const parser = httpStream.pipe( - parse({ - columns: true, - skip_empty_lines: true, - trim: true, - }), - ) - - parser.on('error', (err) => { - log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.') - }) + // For CSV sources: pipe through csv-parse to get Record objects. + // For JSON sources: the stream already emits pre-parsed objects in object mode. + const records = + source.format === 'json' + ? 
stream + : stream.pipe( + parse({ + columns: true, + skip_empty_lines: true, + trim: true, + }), + ) + + if (source.format !== 'json') { + ;(records as ReturnType).on('error', (err) => { + log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.') + }) + } let batch: IDbProjectCatalogCreate[] = [] let totalProcessed = 0 @@ -62,10 +67,10 @@ export async function processDataset( let batchNumber = 0 let totalRows = 0 - for await (const rawRow of parser) { + for await (const rawRow of records) { totalRows++ - const parsed = source.parseRow(rawRow) + const parsed = source.parseRow(rawRow as Record) if (!parsed) { totalSkipped++ continue @@ -75,7 +80,8 @@ export async function processDataset( projectSlug: parsed.projectSlug, repoName: parsed.repoName, repoUrl: parsed.repoUrl, - criticalityScore: parsed.criticalityScore, + ossfCriticalityScore: parsed.ossfCriticalityScore, + lfCriticalityScore: parsed.lfCriticalityScore, }) if (batch.length >= BATCH_SIZE) { diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts new file mode 100644 index 0000000000..effe8ea6c9 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts @@ -0,0 +1,197 @@ +import http from 'http' +import https from 'https' +import { Readable } from 'stream' + +import { getServiceLogger } from '@crowd/logging' + +import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types' + +const log = getServiceLogger() + +const DEFAULT_API_URL = 'https://hypervascular-nonduplicative-vern.ngrok-free.dev' +const PAGE_SIZE = 100 + +interface LfApiResponse { + page: number + pageSize: number + total: number + totalPages: number + data: LfApiRow[] +} + +interface LfApiRow { + runDate: string + repoUrl: string + owner: string + repoName: string + contributors: number + organizations: 
number + sizeSloc: number + lastUpdated: number + age: number + commitFreq: number + score: number +} + +function getApiBaseUrl(): string { + return (process.env.LF_CRITICALITY_SCORE_API_URL ?? DEFAULT_API_URL).replace(/\/$/, '') +} + +async function fetchPage( + baseUrl: string, + startDate: string, + endDate: string, + page: number, +): Promise { + const url = `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}&page=${page}&pageSize=${PAGE_SIZE}` + + return new Promise((resolve, reject) => { + const client = url.startsWith('https://') ? https : http + + const req = client.get(url, (res) => { + if (res.statusCode !== 200) { + reject(new Error(`LF Criticality Score API returned status ${res.statusCode} for ${url}`)) + res.resume() + return + } + + const chunks: Uint8Array[] = [] + res.on('data', (chunk: Uint8Array) => chunks.push(chunk)) + res.on('end', () => { + try { + resolve(JSON.parse(Buffer.concat(chunks).toString('utf8')) as LfApiResponse) + } catch (err) { + reject(new Error(`Failed to parse LF Criticality Score API response: ${err}`)) + } + }) + res.on('error', reject) + }) + + req.on('error', reject) + req.end() + }) +} + +/** + * Generates the first day and last day of a given month. + * monthOffset = 0 → current month, -1 → previous month, etc. 
+ */ +function monthRange(monthOffset: number): { startDate: string; endDate: string } { + const now = new Date() + const year = now.getUTCFullYear() + const month = now.getUTCMonth() + monthOffset // can be negative; Date handles rollover + + const first = new Date(Date.UTC(year, month, 1)) + const last = new Date(Date.UTC(year, month + 1, 0)) // last day of month + + const pad = (n: number) => String(n).padStart(2, '0') + const fmt = (d: Date) => + `${d.getUTCFullYear()}-${pad(d.getUTCMonth() + 1)}-${pad(d.getUTCDate())}` + + return { startDate: fmt(first), endDate: fmt(last) } +} + +export class LfCriticalityScoreSource implements IDiscoverySource { + public readonly name = 'lf-criticality-score' + public readonly format = 'json' as const + + async listAvailableDatasets(): Promise { + const baseUrl = getApiBaseUrl() + + // Return one dataset per month for the last 12 months (newest first) + const datasets: IDatasetDescriptor[] = [] + + for (let offset = 0; offset >= -11; offset--) { + const { startDate, endDate } = monthRange(offset) + const id = startDate.slice(0, 7) // e.g. "2026-02" + + datasets.push({ + id, + date: startDate, + url: `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}`, + }) + } + + return datasets + } + + /** + * Returns an object-mode Readable that fetches all pages from the API + * and pushes each row as a plain object. Activities.ts iterates this + * directly (no csv-parse) because format === 'json'. + */ + async fetchDatasetStream(dataset: IDatasetDescriptor): Promise { + const baseUrl = getApiBaseUrl() + + // Extract startDate and endDate from the stored URL + const parsed = new URL(dataset.url) + const startDate = parsed.searchParams.get('startDate') ?? '' + const endDate = parsed.searchParams.get('endDate') ?? 
'' + + const stream = new Readable({ objectMode: true, read() {} }) + + // Fetch pages asynchronously and push rows into the stream + ;(async () => { + try { + let page = 1 + let totalPages = 1 + + do { + const response = await fetchPage(baseUrl, startDate, endDate, page) + totalPages = response.totalPages + + for (const row of response.data) { + stream.push(row) + } + + log.debug( + { datasetId: dataset.id, page, totalPages, rowsInPage: response.data.length }, + 'LF Criticality Score page fetched.', + ) + + page++ + } while (page <= totalPages) + + stream.push(null) // signal end of stream + } catch (err) { + stream.destroy(err instanceof Error ? err : new Error(String(err))) + } + })() + + return stream + } + + parseRow(rawRow: Record): IDiscoverySourceRow | null { + const repoUrl = rawRow['repoUrl'] as string | undefined + if (!repoUrl) { + return null + } + + let repoName = '' + let projectSlug = '' + + try { + const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '') + projectSlug = urlPath + repoName = urlPath.split('/').pop() || '' + } catch { + const parts = repoUrl.replace(/\/$/, '').split('/') + projectSlug = parts.slice(-2).join('/') + repoName = parts.pop() || '' + } + + if (!projectSlug || !repoName) { + return null + } + + const score = rawRow['score'] + const lfCriticalityScore = typeof score === 'number' ? score : parseFloat(score as string) + + return { + projectSlug, + repoName, + repoUrl, + lfCriticalityScore: Number.isNaN(lfCriticalityScore) ? 
undefined : lfCriticalityScore, + } + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts index 9b3338b867..8ee20fb602 100644 --- a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts @@ -39,8 +39,8 @@ export class OssfCriticalityScoreSource implements IDiscoverySource { } // CSV columns use dot notation (e.g. "repo.url", "default_score") - parseRow(rawRow: Record): IDiscoverySourceRow | null { - const repoUrl = rawRow['repo.url'] + parseRow(rawRow: Record): IDiscoverySourceRow | null { + const repoUrl = rawRow['repo.url'] as string | undefined if (!repoUrl) { return null } @@ -62,14 +62,14 @@ export class OssfCriticalityScoreSource implements IDiscoverySource { return null } - const criticalityScoreRaw = rawRow['default_score'] - const criticalityScore = criticalityScoreRaw ? parseFloat(criticalityScoreRaw) : undefined + const scoreRaw = rawRow['default_score'] + const ossfCriticalityScore = scoreRaw ? parseFloat(scoreRaw as string) : undefined return { projectSlug, repoName, repoUrl, - criticalityScore: Number.isNaN(criticalityScore) ? undefined : criticalityScore, + ossfCriticalityScore: Number.isNaN(ossfCriticalityScore) ? 
undefined : ossfCriticalityScore, } } } diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts index 7c8796094f..1c7af148a3 100644 --- a/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts +++ b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts @@ -1,8 +1,12 @@ +import { LfCriticalityScoreSource } from './lf-criticality-score/source' import { OssfCriticalityScoreSource } from './ossf-criticality-score/source' import { IDiscoverySource } from './types' // To add a new source: instantiate it here. -const sources: IDiscoverySource[] = [new OssfCriticalityScoreSource()] +const sources: IDiscoverySource[] = [ + new OssfCriticalityScoreSource(), + new LfCriticalityScoreSource(), +] export function getSource(name: string): IDiscoverySource { const source = sources.find((s) => s.name === name) diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/types.ts b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts index c2b30afa83..9b386b5da7 100644 --- a/services/apps/automatic_projects_discovery_worker/src/sources/types.ts +++ b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts @@ -8,14 +8,20 @@ export interface IDatasetDescriptor { export interface IDiscoverySource { name: string + /** + * 'csv' (default): fetchDatasetStream returns a raw text stream, piped through csv-parse. + * 'json': fetchDatasetStream returns an object-mode Readable that emits pre-parsed records. 
+ */ + format?: 'csv' | 'json' listAvailableDatasets(): Promise fetchDatasetStream(dataset: IDatasetDescriptor): Promise - parseRow(rawRow: Record): IDiscoverySourceRow | null + parseRow(rawRow: Record): IDiscoverySourceRow | null } export interface IDiscoverySourceRow { projectSlug: string repoName: string repoUrl: string - criticalityScore?: number + ossfCriticalityScore?: number + lfCriticalityScore?: number } diff --git a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts index 6e9893949b..00856493d4 100644 --- a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts +++ b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts @@ -16,29 +16,33 @@ const processActivities = proxyActivities({ export async function discoverProjects( input: { mode: 'incremental' | 'full' } = { mode: 'incremental' }, ): Promise { - const sourceName = 'ossf-criticality-score' - const { mode } = input - const allDatasets = await listActivities.listDatasets(sourceName) + const sourceNames = await listActivities.listSources() - if (allDatasets.length === 0) { - log.warn('No datasets found. Nothing to process.') - return - } + for (const sourceName of sourceNames) { + const allDatasets = await listActivities.listDatasets(sourceName) - // allDatasets is sorted newest-first, that is the reason we need the .reverse(). - // Incremental: process only the latest snapshot. - // Full: process oldest-first so the newest data wins the final upsert. - const datasets = mode === 'incremental' ? [allDatasets[0]] : [...allDatasets].reverse() + if (allDatasets.length === 0) { + log.warn(`No datasets found for source "${sourceName}". Skipping.`) + continue + } - log.info(`mode=${mode}, ${datasets.length}/${allDatasets.length} datasets to process.`) + // allDatasets is sorted newest-first. 
+ // Incremental: process only the latest snapshot. + // Full: process oldest-first so the newest data wins the final upsert. + const datasets = mode === 'incremental' ? [allDatasets[0]] : [...allDatasets].reverse() - for (let i = 0; i < datasets.length; i++) { - const dataset = datasets[i] - log.info(`Processing dataset ${i + 1}/${datasets.length}: ${dataset.id}`) - await processActivities.processDataset(sourceName, dataset) - } + log.info( + `source=${sourceName} mode=${mode}, ${datasets.length}/${allDatasets.length} datasets to process.`, + ) - log.info(`Done. Processed ${datasets.length} dataset(s).`) + for (let i = 0; i < datasets.length; i++) { + const dataset = datasets[i] + log.info(`[${sourceName}] Processing dataset ${i + 1}/${datasets.length}: ${dataset.id}`) + await processActivities.processDataset(sourceName, dataset) + } + + log.info(`[${sourceName}] Done. Processed ${datasets.length} dataset(s).`) + } } diff --git a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts index 5b94e2b8bc..b951e11317 100644 --- a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts +++ b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts @@ -196,8 +196,8 @@ export async function upsertProjectCatalog( ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", - "lfCriticalityScore" = EXCLUDED."lfCriticalityScore", + "ossfCriticalityScore" = COALESCE(EXCLUDED."ossfCriticalityScore", "projectCatalog"."ossfCriticalityScore"), + "lfCriticalityScore" = COALESCE(EXCLUDED."lfCriticalityScore", "projectCatalog"."lfCriticalityScore"), "updatedAt" = NOW() RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} `, @@ -256,8 +256,8 @@ export async function bulkUpsertProjectCatalog( ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = 
EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", - "lfCriticalityScore" = EXCLUDED."lfCriticalityScore", + "ossfCriticalityScore" = COALESCE(EXCLUDED."ossfCriticalityScore", "projectCatalog"."ossfCriticalityScore"), + "lfCriticalityScore" = COALESCE(EXCLUDED."lfCriticalityScore", "projectCatalog"."lfCriticalityScore"), "updatedAt" = NOW() `, { values: JSON.stringify(values) }, From 08f2d045edd87c71e56ecaf163c03874a1a04b8f Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 24 Mar 2026 11:43:33 +0100 Subject: [PATCH 20/33] fix: add dependencies Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 11a8cc872c..616fea2b48 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -16034,11 +16034,9 @@ snapshots: dependencies: ms: 2.1.3 - debug@4.4.0(supports-color@5.5.0): + debug@4.4.0: dependencies: ms: 2.1.3 - optionalDependencies: - supports-color: 5.5.0 debug@4.4.0(supports-color@5.5.0): dependencies: From cc9b4dcef1d4f550d83c93657f75f65477a45609 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 24 Mar 2026 11:51:18 +0100 Subject: [PATCH 21/33] refactor: fix eslint Signed-off-by: Umberto Sgueglia --- .../sources/lf-criticality-score/source.ts | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts index effe8ea6c9..4738318454 100644 --- a/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts +++ b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts @@ -128,37 +128,28 @@ export class LfCriticalityScoreSource implements IDiscoverySource { const startDate = parsed.searchParams.get('startDate') 
?? '' const endDate = parsed.searchParams.get('endDate') ?? '' - const stream = new Readable({ objectMode: true, read() {} }) + async function* pages() { + let page = 1 + let totalPages = 1 - // Fetch pages asynchronously and push rows into the stream - ;(async () => { - try { - let page = 1 - let totalPages = 1 + do { + const response = await fetchPage(baseUrl, startDate, endDate, page) + totalPages = response.totalPages - do { - const response = await fetchPage(baseUrl, startDate, endDate, page) - totalPages = response.totalPages - - for (const row of response.data) { - stream.push(row) - } - - log.debug( - { datasetId: dataset.id, page, totalPages, rowsInPage: response.data.length }, - 'LF Criticality Score page fetched.', - ) + for (const row of response.data) { + yield row + } - page++ - } while (page <= totalPages) + log.debug( + { datasetId: dataset.id, page, totalPages, rowsInPage: response.data.length }, + 'LF Criticality Score page fetched.', + ) - stream.push(null) // signal end of stream - } catch (err) { - stream.destroy(err instanceof Error ? 
err : new Error(String(err))) - } - })() + page++ + } while (page <= totalPages) + } - return stream + return Readable.from(pages(), { objectMode: true }) } parseRow(rawRow: Record): IDiscoverySourceRow | null { From a8062f59157e72a3de068ac97a5b0f0178422397 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 26 Mar 2026 09:16:15 +0100 Subject: [PATCH 22/33] fix: stream destroy Signed-off-by: Umberto Sgueglia --- .../src/activities/activities.ts | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index d4a939fe0c..fbbe6c28a8 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -38,10 +38,6 @@ export async function processDataset( const source = getSource(sourceName) const stream = await source.fetchDatasetStream(dataset) - stream.on('error', (err: Error) => { - log.error({ datasetId: dataset.id, error: err.message }, 'Stream error.') - }) - // For CSV sources: pipe through csv-parse to get Record objects. // For JSON sources: the stream already emits pre-parsed objects in object mode. const records = @@ -55,8 +51,17 @@ export async function processDataset( }), ) + // pipe() does not forward source errors to the destination automatically, so we + // destroy records explicitly — this surfaces the error in the for-await loop and + // lets Temporal mark the activity as failed and retry it. 
+ stream.on('error', (err: Error) => { + log.error({ datasetId: dataset.id, error: err.message }, 'Stream error.') + records.destroy(err) + }) + if (source.format !== 'json') { - ;(records as ReturnType).on('error', (err) => { + const csvRecords = records as ReturnType + csvRecords.on('error', (err) => { log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.') }) } From 0383a3f468c2380becb9f787ce4fff50db4ebde5 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 10 Feb 2026 11:27:28 +0100 Subject: [PATCH 23/33] feat: add DAL Signed-off-by: Umberto Sgueglia --- .../evaluated-projects/evaluatedProjects.ts | 397 ++++++++++++++++++ .../src/evaluated-projects/index.ts | 2 + .../src/evaluated-projects/types.ts | 69 +++ services/libs/data-access-layer/src/index.ts | 2 + .../src/project-catalog/index.ts | 2 + .../src/project-catalog/projectCatalog.ts | 315 ++++++++++++++ .../src/project-catalog/types.ts | 23 + 7 files changed, 810 insertions(+) create mode 100644 services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts create mode 100644 services/libs/data-access-layer/src/evaluated-projects/index.ts create mode 100644 services/libs/data-access-layer/src/evaluated-projects/types.ts create mode 100644 services/libs/data-access-layer/src/project-catalog/index.ts create mode 100644 services/libs/data-access-layer/src/project-catalog/projectCatalog.ts create mode 100644 services/libs/data-access-layer/src/project-catalog/types.ts diff --git a/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts new file mode 100644 index 0000000000..caec6a72a1 --- /dev/null +++ b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts @@ -0,0 +1,397 @@ +import { QueryExecutor } from '../queryExecutor' +import { prepareSelectColumns } from '../utils' + +import { + EvaluationStatus, + IDbEvaluatedProject, + 
IDbEvaluatedProjectCreate, + IDbEvaluatedProjectUpdate, +} from './types' + +const EVALUATED_PROJECT_COLUMNS = [ + 'id', + 'projectCatalogId', + 'evaluationStatus', + 'evaluationScore', + 'evaluation', + 'evaluationReason', + 'evaluatedAt', + 'starsCount', + 'forksCount', + 'commitsCount', + 'pullRequestsCount', + 'issuesCount', + 'onboarded', + 'onboardedAt', + 'createdAt', + 'updatedAt', +] + +export async function findEvaluatedProjectById( + qx: QueryExecutor, + id: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + WHERE id = $(id) + `, + { id }, + ) +} + +export async function findEvaluatedProjectByProjectCatalogId( + qx: QueryExecutor, + projectCatalogId: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + WHERE "projectCatalogId" = $(projectCatalogId) + `, + { projectCatalogId }, + ) +} + +export async function findEvaluatedProjectsByStatus( + qx: QueryExecutor, + evaluationStatus: EvaluationStatus, + options: { limit?: number; offset?: number } = {}, +): Promise { + const { limit, offset } = options + + return qx.select( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + WHERE "evaluationStatus" = $(evaluationStatus) + ORDER BY "createdAt" ASC + ${limit !== undefined ? 'LIMIT $(limit)' : ''} + ${offset !== undefined ? 'OFFSET $(offset)' : ''} + `, + { evaluationStatus, limit, offset }, + ) +} + +export async function findAllEvaluatedProjects( + qx: QueryExecutor, + options: { limit?: number; offset?: number } = {}, +): Promise { + const { limit, offset } = options + + return qx.select( + ` + SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + FROM "evaluatedProjects" + ORDER BY "createdAt" DESC + ${limit !== undefined ? 'LIMIT $(limit)' : ''} + ${offset !== undefined ? 
'OFFSET $(offset)' : ''} + `, + { limit, offset }, + ) +} + +export async function countEvaluatedProjects( + qx: QueryExecutor, + evaluationStatus?: EvaluationStatus, +): Promise { + const statusFilter = evaluationStatus ? 'WHERE "evaluationStatus" = $(evaluationStatus)' : '' + + const result = await qx.selectOne( + ` + SELECT COUNT(*) AS count + FROM "evaluatedProjects" + ${statusFilter} + `, + { evaluationStatus }, + ) + return parseInt(result.count, 10) +} + +export async function insertEvaluatedProject( + qx: QueryExecutor, + data: IDbEvaluatedProjectCreate, +): Promise { + return qx.selectOne( + ` + INSERT INTO "evaluatedProjects" ( + "projectCatalogId", + "evaluationStatus", + "evaluationScore", + evaluation, + "evaluationReason", + "starsCount", + "forksCount", + "commitsCount", + "pullRequestsCount", + "issuesCount", + "createdAt", + "updatedAt" + ) + VALUES ( + $(projectCatalogId), + $(evaluationStatus), + $(evaluationScore), + $(evaluation), + $(evaluationReason), + $(starsCount), + $(forksCount), + $(commitsCount), + $(pullRequestsCount), + $(issuesCount), + NOW(), + NOW() + ) + RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + `, + { + projectCatalogId: data.projectCatalogId, + evaluationStatus: data.evaluationStatus ?? 'pending', + evaluationScore: data.evaluationScore ?? null, + evaluation: data.evaluation ? JSON.stringify(data.evaluation) : null, + evaluationReason: data.evaluationReason ?? null, + starsCount: data.starsCount ?? null, + forksCount: data.forksCount ?? null, + commitsCount: data.commitsCount ?? null, + pullRequestsCount: data.pullRequestsCount ?? null, + issuesCount: data.issuesCount ?? null, + }, + ) +} + +export async function bulkInsertEvaluatedProjects( + qx: QueryExecutor, + items: IDbEvaluatedProjectCreate[], +): Promise { + if (items.length === 0) { + return + } + + const values = items.map((item) => ({ + projectCatalogId: item.projectCatalogId, + evaluationStatus: item.evaluationStatus ?? 
'pending', + evaluationScore: item.evaluationScore ?? null, + evaluation: item.evaluation ? JSON.stringify(item.evaluation) : null, + evaluationReason: item.evaluationReason ?? null, + starsCount: item.starsCount ?? null, + forksCount: item.forksCount ?? null, + commitsCount: item.commitsCount ?? null, + pullRequestsCount: item.pullRequestsCount ?? null, + issuesCount: item.issuesCount ?? null, + })) + + await qx.result( + ` + INSERT INTO "evaluatedProjects" ( + "projectCatalogId", + "evaluationStatus", + "evaluationScore", + evaluation, + "evaluationReason", + "starsCount", + "forksCount", + "commitsCount", + "pullRequestsCount", + "issuesCount", + "createdAt", + "updatedAt" + ) + SELECT + v."projectCatalogId"::uuid, + v."evaluationStatus", + v."evaluationScore"::double precision, + v.evaluation::jsonb, + v."evaluationReason", + v."starsCount"::integer, + v."forksCount"::integer, + v."commitsCount"::integer, + v."pullRequestsCount"::integer, + v."issuesCount"::integer, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectCatalogId" text, + "evaluationStatus" text, + "evaluationScore" double precision, + evaluation jsonb, + "evaluationReason" text, + "starsCount" integer, + "forksCount" integer, + "commitsCount" integer, + "pullRequestsCount" integer, + "issuesCount" integer + ) + `, + { values: JSON.stringify(values) }, + ) +} + +export async function updateEvaluatedProject( + qx: QueryExecutor, + id: string, + data: IDbEvaluatedProjectUpdate, +): Promise { + const setClauses: string[] = [] + const params: Record = { id } + + if (data.evaluationStatus !== undefined) { + setClauses.push('"evaluationStatus" = $(evaluationStatus)') + params.evaluationStatus = data.evaluationStatus + } + if (data.evaluationScore !== undefined) { + setClauses.push('"evaluationScore" = $(evaluationScore)') + params.evaluationScore = data.evaluationScore + } + if (data.evaluation !== undefined) { + setClauses.push('evaluation = $(evaluation)') + params.evaluation 
= data.evaluation ? JSON.stringify(data.evaluation) : null + } + if (data.evaluationReason !== undefined) { + setClauses.push('"evaluationReason" = $(evaluationReason)') + params.evaluationReason = data.evaluationReason + } + if (data.evaluatedAt !== undefined) { + setClauses.push('"evaluatedAt" = $(evaluatedAt)') + params.evaluatedAt = data.evaluatedAt + } + if (data.starsCount !== undefined) { + setClauses.push('"starsCount" = $(starsCount)') + params.starsCount = data.starsCount + } + if (data.forksCount !== undefined) { + setClauses.push('"forksCount" = $(forksCount)') + params.forksCount = data.forksCount + } + if (data.commitsCount !== undefined) { + setClauses.push('"commitsCount" = $(commitsCount)') + params.commitsCount = data.commitsCount + } + if (data.pullRequestsCount !== undefined) { + setClauses.push('"pullRequestsCount" = $(pullRequestsCount)') + params.pullRequestsCount = data.pullRequestsCount + } + if (data.issuesCount !== undefined) { + setClauses.push('"issuesCount" = $(issuesCount)') + params.issuesCount = data.issuesCount + } + if (data.onboarded !== undefined) { + setClauses.push('onboarded = $(onboarded)') + params.onboarded = data.onboarded + } + if (data.onboardedAt !== undefined) { + setClauses.push('"onboardedAt" = $(onboardedAt)') + params.onboardedAt = data.onboardedAt + } + + if (setClauses.length === 0) { + return findEvaluatedProjectById(qx, id) + } + + return qx.selectOneOrNone( + ` + UPDATE "evaluatedProjects" + SET + ${setClauses.join(',\n ')}, + "updatedAt" = NOW() + WHERE id = $(id) + RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + `, + params, + ) +} + +export async function markEvaluatedProjectAsEvaluated( + qx: QueryExecutor, + id: string, + data: { + evaluationScore: number + evaluation: Record + evaluationReason?: string + }, +): Promise { + return qx.selectOneOrNone( + ` + UPDATE "evaluatedProjects" + SET + "evaluationStatus" = 'evaluated', + "evaluationScore" = $(evaluationScore), + evaluation = 
$(evaluation), + "evaluationReason" = $(evaluationReason), + "evaluatedAt" = NOW(), + "updatedAt" = NOW() + WHERE id = $(id) + RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)} + `, + { + id, + evaluationScore: data.evaluationScore, + evaluation: JSON.stringify(data.evaluation), + evaluationReason: data.evaluationReason ?? null, + }, + ) +} + +export async function markEvaluatedProjectAsOnboarded( + qx: QueryExecutor, + id: string, +): Promise { + await qx.selectNone( + ` + UPDATE "evaluatedProjects" + SET + onboarded = true, + "onboardedAt" = NOW(), + "updatedAt" = NOW() + WHERE id = $(id) + `, + { id }, + ) +} + +export async function deleteEvaluatedProject(qx: QueryExecutor, id: string): Promise { + return qx.result( + ` + DELETE FROM "evaluatedProjects" + WHERE id = $(id) + `, + { id }, + ) +} + +export async function deleteEvaluatedProjectByProjectCatalogId( + qx: QueryExecutor, + projectCatalogId: string, +): Promise { + return qx.result( + ` + DELETE FROM "evaluatedProjects" + WHERE "projectCatalogId" = $(projectCatalogId) + `, + { projectCatalogId }, + ) +} + +export async function findPendingEvaluatedProjectsWithCatalog( + qx: QueryExecutor, + options: { limit?: number } = {}, +): Promise<(IDbEvaluatedProject & { projectSlug: string; repoName: string; repoUrl: string })[]> { + const { limit } = options + + return qx.select( + ` + SELECT + ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS, 'ep')}, + pc."projectSlug", + pc."repoName", + pc."repoUrl" + FROM "evaluatedProjects" ep + JOIN "projectCatalog" pc ON pc.id = ep."projectCatalogId" + WHERE ep."evaluationStatus" = 'pending' + ORDER BY ep."createdAt" ASC + ${limit !== undefined ? 
'LIMIT $(limit)' : ''} + `, + { limit }, + ) +} diff --git a/services/libs/data-access-layer/src/evaluated-projects/index.ts b/services/libs/data-access-layer/src/evaluated-projects/index.ts new file mode 100644 index 0000000000..7a4064eec2 --- /dev/null +++ b/services/libs/data-access-layer/src/evaluated-projects/index.ts @@ -0,0 +1,2 @@ +export * from './types' +export * from './evaluatedProjects' diff --git a/services/libs/data-access-layer/src/evaluated-projects/types.ts b/services/libs/data-access-layer/src/evaluated-projects/types.ts new file mode 100644 index 0000000000..f8661f47a2 --- /dev/null +++ b/services/libs/data-access-layer/src/evaluated-projects/types.ts @@ -0,0 +1,69 @@ +export type EvaluationStatus = 'pending' | 'evaluating' | 'evaluated' | 'failed' + +export interface IDbEvaluatedProject { + id: string + projectCatalogId: string + evaluationStatus: EvaluationStatus + evaluationScore: number | null + evaluation: Record | null + evaluationReason: string | null + evaluatedAt: string | null + starsCount: number | null + forksCount: number | null + commitsCount: number | null + pullRequestsCount: number | null + issuesCount: number | null + onboarded: boolean + onboardedAt: string | null + createdAt: string | null + updatedAt: string | null +} + +type EvaluatedProjectWritable = Pick< + IDbEvaluatedProject, + | 'projectCatalogId' + | 'evaluationStatus' + | 'evaluationScore' + | 'evaluation' + | 'evaluationReason' + | 'evaluatedAt' + | 'starsCount' + | 'forksCount' + | 'commitsCount' + | 'pullRequestsCount' + | 'issuesCount' + | 'onboarded' + | 'onboardedAt' +> + +export type IDbEvaluatedProjectCreate = Omit & { + projectCatalogId: string +} & { + evaluationStatus?: EvaluationStatus + evaluationScore?: number + evaluation?: Record + evaluationReason?: string + evaluatedAt?: string + starsCount?: number + forksCount?: number + commitsCount?: number + pullRequestsCount?: number + issuesCount?: number + onboarded?: boolean + onboardedAt?: string +} + 
+export type IDbEvaluatedProjectUpdate = Partial<{ + evaluationStatus: EvaluationStatus + evaluationScore: number + evaluation: Record + evaluationReason: string + evaluatedAt: string + starsCount: number + forksCount: number + commitsCount: number + pullRequestsCount: number + issuesCount: number + onboarded: boolean + onboardedAt: string +}> diff --git a/services/libs/data-access-layer/src/index.ts b/services/libs/data-access-layer/src/index.ts index 639f0547b8..5ef4749d79 100644 --- a/services/libs/data-access-layer/src/index.ts +++ b/services/libs/data-access-layer/src/index.ts @@ -13,3 +13,5 @@ export * from './systemSettings' export * from './integrations' export * from './auditLogs' export * from './maintainers' +export * from './project-catalog' +export * from './evaluated-projects' diff --git a/services/libs/data-access-layer/src/project-catalog/index.ts b/services/libs/data-access-layer/src/project-catalog/index.ts new file mode 100644 index 0000000000..af7ef7faa1 --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/index.ts @@ -0,0 +1,2 @@ +export * from './types' +export * from './projectCatalog' diff --git a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts new file mode 100644 index 0000000000..2e3b409579 --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts @@ -0,0 +1,315 @@ +import { QueryExecutor } from '../queryExecutor' +import { prepareSelectColumns } from '../utils' + +import { IDbProjectCatalog, IDbProjectCatalogCreate, IDbProjectCatalogUpdate } from './types' + +const PROJECT_CATALOG_COLUMNS = [ + 'id', + 'projectSlug', + 'repoName', + 'repoUrl', + 'criticalityScore', + 'syncedAt', + 'createdAt', + 'updatedAt', +] + +export async function findProjectCatalogById( + qx: QueryExecutor, + id: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT 
${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + WHERE id = $(id) + `, + { id }, + ) +} + +export async function findProjectCatalogByRepoUrl( + qx: QueryExecutor, + repoUrl: string, +): Promise { + return qx.selectOneOrNone( + ` + SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + WHERE "repoUrl" = $(repoUrl) + `, + { repoUrl }, + ) +} + +export async function findProjectCatalogBySlug( + qx: QueryExecutor, + projectSlug: string, +): Promise { + return qx.select( + ` + SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + WHERE "projectSlug" = $(projectSlug) + ORDER BY "createdAt" DESC + `, + { projectSlug }, + ) +} + +export async function findAllProjectCatalog( + qx: QueryExecutor, + options: { limit?: number; offset?: number } = {}, +): Promise { + const { limit, offset } = options + + return qx.select( + ` + SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + FROM "projectCatalog" + ORDER BY "createdAt" DESC + ${limit !== undefined ? 'LIMIT $(limit)' : ''} + ${offset !== undefined ? 'OFFSET $(offset)' : ''} + `, + { limit, offset }, + ) +} + +export async function countProjectCatalog(qx: QueryExecutor): Promise { + const result = await qx.selectOne( + ` + SELECT COUNT(*) AS count + FROM "projectCatalog" + `, + ) + return parseInt(result.count, 10) +} + +export async function insertProjectCatalog( + qx: QueryExecutor, + data: IDbProjectCatalogCreate, +): Promise { + return qx.selectOne( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + VALUES ( + $(projectSlug), + $(repoName), + $(repoUrl), + $(criticalityScore), + NOW(), + NOW() + ) + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + { + projectSlug: data.projectSlug, + repoName: data.repoName, + repoUrl: data.repoUrl, + criticalityScore: data.criticalityScore ?? 
null, + }, + ) +} + +export async function bulkInsertProjectCatalog( + qx: QueryExecutor, + items: IDbProjectCatalogCreate[], +): Promise { + if (items.length === 0) { + return + } + + const values = items.map((item) => ({ + projectSlug: item.projectSlug, + repoName: item.repoName, + repoUrl: item.repoUrl, + criticalityScore: item.criticalityScore ?? null, + })) + + await qx.result( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + SELECT + v."projectSlug", + v."repoName", + v."repoUrl", + v."criticalityScore"::double precision, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectSlug" text, + "repoName" text, + "repoUrl" text, + "criticalityScore" double precision + ) + `, + { values: JSON.stringify(values) }, + ) +} + +export async function upsertProjectCatalog( + qx: QueryExecutor, + data: IDbProjectCatalogCreate, +): Promise { + return qx.selectOne( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + VALUES ( + $(projectSlug), + $(repoName), + $(repoUrl), + $(criticalityScore), + NOW(), + NOW() + ) + ON CONFLICT ("repoUrl") DO UPDATE SET + "projectSlug" = EXCLUDED."projectSlug", + "repoName" = EXCLUDED."repoName", + "criticalityScore" = EXCLUDED."criticalityScore", + "updatedAt" = NOW() + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + { + projectSlug: data.projectSlug, + repoName: data.repoName, + repoUrl: data.repoUrl, + criticalityScore: data.criticalityScore ?? null, + }, + ) +} + +export async function bulkUpsertProjectCatalog( + qx: QueryExecutor, + items: IDbProjectCatalogCreate[], +): Promise { + if (items.length === 0) { + return + } + + const values = items.map((item) => ({ + projectSlug: item.projectSlug, + repoName: item.repoName, + repoUrl: item.repoUrl, + criticalityScore: item.criticalityScore ?? 
null, + })) + + await qx.result( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "criticalityScore", + "createdAt", + "updatedAt" + ) + SELECT + v."projectSlug", + v."repoName", + v."repoUrl", + v."criticalityScore"::double precision, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectSlug" text, + "repoName" text, + "repoUrl" text, + "criticalityScore" double precision + ) + ON CONFLICT ("repoUrl") DO UPDATE SET + "projectSlug" = EXCLUDED."projectSlug", + "repoName" = EXCLUDED."repoName", + "criticalityScore" = EXCLUDED."criticalityScore", + "updatedAt" = NOW() + `, + { values: JSON.stringify(values) }, + ) +} + +export async function updateProjectCatalog( + qx: QueryExecutor, + id: string, + data: IDbProjectCatalogUpdate, +): Promise { + const setClauses: string[] = [] + const params: Record = { id } + + if (data.projectSlug !== undefined) { + setClauses.push('"projectSlug" = $(projectSlug)') + params.projectSlug = data.projectSlug + } + if (data.repoName !== undefined) { + setClauses.push('"repoName" = $(repoName)') + params.repoName = data.repoName + } + if (data.repoUrl !== undefined) { + setClauses.push('"repoUrl" = $(repoUrl)') + params.repoUrl = data.repoUrl + } + if (data.criticalityScore !== undefined) { + setClauses.push('"criticalityScore" = $(criticalityScore)') + params.criticalityScore = data.criticalityScore + } + if (data.syncedAt !== undefined) { + setClauses.push('"syncedAt" = $(syncedAt)') + params.syncedAt = data.syncedAt + } + + if (setClauses.length === 0) { + return findProjectCatalogById(qx, id) + } + + return qx.selectOneOrNone( + ` + UPDATE "projectCatalog" + SET + ${setClauses.join(',\n ')}, + "updatedAt" = NOW() + WHERE id = $(id) + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + params, + ) +} + +export async function updateProjectCatalogSyncedAt(qx: QueryExecutor, id: string): Promise { + await qx.selectNone( + ` + UPDATE "projectCatalog" + SET "syncedAt" = 
NOW(), "updatedAt" = NOW() + WHERE id = $(id) + `, + { id }, + ) +} + +export async function deleteProjectCatalog(qx: QueryExecutor, id: string): Promise { + return qx.result( + ` + DELETE FROM "projectCatalog" + WHERE id = $(id) + `, + { id }, + ) +} diff --git a/services/libs/data-access-layer/src/project-catalog/types.ts b/services/libs/data-access-layer/src/project-catalog/types.ts new file mode 100644 index 0000000000..382527f57f --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/types.ts @@ -0,0 +1,23 @@ +export interface IDbProjectCatalog { + id: string + projectSlug: string + repoName: string + repoUrl: string + criticalityScore: number | null + syncedAt: string | null + createdAt: string | null + updatedAt: string | null +} + +type ProjectCatalogWritable = Pick< + IDbProjectCatalog, + 'projectSlug' | 'repoName' | 'repoUrl' | 'criticalityScore' +> + +export type IDbProjectCatalogCreate = Omit & { + criticalityScore?: number +} + +export type IDbProjectCatalogUpdate = Partial & { + syncedAt?: string +} From 5dede3de4aa687cf7b91a92ee97a8a265a9904b8 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 26 Mar 2026 09:25:39 +0100 Subject: [PATCH 24/33] fix: updated the types as the current db Signed-off-by: Umberto Sgueglia --- .../evaluated-projects/evaluatedProjects.ts | 2 +- .../src/evaluated-projects/types.ts | 45 ++++---------- .../src/project-catalog/projectCatalog.ts | 61 +++++++++++++------ .../src/project-catalog/types.ts | 13 ++-- 4 files changed, 64 insertions(+), 57 deletions(-) diff --git a/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts index caec6a72a1..2e078beb0d 100644 --- a/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts +++ b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts @@ -173,7 +173,7 @@ export async function bulkInsertEvaluatedProjects( 
projectCatalogId: item.projectCatalogId, evaluationStatus: item.evaluationStatus ?? 'pending', evaluationScore: item.evaluationScore ?? null, - evaluation: item.evaluation ? JSON.stringify(item.evaluation) : null, + evaluation: item.evaluation ?? null, evaluationReason: item.evaluationReason ?? null, starsCount: item.starsCount ?? null, forksCount: item.forksCount ?? null, diff --git a/services/libs/data-access-layer/src/evaluated-projects/types.ts b/services/libs/data-access-layer/src/evaluated-projects/types.ts index f8661f47a2..bb11eb5d65 100644 --- a/services/libs/data-access-layer/src/evaluated-projects/types.ts +++ b/services/libs/data-access-layer/src/evaluated-projects/types.ts @@ -19,51 +19,32 @@ export interface IDbEvaluatedProject { updatedAt: string | null } -type EvaluatedProjectWritable = Pick< - IDbEvaluatedProject, - | 'projectCatalogId' - | 'evaluationStatus' - | 'evaluationScore' - | 'evaluation' - | 'evaluationReason' - | 'evaluatedAt' - | 'starsCount' - | 'forksCount' - | 'commitsCount' - | 'pullRequestsCount' - | 'issuesCount' - | 'onboarded' - | 'onboardedAt' -> - -export type IDbEvaluatedProjectCreate = Omit & { +// onboarded/onboardedAt/evaluatedAt are excluded: they are managed by dedicated helpers +// (markEvaluatedProjectAsEvaluated, markEvaluatedProjectAsOnboarded) and never written on insert. 
+export type IDbEvaluatedProjectCreate = { projectCatalogId: string -} & { evaluationStatus?: EvaluationStatus evaluationScore?: number evaluation?: Record evaluationReason?: string - evaluatedAt?: string starsCount?: number forksCount?: number commitsCount?: number pullRequestsCount?: number issuesCount?: number - onboarded?: boolean - onboardedAt?: string } export type IDbEvaluatedProjectUpdate = Partial<{ evaluationStatus: EvaluationStatus - evaluationScore: number - evaluation: Record - evaluationReason: string - evaluatedAt: string - starsCount: number - forksCount: number - commitsCount: number - pullRequestsCount: number - issuesCount: number + evaluationScore: number | null + evaluation: Record | null + evaluationReason: string | null + evaluatedAt: string | null + starsCount: number | null + forksCount: number | null + commitsCount: number | null + pullRequestsCount: number | null + issuesCount: number | null onboarded: boolean - onboardedAt: string + onboardedAt: string | null }> diff --git a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts index 2e3b409579..5b94e2b8bc 100644 --- a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts +++ b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts @@ -8,7 +8,8 @@ const PROJECT_CATALOG_COLUMNS = [ 'projectSlug', 'repoName', 'repoUrl', - 'criticalityScore', + 'ossfCriticalityScore', + 'lfCriticalityScore', 'syncedAt', 'createdAt', 'updatedAt', @@ -95,7 +96,8 @@ export async function insertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -103,7 +105,8 @@ export async function insertProjectCatalog( $(projectSlug), $(repoName), $(repoUrl), - $(criticalityScore), + $(ossfCriticalityScore), + $(lfCriticalityScore), NOW(), NOW() ) @@ -113,7 +116,8 @@ export async function 
insertProjectCatalog( projectSlug: data.projectSlug, repoName: data.repoName, repoUrl: data.repoUrl, - criticalityScore: data.criticalityScore ?? null, + ossfCriticalityScore: data.ossfCriticalityScore ?? null, + lfCriticalityScore: data.lfCriticalityScore ?? null, }, ) } @@ -130,7 +134,8 @@ export async function bulkInsertProjectCatalog( projectSlug: item.projectSlug, repoName: item.repoName, repoUrl: item.repoUrl, - criticalityScore: item.criticalityScore ?? null, + ossfCriticalityScore: item.ossfCriticalityScore ?? null, + lfCriticalityScore: item.lfCriticalityScore ?? null, })) await qx.result( @@ -139,7 +144,8 @@ export async function bulkInsertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -147,14 +153,16 @@ export async function bulkInsertProjectCatalog( v."projectSlug", v."repoName", v."repoUrl", - v."criticalityScore"::double precision, + v."ossfCriticalityScore"::double precision, + v."lfCriticalityScore"::double precision, NOW(), NOW() FROM jsonb_to_recordset($(values)::jsonb) AS v( "projectSlug" text, "repoName" text, "repoUrl" text, - "criticalityScore" double precision + "ossfCriticalityScore" double precision, + "lfCriticalityScore" double precision ) `, { values: JSON.stringify(values) }, @@ -171,7 +179,8 @@ export async function upsertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -179,14 +188,16 @@ export async function upsertProjectCatalog( $(projectSlug), $(repoName), $(repoUrl), - $(criticalityScore), + $(ossfCriticalityScore), + $(lfCriticalityScore), NOW(), NOW() ) ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "criticalityScore" = EXCLUDED."criticalityScore", + "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", + "lfCriticalityScore" = 
EXCLUDED."lfCriticalityScore", "updatedAt" = NOW() RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} `, @@ -194,7 +205,8 @@ export async function upsertProjectCatalog( projectSlug: data.projectSlug, repoName: data.repoName, repoUrl: data.repoUrl, - criticalityScore: data.criticalityScore ?? null, + ossfCriticalityScore: data.ossfCriticalityScore ?? null, + lfCriticalityScore: data.lfCriticalityScore ?? null, }, ) } @@ -211,7 +223,8 @@ export async function bulkUpsertProjectCatalog( projectSlug: item.projectSlug, repoName: item.repoName, repoUrl: item.repoUrl, - criticalityScore: item.criticalityScore ?? null, + ossfCriticalityScore: item.ossfCriticalityScore ?? null, + lfCriticalityScore: item.lfCriticalityScore ?? null, })) await qx.result( @@ -220,7 +233,8 @@ export async function bulkUpsertProjectCatalog( "projectSlug", "repoName", "repoUrl", - "criticalityScore", + "ossfCriticalityScore", + "lfCriticalityScore", "createdAt", "updatedAt" ) @@ -228,19 +242,22 @@ export async function bulkUpsertProjectCatalog( v."projectSlug", v."repoName", v."repoUrl", - v."criticalityScore"::double precision, + v."ossfCriticalityScore"::double precision, + v."lfCriticalityScore"::double precision, NOW(), NOW() FROM jsonb_to_recordset($(values)::jsonb) AS v( "projectSlug" text, "repoName" text, "repoUrl" text, - "criticalityScore" double precision + "ossfCriticalityScore" double precision, + "lfCriticalityScore" double precision ) ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "criticalityScore" = EXCLUDED."criticalityScore", + "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", + "lfCriticalityScore" = EXCLUDED."lfCriticalityScore", "updatedAt" = NOW() `, { values: JSON.stringify(values) }, @@ -267,9 +284,13 @@ export async function updateProjectCatalog( setClauses.push('"repoUrl" = $(repoUrl)') params.repoUrl = data.repoUrl } - if (data.criticalityScore !== undefined) { - 
setClauses.push('"criticalityScore" = $(criticalityScore)') - params.criticalityScore = data.criticalityScore + if (data.ossfCriticalityScore !== undefined) { + setClauses.push('"ossfCriticalityScore" = $(ossfCriticalityScore)') + params.ossfCriticalityScore = data.ossfCriticalityScore + } + if (data.lfCriticalityScore !== undefined) { + setClauses.push('"lfCriticalityScore" = $(lfCriticalityScore)') + params.lfCriticalityScore = data.lfCriticalityScore } if (data.syncedAt !== undefined) { setClauses.push('"syncedAt" = $(syncedAt)') diff --git a/services/libs/data-access-layer/src/project-catalog/types.ts b/services/libs/data-access-layer/src/project-catalog/types.ts index 382527f57f..8cbb39a310 100644 --- a/services/libs/data-access-layer/src/project-catalog/types.ts +++ b/services/libs/data-access-layer/src/project-catalog/types.ts @@ -3,7 +3,8 @@ export interface IDbProjectCatalog { projectSlug: string repoName: string repoUrl: string - criticalityScore: number | null + ossfCriticalityScore: number | null + lfCriticalityScore: number | null syncedAt: string | null createdAt: string | null updatedAt: string | null @@ -11,11 +12,15 @@ export interface IDbProjectCatalog { type ProjectCatalogWritable = Pick< IDbProjectCatalog, - 'projectSlug' | 'repoName' | 'repoUrl' | 'criticalityScore' + 'projectSlug' | 'repoName' | 'repoUrl' | 'ossfCriticalityScore' | 'lfCriticalityScore' > -export type IDbProjectCatalogCreate = Omit & { - criticalityScore?: number +export type IDbProjectCatalogCreate = Omit< + ProjectCatalogWritable, + 'ossfCriticalityScore' | 'lfCriticalityScore' +> & { + ossfCriticalityScore?: number + lfCriticalityScore?: number } export type IDbProjectCatalogUpdate = Partial & { From e9852f171d41b3b7de726c6868929ed097dcbc82 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 26 Mar 2026 10:15:29 +0100 Subject: [PATCH 25/33] feat: add ossf data fetcher (CM-952) (#3839) Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 3 + .../README.md | 73 +++++++ 
.../package.json | 3 +- .../src/activities.ts | 4 +- .../src/activities/activities.ts | 121 ++++++++++- .../src/main.ts | 2 +- .../schedules/scheduleProjectsDiscovery.ts | 11 +- .../sources/lf-criticality-score/source.ts | 188 ++++++++++++++++++ .../ossf-criticality-score/bucketClient.ts | 96 +++++++++ .../sources/ossf-criticality-score/source.ts | 75 +++++++ .../src/sources/registry.ts | 21 ++ .../src/sources/types.ts | 27 +++ .../src/workflows/discoverProjects.ts | 47 ++++- .../src/project-catalog/projectCatalog.ts | 8 +- 14 files changed, 659 insertions(+), 20 deletions(-) create mode 100644 services/apps/automatic_projects_discovery_worker/README.md create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/registry.ts create mode 100644 services/apps/automatic_projects_discovery_worker/src/sources/types.ts diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ef062f9f07..616fea2b48 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -566,6 +566,9 @@ importers: '@temporalio/workflow': specifier: ~1.11.8 version: 1.11.8 + csv-parse: + specifier: ^5.5.6 + version: 5.5.6 tsx: specifier: ^4.7.1 version: 4.7.3 diff --git a/services/apps/automatic_projects_discovery_worker/README.md b/services/apps/automatic_projects_discovery_worker/README.md new file mode 100644 index 0000000000..77623513cc --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/README.md @@ -0,0 +1,73 @@ +# Automatic Projects Discovery Worker + +Temporal worker that discovers open-source projects from external data sources and writes them to the `projectCatalog` table. 
+ +## Architecture + +### Source abstraction + +Every data source implements the `IDiscoverySource` interface (`src/sources/types.ts`): + +| Method | Purpose | +| ----------------------------- | --------------------------------------------------------------------------- | +| `listAvailableDatasets()` | Returns available dataset snapshots, sorted newest-first | +| `fetchDatasetStream(dataset)` | Returns a readable stream for the dataset (e.g. HTTP response) | +| `parseRow(rawRow)` | Converts a raw CSV/JSON row into a `IDiscoverySourceRow`, or `null` to skip | + +Sources are registered in `src/sources/registry.ts` as a simple name → factory map. + +**To add a new source:** create a class implementing `IDiscoverySource`, then add one line to the registry. + +### Current sources + +| Name | Folder | Description | +| ------------------------ | ------------------------------------- | ------------------------------------------------------------------------------------ | +| `ossf-criticality-score` | `src/sources/ossf-criticality-score/` | OSSF Criticality Score snapshots from a public GCS bucket (~750K repos per snapshot) | + +### Workflow + +``` +discoverProjects({ mode: 'incremental' | 'full' }) + │ + ├─ Activity: listDatasets(sourceName) + │ → returns dataset descriptors sorted newest-first + │ + ├─ Selection: incremental → latest only, full → all datasets + │ + └─ For each dataset: + └─ Activity: processDataset(sourceName, dataset) + → HTTP stream → csv-parse → batches of 5000 → bulkUpsertProjectCatalog +``` + +### Timeouts + +| Activity | startToCloseTimeout | retries | +| ------------------ | ------------------- | ------- | +| `listDatasets` | 2 min | 3 | +| `processDataset` | 30 min | 3 | +| Workflow execution | 2 hours | 3 | + +### Schedule + +Runs daily at midnight via Temporal cron (`0 0 * * *`). 
+ +## File structure + +``` +src/ +├── main.ts # Service bootstrap (postgres enabled) +├── activities.ts # Barrel re-export +├── workflows.ts # Barrel re-export +├── activities/ +│ └── activities.ts # listDatasets, processDataset +├── workflows/ +│ └── discoverProjects.ts # Orchestration with mode selection +├── schedules/ +│ └── scheduleProjectsDiscovery.ts # Temporal cron schedule +└── sources/ + ├── types.ts # IDiscoverySource, IDatasetDescriptor + ├── registry.ts # Source factory map + └── ossf-criticality-score/ + ├── source.ts # IDiscoverySource implementation + └── bucketClient.ts # GCS public bucket HTTP client +``` diff --git a/services/apps/automatic_projects_discovery_worker/package.json b/services/apps/automatic_projects_discovery_worker/package.json index 1c79505f89..022c1a6297 100644 --- a/services/apps/automatic_projects_discovery_worker/package.json +++ b/services/apps/automatic_projects_discovery_worker/package.json @@ -2,7 +2,7 @@ "name": "@crowd/automatic-projects-discovery-worker", "scripts": { "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx src/main.ts", - "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx --inspect=0.0.0.0:9232 src/main.ts", "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", @@ -24,6 +24,7 @@ "@temporalio/activity": "~1.11.8", "@temporalio/client": "~1.11.8", "@temporalio/workflow": "~1.11.8", + "csv-parse": "^5.5.6", "tsx": "^4.7.1", "typescript": "^5.6.3" }, diff --git a/services/apps/automatic_projects_discovery_worker/src/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities.ts index 3662234550..a2c8cf9935 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities.ts @@ -1 +1,3 @@ -export * from './activities/activities' +import { listDatasets, listSources, processDataset } from './activities/activities' + +export { listDatasets, listSources, processDataset } diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index 3aea7f8200..fbbe6c28a8 100644 --- a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -1,7 +1,124 @@ +import { parse } from 'csv-parse' + +import { bulkUpsertProjectCatalog } from '@crowd/data-access-layer' +import { IDbProjectCatalogCreate } from '@crowd/data-access-layer/src/project-catalog/types' +import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceLogger } from '@crowd/logging' +import { svc } from 
'../main' +import { getAvailableSourceNames, getSource } from '../sources/registry' +import { IDatasetDescriptor } from '../sources/types' + const log = getServiceLogger() -export async function logDiscoveryRun(): Promise { - log.info('Automatic projects discovery workflow executed successfully.') +const BATCH_SIZE = 5000 + +export async function listSources(): Promise { + return getAvailableSourceNames() +} + +export async function listDatasets(sourceName: string): Promise { + const source = getSource(sourceName) + const datasets = await source.listAvailableDatasets() + + log.info({ sourceName, count: datasets.length, newest: datasets[0]?.id }, 'Datasets listed.') + + return datasets +} + +export async function processDataset( + sourceName: string, + dataset: IDatasetDescriptor, +): Promise { + const qx = pgpQx(svc.postgres.writer.connection()) + const startTime = Date.now() + + log.info({ sourceName, datasetId: dataset.id, url: dataset.url }, 'Processing dataset...') + + const source = getSource(sourceName) + const stream = await source.fetchDatasetStream(dataset) + + // For CSV sources: pipe through csv-parse to get Record objects. + // For JSON sources: the stream already emits pre-parsed objects in object mode. + const records = + source.format === 'json' + ? stream + : stream.pipe( + parse({ + columns: true, + skip_empty_lines: true, + trim: true, + }), + ) + + // pipe() does not forward source errors to the destination automatically, so we + // destroy records explicitly — this surfaces the error in the for-await loop and + // lets Temporal mark the activity as failed and retry it. 
+ stream.on('error', (err: Error) => { + log.error({ datasetId: dataset.id, error: err.message }, 'Stream error.') + records.destroy(err) + }) + + if (source.format !== 'json') { + const csvRecords = records as ReturnType + csvRecords.on('error', (err) => { + log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.') + }) + } + + let batch: IDbProjectCatalogCreate[] = [] + let totalProcessed = 0 + let totalSkipped = 0 + let batchNumber = 0 + let totalRows = 0 + + for await (const rawRow of records) { + totalRows++ + + const parsed = source.parseRow(rawRow as Record) + if (!parsed) { + totalSkipped++ + continue + } + + batch.push({ + projectSlug: parsed.projectSlug, + repoName: parsed.repoName, + repoUrl: parsed.repoUrl, + ossfCriticalityScore: parsed.ossfCriticalityScore, + lfCriticalityScore: parsed.lfCriticalityScore, + }) + + if (batch.length >= BATCH_SIZE) { + batchNumber++ + + await bulkUpsertProjectCatalog(qx, batch) + totalProcessed += batch.length + batch = [] + + log.info({ totalProcessed, batchNumber, datasetId: dataset.id }, 'Batch upserted.') + } + } + + // Flush remaining rows that didn't fill a complete batch + if (batch.length > 0) { + batchNumber++ + await bulkUpsertProjectCatalog(qx, batch) + totalProcessed += batch.length + } + + const elapsedSeconds = ((Date.now() - startTime) / 1000).toFixed(1) + + log.info( + { + sourceName, + datasetId: dataset.id, + totalRows, + totalProcessed, + totalSkipped, + totalBatches: batchNumber, + elapsedSeconds, + }, + 'Dataset processing complete.', + ) } diff --git a/services/apps/automatic_projects_discovery_worker/src/main.ts b/services/apps/automatic_projects_discovery_worker/src/main.ts index 326c3a361a..0345c420f8 100644 --- a/services/apps/automatic_projects_discovery_worker/src/main.ts +++ b/services/apps/automatic_projects_discovery_worker/src/main.ts @@ -18,7 +18,7 @@ const config: Config = { const options: Options = { postgres: { - enabled: false, + enabled: true, }, opensearch: { 
enabled: false, diff --git a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts index 847c2e4ce9..b173126a78 100644 --- a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -3,18 +3,15 @@ import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/clien import { svc } from '../main' import { discoverProjects } from '../workflows' -const DEFAULT_CRON = '0 2 * * *' // Daily at 2:00 AM - export const scheduleProjectsDiscovery = async () => { - const cronExpression = process.env.CROWD_AUTOMATIC_PROJECTS_DISCOVERY_CRON || DEFAULT_CRON - - svc.log.info(`Scheduling projects discovery with cron: ${cronExpression}`) + svc.log.info(`Scheduling projects discovery`) try { await svc.temporal.schedule.create({ scheduleId: 'automaticProjectsDiscovery', spec: { - cronExpressions: [cronExpression], + // Run every day at midnight + cronExpressions: ['0 0 * * *'], }, policies: { overlap: ScheduleOverlapPolicy.SKIP, @@ -24,6 +21,8 @@ export const scheduleProjectsDiscovery = async () => { type: 'startWorkflow', workflowType: discoverProjects, taskQueue: 'automatic-projects-discovery', + args: [{ mode: 'incremental' as const }], + workflowExecutionTimeout: '2 hours', retry: { initialInterval: '15 seconds', backoffCoefficient: 2, diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts new file mode 100644 index 0000000000..4738318454 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts @@ -0,0 +1,188 @@ +import http from 'http' +import https from 'https' +import { Readable } from 'stream' + 
+import { getServiceLogger } from '@crowd/logging' + +import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types' + +const log = getServiceLogger() + +const DEFAULT_API_URL = 'https://hypervascular-nonduplicative-vern.ngrok-free.dev' +const PAGE_SIZE = 100 + +interface LfApiResponse { + page: number + pageSize: number + total: number + totalPages: number + data: LfApiRow[] +} + +interface LfApiRow { + runDate: string + repoUrl: string + owner: string + repoName: string + contributors: number + organizations: number + sizeSloc: number + lastUpdated: number + age: number + commitFreq: number + score: number +} + +function getApiBaseUrl(): string { + return (process.env.LF_CRITICALITY_SCORE_API_URL ?? DEFAULT_API_URL).replace(/\/$/, '') +} + +async function fetchPage( + baseUrl: string, + startDate: string, + endDate: string, + page: number, +): Promise { + const url = `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}&page=${page}&pageSize=${PAGE_SIZE}` + + return new Promise((resolve, reject) => { + const client = url.startsWith('https://') ? https : http + + const req = client.get(url, (res) => { + if (res.statusCode !== 200) { + reject(new Error(`LF Criticality Score API returned status ${res.statusCode} for ${url}`)) + res.resume() + return + } + + const chunks: Uint8Array[] = [] + res.on('data', (chunk: Uint8Array) => chunks.push(chunk)) + res.on('end', () => { + try { + resolve(JSON.parse(Buffer.concat(chunks).toString('utf8')) as LfApiResponse) + } catch (err) { + reject(new Error(`Failed to parse LF Criticality Score API response: ${err}`)) + } + }) + res.on('error', reject) + }) + + req.on('error', reject) + req.end() + }) +} + +/** + * Generates the first day and last day of a given month. + * monthOffset = 0 → current month, -1 → previous month, etc. 
+ */ +function monthRange(monthOffset: number): { startDate: string; endDate: string } { + const now = new Date() + const year = now.getUTCFullYear() + const month = now.getUTCMonth() + monthOffset // can be negative; Date handles rollover + + const first = new Date(Date.UTC(year, month, 1)) + const last = new Date(Date.UTC(year, month + 1, 0)) // last day of month + + const pad = (n: number) => String(n).padStart(2, '0') + const fmt = (d: Date) => + `${d.getUTCFullYear()}-${pad(d.getUTCMonth() + 1)}-${pad(d.getUTCDate())}` + + return { startDate: fmt(first), endDate: fmt(last) } +} + +export class LfCriticalityScoreSource implements IDiscoverySource { + public readonly name = 'lf-criticality-score' + public readonly format = 'json' as const + + async listAvailableDatasets(): Promise { + const baseUrl = getApiBaseUrl() + + // Return one dataset per month for the last 12 months (newest first) + const datasets: IDatasetDescriptor[] = [] + + for (let offset = 0; offset >= -11; offset--) { + const { startDate, endDate } = monthRange(offset) + const id = startDate.slice(0, 7) // e.g. "2026-02" + + datasets.push({ + id, + date: startDate, + url: `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}`, + }) + } + + return datasets + } + + /** + * Returns an object-mode Readable that fetches all pages from the API + * and pushes each row as a plain object. Activities.ts iterates this + * directly (no csv-parse) because format === 'json'. + */ + async fetchDatasetStream(dataset: IDatasetDescriptor): Promise { + const baseUrl = getApiBaseUrl() + + // Extract startDate and endDate from the stored URL + const parsed = new URL(dataset.url) + const startDate = parsed.searchParams.get('startDate') ?? '' + const endDate = parsed.searchParams.get('endDate') ?? 
'' + + async function* pages() { + let page = 1 + let totalPages = 1 + + do { + const response = await fetchPage(baseUrl, startDate, endDate, page) + totalPages = response.totalPages + + for (const row of response.data) { + yield row + } + + log.debug( + { datasetId: dataset.id, page, totalPages, rowsInPage: response.data.length }, + 'LF Criticality Score page fetched.', + ) + + page++ + } while (page <= totalPages) + } + + return Readable.from(pages(), { objectMode: true }) + } + + parseRow(rawRow: Record): IDiscoverySourceRow | null { + const repoUrl = rawRow['repoUrl'] as string | undefined + if (!repoUrl) { + return null + } + + let repoName = '' + let projectSlug = '' + + try { + const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '') + projectSlug = urlPath + repoName = urlPath.split('/').pop() || '' + } catch { + const parts = repoUrl.replace(/\/$/, '').split('/') + projectSlug = parts.slice(-2).join('/') + repoName = parts.pop() || '' + } + + if (!projectSlug || !repoName) { + return null + } + + const score = rawRow['score'] + const lfCriticalityScore = typeof score === 'number' ? score : parseFloat(score as string) + + return { + projectSlug, + repoName, + repoUrl, + lfCriticalityScore: Number.isNaN(lfCriticalityScore) ? 
undefined : lfCriticalityScore, + } + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts new file mode 100644 index 0000000000..71b2066ae7 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts @@ -0,0 +1,96 @@ +import https from 'https' + +const BUCKET_URL = 'https://commondatastorage.googleapis.com/ossf-criticality-score' + +function httpsGet(url: string): Promise { + return new Promise((resolve, reject) => { + https + .get(url, (res) => { + if ( + res.statusCode && + res.statusCode >= 300 && + res.statusCode < 400 && + res.headers.location + ) { + httpsGet(res.headers.location).then(resolve, reject) + return + } + + if (res.statusCode && (res.statusCode < 200 || res.statusCode >= 300)) { + reject(new Error(`HTTP ${res.statusCode} for ${url}`)) + return + } + + const chunks: Uint8Array[] = [] + res.on('data', (chunk: Uint8Array) => chunks.push(chunk)) + res.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8'))) + res.on('error', reject) + }) + .on('error', reject) + }) +} + +function extractPrefixes(xml: string): string[] { + const prefixes: string[] = [] + const regex = /([^<]+)<\/Prefix>/g + let match: RegExpExecArray | null + + while ((match = regex.exec(xml)) !== null) { + prefixes.push(match[1]) + } + + return prefixes +} + +/** + * List all date prefixes in the OSSF Criticality Score bucket. + * Returns prefixes like ['2024.07.01/', '2024.07.08/', ...] + */ +export async function listDatePrefixes(): Promise { + const xml = await httpsGet(`${BUCKET_URL}?delimiter=/`) + return extractPrefixes(xml).filter((p) => /^\d{4}\.\d{2}\.\d{2}\/$/.test(p)) +} + +/** + * List time sub-prefixes for a given date prefix. + * E.g., for '2024.07.01/' returns ['2024.07.01/060102/', ...] 
+ */ +export async function listTimePrefixes(datePrefix: string): Promise { + const xml = await httpsGet(`${BUCKET_URL}?prefix=${encodeURIComponent(datePrefix)}&delimiter=/`) + return extractPrefixes(xml).filter((p) => p !== datePrefix) +} + +/** + * Build the full URL for the all.csv file within a given dataset prefix. + */ +export function buildDatasetUrl(prefix: string): string { + return `${BUCKET_URL}/${prefix}all.csv` +} + +/** + * Get an HTTPS readable stream for a given URL. + */ +export function getHttpsStream(url: string): Promise { + return new Promise((resolve, reject) => { + https + .get(url, (res) => { + if ( + res.statusCode && + res.statusCode >= 300 && + res.statusCode < 400 && + res.headers.location + ) { + getHttpsStream(res.headers.location).then(resolve, reject) + return + } + + if (res.statusCode && (res.statusCode < 200 || res.statusCode >= 300)) { + reject(new Error(`HTTP ${res.statusCode} for ${url}`)) + return + } + + resolve(res) + }) + .on('error', reject) + }) +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts new file mode 100644 index 0000000000..8ee20fb602 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts @@ -0,0 +1,75 @@ +import { Readable } from 'stream' + +import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types' + +import { buildDatasetUrl, getHttpsStream, listDatePrefixes, listTimePrefixes } from './bucketClient' + +export class OssfCriticalityScoreSource implements IDiscoverySource { + public readonly name = 'ossf-criticality-score' + + async listAvailableDatasets(): Promise { + const datePrefixes = await listDatePrefixes() + + const datasets: IDatasetDescriptor[] = [] + + for (const datePrefix of datePrefixes) { + const timePrefixes = await listTimePrefixes(datePrefix) + + for 
(const timePrefix of timePrefixes) { + const date = datePrefix.replace(/\/$/, '') + const url = buildDatasetUrl(timePrefix) + + datasets.push({ + id: timePrefix.replace(/\/$/, ''), + date, + url, + }) + } + } + + // Sort newest-first by date + datasets.sort((a, b) => b.date.localeCompare(a.date)) + + return datasets + } + + async fetchDatasetStream(dataset: IDatasetDescriptor): Promise { + const stream = await getHttpsStream(dataset.url) + return stream as Readable + } + + // CSV columns use dot notation (e.g. "repo.url", "default_score") + parseRow(rawRow: Record): IDiscoverySourceRow | null { + const repoUrl = rawRow['repo.url'] as string | undefined + if (!repoUrl) { + return null + } + + let repoName = '' + let projectSlug = '' + + try { + const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '') + projectSlug = urlPath + repoName = urlPath.split('/').pop() || '' + } catch { + const parts = repoUrl.replace(/\/$/, '').split('/') + projectSlug = parts.slice(-2).join('/') + repoName = parts.pop() || '' + } + + if (!projectSlug || !repoName) { + return null + } + + const scoreRaw = rawRow['default_score'] + const ossfCriticalityScore = scoreRaw ? parseFloat(scoreRaw as string) : undefined + + return { + projectSlug, + repoName, + repoUrl, + ossfCriticalityScore: Number.isNaN(ossfCriticalityScore) ? undefined : ossfCriticalityScore, + } + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts new file mode 100644 index 0000000000..1c7af148a3 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts @@ -0,0 +1,21 @@ +import { LfCriticalityScoreSource } from './lf-criticality-score/source' +import { OssfCriticalityScoreSource } from './ossf-criticality-score/source' +import { IDiscoverySource } from './types' + +// To add a new source: instantiate it here. 
+const sources: IDiscoverySource[] = [ + new OssfCriticalityScoreSource(), + new LfCriticalityScoreSource(), +] + +export function getSource(name: string): IDiscoverySource { + const source = sources.find((s) => s.name === name) + if (!source) { + throw new Error(`Unknown source: ${name}. Available: ${sources.map((s) => s.name).join(', ')}`) + } + return source +} + +export function getAvailableSourceNames(): string[] { + return sources.map((s) => s.name) +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/types.ts b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts new file mode 100644 index 0000000000..9b386b5da7 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts @@ -0,0 +1,27 @@ +import { Readable } from 'stream' + +export interface IDatasetDescriptor { + id: string + date: string + url: string +} + +export interface IDiscoverySource { + name: string + /** + * 'csv' (default): fetchDatasetStream returns a raw text stream, piped through csv-parse. + * 'json': fetchDatasetStream returns an object-mode Readable that emits pre-parsed records. 
+ */ + format?: 'csv' | 'json' + listAvailableDatasets(): Promise + fetchDatasetStream(dataset: IDatasetDescriptor): Promise + parseRow(rawRow: Record): IDiscoverySourceRow | null +} + +export interface IDiscoverySourceRow { + projectSlug: string + repoName: string + repoUrl: string + ossfCriticalityScore?: number + lfCriticalityScore?: number +} diff --git a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts index f43a9b5a12..00856493d4 100644 --- a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts +++ b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts @@ -1,11 +1,48 @@ -import { proxyActivities } from '@temporalio/workflow' +import { log, proxyActivities } from '@temporalio/workflow' import type * as activities from '../activities' -const activity = proxyActivities({ - startToCloseTimeout: '1 minutes', +const listActivities = proxyActivities({ + startToCloseTimeout: '2 minutes', + retry: { maximumAttempts: 3 }, }) -export async function discoverProjects(): Promise { - await activity.logDiscoveryRun() +// processDataset is long-running (10-20 min for ~119MB / ~750K rows). +const processActivities = proxyActivities({ + startToCloseTimeout: '30 minutes', + retry: { maximumAttempts: 3 }, +}) + +export async function discoverProjects( + input: { mode: 'incremental' | 'full' } = { mode: 'incremental' }, +): Promise { + const { mode } = input + + const sourceNames = await listActivities.listSources() + + for (const sourceName of sourceNames) { + const allDatasets = await listActivities.listDatasets(sourceName) + + if (allDatasets.length === 0) { + log.warn(`No datasets found for source "${sourceName}". Skipping.`) + continue + } + + // allDatasets is sorted newest-first. + // Incremental: process only the latest snapshot. 
+ // Full: process oldest-first so the newest data wins the final upsert. + const datasets = mode === 'incremental' ? [allDatasets[0]] : [...allDatasets].reverse() + + log.info( + `source=${sourceName} mode=${mode}, ${datasets.length}/${allDatasets.length} datasets to process.`, + ) + + for (let i = 0; i < datasets.length; i++) { + const dataset = datasets[i] + log.info(`[${sourceName}] Processing dataset ${i + 1}/${datasets.length}: ${dataset.id}`) + await processActivities.processDataset(sourceName, dataset) + } + + log.info(`[${sourceName}] Done. Processed ${datasets.length} dataset(s).`) + } } diff --git a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts index 5b94e2b8bc..b951e11317 100644 --- a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts +++ b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts @@ -196,8 +196,8 @@ export async function upsertProjectCatalog( ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", - "lfCriticalityScore" = EXCLUDED."lfCriticalityScore", + "ossfCriticalityScore" = COALESCE(EXCLUDED."ossfCriticalityScore", "projectCatalog"."ossfCriticalityScore"), + "lfCriticalityScore" = COALESCE(EXCLUDED."lfCriticalityScore", "projectCatalog"."lfCriticalityScore"), "updatedAt" = NOW() RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} `, @@ -256,8 +256,8 @@ export async function bulkUpsertProjectCatalog( ON CONFLICT ("repoUrl") DO UPDATE SET "projectSlug" = EXCLUDED."projectSlug", "repoName" = EXCLUDED."repoName", - "ossfCriticalityScore" = EXCLUDED."ossfCriticalityScore", - "lfCriticalityScore" = EXCLUDED."lfCriticalityScore", + "ossfCriticalityScore" = COALESCE(EXCLUDED."ossfCriticalityScore", "projectCatalog"."ossfCriticalityScore"), + "lfCriticalityScore" = 
COALESCE(EXCLUDED."lfCriticalityScore", "projectCatalog"."lfCriticalityScore"), "updatedAt" = NOW() `, { values: JSON.stringify(values) }, From 5263af25b2a9fc69f893d35fd80a3b7ee267d91c Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 15:39:00 +0100 Subject: [PATCH 26/33] feat: adding project-discovery-worker Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 97 +++++++++++++++++++ .../automatic-project-discovery-worker.yaml | 64 ++++++++++++ ...kerfile.automatic_project_discovery_worker | 23 +++++ .../package.json | 34 +++++++ .../src/activities.ts | 1 + .../src/activities/activities.ts | 7 ++ .../src/main.ts | 36 +++++++ .../src/schedules/scheduleProjectDiscovery.ts | 42 ++++++++ .../src/workflows.ts | 3 + .../src/workflows/discoverProjects.ts | 11 +++ .../tsconfig.json | 4 + 11 files changed, 322 insertions(+) create mode 100644 scripts/services/automatic-project-discovery-worker.yaml create mode 100644 scripts/services/docker/Dockerfile.automatic_project_discovery_worker create mode 100644 services/apps/automatic_project_discovery_worker/package.json create mode 100644 services/apps/automatic_project_discovery_worker/src/activities.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/activities/activities.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/main.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/workflows.ts create mode 100644 services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts create mode 100644 services/apps/automatic_project_discovery_worker/tsconfig.json diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 052cfd6ade..0904e17f6a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -476,6 +476,58 @@ importers: specifier: ^3.3.3 version: 3.3.3 + services/apps/automatic_project_discovery_worker: + dependencies: + 
'@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../libs/common_services + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/redis': + specifier: workspace:* + version: link:../../libs/redis + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@crowd/types': + specifier: workspace:* + version: link:../../libs/types + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/cache_worker: dependencies: '@crowd/archetype-standard': @@ -10619,6 +10671,51 @@ snapshots: - '@aws-sdk/client-sts' - aws-crt + '@aws-sdk/client-sso-oidc@3.687.0(@aws-sdk/client-sts@3.687.0)': + dependencies: + '@aws-crypto/sha256-browser': 5.2.0 + '@aws-crypto/sha256-js': 5.2.0 + '@aws-sdk/client-sts': 3.687.0 + '@aws-sdk/core': 3.686.0 + '@aws-sdk/credential-provider-node': 3.687.0(@aws-sdk/client-sso-oidc@3.687.0(@aws-sdk/client-sts@3.687.0))(@aws-sdk/client-sts@3.687.0) + '@aws-sdk/middleware-host-header': 3.686.0 + '@aws-sdk/middleware-logger': 3.686.0 + '@aws-sdk/middleware-recursion-detection': 3.686.0 + '@aws-sdk/middleware-user-agent': 3.687.0 + '@aws-sdk/region-config-resolver': 3.686.0 + '@aws-sdk/types': 3.686.0 + '@aws-sdk/util-endpoints': 3.686.0 + 
'@aws-sdk/util-user-agent-browser': 3.686.0 + '@aws-sdk/util-user-agent-node': 3.687.0 + '@smithy/config-resolver': 3.0.10 + '@smithy/core': 2.5.1 + '@smithy/fetch-http-handler': 4.0.0 + '@smithy/hash-node': 3.0.8 + '@smithy/invalid-dependency': 3.0.8 + '@smithy/middleware-content-length': 3.0.10 + '@smithy/middleware-endpoint': 3.2.1 + '@smithy/middleware-retry': 3.0.25 + '@smithy/middleware-serde': 3.0.8 + '@smithy/middleware-stack': 3.0.8 + '@smithy/node-config-provider': 3.1.9 + '@smithy/node-http-handler': 3.2.5 + '@smithy/protocol-http': 4.1.5 + '@smithy/smithy-client': 3.4.2 + '@smithy/types': 3.6.0 + '@smithy/url-parser': 3.0.8 + '@smithy/util-base64': 3.0.0 + '@smithy/util-body-length-browser': 3.0.0 + '@smithy/util-body-length-node': 3.0.0 + '@smithy/util-defaults-mode-browser': 3.0.25 + '@smithy/util-defaults-mode-node': 3.0.25 + '@smithy/util-endpoints': 2.1.4 + '@smithy/util-middleware': 3.0.8 + '@smithy/util-retry': 3.0.8 + '@smithy/util-utf8': 3.0.0 + tslib: 2.6.2 + transitivePeerDependencies: + - aws-crt + '@aws-sdk/client-sso@3.556.0': dependencies: '@aws-crypto/sha256-browser': 3.0.0 diff --git a/scripts/services/automatic-project-discovery-worker.yaml b/scripts/services/automatic-project-discovery-worker.yaml new file mode 100644 index 0000000000..89f2e2abb3 --- /dev/null +++ b/scripts/services/automatic-project-discovery-worker.yaml @@ -0,0 +1,64 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: automatic-project-discovery-worker + CROWD_TEMPORAL_TASKQUEUE: automatic-project-discovery + SHELL: /bin/sh + +services: + automatic-project-discovery-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + command: 'pnpm run start' + working_dir: /usr/crowd/app/services/apps/automatic_project_discovery_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - 
../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + automatic-project-discovery-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + command: 'pnpm run dev' + working_dir: /usr/crowd/app/services/apps/automatic_project_discovery_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: automatic-project-discovery-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/automatic_project_discovery_worker/src:/usr/crowd/app/services/apps/automatic_project_discovery_worker/src + 
+networks: + crowd-bridge: + external: true diff --git a/scripts/services/docker/Dockerfile.automatic_project_discovery_worker b/scripts/services/docker/Dockerfile.automatic_project_discovery_worker new file mode 100644 index 0000000000..9492597b72 --- /dev/null +++ b/scripts/services/docker/Dockerfile.automatic_project_discovery_worker @@ -0,0 +1,23 @@ +FROM node:20-alpine as builder + +RUN apk add --no-cache python3 make g++ + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate + +COPY ./pnpm-workspace.yaml ./pnpm-lock.yaml ./ +RUN pnpm fetch + +COPY ./services ./services +RUN pnpm i --frozen-lockfile + +FROM node:20-bookworm-slim as runner + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate && apt update && apt install -y ca-certificates --no-install-recommends && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/crowd/app/node_modules ./node_modules +COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json +COPY --from=builder /usr/crowd/app/services/libs ./services/libs +COPY --from=builder /usr/crowd/app/services/archetypes/ ./services/archetypes +COPY --from=builder /usr/crowd/app/services/apps/automatic_project_discovery_worker/ ./services/apps/automatic_project_discovery_worker diff --git a/services/apps/automatic_project_discovery_worker/package.json b/services/apps/automatic_project_discovery_worker/package.json new file mode 100644 index 0000000000..a0df2ff8c0 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/package.json @@ -0,0 +1,34 @@ +{ + "name": "@crowd/automatic-project-discovery-worker", + "scripts": { + "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker tsx src/main.ts", + "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", + "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", + "lint": "npx eslint --ext .ts src --max-warnings=0", + "format": "npx prettier --write \"src/**/*.ts\"", + "format-check": "npx prettier --check .", + "tsc-check": "tsc --noEmit" + }, + "dependencies": { + "@crowd/archetype-standard": "workspace:*", + "@crowd/archetype-worker": "workspace:*", + "@crowd/common": "workspace:*", + "@crowd/common_services": "workspace:*", + "@crowd/data-access-layer": "workspace:*", + "@crowd/logging": "workspace:*", + "@crowd/redis": "workspace:*", + "@crowd/temporal": "workspace:*", + "@crowd/types": "workspace:*", + "@temporalio/activity": "~1.11.8", + "@temporalio/client": "~1.11.8", + "@temporalio/workflow": "~1.11.8", + "tsx": "^4.7.1", + "typescript": "^5.6.3" + }, + "devDependencies": { + "@types/node": "^20.8.2", + "nodemon": "^3.0.1" + } +} diff --git a/services/apps/automatic_project_discovery_worker/src/activities.ts b/services/apps/automatic_project_discovery_worker/src/activities.ts new file mode 100644 index 0000000000..3662234550 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/activities.ts @@ -0,0 +1 @@ +export * from './activities/activities' diff --git a/services/apps/automatic_project_discovery_worker/src/activities/activities.ts b/services/apps/automatic_project_discovery_worker/src/activities/activities.ts new file mode 100644 index 0000000000..806f5e5087 --- /dev/null +++ 
b/services/apps/automatic_project_discovery_worker/src/activities/activities.ts @@ -0,0 +1,7 @@ +import { getServiceLogger } from '@crowd/logging' + +const log = getServiceLogger() + +export async function logDiscoveryRun(): Promise { + log.info('Automatic project discovery workflow executed successfully.') +} diff --git a/services/apps/automatic_project_discovery_worker/src/main.ts b/services/apps/automatic_project_discovery_worker/src/main.ts new file mode 100644 index 0000000000..44f3182720 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/main.ts @@ -0,0 +1,36 @@ +import { Config } from '@crowd/archetype-standard' +import { Options, ServiceWorker } from '@crowd/archetype-worker' + +import { scheduleProjectDiscovery } from './schedules/scheduleProjectDiscovery' + +const config: Config = { + envvars: [], + producer: { + enabled: false, + }, + temporal: { + enabled: true, + }, + redis: { + enabled: false, + }, +} + +const options: Options = { + postgres: { + enabled: false, + }, + opensearch: { + enabled: false, + }, +} + +export const svc = new ServiceWorker(config, options) + +setImmediate(async () => { + await svc.init() + + await scheduleProjectDiscovery() + + await svc.start() +}) diff --git a/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts b/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts new file mode 100644 index 0000000000..74e0636b56 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts @@ -0,0 +1,42 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../main' +import { discoverProjects } from '../workflows' + +const DEFAULT_CRON = '0 2 * * *' // Daily at 2:00 AM + +export const scheduleProjectDiscovery = async () => { + const cronExpression = process.env.CROWD_AUTOMATIC_PROJECT_DISCOVERY_CRON || DEFAULT_CRON + + 
svc.log.info(`Scheduling project discovery with cron: ${cronExpression}`) + + try { + await svc.temporal.schedule.create({ + scheduleId: 'automaticProjectDiscovery', + spec: { + cronExpressions: [cronExpression], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 minute', + }, + action: { + type: 'startWorkflow', + workflowType: discoverProjects, + taskQueue: 'automatic-project-discovery', + retry: { + initialInterval: '15 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule already registered in Temporal.') + svc.log.info('Configuration may have changed since. Please make sure they are in sync.') + } else { + throw new Error(err) + } + } +} diff --git a/services/apps/automatic_project_discovery_worker/src/workflows.ts b/services/apps/automatic_project_discovery_worker/src/workflows.ts new file mode 100644 index 0000000000..07b00cee6f --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/workflows.ts @@ -0,0 +1,3 @@ +import { discoverProjects } from './workflows/discoverProjects' + +export { discoverProjects } diff --git a/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts new file mode 100644 index 0000000000..f43a9b5a12 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts @@ -0,0 +1,11 @@ +import { proxyActivities } from '@temporalio/workflow' + +import type * as activities from '../activities' + +const activity = proxyActivities({ + startToCloseTimeout: '1 minutes', +}) + +export async function discoverProjects(): Promise { + await activity.logDiscoveryRun() +} diff --git a/services/apps/automatic_project_discovery_worker/tsconfig.json b/services/apps/automatic_project_discovery_worker/tsconfig.json new file mode 100644 index 
0000000000..bf7f183850 --- /dev/null +++ b/services/apps/automatic_project_discovery_worker/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../base.tsconfig.json", + "include": ["src/**/*"] +} From ed854df0496731895bde7efb2d3ec2b589db5df8 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 16:12:44 +0100 Subject: [PATCH 27/33] feat: add builder Signed-off-by: Umberto Sgueglia --- scripts/builders/automatic-project-discovery-worker.env | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 scripts/builders/automatic-project-discovery-worker.env diff --git a/scripts/builders/automatic-project-discovery-worker.env b/scripts/builders/automatic-project-discovery-worker.env new file mode 100644 index 0000000000..1294c29254 --- /dev/null +++ b/scripts/builders/automatic-project-discovery-worker.env @@ -0,0 +1,4 @@ +DOCKERFILE="./services/docker/Dockerfile.automatic_project_discovery_worker" +CONTEXT="../" +REPO="sjc.ocir.io/axbydjxa5zuh/automatic-project-discovery-worker" +SERVICES="automatic-project-discovery-worker" From 9ccf8c0a32af590624ed487ab144b2708c54a2b7 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 16:28:39 +0100 Subject: [PATCH 28/33] refactor: rename service Signed-off-by: Umberto Sgueglia --- .../automatic-project-discovery-worker.env | 4 ---- .../automatic-projects-discovery-worker.env | 4 ++++ ... 
automatic-projects-discovery-worker.yaml} | 20 +++++++++---------- ...rfile.automatic_projects_discovery_worker} | 2 +- .../package.json | 8 ++++---- .../src/activities.ts | 0 .../src/activities/activities.ts | 2 +- .../src/main.ts | 4 ++-- .../schedules/scheduleProjectsDiscovery.ts} | 10 +++++----- .../src/workflows.ts | 0 .../src/workflows/discoverProjects.ts | 0 .../tsconfig.json | 0 12 files changed, 27 insertions(+), 27 deletions(-) delete mode 100644 scripts/builders/automatic-project-discovery-worker.env create mode 100644 scripts/builders/automatic-projects-discovery-worker.env rename scripts/services/{automatic-project-discovery-worker.yaml => automatic-projects-discovery-worker.yaml} (78%) rename scripts/services/docker/{Dockerfile.automatic_project_discovery_worker => Dockerfile.automatic_projects_discovery_worker} (92%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/package.json (69%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/activities.ts (100%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/activities/activities.ts (65%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/main.ts (81%) rename services/apps/{automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts => automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts} (74%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/workflows.ts (100%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/src/workflows/discoverProjects.ts (100%) rename services/apps/{automatic_project_discovery_worker => automatic_projects_discovery_worker}/tsconfig.json (100%) diff --git a/scripts/builders/automatic-project-discovery-worker.env 
b/scripts/builders/automatic-project-discovery-worker.env deleted file mode 100644 index 1294c29254..0000000000 --- a/scripts/builders/automatic-project-discovery-worker.env +++ /dev/null @@ -1,4 +0,0 @@ -DOCKERFILE="./services/docker/Dockerfile.automatic_project_discovery_worker" -CONTEXT="../" -REPO="sjc.ocir.io/axbydjxa5zuh/automatic-project-discovery-worker" -SERVICES="automatic-project-discovery-worker" diff --git a/scripts/builders/automatic-projects-discovery-worker.env b/scripts/builders/automatic-projects-discovery-worker.env new file mode 100644 index 0000000000..8416386449 --- /dev/null +++ b/scripts/builders/automatic-projects-discovery-worker.env @@ -0,0 +1,4 @@ +DOCKERFILE="./services/docker/Dockerfile.automatic_projects_discovery_worker" +CONTEXT="../" +REPO="sjc.ocir.io/axbydjxa5zuh/automatic-projects-discovery-worker" +SERVICES="automatic-projects-discovery-worker" diff --git a/scripts/services/automatic-project-discovery-worker.yaml b/scripts/services/automatic-projects-discovery-worker.yaml similarity index 78% rename from scripts/services/automatic-project-discovery-worker.yaml rename to scripts/services/automatic-projects-discovery-worker.yaml index 89f2e2abb3..5f3732b7c2 100644 --- a/scripts/services/automatic-project-discovery-worker.yaml +++ b/scripts/services/automatic-projects-discovery-worker.yaml @@ -3,17 +3,17 @@ version: '3.1' x-env-args: &env-args DOCKER_BUILDKIT: 1 NODE_ENV: docker - SERVICE: automatic-project-discovery-worker - CROWD_TEMPORAL_TASKQUEUE: automatic-project-discovery + SERVICE: automatic-projects-discovery-worker + CROWD_TEMPORAL_TASKQUEUE: automatic-projects-discovery SHELL: /bin/sh services: - automatic-project-discovery-worker: + automatic-projects-discovery-worker: build: context: ../../ - dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + dockerfile: ./scripts/services/docker/Dockerfile.automatic_projects_discovery_worker command: 'pnpm run start' - working_dir: 
/usr/crowd/app/services/apps/automatic_project_discovery_worker + working_dir: /usr/crowd/app/services/apps/automatic_projects_discovery_worker env_file: - ../../backend/.env.dist.local - ../../backend/.env.dist.composed @@ -25,12 +25,12 @@ services: networks: - crowd-bridge - automatic-project-discovery-worker-dev: + automatic-projects-discovery-worker-dev: build: context: ../../ - dockerfile: ./scripts/services/docker/Dockerfile.automatic_project_discovery_worker + dockerfile: ./scripts/services/docker/Dockerfile.automatic_projects_discovery_worker command: 'pnpm run dev' - working_dir: /usr/crowd/app/services/apps/automatic_project_discovery_worker + working_dir: /usr/crowd/app/services/apps/automatic_projects_discovery_worker env_file: - ../../backend/.env.dist.local - ../../backend/.env.dist.composed @@ -38,7 +38,7 @@ services: - ../../backend/.env.override.composed environment: <<: *env-args - hostname: automatic-project-discovery-worker + hostname: automatic-projects-discovery-worker networks: - crowd-bridge volumes: @@ -57,7 +57,7 @@ services: - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src - - ../../services/apps/automatic_project_discovery_worker/src:/usr/crowd/app/services/apps/automatic_project_discovery_worker/src + - ../../services/apps/automatic_projects_discovery_worker/src:/usr/crowd/app/services/apps/automatic_projects_discovery_worker/src networks: crowd-bridge: diff --git a/scripts/services/docker/Dockerfile.automatic_project_discovery_worker b/scripts/services/docker/Dockerfile.automatic_projects_discovery_worker similarity index 92% rename from scripts/services/docker/Dockerfile.automatic_project_discovery_worker rename to scripts/services/docker/Dockerfile.automatic_projects_discovery_worker index 9492597b72..860af6601e 100644 --- 
a/scripts/services/docker/Dockerfile.automatic_project_discovery_worker +++ b/scripts/services/docker/Dockerfile.automatic_projects_discovery_worker @@ -20,4 +20,4 @@ COPY --from=builder /usr/crowd/app/node_modules ./node_modules COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json COPY --from=builder /usr/crowd/app/services/libs ./services/libs COPY --from=builder /usr/crowd/app/services/archetypes/ ./services/archetypes -COPY --from=builder /usr/crowd/app/services/apps/automatic_project_discovery_worker/ ./services/apps/automatic_project_discovery_worker +COPY --from=builder /usr/crowd/app/services/apps/automatic_projects_discovery_worker/ ./services/apps/automatic_projects_discovery_worker diff --git a/services/apps/automatic_project_discovery_worker/package.json b/services/apps/automatic_projects_discovery_worker/package.json similarity index 69% rename from services/apps/automatic_project_discovery_worker/package.json rename to services/apps/automatic_projects_discovery_worker/package.json index a0df2ff8c0..1c79505f89 100644 --- a/services/apps/automatic_project_discovery_worker/package.json +++ b/services/apps/automatic_projects_discovery_worker/package.json @@ -1,9 +1,9 @@ { - "name": "@crowd/automatic-project-discovery-worker", + "name": "@crowd/automatic-projects-discovery-worker", "scripts": { - "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker tsx src/main.ts", - "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", - "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-project-discovery SERVICE=automatic-project-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx src/main.ts", + "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", "lint": "npx eslint --ext .ts src --max-warnings=0", diff --git a/services/apps/automatic_project_discovery_worker/src/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities.ts similarity index 100% rename from services/apps/automatic_project_discovery_worker/src/activities.ts rename to services/apps/automatic_projects_discovery_worker/src/activities.ts diff --git a/services/apps/automatic_project_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts similarity index 65% rename from services/apps/automatic_project_discovery_worker/src/activities/activities.ts rename to services/apps/automatic_projects_discovery_worker/src/activities/activities.ts index 806f5e5087..3aea7f8200 100644 --- 
a/services/apps/automatic_project_discovery_worker/src/activities/activities.ts +++ b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -3,5 +3,5 @@ import { getServiceLogger } from '@crowd/logging' const log = getServiceLogger() export async function logDiscoveryRun(): Promise { - log.info('Automatic project discovery workflow executed successfully.') + log.info('Automatic projects discovery workflow executed successfully.') } diff --git a/services/apps/automatic_project_discovery_worker/src/main.ts b/services/apps/automatic_projects_discovery_worker/src/main.ts similarity index 81% rename from services/apps/automatic_project_discovery_worker/src/main.ts rename to services/apps/automatic_projects_discovery_worker/src/main.ts index 44f3182720..326c3a361a 100644 --- a/services/apps/automatic_project_discovery_worker/src/main.ts +++ b/services/apps/automatic_projects_discovery_worker/src/main.ts @@ -1,7 +1,7 @@ import { Config } from '@crowd/archetype-standard' import { Options, ServiceWorker } from '@crowd/archetype-worker' -import { scheduleProjectDiscovery } from './schedules/scheduleProjectDiscovery' +import { scheduleProjectsDiscovery } from './schedules/scheduleProjectsDiscovery' const config: Config = { envvars: [], @@ -30,7 +30,7 @@ export const svc = new ServiceWorker(config, options) setImmediate(async () => { await svc.init() - await scheduleProjectDiscovery() + await scheduleProjectsDiscovery() await svc.start() }) diff --git a/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts similarity index 74% rename from services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts rename to services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts index 74e0636b56..847c2e4ce9 100644 --- 
a/services/apps/automatic_project_discovery_worker/src/schedules/scheduleProjectDiscovery.ts +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -5,14 +5,14 @@ import { discoverProjects } from '../workflows' const DEFAULT_CRON = '0 2 * * *' // Daily at 2:00 AM -export const scheduleProjectDiscovery = async () => { - const cronExpression = process.env.CROWD_AUTOMATIC_PROJECT_DISCOVERY_CRON || DEFAULT_CRON +export const scheduleProjectsDiscovery = async () => { + const cronExpression = process.env.CROWD_AUTOMATIC_PROJECTS_DISCOVERY_CRON || DEFAULT_CRON - svc.log.info(`Scheduling project discovery with cron: ${cronExpression}`) + svc.log.info(`Scheduling projects discovery with cron: ${cronExpression}`) try { await svc.temporal.schedule.create({ - scheduleId: 'automaticProjectDiscovery', + scheduleId: 'automaticProjectsDiscovery', spec: { cronExpressions: [cronExpression], }, @@ -23,7 +23,7 @@ export const scheduleProjectDiscovery = async () => { action: { type: 'startWorkflow', workflowType: discoverProjects, - taskQueue: 'automatic-project-discovery', + taskQueue: 'automatic-projects-discovery', retry: { initialInterval: '15 seconds', backoffCoefficient: 2, diff --git a/services/apps/automatic_project_discovery_worker/src/workflows.ts b/services/apps/automatic_projects_discovery_worker/src/workflows.ts similarity index 100% rename from services/apps/automatic_project_discovery_worker/src/workflows.ts rename to services/apps/automatic_projects_discovery_worker/src/workflows.ts diff --git a/services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts similarity index 100% rename from services/apps/automatic_project_discovery_worker/src/workflows/discoverProjects.ts rename to services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts diff --git 
a/services/apps/automatic_project_discovery_worker/tsconfig.json b/services/apps/automatic_projects_discovery_worker/tsconfig.json similarity index 100% rename from services/apps/automatic_project_discovery_worker/tsconfig.json rename to services/apps/automatic_projects_discovery_worker/tsconfig.json From 6c2bd7523f95678ca0b48b271bf8fe49194e5929 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 9 Feb 2026 16:35:53 +0100 Subject: [PATCH 29/33] fix: push lock file Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 138 +++++++++++++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 51 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0904e17f6a..495c278fba 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -528,6 +528,58 @@ importers: specifier: ^3.0.1 version: 3.1.0 + services/apps/automatic_projects_discovery_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../libs/common_services + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/redis': + specifier: workspace:* + version: link:../../libs/redis + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@crowd/types': + specifier: workspace:* + version: link:../../libs/types + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + 
specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/cache_worker: dependencies: '@crowd/archetype-standard': @@ -5627,10 +5679,6 @@ packages: brace-expansion@2.0.1: resolution: {integrity: sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==} - braces@3.0.2: - resolution: {integrity: sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==} - engines: {node: '>=8'} - braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} engines: {node: '>=8'} @@ -6805,10 +6853,6 @@ packages: file-uri-to-path@1.0.0: resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==} - fill-range@7.0.1: - resolution: {integrity: sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==} - engines: {node: '>=8'} - fill-range@7.1.1: resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==} engines: {node: '>=8'} @@ -11885,7 +11929,7 @@ snapshots: '@babel/traverse': 7.24.1 '@babel/types': 7.24.0 convert-source-map: 2.0.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -11946,7 +11990,7 @@ snapshots: '@babel/core': 7.24.4 '@babel/helper-compilation-targets': 7.23.6 '@babel/helper-plugin-utils': 7.24.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) lodash.debounce: 4.0.8 resolve: 1.22.8 transitivePeerDependencies: @@ -12613,7 +12657,7 @@ snapshots: '@babel/helper-split-export-declaration': 7.22.6 '@babel/parser': 7.24.4 '@babel/types': 7.24.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -12925,7 +12969,7 @@ snapshots: '@eslint/eslintrc@2.1.4': dependencies: ajv: 6.12.6 - debug: 
4.3.4(supports-color@5.5.0) + debug: 4.3.4 espree: 9.6.1 globals: 13.24.0 ignore: 5.3.1 @@ -13055,7 +13099,7 @@ snapshots: '@humanwhocodes/config-array@0.11.14': dependencies: '@humanwhocodes/object-schema': 2.0.3 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 minimatch: 3.1.2 transitivePeerDependencies: - supports-color @@ -13457,7 +13501,7 @@ snapshots: '@opensearch-project/opensearch@2.11.0': dependencies: aws4: 1.12.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 hpagent: 1.2.0 json11: 1.1.2 ms: 2.1.3 @@ -14292,7 +14336,7 @@ snapshots: '@superfaceai/parser': 1.2.0 abort-controller: 3.0.0 cross-fetch: 3.1.8(encoding@0.1.13) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 isomorphic-form-data: 2.0.0 vm2: 3.9.19 transitivePeerDependencies: @@ -14303,7 +14347,7 @@ snapshots: dependencies: '@superfaceai/ast': 1.2.0 '@types/debug': 4.1.12 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) typescript: 4.9.5 transitivePeerDependencies: - supports-color @@ -14664,7 +14708,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/type-utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 graphemer: 1.4.0 ignore: 5.3.1 @@ -14684,7 +14728,7 @@ snapshots: '@typescript-eslint/type-utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 graphemer: 1.4.0 ignore: 5.3.1 @@ -14701,7 +14745,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/types': 5.62.0 '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 optionalDependencies: typescript: 5.6.3 @@ -14714,7 +14758,7 @@ snapshots: '@typescript-eslint/types': 6.21.0 
'@typescript-eslint/typescript-estree': 6.21.0(typescript@5.6.3) '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 optionalDependencies: typescript: 5.6.3 @@ -14735,7 +14779,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) eslint: 8.57.0 tsutils: 3.21.0(typescript@5.6.3) optionalDependencies: @@ -14747,7 +14791,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 6.21.0(typescript@5.6.3) '@typescript-eslint/utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 ts-api-utils: 1.3.0(typescript@5.6.3) optionalDependencies: @@ -14763,7 +14807,7 @@ snapshots: dependencies: '@typescript-eslint/types': 5.62.0 '@typescript-eslint/visitor-keys': 5.62.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) globby: 11.1.0 is-glob: 4.0.3 semver: 7.6.0 @@ -14777,7 +14821,7 @@ snapshots: dependencies: '@typescript-eslint/types': 6.21.0 '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 globby: 11.1.0 is-glob: 4.0.3 minimatch: 9.0.3 @@ -14966,13 +15010,13 @@ snapshots: agent-base@6.0.2: dependencies: - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color agent-base@7.1.1: dependencies: - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -15422,10 +15466,6 @@ snapshots: dependencies: balanced-match: 1.0.2 - braces@3.0.2: - dependencies: - fill-range: 7.0.1 - braces@3.0.3: dependencies: fill-range: 7.1.1 @@ -15572,7 +15612,7 @@ snapshots: chokidar@3.6.0: dependencies: anymatch: 3.1.3 - braces: 3.0.2 + braces: 3.0.3 glob-parent: 5.1.2 is-binary-path: 2.1.0 is-glob: 4.0.3 @@ -15983,19 +16023,19 @@ snapshots: optionalDependencies: supports-color: 5.5.0 - 
debug@4.3.4(supports-color@5.5.0): + debug@4.3.4: dependencies: ms: 2.1.2 - optionalDependencies: - supports-color: 5.5.0 debug@4.3.7: dependencies: ms: 2.1.3 - debug@4.4.0: + debug@4.4.0(supports-color@5.5.0): dependencies: ms: 2.1.3 + optionalDependencies: + supports-color: 5.5.0 decamelize@1.2.0: {} @@ -16557,7 +16597,7 @@ snapshots: ajv: 6.12.6 chalk: 4.1.2 cross-spawn: 7.0.3 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 doctrine: 3.0.0 escape-string-regexp: 4.0.0 eslint-scope: 7.2.2 @@ -16847,10 +16887,6 @@ snapshots: file-uri-to-path@1.0.0: {} - fill-range@7.0.1: - dependencies: - to-regex-range: 5.0.1 - fill-range@7.1.1: dependencies: to-regex-range: 5.0.1 @@ -17455,7 +17491,7 @@ snapshots: dependencies: '@tootallnate/once': 2.0.0 agent-base: 6.0.2 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -17471,14 +17507,14 @@ snapshots: https-proxy-agent@5.0.1: dependencies: agent-base: 6.0.2 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color https-proxy-agent@7.0.4: dependencies: agent-base: 7.1.1 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -17888,7 +17924,7 @@ snapshots: dependencies: '@types/express': 4.17.21 '@types/jsonwebtoken': 9.0.6 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 jose: 4.15.5 limiter: 1.1.5 lru-memoizer: 2.2.0 @@ -17948,7 +17984,7 @@ snapshots: dependencies: chalk: 5.4.1 commander: 13.1.0 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) execa: 8.0.1 lilconfig: 3.1.3 listr2: 8.2.5 @@ -18424,7 +18460,7 @@ snapshots: nodemon@3.1.0: dependencies: chokidar: 3.6.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.4.0(supports-color@5.5.0) ignore-by-default: 1.0.1 minimatch: 3.1.2 pstree.remy: 1.1.8 @@ -19136,7 +19172,7 @@ snapshots: command-line-usage: 6.1.3 config: 3.3.11 configstore: 5.0.1 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 editor: 1.0.0 enquirer: 2.4.1 form-data: 
4.0.0 @@ -19302,7 +19338,7 @@ snapshots: retry-request@4.2.2: dependencies: - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) extend: 3.0.2 transitivePeerDependencies: - supports-color @@ -19492,7 +19528,7 @@ snapshots: dependencies: '@types/debug': 4.1.12 '@types/validator': 13.11.9 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 dottie: 2.0.6 inflection: 1.13.4 lodash: 4.17.21 @@ -19740,7 +19776,7 @@ snapshots: accepts: 1.3.8 base64id: 2.0.0 cors: 2.8.5 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 engine.io: 6.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-adapter: 2.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-parser: 4.2.4 @@ -19898,7 +19934,7 @@ snapshots: dependencies: component-emitter: 1.3.1 cookiejar: 2.1.4 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 fast-safe-stringify: 2.1.1 form-data: 4.0.0 formidable: 2.1.2 From 87bb54048a154dd9cd80e2ac063d4a41be670edf Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 11 Feb 2026 10:40:07 +0100 Subject: [PATCH 30/33] feat: add migrations (CM-950) (#3835) Signed-off-by: Umberto Sgueglia --- ...dd-automatic_projects_discovery-tables.sql | 10 +++++ ...dd-automatic_projects_discovery-tables.sql | 40 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql create mode 100644 backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql diff --git a/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql new file mode 100644 index 0000000000..879b28e1a2 --- /dev/null +++ b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql @@ -0,0 +1,10 @@ +DROP INDEX IF EXISTS "ix_evaluatedProjects_onboarded"; +DROP INDEX IF EXISTS "ix_evaluatedProjects_evaluationScore"; +DROP INDEX IF EXISTS 
"ix_evaluatedProjects_evaluationStatus"; +DROP INDEX IF EXISTS "uix_evaluatedProjects_projectCatalogId"; +DROP TABLE IF EXISTS "evaluatedProjects"; + +DROP INDEX IF EXISTS "ix_projectCatalog_syncedAt"; +DROP INDEX IF EXISTS "ix_projectCatalog_criticalityScore"; +DROP INDEX IF EXISTS "uix_projectCatalog_repoUrl"; +DROP TABLE IF EXISTS "projectCatalog"; diff --git a/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql new file mode 100644 index 0000000000..53697ce5ce --- /dev/null +++ b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql @@ -0,0 +1,40 @@ +-- Project Catalog: candidate projects discovered from OSSF Criticality Score and other sources +CREATE TABLE IF NOT EXISTS "projectCatalog" ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + "projectSlug" VARCHAR(255) NOT NULL, + "repoName" VARCHAR(255) NOT NULL, + "repoUrl" VARCHAR(1024) NOT NULL, + "criticalityScore" DOUBLE PRECISION, + "syncedAt" TIMESTAMP WITH TIME ZONE DEFAULT NULL, + "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE UNIQUE INDEX "uix_projectCatalog_repoUrl" ON "projectCatalog" ("repoUrl"); +CREATE INDEX "ix_projectCatalog_criticalityScore" ON "projectCatalog" ("criticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_syncedAt" ON "projectCatalog" ("syncedAt"); + +-- Evaluated Projects: AI evaluation results linked to catalog entries +CREATE TABLE IF NOT EXISTS "evaluatedProjects" ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + "projectCatalogId" UUID NOT NULL REFERENCES "projectCatalog"(id) ON DELETE CASCADE, + "evaluationStatus" VARCHAR(50) NOT NULL DEFAULT 'pending', + "evaluationScore" DOUBLE PRECISION, + "evaluation" JSONB, + "evaluationReason" TEXT, + "evaluatedAt" TIMESTAMP WITH TIME ZONE, + "starsCount" INTEGER, + 
"forksCount" INTEGER, + "commitsCount" INTEGER, + "pullRequestsCount" INTEGER, + "issuesCount" INTEGER, + "onboarded" BOOLEAN NOT NULL DEFAULT FALSE, + "onboardedAt" TIMESTAMP WITH TIME ZONE, + "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE UNIQUE INDEX "uix_evaluatedProjects_projectCatalogId" ON "evaluatedProjects" ("projectCatalogId"); +CREATE INDEX "ix_evaluatedProjects_evaluationStatus" ON "evaluatedProjects" ("evaluationStatus"); +CREATE INDEX "ix_evaluatedProjects_evaluationScore" ON "evaluatedProjects" ("evaluationScore" DESC NULLS LAST); +CREATE INDEX "ix_evaluatedProjects_onboarded" ON "evaluatedProjects" ("onboarded"); From cd4ec1bef21a9342e1f84b1940f1b619d470ddaf Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Fri, 20 Feb 2026 15:56:45 +0100 Subject: [PATCH 31/33] fix: add lf-criticality-score Signed-off-by: Umberto Sgueglia --- ...U1770653666__add-automatic_projects_discovery-tables.sql | 3 ++- ...V1770653666__add-automatic_projects_discovery-tables.sql | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql index 879b28e1a2..a32dbe9a91 100644 --- a/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql +++ b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql @@ -5,6 +5,7 @@ DROP INDEX IF EXISTS "uix_evaluatedProjects_projectCatalogId"; DROP TABLE IF EXISTS "evaluatedProjects"; DROP INDEX IF EXISTS "ix_projectCatalog_syncedAt"; -DROP INDEX IF EXISTS "ix_projectCatalog_criticalityScore"; +DROP INDEX IF EXISTS "ix_projectCatalog_lfCriticalityScore"; +DROP INDEX IF EXISTS "ix_projectCatalog_ossfCriticalityScore"; DROP INDEX IF EXISTS "uix_projectCatalog_repoUrl"; DROP TABLE IF EXISTS 
"projectCatalog"; diff --git a/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql index 53697ce5ce..c2add79aae 100644 --- a/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql +++ b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql @@ -4,14 +4,16 @@ CREATE TABLE IF NOT EXISTS "projectCatalog" ( "projectSlug" VARCHAR(255) NOT NULL, "repoName" VARCHAR(255) NOT NULL, "repoUrl" VARCHAR(1024) NOT NULL, - "criticalityScore" DOUBLE PRECISION, + "ossfCriticalityScore" DOUBLE PRECISION, + "lfCriticalityScore" DOUBLE PRECISION, "syncedAt" TIMESTAMP WITH TIME ZONE DEFAULT NULL, "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP ); CREATE UNIQUE INDEX "uix_projectCatalog_repoUrl" ON "projectCatalog" ("repoUrl"); -CREATE INDEX "ix_projectCatalog_criticalityScore" ON "projectCatalog" ("criticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_ossfCriticalityScore" ON "projectCatalog" ("ossfCriticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_lfCriticalityScore" ON "projectCatalog" ("lfCriticalityScore" DESC NULLS LAST); CREATE INDEX "ix_projectCatalog_syncedAt" ON "projectCatalog" ("syncedAt"); -- Evaluated Projects: AI evaluation results linked to catalog entries From b765959ec44be05b34c7034b83c798630d14e910 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 24 Mar 2026 11:26:19 +0100 Subject: [PATCH 32/33] fix: add dependencies Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 495c278fba..ef062f9f07 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -16031,6 +16031,10 @@ snapshots: dependencies: ms: 2.1.3 + debug@4.4.0: + dependencies: + ms: 2.1.3 + 
debug@4.4.0(supports-color@5.5.0): dependencies: ms: 2.1.3 From 75cb4b6c2a1ec5e774975ee4a022bea54a6cf79a Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 26 Mar 2026 10:57:12 +0100 Subject: [PATCH 33/33] fix: fake url as placeholder Signed-off-by: Umberto Sgueglia --- .../src/sources/lf-criticality-score/source.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts index 4738318454..804b686f65 100644 --- a/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts +++ b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts @@ -8,7 +8,7 @@ import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../ty const log = getServiceLogger() -const DEFAULT_API_URL = 'https://hypervascular-nonduplicative-vern.ngrok-free.dev' +const DEFAULT_API_URL = 'https://lf-criticality-score-api.example.com' const PAGE_SIZE = 100 interface LfApiResponse {