diff --git a/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql new file mode 100644 index 0000000000..a32dbe9a91 --- /dev/null +++ b/backend/src/database/migrations/U1770653666__add-automatic_projects_discovery-tables.sql @@ -0,0 +1,11 @@ +DROP INDEX IF EXISTS "ix_evaluatedProjects_onboarded"; +DROP INDEX IF EXISTS "ix_evaluatedProjects_evaluationScore"; +DROP INDEX IF EXISTS "ix_evaluatedProjects_evaluationStatus"; +DROP INDEX IF EXISTS "uix_evaluatedProjects_projectCatalogId"; +DROP TABLE IF EXISTS "evaluatedProjects"; + +DROP INDEX IF EXISTS "ix_projectCatalog_syncedAt"; +DROP INDEX IF EXISTS "ix_projectCatalog_lfCriticalityScore"; +DROP INDEX IF EXISTS "ix_projectCatalog_ossfCriticalityScore"; +DROP INDEX IF EXISTS "uix_projectCatalog_repoUrl"; +DROP TABLE IF EXISTS "projectCatalog"; diff --git a/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql new file mode 100644 index 0000000000..c2add79aae --- /dev/null +++ b/backend/src/database/migrations/V1770653666__add-automatic_projects_discovery-tables.sql @@ -0,0 +1,42 @@ +-- Project Catalog: candidate projects discovered from OSSF Criticality Score and other sources +CREATE TABLE IF NOT EXISTS "projectCatalog" ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + "projectSlug" VARCHAR(255) NOT NULL, + "repoName" VARCHAR(255) NOT NULL, + "repoUrl" VARCHAR(1024) NOT NULL, + "ossfCriticalityScore" DOUBLE PRECISION, + "lfCriticalityScore" DOUBLE PRECISION, + "syncedAt" TIMESTAMP WITH TIME ZONE DEFAULT NULL, + "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE UNIQUE INDEX "uix_projectCatalog_repoUrl" ON "projectCatalog" ("repoUrl"); +CREATE INDEX 
"ix_projectCatalog_ossfCriticalityScore" ON "projectCatalog" ("ossfCriticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_lfCriticalityScore" ON "projectCatalog" ("lfCriticalityScore" DESC NULLS LAST); +CREATE INDEX "ix_projectCatalog_syncedAt" ON "projectCatalog" ("syncedAt"); + +-- Evaluated Projects: AI evaluation results linked to catalog entries +CREATE TABLE IF NOT EXISTS "evaluatedProjects" ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + "projectCatalogId" UUID NOT NULL REFERENCES "projectCatalog"(id) ON DELETE CASCADE, + "evaluationStatus" VARCHAR(50) NOT NULL DEFAULT 'pending', + "evaluationScore" DOUBLE PRECISION, + "evaluation" JSONB, + "evaluationReason" TEXT, + "evaluatedAt" TIMESTAMP WITH TIME ZONE, + "starsCount" INTEGER, + "forksCount" INTEGER, + "commitsCount" INTEGER, + "pullRequestsCount" INTEGER, + "issuesCount" INTEGER, + "onboarded" BOOLEAN NOT NULL DEFAULT FALSE, + "onboardedAt" TIMESTAMP WITH TIME ZONE, + "createdAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP +); + +CREATE UNIQUE INDEX "uix_evaluatedProjects_projectCatalogId" ON "evaluatedProjects" ("projectCatalogId"); +CREATE INDEX "ix_evaluatedProjects_evaluationStatus" ON "evaluatedProjects" ("evaluationStatus"); +CREATE INDEX "ix_evaluatedProjects_evaluationScore" ON "evaluatedProjects" ("evaluationScore" DESC NULLS LAST); +CREATE INDEX "ix_evaluatedProjects_onboarded" ON "evaluatedProjects" ("onboarded"); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 052cfd6ade..616fea2b48 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -476,6 +476,113 @@ importers: specifier: ^3.3.3 version: 3.3.3 + services/apps/automatic_project_discovery_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + 
version: link:../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../libs/common_services + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/redis': + specifier: workspace:* + version: link:../../libs/redis + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@crowd/types': + specifier: workspace:* + version: link:../../libs/types + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + + services/apps/automatic_projects_discovery_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/common_services': + specifier: workspace:* + version: link:../../libs/common_services + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/redis': + specifier: workspace:* + version: link:../../libs/redis + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@crowd/types': + specifier: workspace:* + version: link:../../libs/types + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + 
csv-parse: + specifier: ^5.5.6 + version: 5.5.6 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/cache_worker: dependencies: '@crowd/archetype-standard': @@ -5575,10 +5682,6 @@ packages: brace-expansion@2.0.1: resolution: {integrity: sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==} - braces@3.0.2: - resolution: {integrity: sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==} - engines: {node: '>=8'} - braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} engines: {node: '>=8'} @@ -6753,10 +6856,6 @@ packages: file-uri-to-path@1.0.0: resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==} - fill-range@7.0.1: - resolution: {integrity: sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==} - engines: {node: '>=8'} - fill-range@7.1.1: resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==} engines: {node: '>=8'} @@ -10619,6 +10718,51 @@ snapshots: - '@aws-sdk/client-sts' - aws-crt + '@aws-sdk/client-sso-oidc@3.687.0(@aws-sdk/client-sts@3.687.0)': + dependencies: + '@aws-crypto/sha256-browser': 5.2.0 + '@aws-crypto/sha256-js': 5.2.0 + '@aws-sdk/client-sts': 3.687.0 + '@aws-sdk/core': 3.686.0 + '@aws-sdk/credential-provider-node': 3.687.0(@aws-sdk/client-sso-oidc@3.687.0(@aws-sdk/client-sts@3.687.0))(@aws-sdk/client-sts@3.687.0) + '@aws-sdk/middleware-host-header': 3.686.0 + '@aws-sdk/middleware-logger': 3.686.0 + '@aws-sdk/middleware-recursion-detection': 3.686.0 + '@aws-sdk/middleware-user-agent': 3.687.0 + '@aws-sdk/region-config-resolver': 
3.686.0 + '@aws-sdk/types': 3.686.0 + '@aws-sdk/util-endpoints': 3.686.0 + '@aws-sdk/util-user-agent-browser': 3.686.0 + '@aws-sdk/util-user-agent-node': 3.687.0 + '@smithy/config-resolver': 3.0.10 + '@smithy/core': 2.5.1 + '@smithy/fetch-http-handler': 4.0.0 + '@smithy/hash-node': 3.0.8 + '@smithy/invalid-dependency': 3.0.8 + '@smithy/middleware-content-length': 3.0.10 + '@smithy/middleware-endpoint': 3.2.1 + '@smithy/middleware-retry': 3.0.25 + '@smithy/middleware-serde': 3.0.8 + '@smithy/middleware-stack': 3.0.8 + '@smithy/node-config-provider': 3.1.9 + '@smithy/node-http-handler': 3.2.5 + '@smithy/protocol-http': 4.1.5 + '@smithy/smithy-client': 3.4.2 + '@smithy/types': 3.6.0 + '@smithy/url-parser': 3.0.8 + '@smithy/util-base64': 3.0.0 + '@smithy/util-body-length-browser': 3.0.0 + '@smithy/util-body-length-node': 3.0.0 + '@smithy/util-defaults-mode-browser': 3.0.25 + '@smithy/util-defaults-mode-node': 3.0.25 + '@smithy/util-endpoints': 2.1.4 + '@smithy/util-middleware': 3.0.8 + '@smithy/util-retry': 3.0.8 + '@smithy/util-utf8': 3.0.0 + tslib: 2.6.2 + transitivePeerDependencies: + - aws-crt + '@aws-sdk/client-sso@3.556.0': dependencies: '@aws-crypto/sha256-browser': 3.0.0 @@ -11788,7 +11932,7 @@ snapshots: '@babel/traverse': 7.24.1 '@babel/types': 7.24.0 convert-source-map: 2.0.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -11849,7 +11993,7 @@ snapshots: '@babel/core': 7.24.4 '@babel/helper-compilation-targets': 7.23.6 '@babel/helper-plugin-utils': 7.24.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) lodash.debounce: 4.0.8 resolve: 1.22.8 transitivePeerDependencies: @@ -12516,7 +12660,7 @@ snapshots: '@babel/helper-split-export-declaration': 7.22.6 '@babel/parser': 7.24.4 '@babel/types': 7.24.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -12828,7 +12972,7 @@ snapshots: '@eslint/eslintrc@2.1.4': dependencies: ajv: 6.12.6 - 
debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 espree: 9.6.1 globals: 13.24.0 ignore: 5.3.1 @@ -12958,7 +13102,7 @@ snapshots: '@humanwhocodes/config-array@0.11.14': dependencies: '@humanwhocodes/object-schema': 2.0.3 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 minimatch: 3.1.2 transitivePeerDependencies: - supports-color @@ -13360,7 +13504,7 @@ snapshots: '@opensearch-project/opensearch@2.11.0': dependencies: aws4: 1.12.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 hpagent: 1.2.0 json11: 1.1.2 ms: 2.1.3 @@ -14195,7 +14339,7 @@ snapshots: '@superfaceai/parser': 1.2.0 abort-controller: 3.0.0 cross-fetch: 3.1.8(encoding@0.1.13) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 isomorphic-form-data: 2.0.0 vm2: 3.9.19 transitivePeerDependencies: @@ -14206,7 +14350,7 @@ snapshots: dependencies: '@superfaceai/ast': 1.2.0 '@types/debug': 4.1.12 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) typescript: 4.9.5 transitivePeerDependencies: - supports-color @@ -14567,7 +14711,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/type-utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 graphemer: 1.4.0 ignore: 5.3.1 @@ -14587,7 +14731,7 @@ snapshots: '@typescript-eslint/type-utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 graphemer: 1.4.0 ignore: 5.3.1 @@ -14604,7 +14748,7 @@ snapshots: '@typescript-eslint/scope-manager': 5.62.0 '@typescript-eslint/types': 5.62.0 '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 optionalDependencies: typescript: 5.6.3 @@ -14617,7 +14761,7 @@ snapshots: '@typescript-eslint/types': 6.21.0 
'@typescript-eslint/typescript-estree': 6.21.0(typescript@5.6.3) '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 optionalDependencies: typescript: 5.6.3 @@ -14638,7 +14782,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 5.62.0(typescript@5.6.3) '@typescript-eslint/utils': 5.62.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) eslint: 8.57.0 tsutils: 3.21.0(typescript@5.6.3) optionalDependencies: @@ -14650,7 +14794,7 @@ snapshots: dependencies: '@typescript-eslint/typescript-estree': 6.21.0(typescript@5.6.3) '@typescript-eslint/utils': 6.21.0(eslint@8.57.0)(typescript@5.6.3) - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 eslint: 8.57.0 ts-api-utils: 1.3.0(typescript@5.6.3) optionalDependencies: @@ -14666,7 +14810,7 @@ snapshots: dependencies: '@typescript-eslint/types': 5.62.0 '@typescript-eslint/visitor-keys': 5.62.0 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) globby: 11.1.0 is-glob: 4.0.3 semver: 7.6.0 @@ -14680,7 +14824,7 @@ snapshots: dependencies: '@typescript-eslint/types': 6.21.0 '@typescript-eslint/visitor-keys': 6.21.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 globby: 11.1.0 is-glob: 4.0.3 minimatch: 9.0.3 @@ -14869,13 +15013,13 @@ snapshots: agent-base@6.0.2: dependencies: - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color agent-base@7.1.1: dependencies: - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -15325,10 +15469,6 @@ snapshots: dependencies: balanced-match: 1.0.2 - braces@3.0.2: - dependencies: - fill-range: 7.0.1 - braces@3.0.3: dependencies: fill-range: 7.1.1 @@ -15475,7 +15615,7 @@ snapshots: chokidar@3.6.0: dependencies: anymatch: 3.1.3 - braces: 3.0.2 + braces: 3.0.3 glob-parent: 5.1.2 is-binary-path: 2.1.0 is-glob: 4.0.3 @@ -15886,11 +16026,9 @@ snapshots: optionalDependencies: supports-color: 5.5.0 - 
debug@4.3.4(supports-color@5.5.0): + debug@4.3.4: dependencies: ms: 2.1.2 - optionalDependencies: - supports-color: 5.5.0 debug@4.3.7: dependencies: @@ -15900,6 +16038,12 @@ snapshots: dependencies: ms: 2.1.3 + debug@4.4.0(supports-color@5.5.0): + dependencies: + ms: 2.1.3 + optionalDependencies: + supports-color: 5.5.0 + decamelize@1.2.0: {} decompress-response@3.3.0: @@ -16460,7 +16604,7 @@ snapshots: ajv: 6.12.6 chalk: 4.1.2 cross-spawn: 7.0.3 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 doctrine: 3.0.0 escape-string-regexp: 4.0.0 eslint-scope: 7.2.2 @@ -16750,10 +16894,6 @@ snapshots: file-uri-to-path@1.0.0: {} - fill-range@7.0.1: - dependencies: - to-regex-range: 5.0.1 - fill-range@7.1.1: dependencies: to-regex-range: 5.0.1 @@ -17358,7 +17498,7 @@ snapshots: dependencies: '@tootallnate/once': 2.0.0 agent-base: 6.0.2 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -17374,14 +17514,14 @@ snapshots: https-proxy-agent@5.0.1: dependencies: agent-base: 6.0.2 - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color https-proxy-agent@7.0.4: dependencies: agent-base: 7.1.1 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) transitivePeerDependencies: - supports-color @@ -17791,7 +17931,7 @@ snapshots: dependencies: '@types/express': 4.17.21 '@types/jsonwebtoken': 9.0.6 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 jose: 4.15.5 limiter: 1.1.5 lru-memoizer: 2.2.0 @@ -17851,7 +17991,7 @@ snapshots: dependencies: chalk: 5.4.1 commander: 13.1.0 - debug: 4.4.0 + debug: 4.4.0(supports-color@5.5.0) execa: 8.0.1 lilconfig: 3.1.3 listr2: 8.2.5 @@ -18327,7 +18467,7 @@ snapshots: nodemon@3.1.0: dependencies: chokidar: 3.6.0 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.4.0(supports-color@5.5.0) ignore-by-default: 1.0.1 minimatch: 3.1.2 pstree.remy: 1.1.8 @@ -19039,7 +19179,7 @@ snapshots: command-line-usage: 6.1.3 config: 3.3.11 configstore: 5.0.1 - debug: 
4.3.4(supports-color@5.5.0) + debug: 4.3.4 editor: 1.0.0 enquirer: 2.4.1 form-data: 4.0.0 @@ -19205,7 +19345,7 @@ snapshots: retry-request@4.2.2: dependencies: - debug: 4.3.7 + debug: 4.4.0(supports-color@5.5.0) extend: 3.0.2 transitivePeerDependencies: - supports-color @@ -19395,7 +19535,7 @@ snapshots: dependencies: '@types/debug': 4.1.12 '@types/validator': 13.11.9 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 dottie: 2.0.6 inflection: 1.13.4 lodash: 4.17.21 @@ -19643,7 +19783,7 @@ snapshots: accepts: 1.3.8 base64id: 2.0.0 cors: 2.8.5 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 engine.io: 6.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-adapter: 2.5.4(bufferutil@4.0.8)(utf-8-validate@5.0.10) socket.io-parser: 4.2.4 @@ -19801,7 +19941,7 @@ snapshots: dependencies: component-emitter: 1.3.1 cookiejar: 2.1.4 - debug: 4.3.4(supports-color@5.5.0) + debug: 4.3.4 fast-safe-stringify: 2.1.1 form-data: 4.0.0 formidable: 2.1.2 diff --git a/scripts/builders/automatic-projects-discovery-worker.env b/scripts/builders/automatic-projects-discovery-worker.env new file mode 100644 index 0000000000..8416386449 --- /dev/null +++ b/scripts/builders/automatic-projects-discovery-worker.env @@ -0,0 +1,4 @@ +DOCKERFILE="./services/docker/Dockerfile.automatic_projects_discovery_worker" +CONTEXT="../" +REPO="sjc.ocir.io/axbydjxa5zuh/automatic-projects-discovery-worker" +SERVICES="automatic-projects-discovery-worker" diff --git a/scripts/services/automatic-projects-discovery-worker.yaml b/scripts/services/automatic-projects-discovery-worker.yaml new file mode 100644 index 0000000000..5f3732b7c2 --- /dev/null +++ b/scripts/services/automatic-projects-discovery-worker.yaml @@ -0,0 +1,64 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: automatic-projects-discovery-worker + CROWD_TEMPORAL_TASKQUEUE: automatic-projects-discovery + SHELL: /bin/sh + +services: + automatic-projects-discovery-worker: + build: + context: ../../ + 
dockerfile: ./scripts/services/docker/Dockerfile.automatic_projects_discovery_worker + command: 'pnpm run start' + working_dir: /usr/crowd/app/services/apps/automatic_projects_discovery_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + automatic-projects-discovery-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.automatic_projects_discovery_worker + command: 'pnpm run dev' + working_dir: /usr/crowd/app/services/apps/automatic_projects_discovery_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: automatic-projects-discovery-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - 
../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/automatic_projects_discovery_worker/src:/usr/crowd/app/services/apps/automatic_projects_discovery_worker/src + +networks: + crowd-bridge: + external: true diff --git a/scripts/services/docker/Dockerfile.automatic_projects_discovery_worker b/scripts/services/docker/Dockerfile.automatic_projects_discovery_worker new file mode 100644 index 0000000000..860af6601e --- /dev/null +++ b/scripts/services/docker/Dockerfile.automatic_projects_discovery_worker @@ -0,0 +1,23 @@ +FROM node:20-alpine as builder + +RUN apk add --no-cache python3 make g++ + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate + +COPY ./pnpm-workspace.yaml ./pnpm-lock.yaml ./ +RUN pnpm fetch + +COPY ./services ./services +RUN pnpm i --frozen-lockfile + +FROM node:20-bookworm-slim as runner + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate && apt update && apt install -y ca-certificates --no-install-recommends && rm -rf /var/lib/apt/lists/* + +COPY --from=builder /usr/crowd/app/node_modules ./node_modules +COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json +COPY --from=builder /usr/crowd/app/services/libs ./services/libs +COPY --from=builder /usr/crowd/app/services/archetypes/ ./services/archetypes +COPY --from=builder /usr/crowd/app/services/apps/automatic_projects_discovery_worker/ ./services/apps/automatic_projects_discovery_worker diff --git a/services/apps/automatic_projects_discovery_worker/README.md b/services/apps/automatic_projects_discovery_worker/README.md new file mode 100644 index 0000000000..77623513cc --- /dev/null +++ 
b/services/apps/automatic_projects_discovery_worker/README.md @@ -0,0 +1,73 @@ +# Automatic Projects Discovery Worker + +Temporal worker that discovers open-source projects from external data sources and writes them to the `projectCatalog` table. + +## Architecture + +### Source abstraction + +Every data source implements the `IDiscoverySource` interface (`src/sources/types.ts`): + +| Method | Purpose | +| ----------------------------- | --------------------------------------------------------------------------- | +| `listAvailableDatasets()` | Returns available dataset snapshots, sorted newest-first | +| `fetchDatasetStream(dataset)` | Returns a readable stream for the dataset (e.g. HTTP response) | +| `parseRow(rawRow)` | Converts a raw CSV/JSON row into a `IDiscoverySourceRow`, or `null` to skip | + +Sources are registered in `src/sources/registry.ts` as a simple name → factory map. + +**To add a new source:** create a class implementing `IDiscoverySource`, then add one line to the registry. 
+ +### Current sources + +| Name | Folder | Description | +| ------------------------ | ------------------------------------- | ------------------------------------------------------------------------------------ | +| `ossf-criticality-score` | `src/sources/ossf-criticality-score/` | OSSF Criticality Score snapshots from a public GCS bucket (~750K repos per snapshot) | + +### Workflow + +``` +discoverProjects({ mode: 'incremental' | 'full' }) + │ + ├─ Activity: listDatasets(sourceName) + │ → returns dataset descriptors sorted newest-first + │ + ├─ Selection: incremental → latest only, full → all datasets + │ + └─ For each dataset: + └─ Activity: processDataset(sourceName, dataset) + → HTTP stream → csv-parse → batches of 5000 → bulkUpsertProjectCatalog +``` + +### Timeouts + +| Activity | startToCloseTimeout | retries | +| ------------------ | ------------------- | ------- | +| `listDatasets` | 2 min | 3 | +| `processDataset` | 30 min | 3 | +| Workflow execution | 2 hours | 3 | + +### Schedule + +Runs daily at midnight via Temporal cron (`0 0 * * *`). 
+ +## File structure + +``` +src/ +├── main.ts # Service bootstrap (postgres enabled) +├── activities.ts # Barrel re-export +├── workflows.ts # Barrel re-export +├── activities/ +│ └── activities.ts # listDatasets, processDataset +├── workflows/ +│ └── discoverProjects.ts # Orchestration with mode selection +├── schedules/ +│ └── scheduleProjectsDiscovery.ts # Temporal cron schedule +└── sources/ + ├── types.ts # IDiscoverySource, IDatasetDescriptor + ├── registry.ts # Source factory map + └── ossf-criticality-score/ + ├── source.ts # IDiscoverySource implementation + └── bucketClient.ts # GCS public bucket HTTP client +``` diff --git a/services/apps/automatic_projects_discovery_worker/package.json b/services/apps/automatic_projects_discovery_worker/package.json new file mode 100644 index 0000000000..022c1a6297 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/package.json @@ -0,0 +1,35 @@ +{ + "name": "@crowd/automatic-projects-discovery-worker", + "scripts": { + "start": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx src/main.ts", + "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker tsx --inspect=0.0.0.0:9232 src/main.ts", + "start:debug": "CROWD_TEMPORAL_TASKQUEUE=automatic-projects-discovery SERVICE=automatic-projects-discovery-worker LOG_LEVEL=trace tsx --inspect=0.0.0.0:9232 src/main.ts", + "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", + "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", + "lint": "npx eslint --ext .ts src --max-warnings=0", + "format": "npx prettier --write \"src/**/*.ts\"", + "format-check": "npx prettier --check .", + "tsc-check": "tsc --noEmit" + }, + "dependencies": { + "@crowd/archetype-standard": "workspace:*", + "@crowd/archetype-worker": "workspace:*", + "@crowd/common": "workspace:*", + "@crowd/common_services": "workspace:*", + "@crowd/data-access-layer": "workspace:*", + "@crowd/logging": "workspace:*", + "@crowd/redis": "workspace:*", + "@crowd/temporal": "workspace:*", + "@crowd/types": "workspace:*", + "@temporalio/activity": "~1.11.8", + "@temporalio/client": "~1.11.8", + "@temporalio/workflow": "~1.11.8", + "csv-parse": "^5.5.6", + "tsx": "^4.7.1", + "typescript": "^5.6.3" + }, + "devDependencies": { + "@types/node": "^20.8.2", + "nodemon": "^3.0.1" + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities.ts new file mode 100644 index 0000000000..3662234550 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/activities.ts @@ -0,0 +1 @@ +export * from './activities/activities' diff --git a/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts new file mode 100644 index 0000000000..fbbe6c28a8 --- /dev/null +++ 
b/services/apps/automatic_projects_discovery_worker/src/activities/activities.ts @@ -0,0 +1,124 @@ +import { parse } from 'csv-parse' + +import { bulkUpsertProjectCatalog } from '@crowd/data-access-layer' +import { IDbProjectCatalogCreate } from '@crowd/data-access-layer/src/project-catalog/types' +import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceLogger } from '@crowd/logging' + +import { svc } from '../main' +import { getAvailableSourceNames, getSource } from '../sources/registry' +import { IDatasetDescriptor } from '../sources/types' + +const log = getServiceLogger() + +const BATCH_SIZE = 5000 + +export async function listSources(): Promise { + return getAvailableSourceNames() +} + +export async function listDatasets(sourceName: string): Promise { + const source = getSource(sourceName) + const datasets = await source.listAvailableDatasets() + + log.info({ sourceName, count: datasets.length, newest: datasets[0]?.id }, 'Datasets listed.') + + return datasets +} + +export async function processDataset( + sourceName: string, + dataset: IDatasetDescriptor, +): Promise { + const qx = pgpQx(svc.postgres.writer.connection()) + const startTime = Date.now() + + log.info({ sourceName, datasetId: dataset.id, url: dataset.url }, 'Processing dataset...') + + const source = getSource(sourceName) + const stream = await source.fetchDatasetStream(dataset) + + // For CSV sources: pipe through csv-parse to get Record objects. + // For JSON sources: the stream already emits pre-parsed objects in object mode. + const records = + source.format === 'json' + ? stream + : stream.pipe( + parse({ + columns: true, + skip_empty_lines: true, + trim: true, + }), + ) + + // pipe() does not forward source errors to the destination automatically, so we + // destroy records explicitly — this surfaces the error in the for-await loop and + // lets Temporal mark the activity as failed and retry it. 
+ stream.on('error', (err: Error) => { + log.error({ datasetId: dataset.id, error: err.message }, 'Stream error.') + records.destroy(err) + }) + + if (source.format !== 'json') { + const csvRecords = records as ReturnType + csvRecords.on('error', (err) => { + log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.') + }) + } + + let batch: IDbProjectCatalogCreate[] = [] + let totalProcessed = 0 + let totalSkipped = 0 + let batchNumber = 0 + let totalRows = 0 + + for await (const rawRow of records) { + totalRows++ + + const parsed = source.parseRow(rawRow as Record) + if (!parsed) { + totalSkipped++ + continue + } + + batch.push({ + projectSlug: parsed.projectSlug, + repoName: parsed.repoName, + repoUrl: parsed.repoUrl, + ossfCriticalityScore: parsed.ossfCriticalityScore, + lfCriticalityScore: parsed.lfCriticalityScore, + }) + + if (batch.length >= BATCH_SIZE) { + batchNumber++ + + await bulkUpsertProjectCatalog(qx, batch) + totalProcessed += batch.length + batch = [] + + log.info({ totalProcessed, batchNumber, datasetId: dataset.id }, 'Batch upserted.') + } + } + + // Flush remaining rows that didn't fill a complete batch + if (batch.length > 0) { + batchNumber++ + await bulkUpsertProjectCatalog(qx, batch) + totalProcessed += batch.length + } + + const elapsedSeconds = ((Date.now() - startTime) / 1000).toFixed(1) + + log.info( + { + sourceName, + datasetId: dataset.id, + totalRows, + totalProcessed, + totalSkipped, + totalBatches: batchNumber, + elapsedSeconds, + }, + 'Dataset processing complete.', + ) +} diff --git a/services/apps/automatic_projects_discovery_worker/src/main.ts b/services/apps/automatic_projects_discovery_worker/src/main.ts new file mode 100644 index 0000000000..0345c420f8 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/main.ts @@ -0,0 +1,36 @@ +import { Config } from '@crowd/archetype-standard' +import { Options, ServiceWorker } from '@crowd/archetype-worker' + +import { scheduleProjectsDiscovery } 
from './schedules/scheduleProjectsDiscovery' + +const config: Config = { + envvars: [], + producer: { + enabled: false, + }, + temporal: { + enabled: true, + }, + redis: { + enabled: false, + }, +} + +const options: Options = { + postgres: { + enabled: true, + }, + opensearch: { + enabled: false, + }, +} + +export const svc = new ServiceWorker(config, options) + +setImmediate(async () => { + await svc.init() + + await scheduleProjectsDiscovery() + + await svc.start() +}) diff --git a/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts new file mode 100644 index 0000000000..b173126a78 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/schedules/scheduleProjectsDiscovery.ts @@ -0,0 +1,41 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../main' +import { discoverProjects } from '../workflows' + +export const scheduleProjectsDiscovery = async () => { + svc.log.info(`Scheduling projects discovery`) + + try { + await svc.temporal.schedule.create({ + scheduleId: 'automaticProjectsDiscovery', + spec: { + // Run every day at midnight + cronExpressions: ['0 0 * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 minute', + }, + action: { + type: 'startWorkflow', + workflowType: discoverProjects, + taskQueue: 'automatic-projects-discovery', + args: [{ mode: 'incremental' as const }], + workflowExecutionTimeout: '2 hours', + retry: { + initialInterval: '15 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule already registered in Temporal.') + svc.log.info('Configuration may have changed since. 
Please make sure they are in sync.') + } else { + throw new Error(err) + } + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts new file mode 100644 index 0000000000..804b686f65 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/lf-criticality-score/source.ts @@ -0,0 +1,188 @@ +import http from 'http' +import https from 'https' +import { Readable } from 'stream' + +import { getServiceLogger } from '@crowd/logging' + +import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types' + +const log = getServiceLogger() + +const DEFAULT_API_URL = 'https://lf-criticality-score-api.example.com' +const PAGE_SIZE = 100 + +interface LfApiResponse { + page: number + pageSize: number + total: number + totalPages: number + data: LfApiRow[] +} + +interface LfApiRow { + runDate: string + repoUrl: string + owner: string + repoName: string + contributors: number + organizations: number + sizeSloc: number + lastUpdated: number + age: number + commitFreq: number + score: number +} + +function getApiBaseUrl(): string { + return (process.env.LF_CRITICALITY_SCORE_API_URL ?? DEFAULT_API_URL).replace(/\/$/, '') +} + +async function fetchPage( + baseUrl: string, + startDate: string, + endDate: string, + page: number, +): Promise { + const url = `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}&page=${page}&pageSize=${PAGE_SIZE}` + + return new Promise((resolve, reject) => { + const client = url.startsWith('https://') ? 
https : http + + const req = client.get(url, (res) => { + if (res.statusCode !== 200) { + reject(new Error(`LF Criticality Score API returned status ${res.statusCode} for ${url}`)) + res.resume() + return + } + + const chunks: Uint8Array[] = [] + res.on('data', (chunk: Uint8Array) => chunks.push(chunk)) + res.on('end', () => { + try { + resolve(JSON.parse(Buffer.concat(chunks).toString('utf8')) as LfApiResponse) + } catch (err) { + reject(new Error(`Failed to parse LF Criticality Score API response: ${err}`)) + } + }) + res.on('error', reject) + }) + + req.on('error', reject) + req.end() + }) +} + +/** + * Generates the first day and last day of a given month. + * monthOffset = 0 → current month, -1 → previous month, etc. + */ +function monthRange(monthOffset: number): { startDate: string; endDate: string } { + const now = new Date() + const year = now.getUTCFullYear() + const month = now.getUTCMonth() + monthOffset // can be negative; Date handles rollover + + const first = new Date(Date.UTC(year, month, 1)) + const last = new Date(Date.UTC(year, month + 1, 0)) // last day of month + + const pad = (n: number) => String(n).padStart(2, '0') + const fmt = (d: Date) => + `${d.getUTCFullYear()}-${pad(d.getUTCMonth() + 1)}-${pad(d.getUTCDate())}` + + return { startDate: fmt(first), endDate: fmt(last) } +} + +export class LfCriticalityScoreSource implements IDiscoverySource { + public readonly name = 'lf-criticality-score' + public readonly format = 'json' as const + + async listAvailableDatasets(): Promise { + const baseUrl = getApiBaseUrl() + + // Return one dataset per month for the last 12 months (newest first) + const datasets: IDatasetDescriptor[] = [] + + for (let offset = 0; offset >= -11; offset--) { + const { startDate, endDate } = monthRange(offset) + const id = startDate.slice(0, 7) // e.g. 
"2026-02" + + datasets.push({ + id, + date: startDate, + url: `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}`, + }) + } + + return datasets + } + + /** + * Returns an object-mode Readable that fetches all pages from the API + * and pushes each row as a plain object. Activities.ts iterates this + * directly (no csv-parse) because format === 'json'. + */ + async fetchDatasetStream(dataset: IDatasetDescriptor): Promise { + const baseUrl = getApiBaseUrl() + + // Extract startDate and endDate from the stored URL + const parsed = new URL(dataset.url) + const startDate = parsed.searchParams.get('startDate') ?? '' + const endDate = parsed.searchParams.get('endDate') ?? '' + + async function* pages() { + let page = 1 + let totalPages = 1 + + do { + const response = await fetchPage(baseUrl, startDate, endDate, page) + totalPages = response.totalPages + + for (const row of response.data) { + yield row + } + + log.debug( + { datasetId: dataset.id, page, totalPages, rowsInPage: response.data.length }, + 'LF Criticality Score page fetched.', + ) + + page++ + } while (page <= totalPages) + } + + return Readable.from(pages(), { objectMode: true }) + } + + parseRow(rawRow: Record): IDiscoverySourceRow | null { + const repoUrl = rawRow['repoUrl'] as string | undefined + if (!repoUrl) { + return null + } + + let repoName = '' + let projectSlug = '' + + try { + const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '') + projectSlug = urlPath + repoName = urlPath.split('/').pop() || '' + } catch { + const parts = repoUrl.replace(/\/$/, '').split('/') + projectSlug = parts.slice(-2).join('/') + repoName = parts.pop() || '' + } + + if (!projectSlug || !repoName) { + return null + } + + const score = rawRow['score'] + const lfCriticalityScore = typeof score === 'number' ? score : parseFloat(score as string) + + return { + projectSlug, + repoName, + repoUrl, + lfCriticalityScore: Number.isNaN(lfCriticalityScore) ? 
undefined : lfCriticalityScore, + } + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts new file mode 100644 index 0000000000..71b2066ae7 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/bucketClient.ts @@ -0,0 +1,96 @@ +import https from 'https' + +const BUCKET_URL = 'https://commondatastorage.googleapis.com/ossf-criticality-score' + +function httpsGet(url: string): Promise { + return new Promise((resolve, reject) => { + https + .get(url, (res) => { + if ( + res.statusCode && + res.statusCode >= 300 && + res.statusCode < 400 && + res.headers.location + ) { + httpsGet(res.headers.location).then(resolve, reject) + return + } + + if (res.statusCode && (res.statusCode < 200 || res.statusCode >= 300)) { + reject(new Error(`HTTP ${res.statusCode} for ${url}`)) + return + } + + const chunks: Uint8Array[] = [] + res.on('data', (chunk: Uint8Array) => chunks.push(chunk)) + res.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8'))) + res.on('error', reject) + }) + .on('error', reject) + }) +} + +function extractPrefixes(xml: string): string[] { + const prefixes: string[] = [] + const regex = /([^<]+)<\/Prefix>/g + let match: RegExpExecArray | null + + while ((match = regex.exec(xml)) !== null) { + prefixes.push(match[1]) + } + + return prefixes +} + +/** + * List all date prefixes in the OSSF Criticality Score bucket. + * Returns prefixes like ['2024.07.01/', '2024.07.08/', ...] + */ +export async function listDatePrefixes(): Promise { + const xml = await httpsGet(`${BUCKET_URL}?delimiter=/`) + return extractPrefixes(xml).filter((p) => /^\d{4}\.\d{2}\.\d{2}\/$/.test(p)) +} + +/** + * List time sub-prefixes for a given date prefix. + * E.g., for '2024.07.01/' returns ['2024.07.01/060102/', ...] 
+ */ +export async function listTimePrefixes(datePrefix: string): Promise { + const xml = await httpsGet(`${BUCKET_URL}?prefix=${encodeURIComponent(datePrefix)}&delimiter=/`) + return extractPrefixes(xml).filter((p) => p !== datePrefix) +} + +/** + * Build the full URL for the all.csv file within a given dataset prefix. + */ +export function buildDatasetUrl(prefix: string): string { + return `${BUCKET_URL}/${prefix}all.csv` +} + +/** + * Get an HTTPS readable stream for a given URL. + */ +export function getHttpsStream(url: string): Promise { + return new Promise((resolve, reject) => { + https + .get(url, (res) => { + if ( + res.statusCode && + res.statusCode >= 300 && + res.statusCode < 400 && + res.headers.location + ) { + getHttpsStream(res.headers.location).then(resolve, reject) + return + } + + if (res.statusCode && (res.statusCode < 200 || res.statusCode >= 300)) { + reject(new Error(`HTTP ${res.statusCode} for ${url}`)) + return + } + + resolve(res) + }) + .on('error', reject) + }) +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts new file mode 100644 index 0000000000..8ee20fb602 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts @@ -0,0 +1,75 @@ +import { Readable } from 'stream' + +import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types' + +import { buildDatasetUrl, getHttpsStream, listDatePrefixes, listTimePrefixes } from './bucketClient' + +export class OssfCriticalityScoreSource implements IDiscoverySource { + public readonly name = 'ossf-criticality-score' + + async listAvailableDatasets(): Promise { + const datePrefixes = await listDatePrefixes() + + const datasets: IDatasetDescriptor[] = [] + + for (const datePrefix of datePrefixes) { + const timePrefixes = await listTimePrefixes(datePrefix) + + for 
(const timePrefix of timePrefixes) { + const date = datePrefix.replace(/\/$/, '') + const url = buildDatasetUrl(timePrefix) + + datasets.push({ + id: timePrefix.replace(/\/$/, ''), + date, + url, + }) + } + } + + // Sort newest-first by date + datasets.sort((a, b) => b.date.localeCompare(a.date)) + + return datasets + } + + async fetchDatasetStream(dataset: IDatasetDescriptor): Promise { + const stream = await getHttpsStream(dataset.url) + return stream as Readable + } + + // CSV columns use dot notation (e.g. "repo.url", "default_score") + parseRow(rawRow: Record): IDiscoverySourceRow | null { + const repoUrl = rawRow['repo.url'] as string | undefined + if (!repoUrl) { + return null + } + + let repoName = '' + let projectSlug = '' + + try { + const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '') + projectSlug = urlPath + repoName = urlPath.split('/').pop() || '' + } catch { + const parts = repoUrl.replace(/\/$/, '').split('/') + projectSlug = parts.slice(-2).join('/') + repoName = parts.pop() || '' + } + + if (!projectSlug || !repoName) { + return null + } + + const scoreRaw = rawRow['default_score'] + const ossfCriticalityScore = scoreRaw ? parseFloat(scoreRaw as string) : undefined + + return { + projectSlug, + repoName, + repoUrl, + ossfCriticalityScore: Number.isNaN(ossfCriticalityScore) ? undefined : ossfCriticalityScore, + } + } +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts new file mode 100644 index 0000000000..1c7af148a3 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/registry.ts @@ -0,0 +1,21 @@ +import { LfCriticalityScoreSource } from './lf-criticality-score/source' +import { OssfCriticalityScoreSource } from './ossf-criticality-score/source' +import { IDiscoverySource } from './types' + +// To add a new source: instantiate it here. 
+const sources: IDiscoverySource[] = [ + new OssfCriticalityScoreSource(), + new LfCriticalityScoreSource(), +] + +export function getSource(name: string): IDiscoverySource { + const source = sources.find((s) => s.name === name) + if (!source) { + throw new Error(`Unknown source: ${name}. Available: ${sources.map((s) => s.name).join(', ')}`) + } + return source +} + +export function getAvailableSourceNames(): string[] { + return sources.map((s) => s.name) +} diff --git a/services/apps/automatic_projects_discovery_worker/src/sources/types.ts b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts new file mode 100644 index 0000000000..9b386b5da7 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/sources/types.ts @@ -0,0 +1,27 @@ +import { Readable } from 'stream' + +export interface IDatasetDescriptor { + id: string + date: string + url: string +} + +export interface IDiscoverySource { + name: string + /** + * 'csv' (default): fetchDatasetStream returns a raw text stream, piped through csv-parse. + * 'json': fetchDatasetStream returns an object-mode Readable that emits pre-parsed records. 
+ */ + format?: 'csv' | 'json' + listAvailableDatasets(): Promise + fetchDatasetStream(dataset: IDatasetDescriptor): Promise + parseRow(rawRow: Record): IDiscoverySourceRow | null +} + +export interface IDiscoverySourceRow { + projectSlug: string + repoName: string + repoUrl: string + ossfCriticalityScore?: number + lfCriticalityScore?: number +} diff --git a/services/apps/automatic_projects_discovery_worker/src/workflows.ts b/services/apps/automatic_projects_discovery_worker/src/workflows.ts new file mode 100644 index 0000000000..07b00cee6f --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/workflows.ts @@ -0,0 +1,3 @@ +import { discoverProjects } from './workflows/discoverProjects' + +export { discoverProjects } diff --git a/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts new file mode 100644 index 0000000000..00856493d4 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/src/workflows/discoverProjects.ts @@ -0,0 +1,48 @@ +import { log, proxyActivities } from '@temporalio/workflow' + +import type * as activities from '../activities' + +const listActivities = proxyActivities({ + startToCloseTimeout: '2 minutes', + retry: { maximumAttempts: 3 }, +}) + +// processDataset is long-running (10-20 min for ~119MB / ~750K rows). +const processActivities = proxyActivities({ + startToCloseTimeout: '30 minutes', + retry: { maximumAttempts: 3 }, +}) + +export async function discoverProjects( + input: { mode: 'incremental' | 'full' } = { mode: 'incremental' }, +): Promise { + const { mode } = input + + const sourceNames = await listActivities.listSources() + + for (const sourceName of sourceNames) { + const allDatasets = await listActivities.listDatasets(sourceName) + + if (allDatasets.length === 0) { + log.warn(`No datasets found for source "${sourceName}". 
Skipping.`) + continue + } + + // allDatasets is sorted newest-first. + // Incremental: process only the latest snapshot. + // Full: process oldest-first so the newest data wins the final upsert. + const datasets = mode === 'incremental' ? [allDatasets[0]] : [...allDatasets].reverse() + + log.info( + `source=${sourceName} mode=${mode}, ${datasets.length}/${allDatasets.length} datasets to process.`, + ) + + for (let i = 0; i < datasets.length; i++) { + const dataset = datasets[i] + log.info(`[${sourceName}] Processing dataset ${i + 1}/${datasets.length}: ${dataset.id}`) + await processActivities.processDataset(sourceName, dataset) + } + + log.info(`[${sourceName}] Done. Processed ${datasets.length} dataset(s).`) + } +} diff --git a/services/apps/automatic_projects_discovery_worker/tsconfig.json b/services/apps/automatic_projects_discovery_worker/tsconfig.json new file mode 100644 index 0000000000..bf7f183850 --- /dev/null +++ b/services/apps/automatic_projects_discovery_worker/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../base.tsconfig.json", + "include": ["src/**/*"] +} diff --git a/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts new file mode 100644 index 0000000000..2e078beb0d --- /dev/null +++ b/services/libs/data-access-layer/src/evaluated-projects/evaluatedProjects.ts @@ -0,0 +1,397 @@ +import { QueryExecutor } from '../queryExecutor' +import { prepareSelectColumns } from '../utils' + +import { + EvaluationStatus, + IDbEvaluatedProject, + IDbEvaluatedProjectCreate, + IDbEvaluatedProjectUpdate, +} from './types' + +const EVALUATED_PROJECT_COLUMNS = [ + 'id', + 'projectCatalogId', + 'evaluationStatus', + 'evaluationScore', + 'evaluation', + 'evaluationReason', + 'evaluatedAt', + 'starsCount', + 'forksCount', + 'commitsCount', + 'pullRequestsCount', + 'issuesCount', + 'onboarded', + 'onboardedAt', + 'createdAt', + 'updatedAt', +] + +export async 
/**
 * Fetches one evaluated project by primary key; null when absent.
 */
export async function findEvaluatedProjectById(
  qx: QueryExecutor,
  id: string,
): Promise<IDbEvaluatedProject | null> {
  return qx.selectOneOrNone(
    `
    SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)}
    FROM "evaluatedProjects"
    WHERE id = $(id)
    `,
    { id },
  )
}

/**
 * Fetches the evaluation linked to a catalog entry; null when absent.
 * At most one row exists per catalog id (unique index on "projectCatalogId").
 */
export async function findEvaluatedProjectByProjectCatalogId(
  qx: QueryExecutor,
  projectCatalogId: string,
): Promise<IDbEvaluatedProject | null> {
  return qx.selectOneOrNone(
    `
    SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)}
    FROM "evaluatedProjects"
    WHERE "projectCatalogId" = $(projectCatalogId)
    `,
    { projectCatalogId },
  )
}

/**
 * Lists evaluations with a given status, oldest first, with optional paging.
 */
export async function findEvaluatedProjectsByStatus(
  qx: QueryExecutor,
  evaluationStatus: EvaluationStatus,
  options: { limit?: number; offset?: number } = {},
): Promise<IDbEvaluatedProject[]> {
  const { limit, offset } = options

  return qx.select(
    `
    SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)}
    FROM "evaluatedProjects"
    WHERE "evaluationStatus" = $(evaluationStatus)
    ORDER BY "createdAt" ASC
    ${limit !== undefined ? 'LIMIT $(limit)' : ''}
    ${offset !== undefined ? 'OFFSET $(offset)' : ''}
    `,
    { evaluationStatus, limit, offset },
  )
}

/**
 * Lists all evaluations, newest first, with optional paging.
 */
export async function findAllEvaluatedProjects(
  qx: QueryExecutor,
  options: { limit?: number; offset?: number } = {},
): Promise<IDbEvaluatedProject[]> {
  const { limit, offset } = options

  return qx.select(
    `
    SELECT ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)}
    FROM "evaluatedProjects"
    ORDER BY "createdAt" DESC
    ${limit !== undefined ? 'LIMIT $(limit)' : ''}
    ${offset !== undefined ? 'OFFSET $(offset)' : ''}
    `,
    { limit, offset },
  )
}

/**
 * Counts evaluations, optionally restricted to a single status.
 */
export async function countEvaluatedProjects(
  qx: QueryExecutor,
  evaluationStatus?: EvaluationStatus,
): Promise<number> {
  const statusFilter = evaluationStatus ? 'WHERE "evaluationStatus" = $(evaluationStatus)' : ''

  const result = await qx.selectOne(
    `
    SELECT COUNT(*) AS count
    FROM "evaluatedProjects"
    ${statusFilter}
    `,
    { evaluationStatus },
  )
  // COUNT(*) arrives as a string (Postgres bigint); convert explicitly.
  return parseInt(result.count, 10)
}

/**
 * Inserts one evaluation row and returns it. Status defaults to 'pending';
 * onboarded/onboardedAt/evaluatedAt are managed by dedicated helpers and are
 * never written on insert.
 */
export async function insertEvaluatedProject(
  qx: QueryExecutor,
  data: IDbEvaluatedProjectCreate,
): Promise<IDbEvaluatedProject> {
  return qx.selectOne(
    `
    INSERT INTO "evaluatedProjects" (
      "projectCatalogId",
      "evaluationStatus",
      "evaluationScore",
      evaluation,
      "evaluationReason",
      "starsCount",
      "forksCount",
      "commitsCount",
      "pullRequestsCount",
      "issuesCount",
      "createdAt",
      "updatedAt"
    )
    VALUES (
      $(projectCatalogId),
      $(evaluationStatus),
      $(evaluationScore),
      $(evaluation),
      $(evaluationReason),
      $(starsCount),
      $(forksCount),
      $(commitsCount),
      $(pullRequestsCount),
      $(issuesCount),
      NOW(),
      NOW()
    )
    RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)}
    `,
    {
      projectCatalogId: data.projectCatalogId,
      evaluationStatus: data.evaluationStatus ?? 'pending',
      evaluationScore: data.evaluationScore ?? null,
      // jsonb column: serialize the object explicitly for the driver.
      evaluation: data.evaluation ? JSON.stringify(data.evaluation) : null,
      evaluationReason: data.evaluationReason ?? null,
      starsCount: data.starsCount ?? null,
      forksCount: data.forksCount ?? null,
      commitsCount: data.commitsCount ?? null,
      pullRequestsCount: data.pullRequestsCount ?? null,
      issuesCount: data.issuesCount ?? null,
    },
  )
}

/**
 * Bulk-inserts evaluation rows in a single statement via
 * jsonb_to_recordset. No-op for an empty list.
 */
export async function bulkInsertEvaluatedProjects(
  qx: QueryExecutor,
  items: IDbEvaluatedProjectCreate[],
): Promise<void> {
  if (items.length === 0) {
    return
  }

  const values = items.map((item) => ({
    projectCatalogId: item.projectCatalogId,
    evaluationStatus: item.evaluationStatus ?? 'pending',
    evaluationScore: item.evaluationScore ?? null,
    evaluation: item.evaluation ?? null,
    evaluationReason: item.evaluationReason ?? null,
    starsCount: item.starsCount ?? null,
    forksCount: item.forksCount ?? null,
    commitsCount: item.commitsCount ?? null,
    pullRequestsCount: item.pullRequestsCount ?? null,
    issuesCount: item.issuesCount ?? null,
  }))

  // The explicit ::casts map jsonb scalars back onto the column types.
  await qx.result(
    `
    INSERT INTO "evaluatedProjects" (
      "projectCatalogId",
      "evaluationStatus",
      "evaluationScore",
      evaluation,
      "evaluationReason",
      "starsCount",
      "forksCount",
      "commitsCount",
      "pullRequestsCount",
      "issuesCount",
      "createdAt",
      "updatedAt"
    )
    SELECT
      v."projectCatalogId"::uuid,
      v."evaluationStatus",
      v."evaluationScore"::double precision,
      v.evaluation::jsonb,
      v."evaluationReason",
      v."starsCount"::integer,
      v."forksCount"::integer,
      v."commitsCount"::integer,
      v."pullRequestsCount"::integer,
      v."issuesCount"::integer,
      NOW(),
      NOW()
    FROM jsonb_to_recordset($(values)::jsonb) AS v(
      "projectCatalogId" text,
      "evaluationStatus" text,
      "evaluationScore" double precision,
      evaluation jsonb,
      "evaluationReason" text,
      "starsCount" integer,
      "forksCount" integer,
      "commitsCount" integer,
      "pullRequestsCount" integer,
      "issuesCount" integer
    )
    `,
    { values: JSON.stringify(values) },
  )
}
/**
 * Partially updates an evaluation row: only fields explicitly present in
 * `data` (including explicit nulls) are written. Returns the updated row,
 * null when the id does not exist, or the unchanged row when `data` is empty.
 */
export async function updateEvaluatedProject(
  qx: QueryExecutor,
  id: string,
  data: IDbEvaluatedProjectUpdate,
): Promise<IDbEvaluatedProject | null> {
  // Build SET clauses dynamically so that absent fields are left untouched.
  const setClauses: string[] = []
  const params: Record<string, unknown> = { id }

  if (data.evaluationStatus !== undefined) {
    setClauses.push('"evaluationStatus" = $(evaluationStatus)')
    params.evaluationStatus = data.evaluationStatus
  }
  if (data.evaluationScore !== undefined) {
    setClauses.push('"evaluationScore" = $(evaluationScore)')
    params.evaluationScore = data.evaluationScore
  }
  if (data.evaluation !== undefined) {
    setClauses.push('evaluation = $(evaluation)')
    // jsonb column: serialize non-null objects for the driver.
    params.evaluation = data.evaluation ? JSON.stringify(data.evaluation) : null
  }
  if (data.evaluationReason !== undefined) {
    setClauses.push('"evaluationReason" = $(evaluationReason)')
    params.evaluationReason = data.evaluationReason
  }
  if (data.evaluatedAt !== undefined) {
    setClauses.push('"evaluatedAt" = $(evaluatedAt)')
    params.evaluatedAt = data.evaluatedAt
  }
  if (data.starsCount !== undefined) {
    setClauses.push('"starsCount" = $(starsCount)')
    params.starsCount = data.starsCount
  }
  if (data.forksCount !== undefined) {
    setClauses.push('"forksCount" = $(forksCount)')
    params.forksCount = data.forksCount
  }
  if (data.commitsCount !== undefined) {
    setClauses.push('"commitsCount" = $(commitsCount)')
    params.commitsCount = data.commitsCount
  }
  if (data.pullRequestsCount !== undefined) {
    setClauses.push('"pullRequestsCount" = $(pullRequestsCount)')
    params.pullRequestsCount = data.pullRequestsCount
  }
  if (data.issuesCount !== undefined) {
    setClauses.push('"issuesCount" = $(issuesCount)')
    params.issuesCount = data.issuesCount
  }
  if (data.onboarded !== undefined) {
    setClauses.push('onboarded = $(onboarded)')
    params.onboarded = data.onboarded
  }
  if (data.onboardedAt !== undefined) {
    setClauses.push('"onboardedAt" = $(onboardedAt)')
    params.onboardedAt = data.onboardedAt
  }

  // Nothing to change: avoid an UPDATE that would only bump "updatedAt".
  if (setClauses.length === 0) {
    return findEvaluatedProjectById(qx, id)
  }

  return qx.selectOneOrNone(
    `
    UPDATE "evaluatedProjects"
    SET
      ${setClauses.join(',\n      ')},
      "updatedAt" = NOW()
    WHERE id = $(id)
    RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)}
    `,
    params,
  )
}

/**
 * Marks an evaluation as completed: sets status to 'evaluated', stores the
 * score/result/reason, and stamps "evaluatedAt". Returns the updated row or
 * null when the id does not exist.
 */
export async function markEvaluatedProjectAsEvaluated(
  qx: QueryExecutor,
  id: string,
  data: {
    evaluationScore: number
    evaluation: Record<string, unknown>
    evaluationReason?: string
  },
): Promise<IDbEvaluatedProject | null> {
  return qx.selectOneOrNone(
    `
    UPDATE "evaluatedProjects"
    SET
      "evaluationStatus" = 'evaluated',
      "evaluationScore" = $(evaluationScore),
      evaluation = $(evaluation),
      "evaluationReason" = $(evaluationReason),
      "evaluatedAt" = NOW(),
      "updatedAt" = NOW()
    WHERE id = $(id)
    RETURNING ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS)}
    `,
    {
      id,
      evaluationScore: data.evaluationScore,
      // jsonb column: serialize the object explicitly for the driver.
      evaluation: JSON.stringify(data.evaluation),
      evaluationReason: data.evaluationReason ?? null,
    },
  )
}
/**
 * Flags an evaluation as onboarded and stamps "onboardedAt". No-op (no error)
 * when the id does not exist.
 */
export async function markEvaluatedProjectAsOnboarded(
  qx: QueryExecutor,
  id: string,
): Promise<void> {
  await qx.selectNone(
    `
    UPDATE "evaluatedProjects"
    SET
      onboarded = true,
      "onboardedAt" = NOW(),
      "updatedAt" = NOW()
    WHERE id = $(id)
    `,
    { id },
  )
}

/**
 * Deletes one evaluation row by primary key.
 */
export async function deleteEvaluatedProject(qx: QueryExecutor, id: string): Promise<void> {
  return qx.result(
    `
    DELETE FROM "evaluatedProjects"
    WHERE id = $(id)
    `,
    { id },
  )
}

/**
 * Deletes the evaluation linked to a catalog entry (at most one row, given
 * the unique index on "projectCatalogId").
 */
export async function deleteEvaluatedProjectByProjectCatalogId(
  qx: QueryExecutor,
  projectCatalogId: string,
): Promise<void> {
  return qx.result(
    `
    DELETE FROM "evaluatedProjects"
    WHERE "projectCatalogId" = $(projectCatalogId)
    `,
    { projectCatalogId },
  )
}

/**
 * Lists pending evaluations joined with their catalog entry, oldest first,
 * so workers can pick up the repo slug/name/url in one query.
 */
export async function findPendingEvaluatedProjectsWithCatalog(
  qx: QueryExecutor,
  options: { limit?: number } = {},
): Promise<(IDbEvaluatedProject & { projectSlug: string; repoName: string; repoUrl: string })[]> {
  const { limit } = options

  return qx.select(
    `
    SELECT
      ${prepareSelectColumns(EVALUATED_PROJECT_COLUMNS, 'ep')},
      pc."projectSlug",
      pc."repoName",
      pc."repoUrl"
    FROM "evaluatedProjects" ep
    JOIN "projectCatalog" pc ON pc.id = ep."projectCatalogId"
    WHERE ep."evaluationStatus" = 'pending'
    ORDER BY ep."createdAt" ASC
    ${limit !== undefined ? 'LIMIT $(limit)' : ''}
    `,
    { limit },
  )
}
+export type IDbEvaluatedProjectCreate = { + projectCatalogId: string + evaluationStatus?: EvaluationStatus + evaluationScore?: number + evaluation?: Record + evaluationReason?: string + starsCount?: number + forksCount?: number + commitsCount?: number + pullRequestsCount?: number + issuesCount?: number +} + +export type IDbEvaluatedProjectUpdate = Partial<{ + evaluationStatus: EvaluationStatus + evaluationScore: number | null + evaluation: Record | null + evaluationReason: string | null + evaluatedAt: string | null + starsCount: number | null + forksCount: number | null + commitsCount: number | null + pullRequestsCount: number | null + issuesCount: number | null + onboarded: boolean + onboardedAt: string | null +}> diff --git a/services/libs/data-access-layer/src/index.ts b/services/libs/data-access-layer/src/index.ts index 639f0547b8..5ef4749d79 100644 --- a/services/libs/data-access-layer/src/index.ts +++ b/services/libs/data-access-layer/src/index.ts @@ -13,3 +13,5 @@ export * from './systemSettings' export * from './integrations' export * from './auditLogs' export * from './maintainers' +export * from './project-catalog' +export * from './evaluated-projects' diff --git a/services/libs/data-access-layer/src/project-catalog/index.ts b/services/libs/data-access-layer/src/project-catalog/index.ts new file mode 100644 index 0000000000..af7ef7faa1 --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/index.ts @@ -0,0 +1,2 @@ +export * from './types' +export * from './projectCatalog' diff --git a/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts new file mode 100644 index 0000000000..b951e11317 --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/projectCatalog.ts @@ -0,0 +1,336 @@ +import { QueryExecutor } from '../queryExecutor' +import { prepareSelectColumns } from '../utils' + +import { IDbProjectCatalog, IDbProjectCatalogCreate, 
// Column list shared by all SELECT/RETURNING clauses in this module.
const PROJECT_CATALOG_COLUMNS = [
  'id',
  'projectSlug',
  'repoName',
  'repoUrl',
  'ossfCriticalityScore',
  'lfCriticalityScore',
  'syncedAt',
  'createdAt',
  'updatedAt',
]

/**
 * Fetches one catalog entry by primary key; null when absent.
 */
export async function findProjectCatalogById(
  qx: QueryExecutor,
  id: string,
): Promise<IDbProjectCatalog | null> {
  return qx.selectOneOrNone(
    `
    SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)}
    FROM "projectCatalog"
    WHERE id = $(id)
    `,
    { id },
  )
}

/**
 * Fetches a catalog entry by its repo URL (unique key); null when absent.
 */
export async function findProjectCatalogByRepoUrl(
  qx: QueryExecutor,
  repoUrl: string,
): Promise<IDbProjectCatalog | null> {
  return qx.selectOneOrNone(
    `
    SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)}
    FROM "projectCatalog"
    WHERE "repoUrl" = $(repoUrl)
    `,
    { repoUrl },
  )
}

/**
 * Lists catalog entries with a given slug, newest first. A slug is not
 * unique (only "repoUrl" is), so this can return multiple rows.
 */
export async function findProjectCatalogBySlug(
  qx: QueryExecutor,
  projectSlug: string,
): Promise<IDbProjectCatalog[]> {
  return qx.select(
    `
    SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)}
    FROM "projectCatalog"
    WHERE "projectSlug" = $(projectSlug)
    ORDER BY "createdAt" DESC
    `,
    { projectSlug },
  )
}

/**
 * Lists all catalog entries, newest first, with optional paging.
 */
export async function findAllProjectCatalog(
  qx: QueryExecutor,
  options: { limit?: number; offset?: number } = {},
): Promise<IDbProjectCatalog[]> {
  const { limit, offset } = options

  return qx.select(
    `
    SELECT ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)}
    FROM "projectCatalog"
    ORDER BY "createdAt" DESC
    ${limit !== undefined ? 'LIMIT $(limit)' : ''}
    ${offset !== undefined ? 'OFFSET $(offset)' : ''}
    `,
    { limit, offset },
  )
}

/**
 * Counts all catalog entries.
 */
export async function countProjectCatalog(qx: QueryExecutor): Promise<number> {
  const result = await qx.selectOne(
    `
    SELECT COUNT(*) AS count
    FROM "projectCatalog"
    `,
  )
  // COUNT(*) arrives as a string (Postgres bigint); convert explicitly.
  return parseInt(result.count, 10)
}

/**
 * Inserts one catalog entry and returns it. Fails on a duplicate "repoUrl"
 * (unique index); use upsertProjectCatalog for merge semantics.
 */
export async function insertProjectCatalog(
  qx: QueryExecutor,
  data: IDbProjectCatalogCreate,
): Promise<IDbProjectCatalog> {
  return qx.selectOne(
    `
    INSERT INTO "projectCatalog" (
      "projectSlug",
      "repoName",
      "repoUrl",
      "ossfCriticalityScore",
      "lfCriticalityScore",
      "createdAt",
      "updatedAt"
    )
    VALUES (
      $(projectSlug),
      $(repoName),
      $(repoUrl),
      $(ossfCriticalityScore),
      $(lfCriticalityScore),
      NOW(),
      NOW()
    )
    RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)}
    `,
    {
      projectSlug: data.projectSlug,
      repoName: data.repoName,
      repoUrl: data.repoUrl,
      ossfCriticalityScore: data.ossfCriticalityScore ?? null,
      lfCriticalityScore: data.lfCriticalityScore ?? null,
    },
  )
}

/**
 * Bulk-inserts catalog entries in a single statement via jsonb_to_recordset.
 * No-op for an empty list. Like insertProjectCatalog, this fails on duplicate
 * "repoUrl" values.
 */
export async function bulkInsertProjectCatalog(
  qx: QueryExecutor,
  items: IDbProjectCatalogCreate[],
): Promise<void> {
  if (items.length === 0) {
    return
  }

  const values = items.map((item) => ({
    projectSlug: item.projectSlug,
    repoName: item.repoName,
    repoUrl: item.repoUrl,
    ossfCriticalityScore: item.ossfCriticalityScore ?? null,
    lfCriticalityScore: item.lfCriticalityScore ?? null,
  }))

  // The explicit ::casts map jsonb scalars back onto the column types.
  await qx.result(
    `
    INSERT INTO "projectCatalog" (
      "projectSlug",
      "repoName",
      "repoUrl",
      "ossfCriticalityScore",
      "lfCriticalityScore",
      "createdAt",
      "updatedAt"
    )
    SELECT
      v."projectSlug",
      v."repoName",
      v."repoUrl",
      v."ossfCriticalityScore"::double precision,
      v."lfCriticalityScore"::double precision,
      NOW(),
      NOW()
    FROM jsonb_to_recordset($(values)::jsonb) AS v(
      "projectSlug" text,
      "repoName" text,
      "repoUrl" text,
      "ossfCriticalityScore" double precision,
      "lfCriticalityScore" double precision
    )
    `,
    { values: JSON.stringify(values) },
  )
}
null, + })) + + await qx.result( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "ossfCriticalityScore", + "lfCriticalityScore", + "createdAt", + "updatedAt" + ) + SELECT + v."projectSlug", + v."repoName", + v."repoUrl", + v."ossfCriticalityScore"::double precision, + v."lfCriticalityScore"::double precision, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectSlug" text, + "repoName" text, + "repoUrl" text, + "ossfCriticalityScore" double precision, + "lfCriticalityScore" double precision + ) + `, + { values: JSON.stringify(values) }, + ) +} + +export async function upsertProjectCatalog( + qx: QueryExecutor, + data: IDbProjectCatalogCreate, +): Promise { + return qx.selectOne( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "ossfCriticalityScore", + "lfCriticalityScore", + "createdAt", + "updatedAt" + ) + VALUES ( + $(projectSlug), + $(repoName), + $(repoUrl), + $(ossfCriticalityScore), + $(lfCriticalityScore), + NOW(), + NOW() + ) + ON CONFLICT ("repoUrl") DO UPDATE SET + "projectSlug" = EXCLUDED."projectSlug", + "repoName" = EXCLUDED."repoName", + "ossfCriticalityScore" = COALESCE(EXCLUDED."ossfCriticalityScore", "projectCatalog"."ossfCriticalityScore"), + "lfCriticalityScore" = COALESCE(EXCLUDED."lfCriticalityScore", "projectCatalog"."lfCriticalityScore"), + "updatedAt" = NOW() + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + { + projectSlug: data.projectSlug, + repoName: data.repoName, + repoUrl: data.repoUrl, + ossfCriticalityScore: data.ossfCriticalityScore ?? null, + lfCriticalityScore: data.lfCriticalityScore ?? 
null, + }, + ) +} + +export async function bulkUpsertProjectCatalog( + qx: QueryExecutor, + items: IDbProjectCatalogCreate[], +): Promise { + if (items.length === 0) { + return + } + + const values = items.map((item) => ({ + projectSlug: item.projectSlug, + repoName: item.repoName, + repoUrl: item.repoUrl, + ossfCriticalityScore: item.ossfCriticalityScore ?? null, + lfCriticalityScore: item.lfCriticalityScore ?? null, + })) + + await qx.result( + ` + INSERT INTO "projectCatalog" ( + "projectSlug", + "repoName", + "repoUrl", + "ossfCriticalityScore", + "lfCriticalityScore", + "createdAt", + "updatedAt" + ) + SELECT + v."projectSlug", + v."repoName", + v."repoUrl", + v."ossfCriticalityScore"::double precision, + v."lfCriticalityScore"::double precision, + NOW(), + NOW() + FROM jsonb_to_recordset($(values)::jsonb) AS v( + "projectSlug" text, + "repoName" text, + "repoUrl" text, + "ossfCriticalityScore" double precision, + "lfCriticalityScore" double precision + ) + ON CONFLICT ("repoUrl") DO UPDATE SET + "projectSlug" = EXCLUDED."projectSlug", + "repoName" = EXCLUDED."repoName", + "ossfCriticalityScore" = COALESCE(EXCLUDED."ossfCriticalityScore", "projectCatalog"."ossfCriticalityScore"), + "lfCriticalityScore" = COALESCE(EXCLUDED."lfCriticalityScore", "projectCatalog"."lfCriticalityScore"), + "updatedAt" = NOW() + `, + { values: JSON.stringify(values) }, + ) +} + +export async function updateProjectCatalog( + qx: QueryExecutor, + id: string, + data: IDbProjectCatalogUpdate, +): Promise { + const setClauses: string[] = [] + const params: Record = { id } + + if (data.projectSlug !== undefined) { + setClauses.push('"projectSlug" = $(projectSlug)') + params.projectSlug = data.projectSlug + } + if (data.repoName !== undefined) { + setClauses.push('"repoName" = $(repoName)') + params.repoName = data.repoName + } + if (data.repoUrl !== undefined) { + setClauses.push('"repoUrl" = $(repoUrl)') + params.repoUrl = data.repoUrl + } + if (data.ossfCriticalityScore !== undefined) { 
+ setClauses.push('"ossfCriticalityScore" = $(ossfCriticalityScore)') + params.ossfCriticalityScore = data.ossfCriticalityScore + } + if (data.lfCriticalityScore !== undefined) { + setClauses.push('"lfCriticalityScore" = $(lfCriticalityScore)') + params.lfCriticalityScore = data.lfCriticalityScore + } + if (data.syncedAt !== undefined) { + setClauses.push('"syncedAt" = $(syncedAt)') + params.syncedAt = data.syncedAt + } + + if (setClauses.length === 0) { + return findProjectCatalogById(qx, id) + } + + return qx.selectOneOrNone( + ` + UPDATE "projectCatalog" + SET + ${setClauses.join(',\n ')}, + "updatedAt" = NOW() + WHERE id = $(id) + RETURNING ${prepareSelectColumns(PROJECT_CATALOG_COLUMNS)} + `, + params, + ) +} + +export async function updateProjectCatalogSyncedAt(qx: QueryExecutor, id: string): Promise { + await qx.selectNone( + ` + UPDATE "projectCatalog" + SET "syncedAt" = NOW(), "updatedAt" = NOW() + WHERE id = $(id) + `, + { id }, + ) +} + +export async function deleteProjectCatalog(qx: QueryExecutor, id: string): Promise { + return qx.result( + ` + DELETE FROM "projectCatalog" + WHERE id = $(id) + `, + { id }, + ) +} diff --git a/services/libs/data-access-layer/src/project-catalog/types.ts b/services/libs/data-access-layer/src/project-catalog/types.ts new file mode 100644 index 0000000000..8cbb39a310 --- /dev/null +++ b/services/libs/data-access-layer/src/project-catalog/types.ts @@ -0,0 +1,28 @@ +export interface IDbProjectCatalog { + id: string + projectSlug: string + repoName: string + repoUrl: string + ossfCriticalityScore: number | null + lfCriticalityScore: number | null + syncedAt: string | null + createdAt: string | null + updatedAt: string | null +} + +type ProjectCatalogWritable = Pick< + IDbProjectCatalog, + 'projectSlug' | 'repoName' | 'repoUrl' | 'ossfCriticalityScore' | 'lfCriticalityScore' +> + +export type IDbProjectCatalogCreate = Omit< + ProjectCatalogWritable, + 'ossfCriticalityScore' | 'lfCriticalityScore' +> & { + 
ossfCriticalityScore?: number + lfCriticalityScore?: number +} + +export type IDbProjectCatalogUpdate = Partial & { + syncedAt?: string +}