Skip to content

Commit e4e5fc2

Browse files
committed
fix: add new source
1 parent 1175e84 commit e4e5fc2

9 files changed

Lines changed: 319 additions & 76 deletions

File tree

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
import { listDatasets, processDataset } from './activities/activities'
1+
import { listDatasets, listSources, processDataset } from './activities/activities'
22

3-
export { listDatasets, processDataset }
3+
export { listDatasets, listSources, processDataset }

services/apps/automatic_projects_discovery_worker/src/activities/activities.ts

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,17 @@ import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor'
66
import { getServiceLogger } from '@crowd/logging'
77

88
import { svc } from '../main'
9-
import { getSource } from '../sources/registry'
9+
import { getAvailableSourceNames, getSource } from '../sources/registry'
1010
import { IDatasetDescriptor } from '../sources/types'
1111

1212
const log = getServiceLogger()
1313

1414
const BATCH_SIZE = 5000
1515

16+
export async function listSources(): Promise<string[]> {
17+
return getAvailableSourceNames()
18+
}
19+
1620
export async function listDatasets(sourceName: string): Promise<IDatasetDescriptor[]> {
1721
const source = getSource(sourceName)
1822
const datasets = await source.listAvailableDatasets()
@@ -32,40 +36,41 @@ export async function processDataset(
3236
log.info({ sourceName, datasetId: dataset.id, url: dataset.url }, 'Processing dataset...')
3337

3438
const source = getSource(sourceName)
39+
const stream = await source.fetchDatasetStream(dataset)
3540

36-
// We use streaming (not full download) because each CSV is ~119MB / ~750K rows.
37-
// Streaming keeps memory usage low (only one batch in memory at a time) and leverages
38-
// Node.js backpressure: if DB writes are slow, the HTTP stream pauses automatically.
39-
const httpStream = await source.fetchDatasetStream(dataset)
40-
41-
httpStream.on('error', (err: Error) => {
42-
log.error({ datasetId: dataset.id, error: err.message }, 'HTTP stream error.')
41+
stream.on('error', (err: Error) => {
42+
log.error({ datasetId: dataset.id, error: err.message }, 'Stream error.')
4343
})
4444

45-
// Pipe the raw HTTP response directly into csv-parse.
46-
// Data flows as: HTTP response → csv-parse → for-await → batch → DB
47-
const parser = httpStream.pipe(
48-
parse({
49-
columns: true,
50-
skip_empty_lines: true,
51-
trim: true,
52-
}),
53-
)
54-
55-
parser.on('error', (err) => {
56-
log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.')
57-
})
45+
// For CSV sources: pipe through csv-parse to get Record<string, string> objects.
46+
// For JSON sources: the stream already emits pre-parsed objects in object mode.
47+
const records =
48+
source.format === 'json'
49+
? stream
50+
: stream.pipe(
51+
parse({
52+
columns: true,
53+
skip_empty_lines: true,
54+
trim: true,
55+
}),
56+
)
57+
58+
if (source.format !== 'json') {
59+
;(records as ReturnType<typeof parse>).on('error', (err) => {
60+
log.error({ datasetId: dataset.id, error: err.message }, 'CSV parser error.')
61+
})
62+
}
5863

5964
let batch: IDbProjectCatalogCreate[] = []
6065
let totalProcessed = 0
6166
let totalSkipped = 0
6267
let batchNumber = 0
6368
let totalRows = 0
6469

65-
for await (const rawRow of parser) {
70+
for await (const rawRow of records) {
6671
totalRows++
6772

68-
const parsed = source.parseRow(rawRow)
73+
const parsed = source.parseRow(rawRow as Record<string, unknown>)
6974
if (!parsed) {
7075
totalSkipped++
7176
continue
@@ -75,7 +80,8 @@ export async function processDataset(
7580
projectSlug: parsed.projectSlug,
7681
repoName: parsed.repoName,
7782
repoUrl: parsed.repoUrl,
78-
criticalityScore: parsed.criticalityScore,
83+
ossfCriticalityScore: parsed.ossfCriticalityScore,
84+
lfCriticalityScore: parsed.lfCriticalityScore,
7985
})
8086

8187
if (batch.length >= BATCH_SIZE) {
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
import http from 'http'
2+
import https from 'https'
3+
import { Readable } from 'stream'
4+
5+
import { getServiceLogger } from '@crowd/logging'
6+
7+
import { IDatasetDescriptor, IDiscoverySource, IDiscoverySourceRow } from '../types'
8+
9+
const log = getServiceLogger()
10+
11+
const DEFAULT_API_URL = 'https://hypervascular-nonduplicative-vern.ngrok-free.dev'
12+
const PAGE_SIZE = 100
13+
14+
interface LfApiResponse {
15+
page: number
16+
pageSize: number
17+
total: number
18+
totalPages: number
19+
data: LfApiRow[]
20+
}
21+
22+
interface LfApiRow {
23+
runDate: string
24+
repoUrl: string
25+
owner: string
26+
repoName: string
27+
contributors: number
28+
organizations: number
29+
sizeSloc: number
30+
lastUpdated: number
31+
age: number
32+
commitFreq: number
33+
score: number
34+
}
35+
36+
function getApiBaseUrl(): string {
37+
return (process.env.LF_CRITICALITY_SCORE_API_URL ?? DEFAULT_API_URL).replace(/\/$/, '')
38+
}
39+
40+
async function fetchPage(
41+
baseUrl: string,
42+
startDate: string,
43+
endDate: string,
44+
page: number,
45+
): Promise<LfApiResponse> {
46+
const url = `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}&page=${page}&pageSize=${PAGE_SIZE}`
47+
48+
return new Promise((resolve, reject) => {
49+
const client = url.startsWith('https://') ? https : http
50+
51+
const req = client.get(url, (res) => {
52+
if (res.statusCode !== 200) {
53+
reject(new Error(`LF Criticality Score API returned status ${res.statusCode} for ${url}`))
54+
res.resume()
55+
return
56+
}
57+
58+
const chunks: Uint8Array[] = []
59+
res.on('data', (chunk: Uint8Array) => chunks.push(chunk))
60+
res.on('end', () => {
61+
try {
62+
resolve(JSON.parse(Buffer.concat(chunks).toString('utf8')) as LfApiResponse)
63+
} catch (err) {
64+
reject(new Error(`Failed to parse LF Criticality Score API response: ${err}`))
65+
}
66+
})
67+
res.on('error', reject)
68+
})
69+
70+
req.on('error', reject)
71+
req.end()
72+
})
73+
}
74+
75+
/**
76+
 * Returns the first and last day (UTC, formatted "YYYY-MM-DD") of the month at the given offset.
77+
* monthOffset = 0 → current month, -1 → previous month, etc.
78+
*/
79+
function monthRange(monthOffset: number): { startDate: string; endDate: string } {
80+
const now = new Date()
81+
const year = now.getUTCFullYear()
82+
const month = now.getUTCMonth() + monthOffset // can be negative; Date handles rollover
83+
84+
const first = new Date(Date.UTC(year, month, 1))
85+
const last = new Date(Date.UTC(year, month + 1, 0)) // last day of month
86+
87+
const pad = (n: number) => String(n).padStart(2, '0')
88+
const fmt = (d: Date) =>
89+
`${d.getUTCFullYear()}-${pad(d.getUTCMonth() + 1)}-${pad(d.getUTCDate())}`
90+
91+
return { startDate: fmt(first), endDate: fmt(last) }
92+
}
93+
94+
export class LfCriticalityScoreSource implements IDiscoverySource {
95+
public readonly name = 'lf-criticality-score'
96+
public readonly format = 'json' as const
97+
98+
async listAvailableDatasets(): Promise<IDatasetDescriptor[]> {
99+
const baseUrl = getApiBaseUrl()
100+
101+
// Return one dataset per month for the last 12 months (newest first)
102+
const datasets: IDatasetDescriptor[] = []
103+
104+
for (let offset = 0; offset >= -11; offset--) {
105+
const { startDate, endDate } = monthRange(offset)
106+
const id = startDate.slice(0, 7) // e.g. "2026-02"
107+
108+
datasets.push({
109+
id,
110+
date: startDate,
111+
url: `${baseUrl}/projects/scores?startDate=${startDate}&endDate=${endDate}`,
112+
})
113+
}
114+
115+
return datasets
116+
}
117+
118+
/**
119+
* Returns an object-mode Readable that fetches all pages from the API
120+
* and pushes each row as a plain object. Activities.ts iterates this
121+
 * directly (no csv-parse) because format === 'json'. NOTE(review): the producer
 * loop pushes rows without awaiting consumer demand, so a slow consumer can cause
 * rows to buffer in the stream's internal queue — confirm dataset sizes make this acceptable.
122+
*/
123+
async fetchDatasetStream(dataset: IDatasetDescriptor): Promise<Readable> {
124+
const baseUrl = getApiBaseUrl()
125+
126+
// Extract startDate and endDate from the stored URL
127+
const parsed = new URL(dataset.url)
128+
const startDate = parsed.searchParams.get('startDate') ?? ''
129+
const endDate = parsed.searchParams.get('endDate') ?? ''
130+
131+
const stream = new Readable({ objectMode: true, read() {} })
132+
133+
// Fetch pages asynchronously and push rows into the stream
134+
;(async () => {
135+
try {
136+
let page = 1
137+
let totalPages = 1
138+
139+
do {
140+
const response = await fetchPage(baseUrl, startDate, endDate, page)
141+
totalPages = response.totalPages
142+
143+
for (const row of response.data) {
144+
stream.push(row)
145+
}
146+
147+
log.debug(
148+
{ datasetId: dataset.id, page, totalPages, rowsInPage: response.data.length },
149+
'LF Criticality Score page fetched.',
150+
)
151+
152+
page++
153+
} while (page <= totalPages)
154+
155+
stream.push(null) // signal end of stream
156+
} catch (err) {
157+
stream.destroy(err instanceof Error ? err : new Error(String(err)))
158+
}
159+
})()
160+
161+
return stream
162+
}
163+
164+
parseRow(rawRow: Record<string, unknown>): IDiscoverySourceRow | null {
165+
const repoUrl = rawRow['repoUrl'] as string | undefined
166+
if (!repoUrl) {
167+
return null
168+
}
169+
170+
let repoName = ''
171+
let projectSlug = ''
172+
173+
try {
174+
const urlPath = new URL(repoUrl).pathname.replace(/^\//, '').replace(/\/$/, '')
175+
projectSlug = urlPath
176+
repoName = urlPath.split('/').pop() || ''
177+
} catch {
178+
const parts = repoUrl.replace(/\/$/, '').split('/')
179+
projectSlug = parts.slice(-2).join('/')
180+
repoName = parts.pop() || ''
181+
}
182+
183+
if (!projectSlug || !repoName) {
184+
return null
185+
}
186+
187+
const score = rawRow['score']
188+
const lfCriticalityScore = typeof score === 'number' ? score : parseFloat(score as string)
189+
190+
return {
191+
projectSlug,
192+
repoName,
193+
repoUrl,
194+
lfCriticalityScore: Number.isNaN(lfCriticalityScore) ? undefined : lfCriticalityScore,
195+
}
196+
}
197+
}

services/apps/automatic_projects_discovery_worker/src/sources/ossf-criticality-score/source.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ export class OssfCriticalityScoreSource implements IDiscoverySource {
3939
}
4040

4141
// CSV column names may use dot notation for nested fields (e.g. "repo.url"); flat fields keep plain names (e.g. "default_score")
42-
parseRow(rawRow: Record<string, string>): IDiscoverySourceRow | null {
43-
const repoUrl = rawRow['repo.url']
42+
parseRow(rawRow: Record<string, unknown>): IDiscoverySourceRow | null {
43+
const repoUrl = rawRow['repo.url'] as string | undefined
4444
if (!repoUrl) {
4545
return null
4646
}
@@ -62,14 +62,14 @@ export class OssfCriticalityScoreSource implements IDiscoverySource {
6262
return null
6363
}
6464

65-
const criticalityScoreRaw = rawRow['default_score']
66-
const criticalityScore = criticalityScoreRaw ? parseFloat(criticalityScoreRaw) : undefined
65+
const scoreRaw = rawRow['default_score']
66+
const ossfCriticalityScore = scoreRaw ? parseFloat(scoreRaw as string) : undefined
6767

6868
return {
6969
projectSlug,
7070
repoName,
7171
repoUrl,
72-
criticalityScore: Number.isNaN(criticalityScore) ? undefined : criticalityScore,
72+
ossfCriticalityScore: Number.isNaN(ossfCriticalityScore) ? undefined : ossfCriticalityScore,
7373
}
7474
}
7575
}

services/apps/automatic_projects_discovery_worker/src/sources/registry.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1+
import { LfCriticalityScoreSource } from './lf-criticality-score/source'
12
import { OssfCriticalityScoreSource } from './ossf-criticality-score/source'
23
import { IDiscoverySource } from './types'
34

45
// To add a new source: instantiate it here.
5-
const sources: IDiscoverySource[] = [new OssfCriticalityScoreSource()]
6+
const sources: IDiscoverySource[] = [
7+
new OssfCriticalityScoreSource(),
8+
new LfCriticalityScoreSource(),
9+
]
610

711
export function getSource(name: string): IDiscoverySource {
812
const source = sources.find((s) => s.name === name)

services/apps/automatic_projects_discovery_worker/src/sources/types.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,20 @@ export interface IDatasetDescriptor {
88

99
export interface IDiscoverySource {
1010
name: string
11+
/**
12+
* 'csv' (default): fetchDatasetStream returns a raw text stream, piped through csv-parse.
13+
* 'json': fetchDatasetStream returns an object-mode Readable that emits pre-parsed records.
14+
*/
15+
format?: 'csv' | 'json'
1116
listAvailableDatasets(): Promise<IDatasetDescriptor[]>
1217
fetchDatasetStream(dataset: IDatasetDescriptor): Promise<Readable>
13-
parseRow(rawRow: Record<string, string>): IDiscoverySourceRow | null
18+
parseRow(rawRow: Record<string, unknown>): IDiscoverySourceRow | null
1419
}
1520

1621
export interface IDiscoverySourceRow {
1722
projectSlug: string
1823
repoName: string
1924
repoUrl: string
20-
criticalityScore?: number
25+
ossfCriticalityScore?: number
26+
lfCriticalityScore?: number
2127
}

0 commit comments

Comments
 (0)