diff --git a/.gitignore b/.gitignore index 571150641..e2da7cc59 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,5 @@ cert.txt # local MCP server config (contains auth tokens) .mcp.json mcp-server/create-agent-key.ts +.gcloudignore +CLAUDE.md diff --git a/components/Navbar.tsx b/components/Navbar.tsx index 942bc54fa..3c68cdd44 100644 --- a/components/Navbar.tsx +++ b/components/Navbar.tsx @@ -12,6 +12,7 @@ import { NavbarLinkBallotQuestions, DESKTOP_NAV_ITEM_CLASS, NavbarLinkAI, + NavbarLinkAiTools, NavbarLinkBills, NavbarLinkHearings, NavbarLinkEditProfile, @@ -92,6 +93,7 @@ const MobileNav: React.FC> = () => { + ) @@ -270,6 +272,7 @@ const DesktopNav: React.FC> = () => { + diff --git a/components/NavbarComponents.tsx b/components/NavbarComponents.tsx index 668cf5eea..a378acaed 100644 --- a/components/NavbarComponents.tsx +++ b/components/NavbarComponents.tsx @@ -81,6 +81,26 @@ export const Avatar = () => { ) } +export const NavbarLinkAiTools: React.FC< + React.PropsWithChildren<{ + handleClick?: any + other?: any + }> +> = ({ handleClick, other }) => { + const isMobile = useMediaQuery("(max-width: 768px)") + const { t } = useTranslation(["common", "auth"]) + return ( + + {t("navigation.aiTools")} + + ) +} + export const NavbarLinkAI: React.FC< React.PropsWithChildren<{ handleClick?: any diff --git a/components/about/MapleAI/MapleAI.tsx b/components/about/MapleAI/MapleAI.tsx index 54cfec967..6f9f753e7 100644 --- a/components/about/MapleAI/MapleAI.tsx +++ b/components/about/MapleAI/MapleAI.tsx @@ -1,5 +1,6 @@ import { useTranslation } from "next-i18next" import { Container, Row, Col } from "../../bootstrap" +import { Internal } from "../../links" import { MemberItem, Divider, @@ -184,6 +185,21 @@ const MapleAI = () => { + + + + {t("aiResearch.title")} + + {t("aiResearch.desc")} + + + + {t("aiResearch.linkText")} + + + + + ) } diff --git a/components/learn/AiTools/AiTools.tsx b/components/learn/AiTools/AiTools.tsx new file mode 100644 index 000000000..5c8d6fe6e 
--- /dev/null +++ b/components/learn/AiTools/AiTools.tsx @@ -0,0 +1,313 @@ +import { useTranslation } from "next-i18next" +import styled from "styled-components" +import { Container, Row, Col } from "../../bootstrap" +import { + DescrContainer, + Divider, + PageDescr, + PageTitle, + SectionContainer, + SectionTitle +} from "../../shared/CommonComponents" +import { Internal } from "../../links" + +const ExampleBox = styled.div` + background: var(--maple-surface-raised); + border-left: 4px solid var(--maple-brand-primary); + border-radius: 0 var(--maple-radius-md) var(--maple-radius-md) 0; + padding: 1rem 1.25rem; + margin: 0.75rem 0; + font-size: 15px; + font-style: italic; + color: var(--maple-text-body); +` + +const StepNumber = styled.div` + display: flex; + align-items: center; + justify-content: center; + width: 2rem; + height: 2rem; + border-radius: 50%; + background: var(--maple-brand-primary); + color: white; + font-weight: 700; + font-size: 1rem; + flex-shrink: 0; + margin-right: 0.75rem; +` + +const StepRow = styled.div` + display: flex; + align-items: flex-start; + padding: 0.75rem 1.25rem; +` + +const StepText = styled.div` + font-size: 16px; + font-weight: 500; + color: var(--maple-text-body); + line-height: 1.5; +` + +const ExampleLabel = styled.div` + font-size: 13px; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 0.06em; + color: var(--maple-brand-primary); + margin-bottom: 0.25rem; +` + +export const AiTools = () => { + const { t } = useTranslation("aiTools") + + return ( + + + + {t("title")} + + + + + {t("description")} + + + + {/* What is it */} + + + + {t("section1.title")} + + {t("section1.desc1")} + + + {t("section1.desc2Pre")}{" "} + + + {t("section1.desc2LinkText")} + + + {t("section1.desc2Post")} + + + + + + {/* What you can do */} + + + + {t("section2.title")} + + {t("section2.intro")} + + +
    +
  • + {t("section2.item1Bold")} {t("section2.item1Main")} +
  • +
  • + {t("section2.item2Bold")} {t("section2.item2Main")} +
  • +
  • + {t("section2.item3Bold")} {t("section2.item3Main")} +
  • +
  • + {t("section2.item4Bold")} {t("section2.item4Main")} +
  • +
  • + {t("section2.item5Bold")} {t("section2.item5Main")} +
  • +
  • + {t("section2.item6Bold")} {t("section2.item6Main")} +
  • +
  • + {t("section2.item7Bold")} {t("section2.item7Main")} +
  • +
+
+
+ +
+ + {/* Examples */} + + + + {t("section3.title")} + + {t("section3.intro")} + +
+ {t("section3.example1Label")} + + “{t("section3.example1Text")}” + + + {t("section3.example2Label")} + + “{t("section3.example2Text")}” + + + {t("section3.example3Label")} + + “{t("section3.example3Text")}” + + + {t("section3.example4Label")} + + “{t("section3.example4Text")}” + +
+
+ +
+ + {/* How to get started */} + + + + {t("section4.title")} + + {t("section4.intro")} + + + 1 + + {t("section4.step1Bold")} {t("section4.step1Pre")}{" "} + {t("section4.step1LinkText")} + {t("section4.step1Post")} + + + + + 2 + + {t("section4.step2Bold")} {t("section4.step2Intro")} + + + + + + 3 + + {t("section4.step3Bold")} {t("section4.step3Pre")}{" "} + + {t("section4.step3LinkText")} + {" "} + {t("section4.step3Post")} + + + + + 4 + + {t("section4.step4Bold")} {t("section4.step4Pre")}{" "} + YOUR_TOKEN_HERE {t("section4.step4Post")} + + +
+
{`{
+  "mcpServers": {
+    "maple": {
+      "type": "http",
+      "url": "https://mapletestimony.org/api/mcp",
+      "headers": {
+        "X-Maple-Token": "Bearer YOUR_TOKEN_HERE"
+      }
+    }
+  }
+}`}
+
+ + + 5 + + {t("section4.step5Bold")} {t("section4.step5Post")} + + + + {t("section4.needHelp")} {t("section4.needHelpPost")}{" "} + + info@mapletestimony.org + + . + +
+ +
+ + {/* Privacy */} + + + + {t("section5.title")} + + {t("section5.desc1")} + + + {t("section5.desc2Pre")}{" "} + + {t("section5.desc2LinkText")} + + + + + +
+ ) +} + +export default AiTools diff --git a/firebase.json b/firebase.json index b2ddccc3a..4001ccb22 100644 --- a/firebase.json +++ b/firebase.json @@ -2,7 +2,14 @@ "hosting": { "public": "out", "cleanUrls": true, - "ignore": ["firebase.json", "**/.*", "**/node_modules/**"] + "ignore": ["firebase.json", "**/.*", "**/node_modules/**"], + "rewrites": [ + { + "source": "/api/mcp", + "function": "mcpProxy", + "region": "us-central1" + } + ] }, "functions": [ { diff --git a/functions/.env.digital-testimony-dev b/functions/.env.digital-testimony-dev new file mode 100644 index 000000000..f278cf968 --- /dev/null +++ b/functions/.env.digital-testimony-dev @@ -0,0 +1 @@ +MCP_SERVER_URL=https://maple-mcp-server-ke6znoupgq-uc.a.run.app diff --git a/functions/package.json b/functions/package.json index 0f6ef6ce6..89185a27e 100644 --- a/functions/package.json +++ b/functions/package.json @@ -13,16 +13,17 @@ }, "main": "lib/index.js", "dependencies": { + "@google-cloud/aiplatform": "^3.9.0", "@google-cloud/firestore": "^5.0.2", "@google-cloud/pubsub": "^3.0.1", "assemblyai": "^4.9.0", "axios": "^0.25.0", "date-fns": "^2.30.0", "firebase-admin": "^12.0.0", - "@google-cloud/aiplatform": "^3.9.0", "firebase-functions": "^5.1.1", "fluent-ffmpeg": "^2.1.3", "fuse.js": "6.5.3", + "google-auth-library": "^10.6.2", "handlebars": "^4.7.8", "js-sha256": "^0.11.0", "jsdom": "^26.0.0", diff --git a/functions/src/index.ts b/functions/src/index.ts index 641255bf4..231eb52c6 100644 --- a/functions/src/index.ts +++ b/functions/src/index.ts @@ -62,6 +62,8 @@ export { transcription } from "./webhooks" export * from "./triggerPubsubFunction" +export { mcpProxy } from "./mcp/proxy" + // Export the health check last so it is loaded last. 
export * from "./healthCheck" diff --git a/functions/src/mcp/proxy.ts b/functions/src/mcp/proxy.ts new file mode 100644 index 000000000..77b171a7a --- /dev/null +++ b/functions/src/mcp/proxy.ts @@ -0,0 +1,85 @@ +import * as functions from "firebase-functions" + +/** + * Firebase Function proxy for the MAPLE MCP server on Cloud Run. + * + * Exposed at: /api/mcp (via Firebase Hosting rewrite) + * Target: MCP_SERVER_URL env var (Cloud Run, --no-allow-unauthenticated) + * + * Header flow: + * Client → X-Maple-Token: Bearer <MAPLE token> (Authorization is read only as a fallback) + * Proxy → Authorization: Bearer <GCP identity token> (Cloud Run IAM) + * X-Maple-Authorization: Bearer <MAPLE token> (MCP auth middleware) + */ +export const mcpProxy = functions + .runWith({ timeoutSeconds: 30, memory: "256MB" }) + .https.onRequest(async (req, res) => { + const mcpUrl = process.env.MCP_SERVER_URL + if (!mcpUrl) { + res.status(503).json({ error: "MCP service not configured" }) + return + } + + if (req.method !== "POST") { + res.status(405).json({ error: "Method Not Allowed" }) + return + } + + // Firebase Functions strips the Authorization header from allUsers-accessible + // functions before the code runs. Clients must use X-Maple-Token instead. + const mapleAuth = + (req.headers["x-maple-token"] as string | undefined) ?? + req.headers.authorization + if (!mapleAuth) { + res.status(401).json({ + error: "Unauthorized: Missing token", + help: "Visit https://mapletestimony.org/learn/ai-tools for setup instructions." + }) + return + } + + try { + // Use GCP metadata server to get an identity token for Cloud Run IAM. + // This is the most reliable approach in any GCP-hosted environment. 
+ const metadataUrl = `http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/identity?audience=${encodeURIComponent( + mcpUrl + )}&format=full` + const tokenRes = await fetch(metadataUrl, { + headers: { "Metadata-Flavor": "Google" } + }) + const idToken = await tokenRes.text() + + const body = JSON.stringify(req.body) + + const upstream = await fetch(`${mcpUrl}/mcp`, { + method: "POST", + headers: { + Authorization: `Bearer ${idToken}`, + "X-Maple-Authorization": mapleAuth.startsWith("Bearer ") + ? mapleAuth + : `Bearer ${mapleAuth}`, + "Content-Type": req.headers["content-type"] ?? "application/json", + ...(req.headers["accept"] && { + Accept: req.headers["accept"] as string + }), + ...(req.headers["mcp-protocol-version"] && { + "MCP-Protocol-Version": req.headers[ + "mcp-protocol-version" + ] as string + }) + }, + body + }) + + const responseBody = await upstream.arrayBuffer() + res.status(upstream.status) + const ct = upstream.headers.get("content-type") + if (ct) res.setHeader("Content-Type", ct) + res.end(Buffer.from(responseBody)) + } catch (err) { + functions.logger.error("MCP proxy error", err) + if (!res.headersSent) { + res.status(502).json({ error: "Bad Gateway" }) + } + } + }) diff --git a/functions/yarn.lock b/functions/yarn.lock index a4b30343e..9672799c1 100644 --- a/functions/yarn.lock +++ b/functions/yarn.lock @@ -3609,6 +3609,24 @@ gaxios@^7.0.0, gaxios@^7.0.0-rc.4: https-proxy-agent "^7.0.1" node-fetch "^3.3.2" +gaxios@^7.1.4: + version "7.1.4" + resolved "https://registry.yarnpkg.com/gaxios/-/gaxios-7.1.4.tgz#33a5b78e2c5c01cf5a5d17f58dd188839867fc9c" + integrity sha512-bTIgTsM2bWn3XklZISBTQX7ZSddGW+IO3bMdGaemHZ3tbqExMENHLx6kKZ/KlejgrMtj8q7wBItt51yegqalrA== + dependencies: + extend "^3.0.2" + https-proxy-agent "^7.0.1" + node-fetch "^3.3.2" + +gcp-metadata@8.1.2: + version "8.1.2" + resolved "https://registry.yarnpkg.com/gcp-metadata/-/gcp-metadata-8.1.2.tgz#e62e3373ddf41fc727ccc31c55c687b798bee898" + integrity 
sha512-zV/5HKTfCeKWnxG0Dmrw51hEWFGfcF2xiXqcA3+J90WDuP0SvoiSO5ORvcBsifmx/FoIjgQN3oNOGaQ5PhLFkg== + dependencies: + gaxios "^7.0.0" + google-logging-utils "^1.0.0" + json-bigint "^1.0.0" + gcp-metadata@^4.2.0: version "4.3.1" resolved "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-4.3.1.tgz" @@ -3777,6 +3795,18 @@ google-auth-library@^10.0.0, google-auth-library@^10.0.0-rc.1: gtoken "^8.0.0" jws "^4.0.0" +google-auth-library@^10.6.2: + version "10.6.2" + resolved "https://registry.yarnpkg.com/google-auth-library/-/google-auth-library-10.6.2.tgz#44557c536aec626b7cda48a85b5d026e2c9b74c4" + integrity sha512-e27Z6EThmVNNvtYASwQxose/G57rkRuaRbQyxM2bvYLLX/GqWZ5chWq2EBoUchJbCc57eC9ArzO5wMsEmWftCw== + dependencies: + base64-js "^1.3.0" + ecdsa-sig-formatter "^1.0.11" + gaxios "^7.1.4" + gcp-metadata "8.1.2" + google-logging-utils "1.1.3" + jws "^4.0.0" + google-auth-library@^7.14.0: version "7.14.1" resolved "https://registry.npmjs.org/google-auth-library/-/google-auth-library-7.14.1.tgz" @@ -3877,6 +3907,11 @@ google-gax@^4.0.3, google-gax@^4.3.3: retry-request "^7.0.0" uuid "^9.0.1" +google-logging-utils@1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/google-logging-utils/-/google-logging-utils-1.1.3.tgz#17b71f1f95d266d2ddd356b8f00178433f041b17" + integrity sha512-eAmLkjDjAFCVXg7A1unxHsLf961m6y17QFqXqAXGj/gVkKFrEICfStRfwUlGNfeCEjNRa32JEWOUTlYXPyyKvA== + google-logging-utils@^0.0.2: version "0.0.2" resolved "https://registry.yarnpkg.com/google-logging-utils/-/google-logging-utils-0.0.2.tgz#5fd837e06fa334da450433b9e3e1870c1594466a" diff --git a/mcp-server/.dockerignore b/mcp-server/.dockerignore new file mode 100644 index 000000000..46712876a --- /dev/null +++ b/mcp-server/.dockerignore @@ -0,0 +1,8 @@ +node_modules +dist +*.test.ts +jest.config.js +implementation_plan.md +.env +.env.* +create-agent-key.ts diff --git a/mcp-server/Dockerfile b/mcp-server/Dockerfile new file mode 100644 index 000000000..79496c632 --- /dev/null +++ 
b/mcp-server/Dockerfile @@ -0,0 +1,34 @@ +# ── Build stage ─────────────────────────────────────────────────────────────── +FROM node:20-slim AS builder + +WORKDIR /app + +COPY package.json package-lock.json ./ +RUN npm ci + +COPY tsconfig.json ./ +COPY *.ts ./ +RUN npm run build + +# ── Runtime stage ───────────────────────────────────────────────────────────── +FROM node:20-slim AS runtime + +WORKDIR /app + +# Install only production dependencies +COPY package.json package-lock.json ./ +RUN npm ci --omit=dev + +# Copy compiled output +COPY --from=builder /app/dist ./dist + +# Cloud Run sets PORT; default to 8080 to match Cloud Run's expected port +ENV PORT=8080 +ENV HOST=0.0.0.0 + +# FIREBASE_PROJECT_ID and CURRENT_COURT must be set at deploy time +# No GOOGLE_APPLICATION_CREDENTIALS needed — Cloud Run uses Workload Identity + +EXPOSE 8080 + +CMD ["node", "dist/index-http.js"] diff --git a/mcp-server/auth.ts b/mcp-server/auth.ts index e67cec81a..7ed0d6915 100644 --- a/mcp-server/auth.ts +++ b/mcp-server/auth.ts @@ -6,13 +6,23 @@ export async function hybridAuthMiddleware( res: Response, next: NextFunction ) { - const authHeader = req.headers.authorization + // Header precedence (highest to lowest): + // X-Maple-Authorization — set by the Firebase Function proxy + // X-Maple-Token — set by MCP clients (Firebase strips Authorization on allUsers functions) + // Authorization — direct connections (local stdio, curl testing) + const authHeader = + (req.headers["x-maple-authorization"] as string | undefined) ?? + (req.headers["x-maple-token"] as string | undefined) ?? + req.headers.authorization const token = authHeader?.startsWith("Bearer ") ? authHeader.split(" ")[1] : undefined if (!token) { - return res.status(401).json({ error: "Unauthorized: Missing token" }) + return res.status(401).json({ + error: "Unauthorized: Missing token", + help: "A MAPLE account and API token are required. Visit https://mapletestimony.org/learn/ai-tools for setup instructions." 
+ }) } try { @@ -44,7 +54,10 @@ export async function hybridAuthMiddleware( return next() } - return res.status(401).json({ error: "Unauthorized: Invalid token" }) + return res.status(401).json({ + error: "Unauthorized: Invalid token", + help: "Visit https://mapletestimony.org/learn/ai-tools to get a valid MAPLE API token." + }) } catch (error) { console.error("Auth middleware error:", error) return res.status(500).json({ error: "Internal Server Error" }) diff --git a/mcp-server/implementation_plan.md b/mcp-server/implementation_plan.md index 81fb52be8..79a10a796 100644 --- a/mcp-server/implementation_plan.md +++ b/mcp-server/implementation_plan.md @@ -1,154 +1,112 @@ -This document outlines the implementation for a Model Context Protocol (MCP) server enabling RAG operations over MAPLE data. +This document describes the implemented MCP server enabling AI-powered RAG over MAPLE legislative data. -| Category | Description | -| :------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Goals** | • Inform and empower constituents for policy change
• Increase engagement between legislature and constituents
• Grow MAPLE usage by organizations, advocates, and journalists | -| **Jobs** | • "Tell me about bills that would..." (RAG over bills)
• "Tell me what people are saying about..." (RAG over testimony)
• "Tell me about the 2026 ballot questions regarding..." (RAG over ballot questions)
• "Tell me what other states are doing..." or "if it is true that..." (RAG over MAPLE + Web) | -| **Mechanisms** | • Deploy AI features on MAPLE
• Enable 3rd parties (sites, Agent Skills)
• Enable individual authorized users to leverage MAPLE data | +| Category | Description | +| :------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Goals** | • Inform and empower constituents for policy change
• Increase engagement between legislature and constituents
• Grow MAPLE usage by organizations, advocates, and journalists | +| **Jobs** | • "Tell me about bills that would..." (RAG over bills)
• "Tell me what people are saying about..." (RAG over testimony)
• "Tell me about the 2026 ballot questions regarding..." (RAG over ballot questions) | +| **Mechanisms** | • Deploy AI features on MAPLE
• Enable 3rd parties (sites, Agent Skills)
• Enable individual authorized users to leverage MAPLE data | ## Architecture +### High-level data flow + ```mermaid graph TD - FB[(Firebase: Bills, Testimony, Ballot Qs)] --> FV[Firebase Vector DB] - FV --> MCP[MCP Server] - MCP --> M1[MAPLE App] - MCP --> M2[3rd Party Apps / Agent Skills] - MCP --> M3[Authorized Users] + FB[(Firestore: Bills, Testimony, Ballot Qs)] -->|vector_embedding field| FV[Firestore Vector Index] + VA[Vertex AI\ntext-embedding-005] -->|backfill + triggers| FB + FV -->|findNearest| MCP[MCP Server\nCloud Run] + MCP --> M1[MAPLE App\nmapletestimony.org] + MCP --> M2[3rd Party AI Clients\nClaude, ChatGPT, etc.] ``` -## Environments - -- **DEV**: Project `digital-testimony-dev`. All initial development, backfilling, and prototype testing will occur here. -- **PROD**: Project `digital-testimony-prod`. Final deployment only after successful verification in DEV. **DO NOT MODIFY PROD DURING INITIAL PROTOTYPING.** - -## Development Flow - -1. **Infrastructure**: Create `agentKeys` in Firestore (DEV) and deploy vector indexes via `firebase deploy --only firestore:indexes`. -2. **Indexing**: Implement and run `backfill-embeddings.ts` against the DEV project to populate `vector_embedding` fields. -3. **MCP Core**: Initialize `mcp-server` package and implement `tools.ts` with Vertex AI + Firestore `findNearest` logic. -4. **Auth & Local Test**: Implement Hybrid Auth and verify the prototype locally using Stdio transport and the MCP Inspector. -5. **Remote Deployment**: Deploy to Cloud Run (DEV) using SSE transport and verify with a remote agent. - -## Proposed Changes - -### Firestore Vector Search Setup - -We need to prepare Firestore to support vector queries. - -#### [MODIFY] [firestore.indexes.json](../firestore.indexes.json) +### Auth flow -- Add vector indexes for `bills`, `publishedTestimony`, and `ballotQuestions` collections (dimension 3072 for Gemini Embedding 2). 
- -#### [NEW] `scripts/backfill-embeddings.ts` - -- A script to iterate through all bills, testimony, and ballot questions, generate embeddings using **Vertex AI**, and save them to a new `vector_embedding` field in Firestore. - -#### [MODIFY] [bill_on_document_created.py](../llm/bill_on_document_created.py) (and similar triggers) +```mermaid +sequenceDiagram + participant C as AI Client + participant F as Firebase Function\nmcpProxy + participant CR as Cloud Run\nmcp-server + participant FS as Firestore\nagentKeys + + C->>F: POST /api/mcp\nX-Maple-Token: Bearer + Note over F: Firebase strips Authorization header\nfrom allUsers-accessible functions,\nso X-Maple-Token is used instead + F->>F: Fetch GCP identity token\nfrom metadata server + F->>CR: POST /mcp\nAuthorization: Bearer \nX-Maple-Authorization: Bearer + Note over CR: Cloud Run IAM validates\nGCP identity token + CR->>CR: Try Firebase ID token verification + alt Firebase ID token + CR-->>F: next() — user authenticated + else Agent Key + CR->>FS: GET agentKeys/{token} + FS-->>CR: {active: true, ...} + CR-->>F: next() — agent authenticated + else Invalid + CR-->>F: 401 + help URL + end + CR->>CR: Rate limit check\n(60/min, 1000/day per token) + CR->>CR: Embed query via Vertex AI\nfindNearest in Firestore + CR-->>F: MCP JSON response + F-->>C: MCP JSON response +``` -- Update existing LLM triggers to generate embeddings using Vertex AI whenever a new bill, testimony, or ballot question is added/updated. +## Environments -### Continuous Synchronization +- **DEV**: Project `digital-testimony-dev`. Cloud Run service and Firebase Function deployed and tested. +- **PROD**: Project `digital-testimony-prod`. Embeddings backfilled; deployment pending. 
-To keep the vector database updated in real-time, we will implement **Firestore Triggers** (Vector Indexers) for the following paths: +## What's implemented -| Data Type | Firestore Path | Logic Location | -| :------------ | :------------------------------------ | :---------------------------------------- | -| **Bills** | `generalCourts/{court}/bills/{id}` | `functions/src/bills/vector.ts` | -| **Testimony** | `users/{uid}/publishedTestimony/{id}` | `functions/src/testimony/vector.ts` | -| **Ballot Qs** | `ballotQuestions/{id}` | `functions/src/ballotQuestions/vector.ts` | +### Firestore vector search -- **Pattern**: Create a `createVectorIndexer` utility in TypeScript (mirroring the existing `createSearchIndexer`) to standardize embedding generation across all collections. +- **Vector indexes**: All required composite indexes in `firestore.indexes.json` — deployed to dev and prod. +- **Embedding model**: `text-embedding-005` (768 dimensions) via Vertex AI Predict API. +- **Backfill**: `scripts/firebase-admin/backfill-embeddings-parallel.ts` — parallel (concurrency=8) with exponential backoff. Run against both dev and prod. +- **Migration**: `scripts/firebase-admin/migrate-embeddings-to-vector.ts` — one-time migration of plain-array embeddings to Firestore `VectorValue` format (required for `findNearest`). +- **Continuous sync**: `functions/src/search/createVectorIndexer.ts` triggers on document write to keep embeddings current. ---- +### MCP server (`mcp-server/`) -### MCP Server Implementation +7 tools implemented in `tools.ts`: -#### [NEW] `mcp-server/` (New Package) +| Tool | Description | +| :------------------------ | :--------------------------------------------------------------------------------------------------------------------- | +| `search_bills` | Vector search on bills. Filters: `legislationType`, `topic`, `committee`, `primarySponsor`, `court`, `includeFullText` | +| `search_testimony` | Vector search on testimony. 
Filters: `policyType`, `policyId`, `authorDisplayName`, `court` | +| `search_ballot_questions` | Vector search on ballot questions | +| `search_policies` | Unified search across bills + ballot questions, sorted by relevance | +| `list_topics` | Returns all valid AI-assigned topic tags by category | +| `list_committees` | Returns active committee names for use as filters | +| `list_sponsors` | Returns primary sponsor names for use as filters | -Initialize a new Node.js package with the following structure: +Auth (`auth.ts`): checks `X-Maple-Authorization` → `X-Maple-Token` → `Authorization` in that order. Accepts Firebase ID Token or agent key. -- `package.json`: Include `@modelcontextprotocol/sdk`, `firebase-admin`, `@google-cloud/aiplatform`. -- `index.ts`: Main entry point implementing the MCP server with **SSE transport**. -- `tools.ts`: Implementation of RAG tools: - - `search_policies`: Unified vector search across both **bills** and **ballot questions**. - - `search_bills`: Focused vector search on legislative bills. - - `search_ballot_questions`: Focused vector search on ballot questions. - - `search_testimony`: Vector search on testimony, with optional `policyType` (bill/ballot) and `policyId` filters. - - `get_bill_details`: Fetch full bill content. - - `get_testimony_details`: Fetch testimony content. - - `get_ballot_question_details`: Fetch ballot question content. -- `auth.ts`: Middleware for **Hybrid Auth**: - 1. Check for `Authorization: Bearer `. - 2. If token is a Firebase ID Token, verify via `admin.auth()`. - 3. If token is an Agent Key, verify via lookup in `/agentKeys/{key}` Firestore collection. +Rate limiting (`rateLimit.ts`): 60 req/min and 1,000 req/day per token, tracked in memory per server process (counters are not shared across Cloud Run instances and reset when an instance restarts). ---- +Transports: HTTP (`index-http.ts`) for Cloud Run; stdio (`index.ts`) for local use. ### Deployment -#### [NEW] `mcp-server/Dockerfile` - -- Containerize the MCP server for Cloud Run deployment. 
- -#### [NEW] `infra/deploy-mcp.sh` - -- Script to build and deploy the container to Google Cloud Run, supporting `--env dev` and `--env prod` targets. - -## Scalability & Cost Estimates - -### Scalability Analysis - -- **Cloud Run**: Scales from 0 to 1000+ instances. Initial users (5/day) will stay within the free tier. Scaling to 100+ users will trigger horizontal scaling with minimal latency impact. -- **Firestore Vector Search**: Managed by Google to scale transparently with document count and search volume. -- **Vertex AI**: High-throughput API that handles concurrent embedding requests effortlessly. - -### Estimated Monthly Costs - -| Component | Initial (25 queries/day) | Scale (500 queries/day) | Notes | -| :----------------------- | :----------------------- | :---------------------- | :------------------------------ | -| **Vertex AI Embeddings** | ~$0.00 / month | ~$0.08 / month | $0.025 per 1M characters | -| **Firestore Reads** | $0.00 (Free Tier) | $0.00 (Free Tier) | Free up to 50k reads/day | -| **Cloud Run Compute** | $0.00 (Free Tier) | $0.00 (Free Tier) | Free tier: 180k vCPU-s/mo | -| **Network Egress** | ~$0.00 / month | ~$0.05 / month | Standard GCP rates | -| **Total Monthly Cost** | **~$0.00** | **~$0.15** | **Mostly covered by Free Tier** | - -> [!NOTE] > **One-time Backfill Cost**: ~$1.50 (assuming 10k documents and 50M characters). - -## Verification Plan - -### Automated Tests - -- Unit tests for `mcp-server` tools using mocked Firestore and Vertex AI. -- Integration tests using the Firebase Emulator. - -### Manual Verification - -- Deploy to Cloud Run staging environment. -- Use the **MCP Inspector** or a custom script to connect via SSE. -- Verify that `search_bills` and `search_testimony` return relevant results for various queries using both Firebase Auth tokens and Agent Keys. 
- ---- - -# Integration and Configuration - -#### [MODIFY] [package.json](../package.json) - -- Add a script to start the MCP server: `"mcp:start": "ts-node mcp-server/index.ts"`. +- **Cloud Run**: `mcp-server/Dockerfile` (two-stage build). Deployed to `digital-testimony-dev` with `max-instances=2`, `--no-allow-unauthenticated`. +- **Firebase Function proxy**: `functions/src/mcp/proxy.ts` (`mcpProxy`). Deployed to dev. Exposes `/api/mcp` via `firebase.json` hosting rewrite once hosting is deployed. +- **Billing budget**: $60/month alert on dev. -#### [NEW] `mcp-server/.env.example` +### User guide -- Template for required environment variables: `FIREBASE_PROJECT_ID`, `GOOGLE_APPLICATION_CREDENTIALS`, `MCP_API_KEY`. +New page at `/learn/ai-tools` (Learn nav menu) explaining setup for non-technical advocates, including Claude Desktop and ChatGPT instructions, example queries, and privacy notes. -## Verification Plan +## Remaining work (before prod deploy) -### Automated Tests +- [ ] Deploy Cloud Run + `mcpProxy` to prod +- [ ] CI/CD pipeline for Cloud Run image build and deploy on merge +- [ ] Hosting deploy needed to activate `/api/mcp` rewrite — nav link should not be publicly promoted until complete -- Unit tests for `mcp-server` tools using mocked Firestore and Vertex AI. -- Integration tests using the Firebase Emulator (if it supports vector search, otherwise against a dev project). +## Cost estimates -### Manual Verification +| Component | 25 queries/day | 500 queries/day | +| :------------------- | :---------------- | :---------------- | +| Vertex AI Embeddings | ~$0.00 | ~$0.08/month | +| Firestore Reads | $0.00 (free tier) | $0.00 (free tier) | +| Cloud Run Compute | $0.00 (free tier) | $0.00 (free tier) | +| **Total** | **~$0.00** | **~$0.08/month** | -- Start the MCP server locally. -- Connect it to Claude Desktop or use an MCP inspector tool. -- Verify that `search_bills` and `search_testimony` return relevant results for various queries. 
+One-time backfill cost (dev + prod, ~27k docs): ~$3. diff --git a/mcp-server/index-http.ts b/mcp-server/index-http.ts index e63c3478a..a69c9b200 100644 --- a/mcp-server/index-http.ts +++ b/mcp-server/index-http.ts @@ -33,6 +33,7 @@ import { Request, Response, NextFunction } from "express" import dotenv from "dotenv" import { registerTools } from "./tools" import { hybridAuthMiddleware } from "./auth" +import { rateLimitMiddleware } from "./rateLimit" dotenv.config() @@ -44,7 +45,8 @@ if (!admin.apps.length) { // ── Config ──────────────────────────────────────────────────────────────────── const PORT = parseInt(process.env.PORT ?? "3001", 10) const HOST = process.env.HOST ?? "127.0.0.1" -const DISABLE_AUTH = process.env.DISABLE_AUTH === "true" +const DISABLE_AUTH = + process.env.NODE_ENV !== "production" && process.env.DISABLE_AUTH === "true" // ── MCP server factory ──────────────────────────────────────────────────────── // We create a fresh McpServer per request (stateless mode). @@ -74,31 +76,36 @@ app.get("/health", (_req, res) => { res.json({ status: "ok", server: "maple-mcp-server", version: "0.1.0" }) }) -// MCP endpoint — auth required, new transport per request (stateless) -app.post("/mcp", authMiddleware, async (req: Request, res: Response) => { - const transport = new StreamableHTTPServerTransport({ - sessionIdGenerator: undefined // stateless: no session tracking - }) +// MCP endpoint — auth then rate limit, new transport per request (stateless) +app.post( + "/mcp", + authMiddleware, + rateLimitMiddleware, + async (req: Request, res: Response) => { + const transport = new StreamableHTTPServerTransport({ + sessionIdGenerator: undefined // stateless: no session tracking + }) - const server = createMcpServer() + const server = createMcpServer() - // Clean up when the response finishes - res.on("finish", () => { - transport - .close() - .catch(err => console.error("Error closing transport:", err)) - }) + // Clean up when the response finishes + 
res.on("finish", () => { + transport + .close() + .catch(err => console.error("Error closing transport:", err)) + }) - try { - await server.connect(transport) - await transport.handleRequest(req, res, req.body) - } catch (err) { - console.error("Error handling MCP request:", err) - if (!res.headersSent) { - res.status(500).json({ error: "Internal Server Error" }) + try { + await server.connect(transport) + await transport.handleRequest(req, res, req.body) + } catch (err) { + console.error("Error handling MCP request:", err) + if (!res.headersSent) { + res.status(500).json({ error: "Internal Server Error" }) + } } } -}) +) // Reject GET /mcp — we're stateless, no persistent SSE stream app.get("/mcp", (_req, res) => { diff --git a/mcp-server/rateLimit.ts b/mcp-server/rateLimit.ts new file mode 100644 index 000000000..58b48dc51 --- /dev/null +++ b/mcp-server/rateLimit.ts @@ -0,0 +1,85 @@ +import { Request, Response, NextFunction } from "express" + +const WINDOW_MS = 60_000 // 1 minute +const MAX_PER_MINUTE = 60 +const MAX_PER_DAY = 1_000 + +interface TokenBucket { + minuteCount: number + minuteWindowStart: number + dayCount: number + dayStart: number +} + +const buckets = new Map() + +function getBucket(token: string): TokenBucket { + const now = Date.now() + let bucket = buckets.get(token) + + if (!bucket) { + bucket = { + minuteCount: 0, + minuteWindowStart: now, + dayCount: 0, + dayStart: now + } + buckets.set(token, bucket) + return bucket + } + + // Reset minute window if expired + if (now - bucket.minuteWindowStart >= WINDOW_MS) { + bucket.minuteCount = 0 + bucket.minuteWindowStart = now + } + + // Reset daily count if a new UTC day has started + const bucketDay = new Date(bucket.dayStart).toISOString().slice(0, 10) + const today = new Date(now).toISOString().slice(0, 10) + if (bucketDay !== today) { + bucket.dayCount = 0 + bucket.dayStart = now + } + + return bucket +} + +/** Extract the bearer token or agent key that was validated by auth middleware. 
*/ +function getTokenKey(req: Request): string { + // Use the agent key doc ID if present, else the Firebase UID + const agent = (req as any).agent + const user = (req as any).user + if (agent?.id) return `agent:${agent.id}` + if (user?.uid) return `user:${user.uid}` + // Fallback: raw Authorization header value (already validated upstream) + const auth = req.headers.authorization ?? "" + return `raw:${auth.slice(0, 64)}` +} + +export function rateLimitMiddleware( + req: Request, + res: Response, + next: NextFunction +) { + const key = getTokenKey(req) + const bucket = getBucket(key) + + if (bucket.dayCount >= MAX_PER_DAY) { + return res.status(429).json({ + error: "Daily request limit reached", + detail: `Maximum ${MAX_PER_DAY} requests per day per token.` + }) + } + + if (bucket.minuteCount >= MAX_PER_MINUTE) { + return res.status(429).json({ + error: "Rate limit exceeded", + detail: `Maximum ${MAX_PER_MINUTE} requests per minute per token.` + }) + } + + bucket.minuteCount++ + bucket.dayCount++ + next() +} diff --git a/package.json b/package.json index 3b856ec88..9a0329989 100644 --- a/package.json +++ b/package.json @@ -76,6 +76,8 @@ "@fortawesome/fontawesome-svg-core": "^6.5.1", "@fortawesome/free-solid-svg-icons": "^6.5.1", "@fortawesome/react-fontawesome": "^0.2.0", + "@google-cloud/aiplatform": "^3.9.0", + "@modelcontextprotocol/sdk": "^1.0.3", "@mui/icons-material": "^7.3.7", "@mui/material": "^7.3.7", "@popperjs/core": "^2.11.8", @@ -94,8 +96,6 @@ "express": "^4.18.2", "firebase": "9.6.10", "firebase-admin": "^12.0.0", - "@google-cloud/aiplatform": "^3.9.0", - "@modelcontextprotocol/sdk": "^1.0.3", "fuse.js": "6.5.3", "handlebars": "^4.7.8", "i18next": "^23.10.0", diff --git a/pages/dev/token.tsx b/pages/dev/token.tsx index d3de18d8f..4b5186d80 100644 --- a/pages/dev/token.tsx +++ b/pages/dev/token.tsx @@ -81,9 +81,9 @@ function TokenPage() { "mcpServers": { "maple": { "type": "http", - "url": "https://maple-mcp.run.app/mcp", + "url": 
"https://mapletestimony.org/api/mcp", "headers": { - "Authorization": "Bearer " + "X-Maple-Token": "Bearer " } } } diff --git a/pages/learn/ai-tools.tsx b/pages/learn/ai-tools.tsx new file mode 100644 index 000000000..da5b50b00 --- /dev/null +++ b/pages/learn/ai-tools.tsx @@ -0,0 +1,22 @@ +import { Container } from "../../components/bootstrap" +import { createPage } from "../../components/page" +import { createGetStaticTranslationProps } from "components/translations" +import AiTools from "components/learn/AiTools/AiTools" + +export default createPage({ + titleI18nKey: "titles.ai_tools", + Page: () => { + return ( + + + + ) + } +}) + +export const getStaticProps = createGetStaticTranslationProps([ + "auth", + "common", + "footer", + "aiTools" +]) diff --git a/public/locales/en/aiTools.json b/public/locales/en/aiTools.json new file mode 100644 index 000000000..d6d330364 --- /dev/null +++ b/public/locales/en/aiTools.json @@ -0,0 +1,82 @@ +{ + "title": "AI Research Tools", + "description": "Search MAPLE's full database of bills, testimony, and ballot questions by having a conversation with an AI assistant.", + "section1": { + "title": "What is this feature?", + "desc1": "MAPLE now lets you connect AI chat tools—like Claude or ChatGPT—directly to its database. This means you can ask your AI assistant questions about Massachusetts legislation in plain language, and it will search MAPLE's data on bills, ballot questions, and testimony to answer questions or help with research.", + "desc2Pre": "This works through a standard called", + "desc2LinkText": "MCP (Model Context Protocol)", + "desc2Post": ", which allows AI tools to securely retrieve live data from services like MAPLE. Think of it as giving your AI assistant a direct line to MAPLE's research library." 
+ }, + "section2": { + "title": "What can you ask?", + "intro": "Once connected, you can ask your AI assistant to:", + "item1Bold": "Find bills by topic", + "item1Main": "— search the full text and summaries of all bills across legislative sessions", + "item2Bold": "Map a policy area", + "item2Main": "— get a comprehensive view of all bills under a topic, how they relate, and which have the most public support", + "item3Bold": "Track policy evolution", + "item3Main": "— compare how proposals on a topic have changed across General Court sessions, and identify which ideas have gained or lost traction over time", + "item4Bold": "Synthesize public testimony", + "item4Main": "— surface the main arguments, common themes, and organizational voices in testimony on a bill or policy area", + "item5Bold": "Read public testimony", + "item5Main": "— see what advocates, organizations, and constituents have said about specific bills", + "item6Bold": "Filter and compare", + "item6Main": "— narrow results by committee, primary sponsor, legislation type, or session", + "item7Bold": "Search ballot questions", + "item7Main": "— find and understand statewide ballot measures alongside legislative bills" + }, + "section3": { + "title": "Example questions to try", + "intro": "Here are some examples of how advocates have used this feature:", + "example1Label": "Research by topic", + "example1Text": "Find bills about housing affordability that are currently in committee. Summarize what each one proposes.", + "example2Label": "Understand a specific bill", + "example2Text": "What does H.1234 propose, and what has public testimony said about it? Are there more people who support or oppose it?", + "example3Label": "Explore a policy area", + "example3Text": "What bills related to clean water or sewage have been filed in Massachusetts in the last two sessions? 
Which ones got the most public engagement?", + "example4Label": "Prepare to testify", + "example4Text": "I want to testify in favor of expanding paid family leave. What are the relevant bills this session, and what arguments have other supporters already made?" + }, + "section4": { + "title": "How to get started", + "intro": "Follow these steps to connect an AI assistant to MAPLE:", + "step1Bold": "Create a free MAPLE account", + "step1Pre": "at", + "step1LinkText": "mapletestimony.org", + "step1Post": ". A MAPLE account is required to authenticate your AI assistant's access to the database.", + "step2Bold": "Choose an AI tool that supports MCP.", + "step2Intro": "Several options work with MAPLE:", + "step2item1Bold": "Claude Desktop", + "step2item1Tag": "(Anthropic)", + "step2item1Pre": "— the easiest option for most users.", + "step2item1Link1": "Download for Mac or Windows", + "step2item1Mid": ", then follow the", + "step2item1Link2": "MCP setup guide", + "step2item1Post": ".", + "step2item2Bold": "ChatGPT", + "step2item2Tag": "(OpenAI)", + "step2item2Pre": "— available for Pro, Team, and Enterprise subscribers via", + "step2item2LinkText": "Apps & Connectors", + "step2item2Post": "with Developer Mode enabled in settings.", + "step2item3Bold": "Other tools", + "step2item3Main": "— MCP is an open standard and support is growing rapidly across AI products. Check your AI tool's documentation for current MCP connection instructions.", + "step3Bold": "Get your MAPLE access token.", + "step3Pre": "After logging in, visit", + "step3LinkText": "your token page", + "step3Post": "and click “Get Token” to generate a short-lived access token. Copy it — you'll paste it into your AI tool in the next step.", + "step4Bold": "Add MAPLE as a connected source in your AI tool.", + "step4Pre": "Most tools use a configuration file or settings panel. 
Add the following connection details, replacing", + "step4Post": "with the token from step 3:", + "step5Bold": "Start a new conversation", + "step5Post": "and try one of the example questions above. Your AI assistant will automatically search MAPLE when you ask about Massachusetts legislation.", + "needHelp": "Need help?", + "needHelpPost": "Email us at" + }, + "section5": { + "title": "Privacy & data use", + "desc1": "When you use this feature, your AI assistant retrieves only publicly available MAPLE data—the same bills and testimony anyone can read on this site. Your MAPLE token identifies you so that your access is logged, but no private account information is shared with the AI.", + "desc2Pre": "Your conversations are governed by your AI provider's privacy policy, not MAPLE's. We recommend reviewing it before discussing sensitive advocacy work.", + "desc2LinkText": "Learn more about how MAPLE uses AI →" + } +} diff --git a/public/locales/en/common.json b/public/locales/en/common.json index d1dbd8e47..ea6789c8f 100644 --- a/public/locales/en/common.json +++ b/public/locales/en/common.json @@ -136,6 +136,7 @@ "signOut": "Sign Out", "supportMaple": "Support MAPLE", "aboutTestimony": "About Testimony", + "aiTools": "AI Research Tools", "testimony": "Testimony", "viewProfile": "View Profile", "whyUseMaple": "Why Use MAPLE", @@ -197,7 +198,8 @@ "testimony": "Testimony", "policies": "Policies", "unsubscribe": "Unsubscribe", - "why_use_maple": "Why Use Maple?" + "why_use_maple": "Why Use Maple?", + "ai_tools": "AI Research Tools" }, "user_updates": "User Updates", "view": "View", diff --git a/public/locales/en/mapleAI.json b/public/locales/en/mapleAI.json index 56347e44e..96f6bf398 100644 --- a/public/locales/en/mapleAI.json +++ b/public/locales/en/mapleAI.json @@ -59,6 +59,11 @@ "desc3Main": "Summaries, tags, and hearing transcripts are stored in our backend, timestamped, and linked to specific bills, hearings, and legislative sessions. 
We may regenerate them periodically as modeling capabilities evolve." } }, + "aiResearch": { + "title": "Use AI to research MAPLE data", + "desc": "You can connect AI chat tools like Claude directly to MAPLE's database of bills, testimony, and ballot questions, ask questions in plain English, and get real-time answers drawn from the full legislative record.", + "linkText": "Learn how to set up AI research tools with your MAPLE account →" + }, "section5": { "title": "Future Deployments of AI", "desc1": "We are continually exploring new ways to leverage AI to improve your experience on MAPLE. Future enhancements may include more personalized recommendations, deeper insights into legislative trends, and interactive tools to engage with legislative data more effectively. Here are some projects we are considering:", diff --git a/scripts/firebase-admin/backfill-embeddings-parallel.ts b/scripts/firebase-admin/backfill-embeddings-parallel.ts new file mode 100644 index 000000000..ffef071f4 --- /dev/null +++ b/scripts/firebase-admin/backfill-embeddings-parallel.ts @@ -0,0 +1,208 @@ +import { Script } from "./types" +import { FieldValue } from "functions/src/firebase" +import { PredictionServiceClient, helpers } from "@google-cloud/aiplatform" + +const location = "us-central1" +const publisher = "google" +const model = "text-embedding-005" +const CONCURRENCY = 8 +const MAX_RETRIES = 6 +const BACKOFF_BASE_MS = 1000 + +async function getEmbeddingWithRetry( + client: PredictionServiceClient, + endpoint: string, + text: string, + title?: string +): Promise { + const formattedText = `title: ${title || "none"} | text: ${text}` + const instance = helpers.toValue({ content: formattedText })! + const parameters = helpers.toValue({ outputDimensionality: 768 })! 
+ + for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { + try { + const responseArray = (await client.predict({ + endpoint, + instances: [instance], + parameters + })) as any + const response = responseArray[0] + + if (!response.predictions || response.predictions.length === 0) { + throw new Error("No predictions returned from Vertex AI") + } + + const prediction = helpers.fromValue( + response.predictions[0] as any + ) as any + const embedding = + prediction.embeddings?.values || prediction.embedding?.values + + if (!embedding) { + throw new Error( + `Unexpected prediction format: ${JSON.stringify(prediction)}` + ) + } + + return embedding + } catch (e: any) { + const isQuotaError = + e?.code === 8 || + e?.details?.includes("RESOURCE_EXHAUSTED") || + e?.message?.includes("RESOURCE_EXHAUSTED") + if (isQuotaError && attempt < MAX_RETRIES) { + const delay = + BACKOFF_BASE_MS * Math.pow(2, attempt) + Math.random() * 500 + await new Promise(r => setTimeout(r, delay)) + continue + } + throw e + } + } + throw new Error("Max retries exceeded") +} + +/** Run tasks with a fixed concurrency limit. 
*/ +async function runWithConcurrency( + tasks: (() => Promise)[], + concurrency: number +): Promise { + const queue = [...tasks] + const workers = Array.from({ length: concurrency }, async () => { + while (queue.length > 0) { + const task = queue.shift() + if (task) await task() + } + }) + await Promise.all(workers) +} + +export const script: Script = async ({ db, args }) => { + const project = process.env.GCLOUD_PROJECT + const endpoint = `projects/${project}/locations/${location}/publishers/${publisher}/models/${model}` + const client = new PredictionServiceClient({ + apiEndpoint: `${location}-aiplatform.googleapis.com` + }) + + console.log(`Starting parallel backfill for project: ${project}`) + console.log(`Concurrency: ${CONCURRENCY}`) + + const collections = [ + { + name: "bills", + group: true, + textFields: ["content.Title", "content.DocumentText"] + }, + { name: "publishedTestimony", group: true, textFields: ["content"] }, + { + name: "ballotQuestions", + group: false, + textFields: ["title", "description", "fullSummary"] + } + ] + + const limitVal = (args as any).limit ?? (args as any).l ?? undefined + const limit = + typeof limitVal === "number" + ? limitVal + : typeof limitVal === "string" + ? parseInt(limitVal, 10) + : undefined + + for (const col of collections) { + console.log(`\nProcessing collection: ${col.name}`) + let query: any = col.group + ? 
db.collectionGroup(col.name) + : db.collection(col.name) + + if (limit) { + console.log(`Limiting to ${limit} documents`) + query = query.limit(limit) + } + + const snapshot = await query.get() + console.log(`Found ${snapshot.size} documents`) + + // Filter to only docs that need embeddings + const docsToProcess = snapshot.docs.filter((doc: any) => { + const existing = doc.data().vector_embedding + return !(existing && typeof (existing as any).toArray === "function") + }) + + console.log( + `${docsToProcess.length} need embeddings, ${ + snapshot.size - docsToProcess.length + } already indexed` + ) + + const bulkWriter = db.bulkWriter() + let done = 0 + let failed = 0 + const startTime = Date.now() + + const tasks = docsToProcess.map((doc: any) => async () => { + const data = doc.data() + + const textToEmbed = col.textFields + .map((field: string) => { + const parts = field.split(".") + let val: any = data + for (const part of parts) val = val?.[part] + return val + }) + .filter(Boolean) + .join("\n\n") + + if (!textToEmbed) { + done++ + return + } + + let title = "none" + if (col.name === "bills") title = data.content?.Title || "none" + else if (col.name === "ballotQuestions") title = data.title || "none" + else if (col.name === "publishedTestimony") + title = data.billTitle || "none" + + try { + const embedding = await getEmbeddingWithRetry( + client, + endpoint, + textToEmbed, + title + ) + bulkWriter.update(doc.ref, { + vector_embedding: (FieldValue as any).vector(embedding) + }) + done++ + + if (done % 100 === 0 || done === docsToProcess.length) { + const elapsed = ((Date.now() - startTime) / 1000).toFixed(0) + const rate = (done / ((Date.now() - startTime) / 1000)).toFixed(1) + const eta = Math.round( + (docsToProcess.length - done) / parseFloat(rate) + ) + console.log( + ` ${done}/${docsToProcess.length} done | ${failed} failed | ${rate} docs/s | ETA ${eta}s | elapsed ${elapsed}s` + ) + } + } catch (e) { + console.error(` Failed ${doc.id}:`, e) + failed++ + 
done++ + } + }) + + await runWithConcurrency(tasks, CONCURRENCY) + await bulkWriter.close() + + const elapsed = ((Date.now() - startTime) / 1000).toFixed(1) + console.log( + `Done with ${col.name}: ${ + done - failed + } updated, ${failed} failed, in ${elapsed}s` + ) + } + + console.log("\nBackfill complete!") +}