Skip to content

Commit 14776b3

Browse files
committed
feat: implement scraping pipeline with RAG context for chat
1 parent 1eec387 commit 14776b3

38 files changed

Lines changed: 2619 additions & 11 deletions

apps/server/src/index.ts

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import { join } from "node:path";
2+
import { fileURLToPath } from "node:url";
23
import { db } from "@ifinho/db";
34
import { env } from "@ifinho/env/server";
45
import type { ChatRequest } from "@ifinho/types";
56
import cors from "cors";
7+
import { sql } from "drizzle-orm";
68
import { migrate } from "drizzle-orm/node-postgres/migrator";
79
import express from "express";
810
import { Ollama } from "ollama";
@@ -66,12 +68,59 @@ Estou em desenvolvimento! Em breve estarei totalmente integrado com todos os doc
6668
};
6769

6870
const SYSTEM_PROMPT = `Você é o Ifinho, assistente virtual do IFRS Campus Canoas.
69-
Responda sempre em português, de forma clara e objetiva, usando Markdown para formatar suas respostas (títulos, listas, negrito quando fizer sentido). Use emojis com frequência para deixar as respostas mais visuais e amigáveis — em títulos, itens de lista, destaques e no início de seções.`;
71+
Responda sempre em português, de forma clara e objetiva, usando Markdown para formatar suas respostas (títulos, listas, negrito quando fizer sentido).
72+
Use emojis com frequência para deixar as respostas mais visuais e amigáveis — em títulos, itens de lista, destaques e no início de seções. Cada item de lista deve ter um emoji relevante.
73+
Ao citar fontes, mantenha os links Markdown exatamente como aparecem no contexto, no formato [texto](url).`;
7074

7175
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
7276

7377
const ollama = new Ollama({ host: env.OLLAMA_BASE_URL });
7478

79+
/**
 * Row shape returned by the pgvector similarity query in retrieveContext.
 * Extends Record<string, unknown> to satisfy db.execute's row constraint.
 */
interface ChunkRow extends Record<string, unknown> {
  // Chunk text content stored in the `chunks` table.
  content: string;
  // Source title; null when the source row has no title.
  title: string | null;
  // Source URL; null when unavailable.
  url: string | null;
  // Cosine similarity (1 - cosine distance) between chunk and query embedding.
  similarity: number;
}
85+
86+
async function retrieveContext(question: string): Promise<string> {
87+
const { embeddings } = await ollama.embed({
88+
model: env.OLLAMA_EMBED_MODEL,
89+
input: question,
90+
});
91+
92+
const embedding = embeddings[0];
93+
if (!embedding) return "";
94+
95+
const vectorStr = `[${embedding.join(",")}]`;
96+
97+
const CHUNK_MAX_CHARS = 500;
98+
99+
const result = await db.execute<ChunkRow>(sql`
100+
SELECT c.content, s.title, s.url,
101+
1 - (c.embedding <=> CAST(${vectorStr} AS vector)) AS similarity
102+
FROM chunks c
103+
JOIN documents d ON d.id = c.document_id
104+
JOIN sources s ON s.id = d.source_id
105+
WHERE c.is_active = true
106+
AND c.embedding IS NOT NULL
107+
AND s.status = 'indexed'
108+
ORDER BY c.embedding <=> CAST(${vectorStr} AS vector)
109+
LIMIT 3
110+
`);
111+
if (result.rows.length === 0) return "";
112+
113+
return result.rows
114+
.map((r) => {
115+
const title = r.title ?? "Sem título";
116+
const url = r.url ?? "";
117+
const source = url ? `[${title}](${url})` : title;
118+
const content = r.content.slice(0, CHUNK_MAX_CHARS);
119+
return `**${title}**\nFonte: ${source}\n\n${content}`;
120+
})
121+
.join("\n\n---\n\n");
122+
}
123+
75124
app.post("/api/chat", async (req, res) => {
76125
const { message } = req.body as ChatRequest;
77126

@@ -99,11 +148,17 @@ app.post("/api/chat", async (req, res) => {
99148
}
100149

101150
try {
151+
const context = await retrieveContext(trimmed);
152+
153+
const systemPrompt = context
154+
? `${SYSTEM_PROMPT}\n\n## Informações relevantes encontradas na base de dados do IFRS Canoas:\n\n${context}\n\nUse as informações acima para responder. Se a resposta não estiver no contexto, diga que não encontrou informação sobre isso nos documentos disponíveis.`
155+
: SYSTEM_PROMPT;
156+
102157
const stream = await ollama.chat({
103158
model: env.OLLAMA_MODEL,
104159
stream: true,
105160
messages: [
106-
{ role: "system", content: SYSTEM_PROMPT },
161+
{ role: "system", content: systemPrompt },
107162
{ role: "user", content: trimmed },
108163
],
109164
});
@@ -118,7 +173,7 @@ app.post("/api/chat", async (req, res) => {
118173
res.write("data: [DONE]\n\n");
119174
res.end();
120175
} catch (error) {
121-
console.error("Ollama error:", error);
176+
console.error("Chat error:", error);
122177
const message =
123178
error instanceof Error ? error.message : "Erro desconhecido";
124179
res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
@@ -128,7 +183,11 @@ app.post("/api/chat", async (req, res) => {
128183

129184
console.log("Running database migrations...");
130185
await migrate(db, {
131-
migrationsFolder: join(process.cwd(), "packages/db/src/migrations"),
186+
migrationsFolder: join(
187+
fileURLToPath(
188+
new URL("../../../packages/db/src/migrations", import.meta.url),
189+
),
190+
),
132191
});
133192
console.log("Migrations applied.");
134193

apps/worker/Dockerfile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Worker container: installs workspace dependencies and runs the TypeScript
# entrypoint directly through tsx (no compile step).
FROM node:22-alpine AS base
WORKDIR /app

# Dependency layer: copy only the package manifests so the npm install
# layer stays cached until one of them changes.
FROM base AS deps
COPY package*.json ./
COPY packages/config/package.json packages/config/
COPY packages/db/package.json packages/db/
COPY packages/env/package.json packages/env/
COPY packages/queue/package.json packages/queue/
COPY packages/scraper/package.json packages/scraper/
COPY apps/worker/package.json apps/worker/
RUN npm install

# Runtime layer: reuse installed node_modules and add the full source tree.
FROM base AS runner
COPY --from=deps /app/node_modules ./node_modules
COPY . .
# NOTE(review): relies on tsx resolving from node_modules at runtime — confirm
# it is declared as a dependency in the workspace.
CMD ["node", "--import=tsx/esm", "apps/worker/src/index.ts"]

apps/worker/package.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"name": "worker",
3+
"type": "module",
4+
"main": "src/index.ts",
5+
"scripts": {
6+
"dev": "tsx watch src/index.ts",
7+
"build": "tsdown",
8+
"check-types": "tsc -b"
9+
},
10+
"dependencies": {
11+
"@ifinho/db": "*",
12+
"@ifinho/env": "*",
13+
"@ifinho/queue": "*",
14+
"@ifinho/scraper": "*",
15+
"dotenv": "^17.2.2"
16+
},
17+
"devDependencies": {
18+
"@ifinho/config": "*",
19+
"@types/node": "^22.13.14",
20+
"tsdown": "^0.16.5",
21+
"tsx": "^4.19.2",
22+
"typescript": "^5"
23+
}
24+
}

apps/worker/src/index.ts

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import { db } from "@ifinho/db";
import { scrapeConfigs } from "@ifinho/db/schema";
import { env } from "@ifinho/env/worker";
import { getPgBoss, type ScrapeJobData } from "@ifinho/queue";
import {
  EmbedStep,
  Fetcher,
  HashCheckStep,
  NewsScraper,
  PersistStep,
  SanitizeStep,
  ScraperRunner,
} from "@ifinho/scraper";
import { eq } from "drizzle-orm";
import { enqueueDueJobs } from "./scheduler.js";

// Worker entrypoint: connects to pg-boss, registers the scraping pipeline,
// and processes "scrape" jobs plus a periodic due-config check.
const boss = await getPgBoss();

console.log("[Worker] pg-boss started");

// Queues must exist before work()/schedule() are attached to them.
await boss.createQueue("scrape");
await boss.createQueue("check-due-scrapers");

// pluginId -> scraper implementation; 1500ms delay between fetches.
const plugins = new Map([
  ["news", new NewsScraper(new Fetcher({ delayMs: 1500 }))],
]);

// Pipeline steps, run in order: sanitize, skip-unchanged-by-hash,
// persist, then embed through Ollama.
const pipeline = [
  new SanitizeStep(),
  new HashCheckStep(db),
  new PersistStep(db),
  new EmbedStep(db, env.OLLAMA_BASE_URL, env.OLLAMA_EMBED_MODEL),
];

const runner = new ScraperRunner(plugins, pipeline);

// Handler receives a batch but only processes the first entry.
// NOTE(review): assumes pg-boss batch size of 1 — confirm queue config.
await boss.work<ScrapeJobData>("scrape", async (jobs) => {
  const job = jobs[0];
  if (!job) return;

  const { pluginId, configId } = job.data;

  console.log(`[Worker] Starting job: plugin=${pluginId}, config=${configId}`);

  // Re-read the config so a job enqueued before a disable/delete is skipped.
  const [config] = await db
    .select()
    .from(scrapeConfigs)
    .where(eq(scrapeConfigs.id, configId))
    .limit(1);

  if (!config || !config.enabled) {
    console.log(`[Worker] Config ${configId} not found or disabled, skipping`);
    return;
  }

  await runner.run(
    pluginId,
    {
      startUrl: config.baseUrl,
      // options is stored as JSON; cast to the shape plugins expect.
      options:
        (config.options as { maxPages?: number; delayMs?: number }) ?? {},
    },
    {
      pluginId,
      configId,
    },
  );

  console.log(`[Worker] Job completed: plugin=${pluginId}`);
});

// Every 15 minutes, enqueue scrape jobs for configs whose check interval
// has elapsed.
await boss.schedule("check-due-scrapers", "*/15 * * * *");
await boss.work("check-due-scrapers", async () => {
  await enqueueDueJobs(boss);
});

// Run one due-check immediately so a fresh worker does not wait for the
// first cron tick.
await enqueueDueJobs(boss);

console.log("[Worker] Ready — listening for scrape jobs");

apps/worker/src/scheduler.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import { db } from "@ifinho/db";
2+
import { scrapeConfigs } from "@ifinho/db/schema";
3+
import type { ScrapeJobData } from "@ifinho/queue";
4+
import { and, eq, sql } from "drizzle-orm";
5+
import type PgBoss from "pg-boss";
6+
7+
export async function enqueueDueJobs(boss: PgBoss): Promise<void> {
8+
const now = new Date();
9+
10+
const due = await db
11+
.select()
12+
.from(scrapeConfigs)
13+
.where(
14+
and(
15+
eq(scrapeConfigs.enabled, true),
16+
sql`NOT EXISTS (
17+
SELECT 1 FROM sources s
18+
WHERE s.url = ${scrapeConfigs.baseUrl}
19+
AND s.last_checked_at + (${scrapeConfigs.checkIntervalMinutes} * INTERVAL '1 minute') > ${now}
20+
)`,
21+
),
22+
);
23+
24+
for (const config of due) {
25+
const jobData: ScrapeJobData = {
26+
pluginId: config.pluginId,
27+
configId: config.id,
28+
};
29+
30+
await boss.send("scrape", jobData, {
31+
priority: config.priority,
32+
retryLimit: 3,
33+
retryDelay: 60,
34+
singletonKey: config.id,
35+
});
36+
37+
console.log(`[Scheduler] Enqueued scrape job for config: ${config.name}`);
38+
}
39+
}

apps/worker/tsconfig.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"extends": "@ifinho/config/tsconfig.base.json",
3+
"compilerOptions": {
4+
"outDir": "dist"
5+
},
6+
"include": ["src"]
7+
}

docker-compose.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
services:
22
postgres:
33
container_name: ifinho-postgres
4-
image: postgres:18-alpine
4+
image: pgvector/pgvector:pg18
55
restart: unless-stopped
66
environment:
77
POSTGRES_USER: ${POSTGRES_USER}
88
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
99
POSTGRES_DB: ${POSTGRES_DB}
1010
volumes:
11-
- ifinho-postgres-data:/var/lib/postgresql/data
11+
- ifinho-postgres-data:/var/lib/postgresql
1212
ports:
1313
- "5432:5432"
1414
healthcheck:

0 commit comments

Comments
 (0)