Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,10 @@ jobs:

- name: Run tracer validation (same-file edge recall)
timeout-minutes: 10
# Reuse the tracer subprocess output captured by the resolution benchmark
# step above (#1166) instead of re-running run-tracer.mjs per fixture.
env:
RESOLUTION_RESULT_JSON: ${{ github.workspace }}/resolution-result.json
run: npx vitest run tests/benchmarks/resolution/tracer/tracer-validation.test.ts --reporter=verbose

- name: Merge resolution into build result
Expand Down
41 changes: 35 additions & 6 deletions scripts/resolution-benchmark.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,20 @@ interface DynamicEdge {
target_file: string;
}

/**
 * Per-language tracer artifact consumed by
 * tests/benchmarks/resolution/tracer/tracer-validation.test.ts to avoid
 * re-running tracer subprocesses. See #1166.
 *
 * Produced by `runDynamicTracer` and attached to each language's result as
 * `LangResult.tracer`.
 *
 * status:
 * - 'ok'      — the tracer subprocess produced parseable output. `edges` may be
 *               an empty array (e.g. the language has no same-file calls), and
 *               this status is also used when the tracer reported an error but
 *               still returned at least one edge. Consumers should treat
 *               `edges` as authoritative.
 * - 'skipped' — toolchain not available or tracer crashed. `edges` is always
 *               empty in this case. Consumers should skip recall assertions.
 */
interface TracerArtifact {
/** Whether the tracer output is usable ('ok') or should be ignored ('skipped'). */
status: 'ok' | 'skipped';
/** Raw dynamic call edges captured by the tracer; empty when status is 'skipped'. */
edges: DynamicEdge[];
}

interface LangResult {
precision: number;
recall: number;
Expand All @@ -70,6 +84,7 @@ interface LangResult {
falseNegativeEdges: string[];
dynamicEdges?: number;
dynamicConfirmed?: number;
tracer?: TracerArtifact;
}

// ── Helpers ──────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -162,10 +177,17 @@ const TRACER_SCRIPT = path.join(root, 'tests', 'benchmarks', 'resolution', 'trac

/**
* Attempt to run the dynamic call tracer for a language fixture.
* Returns captured edges on success, empty array on failure or unavailability.
*
* Returns a TracerArtifact discriminating between:
* - 'ok' — tracer ran (edges may be empty if the fixture has no captured calls)
* - 'skipped' — tracer script missing, tracer reported an error with no edges, or the subprocess failed / emitted unparseable output
*
* The status distinction mirrors the semantics in
* tests/benchmarks/resolution/tracer/tracer-validation.test.ts so its `runTracer`
* can reuse this artifact directly (#1166).
*/
function runDynamicTracer(lang: string): DynamicEdge[] {
if (!fs.existsSync(TRACER_SCRIPT)) return [];
function runDynamicTracer(lang: string): TracerArtifact {
if (!fs.existsSync(TRACER_SCRIPT)) return { status: 'skipped', edges: [] };

const fixtureDir = path.join(FIXTURES_DIR, lang);
try {
Expand All @@ -176,12 +198,15 @@ function runDynamicTracer(lang: string): DynamicEdge[] {
stdio: ['pipe', 'pipe', 'pipe'],
});
const parsed = JSON.parse(result);
const edges = Array.isArray(parsed.edges) ? parsed.edges : [];
if (parsed.error) {
console.error(` Dynamic tracer for ${lang}: ${parsed.error}`);
// Treat "error reported and no edges" as toolchain-missing skip
if (edges.length === 0) return { status: 'skipped', edges: [] };
}
return Array.isArray(parsed.edges) ? parsed.edges : [];
return { status: 'ok', edges };
} catch {
return [];
return { status: 'skipped', edges: [] };
}
}

Expand Down Expand Up @@ -285,7 +310,8 @@ try {
const expectedEdges: ExpectedEdge[] = manifest.edges;

// Run dynamic tracer if available
const dynamicEdges = runDynamicTracer(lang);
const tracerArtifact = runDynamicTracer(lang);
const dynamicEdges = tracerArtifact.edges;
const { dynamicConfirmed } = mergeWithDynamic(expectedEdges, dynamicEdges);

// Use only expected edges for metrics (dynamic edges are supplemental)
Expand All @@ -294,6 +320,9 @@ try {
metrics.dynamicEdges = dynamicEdges.length;
metrics.dynamicConfirmed = dynamicConfirmed;
}
// Emit raw tracer artifact so the tracer-validation gate test can reuse it
// without spawning a second subprocess per fixture (#1166).
metrics.tracer = tracerArtifact;
results[lang] = metrics;

const dynamicInfo =
Expand Down
44 changes: 44 additions & 0 deletions tests/benchmarks/resolution/tracer/tracer-validation.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,40 @@ interface ExpectedEdge {
const FIXTURES_DIR = path.join(import.meta.dirname, '..', 'fixtures');
const RUN_TRACER = path.join(import.meta.dirname, 'run-tracer.mjs');

/**
 * When set, points to a resolution-result.json artifact produced by
 * scripts/resolution-benchmark.ts. The benchmark script already runs each
 * language's tracer subprocess and embeds the raw edges + status under
 * `<lang>.tracer`. Reading that artifact lets the gate test skip a second
 * subprocess per fixture in CI (~doubling the tracer cost otherwise — see #1166).
 *
 * When unset, the test falls back to running run-tracer.mjs directly so devs
 * can still execute `npx vitest run tests/benchmarks/resolution/tracer/...`
 * standalone.
 */
const RESOLUTION_RESULT_JSON = process.env.RESOLUTION_RESULT_JSON;

/** Shape of the `<lang>.tracer` entry embedded in the resolution artifact. */
interface ArtifactTracerEntry {
  /** 'ok' means `edges` is authoritative; 'skipped' means recall checks should be skipped. */
  status: 'ok' | 'skipped';
  /** Raw tracer edges captured for the language. */
  edges: TracerEdge[];
}

/**
 * Runtime check that a parsed JSON value has the `ArtifactTracerEntry` shape:
 * a known `status` and an array of `edges` (edge contents are not inspected).
 */
function isArtifactTracerEntry(value: unknown): value is ArtifactTracerEntry {
  if (typeof value !== 'object' || value === null) return false;
  const candidate = value as { status?: unknown; edges?: unknown };
  return (
    (candidate.status === 'ok' || candidate.status === 'skipped') &&
    Array.isArray(candidate.edges)
  );
}

/**
 * Load and validate the resolution artifact at `artifactPath`.
 *
 * @param artifactPath - Path to resolution-result.json; unset/empty means
 *   "artifact mode disabled".
 * @returns `null` when `artifactPath` is unset or empty, otherwise the parsed
 *   artifact keyed by language.
 * @throws Error when the file is missing, cannot be read/parsed as JSON, is not
 *   a JSON object, or contains a `tracer` entry without a valid `status` /
 *   `edges` array. Failing loudly keeps a malformed artifact from silently
 *   turning every language into a skip (or from handing `undefined` edges to
 *   the recall assertions).
 */
function loadArtifactResults(
  artifactPath: string | undefined,
): Record<string, { tracer?: ArtifactTracerEntry }> | null {
  if (!artifactPath) return null;
  if (!fs.existsSync(artifactPath)) {
    throw new Error(
      `RESOLUTION_RESULT_JSON=${artifactPath} does not exist — produce it with scripts/resolution-benchmark.ts first.`,
    );
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(artifactPath, 'utf-8'));
  } catch (err) {
    throw new Error(
      `RESOLUTION_RESULT_JSON=${artifactPath} is not valid JSON — regenerate it with scripts/resolution-benchmark.ts. (${err})`,
    );
  }

  // Valid JSON is not enough: `null`, arrays and primitives would make every
  // per-language lookup come back empty without any error.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error(
      `RESOLUTION_RESULT_JSON=${artifactPath} does not contain a JSON object keyed by language — regenerate it with scripts/resolution-benchmark.ts.`,
    );
  }

  // Only `tracer` entries are validated; other per-language fields (and any
  // non-object top-level values) are left alone because this test never reads them.
  for (const [lang, entry] of Object.entries(parsed)) {
    if (typeof entry !== 'object' || entry === null) continue;
    const tracer = (entry as { tracer?: unknown }).tracer;
    if (tracer != null && !isArtifactTracerEntry(tracer)) {
      throw new Error(
        `RESOLUTION_RESULT_JSON=${artifactPath} has a malformed tracer entry for "${lang}" (expected { status: 'ok' | 'skipped', edges: [...] }) — regenerate it with scripts/resolution-benchmark.ts.`,
      );
    }
  }

  return parsed as Record<string, { tracer?: ArtifactTracerEntry }>;
}

/** Parsed artifact when RESOLUTION_RESULT_JSON is set; `null` in standalone mode. */
const artifactResults: Record<string, { tracer?: ArtifactTracerEntry }> | null =
  loadArtifactResults(RESOLUTION_RESULT_JSON);

/**
* Per-language same-file recall thresholds.
*
Expand Down Expand Up @@ -113,6 +147,16 @@ function basename(filePath: string): string {
}

function runTracer(lang: string): TracerEdge[] | null {
// Artifact mode: reuse the tracer output already produced by
// scripts/resolution-benchmark.ts during the publish workflow's resolution
// benchmark step (#1166). The script writes status='skipped' for missing
// toolchains, mirroring this function's null-return semantics.
if (artifactResults) {
const entry = artifactResults[lang]?.tracer;
if (!entry || entry.status === 'skipped') return null;
return entry.edges;
}

const fixtureDir = path.join(FIXTURES_DIR, lang);
if (!fs.existsSync(fixtureDir)) return null;

Expand Down
Loading