豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 67c1f80

Browse files
authored
chore: refactor eval script more (#781)
add args parsing and simplify output (support debug logging + headless).
1 parent 303a138 commit 67c1f80

File tree

2 files changed

+88
-41
lines changed

2 files changed

+88
-41
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
"test:update-snapshots": "npm run build && node scripts/test.mjs --test-update-snapshots",
2323
"prepare": "node --experimental-strip-types scripts/prepare.ts",
2424
"verify-server-json-version": "node --experimental-strip-types scripts/verify-server-json-version.ts",
25-
"eval": "npm run build && node --experimental-strip-types --test scripts/eval_gemini.ts"
25+
"eval": "npm run build && node --experimental-strip-types scripts/eval_gemini.ts"
2626
},
2727
"files": [
2828
"build/src",

scripts/eval_gemini.ts

Lines changed: 87 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
import fs from 'node:fs';
88
import path from 'node:path';
9-
import {describe, test, before, after, afterEach} from 'node:test';
9+
import {parseArgs} from 'node:util';
1010

1111
import {
1212
GoogleGenerativeAI,
@@ -16,9 +16,10 @@ import {
1616
import {Client} from '@modelcontextprotocol/sdk/client/index.js';
1717
import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';
1818

19+
import {TestServer} from '../build/tests/server.js';
20+
1921
const ROOT_DIR = path.resolve(import.meta.dirname, '..');
2022
const SCENARIOS_DIR = path.join(import.meta.dirname, 'eval_scenarios');
21-
import {TestServer} from '../build/tests/server.js';
2223

2324
// Define schema for our test scenarios
2425
export interface CapturedFunctionCall {
@@ -37,8 +38,6 @@ export interface TestScenario {
3738
}
3839

3940
async function loadScenario(scenarioPath: string): Promise<TestScenario> {
40-
// Dynamic import of the test scenario
41-
// We assume the test file exports a 'scenario' object
4241
const module = await import(scenarioPath);
4342
if (!module.scenario) {
4443
throw new Error(
@@ -48,7 +47,6 @@ async function loadScenario(scenarioPath: string): Promise<TestScenario> {
4847
return module.scenario;
4948
}
5049

51-
// Helper to sanitize schema for Gemini
5250
function isRecord(v: unknown): v is Record<string, unknown> {
5351
return typeof v === 'object' && v !== null && !Array.isArray(v);
5452
}
@@ -84,9 +82,18 @@ async function runSingleScenario(
8482
scenarioPath: string,
8583
apiKey: string,
8684
server: TestServer,
85+
modelId: string,
86+
debug: boolean,
8787
): Promise<void> {
88+
const debugLog = (...args: unknown[]) => {
89+
if (debug) {
90+
console.log(...args);
91+
}
92+
};
8893
const absolutePath = path.resolve(scenarioPath);
89-
console.log(`\n### Running Scenario: ${absolutePath} ###`);
94+
debugLog(
95+
`\n### Running Scenario: ${path.relative(ROOT_DIR, absolutePath)} ###`,
96+
);
9097

9198
let client: Client | undefined;
9299
let transport: StdioClientTransport | undefined;
@@ -121,10 +128,16 @@ async function runSingleScenario(
121128
}
122129
});
123130

131+
const args = [serverPath];
132+
if (!debug) {
133+
args.push('--headless');
134+
}
135+
124136
transport = new StdioClientTransport({
125137
command: 'node',
126-
args: [serverPath],
138+
args,
127139
env,
140+
stderr: debug ? 'inherit' : 'ignore',
128141
});
129142

130143
client = new Client(
@@ -165,7 +178,7 @@ async function runSingleScenario(
165178

166179
const genAI = new GoogleGenerativeAI(apiKey);
167180
const model = genAI.getGenerativeModel({
168-
model: 'gemini-2.5-flash',
181+
model: modelId,
169182
tools: [{functionDeclarations}],
170183
});
171184

@@ -181,8 +194,8 @@ async function runSingleScenario(
181194

182195
// Execute turns
183196
let turnCount = 0;
184-
console.log(`\n--- Turn 1 (User) ---`);
185-
console.log(scenario.prompt);
197+
debugLog(`\n--- Turn 1 (User) ---`);
198+
debugLog(scenario.prompt);
186199

187200
let result = await chat.sendMessage(scenario.prompt, {
188201
timeout: 5000,
@@ -191,17 +204,15 @@ async function runSingleScenario(
191204

192205
while (turnCount < scenario.maxTurns) {
193206
turnCount++;
194-
console.log(`\n--- Turn ${turnCount} (Model) ---`);
207+
debugLog(`\n--- Turn ${turnCount} (Model) ---`);
195208
const text = response.text();
196209
if (text) {
197-
console.log(`Text: ${text}`);
210+
debugLog(`Text: ${text}`);
198211
}
199212

200213
const functionCalls = response.functionCalls();
201214
if (functionCalls && functionCalls.length > 0) {
202-
console.log(
203-
`Function Calls: ${JSON.stringify(functionCalls, null, 2)}`,
204-
);
215+
debugLog(`Function Calls: ${JSON.stringify(functionCalls, null, 2)}`);
205216

206217
const functionResponses = [];
207218
for (const call of functionCalls) {
@@ -219,7 +230,7 @@ async function runSingleScenario(
219230

220231
const safeArgs = isRecord(call.args) ? call.args : {};
221232

222-
console.log(
233+
debugLog(
223234
`Executing tool: ${originalName} with args: ${JSON.stringify(call.args)}`,
224235
);
225236

@@ -253,49 +264,85 @@ async function runSingleScenario(
253264
}
254265

255266
// Send tool results back
256-
console.log(`Sending ${functionResponses.length} tool outputs back...`);
267+
debugLog(`Sending ${functionResponses.length} tool outputs back...`);
257268
result = await chat.sendMessage(functionResponses);
258269
response = result.response;
259270
} else {
260-
console.log('No tool calls. Interaction finished.');
271+
debugLog('No tool calls. Interaction finished.');
261272
break;
262273
}
263274
}
264275

265-
console.log('\nVerifying expectations...');
276+
debugLog('\nVerifying expectations...');
266277
expectations(allCalls);
267278
} finally {
268279
await client?.close();
269280
await transport?.close();
270281
}
271282
}
272-
const apiKey = process.env.GEMINI_API_KEY;
273-
if (!apiKey) {
274-
throw new Error('GEMINI_API_KEY environment variable is required.');
275-
}
276283

277-
void describe('Gemini Eval Scenarios', () => {
278-
const server = new TestServer(TestServer.randomPort());
284+
async function main() {
285+
const apiKey = process.env.GEMINI_API_KEY;
286+
if (!apiKey) {
287+
throw new Error('GEMINI_API_KEY environment variable is required.');
288+
}
279289

280-
before(async () => {
281-
await server.start();
290+
const {values, positionals} = parseArgs({
291+
options: {
292+
model: {
293+
type: 'string',
294+
default: 'gemini-2.5-flash',
295+
},
296+
debug: {
297+
type: 'boolean',
298+
default: false,
299+
},
300+
},
301+
allowPositionals: true,
282302
});
283303

284-
after(async () => {
285-
await server.stop();
286-
});
304+
const modelId = values.model;
305+
const debug = values.debug;
306+
const scenarioFiles =
307+
positionals.length > 0
308+
? positionals.map(p => path.resolve(p))
309+
: fs
310+
.readdirSync(SCENARIOS_DIR)
311+
.filter(file => file.endsWith('.ts') || file.endsWith('.js'))
312+
.map(file => path.join(SCENARIOS_DIR, file));
287313

288-
afterEach(() => {
289-
server.restore();
290-
});
314+
const server = new TestServer(TestServer.randomPort());
315+
await server.start();
291316

292-
const files = fs.readdirSync(SCENARIOS_DIR).filter(file => {
293-
return file.endsWith('.ts') || file.endsWith('.js');
294-
});
317+
let successCount = 0;
318+
let failureCount = 0;
295319

296-
for (const file of files) {
297-
void test(file, {timeout: 60_000}, async () => {
298-
await runSingleScenario(path.join(SCENARIOS_DIR, file), apiKey, server);
299-
});
320+
try {
321+
for (const scenarioPath of scenarioFiles) {
322+
try {
323+
await runSingleScenario(scenarioPath, apiKey, server, modelId, debug);
324+
console.log(`✔ ${path.relative(ROOT_DIR, scenarioPath)}`);
325+
successCount++;
326+
} catch (e) {
327+
console.error(`✖ ${path.relative(ROOT_DIR, scenarioPath)}`);
328+
console.error(e);
329+
failureCount++;
330+
} finally {
331+
server.restore();
332+
}
333+
}
334+
} finally {
335+
await server.stop();
300336
}
337+
338+
console.log(`\nSummary: ${successCount} passed, ${failureCount} failed`);
339+
340+
if (failureCount > 0) {
341+
process.exit(1);
342+
}
343+
}
344+
345+
main().catch(error => {
346+
console.error('Fatal error:', error);
347+
process.exit(1);
301348
});

0 commit comments

Comments
 (0)