66
77import fs from 'node:fs' ;
88import path from 'node:path' ;
9- import { describe , test , before , after , afterEach } from 'node:test ' ;
9+ import { parseArgs } from 'node:util ' ;
1010
1111import {
1212 GoogleGenerativeAI ,
@@ -16,9 +16,10 @@ import {
1616import { Client } from '@modelcontextprotocol/sdk/client/index.js' ;
1717import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js' ;
1818
19+ import { TestServer } from '../build/tests/server.js' ;
20+
1921const ROOT_DIR = path . resolve ( import . meta. dirname , '..' ) ;
2022const SCENARIOS_DIR = path . join ( import . meta. dirname , 'eval_scenarios' ) ;
21- import { TestServer } from '../build/tests/server.js' ;
2223
2324// Define schema for our test scenarios
2425export interface CapturedFunctionCall {
@@ -37,8 +38,6 @@ export interface TestScenario {
3738}
3839
3940async function loadScenario ( scenarioPath : string ) : Promise < TestScenario > {
40- // Dynamic import of the test scenario
41- // We assume the test file exports a 'scenario' object
4241 const module = await import ( scenarioPath ) ;
4342 if ( ! module . scenario ) {
4443 throw new Error (
@@ -48,7 +47,6 @@ async function loadScenario(scenarioPath: string): Promise<TestScenario> {
4847 return module . scenario ;
4948}
5049
51- // Helper to sanitize schema for Gemini
/**
 * Type guard for plain, dictionary-like values.
 *
 * @param v - Any value.
 * @returns `true` when `v` is an object that is neither `null` nor an array.
 */
function isRecord(v: unknown): v is Record<string, unknown> {
  if (v === null || Array.isArray(v)) {
    return false;
  }
  return typeof v === 'object';
}
@@ -84,9 +82,18 @@ async function runSingleScenario(
8482 scenarioPath : string ,
8583 apiKey : string ,
8684 server : TestServer ,
85+ modelId : string ,
86+ debug : boolean ,
8787) : Promise < void > {
88+ const debugLog = ( ...args : unknown [ ] ) => {
89+ if ( debug ) {
90+ console . log ( ...args ) ;
91+ }
92+ } ;
8893 const absolutePath = path . resolve ( scenarioPath ) ;
89- console . log ( `\n### Running Scenario: ${ absolutePath } ###` ) ;
94+ debugLog (
95+ `\n### Running Scenario: ${ path . relative ( ROOT_DIR , absolutePath ) } ###` ,
96+ ) ;
9097
9198 let client : Client | undefined ;
9299 let transport : StdioClientTransport | undefined ;
@@ -121,10 +128,16 @@ async function runSingleScenario(
121128 }
122129 } ) ;
123130
131+ const args = [ serverPath ] ;
132+ if ( ! debug ) {
133+ args . push ( '--headless' ) ;
134+ }
135+
124136 transport = new StdioClientTransport ( {
125137 command : 'node' ,
126- args : [ serverPath ] ,
138+ args,
127139 env,
140+ stderr : debug ? 'inherit' : 'ignore' ,
128141 } ) ;
129142
130143 client = new Client (
@@ -165,7 +178,7 @@ async function runSingleScenario(
165178
166179 const genAI = new GoogleGenerativeAI ( apiKey ) ;
167180 const model = genAI . getGenerativeModel ( {
168- model : 'gemini-2.5-flash' ,
181+ model : modelId ,
169182 tools : [ { functionDeclarations} ] ,
170183 } ) ;
171184
@@ -181,8 +194,8 @@ async function runSingleScenario(
181194
182195 // Execute turns
183196 let turnCount = 0 ;
184- console . log ( `\n--- Turn 1 (User) ---` ) ;
185- console . log ( scenario . prompt ) ;
197+ debugLog ( `\n--- Turn 1 (User) ---` ) ;
198+ debugLog ( scenario . prompt ) ;
186199
187200 let result = await chat . sendMessage ( scenario . prompt , {
188201 timeout : 5000 ,
@@ -191,17 +204,15 @@ async function runSingleScenario(
191204
192205 while ( turnCount < scenario . maxTurns ) {
193206 turnCount ++ ;
194- console . log ( `\n--- Turn ${ turnCount } (Model) ---` ) ;
207+ debugLog ( `\n--- Turn ${ turnCount } (Model) ---` ) ;
195208 const text = response . text ( ) ;
196209 if ( text ) {
197- console . log ( `Text: ${ text } ` ) ;
210+ debugLog ( `Text: ${ text } ` ) ;
198211 }
199212
200213 const functionCalls = response . functionCalls ( ) ;
201214 if ( functionCalls && functionCalls . length > 0 ) {
202- console . log (
203- `Function Calls: ${ JSON . stringify ( functionCalls , null , 2 ) } ` ,
204- ) ;
215+ debugLog ( `Function Calls: ${ JSON . stringify ( functionCalls , null , 2 ) } ` ) ;
205216
206217 const functionResponses = [ ] ;
207218 for ( const call of functionCalls ) {
@@ -219,7 +230,7 @@ async function runSingleScenario(
219230
220231 const safeArgs = isRecord ( call . args ) ? call . args : { } ;
221232
222- console . log (
233+ debugLog (
223234 `Executing tool: ${ originalName } with args: ${ JSON . stringify ( call . args ) } ` ,
224235 ) ;
225236
@@ -253,49 +264,85 @@ async function runSingleScenario(
253264 }
254265
255266 // Send tool results back
256- console . log ( `Sending ${ functionResponses . length } tool outputs back...` ) ;
267+ debugLog ( `Sending ${ functionResponses . length } tool outputs back...` ) ;
257268 result = await chat . sendMessage ( functionResponses ) ;
258269 response = result . response ;
259270 } else {
260- console . log ( 'No tool calls. Interaction finished.' ) ;
271+ debugLog ( 'No tool calls. Interaction finished.' ) ;
261272 break ;
262273 }
263274 }
264275
265- console . log ( '\nVerifying expectations...' ) ;
276+ debugLog ( '\nVerifying expectations...' ) ;
266277 expectations ( allCalls ) ;
267278 } finally {
268279 await client ?. close ( ) ;
269280 await transport ?. close ( ) ;
270281 }
271282}
/**
 * CLI entry point: runs eval scenarios one after another against a single
 * shared TestServer and prints a pass/fail summary.
 *
 * Command line:
 *   --model <id>   Model id passed to each scenario (default 'gemini-2.5-flash').
 *   --debug        Enables verbose per-scenario logging.
 *   [files...]     Scenario files to run; when omitted, every `.ts`/`.js` file
 *                  in SCENARIOS_DIR is run.
 *
 * Exits the process with code 1 when at least one scenario fails.
 *
 * @throws Error when GEMINI_API_KEY is not set, or when there is no scenario
 *   to run (so an empty run is never reported as a success).
 */
async function main(): Promise<void> {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey) {
    throw new Error('GEMINI_API_KEY environment variable is required.');
  }

  const {values, positionals} = parseArgs({
    options: {
      model: {
        type: 'string',
        default: 'gemini-2.5-flash',
      },
      debug: {
        type: 'boolean',
        default: false,
      },
    },
    allowPositionals: true,
  });

  const modelId = values.model;
  const debug = values.debug;
  const scenarioFiles =
    positionals.length > 0
      ? positionals.map(p => path.resolve(p))
      : fs
          .readdirSync(SCENARIOS_DIR)
          // Declaration files cannot export a runtime `scenario`, so skip them.
          .filter(
            file =>
              (file.endsWith('.ts') && !file.endsWith('.d.ts')) ||
              file.endsWith('.js'),
          )
          // Directory listing order is platform-dependent; sort so runs are
          // reproducible.
          .sort()
          .map(file => path.join(SCENARIOS_DIR, file));

  // Fail loudly instead of printing "0 passed, 0 failed" and exiting 0.
  if (scenarioFiles.length === 0) {
    throw new Error(`No scenario files found in ${SCENARIOS_DIR}.`);
  }

  const server = new TestServer(TestServer.randomPort());
  await server.start();

  let successCount = 0;
  let failureCount = 0;

  try {
    // NOTE(review): scenarios run sequentially with no per-scenario timeout
    // here; a scenario that never settles blocks the whole run.
    for (const scenarioPath of scenarioFiles) {
      const label = path.relative(ROOT_DIR, scenarioPath);
      try {
        await runSingleScenario(scenarioPath, apiKey, server, modelId, debug);
        console.log(`✔ ${label}`);
        successCount++;
      } catch (e) {
        // A failing scenario is recorded and the remaining ones still run.
        console.error(`✖ ${label}`);
        console.error(e);
        failureCount++;
      } finally {
        // Reset the shared server between scenarios.
        server.restore();
      }
    }
  } finally {
    await server.stop();
  }

  console.log(`\nSummary: ${successCount} passed, ${failureCount} failed`);

  if (failureCount > 0) {
    process.exit(1);
  }
}
344+
// Script entry: run the CLI; anything that escapes main() (missing API key,
// argument parsing or server start/stop errors) is reported and turned into a
// non-zero exit code.
main().catch((fatal: unknown) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});
0 commit comments