@@ -13,6 +13,7 @@ import type {Conversation} from '../types.js';
1313
1414import { generateGeminiContent } from './gemini.ts' ;
1515import { getGolden , getMarkdownConversation , getOutputs , type Output } from './outputs.ts' ;
16+ import { generateReport } from './report_generator.ts' ;
1617
// Parsed command-line arguments; `repeat` is declared as a numeric option that defaults to 1.
1718const argv = yargs ( hideBin ( process . argv ) ) . option ( 'repeat' , { type : 'number' , default : 1 } ) . parseSync ( ) ;
1819
@@ -50,6 +51,14 @@ class ConcurrencyLimiter {
5051
// NOTE(review): presumably caps concurrent Gemini calls at 25 — confirm against ConcurrencyLimiter's constructor.
5152const geminiLimiter = new ConcurrencyLimiter ( 25 ) ;
5253
// Module-level list of result stores; evalGroup pushes each store it creates onto it.
54+ const allStores : ResultStore [ ] = [ ] ;
55+
// When the process exits, hand every collected store to generateReport (skipped if none were registered).
// Node.js 'exit' listeners must be synchronous; asynchronous work started here is abandoned.
// NOTE(review): generateReport lives in ./report_generator.ts, not visible here — confirm it does no async work.
56+ process . on ( 'exit' , ( ) => {
57+ if ( allStores . length > 0 ) {
58+ generateReport ( allStores ) ;
59+ }
60+ } ) ;
61+
/** Abstract base class with an empty body. */
5362abstract class Evaluator { }
5463
5564const NUM_CONVERSATIONS = '# of conversations' ;
@@ -58,13 +67,13 @@ const OVERALL_STATS = 'Weighted Overall';
5867const PASS_RATE = 'Pass Rate' ;
5968const ROUGE_L_SUM = 'ROUGE-Lsum' ;
6069
61- type RubricName = string ;
62- interface RubricScore {
/** Name of a rubric; a plain string alias. */
70+ export type RubricName = string ;
/** The numeric score recorded for one rubric, together with a textual reason. */
71+ export interface RubricScore {
6372 rubric : RubricName ;
6473 score : number ;
6574 reason : string ;
6675}
67- type RubricWeights = Record < RubricName , number > ;
/** Numeric weight per rubric name; accepted by calculateWeightedScore as its `weights` argument. */
76+ export type RubricWeights = Record < RubricName , number > ;
6877
6978const IMPORTANCE_WEIGHTS : Record < string , number > = {
7079 critical : 5 ,
@@ -317,7 +326,7 @@ function calculateStandardDeviation(values: number[]): number {
317326/**
318327 * Calculate the overall score for a conversation based on rubric importance.
319328 */
320- function calculateWeightedScore ( rubricScores : RubricScore [ ] , weights : RubricWeights ) : number {
329+ export function calculateWeightedScore ( rubricScores : RubricScore [ ] , weights : RubricWeights ) : number {
321330 let totalWeightedScore = 0 ;
322331 let totalWeight = 0 ;
323332 for ( const { rubric, score} of rubricScores ) {
@@ -398,6 +407,7 @@ export interface GroupConfig {
398407
399408export async function evalGroup ( config : GroupConfig , cb : ( ( ) => Promise < void > ) ) : Promise < void > {
400409 const store = new ResultStore ( config . type , config . label ) ;
410+ allStores . push ( store ) ;
401411 const outputs = await getOutputs ( config . type , config . label ) ;
402412 const outputsByDate = Object . groupBy ( outputs , o => o . dateFolder ) ;
403413
@@ -407,6 +417,7 @@ export async function evalGroup(config: GroupConfig, cb: (() => Promise<void>)):
407417 logs : [ ] ,
408418 } ;
409419
420+ log ( 0 , `Evaluating ${ config . type } / ${ config . label } ...` ) ;
410421 await stateStorage . run ( state , async ( ) => {
411422 await cb ( ) ;
412423 } ) ;
@@ -500,24 +511,24 @@ function printResults(store: ResultStore): void {
500511 }
501512}
502513
503- interface RubricStats {
/** Aggregate statistics over a set of numeric scores. */
514+ export interface RubricStats {
504515 average : number ;
505516 standardDeviation : number ;
// NOTE(review): presumably the raw scores that average and standardDeviation were computed from — confirm.
506517 allScores : number [ ] ;
507518}
508519
509- interface BinaryStats {
/**
 * A pair of numeric counters, `success` and `total`.
 * NOTE(review): presumably `success` counts passing entries out of `total` — confirm in calculateBinaryStats.
 */
520+ export interface BinaryStats {
510521 success : number ;
511522 total : number ;
512523}
513- type RougeStats = RubricStats ;
514- interface JudgeStats {
/** ROUGE statistics share the RubricStats shape. */
524+ export type RougeStats = RubricStats ;
/** Per-rubric statistics keyed by string, plus overall statistics and an input count. */
525+ export interface JudgeStats {
515526 statsByRubric : Record < string , RubricStats > ;
516527 overallStats : RubricStats ;
517528 inputCount : number ;
518529}
519530
520- type Result = {
531+ export type Result = {
521532 type : 'BINARY' ,
522533 details : Array < { success : boolean , conversation : Conversation } > ,
523534} | {
@@ -573,7 +584,7 @@ function calculateJudgeStats(result: Extract<Result, {type: 'JUDGE'}>): JudgeSta
573584 } ;
574585}
575586
576- function calculateStats ( result : Result ) : BinaryStats | RougeStats | JudgeStats | null {
587+ export function calculateStats ( result : Result ) : BinaryStats | RougeStats | JudgeStats | null {
577588 switch ( result . type ) {
578589 case 'BINARY' :
579590 return calculateBinaryStats ( result ) ;
@@ -586,7 +597,7 @@ function calculateStats(result: Result): BinaryStats|RougeStats|JudgeStats|null
586597 }
587598}
588599
589- class ResultStore {
600+ export class ResultStore {
590601 // Map of testName => YYYY-MM-DD => Result
591602 #results = new Map < string , Map < string , Result > > ( ) ;
592603 #type: string ;
0 commit comments