豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit 88d5fc1

Browse files
ktran authored and Devtools-frontend LUCI CQ committed
[ai-eval] Add report generation
Using an entirely gemini-generated script for generating reports, to easier visualize what data we have after evaluation.

Bug: 475195894
Change-Id: I921c091901c62e30df99127ef0cc39997dbd9384
Reviewed-on: https://chromium-review.googlesource.com/c/devtools/devtools-frontend/+/7548148
Commit-Queue: Kim-Anh Tran <kimanh@chromium.org>
Reviewed-by: Jack Franklin <jacktfranklin@chromium.org>
1 parent c8a75d8 commit 88d5fc1

File tree

2 files changed

+596
-11
lines changed

2 files changed

+596
-11
lines changed

scripts/ai_assistance/suite/helpers/evaluators.ts

Lines changed: 22 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@ import type {Conversation} from '../types.js';
1313

1414
import {generateGeminiContent} from './gemini.ts';
1515
import {getGolden, getMarkdownConversation, getOutputs, type Output} from './outputs.ts';
16+
import {generateReport} from './report_generator.ts';
1617

1718
const argv = yargs(hideBin(process.argv)).option('repeat', {type: 'number', default: 1}).parseSync();
1819

@@ -50,6 +51,14 @@ class ConcurrencyLimiter {
5051

5152
const geminiLimiter = new ConcurrencyLimiter(25);
5253

54+
const allStores: ResultStore[] = [];
55+
56+
process.on('exit', () => {
57+
if (allStores.length > 0) {
58+
generateReport(allStores);
59+
}
60+
});
61+
5362
abstract class Evaluator {}
5463

5564
const NUM_CONVERSATIONS = '# of conversations';
@@ -58,13 +67,13 @@ const OVERALL_STATS = 'Weighted Overall';
5867
const PASS_RATE = 'Pass Rate';
5968
const ROUGE_L_SUM = 'ROUGE-Lsum';
6069

61-
type RubricName = string;
62-
interface RubricScore {
70+
export type RubricName = string;
71+
export interface RubricScore {
6372
rubric: RubricName;
6473
score: number;
6574
reason: string;
6675
}
67-
type RubricWeights = Record<RubricName, number>;
76+
export type RubricWeights = Record<RubricName, number>;
6877

6978
const IMPORTANCE_WEIGHTS: Record<string, number> = {
7079
critical: 5,
@@ -317,7 +326,7 @@ function calculateStandardDeviation(values: number[]): number {
317326
/**
318327
* Calculate the overall score for a conversation based on rubric importance.
319328
*/
320-
function calculateWeightedScore(rubricScores: RubricScore[], weights: RubricWeights): number {
329+
export function calculateWeightedScore(rubricScores: RubricScore[], weights: RubricWeights): number {
321330
let totalWeightedScore = 0;
322331
let totalWeight = 0;
323332
for (const {rubric, score} of rubricScores) {
@@ -398,6 +407,7 @@ export interface GroupConfig {
398407

399408
export async function evalGroup(config: GroupConfig, cb: (() => Promise<void>)): Promise<void> {
400409
const store = new ResultStore(config.type, config.label);
410+
allStores.push(store);
401411
const outputs = await getOutputs(config.type, config.label);
402412
const outputsByDate = Object.groupBy(outputs, o => o.dateFolder);
403413

@@ -407,6 +417,7 @@ export async function evalGroup(config: GroupConfig, cb: (() => Promise<void>)):
407417
logs: [],
408418
};
409419

420+
log(0, `Evaluating ${config.type} / ${config.label}...`);
410421
await stateStorage.run(state, async () => {
411422
await cb();
412423
});
@@ -500,24 +511,24 @@ function printResults(store: ResultStore): void {
500511
}
501512
}
502513

503-
interface RubricStats {
514+
export interface RubricStats {
504515
average: number;
505516
standardDeviation: number;
506517
allScores: number[];
507518
}
508519

509-
interface BinaryStats {
520+
export interface BinaryStats {
510521
success: number;
511522
total: number;
512523
}
513-
type RougeStats = RubricStats;
514-
interface JudgeStats {
524+
export type RougeStats = RubricStats;
525+
export interface JudgeStats {
515526
statsByRubric: Record<string, RubricStats>;
516527
overallStats: RubricStats;
517528
inputCount: number;
518529
}
519530

520-
type Result = {
531+
export type Result = {
521532
type: 'BINARY',
522533
details: Array<{success: boolean, conversation: Conversation}>,
523534
}|{
@@ -573,7 +584,7 @@ function calculateJudgeStats(result: Extract<Result, {type: 'JUDGE'}>): JudgeSta
573584
};
574585
}
575586

576-
function calculateStats(result: Result): BinaryStats|RougeStats|JudgeStats|null {
587+
export function calculateStats(result: Result): BinaryStats|RougeStats|JudgeStats|null {
577588
switch (result.type) {
578589
case 'BINARY':
579590
return calculateBinaryStats(result);
@@ -586,7 +597,7 @@ function calculateStats(result: Result): BinaryStats|RougeStats|JudgeStats|null
586597
}
587598
}
588599

589-
class ResultStore {
600+
export class ResultStore {
590601
// Map of testName => YYYY-MM-DD => Result
591602
#results = new Map<string, Map<string, Result>>();
592603
#type: string;

0 commit comments

Comments
 (0)