whitphx HF Staff committed on
Commit
f6c05e9
·
1 Parent(s): 11d0b50

id serialization

Browse files
.gitignore CHANGED
@@ -136,3 +136,4 @@ bench-web/.transformers-cache
136
 
137
  # Benchmark result files
138
  bench/benchmark-results.jsonl
 
 
136
 
137
  # Benchmark result files
138
  bench/benchmark-results.jsonl
139
+ bench/benchmark-results/
bench/docs/benchmark-id-format.md ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Benchmark ID and File Organization
2
+
3
+ ## Overview
4
+
5
+ Benchmark results are organized using a deterministic ID system that groups results with identical settings into the same file. The ID is structured hierarchically with task at the top level, followed by model ID, and finally encoded parameters.
6
+
7
+ ## Directory Structure
8
+
9
+ Results are stored in nested directories with task as the top level:
10
+
11
+ ```
12
+ benchmark-results/
13
+ ├── {task}/
14
+ │ └── {org}/
15
+ │ └── {model-name}/
16
+ │ ├── {params1}.jsonl
17
+ │ ├── {params2}.jsonl
18
+ │ └── {params3}.jsonl
19
+ ```
20
+
21
+ ## ID Format
22
+
23
+ **Full ID**: `{task}/{modelId}/{platform}_{mode}_{device}_{dtype}_{batch}_{browser}_{headed}`
24
+
25
+ ### Components
26
+
27
+ 1. **Task** (top-level directory): The transformers task
28
+ - Examples: `feature-extraction`, `text-classification`, `text-generation`, `sentiment-analysis`
29
+ - Rationale: Tasks are fundamentally different operations, so they form the primary organization
30
+
31
+ 2. **Model ID** (nested directory path): Full model identifier with organization
32
+ - Examples: `Xenova/all-MiniLM-L6-v2`, `meta-llama/Llama-2-7b`
33
+ - Preserved as-is, including slashes for directory structure
34
+
35
+ 3. **Platform**: `node` or `web`
36
+
37
+ 4. **Mode**: `warm` or `cold`
38
+
39
+ 5. **Device**: Execution device
40
+ - Node.js: `cpu` (default), `webgpu`
41
+ - Web: `webgpu` (default), `wasm`
42
+
43
+ 6. **DType** (optional): Model data type
44
+ - Examples: `fp32`, `fp16`, `q8`, `q4`, `int8`
45
+ - Omitted if not specified
46
+
47
+ 7. **Batch Size**: Always included as `b{N}`
48
+ - Examples: `b1`, `b4`, `b32`
49
+
50
+ 8. **Browser** (web only): Browser type
51
+ - Examples: `chromium`, `firefox`, `webkit`
52
+ - Omitted for Node.js benchmarks
53
+
54
+ 9. **Headed** (web only): Display mode
55
+ - Included as `headed` only if true
56
+ - Omitted for headless mode or Node.js benchmarks
57
+
58
+ ## Examples
59
+
60
+ ### Node.js Benchmarks
61
+
62
+ ```
63
+ feature-extraction/Xenova/all-MiniLM-L6-v2/node_warm_cpu_fp32_b1.jsonl
64
+ feature-extraction/Xenova/all-MiniLM-L6-v2/node_warm_webgpu_fp16_b4.jsonl
65
+ feature-extraction/Xenova/all-MiniLM-L6-v2/node_cold_cpu_b1.jsonl
66
+ text-generation/meta-llama/Llama-2-7b/node_warm_cpu_q4_b1.jsonl
67
+ ```
68
+
69
+ ### Web Benchmarks
70
+
71
+ ```
72
+ feature-extraction/Xenova/distilbert-base-uncased/web_warm_wasm_b1_chromium.jsonl
73
+ feature-extraction/Xenova/distilbert-base-uncased/web_warm_wasm_q8_b1_firefox.jsonl
74
+ feature-extraction/Xenova/distilbert-base-uncased/web_warm_webgpu_fp16_b1_chromium_headed.jsonl
75
+ feature-extraction/Xenova/roberta-large-mnli/web_cold_wasm_b1_chromium.jsonl
76
+ ```
77
+
78
+ ### Mixed Tasks and Models
79
+
80
+ ```
81
+ benchmark-results/
82
+ ├── feature-extraction/
83
+ │ └── Xenova/
84
+ │ ├── all-MiniLM-L6-v2/
85
+ │ │ ├── node_warm_cpu_fp32_b1.jsonl
86
+ │ │ └── web_warm_wasm_b1_chromium.jsonl
87
+ │ └── distilbert-base-uncased/
88
+ │ └── node_warm_webgpu_fp16_b1.jsonl
89
+ └── text-classification/
90
+ └── Xenova/
91
+ └── distilbert-base-uncased/
92
+ └── node_warm_cpu_fp32_b1.jsonl
93
+ ```
94
+
95
+ ## File Format
96
+
97
+ Each file is in JSONL (JSON Lines) format, with one benchmark result per line. This allows:
98
+ - Appending new results without parsing the entire file
99
+ - Streaming large result sets
100
+ - Easy analysis with tools like `jq`
101
+
102
+ Example:
103
+ ```jsonl
104
+ {"id":"uuid1","platform":"node","modelId":"Xenova/all-MiniLM-L6-v2","task":"feature-extraction",...}
105
+ {"id":"uuid2","platform":"node","modelId":"Xenova/all-MiniLM-L6-v2","task":"feature-extraction",...}
106
+ {"id":"uuid3","platform":"node","modelId":"Xenova/all-MiniLM-L6-v2","task":"feature-extraction",...}
107
+ ```
108
+
109
+ ## Querying Results
110
+
111
+ ### API Endpoints
112
+
113
+ 1. **Get all results**:
114
+ ```bash
115
+ curl http://localhost:7860/api/benchmarks
116
+ ```
117
+
118
+ 2. **Get results by model**:
119
+ ```bash
120
+ curl "http://localhost:7860/api/benchmarks?modelId=Xenova/all-MiniLM-L6-v2"
121
+ ```
122
+
123
+ 3. **Get specific benchmark**:
124
+ ```bash
125
+ curl http://localhost:7860/api/benchmark/{uuid}
126
+ ```
127
+
128
+ ### Direct File Access
129
+
130
+ Results can also be queried directly from the filesystem:
131
+
132
+ ```bash
133
+ # All results for a specific task
134
+ cat benchmark-results/feature-extraction/*/*/*.jsonl | jq
135
+
136
+ # All results for a specific model across all tasks
137
+ cat benchmark-results/*/Xenova/all-MiniLM-L6-v2/*.jsonl | jq
138
+
139
+ # All results for a specific model and task
140
+ cat benchmark-results/feature-extraction/Xenova/all-MiniLM-L6-v2/*.jsonl | jq
141
+
142
+ # Specific configuration
143
+ cat benchmark-results/feature-extraction/Xenova/all-MiniLM-L6-v2/node_warm_cpu_fp32_b1.jsonl | jq
144
+
145
+ # Count results per configuration
146
+ wc -l benchmark-results/feature-extraction/Xenova/all-MiniLM-L6-v2/*.jsonl
147
+
148
+ # Filter by device across all models
149
+ cat benchmark-results/feature-extraction/*/*/web_*_wasm_*.jsonl | jq
150
+
151
+ # Compare same model across different tasks
152
+ cat benchmark-results/*/Xenova/distilbert-base-uncased/node_warm_cpu_fp32_b1.jsonl | jq
153
+ ```
154
+
155
+ ## Benefits
156
+
157
+ 1. **Task-First Organization**: Primary organization by task type, as models are typically designed for specific tasks
158
+ 2. **Grouping**: Multiple runs with identical settings are stored together in JSONL files
159
+ 3. **Easy Comparison**: Compare different models on the same task, or same model across different tasks
160
+ 4. **Organization**: Clear hierarchy: task → org → model → configurations
161
+ 5. **Readability**: Filenames are human-readable and self-documenting
162
+ 6. **Searchability**: Easy to find specific configurations using filesystem tools and glob patterns
163
+ 7. **Scalability**: Nested directory structure handles thousands of models and tasks
164
+ 8. **Model ID Preservation**: Full model IDs maintained without sanitization, preserving org/model structure
165
+
166
+ ## Configuration
167
+
168
+ The base directory can be customized via environment variable:
169
+
170
+ ```bash
171
+ export BENCHMARK_RESULTS_DIR=/path/to/results
172
+ npm run server
173
+ ```
174
+
175
+ Default: `./benchmark-results`
bench/src/core/benchmark-id.ts ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Benchmark ID Generator
 *
 * Creates human-readable, deterministic IDs from benchmark settings that:
 * 1. Group results with identical configurations
 * 2. Use the task and the model ID as directory structure
 *    (e.g., "feature-extraction/Xenova/all-MiniLM-L6-v2/...")
 * 3. Encode the remaining parameters as the filename
 * 4. Are sortable and searchable
 */

/** Settings that together identify one benchmark configuration. */
export interface BenchmarkSettings {
  platform: "node" | "web";
  modelId: string;
  task: string;
  mode: "warm" | "cold";
  device?: string;
  dtype?: string;
  batchSize?: number;
  browser?: string;
  headed?: boolean;
}

/** Extension appended to the encoded settings to form a result filename. */
const RESULT_FILE_EXTENSION = ".jsonl";

// Tokens the parser recognises unconditionally. Values outside these lists
// are still accepted when their position in the filename is unambiguous
// (see parseBenchmarkId).
const KNOWN_DEVICES: readonly string[] = ["cpu", "webgpu", "wasm"];
const KNOWN_DTYPES: readonly string[] = ["fp32", "fp16", "q8", "q4", "int8", "uint8", "bnb4", "q4f16"];
const KNOWN_BROWSERS: readonly string[] = ["chromium", "firefox", "webkit"];

/**
 * Generate a benchmark ID path from settings
 *
 * Format: {task}/{modelId}/{platform}_{mode}_{device}_{dtype}_{batch}_{browser}_{headed}
 * (dtype, browser and headed are omitted when they do not apply).
 *
 * Examples:
 * - "feature-extraction/Xenova/all-MiniLM-L6-v2/node_warm_cpu_fp32_b1"
 * - "feature-extraction/Xenova/distilbert-base-uncased/web_warm_wasm_q8_b1_chromium"
 * - "text-generation/meta-llama/Llama-2-7b/web_cold_webgpu_fp16_b4_firefox_headed"
 *
 * The path can be used to create directories and files:
 * - Directory: {task}/{modelId}/
 * - Filename: {platform}_{mode}_{device}_{dtype}_{batch}_{browser}_{headed}.jsonl
 *
 * Note: task and modelId are inserted as-is (no sanitization), so slashes in
 * the model ID become directory separators.
 */
export function generateBenchmarkId(settings: BenchmarkSettings): string {
  const filenameStem = generateFilenameParts(settings).join("_");
  return `${settings.task}/${settings.modelId}/${filenameStem}`;
}

/**
 * Generate the filename parts (everything except task and model ID)
 *
 * Order: platform, mode, device, [dtype], batch, [browser], [headed].
 * Platform, mode, device and batch are always present; that fixed layout is
 * what lets parseBenchmarkId recover values it has no list entry for.
 */
function generateFilenameParts(settings: BenchmarkSettings): string[] {
  const parts: string[] = [settings.platform, settings.mode];

  // Device: explicit value, otherwise the per-platform default.
  if (settings.device) {
    parts.push(settings.device);
  } else if (settings.platform === "node") {
    parts.push("cpu");
  } else {
    parts.push("webgpu");
  }

  // DType only when specified.
  if (settings.dtype) {
    parts.push(settings.dtype);
  }

  // Batch size is always included; a missing (or 0) batch size is encoded as b1.
  const batchSize = settings.batchSize || 1;
  parts.push(`b${batchSize}`);

  // Browser and headed flag only make sense for the web platform.
  if (settings.platform === "web") {
    if (settings.browser) {
      parts.push(settings.browser);
    }
    if (settings.headed) {
      parts.push("headed");
    }
  }

  return parts;
}

/**
 * Generate a filesystem path for storing benchmark results
 *
 * Returns, e.g.:
 * {
 *   dir: "feature-extraction/Xenova/all-MiniLM-L6-v2",
 *   filename: "node_warm_cpu_fp32_b1.jsonl",
 *   fullPath: "feature-extraction/Xenova/all-MiniLM-L6-v2/node_warm_cpu_fp32_b1.jsonl"
 * }
 *
 * All three values are relative and use "/" as the separator.
 */
export function generateBenchmarkPath(settings: BenchmarkSettings): { dir: string; filename: string; fullPath: string } {
  const dir = `${settings.task}/${settings.modelId}`;
  const filename = `${generateFilenameParts(settings).join("_")}${RESULT_FILE_EXTENSION}`;
  return { dir, filename, fullPath: `${dir}/${filename}` };
}

/**
 * Read the batch size out of a `b{N}` token.
 * Returns undefined when the token is missing or is not a batch token.
 */
function parseBatchToken(part: string | undefined): number | undefined {
  if (part === undefined || !part.startsWith("b")) {
    return undefined;
  }
  const batch = parseInt(part.substring(1), 10);
  return Number.isNaN(batch) ? undefined : batch;
}

/** True for tokens that can only be a batch size, a browser or the headed flag. */
function isTrailingToken(part: string): boolean {
  return parseBatchToken(part) !== undefined || KNOWN_BROWSERS.includes(part) || part === "headed";
}

/**
 * Parse a benchmark ID path back into settings (best effort)
 * This is useful for filtering and querying
 *
 * Accepts both IDs and result file paths (a trailing ".jsonl" is ignored).
 *
 * Example: "feature-extraction/Xenova/all-MiniLM-L6-v2/node_warm_cpu_fp32_b1"
 *
 * Device and dtype values that are not in the built-in lists (for example
 * "cuda" or "auto") are still recovered from their position, because the
 * generator always writes the device right after the mode and the batch size
 * right after the optional dtype.
 *
 * Limitations:
 * - The ID must have at least four "/"-separated parts
 *   (task/org/model/filename); otherwise an empty object is returned.
 * - Fields that cannot be recognised are left unset rather than guessed.
 */
export function parseBenchmarkId(id: string): Partial<BenchmarkSettings> {
  const settings: Partial<BenchmarkSettings> = {};

  if (id.split("/").length < 4) {
    return settings; // Invalid ID - need at least task/org/model/filename
  }

  // Task is the first path part; the model ID is everything between the
  // first and the last slash (e.g. "Xenova/all-MiniLM-L6-v2").
  const firstSlashIdx = id.indexOf("/");
  const lastSlashIdx = id.lastIndexOf("/");
  settings.task = id.substring(0, firstSlashIdx);
  settings.modelId = id.substring(firstSlashIdx + 1, lastSlashIdx);

  // Filename is everything after the last slash, without the file extension.
  let filename = id.substring(lastSlashIdx + 1);
  if (filename.endsWith(RESULT_FILE_EXTENSION)) {
    filename = filename.slice(0, -RESULT_FILE_EXTENSION.length);
  }

  const parts = filename.split("_");
  if (parts.length < 3) {
    return settings; // Invalid filename format
  }

  let idx = 0;

  // Platform
  const platformToken = parts[idx];
  if (platformToken === "node" || platformToken === "web") {
    settings.platform = platformToken;
    idx++;
  }

  // Mode
  const modeToken = parts[idx];
  if (modeToken === "warm" || modeToken === "cold") {
    settings.mode = modeToken;
    idx++;
  }

  // Device: a known device name, or - when platform and mode were both read,
  // so this is the slot the generator always fills with the device - any
  // token that cannot be mistaken for a later field.
  const deviceToken = parts[idx];
  if (deviceToken !== undefined && deviceToken.length > 0) {
    const isPositionalDevice =
      idx === 2 && !KNOWN_DTYPES.includes(deviceToken) && !isTrailingToken(deviceToken);
    if (KNOWN_DEVICES.includes(deviceToken) || isPositionalDevice) {
      settings.device = deviceToken;
      idx++;
    }
  }

  // DType: a known dtype, or an unrecognised token sitting between the
  // device and the batch token.
  const dtypeToken = parts[idx];
  if (dtypeToken !== undefined && dtypeToken.length > 0) {
    const isPositionalDtype =
      settings.device !== undefined &&
      !isTrailingToken(dtypeToken) &&
      parseBatchToken(parts[idx + 1]) !== undefined;
    if (KNOWN_DTYPES.includes(dtypeToken) || isPositionalDtype) {
      settings.dtype = dtypeToken;
      idx++;
    }
  }

  // Batch size
  const batch = parseBatchToken(parts[idx]);
  if (batch !== undefined) {
    settings.batchSize = batch;
    idx++;
  }

  // Browser
  const browserToken = parts[idx];
  if (browserToken !== undefined && KNOWN_BROWSERS.includes(browserToken)) {
    settings.browser = browserToken;
    idx++;
  }

  // Headed
  if (parts[idx] === "headed") {
    settings.headed = true;
  }

  return settings;
}

/**
 * Generate a human-readable display name from settings
 *
 * Layout: "{modelId} ({task}) [{browser|node}/{device}] {mode} [{dtype}] [batch={N}] [headed]".
 * The batch size is shown only when it differs from 1.
 */
export function generateDisplayName(settings: BenchmarkSettings): string {
  const parts: string[] = [settings.modelId, `(${settings.task})`];

  // Platform and device, falling back to the same defaults the ID uses.
  if (settings.platform === "web") {
    parts.push(`[${settings.browser || "browser"}/${settings.device || "webgpu"}]`);
  } else {
    parts.push(`[node/${settings.device || "cpu"}]`);
  }

  parts.push(settings.mode);

  if (settings.dtype) {
    parts.push(settings.dtype);
  }

  const batchSize = settings.batchSize || 1;
  if (batchSize !== 1) {
    parts.push(`batch=${batchSize}`);
  }

  if (settings.headed) {
    parts.push("headed");
  }

  return parts.join(" ");
}
bench/src/server/index.ts CHANGED
@@ -132,9 +132,19 @@ app.get("/api/benchmark/:id", async (c) => {
132
  /**
133
  * GET /api/benchmarks
134
  * Get all benchmark results from storage
 
 
135
  */
136
  app.get("/api/benchmarks", async (c) => {
137
- const results = await storage.getAllResults();
 
 
 
 
 
 
 
 
138
  return c.json({
139
  total: results.length,
140
  results,
 
132
  /**
133
  * GET /api/benchmarks
134
  * Get all benchmark results from storage
135
+ * Query params:
136
+ * - modelId: Filter by model ID
137
  */
138
  app.get("/api/benchmarks", async (c) => {
139
+ const modelId = c.req.query("modelId");
140
+
141
+ let results;
142
+ if (modelId) {
143
+ results = await storage.getResultsByModel(modelId);
144
+ } else {
145
+ results = await storage.getAllResults();
146
+ }
147
+
148
  return c.json({
149
  total: results.length,
150
  results,
bench/src/server/storage.ts CHANGED
@@ -1,24 +1,55 @@
1
  import fs from "fs/promises";
2
  import path from "path";
3
  import { QueuedBenchmark } from "./queue.js";
 
4
 
5
  export class BenchmarkStorage {
6
- private filePath: string;
7
 
8
- constructor(filePath?: string) {
9
  // Use environment variable if set, otherwise fall back to default
10
- const defaultPath = process.env.BENCHMARK_RESULTS_PATH || "./benchmark-results.jsonl";
11
- this.filePath = path.resolve(filePath || defaultPath);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  }
13
 
14
  async appendResult(benchmark: QueuedBenchmark): Promise<void> {
 
 
 
 
 
 
 
15
  const line = JSON.stringify(benchmark) + "\n";
16
- await fs.appendFile(this.filePath, line, "utf-8");
17
  }
18
 
19
- async getAllResults(): Promise<QueuedBenchmark[]> {
 
 
 
20
  try {
21
- const content = await fs.readFile(this.filePath, "utf-8");
22
  const lines = content.trim().split("\n").filter(line => line.length > 0);
23
  return lines.map(line => JSON.parse(line));
24
  } catch (error: any) {
@@ -29,14 +60,81 @@ export class BenchmarkStorage {
29
  }
30
  }
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  async getResultById(id: string): Promise<QueuedBenchmark | undefined> {
33
  const results = await this.getAllResults();
34
  return results.find(r => r.id === id);
35
  }
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  async clearResults(): Promise<void> {
38
  try {
39
- await fs.unlink(this.filePath);
40
  } catch (error: any) {
41
  if (error.code !== "ENOENT") {
42
  throw error;
 
1
  import fs from "fs/promises";
2
  import path from "path";
3
  import { QueuedBenchmark } from "./queue.js";
4
+ import { generateBenchmarkPath, type BenchmarkSettings } from "../core/benchmark-id.js";
5
 
6
  export class BenchmarkStorage {
7
+ private baseDir: string;
8
 
9
+ constructor(baseDir?: string) {
10
  // Use environment variable if set, otherwise fall back to default
11
+ const defaultDir = process.env.BENCHMARK_RESULTS_DIR || "./benchmark-results";
12
+ this.baseDir = path.resolve(baseDir || defaultDir);
13
+ }
14
+
15
+ /**
16
+ * Get the file path for a benchmark based on its settings
17
+ */
18
+ private getBenchmarkFilePath(benchmark: QueuedBenchmark): string {
19
+ const settings: BenchmarkSettings = {
20
+ platform: benchmark.platform,
21
+ modelId: benchmark.modelId,
22
+ task: benchmark.task,
23
+ mode: benchmark.mode,
24
+ device: benchmark.device,
25
+ dtype: benchmark.dtype,
26
+ batchSize: benchmark.batchSize,
27
+ browser: benchmark.browser,
28
+ headed: benchmark.headed,
29
+ };
30
+
31
+ const { dir, filename } = generateBenchmarkPath(settings);
32
+ return path.join(this.baseDir, dir, filename);
33
  }
34
 
35
  async appendResult(benchmark: QueuedBenchmark): Promise<void> {
36
+ const filePath = this.getBenchmarkFilePath(benchmark);
37
+ const dir = path.dirname(filePath);
38
+
39
+ // Ensure directory exists
40
+ await fs.mkdir(dir, { recursive: true });
41
+
42
+ // Append result as JSONL
43
  const line = JSON.stringify(benchmark) + "\n";
44
+ await fs.appendFile(filePath, line, "utf-8");
45
  }
46
 
47
+ /**
48
+ * Read all results from a specific JSONL file
49
+ */
50
+ private async readJsonlFile(filePath: string): Promise<QueuedBenchmark[]> {
51
  try {
52
+ const content = await fs.readFile(filePath, "utf-8");
53
  const lines = content.trim().split("\n").filter(line => line.length > 0);
54
  return lines.map(line => JSON.parse(line));
55
  } catch (error: any) {
 
60
  }
61
  }
62
 
63
+ /**
64
+ * Recursively find all JSONL files in the results directory
65
+ */
66
+ private async findAllJsonlFiles(dir: string): Promise<string[]> {
67
+ const files: string[] = [];
68
+
69
+ try {
70
+ const entries = await fs.readdir(dir, { withFileTypes: true });
71
+
72
+ for (const entry of entries) {
73
+ const fullPath = path.join(dir, entry.name);
74
+
75
+ if (entry.isDirectory()) {
76
+ // Recursively search subdirectories
77
+ const subFiles = await this.findAllJsonlFiles(fullPath);
78
+ files.push(...subFiles);
79
+ } else if (entry.isFile() && entry.name.endsWith(".jsonl")) {
80
+ files.push(fullPath);
81
+ }
82
+ }
83
+ } catch (error: any) {
84
+ if (error.code === "ENOENT") {
85
+ return []; // Directory doesn't exist yet
86
+ }
87
+ throw error;
88
+ }
89
+
90
+ return files;
91
+ }
92
+
93
+ async getAllResults(): Promise<QueuedBenchmark[]> {
94
+ const allFiles = await this.findAllJsonlFiles(this.baseDir);
95
+ const allResults: QueuedBenchmark[] = [];
96
+
97
+ for (const file of allFiles) {
98
+ const results = await this.readJsonlFile(file);
99
+ allResults.push(...results);
100
+ }
101
+
102
+ return allResults;
103
+ }
104
+
105
  async getResultById(id: string): Promise<QueuedBenchmark | undefined> {
106
  const results = await this.getAllResults();
107
  return results.find(r => r.id === id);
108
  }
109
 
110
+ /**
111
+ * Get all results for a specific benchmark configuration
112
+ */
113
+ async getResultsBySettings(settings: BenchmarkSettings): Promise<QueuedBenchmark[]> {
114
+ const { dir, filename } = generateBenchmarkPath(settings);
115
+ const filePath = path.join(this.baseDir, dir, filename);
116
+ return this.readJsonlFile(filePath);
117
+ }
118
+
119
/**
 * Get all results for a specific model (all tasks, all configurations)
 *
 * Results are stored under `{baseDir}/{task}/{modelId}/...`, so every task
 * directory directly below the base directory is searched for the model.
 *
 * @param modelId Model identifier including the organization,
 *   e.g. "Xenova/all-MiniLM-L6-v2".
 * @returns The results found for the model; an empty array when the base
 *   directory does not exist, when no task contains the model, or when
 *   `modelId` would resolve to a location outside a task directory.
 */
async getResultsByModel(modelId: string): Promise<QueuedBenchmark[]> {
  // Collect the task directories (first level below the base directory).
  const taskDirs: string[] = [];
  try {
    const entries = await fs.readdir(this.baseDir, { withFileTypes: true });
    for (const entry of entries) {
      if (entry.isDirectory()) {
        taskDirs.push(path.join(this.baseDir, entry.name));
      }
    }
  } catch (error: unknown) {
    if ((error as { code?: unknown }).code === "ENOENT") {
      return []; // Nothing has been stored yet
    }
    throw error;
  }

  const allResults: QueuedBenchmark[] = [];

  for (const taskDir of taskDirs) {
    const modelDir = path.resolve(taskDir, modelId);

    // modelId may come from a request; never read outside the task directory
    // (e.g. "../../etc" or an absolute path).
    const relative = path.relative(taskDir, modelDir);
    const escapesTaskDir =
      relative === ".." || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative);
    if (escapesTaskDir) {
      continue;
    }

    // A task without this model yields no files (missing directory -> []).
    const files = await this.findAllJsonlFiles(modelDir);
    for (const file of files) {
      const results = await this.readJsonlFile(file);
      allResults.push(...results);
    }
  }

  return allResults;
}
134
+
135
  async clearResults(): Promise<void> {
136
  try {
137
+ await fs.rm(this.baseDir, { recursive: true, force: true });
138
  } catch (error: any) {
139
  if (error.code !== "ENOENT") {
140
  throw error;