sveltejs · khromov · Dec 29, 2025 · Dec 12, 2025 · Dec 12, 2025 · Dec 12, 2025
diff --git a/AGENTS.md b/AGENTS.md
@@ -20,14 +20,35 @@ bun run generate-report.ts
 # Generate HTML report from specific result file
 bun run generate-report.ts results/result-2024-12-07-14-30-45.json
 
+# Generate index.html with all results
+bun run generate-index
+
+# Build all reports (generate-report + generate-index)
+bun run build
+
 # Run unit tests for lib modules
-bun run test:self
+bun test
+
+# Run specific test file
+bun test lib/utils.test.ts
 
 # Run TypeScript type checking
 bun tsc --noEmit
 
 # Format code with Prettier
 bun run prettier
+
+# Lint code
+bun run lint
+
+# Lint and fix code
+bun run lint:fix
+
+# Link to Vercel project for AI Gateway
+bun run vercel:link
+
+# Pull environment variables from Vercel
+bun run vercel:env:pull
 ```
 
 ## Environment Variables
@@ -60,14 +81,19 @@ MCP integration is configured via the interactive CLI at runtime. Options:
 ├── lib/
 │   ├── pricing.ts              # Cost calculation from gateway pricing
 │   ├── pricing.test.ts         # Unit tests for pricing module
-│   ├── test-discovery.ts       # Test suite discovery and prompt building
-│   ├── test-discovery.test.ts  # Unit tests for test discovery
+│   ├── test-discovery.ts       # Test suite discovery
 │   ├── output-test-runner.ts   # Vitest runner for component verification
 │   ├── output-test-runner.test.ts # Unit tests for output runner
+│   ├── validator-runner.ts     # Optional validator runner
+│   ├── validator-runner.test.ts # Unit tests for validator runner
 │   ├── verify-references.ts    # Reference implementation verification
 │   ├── report.ts               # Report generation orchestration
+│   ├── report.test.ts          # Unit tests for report module
 │   ├── report-template.ts      # HTML report template generation
 │   ├── report-styles.ts        # CSS styles for HTML reports
+│   ├── token-cache.ts          # Token cache simulation for cost estimation
+│   ├── utils.ts                # Utility functions (sanitization, cost calculation, etc.)
+│   ├── utils.test.ts           # Unit tests for utility functions
 │   └── tools/
 │       ├── index.ts            # Tool exports
 │       ├── result-write.ts     # ResultWrite tool for final output
@@ -76,9 +102,10 @@ MCP integration is configured via the interactive CLI at runtime. Options:
 │       └── test-component.test.ts # Unit tests for TestComponent tool
 ├── tests/                      # Benchmark test suites
 │   └── {test-name}/
-│       ├── Reference.svelte    # Reference implementation
-│       ├── test.ts             # Vitest test file
-│       └── prompt.md           # Agent prompt
+│       ├── Reference.svelte    # Reference implementation (required)
+│       ├── test.ts             # Vitest test file (required)
+│       ├── prompt.md           # Agent prompt (required)
+│       └── validator.ts        # Optional custom validator
 ├── results/                    # Benchmark results (JSON + HTML)
 ├── outputs/                    # Temporary directory for test verification
 └── patches/                    # Patches for dependencies
@@ -149,8 +176,122 @@ Key functions:
 
 - `extractPricingFromGatewayModel()`: Parse gateway model pricing
 - `buildPricingMap()`: Build lookup map from gateway models
+- `lookupPricingFromMap()`: Find pricing for a specific model
 - `calculateCost()`: Calculate total cost from token usage
 - `formatCost()` / `formatMTokCost()`: Format costs for display
+- `getModelPricingDisplay()`: Convert per-token costs to per-MTok for display
+
+### Token Cache Simulation
+
+The `lib/token-cache.ts` module simulates prompt caching behavior:
+
+**TokenCache Class:**
+
+- Models growing prefix cache across multiple API calls
+- Tracks cache hits, cache writes, and output tokens
+- Calculates simulated costs using cache read/write rates
+- Default rates: 10% for reads, 125% for writes (if not specified in pricing)
+
+**Cache Behavior Model:**
+
+1. Each test runs in its own context (cache resets between tests)
+2. Step 1's input is written to cache (pays cache creation rate)
+3. Each subsequent step:
+   - Previous step's full input is cached (pays cache read rate)
+   - New tokens extend the cache (pays cache creation rate)
+4. The cache prefix grows with each step
+
+**simulateCacheSavings()** (in `lib/utils.ts`):
+
+- Estimates cost savings with prompt caching enabled
+- Returns `simulatedCostWithCache`, `cacheHits`, and `cacheWriteTokens`
+- Results displayed in HTML report as "Cache Simulation" section
+- Shows potential savings compared to actual cost without caching
+
+### Test Discovery
+
+The `lib/test-discovery.ts` module handles test suite discovery:
+
+**Key Functions:**
+
+- `discoverTests()`: Scans `tests/` directory for test suites
+- Returns array of `TestDefinition` objects
+- Each test suite must have all three files: Reference.svelte, test.ts, and prompt.md
+- Skips incomplete test suites with a warning
+
+**Test Definition Structure:**
+```typescript
+interface TestDefinition {
+  name: string;           // Directory name from tests/{name}/
+  directory: string;      // Full path to test directory
+  referenceFile: string;  // Path to Reference.svelte
+  componentFile: string;  // Path to Component.svelte (generated)
+  testFile: string;       // Path to test.ts
+  promptFile: string;     // Path to prompt.md
+  prompt: string;         // Contents of prompt.md file
+  testContent: string;    // Contents of test.ts file
+}
+```
+
+### Validator Runner
+
+The `lib/validator-runner.ts` module provides optional validation for test outputs:
+
+**Key Functions:**
+
+- `hasValidator()`: Check if a test has an optional `validator.ts` file
+- `getValidatorPath()`: Get the path to the validator file if it exists
+- Validators can perform custom validation logic beyond standard test suites
+
+**Validator Interface:**
+```typescript
+interface ValidatorModule {
+  validate: (code: string) => ValidationResult;
+}
+
+interface ValidationResult {
+  valid: boolean;
+  errors: string[];
+}
+```
+
+### Utility Functions
+
+The `lib/utils.ts` module provides core utilities:
+
+- `sanitizeModelName()`: Convert model IDs to filesystem-safe names
+- `getTimestampedFilename()`: Generate timestamped filenames with optional model suffix
+- `isHttpUrl()`: Check if string is HTTP/HTTPS URL
+- `extractResultWriteContent()`: Extract component code from agent steps
+- `calculateTotalCost()`: Aggregate token usage and costs across all tests
+- `buildAgentPrompt()`: Build user message array from test definition
+- `simulateCacheSavings()`: Simulate cache savings using growing prefix model
+
+### Reference Verification
+
+The `lib/verify-references.ts` module verifies reference implementations:
+
+**Key Functions:**
+
+- `loadTestDefinitions()`: Discover test suites in `tests/` directory
+- `copyReferenceToComponent()`: Copy Reference.svelte to Component.svelte temporarily
+- `cleanupComponent()`: Remove temporary Component.svelte file
+- `runTest()`: Execute tests and collect detailed results
+- `printSummary()`: Display verification results summary
+- `verifyAllReferences()`: Main function that orchestrates entire verification workflow
+
+**Workflow:**
+
+1. Discover all test suites with Reference.svelte
+2. For each test:
+   - Copy Reference.svelte → Component.svelte
+   - Run vitest against the test
+   - Collect pass/fail results
+   - Cleanup Component.svelte
+3. Print summary of all results
+4. Return exit code (0 for success, 1 for failures)
+
+Used by `verify-references.ts` script accessible via `bun run verify-tests`.
 
 ### Key Technologies
 
@@ -185,16 +326,24 @@ The project uses `@ai-sdk/mcp` with a custom patch applied via `patch-package`:
    f. Test results are collected (pass/fail, error details)
    g. Output directory is cleaned up
 5. Results aggregated with pricing calculations
-6. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json`
-7. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html`
-8. Report automatically opens in default browser
+6. Cache simulation estimates potential savings
+7. Results written to `results/result-YYYY-MM-DD-HH-MM-SS.json`
+8. HTML report generated at `results/result-YYYY-MM-DD-HH-MM-SS.html`
+9. Report automatically opens in default browser
 
 ### Output Files
 
 All results are saved in the `results/` directory with timestamped filenames:
 
 - **JSON files**: `result-2024-12-07-14-30-45.json` - Complete execution trace
 - **HTML files**: `result-2024-12-07-14-30-45.html` - Interactive visualization
+- **Index file**: `index.html` - Dashboard showing all benchmark results with score, pass/fail rates, and costs
+
+**Report Generation:**
+
+- `generate-report.ts`: Converts individual JSON result files to HTML reports
+- `generate-index.ts`: Creates an index.html dashboard from all result JSON files
+- `bun run build`: Runs both scripts to generate all reports
 
 **Multi-Test Result JSON Structure:**
 
@@ -227,7 +376,8 @@ All results are saved in the `results/` directory with timestamped filenames:
     "pricing": {
       "inputCostPerMTok": 3,
       "outputCostPerMTok": 15,
-      "cacheReadCostPerMTok": 0.3
+      "cacheReadCostPerMTok": 0.3,
+      "cacheCreationCostPerMTok": 3.75
     },
     "totalCost": {
       "inputCost": 0.003,
@@ -237,6 +387,11 @@ All results are saved in the `results/` directory with timestamped filenames:
       "inputTokens": 1000,
       "outputTokens": 1000,
       "cachedInputTokens": 1000
+    },
+    "cacheSimulation": {
+      "simulatedCostWithCache": 0.015,
+      "cacheHits": 2000,
+      "cacheWriteTokens": 1500
     }
   }
 }
@@ -247,12 +402,15 @@ All results are saved in the `results/` directory with timestamped filenames:
 Unit tests for library modules are in `lib/*.test.ts`:
 
 - `lib/pricing.test.ts` - Pricing extraction, calculation, formatting
-- `lib/test-discovery.test.ts` - Test suite discovery and prompt building
-- `lib/output-test-runner.test.ts` - Output directory management
+- `lib/output-test-runner.test.ts` - Output directory management and test execution
+- `lib/validator-runner.test.ts` - Validator test runner
+- `lib/report.test.ts` - Report generation
 - `lib/tools/result-write.test.ts` - ResultWrite tool behavior
 - `lib/tools/test-component.test.ts` - TestComponent tool behavior
+- `lib/utils.test.ts` - Utility functions, cost calculation, cache simulation
 
-Run unit tests with: `bun run test:self`
+Run all unit tests with: `bun test`
+Run specific test file: `bun test lib/utils.test.ts`
 
 ## TypeScript Configuration
 
@@ -277,9 +435,10 @@ Run unit tests with: `bun run test:self`
 - All result files are saved with timestamps to preserve historical benchmarks
 - MCP integration can be configured via interactive CLI without code changes
 - MCP status is clearly indicated in both the JSON metadata and HTML report with a visual badge
+- Cache simulation shows estimated savings if prompt caching were enabled
 - Exit code is 0 if all tests pass, 1 if any tests fail
 - Pricing is fetched from Vercel AI Gateway model metadata at runtime
 
-## Self-tests
+## Important notes
 
-For running the self-test test suite, run `bun test`
+Always run `bun tsc --noEmit` and `bun test` before completing work to make sure the TypeScript types and tests pass.