{
  "scenario": "legal-citation",
  "scenario_name": "Legal Citation Accuracy",
  "scenario_description": "50-question SEC + EU AI Act Q&A pack with primary-source ground truth",
  "date": "2026-05-06",
  "stack_id": "langchain_gpt4o",
  "stack_name": "LangChain + GPT-4o",
  "is_simulated": true,
  "simulation_note": "Competitor framework orchestration is not modeled in this run. The identical underlying model (gpt-4o-2024-11-20) is used with each stack's typical system prompt to establish baseline model-level performance. Actual competitor performance varies based on framework overhead, multi-agent coordination latency, and framework-specific prompt engineering.",
  "parameters": {
    "model": "gpt-4o-2024-11-20",
    "temperature": 0,
    "max_tokens": 1024,
    "n_runs": 10,
    "n_items": 50,
    "eval_rubric": "scripts/benchmarks/eval/rubrics/legal-citation.js",
    "hardware": "single Node.js process, sequential runs"
  },
  "metrics": {
    "accuracy_pct": {
      "mean": 72.00581461639818,
      "stdev": 2.2792455365791517
    },
    "hallucination_rate_pct": {
      "mean": 8,
      "stdev": 0
    },
    "citation_grounding_pct": {
      "mean": 56.164535400790584,
      "stdev": 2.507170090237067
    },
    "mean_latency_ms": {
      "mean": 2770.700571405331,
      "stdev": 328.0776077157166
    },
    "mean_token_cost_usd": {
      "mean": 0.0016716507305810124,
      "stdev": 0.00003574071507389783
    },
    "total_wall_clock_s": 1385.3502857026656,
    "n_runs": 10,
    "n_items": 50
  },
  "runs_summary": [
    {
      "run_index": 0,
      "accuracy_pct": 69.8137785496126,
      "hallucination_count": 4,
      "mean_latency_ms": 2632.1131045190596,
      "total_token_cost_usd": 0.0812145776937989
    },
    {
      "run_index": 1,
      "accuracy_pct": 73.3389177356167,
      "hallucination_count": 4,
      "mean_latency_ms": 2391.0619787575006,
      "total_token_cost_usd": 0.08401169013562884
    },
    {
      "run_index": 2,
      "accuracy_pct": 74.81441006797156,
      "hallucination_count": 4,
      "mean_latency_ms": 2836.8954678875893,
      "total_token_cost_usd": 0.08161331683189264
    },
    {
      "run_index": 3,
      "accuracy_pct": 73.09999991060904,
      "hallucination_count": 4,
      "mean_latency_ms": 2912.127460650902,
      "total_token_cost_usd": 0.08259144214295006
    },
    {
      "run_index": 4,
      "accuracy_pct": 72.01598886759197,
      "hallucination_count": 4,
      "mean_latency_ms": 2456.387045716625,
      "total_token_cost_usd": 0.08488907682079866
    },
    {
      "run_index": 5,
      "accuracy_pct": 74.58606781957323,
      "hallucination_count": 4,
      "mean_latency_ms": 2692.956169162447,
      "total_token_cost_usd": 0.08745488780561887
    },
    {
      "run_index": 6,
      "accuracy_pct": 68.17535194339102,
      "hallucination_count": 4,
      "mean_latency_ms": 3268.868238294631,
      "total_token_cost_usd": 0.0850664857437223
    },
    {
      "run_index": 7,
      "accuracy_pct": 71.6647731478161,
      "hallucination_count": 4,
      "mean_latency_ms": 2509.7753387213074,
      "total_token_cost_usd": 0.08314861932195722
    },
    {
      "run_index": 8,
      "accuracy_pct": 73.89793367351967,
      "hallucination_count": 4,
      "mean_latency_ms": 3427.2240489932296,
      "total_token_cost_usd": 0.0837259476534417
    },
    {
      "run_index": 9,
      "accuracy_pct": 68.65092444827995,
      "hallucination_count": 4,
      "mean_latency_ms": 2579.596861350017,
      "total_token_cost_usd": 0.08210932114069713
    }
  ],
  "reproduce": {
    "command": "node scripts/benchmarks/run-all.js --scenario=legal-citation --stack=langchain_gpt4o",
    "requirements": "OPENAI_API_KEY env var required",
    "repo": "https://github.com/Polsia-Inc/octomind"
  },
  "signed_at": "2026-05-06T16:02:51.849Z",
  "signer_version": "1.0",
  "key_hint": "embedded",
  "signature": "hmac-sha256:fe20d99d6cc61afaa774c6a98c09e3c9e1cb823167f6fdba8ed9619317458d6d"
}