[
  {
    "run": "2026-02-19T11-51-32-08-00_nf-rag_KnqXv5dBYhsWmNfNY2jtYj",
    "model": "openai/gpt-5.2",
    "solver": "basic_agent",
    "score": 0.31442577030812324,
    "cost": 3.05,
    "task_name": "astabench/nf_rag",
    "total_samples": 34,
    "started_at": "2026-02-19T11:51:32-08:00",
    "completed_at": "2026-02-19T11:52:24-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.3445378151260504,
      "complexity/0-hop": 0.38095238095238093,
      "level/advanced": 0.28431372549019607,
      "complexity/1-hop": 0.32312925170068024,
      "complexity/2-hop": 0.13888888888888887
    },
    "category_scores": {
      "category/AM": 0.08333333333333333,
      "category/CL": 0.3333333333333333,
      "category/AB": 0.6666666666666666,
      "category/CR": 0.1111111111111111,
      "category/GR": 0.6,
      "category/MUT": 0.30952380952380953,
      "category/PI": 0.0
    },
    "frustration_scores": {
      "frustration/low": 0.5555555555555556,
      "frustration/high": 0.2,
      "frustration/moderate": 0.14285714285714285,
      "frustration/very_high": 0.2738095238095238
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.31442577030812324,
        "score_stderr": null,
        "cost": 0.03654886323529412,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.31442577030812324,
        "score_stderr": null,
        "cost": 0.03654886323529412,
        "cost_stderr": 0.006163437988877076
      }
    }
  },
  {
    "run": "2026-02-19T11-47-41-08-00_nf-rag_kNLjACkyqQc6zmF43fBfeY",
    "model": "openai/gpt-5.2",
    "solver": "basic_agent",
    "score": 0.3781862745098039,
    "cost": 2.98,
    "task_name": "astabench/nf_rag",
    "total_samples": 34,
    "started_at": "2026-02-19T11:47:41-08:00",
    "completed_at": "2026-02-19T11:48:43-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.43529411764705883,
      "complexity/0-hop": 0.5285714285714286,
      "level/advanced": 0.321078431372549,
      "complexity/1-hop": 0.33035714285714285,
      "complexity/2-hop": 0.13888888888888887
    },
    "category_scores": {
      "category/CL": 0.39629629629629626,
      "category/AM": 0.16666666666666666,
      "category/AB": 0.6111111111111112,
      "category/CR": 0.1111111111111111,
      "category/GR": 0.625,
      "category/MUT": 0.5,
      "category/PI": 0.0
    },
    "frustration_scores": {
      "frustration/low": 0.6000000000000001,
      "frustration/high": 0.16666666666666669,
      "frustration/very_high": 0.33035714285714285,
      "frustration/moderate": 0.3333333333333333
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.3781862745098039,
        "score_stderr": null,
        "cost": 0.03552187058823529,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.3781862745098039,
        "score_stderr": null,
        "cost": 0.03552187058823529,
        "cost_stderr": 0.004781286595305113
      }
    }
  },
  {
    "run": "2026-02-19T11-30-57-08-00_nf-rag_YU2fvsxkrMY4F45swTbtJu",
    "model": "openai/gpt-5.2",
    "solver": "basic_agent",
    "score": 0.40631808278867104,
    "cost": 3.24,
    "task_name": "astabench/nf_rag",
    "total_samples": 34,
    "started_at": "2026-02-19T11:30:57-08:00",
    "completed_at": "2026-02-19T11:32:11-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.5087145969498911,
      "complexity/0-hop": 0.5462962962962963,
      "level/advanced": 0.303921568627451,
      "complexity/1-hop": 0.40476190476190477,
      "complexity/2-hop": 0.08333333333333333
    },
    "category_scores": {
      "category/AM": 0.16666666666666666,
      "category/CL": 0.31275720164609055,
      "category/AB": 0.6666666666666666,
      "category/GR": 0.8,
      "category/CR": 0.0,
      "category/MUT": 0.6666666666666666,
      "category/PI": 0.0
    },
    "frustration_scores": {
      "frustration/low": 0.6111111111111112,
      "frustration/moderate": 0.5246913580246914,
      "frustration/high": 0.2,
      "frustration/very_high": 0.2976190476190476
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.40631808278867104,
        "score_stderr": null,
        "cost": 0.047114313235294115,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.40631808278867104,
        "score_stderr": null,
        "cost": 0.047114313235294115,
        "cost_stderr": 0.005298403402965691
      }
    }
  },
  {
    "run": "2026-02-19T11-27-27-08-00_nf-rag_HyQrsDHKf9t8eqT4gsfBYW",
    "model": "openai/gpt-5.2",
    "solver": "basic_agent",
    "score": 0.38738364032481676,
    "cost": 4.44,
    "task_name": "astabench/nf_rag",
    "total_samples": 34,
    "started_at": "2026-02-19T11:27:27-08:00",
    "completed_at": "2026-02-19T11:28:38-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.4610417904535552,
      "complexity/0-hop": 0.4338624338624339,
      "level/advanced": 0.3137254901960784,
      "complexity/1-hop": 0.4116883116883117,
      "complexity/2-hop": 0.2222222222222222
    },
    "category_scores": {
      "category/CL": 0.3153011597456042,
      "category/AB": 0.3333333333333333,
      "category/AM": 0.3333333333333333,
      "category/GR": 0.6,
      "category/CR": 0.1111111111111111,
      "category/MUT": 0.6666666666666666,
      "category/PI": 0.0
    },
    "frustration_scores": {
      "frustration/low": 0.5555555555555556,
      "frustration/very_high": 0.3095238095238095,
      "frustration/moderate": 0.345679012345679,
      "frustration/high": 0.3527272727272727
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.38738364032481676,
        "score_stderr": null,
        "cost": 0.07421507058823529,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.38738364032481676,
        "score_stderr": null,
        "cost": 0.07421507058823529,
        "cost_stderr": 0.01095161966791244
      }
    }
  },
  {
    "run": "2026-02-19T11-11-43-08-00_nf-rag_Hv67aUohY4J8aZ7vATnmBN",
    "model": "anthropic/claude-sonnet-4-5",
    "solver": "basic_agent",
    "score": 0.7706651287966166,
    "cost": 7.53,
    "task_name": "astabench/nf_rag",
    "total_samples": 34,
    "started_at": "2026-02-19T11:11:43-08:00",
    "completed_at": "2026-02-19T11:16:33-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.7973856209150326,
      "complexity/0-hop": 0.8253968253968254,
      "level/advanced": 0.7439446366782007,
      "complexity/1-hop": 0.6904761904761905,
      "complexity/2-hop": 0.8300653594771242
    },
    "category_scores": {
      "category/CL": 0.7262164124909223,
      "category/AM": 0.5,
      "category/AB": 1.0,
      "category/GR": 0.8,
      "category/MUT": 1.0,
      "category/CR": 0.5555555555555555,
      "category/PI": 1.0
    },
    "frustration_scores": {
      "frustration/low": 0.8395061728395061,
      "frustration/high": 0.4,
      "frustration/very_high": 0.8319327731092436,
      "frustration/moderate": 0.8333333333333334
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.7706651287966166,
        "score_stderr": null,
        "cost": 0.22156247205882354,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.7706651287966166,
        "score_stderr": null,
        "cost": 0.22156247205882354,
        "cost_stderr": 0.018907123562742434
      }
    }
  },
  {
    "run": "2026-02-19T11-11-43-08-00_nf-rag_WjovKuknAH7Zj5MYXfoLp8",
    "model": "anthropic/claude-haiku-4-5",
    "solver": "basic_agent",
    "score": 0.6058823529411765,
    "cost": 2.7,
    "task_name": "astabench/nf_rag",
    "total_samples": 34,
    "started_at": "2026-02-19T11:11:43-08:00",
    "completed_at": "2026-02-19T11:14:07-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.7450980392156863,
      "complexity/0-hop": 0.7023809523809524,
      "level/advanced": 0.4666666666666667,
      "complexity/1-hop": 0.5904761904761904,
      "complexity/2-hop": 0.4166666666666667
    },
    "category_scores": {
      "category/CL": 0.5,
      "category/AB": 1.0,
      "category/AM": 0.5166666666666667,
      "category/CR": 0.0,
      "category/GR": 0.6,
      "category/MUT": 0.8333333333333334,
      "category/PI": 1.0
    },
    "frustration_scores": {
      "frustration/low": 0.8395061728395061,
      "frustration/moderate": 0.6851851851851851,
      "frustration/high": 0.4,
      "frustration/very_high": 0.49523809523809526
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.6058823529411765,
        "score_stderr": null,
        "cost": 0.07941518529411765,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.6058823529411765,
        "score_stderr": null,
        "cost": 0.07941518529411765,
        "cost_stderr": 0.007152929131073515
      }
    }
  },
  {
    "run": "2026-02-19T09-18-56-08-00_nf-rag_CAVnbeZzeQRQYKtKgxYkLa",
    "model": "anthropic/claude-haiku-4-5",
    "solver": "basic_agent",
    "score": 0.587202380952381,
    "cost": 2.77,
    "task_name": "astabench/nf_rag",
    "total_samples": 32,
    "started_at": "2026-02-19T09:18:56-08:00",
    "completed_at": "2026-02-19T09:21:30-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.6621848739495798,
      "complexity/0-hop": 0.6833333333333333,
      "level/advanced": 0.5022222222222222,
      "complexity/1-hop": 0.4908730158730159,
      "complexity/2-hop": 0.5555555555555556
    },
    "category_scores": {
      "category/CL": 0.45185185185185184,
      "category/AB": 0.9444444444444445,
      "category/AM": 0.54,
      "category/CR": 0.16666666666666666,
      "category/GR": 0.6,
      "category/MUT": 0.8095238095238094,
      "category/PI": 0.5
    },
    "frustration_scores": {
      "frustration/low": 0.7111111111111111,
      "frustration/moderate": 0.6428571428571429,
      "frustration/high": 0.45833333333333337,
      "frustration/very_high": 0.5153846153846154
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.587202380952381,
        "score_stderr": null,
        "cost": 0.0865772046875,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.587202380952381,
        "score_stderr": null,
        "cost": 0.0865772046875,
        "cost_stderr": 0.0077560305658304724
      }
    }
  },
  {
    "run": "2026-02-19T09-18-56-08-00_nf-rag_LuiugqwiS6sv9mjyo6YJe3",
    "model": "anthropic/claude-sonnet-4-5",
    "solver": "basic_agent",
    "score": 0.798373440285205,
    "cost": 7.4,
    "task_name": "astabench/nf_rag",
    "total_samples": 32,
    "started_at": "2026-02-19T09:18:56-08:00",
    "completed_at": "2026-02-19T09:23:55-08:00",
    "git_commit": "12084f1",
    "score_stderr": null,
    "difficulty_scores": {
      "level/baseline": 0.7704099821746881,
      "complexity/0-hop": 0.8095238095238095,
      "level/advanced": 0.8300653594771242,
      "complexity/1-hop": 0.7858585858585858,
      "complexity/2-hop": 0.7973856209150326
    },
    "category_scores": {
      "category/CL": 0.6905129728659141,
      "category/AM": 0.8,
      "category/AB": 1.0,
      "category/GR": 0.8,
      "category/CR": 0.6666666666666666,
      "category/MUT": 0.8333333333333334,
      "category/PI": 1.0
    },
    "frustration_scores": {
      "frustration/low": 0.925925925925926,
      "frustration/moderate": 0.6666666666666666,
      "frustration/high": 0.4409090909090909,
      "frustration/very_high": 0.8808446455505279
    },
    "task_stats": {
      "tag/custom": {
        "score": 0.798373440285205,
        "score_stderr": null,
        "cost": 0.2311816125,
        "cost_stderr": null
      },
      "task/nf_rag": {
        "score": 0.798373440285205,
        "score_stderr": null,
        "cost": 0.2311816125,
        "cost_stderr": 0.013664681883304868
      }
    }
  }
]