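"""BenchFlow adapter for SWE-bench Lite.

Wires the SWE-bench evaluation harness (run inside the
kirk2000/benchflow:swebench-v1 image) into BenchFlow's BaseBench interface:
it builds per-instance run arguments, locates the harness's result files,
and lists task ids from the princeton-nlp/SWE-bench_Lite dataset.
"""
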
import json
import os
from typing import Any, Dict

from datasets import Dataset, load_dataset

from benchflow import BaseBench
from benchflow.schemas import BenchArgs, BenchmarkResult

class SwebenchBench(BaseBench):
    def __init__(self):
        super().__init__()

    def get_args(self, task_id: str) -> BenchArgs:
        # Run one instance per container: the instance id doubles as the
        # run id, so results and logs land in task-specific paths that
        # get_result() can find again.
        arguments = {
            "required": [],
            "optional": [
                {"INSTANCE_IDS": task_id},
                {"MAX_WORKERS": 1},
                {"RUN_ID": task_id}
            ]
        }
        return BenchArgs(arguments)
    def get_image_name(self) -> str:
        return "kirk2000/benchflow:swebench-v1"

    def get_results_dir_in_container(self) -> str:
        return "/app/results"

    def get_log_files_dir_in_container(self) -> str:
        return "/app/logs"
    def get_result(self, task_id: str) -> BenchmarkResult:
        # Paths written by the SWE-bench evaluation harness inside the
        # container; RUN_ID == task_id, so the id appears twice in the path.
        results_file = os.path.join(self.results_dir, f"self_model.{task_id}.json")
        model_prediction_file = os.path.join(
            self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/patch.diff"
        )
        report_file = os.path.join(
            self.log_files_dir, f"run_evaluation/{task_id}/self_model/{task_id}/report.json"
        )
        try:
            with open(results_file, 'r') as f:
                result_data = json.load(f)

            # Guard against division by zero if the summary reports no instances.
            total_instances = result_data.get("total_instances", 1)
            resolved_instances = result_data.get("resolved_instances", 0)
            pass_rate = resolved_instances / total_instances if total_instances else 0

            with open(model_prediction_file, 'r') as f:
                model_prediction = f.read()
            with open(report_file, 'r') as f:
                report = json.load(f)

            return BenchmarkResult(
                task_id=task_id,
                # A single-instance run is resolved only at a 100% pass rate;
                # the 0.99 threshold just sidesteps float comparison.
                is_resolved=pass_rate > 0.99,
                metrics={"pass_rate": pass_rate},
                log={"prediction": model_prediction, "report": report},
                other={"details": result_data},
            )
        except Exception as e:
            # Missing or malformed result files mean the run failed;
            # report the task as unresolved and surface the error.
            return BenchmarkResult(
                task_id=task_id,
                is_resolved=False,
                metrics={"pass_rate": 0},
                log={"error": str(e)},
                other={"error": str(e)},
            )
    def get_all_tasks(self, split: str) -> Dict[str, Any]:
        # Task ids come straight from the SWE-bench Lite dataset on the
        # Hugging Face Hub; any download/parse failure is reported back
        # rather than raised.
        try:
            dataset: Dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split=split)
            dataset_ids = [instance["instance_id"] for instance in dataset]
            return {"task_ids": dataset_ids, "error_message": None}
        except Exception as e:
            return {"task_ids": [], "error_message": str(e)}

    def cleanup(self):
        # Nothing to clean up on the host; the harness runs in a container.
        pass
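

# A minimal usage sketch, not part of the adapter itself. It assumes
# BaseBench's __init__ needs no extra arguments, which may not hold for
# every benchflow version; the "test" split and the [:1] slice are
# illustrative only. Fetching the dataset requires network access, and
# get_result() only returns meaningful data after a container run has
# populated self.results_dir and self.log_files_dir.
if __name__ == "__main__":
    bench = SwebenchBench()
    tasks = bench.get_all_tasks(split="test")
    if tasks["error_message"]:
        print(f"failed to list tasks: {tasks['error_message']}")
    else:
        for task_id in tasks["task_ids"][:1]:
            # Inspect the arguments BenchFlow would pass to the container.
            print(bench.get_args(task_id))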