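"""BenchFlow integration for the HumanEval benchmark.

Implements the BaseBench interface: argument declaration, Docker image
lookup, result and log paths inside the container, and pass@k result
aggregation.
"""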
import os
import json
from typing import Dict, Any
from utils.utils import post_log
from benchflow import BaseBench
from benchflow.schemas import BenchArgs, BenchmarkResult


class HumanEvalBench(BaseBench):
    def __init__(self):
        super().__init__()

    def get_args(self, task_id: str) -> BenchArgs:
        payload = {
            "data": {
                "message": "[HUMANEVAL BENCH]: getting args"
            }
        }
        post_log(payload)
        # Declare the benchmark's required and optional arguments.
        arguments = {
            "required": ["K_VALUES", "N_COMPLETIONS"],
            "optional": ["BASE_URL", "MODEL"]
        }
        return BenchArgs(arguments)

    def get_image_name(self) -> str:
payload = {
"data": {
"message": "[HUMANEVAL BENCH]: getting image name"
}
}
post_log(payload)
return "brabderrahmane/benchflow:humaneval-v1"

    def get_results_dir_in_container(self) -> str:
payload = {
"data": {
"message": "[HUMANEVAL BENCH]: getting results dir"
}
}
post_log(payload)
return "/app/eval_results"

    def get_log_files_dir_in_container(self) -> str:
return "/app/logs"

    def get_result(self, task_id: str) -> BenchmarkResult:
payload = {
"data": {
"message": "[HUMANEVAL BENCH]: getting results"
}
}
post_log(payload)
result_file = os.path.join(self.results_dir, "humaneval_result.json")
        try:
            with open(result_file, 'r') as f:
                result = json.load(f)
            # One JSON object per entry; join with newlines so the log stays readable.
            log = '\n'.join(json.dumps(item, ensure_ascii=False) for item in result)
            def compute_average_pass_k(result):
                """Average the pass@k scores across all tasks.

                Returns a dict mapping each pass@k key to its mean value.
                """
# Dictionaries to sum values and count occurrences for each pass@k key
sums = {}
counts = {}
for task in result:
pass_k = task.get("pass@k", {})
for key, value in pass_k.items():
sums[key] = sums.get(key, 0) + value
counts[key] = counts.get(key, 0) + 1
# Compute average for each key
averages = {key: sums[key] / counts[key] for key in sums}
return averages
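            # Example with hypothetical data:
            #   compute_average_pass_k([{"pass@k": {"pass@1": 0.5}},
            #                           {"pass@k": {"pass@1": 1.0}}])
            #   returns {"pass@1": 0.75}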
return BenchmarkResult(
task_id=task_id,
is_resolved=True,
metrics={"score": compute_average_pass_k(result)},
log={"result": log},
other={},
)
except Exception as e:
return BenchmarkResult(
task_id=task_id,
is_resolved=False,
metrics={"score": 0},
log={"error": str(e)},
other={"error": str(e)},
)

    def get_all_tasks(self, split: str) -> Dict[str, Any]:
payload = {
"data": {
"message": "[HUMANEVAL BENCH]: getting all tasks"
}
}
post_log(payload)
        # Only one task for the HumanEval benchmark.
return {"task_ids": ["0"], "error_message": None}