Vimgolf/James4ever0 · BenchFlow

mirrored 3 minutes ago
Benchmark Card Files and versions Leaderboard
Your Namepatch difference between vimgolf terminal bench adaptor ba5fca2
import vimgolf_gym
import vimgolf_gym.dataclasses
import vimgolf_gym.lib
import vimgolf.vimgolf
import json
import argparse
import pydantic
import hashlib
import shlex
import subprocess
import shutil
import tempfile
import os
import pathlib


class TerminalBenchAdaptorSolution(pydantic.BaseModel):
    task_id: str
    trial_name: str
    challenge_hash: str
    solution: str


class VimGolfBenchmarkSolution(pydantic.BaseModel):
    task_id: str
    dataset_name: str
    trial_name: str
    start_time: float
    elapsed_time: float
    solution: str
    input_content: str
    output_content: str


def extract_first_non_empty_line(content: str) -> str:
    content = vimgolf.vimgolf.format_(content)
    lines = content.split("\n")
    for line in lines:
        if line.strip():
            return line
    return ""


def prepare_input(
    solution_format: str, solution
) -> vimgolf_gym.dataclasses.VimGolfCustomChallenge:
    if solution_format == "terminal-bench-adaptor":
        solution = TerminalBenchAdaptorSolution.parse_obj(solution)
        solution.solution = extract_first_non_empty_line(solution.solution)
        challenge_definition = vimgolf_gym.lib.get_local_challenge_definition(
            solution.challenge_hash
        )
        challenge_input = challenge_definition.input.data
        challenge_output = challenge_definition.output.data
        solution.challenge_hash
        return vimgolf_gym.dataclasses.VimGolfCustomChallenge(
            input=challenge_input,
            output=challenge_output,
            solution=solution.solution,
            name=solution.trial_name,
        )
    elif solution_format == "vimgolf-benchmark":
        solution = VimGolfBenchmarkSolution.parse_obj(solution)
        solution.solution = extract_first_non_empty_line(solution.solution)
        return vimgolf_gym.dataclasses.VimGolfCustomChallenge(
            input=solution.input_content,
            output=solution.output_content,
            solution=solution.solution,
            name=solution.trial_name,
        )
    else:
        raise ValueError(f"Unknown solution format: {solution_format}")


def run_vimgolf_local(custom_challenge: vimgolf_gym.dataclasses.VimGolfCustomChallenge):
    validated = False
    with vimgolf_gym.make(
        "vimgolf-custom",
        custom_challenge=custom_challenge,
    ) as env:
        if custom_challenge.solution:
            validated = env.verify_keys(custom_challenge.solution)
    return validated


def run_vimgolf_docker(
    custom_challenge: vimgolf_gym.dataclasses.VimGolfCustomChallenge,
):
    validated = False
    with vimgolf_gym.make(
        "vimgolf-custom", custom_challenge=custom_challenge, use_docker=True
    ) as env:
        if custom_challenge.solution:
            validated = env.verify_keys(custom_challenge.solution)
    return validated


def sha256_checksum(content: str) -> str:
    sha256 = hashlib.sha256()
    sha256.update(content.encode("utf-8"))
    return sha256.hexdigest()


def assert_docker_privilege():
    assert shutil.which("docker"), "Docker not found in PATH"
    # assert user is in docker group or has permission to run docker without sudo
    assert (
        os.geteuid() == 0
        or subprocess.run(["groups"], capture_output=True, text=True).stdout.find(
            "docker"
        )
        != -1
    ), "User does not have permission to run Docker commands"


def run_vimgolf_validator(
    custom_challenge: vimgolf_gym.dataclasses.VimGolfCustomChallenge,
):
    validated = False
    tainted = False
    input_content = vimgolf.vimgolf.format_(custom_challenge.input)
    output_content = vimgolf.vimgolf.format_(custom_challenge.output)
    solution_keys = custom_challenge.solution
    if not solution_keys:
        print("Empty solution keys")
        return validated

    with tempfile.TemporaryDirectory() as tmpdir:
        input_file_relpath = "input.txt"
        solution_file_relpath = "solution.txt"
        cmd = shlex.split(
            f"docker run --rm -v {tmpdir}:/verifier-input --entrypoint python --network=none agile4im/vimgolf-verifier:v0.0.3 /app/vimgolf-verifier.py single_shot"
        ) + [
            "--input_content",
            "/verifier-input/" + input_file_relpath,
            "--solution_keys",
            "/verifier-input/" + solution_file_relpath,
            "--load_from_path",
            "--remove_load_paths",
        ]
        docker_input_file_path = pathlib.Path(tmpdir) / input_file_relpath
        docker_input_file_path.write_text(input_content)
        docker_solution_file_path = pathlib.Path(tmpdir) / solution_file_relpath
        docker_solution_file_path.write_text(solution_keys)
        try:
            output = subprocess.check_output(cmd, timeout=15.0)  # type: ignore
            output = json.loads(output)
            checksum_server = output["checksum"]
            checksum_output = sha256_checksum(output_content)
            validated = checksum_server == checksum_output
            try:
                # confirm deletion
                assert not docker_input_file_path.is_file()
                assert not docker_solution_file_path.is_file()
            except AssertionError:
                tainted = True
        except subprocess.CalledProcessError:
            pass
        except subprocess.TimeoutExpired:
            pass
        except json.JSONDecodeError:
            pass
        except KeyError:
            pass
    if tainted:
        print("Warning: validator container is tainted")
    return validated


class Evaluator:
    def __init__(
        self,
        solution_format: str,
        jsonl_file: str,
        validator: str,
        solution_not_longer_than_output: bool,
    ):
        self.solution_format = solution_format
        self.jsonl_file = jsonl_file
        self.validator = validator
        self.solution_not_longer_than_output = solution_not_longer_than_output

        if self.validator == "vimgolf-validator":
            assert_docker_privilege()

    def _evaluate_single(
        self,
        custom_challenge: vimgolf_gym.dataclasses.VimGolfCustomChallenge,
        original_solution_object: dict,
    ) -> bool:
        assert custom_challenge.name, "Challenge name is required"
        print("Challenge ID:", custom_challenge.name)
        print("Checking solution:", repr(custom_challenge.solution))
        # evaluate the solution
        validated = False
        if self.solution_not_longer_than_output:
            if self.solution_format == "terminal-bench-adaptor":
                key_length = len(
                    vimgolf.vimgolf.tokenize_keycode_reprs(
                        original_solution_object["solution"]
                    )
                )
            else:
                key_length = len(
                    vimgolf.vimgolf.tokenize_keycode_reprs(custom_challenge.solution)
                )
            if key_length > len(custom_challenge.output):
                print(
                    "Invalidate solution: solution is longer than output (%s > %s)"
                    % (key_length, len(custom_challenge.output))
                )
                return False
        if custom_challenge.solution:
            if self.validator == "vimgolf-local":
                validated = run_vimgolf_local(custom_challenge)
            elif self.validator == "vimgolf-docker":
                validated = run_vimgolf_docker(custom_challenge)
            elif self.validator == "vimgolf-validator":
                validated = run_vimgolf_validator(custom_challenge)
            else:
                raise ValueError("Unknown validator:", self.validator)
        else:
            pass
        return validated

    def evaluate(self):
        results = []
        working_solutions = []
        validated_ids = []
        invalidated_ids = []
        challenges: list[
            tuple[vimgolf_gym.dataclasses.VimGolfCustomChallenge, dict]
        ] = []
        with open(self.jsonl_file, "r") as f:
            for line in f:
                original_solution_object = json.loads(line)
                # use solution_format to parse the solution
                custom_challenge = prepare_input(
                    solution_format=self.solution_format,
                    solution=original_solution_object,
                )
                challenges.append((custom_challenge, original_solution_object))
        print("Evaluating", len(challenges), "solutions")
        for index, (custom_challenge, original_solution_object) in enumerate(
            challenges
        ):
            print("Processing item (%s/%s)" % (index + 1, len(challenges)))
            # retrieve the evaluation result
            validated = self._evaluate_single(
                custom_challenge=custom_challenge,
                original_solution_object=original_solution_object,
            )
            results.append(validated)
            if validated:
                working_solutions.append(custom_challenge.solution)
                validated_ids.append(custom_challenge.name)
            else:
                invalidated_ids.append(custom_challenge.name)
            print(
                "Result for item (%s/%s) is %s"
                % (index + 1, len(challenges), validated)
            )
        print("Evaluation finished")
        print("Working solutions (%s):" % len(working_solutions))
        for it in working_solutions:
            print(repr(it))
        return dict(
            results=results,
            validated_ids=validated_ids,
            invalidated_ids=invalidated_ids,
            working_solutions=working_solutions,
        )


def calculate_stats(results: list[bool]):
    return {
        "total": len(results),
        "validated": sum(results),
        "invalidated": len(results) - sum(results),
        "pass_rate": (sum(results) / len(results)) * 100,
    }


def main():

    parser = argparse.ArgumentParser(
        description="Evaluate a batch of VimGolf solutions."
    )
    # evaluation format for parsing: terminal-bench-adaptor
    parser.add_argument(
        "--solution-format",
        type=str,
        help="Formating style of the JSONL file.",
        required=True,
    )
    # jsonl filepath
    parser.add_argument(
        "--jsonl-file",
        type=str,
        help="Path to the JSONL file containing the VimGolf solutions.",
        required=True,
    )
    # validator type
    parser.add_argument(
        "--validator",
        type=str,
        help="Which validator to use for scoring solutions: vimgolf-local, vimgolf-docker, vimgolf-validator",
        required=True,
    )
    parser.add_argument(
        "--solution-not-longer-than-output",
        action="store_true",
        help="Filter solutions that are longer than the output. This is done by comparing the the key count in solution to the length of the output",
    )
    parser.add_argument(
        "--result-savepath",
        type=str,
        required=True,
        help="Path to the result file as JSON.",
    )

    args = parser.parse_args()

    solution_format = args.solution_format
    jsonl_filepath = args.jsonl_file
    validator = args.validator
    result_savepath = args.result_savepath

    evaluator = Evaluator(
        solution_format=solution_format,
        jsonl_file=jsonl_filepath,
        validator=validator,
        solution_not_longer_than_output=args.solution_not_longer_than_output,
    )
    eval_result = evaluator.evaluate()
    print("Eval result:")
    print(eval_result)
    results = eval_result["results"]
    stats = calculate_stats(results)
    print("Statistics:")
    print(json.dumps(stats, indent=4))
    # save everything to json
    data = dict(eval_result=eval_result, stats=stats)
    with open(result_savepath, "w") as f:
        f.write(json.dumps(data, indent=4))
    print("Result saved to:", result_savepath)


if __name__ == "__main__":
    main()