asta-bench/astabench/tools/stateful_python.py at main · smiky2011/asta-bench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import logging
from contextvars import ContextVar

from inspect_ai.tool import tool
from inspect_ai.util import sandbox

from astabench.util.sandbox import SandboxJupyter
from astabench.util.sandbox.static.sandbox_types import (
    JUPYTER_HISTORY_FILE,
    CellHistoryItem,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Context variable that holds the SandboxJupyter instance for the current
# context.  It is created without a default, so it stays unset until
# get_sandbox_jupyter() stores an instance in it.
sandbox_jupyter = ContextVar[SandboxJupyter]("sandbox_jupyter")


async def get_sandbox_jupyter():
    """
    Return the SandboxJupyter stored in the ``sandbox_jupyter`` context
    variable, creating it on first use.

    When the context variable is unset (or holds a falsy value), a new
    ``SandboxJupyter("default")`` is constructed, its ``setup()`` coroutine is
    awaited, and the instance is stored in the context variable before being
    returned.
    """
    existing = sandbox_jupyter.get(None)
    # Truthiness (not an ``is None`` test) decides whether an instance exists.
    if existing:
        return existing

    created = SandboxJupyter("default")
    await created.setup()
    sandbox_jupyter.set(created)
    return sandbox_jupyter.get()


async def exec_python_session(code: str):
    """
    Run python code in a notebook environment.  This tool is stateful and
    maintains variables across calls.  Like Jupyter, it returns the value of
    the final statement in the code as well as any print statements.  It works
    with notebook syntax like "!" to run shell commands.

    Args:
        code: The python code to execute.
    """
    # NOTE(review): the docstring above is presumably what the @tool machinery
    # exposes as the tool description -- confirm before rewording it.

    # The instance comes from the sandbox_jupyter contextvar so that each
    # sample gets its own SandboxJupyter.
    jupyter = await get_sandbox_jupyter()
    output = await jupyter.run_code(code=code)
    return output


@tool
def python_session():
    """
    Tool factory for the stateful python session.

    Returns the ``exec_python_session`` coroutine function, which runs code
    in the SandboxJupyter of the current context.
    """
    session_executor = exec_python_session
    return session_executor


async def continue_wait_impl():
    """
    Continue waiting for the previous code from the stateful python tool to
    finish.  This tool has the same timeout as the execution tool and can be
    used any number of times to allow arbitrary waiting.  It will return any
    subsequent output from the command.

    This will only work if the previous code timed out and is still running
    (not interrupted).
    """
    # TODO: not usable yet, so this coroutine always raises.  The intended
    # implementation awaits continue_after_timeout() on the SandboxJupyter
    # returned by get_sandbox_jupyter() and returns its result.
    reason = "continue_wait can't currently be used as a standalone tool, because the jupyter contextvar is not in a session"
    raise NotImplementedError(reason)


@tool
def continue_wait():
    """
    Tool factory for ``continue_wait_impl``; currently disabled.

    Raises:
        NotImplementedError: Always, because the tool cannot yet be used
            standalone (see the error message for the reason).
    """
    # Once the limitation named in the message is resolved, this factory
    # should return continue_wait_impl instead of raising.  The previous
    # `return` placed after the raise was unreachable and has been removed.
    raise NotImplementedError(
        "continue_wait can't currently be used as a standalone tool, because the jupyter contextvar is not in a session"
    )


async def interrupt_kernel_impl():
    """
    Interrupt the currently running code in the stateful python tool, if it
    previously timed out.  This is typically not needed, because the code is
    automatically interrupted if a new code block is executed.
    """
    # Resolve the SandboxJupyter of the current context first, then ask it to
    # interrupt its kernel; the result of that call is returned unchanged.
    jupyter = await get_sandbox_jupyter()
    return await jupyter.interrupt_kernel()


@tool
def interrupt_kernel():
    """
    Tool factory for interrupting the stateful python session.

    Returns the ``interrupt_kernel_impl`` coroutine function.
    """
    interrupter = interrupt_kernel_impl
    return interrupter


async def get_cell_history(
    filename: str = JUPYTER_HISTORY_FILE,
) -> list[CellHistoryItem] | None:
    """
    Load the recorded code-cell history from the sandbox.

    The file is read through ``sandbox().read_file``, parsed as JSON, and each
    parsed entry is expanded as keyword arguments into a ``CellHistoryItem``.

    Args:
        filename: Path of the history file inside the sandbox.

    Returns:
        The list of history items, or ``None`` (after logging a warning) when
        reading raises ``FileNotFoundError``.
    """
    history: list[CellHistoryItem] | None = None
    try:
        payload = await sandbox().read_file(filename)
        history = [CellHistoryItem(**entry) for entry in json.loads(payload)]
    except FileNotFoundError:
        # A missing history file is reported but not treated as fatal.
        logger.warning(f"Missing {filename} in sandbox; can't score trajectory metrics")
    return history