File size: 6,092 Bytes
4c346eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import re
from collections.abc import Callable, MutableMapping
from dataclasses import dataclass, field
from functools import wraps
from itertools import starmap
from typing import Any, ParamSpec, TypeVar, cast

from ether0.model_prompts import (
    ANSWER_END,
    ANSWER_START,
    THINK_END,
    THINK_START,
    ProblemPrompt,
    SysPrompt,
    extract_answer_loose,
)
from ether0.rewards import accuracy_reward, format_reward

P = ParamSpec("P")
R = TypeVar("R")


def wrap_reward_func(func: Callable[P, R], **wrap_kwargs: Any) -> Callable[P, R]:
    @wraps(func)  # needed by GRPOTrainer for logging
    def wrapped(*args: P.args, **kwargs: P.kwargs) -> R:
        return func(*args, **wrap_kwargs, **kwargs)

    return wrapped


@dataclass
class ChatArguments:
    """Arguments for making a chat conversation for SFT or RL training."""

    sys_prompt: SysPrompt | None = field(
        default=None,
        metadata={
            "help": (
                "If provided, use this system prompt. If not provided, the chat"
                " template may inject one."
            )
        },
    )

    problem_prompt: ProblemPrompt = field(
        default=ProblemPrompt.NONE,
        metadata={
            "help": (
                "Prompt to put before the problem in the first user message, relevant"
                " for both RL or SFT. Make sure this matches between SFT and RL, so if"
                " the SFT'd model wasn't passed this during SFT, don't pass this to RL."
            )
        },
    )

    reasoning: bool = field(
        default=True,
        metadata={
            "help": (
                "If True (default), it is assumed that the model's response contains"
                f" reasoning enclosed in `{THINK_START}` and `{THINK_END}`."
            )
        },
    )

    def make_rl_conversation(
        self, row: MutableMapping[str, str | list[str]]
    ) -> dict[str, list[dict] | list[list[dict]]]:
        """Format a dataset row into a chat-like conversation structure for RL.

        Args:
            row: Dataset row (or batched row) containing a "problem" key whose
                value is a single problem string or a list of problem strings.

        Returns:
            Mapping with a "prompt" key holding the conversation (a list of
            messages for a single row, or a list of such lists for a batch).
            Unlike make_sft_conversation, the answer is not included.
        """
        if not self.sys_prompt:
            msgs: list[dict] = []
        else:
            msgs = [{
                "role": "system",
                "content": SysPrompt(self.sys_prompt).get_sys_prompt(),
            }]
        problem_prompt = ProblemPrompt(self.problem_prompt).get_prompt()
        if problem_prompt:
            # Blank line visually separates the instruction prompt from the problem
            problem_prompt += "\n\n"

        def add_user(problem: str) -> list[dict]:
            return [*msgs, {"role": "user", "content": problem_prompt + problem}]

        if isinstance(row["problem"], str):  # Single
            all_msgs: list[dict] | list[list[dict]] = add_user(row["problem"])
        else:  # Batched
            all_msgs = [add_user(p) for p in row["problem"]]
        return {"prompt": all_msgs}

    def make_sft_conversation(
        self, row: MutableMapping[str, str | list[str]]
    ) -> dict[str, list[dict] | list[list[dict]]]:
        """Format a dataset row into a chat-like conversation structure for SFT.

        Args:
            row: Dataset row (or batched row) with "problem", "answer", and
                "thought" keys (each a string, or a list of strings if batched).

        Returns:
            Mapping with a "messages" key holding the full conversation,
            including the assistant turn with the answer (and optional thought).

        Raises:
            ValueError: If reasoning is enabled while the problem prompt only
                asks for answer XML, or if an answer wrapped in XML tags cannot
                be reduced to a bare answer.
        """
        if (
            self.reasoning
            and ProblemPrompt(self.problem_prompt) == ProblemPrompt.ANSWER
        ):
            raise ValueError(
                "It does not make sense to include reasoning in the SFT traces,"
                " but then only prompt about answer XML (without thoughts)."
            )

        def add_assistant(
            raw_answer: str, thought: str, prior_msgs: list[dict]
        ) -> list[dict]:
            # Plain substring test — the original regex searched for this exact
            # literal (with a redundant `\/` escape), so behavior is unchanged.
            if "</answer>" in raw_answer:
                # Remove prelude and postlude plus XML tags,
                # because an OpenRouter-hosted DeepSeek R1 can give answer
                # with a prelude and XML tags, but our training expects just an answer
                # > The reaction involves sodium borohydride ([BH4-].[Na+]), <redacted>.
                # > Under these conditions, <redacted>.
                # > <answer>N1(CCOCC1)C1=CC=C(C(O))C=C1</answer>
                answer = extract_answer_loose(raw_answer)
                if not answer:
                    raise ValueError(
                        "Failed to extract just the answer from the answer"
                        f" {raw_answer!r}."
                    )
            else:
                answer = raw_answer

            return [
                *prior_msgs,
                {
                    "role": "assistant",
                    "content": (
                        (f"{THINK_START}{thought}{THINK_END}" if self.reasoning else "")
                        + f"{ANSWER_START}{answer}{ANSWER_END}"
                    ),
                },
            ]

        # The first part will be the same as the RL conversation
        msgs = self.make_rl_conversation(row)["prompt"]
        # Now add the answer, with optional thinking
        if isinstance(row["problem"], str):  # Single
            all_msgs: list[dict] | list[list[dict]] = add_assistant(
                cast(str, row["answer"]),
                cast(str, row["thought"]),
                cast(list[dict], msgs),
            )
        else:  # Batched
            all_msgs = list(
                starmap(
                    add_assistant, zip(row["answer"], row["thought"], msgs, strict=True)
                )
            )
        return {"messages": all_msgs}

    def get_reward_funcs(
        self,
        format_reward_value: float = 1.0,
        soft: bool = False,
        test: bool = False,
        good_molecule_bonus: float = 0.0,
    ) -> list[Callable]:
        """Build the reward functions for RL training.

        Args:
            format_reward_value: Reward granted by the format check.
            soft: Passed through to accuracy_reward.
            test: Passed through to accuracy_reward.
            good_molecule_bonus: Passed through to accuracy_reward.

        Returns:
            Two-element list: the format reward function followed by the
            accuracy reward function, each with its configuration pre-bound.
        """
        return [
            wrap_reward_func(
                format_reward,
                reasoning=self.reasoning,
                reward=format_reward_value,
            ),
            wrap_reward_func(
                accuracy_reward,
                reasoning=self.reasoning,
                soft=soft,
                test=test,
                good_molecule_bonus=good_molecule_bonus,
            ),
        ]