microsoft
diff --git a/‎contrib/agentlightning/contrib/agent/empo2_agent.py‎
Lines changed: 258 additions & 0 deletions b/‎contrib/agentlightning/contrib/agent/empo2_agent.py‎
Lines changed: 258 additions & 0 deletions
diff --git a/‎contrib/agentlightning/contrib/agent/env_agent.py‎
Lines changed: 3 additions & 1 deletion b/‎contrib/agentlightning/contrib/agent/env_agent.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎contrib/agentlightning/contrib/algorithm/env_verl/core_empo2.py‎
Lines changed: 68 additions & 0 deletions b/‎contrib/agentlightning/contrib/algorithm/env_verl/core_empo2.py‎
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,258 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import copy
+import logging
+from typing import Any, Dict
+
+import numpy as np
+import requests
+from add_instruction import add_chat_all_tips, add_chat_instruction
+from agl_envs import make_env_manager
+
+from agentlightning import LLM, NamedResources, Rollout, configure_logger, emit_reward, operation
+from agentlightning.utils.otel import make_link_attributes
+from contrib.agentlightning.contrib.agent.env_agent import EnvAgent
+from contrib.recipes.envs.prompt_builder import HistoryPromptBuilder
+
+# First call uses the helper's defaults; the second creates this module's logger.
+# NOTE(review): the module logger is created with level=logging.ERROR, so the
+# logger.info(...) calls in this file are presumably suppressed — confirm this is intended.
+configure_logger()
+logger = configure_logger(name=__name__, level=logging.ERROR)
+
+
+def do_compress(text):
+    url = "http://127.0.0.1:8000/key_cal/"
+    headers = {"Content-Type": "application/json"}  # 明确指定 JSON 格式
+    data = {"text": text}
+    response = requests.post(url, json=data, headers=headers)  # 使用 json 参数
+    return response.json()
+
+
+# Endpoint that retrieve_memory, reset_memory and add_memory post to.
+url_mem = "http://127.0.0.1:8001/mem/"
+
+
+def retrieve_memory(idx, key):
+    response = requests.post(url_mem, json={"key": key, "idx": idx})
+    count, data = response.json()
+    return count, data
+
+
+def reset_memory(mem_list_num):
+    requests.post(url_mem, json={"key": [], "idx": mem_list_num, "content": "Reset"})  # 用于初始化多个 memory slot
+
+
+def add_memory(idx, key, content, score):
+    requests.post(url_mem, json={"key": key, "idx": idx, "content": content, "score": score})
+
+
+def gather_chats(prompt):
+    chat_list = []
+    for item in prompt:
+        role = item.type
+        content = item.content
+        if "System" in role:
+            continue
+        elif "User" in role:
+            role = "user"
+        else:
+            role = "assistant"
+        chat_list.append(f"{role}: {content}")
+    text = " ".join(chat_list)
+    return text
+
+
+class EMPO2Agent(EnvAgent):
+    def _get_all_tip_prompt(self, prompt, tip_list):
+        prompt_type = self.config.captioner.prompt_type
+        if prompt_type == "chat":
+            return add_chat_all_tips(prompt, tip_list)
+        else:
+            raise ValueError(f"Unsupported prompt_type '{prompt_type}' for _get_tip_obs (expected 'chat')")
+
+    def _get_tip_generation_prompt(self, prompt):
+        return add_chat_instruction(prompt, "tip")
+
+    async def rollout_async(
+        self,
+        task: Dict[str, Any],
+        resources: NamedResources,
+        rollout: Rollout,
+    ) -> float | None:
+        rollout_id = rollout.rollout_id
+        logger.info(f"[Rollout {rollout_id}] Task: {task}")
+
+        reward_scale = float(self.config["reawrd_scale"])
+
+        # Setup LLM + agent
+        llm: LLM = resources.get("main_llm")
+        print("Training with model:", llm.model, "on endpoint:", llm.endpoint)
+        self.agent = self._build_agent(llm, 1.0 if rollout.mode == "train" else 0.4)
+
+        if rollout.mode == "train":
+            train_mode = task["train_mode"]
+            global_steps = task["global_steps"]
+        else:
+            train_mode = "on-policy"
+
+        if rollout.mode == "train" and (train_mode == "off-policy" or train_mode == "on-policy-with-tips"):
+            use_tips = True
+        else:
+            use_tips = False
+
+        variation_idx = task["variation_idx"]
+
+        try:
+            # Setup environment
+            prompt_builder = HistoryPromptBuilder(
+                max_history=self.config.captioner.max_history, prompt_type=self.config.captioner.prompt_type
+            )
+
+            self.env = make_env_manager(self.config.env_name, task, self.config)
+            env_obs, infos, available_actions_hint = self.env.reset()
+
+            prompt_builder.init(self.env)
+            prompt_builder.update_observation(env_obs)
+            # prompt_builder.update_admissible_actions(available_actions_hint)
+
+            prompt = prompt_builder.get_prompt()
+
+            episode_reward, done = 0.0, False
+
+            history_actions_for_mem = []
+            tip_list = []
+            step_count = 0
+            while not done:
+                if use_tips:
+                    text = gather_chats(prompt)
+                    key = (
+                        np.array(do_compress(text)["key"])
+                        .reshape(
+                            -1,
+                        )
+                        .tolist()
+                    )
+                    count, mem_list = retrieve_memory(variation_idx, key)
+                else:
+                    count, mem_list = 0, []
+
+                ret_tips, intrinsic_reward = "", 0.0
+
+                if use_tips:
+                    if count > 0:
+                        ret_tips = "Here are some memories you collected in your previous exploration:\n"
+                        for mem in mem_list:
+                            ret_tips += mem + "\n"
+
+                        tip_list.append(ret_tips)
+                        intrinsic_reward = 1 / (count + 1)
+                    else:
+                        tip_list.append("")
+                        intrinsic_reward = 1
+
+                try:
+                    if use_tips and any(t != "" for t in tip_list):
+                        llm_prompt = self._get_all_tip_prompt(prompt, tip_list)
+                    else:
+                        llm_prompt = prompt
+
+                    instructed_prompt = self._get_instructed_prompt(llm_prompt)
+
+                    # Main agent step
+                    with operation(step_count=step_count):
+                        result = await self.agent._model_client.create(instructed_prompt)
+                    output = result.content
+                    logger.info(f"[LLM output]: {output}")
+
+                except Exception as e:
+                    logger.error(f"[Rollout {rollout_id}] Error during training rollout: {e}", exc_info=True)
+                    break
+
+                env_obs, executed_action, is_valid, step_reward, terminated, truncated, info, available_actions_hint = (
+                    self.env.step(
+                        output,
+                        use_reasoning=self.config.captioner.type == "cot",
+                        use_success_rate=self.config.use_success_rate,
+                    )
+                )
+
+                history_actions_for_mem.append(output)
+
+                action_for_history = output if self.config.get("record_original_action", False) else executed_action
+
+                prompt_builder.update_step_count()
+                prompt_builder.update_action(action_for_history)
+                prompt_builder.update_observation(env_obs)
+                # prompt_builder.update_admissible_actions(available_actions_hint)
+
+                prompt = prompt_builder.get_prompt()
+
+                if rollout.mode == "train":
+                    step_reward = reward_scale * step_reward
+
+                emit_reward(
+                    {
+                        "extrinsic_reward": step_reward,
+                        "intrinsic_reward": intrinsic_reward,
+                    },
+                    primary_key="extrinsic_reward",
+                    attributes=make_link_attributes({"step_count": str(step_count)}),
+                )
+
+                episode_reward += float(step_reward)
+                done = np.logical_or(terminated, truncated)
+
+                step_count += 1
+
+            if rollout.mode == "train":
+                prompt_builder.prompt_type = "chat"
+                prompt_builder.max_history = -1
+                full_prompt = prompt_builder.get_prompt()
+
+                # Add tips as raw text (no tags)
+                if use_tips and len(tip_list) > 0:
+                    tip_base_prompt = copy.deepcopy(full_prompt)
+                    tips_iter = iter(tip_list)
+                    for item in tip_base_prompt:
+                        if "User" in item.type:
+                            tip = next(tips_iter, None)
+                            if tip is None:
+                                break
+                            if tip != "":
+                                item.content += tip
+                else:
+                    tip_base_prompt = full_prompt
+
+                tip_generation_prompt = self._get_tip_generation_prompt(tip_base_prompt)
+
+                self.agent._model_client.max_tokens = 512
+                result = await self.agent._model_client.create(tip_generation_prompt)
+                tips = result.content
+
+                logger.info(f"Tips: {tips}")
+
+                #! Fill the ret and tip, then save memory
+                #! Use final prompt state for ALL steps' keys
+                final_prompt_text = gather_chats(prompt)
+                final_key = (
+                    np.array(do_compress(final_prompt_text)["key"])
+                    .reshape(
+                        -1,
+                    )
+                    .tolist()
+                )
+
+                for i in range(len(history_actions_for_mem)):
+                    max_score = 100 * reward_scale
+                    content = (
+                        tips
+                        + f"; At that timestep, the specific action your took was {history_actions_for_mem[i]}; Eventually you got the score {round(episode_reward, 1)}/{int(max_score)}."
+                    )
+                    score = episode_reward
+                    add_memory(variation_idx, final_key, content, round(score, 1))
+
+            if self.config.use_success_rate:
+                return self.env.get_success_score() * reward_scale
+            else:
+                return episode_reward
+
+        finally:
+            if self.env is not None:
+                self.env.close()
@@ -127,8 +127,10 @@ async def rollout_async(
                     )
                 )
 
+                action_for_history = output if self.config.get("record_original_action", False) else executed_action
+
                 prompt_builder.update_step_count()
-                prompt_builder.update_action(executed_action)
+                prompt_builder.update_action(action_for_history)
                 prompt_builder.update_observation(env_obs)
                 prompt_builder.update_admissible_actions(available_actions_hint)
 
 
@@ -0,0 +1,68 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+from typing import Any, List
+
+import torch
+
+
+def is_sublist(sub, full):
+    n, m = len(sub), len(full)
+    return any(full[i : i + n] == sub for i in range(m - n + 1))
+
+
+# Function to remove segments of a list between a start pattern and an end pattern
+def remove_pattern_ranges(seq: List[Any], start_pat: List[Any], end_pat: List[Any]) -> List[Any]:
+    """Remove every [start_pat ... end_pat] slice (inclusive) from seq."""
+
+    out: List[Any] = []
+    i = 0
+    n = len(seq)
+    ls, le = len(start_pat), len(end_pat)
+
+    while i < n:
+        # Check if the start pattern matches at the current position
+        if i + ls <= n and seq[i : i + ls] == start_pat:
+            # Look for the first occurrence of the end pattern after the start pattern
+            j = i + ls
+            found_end = -1
+            while j + le <= n:
+                if seq[j : j + le] == end_pat:
+                    found_end = j
+                    break  # Stop when the end pattern is found
+                j += 1
+
+            # If the end pattern is found, skip the whole segment from start to end
+            if found_end != -1:
+                i = found_end + le  # Move the index past the end pattern
+                continue  # Skip the current iteration and go to the next
+            else:
+                # If the end pattern is not found, keep the current element and move one step forward
+                out.append(seq[i])
+                i += 1
+        else:
+            # If the start pattern is not found, just append the current element
+            out.append(seq[i])
+            i += 1
+
+    # Return the filtered list with the start-end pattern segments removed
+    return out
+
+
+def low_prob_token_masking(batch, threshold: float = -5.0):
+    response_mask = batch.batch["response_mask"]  # [N, T]
+    old_log_prob = batch.batch["old_log_probs"]
+
+    masked_old_log_prob = old_log_prob.masked_fill(response_mask == 0, 1e9)
+    min_values, _ = torch.min(masked_old_log_prob, dim=1)  # [N]
+
+    mask = min_values < threshold  # [N]
+
+    combined_mask = mask.unsqueeze(1) & (response_mask == 1)
+
+    # advantages masking
+    response_mask = response_mask.masked_fill(combined_mask, 0)
+    batch.batch["response_mask"] = response_mask
+
+    print(f"Number of tokens masked: {combined_mask.sum().item()}")
+
+    return batch
Original file line number	Diff line number	Diff line change
`@@ -127,8 +127,10 @@ async def rollout_async(`
`127`	`127`	`)`
`128`	`128`	`)`
`129`	`129`
	`130`	`+ action_for_history = output if self.config.get("record_original_action", False) else executed_action`
	`131`	`+`
`130`	`132`	`prompt_builder.update_step_count()`
`131`		`- prompt_builder.update_action(executed_action)`
	`133`	`+ prompt_builder.update_action(action_for_history)`
`132`	`134`	`prompt_builder.update_observation(env_obs)`
`133`	`135`	`prompt_builder.update_admissible_actions(available_actions_hint)`
`134`	`136`