Hi, I am trying to reproduce the evaluation performance of reward inference, specifically the cumulative return over 24 tasks reported in the original paper (second row of Figure 3, Rwd 221.9).
I trained from scratch for 80M steps on my local 5090, with buffer_size reduced from 5.12M to 1M. Tracking performance is already good, but running the reward_inference script shown below only gets a return of around -3. I also tried replacing the replay buffer used for z computation with expert motions; I am not sure that part is done correctly, but the return is likewise around -3.
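For reference, the only training-side change was shrinking the replay buffer so it fits on a single 5090. A sketch of that change is below; the exact key path is my assumption from my local config and may differ in the released configs:

```python
# Sketch of my training config change (hypothetical key path; adjust to your layout).
config["agent"]["buffer_size"] = 1_000_000  # reduced from the default 5_120_000
```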
```python
model_folder = Path(model_folder)
video_folder = Path(video_folder) if video_folder is not None else model_folder / "reward_inference" / "videos"
video_folder.mkdir(parents=True, exist_ok=True)
model = load_model_from_checkpoint_dir(model_folder / "checkpoint", device=device)
model.to(device)
model.eval()
model_name = model.__class__.__name__
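# Load the training config and repoint the motion data path if necessary.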
with open(model_folder / "config.json", "r") as f:
    config = json.load(f)
if data_path is not None:
    config["env"]["lafan_tail_path"] = str(data_path)
if not Path(config["env"]["lafan_tail_path"]).exists():
    config["env"]["lafan_tail_path"] = "data/lafan_29dof.pkl"
config["env"]["hydra_overrides"].append("env.config.max_episode_length_s=10000")
config["env"]["hydra_overrides"].append(f"env.config.headless={headless}")
# config["env"]["hydra_overrides"].append("env.config.lie_down_init=True")
# config["env"]["hydra_overrides"].append("env.config.lie_down_init_prob=1")
config["env"]["hydra_overrides"].append(f"simulator={simulator}")
config["env"]["disable_domain_randomization"] = disable_dr
config["env"]["disable_obs_noise"] = disable_obs_noise
rich.print(config["env"])
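# Build a single evaluation environment from the patched training config.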
num_envs = 1
env_cfg = HumanoidVerseIsaacConfig(**config["env"])
wrapped_env, _ = env_cfg.build(num_envs)
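# Export the meta-policy to ONNX (the rollouts below use the PyTorch model directly).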
output_dir = model_folder / "exported"
output_dir.mkdir(parents=True, exist_ok=True)
export_meta_policy_as_onnx(
    model,
    output_dir,
    f"{model_name}.onnx",
    {"actor_obs": torch.randn(1, model._actor.input_filter.output_space.shape[0] + model.cfg.archi.z_dim)},
    z_dim=model.cfg.archi.z_dim,
    history=("history_actor" in model.cfg.archi.actor.input_filter.key),
    use_29dof=True,
)
print(f"Exported model to {output_dir}/{model_name}.onnx")
tasks = [
    # stand
    "move-ego-0-0",
    "move-ego-low0.5-0-0",
    # locomotion medium
    "move-ego-0-0.7",
    # "move-ego-90-0.7",
    # "move-ego-180-0.7",
    # "move-ego--90-0.7",
    # "move-ego-low0.6-0-0.7",
    # locomotion slow
    "move-ego-0-0.3",
    "move-ego-90-0.3",
    "move-ego-180-0.3",
    "move-ego--90-0.3",
    # locomotion fast
    # "move-ego-0-1",
    # "move-ego-90-1",
    # "move-ego-180-1",
    # "move-ego--90-1",
    # spin
    "rotate-z-5-0.5",
    "rotate-z--5-0.5",
    # raise arms
    "raisearms-l-l",
    "raisearms-l-m",
    "raisearms-m-l",
    "raisearms-m-m",
    # move + arms
    "move-arms-0-0.7-m-m",
    "move-arms-90-0.7-m-m",
    "move-arms-180-0.4-m-m",
    "move-arms--90-0.7-m-m",
    "move-arms-0-0.7-l-m",
    "move-arms-90-0.7-l-m",
    "move-arms-180-0.4-l-m",
    "move-arms--90-0.7-l-m",
    "move-arms-0-0.7-m-l",
    "move-arms-90-0.7-m-l",
    "move-arms-180-0.4-m-l",
    "move-arms--90-0.7-m-l",
    "move-arms-0-0.7-l-l",
    "move-arms-90-0.7-l-l",
    "move-arms-180-0.4-l-l",
    "move-arms--90-0.7-l-l",
    # spin + arms
    "spin-arms-5-l-l",
    "spin-arms--5-l-l",
    "spin-arms-5-l-m",
    "spin-arms--5-l-m",
    # "spin-arms-5-m-m",
    # "spin-arms--5-m-m",
    "spin-arms-5-m-l",
    "spin-arms--5-m-l",
    # sit
    "crouch-0",
    "crouch-0.25",
    "sitonground",
]
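# Load the buffer that reward inference draws samples from.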
print("Loading the replay buffer...", end=" ", flush=True)
start_t = time.time()
buffer_path = model_folder / "checkpoint/buffers/train_reduced"
if buffer_path.is_dir():
    # Load the reduced buffer if it exists.
    dataset = DictBuffer.load(buffer_path, device="cpu")
    print("Loaded reduced buffer")
else:
    # Otherwise fall back to the original dataset.
    buffer_path = model_folder / "checkpoint/buffers/train"
    # buffer_path = "/home/yulei/play_ground/BFM-Zero/humanoidverse/data/lafan_29dof.pkl"
    dataset = TrajectoryDictBufferMultiDim.load(buffer_path, device="cpu")
    print("Loaded original buffer")
# NOTE: also tried using expert motions instead of the replay buffer for z inference:
# dataset = load_expert_trajectories_from_motion_lib(
#     env=wrapped_env._env,
#     agent_cfg=config["agent"],
#     device="cpu",
# )
# dataset = fast_load_buffer(model_folder / "checkpoint/buffers/train", device="cpu")
print(f"done in {time.time() - start_t}s")
inference_function = "reward_wr_inference"
reward_eval_agent = RewardWrapperHV(
    model=model,
    inference_dataset=dataset,
    num_samples_per_inference=num_samples,
    inference_function=inference_function,
    max_workers=24,
    process_executor=True,
    env_model=str(HUMANOIDVERSE_DIR / "data" / "robots" / "g1" / "scene_29dof_freebase_noadditional_actuators.xml"),
)
z_dict = {}
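# Infer one z per task, repeated n_inferences times.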
for r in range(n_inferences):
    for task in tasks:
        print(f"Started inference for {task}...", end=" ", flush=True)
        start_t = time.time()
        z = reward_eval_agent.reward_inference(task=task)
        z_dict[task] = z_dict.get(task, []) + [z.cpu()]
        print(f"done in {time.time() - start_t}s")
path = model_folder / "reward_inference"
path.mkdir(exist_ok=True)
with open(os.path.join(path, "reward_locomotion.pkl"), "wb") as f:
    joblib.dump(z_dict, f)
print(f"Saved file at {path}/reward_locomotion.pkl")
# z_dict = joblib.load(model_folder / "reward_inference/reward_locomotion.pkl")
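# Roll out each inferred z and accumulate undiscounted episode returns.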
if not skip_rollouts:
    print("Generating videos...")
    if save_mp4:
        rgb_renderer = IsaacRendererWithMuJoco(render_size=256)
    reward_inference_path = model_folder / "reward_inference"
    reward_inference_path.mkdir(exist_ok=True)
    task_return_stats = {}
    all_episode_returns = []
    for task in tasks:
        frames = []
        episode_returns = []
        for z in z_dict[task]:
            z = z.repeat(num_envs, 1).to(device)
            observation, info = wrapped_env.reset(to_numpy=False, reset_to_default_pose=True)
            if save_mp4:
                frames.append(rgb_renderer.render(wrapped_env._env, 0)[0])
            episode_return = torch.zeros(num_envs, device=device, dtype=torch.float32)
            for i in range(episode_length):
                action = model.act(observation, z, mean=True)
                observation, reward, terminated, truncated, info = wrapped_env.step(action, to_numpy=False)
                episode_return += reward.to(device=device, dtype=torch.float32)
                # Finish the episode early if all envs are done.
                episode_done = bool(torch.all(terminated | truncated).item())
                if save_mp4:
                    frames.append(rgb_renderer.render(wrapped_env._env, 0)[0])
                if episode_done:
                    break
            # episode_return_value = float(episode_return.mean().item())
            episode_return_value = float(episode_return.item())  # num_envs == 1, so .item() is safe
            episode_returns.append(episode_return_value)
            # all_episode_returns.append(episode_return_value)
        task_return_stats[task] = {
            "num_episodes": len(episode_returns),
            "episode_returns": episode_returns,
            "sum_episode_returns": float(sum(episode_returns)),
            "mean_episode_return": float(sum(episode_returns) / len(episode_returns)) if episode_returns else 0.0,
        }
        print(
            f"[{task}] cumulative return over {len(episode_returns)} episodes: "
            f"{task_return_stats[task]['sum_episode_returns']:.4f} "
            f"(mean per episode: {task_return_stats[task]['mean_episode_return']:.4f})"
        )
        all_episode_returns.append(task_return_stats[task]["mean_episode_return"])
        if save_mp4:
            file = video_folder / f"{task}.mp4"
            media.write_video(file, frames, fps=50)
            print(f"Saved video for {task}: {file}")
    # Aggregate per-task mean returns into a global summary.
    global_return_stats = {
        "num_tasks": len(task_return_stats),
        "num_total_episodes": len(all_episode_returns),  # NOTE: one entry per task (per-task means)
        "sum_episode_returns_over_tasks": float(sum(all_episode_returns)),
        "mean_episode_return_over_tasks": float(sum(all_episode_returns) / len(all_episode_returns)) if all_episode_returns else 0.0,
    }
    reward_return_summary = {
        "per_task": task_return_stats,
        "global": global_return_stats,
    }
    reward_return_file = reward_inference_path / "reward_return_summary_replay_buffer_new_g1_xml.json"
    with open(reward_return_file, "w") as f:
        json.dump(reward_return_summary, f, indent=2)
    print(
        "Global cumulative return over all tasks/episodes: "
        f"{global_return_stats['sum_episode_returns_over_tasks']:.4f} "
        f"(mean per episode: {global_return_stats['mean_episode_return_over_tasks']:.4f})"
    )
    print(f"Saved reward return summary: {reward_return_file}")
```
Could you please provide some guidance on reproducing this?