Reproducing the evaluation performance on reward-based tasks #23

@TheKnight-Z

Description


Hi, I am trying to reproduce the evaluation performance on reward inference, specifically the cumulative return over 24 tasks reported in the original paper (second row of Figure 3, Rwd 221.9).

I trained from scratch for 80M steps on my local 5090, with buffer_size reduced from 5.12M to 1M. The tracking performance is already good, but I only get a return of around -3 when running the reward_inference script shown below. I also tried replacing the replay buffer used for z computation with expert motions; although I am not sure this was done correctly, the return is also around -3.
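For reference, the buffer change was along these lines (a rough sketch with an illustrative key name; the actual config field may live elsewhere in the training config):

```python
# Illustrative only: the real key/location of the replay-buffer capacity may
# differ; this just shows the 5.12M -> 1M reduction applied before training.
train_config["agent"]["buffer_size"] = 1_000_000  # default was 5_120_000
```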

Could you please provide some guidance on reproducing this?

```python
    model_folder = Path(model_folder)
    video_folder = Path(video_folder) if video_folder is not None else model_folder / "reward_inference" / "videos"
    video_folder.mkdir(parents=True, exist_ok=True)

    model = load_model_from_checkpoint_dir(model_folder / "checkpoint", device=device)
    model.to(device)
    model.eval()
    model_name = model.__class__.__name__
    with open(model_folder / "config.json", "r") as f:
        config = json.load(f)

    if data_path is not None:
        config["env"]["lafan_tail_path"] = str(data_path)

    if not Path(config["env"]["lafan_tail_path"]).exists():
        config["env"]["lafan_tail_path"] = "data/lafan_29dof.pkl"

    config["env"]["hydra_overrides"].append("env.config.max_episode_length_s=10000")
    config["env"]["hydra_overrides"].append(f"env.config.headless={headless}")
    # config["env"]["hydra_overrides"].append("env.config.lie_down_init=True")
    # config["env"]["hydra_overrides"].append("env.config.lie_down_init_prob=1")
    config["env"]["hydra_overrides"].append(f"simulator={simulator}")
    config["env"]["disable_domain_randomization"] = disable_dr
    config["env"]["disable_obs_noise"] = disable_obs_noise

    rich.print(config["env"])
    num_envs = 1
    env_cfg = HumanoidVerseIsaacConfig(**config["env"])
    wrapped_env, _ = env_cfg.build(num_envs)

    output_dir = model_folder / "exported"
    output_dir.mkdir(parents=True, exist_ok=True)
    export_meta_policy_as_onnx(
        model,
        output_dir,
        f"{model_name}.onnx",
        {"actor_obs": torch.randn(1, model._actor.input_filter.output_space.shape[0] + model.cfg.archi.z_dim)},
        z_dim=model.cfg.archi.z_dim,
        history=('history_actor' in model.cfg.archi.actor.input_filter.key),
        use_29dof=True,
    )
    print(f"Exported model to {output_dir}/{model_name}.onnx")
    tasks = [
        # stand
        "move-ego-0-0",
        "move-ego-low0.5-0-0",

        # locomotion medium
        "move-ego-0-0.7",
        # "move-ego-90-0.7",
        # "move-ego-180-0.7",
        # "move-ego--90-0.7",

        # "move-ego-low0.6-0-0.7",

        # locomotion slow
        "move-ego-0-0.3",
        "move-ego-90-0.3",
        "move-ego-180-0.3",
        "move-ego--90-0.3",
        
        # locomotion fast
        # "move-ego-0-1",
        # "move-ego-90-1",
        # "move-ego-180-1",
        # "move-ego--90-1",

        # spin
        "rotate-z-5-0.5",
        "rotate-z--5-0.5",
    
        # raise arms
        "raisearms-l-l",
        "raisearms-l-m",
        "raisearms-m-l",
        "raisearms-m-m",


        # move + arms
        "move-arms-0-0.7-m-m",
        "move-arms-90-0.7-m-m",
        "move-arms-180-0.4-m-m",
        "move-arms--90-0.7-m-m",
        "move-arms-0-0.7-l-m",
        "move-arms-90-0.7-l-m",
        "move-arms-180-0.4-l-m",
        "move-arms--90-0.7-l-m",
        "move-arms-0-0.7-m-l",
        "move-arms-90-0.7-m-l",
        "move-arms-180-0.4-m-l",
        "move-arms--90-0.7-m-l",
        "move-arms-0-0.7-l-l",
        "move-arms-90-0.7-l-l",
        "move-arms-180-0.4-l-l",
        "move-arms--90-0.7-l-l",

        # spin + arms
        "spin-arms-5-l-l",
        "spin-arms--5-l-l",
        "spin-arms-5-l-m",
        "spin-arms--5-l-m",
        # "spin-arms-5-m-m",
        # "spin-arms--5-m-m",
        "spin-arms-5-m-l",
        "spin-arms--5-m-l",

        # sit
        "crouch-0",
        "crouch-0.25",
        "sitonground",
    ]
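    # NOTE: by my count, 38 tasks are uncommented above, while the paper's
    # Rwd 221.9 is reported over 24 tasks, so the summed returns are only
    # directly comparable if the task sets match.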

    print("Loading the replay buffer...", end=" ", flush=True)
    start_t = time.time()
    buffer_path = model_folder / "checkpoint/buffers/train_reduced"
    if buffer_path.is_dir():
        # Load the reduced buffer if it exists
        dataset = DictBuffer.load(buffer_path, device="cpu")
        print("Loaded reduced buffer")
    else:
        # Try loading the original dataset
        buffer_path = model_folder / "checkpoint/buffers/train"
        # buffer_path = "/home/yulei/play_ground/BFM-Zero/humanoidverse/data/lafan_29dof.pkl"
        dataset = TrajectoryDictBufferMultiDim.load(buffer_path, device="cpu")
        print("Loaded original buffer")
    
    # NOTE: alternative attempt mentioned above: use the expert buffer for reward inference.
    # dataset = load_expert_trajectories_from_motion_lib(
    #     env=wrapped_env._env,
    #     agent_cfg=config["agent"],
    #     device="cpu",
    # )

    print(f"done in {time.time()-start_t}s")
    inference_function = "reward_wr_inference"
    reward_eval_agent = RewardWrapperHV(
        model=model,
        inference_dataset=dataset,
        num_samples_per_inference=num_samples,
        inference_function=inference_function,
        max_workers=24,
        process_executor=True,
        env_model=str(HUMANOIDVERSE_DIR / "data" / "robots" / "g1" / "scene_29dof_freebase_noadditional_actuators.xml"),
    )
    z_dict = {}
    for r in range(n_inferences):
        for task in tasks:
            print(f"Started inference for {task}...", end=" ", flush=True)
            start_t = time.time()
            z = reward_eval_agent.reward_inference(task=task)
            z_dict[task] = z_dict.get(task, []) + [z.cpu()]
            print(f"done in {time.time()-start_t}s")

            path = model_folder / "reward_inference"
            path.mkdir(exist_ok=True)
            with open(path / "reward_locomotion.pkl", "wb") as f:
                joblib.dump(z_dict, f)
            print(f"Saved file at {path}/reward_locomotion.pkl")

    # z_dict = joblib.load(model_folder / "reward_inference/reward_locomotion.pkl")

    if not skip_rollouts:
        print("Generating videos...")
        if save_mp4:
            rgb_renderer = IsaacRendererWithMuJoco(render_size=256)
        reward_inference_path = model_folder / "reward_inference"
        reward_inference_path.mkdir(exist_ok=True)
        task_return_stats = {}
        per_task_mean_returns = []
        for task in tasks:
            frames = []
            episode_returns = []
            for z in z_dict[task]:
                z = z.repeat(num_envs, 1).to(device)
                
                observation, info = wrapped_env.reset(to_numpy=False, reset_to_default_pose=True)
                if save_mp4:
                    frames.append(rgb_renderer.render(wrapped_env._env, 0)[0])
                episode_return = torch.zeros(num_envs, device=device, dtype=torch.float32)
                for i in range(episode_length):
                    action = model.act(observation, z, mean=True)
                    observation, reward, terminated, truncated, info = wrapped_env.step(action, to_numpy=False)
                    episode_return += reward.to(device=device, dtype=torch.float32)

                    # Finish episode early if all envs are done.
                    episode_done = bool(torch.all(terminated | truncated).item())

                    if save_mp4:
                        frames.append(rgb_renderer.render(wrapped_env._env, 0)[0])
                    if episode_done:
                        break
                # num_envs == 1 here, so .item() on the scalar return tensor is safe.
                episode_return_value = float(episode_return.item())
                episode_returns.append(episode_return_value)

            task_return_stats[task] = {
                "num_episodes": len(episode_returns),
                "episode_returns": episode_returns,
                "sum_episode_returns": float(sum(episode_returns)),
                "mean_episode_return": float(sum(episode_returns) / len(episode_returns)) if episode_returns else 0.0,
            }
            print(
                f"[{task}] cumulative return over {len(episode_returns)} episodes: "
                f"{task_return_stats[task]['sum_episode_returns']:.4f} "
                f"(mean per episode: {task_return_stats[task]['mean_episode_return']:.4f})"
            )
            per_task_mean_returns.append(task_return_stats[task]["mean_episode_return"])
            
            if save_mp4:
                file = video_folder / f"{task}.mp4"
                media.write_video(file, frames, fps=50)
                print(f"Saved video for {task}: {file}")

        global_return_stats = {
            "num_tasks": len(task_return_stats),
            # One mean return per task, not a raw episode count.
            "num_task_means": len(per_task_mean_returns),
            "sum_episode_returns_over_tasks": float(sum(per_task_mean_returns)),
            "mean_episode_return_over_tasks": float(sum(per_task_mean_returns) / len(per_task_mean_returns)) if per_task_mean_returns else 0.0,
        }
        reward_return_summary = {
            "per_task": task_return_stats,
            "global": global_return_stats,
        }
        reward_return_file = reward_inference_path / "reward_return_summary_replay_buffer_new_g1_xml.json"
        with open(reward_return_file, "w") as f:
            json.dump(reward_return_summary, f, indent=2)
        print(
            "Global cumulative return over all tasks: "
            f"{global_return_stats['sum_episode_returns_over_tasks']:.4f} "
            f"(mean per task: {global_return_stats['mean_episode_return_over_tasks']:.4f})"
        )
        print(f"Saved reward return summary: {reward_return_file}")```
