Skip to content

Error when evaluating in parallel environments #36

Closed
@MasterXiong

Description

@MasterXiong

Hi,

I'm trying to evaluate an Octo-based policy in several SimplerEnv environments in parallel to accelerate the evaluation process. I use Python's built-in multiprocessing module. A minimal code example is shown below:

import numpy as np
from multiprocessing import Process, Pipe
import simpler_env

def worker(remote, parent_remote, env_name):
    """Serve one environment over a pipe until told to close.

    Runs inside a child process. Builds the environment named ``env_name``
    and then answers ``(cmd, data)`` tuples received on ``remote``:

    * ``'step'``  -- calls ``env.step(data)`` and sends back the resulting
      ``(obs, reward, success, truncated, info)`` tuple.
    * ``'reset'`` -- calls ``env.reset()`` and sends back ``(obs, reset_info)``.
    * ``'close'`` -- leaves the loop; the environment and ``remote`` are closed.

    Any other command is ignored.

    Args:
        remote: This worker's end of the pipe.
        parent_remote: The main process's end of the same pipe; it is closed
            here because the worker never uses it.
        env_name: Name passed to ``simpler_env.make``.
    """
    parent_remote.close()  # Close the parent end of the pipe
    env = simpler_env.make(env_name)
    try:
        while True:
            try:
                cmd, data = remote.recv()
            except EOFError:
                # The other end was closed without a 'close' command
                # (e.g. the main process exited); shut down quietly.
                break
            if cmd == 'step':
                # No automatic reset here: the caller decides when to reset.
                obs, reward, success, truncated, info = env.step(data)
                remote.send((obs, reward, success, truncated, info))
            elif cmd == 'reset':
                obs, reset_info = env.reset()
                remote.send((obs, reset_info))
            elif cmd == 'close':
                break
    finally:
        # Release the environment and the pipe on every exit path,
        # including an exception raised by step()/reset().
        env.close()
        remote.close()

class ParallelEnvs:
    """Drive several copies of one environment, each in its own process.

    Every environment lives in a child process running ``worker`` and is
    controlled over a ``multiprocessing.Pipe``. The main process keeps one
    end of each pipe in ``self.remotes`` and exchanges ``(cmd, data)``
    tuples with the worker on the other end.
    """

    def __init__(self, env_name, num_envs):
        """Start ``num_envs`` worker processes for ``env_name``.

        Args:
            env_name: Environment name forwarded to each worker.
            num_envs: Number of worker processes to start; must be >= 1.

        Raises:
            ValueError: If ``num_envs`` is smaller than 1.
        """
        if num_envs < 1:
            raise ValueError(f"num_envs must be at least 1, got {num_envs}")
        self.num_envs = num_envs
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(num_envs)])
        self.processes = [
            Process(target=worker, args=(work_remote, remote, env_name))
            for work_remote, remote in zip(self.work_remotes, self.remotes)
        ]
        for p in self.processes:
            p.start()
        for work_remote in self.work_remotes:
            work_remote.close()  # Close the worker end in the main process

    def step(self, actions):
        """Send one action to each environment and collect the results.

        Args:
            actions: One action per environment, in the order of
                ``self.remotes``.

        Returns:
            Five tuples ``(obs, reward, success, truncated, info)``, each
            holding one entry per environment.

        Raises:
            ValueError: If the number of actions differs from the number of
                environments.
        """
        actions = list(actions)
        if len(actions) != len(self.remotes):
            # Pairing with zip() would silently skip some workers, and the
            # recv() below would then wait forever on them.
            raise ValueError(
                f"expected {len(self.remotes)} actions, got {len(actions)}"
            )
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        results = [remote.recv() for remote in self.remotes]
        obs, reward, success, truncated, info = zip(*results)
        return obs, reward, success, truncated, info

    def reset(self):
        """Reset every environment.

        Returns:
            Two tuples ``(obs, reset_info)``, each holding one entry per
            environment.
        """
        for remote in self.remotes:
            remote.send(('reset', None))
        results = [remote.recv() for remote in self.remotes]
        # Each worker replies with (obs, reset_info); transpose the list of
        # per-environment pairs into per-field tuples, as step() does.
        obs, reset_info = zip(*results)
        return obs, reset_info

    def close(self):
        """Ask every worker to shut down, wait for it, and close the pipes."""
        for remote in self.remotes:
            try:
                remote.send(('close', None))
            except OSError:
                # The pipe is already broken or closed (worker died or
                # close() was called before); still join and clean up below.
                pass
        for p in self.processes:
            p.join()
        for remote in self.remotes:
            remote.close()


if __name__ == "__main__":
    # Repro script: start several environments in worker processes and
    # drive them with random actions.

    env_name = "google_robot_pick_coke_can"
    num_envs = 4  # Number of parallel environments
    num_steps = 100
    # Local environment used only to sample random actions.
    # NOTE(review): this env is created in the main process before the
    # workers are started; whether that is related to the renderer error
    # seen in the workers is unconfirmed.
    dummy_env = simpler_env.make(env_name)

    try:
        # Initialize parallel environments
        envs = ParallelEnvs(env_name, num_envs)
        try:
            # Reset all environments
            obs, reset_info = envs.reset()
            breakpoint()

            for step in range(num_steps):
                # Get actions from the policy
                actions = [dummy_env.action_space.sample() for _ in range(num_envs)]

                # Step all environments with the actions
                obs, reward, success, truncated, info = envs.step(actions)
                breakpoint()
        finally:
            # Close the environments even if a step fails or the debugger
            # session is aborted, so no worker is left waiting on its pipe.
            envs.close()
    finally:
        dummy_env.close()

But I got the following error when initializing multiple environments:

  File "/user/fine-tune/test.py", line 15, in worker
    env = simpler_env.make(env_name)
  File "/SimplerEnv/simpler_env/__init__.py", line 78, in make
    env = gym.make(env_name, obs_mode="rgbd", **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/gymnasium/envs/registration.py", line 802, in make
    env = env_creator(**env_spec_kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/utils/registration.py", line 92, in make
    env = env_spec.make(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/utils/registration.py", line 34, in make
    return self.cls(**_kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 630, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 540, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 64, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 134, in __init__
    super().__init__(**kwargs)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 188, in __init__
    obs, _ = self.reset(seed=2022, options=dict(reconfigure=True))
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 585, in reset
    obs, info = super().reset(seed=self._episode_seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/grasp_single_in_scene.py", line 135, in reset
    obs, info = super().reset(seed=self._episode_seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 228, in reset
    obs, info = super().reset(seed=seed, options=options)
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 488, in reset
    return self.get_obs(), {}
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/custom_scenes/base_env.py", line 350, in get_obs
    obs = super().get_obs()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 265, in get_obs
    return self._get_obs_images()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 312, in _get_obs_images
    self.take_picture()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/envs/sapien_env.py", line 289, in take_picture
    cam.take_picture()
  File "/SimplerEnv/ManiSkill2_real2sim/mani_skill2_real2sim/sensors/camera.py", line 187, in take_picture
    self.camera.take_picture()
RuntimeError: vk::Device::waitForFences: ErrorDeviceLost

Could you help take a look at what the issue is here? Or what is the right way to parallelize SimplerEnv environments? Thanks for your help!

P.S. This link may be relevant to my issue here.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions