Add Link
https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html
Describe the bug
The tutorial fails with the following error:
Unexpected failing examples:
/var/lib/workspace/intermediate_source/pinmem_nonblock.py failed leaving traceback:
Traceback (most recent call last):
  File "/var/lib/workspace/intermediate_source/pinmem_nonblock.py", line 642, in <module>
    from tensordict import TensorDict
  File "/usr/local/lib/python3.10/dist-packages/tensordict/__init__.py", line 6, in <module>
    import tensordict._reductions
  File "/usr/local/lib/python3.10/dist-packages/tensordict/_reductions.py", line 11, in <module>
    from tensordict._lazy import LazyStackedTensorDict
  File "/usr/local/lib/python3.10/dist-packages/tensordict/_lazy.py", line 37, in <module>
    from tensordict.memmap import MemoryMappedTensor
  File "/usr/local/lib/python3.10/dist-packages/tensordict/memmap.py", line 22, in <module>
    from tensordict.utils import _shape, implement_for, IndexType, NESTED_TENSOR_ERR
  File "/usr/local/lib/python3.10/dist-packages/tensordict/utils.py", line 94, in <module>
    from torchrec import KeyedJaggedTensor
  File "/usr/local/lib/python3.10/dist-packages/torchrec/__init__.py", line 10, in <module>
    import torchrec.distributed # noqa
  File "/usr/local/lib/python3.10/dist-packages/torchrec/distributed/__init__.py", line 38, in <module>
    from torchrec.distributed.model_parallel import DistributedModelParallel # noqa
  File "/usr/local/lib/python3.10/dist-packages/torchrec/distributed/model_parallel.py", line 26, in <module>
    from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
  File "/usr/local/lib/python3.10/dist-packages/torchrec/distributed/planner/__init__.py", line 24, in <module>
    from torchrec.distributed.planner.planners import EmbeddingShardingPlanner # noqa
  File "/usr/local/lib/python3.10/dist-packages/torchrec/distributed/planner/planners.py", line 21, in <module>
    from torchrec.distributed.planner.constants import BATCH_SIZE, MAX_SIZE
  File "/usr/local/lib/python3.10/dist-packages/torchrec/distributed/planner/constants.py", line 12, in <module>
    from torchrec.distributed.embedding_types import EmbeddingComputeKernel
  File "/usr/local/lib/python3.10/dist-packages/torchrec/distributed/embedding_types.py", line 16, in <module>
    from fbgemm_gpu.split_table_batched_embeddings_ops_training import EmbeddingLocation
  File "/usr/local/lib/python3.10/dist-packages/fbgemm_gpu/__init__.py", line 71, in <module>
    _load_library(f"{library}.so")
  File "/usr/local/lib/python3.10/dist-packages/fbgemm_gpu/__init__.py", line 21, in _load_library
    raise error
  File "/usr/local/lib/python3.10/dist-packages/fbgemm_gpu/__init__.py", line 17, in _load_library
    torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
  File "/var/lib/ci-user/.local/lib/python3.10/site-packages/torch/_ops.py", line 1392, in load_library
    ctypes.CDLL(path)
  File "/usr/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /usr/local/lib/python3.10/dist-packages/fbgemm_gpu/fbgemm_gpu_config.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSsb
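The missing symbol demangles to torch::jit::parseSchemaOrName(std::string const&, bool), which typically indicates that the installed fbgemm_gpu wheel was built against a different PyTorch version/C++ ABI than the torch 2.7 RC in the container, so the shared library fails to load when tensordict transitively imports torchrec. A minimal diagnostic sketch (not part of the tutorial) to confirm which versions of the packages in the failing import chain are installed:

import importlib.metadata as md

import torch

# Report the interpreter's torch build and the versions of the packages that
# appear in the traceback; a stale fbgemm_gpu next to torch 2.7 would explain
# the undefined-symbol error.
print("torch     :", torch.__version__)
print("CUDA      :", torch.version.cuda)
for dist in ("tensordict", "torchrec", "fbgemm_gpu", "fbgemm-gpu"):
    try:
        print(f"{dist:10}:", md.version(dist))
    except md.PackageNotFoundError:
        pass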
Please submit fixes against the 2.7-RC-TEST branch and enable the tutorial in .jenkins/validate_tutorials_built.py.
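One possible direction for such a fix, sketched here under the assumption that the tensordict section of the tutorial is optional, is to guard the import so a broken fbgemm_gpu wheel skips that section instead of failing the whole build (the loader surfaces the problem as an OSError rather than an ImportError):

# Hypothetical guard for the tensordict section of pinmem_nonblock.py; the
# actual fix may instead pin or rebuild fbgemm_gpu for PyTorch 2.7.
try:
    from tensordict import TensorDict
    HAS_TENSORDICT = True
except (ImportError, OSError):
    # fbgemm_gpu's loader raises OSError ("undefined symbol: ...") while
    # tensordict transitively imports torchrec, so catching ImportError
    # alone is not enough.
    HAS_TENSORDICT = False
    TensorDict = None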
Describe your environment
CUDA: 12.6
PyTorch: 2.7
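If a fuller report is useful for triage, PyTorch's built-in environment collector can be run in the same container (assuming the standard torch.utils.collect_env module is available):

# Prints OS, CUDA, cuDNN, and installed torch-related package versions.
from torch.utils.collect_env import main

main()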