Skip to content

Commit b1824f7

Browse files
[feat] upgrade torchrec to 1.2.0 (#197)
1 parent ea82751 commit b1824f7

20 files changed

+52
-364
lines changed

.github/workflows/benchmark.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-bench-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.8
1313
options: --gpus all --ipc host
1414
steps:
1515
- name: FetchCommit ${{ github.event.pull_request.head.sha }}

.github/workflows/codestyle_ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-codestyle-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.8
1313
steps:
1414
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
1515
uses: actions/checkout@v2

.github/workflows/pytyping_ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-codestyle-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.8
1313
steps:
1414
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
1515
uses: actions/checkout@v2

.github/workflows/unittest_ci.yml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.8
1313
options: --gpus all --ipc host
1414
steps:
1515
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
@@ -21,5 +21,4 @@ jobs:
2121
id: run_unittest_ci
2222
run: |
2323
cd run_${{ github.run_id }}
24-
pip uninstall faiss-cpu -y
2524
CI_HYPOTHESIS=true bash scripts/ci/ci_test.sh

.github/workflows/unittest_cpu_ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-cpu-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7-cpu
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.8-cpu
1313
options: --ipc host
1414
steps:
1515
- name: FetchCommit ${{ github.event.pull_request.head.sha }}

.github/workflows/unittest_nightly.yml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.8
1313
options: --gpus all --ipc host
1414
steps:
1515
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
@@ -25,5 +25,4 @@ jobs:
2525
ODPS_ENDPOINT: ${{ secrets.ODPS_ENDPOINT }}
2626
run: |
2727
cd run_${{ github.run_id }}
28-
pip uninstall faiss-cpu -y
2928
bash scripts/ci/ci_test.sh

.github/workflows/whl_and_doc_nightly.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ jobs:
99
ci-test:
1010
runs-on: tzrec-codestyle-runner
1111
container:
12-
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.7
12+
image: mybigpai-public-registry.cn-beijing.cr.aliyuncs.com/easyrec/tzrec-devel:0.8
1313
steps:
1414
- name: FetchCommit ${{ github.event.pull_request.head.sha }}
1515
uses: actions/checkout@v2

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ repos:
66
files: \.py$
77
args: ["--license-filepath", "data/.license_header.txt", "--allow-past-years"]
88
- repo: https://github.com/astral-sh/ruff-pre-commit
9-
rev: v0.9.6
9+
rev: v0.11.12
1010
hooks:
1111
- id: ruff
1212
args: [ --fix ]

docker/Dockerfile

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -26,19 +26,19 @@ ENV PATH /opt/conda/bin:$PATH
2626

2727
ARG DEVICE
2828
RUN case ${DEVICE} in \
29-
"cu124") wget https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
29+
"cu126") wget https://tzrec.oss-cn-beijing.aliyuncs.com/third_party/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
3030
dpkg -i cuda-keyring_1.1-1_all.deb && \
3131
apt-get update && \
32-
apt-get install cuda-compat-12-4 && \
32+
apt-get install cuda-compat-12-6 && \
3333
rm -rf /var/lib/apt/lists/* ;; \
3434
esac
3535
RUN case ${DEVICE} in \
36-
"cu124") pip install torch==2.6.0 fbgemm-gpu==1.1.0 --index-url https://download.pytorch.org/whl/cu124 && \
37-
pip install torchmetrics==1.0.3 torch_tensorrt==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu124 && \
38-
pip install torchrec==1.1.0 --index-url https://download.pytorch.org/whl/cu124 ;; \
39-
* ) pip install torch==2.6.0 fbgemm-gpu==1.1.0 --index-url https://download.pytorch.org/whl/cpu && \
40-
pip install torchmetrics==1.0.3 && \
41-
pip install torchrec==1.1.0 --index-url https://download.pytorch.org/whl/cpu ;; \
36+
"cu126") pip install torch==2.7.0 fbgemm-gpu==1.2.0 --index-url https://download.pytorch.org/whl/cu126 && \
37+
pip install torchmetrics==1.0.3 tensordict torch_tensorrt==2.7.0 --extra-index-url https://download.pytorch.org/whl/cu126 && \
38+
pip install torchrec==1.2.0 --index-url https://download.pytorch.org/whl/cu126 ;; \
39+
* ) pip install torch==2.7.0 fbgemm-gpu==1.2.0 --index-url https://download.pytorch.org/whl/cpu && \
40+
pip install torchmetrics==1.0.3 tensordict && \
41+
pip install torchrec==1.2.0 --index-url https://download.pytorch.org/whl/cpu ;; \
4242
esac && \
4343
/opt/conda/bin/conda clean -ya
4444

@@ -48,14 +48,15 @@ ARG LD_LIBRARY_PATH
4848
ENV LD_LIBRARY_PATH ${LD_LIBRARY_PATH}
4949

5050
ADD requirements /root/requirements
51+
ADD requirements.txt /root/requirements.txt
5152
ADD requirements-cpu.txt /root/requirements-cpu.txt
5253
ADD requirements-gpu.txt /root/requirements-gpu.txt
5354
RUN cd /root && \
5455
case ${DEVICE} in \
55-
"cu124") pip install -r requirements-gpu.txt ;; \
56+
"cu126") pip install -r requirements-gpu.txt ;; \
5657
* ) pip install -r requirements-cpu.txt ;; \
5758
esac && \
58-
rm -rf requirements requirements-cpu.txt requirements-gpu.txt && \
59+
rm -rf requirements requirements.txt requirements-cpu.txt requirements-gpu.txt && \
5960
/opt/conda/bin/conda clean -ya
6061

6162
RUN mkdir -p /home/pai/bin && \

docs/source/quick_start/dlc_odps_dataset_tutorial.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,11 +34,11 @@ bash upload_data.sh ${ODPS_PROJECT_NAME}
3434

3535
进入[PAI控制台](https://pai.console.aliyun.com),并选择需要使用的工作空间,点击 **模型开发与训练-分布式训练(DLC)**,点击创建任务。
3636

37-
**节点镜像** 选择官方镜像`torcheasyrec:0.7.0-pytorch2.6.0-gpu-py311-cu124-ubuntu22.04`
37+
**节点镜像** 选择官方镜像`torcheasyrec:0.8.0-pytorch2.7.0-gpu-py311-cu126-ubuntu22.04`
3838

3939
**数据集配置** 选择刚新建的NAS数据集
4040

41-
**资源配置** 选择框架为PyTorch,任务资源我们以选择单机8卡V100为例(建议优先选择单机多卡机型,需要多机多卡训练时建议选择带RDMA的机型)
41+
**资源配置** 选择框架为PyTorch,任务资源我们以选择单机8卡V100为例(建议优先选择单机多卡机型,需要多机多卡训练时建议选择带RDMA的机型)**驱动设置选择535+**
4242

4343
**角色信息** 选择**PAI默认角色**
4444

0 commit comments

Comments
 (0)