Please enable Javascript to view the contents
常用 NPU 运维及故障处理
本文整理了处理 NPU 故障时可供参考的命令与操作记录，持续更新中。
1. 容器挂载设备
# Container image that ships the Ascend PyTorch runtime; the run command
# below reads it from the environment.
export IMAGE="registry.cn-beijing.aliyuncs.com/opshub/ascendai-pytorch:2.1.0"
# Start a throw-away interactive container with one NPU device node and the
# host's Ascend driver files passed through.
#
# IMAGE   - image to run; must be exported beforehand (the command aborts
#           with a message if it is unset or empty).
# NPU_ID  - index N of the /dev/davinciN device node to expose; defaults to 7.
#
# --rm removes the container on exit; --ipc=host shares the host IPC namespace.
nerdctl run --rm -it --ipc=host \
  --device="/dev/davinci${NPU_ID:-7}" \
  --device=/dev/davinci_manager \
  --device=/dev/devmm_svm \
  --device=/dev/hisi_hdc \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
  -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
  "${IMAGE:?export IMAGE before running this command}" \
  /bin/bash
2. 创建 Pod
# Image used by the test Pod's container.
export IMAGE="registry.cn-beijing.aliyuncs.com/opshub/ascendai-pytorch:2.1.0"
# Placeholder: set this to the name of the node the test Pod should run on
# (it is substituted into the Pod's spec.nodeName).
export NodeName=
# Create a single-container test Pod that requests one huawei.com/Ascend910
# resource and then sleeps so it can be exec'd into.
#
# The heredoc delimiter is deliberately unquoted: ${NodeName} and ${IMAGE}
# are expanded by the shell before the manifest reaches kubectl, so both
# must be set in the current shell first.
# NOTE(review): with an empty NodeName the nodeName field is sent empty and
# the Pod is presumably placed by the scheduler - confirm that is intended.
kubectl create -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: test-ascend-pod
  namespace: default
spec:
  restartPolicy: Never
  nodeName: ${NodeName}
  containers:
  - name: ascend-container
    image: ${IMAGE}
    command: ["/bin/bash"]
    args: ["-c", "sleep infinity"]
    resources:
      limits:
        huawei.com/Ascend910: "1"
      requests:
        huawei.com/Ascend910: "1"
EOF
3. 驱动升级
# Upgrade the Ascend 910B NPU firmware and driver, then reboot the host.
#
# OPS_DIR       - working directory the packages are downloaded into.
# FIRMWARE_URL  - firmware .run package; override to pick another version.
# DRIVER_URL    - driver .run package; override to pick another version.
OPS_DIR=/data/paascontainer/ops
FIRMWARE_URL=${FIRMWARE_URL:-http://mirrors-internal.cmecloud.cn/coca/huawei/910b/Ascend-hdk-910b-npu-firmware_7.7.0.1.231.run}
DRIVER_URL=${DRIVER_URL:-http://mirrors-internal.cmecloud.cn/coca/huawei/910b/Ascend-hdk-910b-npu-driver_25.0.rc1.1_linux-aarch64.run}

# Every preparation step is gated: if the directory cannot be entered or a
# download fails, nothing is deleted elsewhere, no installer runs, and the
# host is not rebooted.
if mkdir -p -- "${OPS_DIR}" \
  && cd -- "${OPS_DIR}" \
  && rm -f -- ./*.run \
  && wget -- "${FIRMWARE_URL}" \
  && wget -- "${DRIVER_URL}" \
  && chmod +x -- ./*.run; then
  # Firmware is upgraded before the driver, as in the original procedure.
  "./${FIRMWARE_URL##*/}" --upgrade
  "./${DRIVER_URL##*/}" --upgrade
  # NOTE(review): installer exit codes are not checked before rebooting,
  # matching the original steps - confirm whether a failed upgrade should
  # block the reboot.
  reboot
else
  printf 'NPU upgrade aborted: could not prepare packages in %s\n' "${OPS_DIR}" >&2
fi