Please enable Javascript to view the contents

常用 NPU 运维及故障处理

 ·  ☕ 1 分钟

处理故障时参考或记录下来的内容，持续更新中。

1. 容器挂载设备

1
# Container image used by the commands that reference ${IMAGE}.
IMAGE='registry.cn-beijing.aliyuncs.com/opshub/ascendai-pytorch:2.1.0'
export IMAGE
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
# Start a throwaway interactive container (--rm -it) from ${IMAGE} with the
# host's davinci7 device node, the davinci_manager / devmm_svm / hisi_hdc
# device nodes, the Ascend driver and add-ons directories, and the npu-smi
# binary made available inside it.
#
# IMAGE is quoted and guarded with :? so that an unset or empty value aborts
# the command with a message; unquoted, an empty IMAGE would disappear from
# the argument list and /bin/bash would be taken as the image name.
nerdctl run --rm -it --ipc=host \
  --device=/dev/davinci7 \
  --device=/dev/davinci_manager \
  --device=/dev/devmm_svm \
  --device=/dev/hisi_hdc \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
  -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
  "${IMAGE:?export IMAGE before running the container}" \
  /bin/bash

2. 创建 Pod

1
2
# Image for the test Pod and the node name substituted into its manifest.
# NodeName is exported empty here; presumably a placeholder to be filled in
# with the target node before the Pod is created.
IMAGE='registry.cn-beijing.aliyuncs.com/opshub/ascendai-pytorch:2.1.0'
NodeName=''
export IMAGE NodeName
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
# Create a one-container test Pod that requests a single huawei.com/Ascend910
# device and just sleeps, so it can be exec'ed into for checks.
#
# The here-doc delimiter is unquoted, so ${IMAGE} and ${NodeName} are expanded
# by the shell before kubectl sees the manifest. IMAGE is guarded with :? so an
# unset or empty value aborts here instead of sending a manifest with no image.
# NOTE(review): if NodeName is empty the manifest carries an empty nodeName;
# presumably the scheduler then chooses the node — confirm that is intended.
kubectl create -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: test-ascend-pod
  namespace: default
spec:
  restartPolicy: Never
  nodeName: ${NodeName}
  containers:
  - name: ascend-container
    image: ${IMAGE:?export IMAGE before creating the Pod}
    command: ["/bin/bash"]
    args: ["-c", "sleep infinity"]
    resources:
      limits:
        huawei.com/Ascend910: "1"
      requests:
        huawei.com/Ascend910: "1"
EOF

3. 驱动升级

1
2
3
4
5
6
7
8
9
# Upgrade the 910B NPU firmware and driver, then reboot the host.
#
# Sequence: enter a working directory, drop any previously downloaded .run
# packages, fetch the firmware and driver packages, make them executable,
# run firmware --upgrade, then driver --upgrade, then reboot.
#
# Every step is chained with && so the first failure stops the sequence:
#   - a failed cd must not let the rm run in whatever directory the shell
#     happened to be in;
#   - a failed download or a failed installer must not fall through to reboot.
# Globs are written as ./*.run so a file name starting with '-' cannot be
# parsed as an option.
# NOTE(review): this assumes the .run installers exit 0 on success — confirm.
mkdir -p /data/paascontainer/ops &&
  cd /data/paascontainer/ops &&
  rm -f -- ./*.run &&
  wget http://mirrors-internal.cmecloud.cn/coca/huawei/910b/Ascend-hdk-910b-npu-firmware_7.7.0.1.231.run &&
  wget http://mirrors-internal.cmecloud.cn/coca/huawei/910b/Ascend-hdk-910b-npu-driver_25.0.rc1.1_linux-aarch64.run &&
  chmod +x ./*.run &&
  ./Ascend-hdk-*-npu-firmware_*.run --upgrade &&
  ./Ascend-hdk-*-npu-driver_*.run --upgrade &&
  reboot

微信公众号
作者
微信公众号