# Reference: https://www.hiascend.com/software/modelzoo/models/detail/678bdeb4e1a64c9dae51d353d84ddd15
# NOTE(review): the original collapsed all of this onto one "#"-led line, which
# turned every command into a comment; restored as a runnable sequence.

# --- Fix model-weight ownership and permissions ---
# NOTE(review): the conversion step below reads /home/openlab/deepseek/deepseek-v3/
# while these commands target /home/openlab/deepseek-v3/ — confirm the real path.
chown -R root:root /home/openlab/deepseek-v3/
# Directories need the execute bit to be traversable; a blanket "chmod 640 -R"
# would make every subdirectory inaccessible. Use 750 for dirs, 640 for files.
find /home/openlab/deepseek-v3/ -type d -exec chmod 750 {} +
find /home/openlab/deepseek-v3/ -type f -exec chmod 640 {} +

# --- NPU-side weight conversion ---
# Leave plenty of disk space (3 TB+ recommended): DeepSeek-V3 weights are ~640 GB
# before conversion, ~1.3 TB after, and the w8a8-quantized weights still need ~600 GB.
git clone https://gitee.com/ascend/ModelZoo-PyTorch.git
cd ModelZoo-PyTorch/MindIE/LLM/DeepSeek/DeepSeek-V2/NPU_inference/

# Convert FP8 weights to BF16 (takes roughly 1.5 h).
python fp8_cast_bf16.py \
  --input-fp8-hf-path /home/openlab/deepseek/deepseek-v3/ \
  --output-bf16-hf-path /home/openlab/deepseek/deepseek-v3-bf16/

# Open up the converted weights to 750.
# NOTE(review): the conversion writes to /home/openlab/deepseek/deepseek-v3-bf16/,
# but this chmod targets /home/openlab/deepseek-v3-bf16/ — confirm the intended path.
chmod -R 750 /home/openlab/deepseek-v3-bf16/
# NOTE(review): the original collapsed all of this onto one "#"-led line, which
# turned every command into a comment; restored as a runnable sequence.

# --- Download and install msit / msmodelslim (quantization toolkit) ---
git clone -b br_noncom_MindStudio_8.0.0_POC_20251231 https://gitee.com/ascend/msit.git
cd msit/msmodelslim
bash install.sh

# msmodelslim requires this transformers version.
pip install transformers==4.48.2

# Edit the weights' modeling_deepseek.py and comment out the code related to
# is_flash_attn_2_available (per the original guide).
vim /home/openlab/deepseek-v3-bf16/modeling_deepseek.py
# Remove the "quantization_config" section from the model's config.json.
vim /home/openlab/deepseek-v3-bf16/config.json

# Relative to msit/msmodelslim entered above.
cd example/DeepSeek/
# Edit the quantization script for an 8-card setup.
vim quant_deepseek_w8a8.py

export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:False
# Run the W8A8 quantization.
python3 quant_deepseek_w8a8.py --model_path /home/openlab/deepseek-v3-bf16/ --save_path /home/openlab/deepseek-v3-w8a8
"model_type":"deepseekv2"
# Per-NIC diagnostics for the 8 NPU ports (device IDs 0-7) via hccn_tool.

# Show the LLDP-reported interface name of each NPU port.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -lldp -g | grep Ifname; done
# Query the link state of each NPU port.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -link -g; done
# Query the network health state of each NPU port.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -net_health -g; done
# Query the network-detect (reachability probe) configuration.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -netdetect -g; done
# Query the gateway configured for each NPU port.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -gateway -g; done
# Query the IP address assigned to each NPU port.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -ip -g; done
# Query the TLS state, keeping only the "switch" lines.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -tls -g; done | grep switch
# Disable TLS (set the switch to 0) on every NPU port.
for ((i = 0; i < 8; i++)); do hccn_tool -i "$i" -tls -s enable 0; done
# Cross-node connectivity check: from each local card, ping every NPU IP on machine B.
# 10.30.4.90-97 are machine B's per-card IPs (机器B各个卡的IP地址) — replace with
# your actual addresses.
# BUG FIX: the original had the bare placeholder text 【机器B各个卡的IP地址】 inside
# the loop body, where the shell would try to execute it as a command.
for j in {0..7}; do
  for i in {90..97}; do
    hccn_tool -i "$j" -ping -g address "10.30.4.$i"
  done
done
斜体内容可根据实际情况进行配置。
{ "server_count": "...", # 总节点数 # server_list中第一个server为主节点 "server_list": [ { "device": [ { "device_id": "...", # 当前卡的本机编号,取值范围[0,本机卡数) "device_ip": "...", # 当前卡的IP地址,可通过hccn_tool命令获取 "rank_id": "..." # 当前卡的全局编号,取值范围[0,总卡数) }, ... ], "server_id": "...", # 当前节点的IP地址 "container_ip": "..." # 容器IP地址 }, ... ], "status": "completed", "version": "1.0" }
# Restrict the rank table file so only the owner can write and the group can read it.
chmod -R 640 {rank_table_file.json路径}
参考3.3.1 章节步骤2和步骤3,查看Atlas 800I A2推理产品如何安装NPU的驱动和固件。
访问链接并下载下列软件:https://www.hiascend.com/developer/ascendhub/detail/af85b724a7e5469ebd7ea13c3439d48f
# Start the inference container, mapping all 8 NPUs, the driver/firmware, the
# weights and the rank table into it.
# BUG FIX: the original had "--name= {容器名称}" — the space after "=" makes
# docker receive an empty name and misparse the placeholder; the value must
# follow "=" directly.
docker run -itd --privileged --name={容器名称} --net=host --shm-size 500g \
  --device=/dev/davinci0 \
  --device=/dev/davinci1 \
  --device=/dev/davinci2 \
  --device=/dev/davinci3 \
  --device=/dev/davinci4 \
  --device=/dev/davinci5 \
  --device=/dev/davinci6 \
  --device=/dev/davinci7 \
  --device=/dev/davinci_manager \
  --device=/dev/hisi_hdc \
  --device /dev/devmm_svm \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
  -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
  -v /usr/local/sbin:/usr/local/sbin \
  -v /etc/hccn.conf:/etc/hccn.conf \
  -v {/权重路径:/权重路径} \
  -v {/rank_table_file.json路径:/rank_table_file.json路径} \
  {根据加载的镜像名称修改} \
  bash
# Attach an interactive shell to the running container.
docker exec -it {容器名称} bash
# Load the Ascend runtime environments inside the container.
# BUG FIX: the original collapsed all four commands onto one line, so the later
# set_env.sh paths were passed as positional arguments to the first script
# instead of being sourced themselves.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
source /usr/local/Ascend/atb-models/set_env.sh
source /usr/local/Ascend/mindie/set_env.sh
# HCCL communication settings for ATB-LLM.
export ATB_LLM_HCCL_ENABLE=1
export ATB_LLM_COMM_BACKEND="hccl"
# Generous connect timeout for multi-node startup.
export HCCL_CONNECT_TIMEOUT=7200
# Two-node (双机) deployment only: total number of ranks across both machines.
# BUG FIX: the original had the bare text "双机:" in the command stream, which
# errors if pasted into a shell; it is now a comment.
export WORLD_SIZE=16
# Point the MindIE service at this container's IP and the rank table file.
# BUG FIX: the original had a space after "=" (export VAR= {value}), which
# exports an empty variable and then tries to execute the placeholder as a
# command; the value must follow "=" directly.
export MIES_CONTAINER_IP={容器IP地址}
export RANKTABLEFILE={rank_table_file.json路径}
# Edit the MindIE service configuration (parameters to change are listed in the guide).
vim /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json
# 如果网络环境不安全,不开启HTTPS通信,即“httpsEnabled”=“false”时,会存在较高的网络安全风险 "httpsEnabled" : false, ... # 开启多机推理 ... # 若不需要安全认证,则将以下两个参数设为false "multiNodesInferEnabled" : true, "interCommTLSEnabled" : false, "interNodeTLSEnabled" : false, ... "npudeviceIds" : [[0,1,2,3,4,5,6,7]], ... "modelName" : "DeepSeek-R1", # 不影响服务化拉起 "modelWeightPath" : "权重路径", "worldSize":8,
# Run the following on every machine at the same time.
# NOTE(review): the original collapsed all of this onto one "#"-led line, which
# turned every command into a comment; restored as a runnable sequence.

# Work around slow weight loading.
export OMP_NUM_THREADS=1
# Fraction of NPU memory the service may use.
export NPU_MEMORY_FRACTION=0.95

# Launch the MindIE service daemon.
cd /usr/local/Ascend/mindie/latest/mindie-service/
./bin/mindieservice_daemon
执行命令后,首先会打印本次启动所用的所有参数,然后直到出现以下输出,则认为服务启动成功:
Daemon start success!
新建窗口进行接口测试。
# Smoke-test the OpenAI-compatible chat endpoint (replace ip:port with the
# service address).
# FIX(review): "model" must match "modelName" in the service config.json, which
# the guide sets to "DeepSeek-R1"; the original example sent "DeepseekV1" —
# confirm against your actual config.
curl -X POST -d '{
  "model": "DeepSeek-R1",
  "messages": [{
    "role": "system",
    "content": "你是谁?"
  }],
  "max_tokens": 20,
  "stream": false
}' http://ip:port/v1/chat/completions