在机密虚机中使用GPU搭配vLLM推理框架部署Llama2大模型
发表于 2025/05/12
0
作者: 张强
一 环境
1. 处理器: 鲲鹏920新型号处理器(7270Z、7280Z、7250Y、7260Y等)
2. Host OS: openEuler 24.03 LTS SP2
3. Guest OS: openEuler 24.03 LTS SP2
二 模型
1. 测试数据集: ShareGPT_V3_unfiltered_cleaned_split.json
2. 大模型: Llama-2-7b-hf
三 Host 环境搭建
1. 安装依赖
yum install libvirt qemu edk2-aarch64 qemu-nbd -y
2. 将edk2中的QEMU_EFI.fd对齐到64M
fallocate -l $((64 * 1024 * 1024)) /usr/share/edk2/aarch64/QEMU_EFI.fd
3. 修改内核启动参数
vim /boot/efi/EFI/openEuler/grub.cfg
在内核启动参数 linux /vmlinuz-xxx 所在行的末尾添加以下参数,保存后重启Host使其生效:
virtcca_cvm_host=1 arm_smmu_v3.disable_ecmdq=1 vfio_pci.disable_idle_d3=1
四 机密虚机环境搭建
1. 下载虚机镜像
wget https://repo.openeuler.org/openEuler-24.03-LTS-SP2/virtual_machine_img/aarch64/openEuler-24.03-LTS-SP2-aarch64.qcow2.xz
xz -d openEuler-24.03-LTS-SP2-aarch64.qcow2.xz
2. 修改虚机镜像的内核启动参数
modprobe nbd max_part=16
qemu-nbd -c /dev/nbd0 openEuler-24.03-LTS-SP2-aarch64.qcow2
mkdir /mnt/qcow2
mount /dev/nbd0p1 /mnt/qcow2
vim /mnt/qcow2/efi/EFI/openEuler/grub.cfg
# 在启动参数末尾加上
cma=64M virtcca_cvm_guest=1 cvm_guest=1 swiotlb=65536,force loglevel=8
umount /mnt/qcow2
qemu-nbd -d /dev/nbd0
3. 增加虚拟网桥
新建network.xml,写入如下内容
<!-- libvirt network definition: a NAT-forwarded network named "default" on bridge virbr0. -->
<network>
  <name>default</name>
  <uuid>c256bbb0-408d-47b4-83f7-b8e4085cf85c</uuid>
  <!-- Guest traffic leaves the host through NAT. -->
  <forward mode='nat'/>
  <!-- Host-side bridge device; the domain's interface refers to it by this name. -->
  <bridge name='virbr0' stp='on' delay='0'/>
  <mac address='52:54:00:bb:eb:bf'/>
  <!-- The bridge takes 192.168.122.1/24; DHCP leases cover 192.168.122.2 through 192.168.122.254. -->
  <ip address='192.168.122.1' netmask='255.255.255.0'>
    <dhcp>
      <range start='192.168.122.2' end='192.168.122.254'/>
    </dhcp>
  </ip>
</network>
virsh net-define network.xml
virsh net-start default
4. 将GPU直通到机密虚机中
lspci | grep -i xxx
查看GPU的BDF号(xxx 为GPU厂商关键字,例如 nvidia),并将其写入下一步libvirt配置文件的 <hostdev> 源地址中
5. 启动机密虚机
新建libvirt配置文件cvm-gpu.xml,写入如下内容
<!-- libvirt domain definition for the confidential VM "cvm-gpu" with one PCI device passed through. -->
<domain type='kvm' xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
  <name>cvm-gpu</name>

  <!-- Sizing: 16 GiB of memory and 16 vCPUs. -->
  <memory unit='GiB'>16</memory>
  <currentMemory unit='GiB'>16</currentMemory>
  <vcpu placement='static'>16</vcpu>
  <iothreads>1</iothreads>

  <!-- Each vCPU N is pinned to host CPU N; emulator threads may run on host CPUs 0-15. -->
  <cputune>
    <vcpupin vcpu='0' cpuset='0'/>
    <vcpupin vcpu='1' cpuset='1'/>
    <vcpupin vcpu='2' cpuset='2'/>
    <vcpupin vcpu='3' cpuset='3'/>
    <vcpupin vcpu='4' cpuset='4'/>
    <vcpupin vcpu='5' cpuset='5'/>
    <vcpupin vcpu='6' cpuset='6'/>
    <vcpupin vcpu='7' cpuset='7'/>
    <vcpupin vcpu='8' cpuset='8'/>
    <vcpupin vcpu='9' cpuset='9'/>
    <vcpupin vcpu='10' cpuset='10'/>
    <vcpupin vcpu='11' cpuset='11'/>
    <vcpupin vcpu='12' cpuset='12'/>
    <vcpupin vcpu='13' cpuset='13'/>
    <vcpupin vcpu='14' cpuset='14'/>
    <vcpupin vcpu='15' cpuset='15'/>
    <emulatorpin cpuset='0-15'/>
  </cputune>

  <!-- Memory of guest NUMA cell 0 is allocated strictly from host NUMA nodes 0-3. -->
  <numatune>
    <memnode cellid='0' mode='strict' nodeset='0-3'/>
  </numatune>
  <resource>
    <partition>/machine</partition>
  </resource>

  <!-- aarch64 "virt-6.1" machine booting from disk, with QEMU_EFI.fd loaded as read-only ROM firmware. -->
  <os>
    <type arch='aarch64' machine='virt-6.1'>hvm</type>
    <loader readonly='yes' type='rom'>/usr/share/edk2/aarch64/QEMU_EFI.fd</loader>
    <boot dev='hd'/>
  </os>
  <features>
    <gic version='3'/>
  </features>

  <!-- Host CPU model is passed through; one socket of 16 single-threaded cores in a single NUMA cell. -->
  <cpu mode='host-passthrough' check='none'>
    <topology sockets='1' dies='1' clusters='1' cores='16' threads='1'/>
    <numa>
      <cell id='0' cpus='0-15' memory='16' unit='GiB'/>
    </numa>
  </cpu>
  <clock offset='utc'/>
  <on_poweroff>destroy</on_poweroff>
  <on_reboot>restart</on_reboot>
  <on_crash>destroy</on_crash>

  <devices>
    <emulator>/usr/libexec/qemu-kvm</emulator>

    <!-- System disk (virtio, qcow2), placed on PCI bus 0x01. -->
    <!-- NOTE(review): the file name below says SP1 with a "-cvm" suffix, while the guide downloads
         openEuler-24.03-LTS-SP2-aarch64.qcow2; confirm this path points at your actual image. -->
    <disk type='file' device='disk' model='virtio-non-transitional'>
      <driver name='qemu' type='qcow2' cache='none' queues='2' iommu='on'/>
      <source file='/home/test/qcow2/openEuler-24.03-LTS-SP1-aarch64-cvm.qcow2'/>
      <target dev='vda' bus='virtio'/>
      <address type='pci' domain='0x0000' bus='0x01' slot='0x00' function='0x0'/>
    </disk>

    <!-- PCIe topology: a root complex plus four root ports (controller indexes 1-4) that share
         slot 0x01 of bus 0x00 as functions 0x0-0x3. Devices below use bus 0x01 (disk),
         0x02 (passed-through device) and 0x03 (network interface). -->
    <controller type='pci' index='0' model='pcie-root'/>
    <controller type='pci' index='1' model='pcie-root-port'>
      <model name='pcie-root-port'/>
      <target chassis='1' port='0x8'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0' multifunction='on'/>
    </controller>
    <controller type='pci' index='2' model='pcie-root-port'>
      <model name='pcie-root-port'/>
      <target chassis='2' port='0x9'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x1'/>
    </controller>
    <controller type='pci' index='3' model='pcie-root-port'>
      <model name='pcie-root-port'/>
      <target chassis='3' port='0xa'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x2'/>
    </controller>
    <controller type='pci' index='4' model='pcie-root-port'>
      <model name='pcie-root-port'/>
      <target chassis='4' port='0xb'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x3'/>
    </controller>

    <!-- PL011 serial port exposed as the guest console. -->
    <serial type='pty'>
      <target type='system-serial' port='0'>
        <model name='pl011'/>
      </target>
    </serial>
    <console type='pty'>
      <target type='serial' port='0'/>
    </console>
    <audio id='1' type='none'/>

    <!-- Host PCI device passed through with the vfio driver; appears in the guest on bus 0x02. -->
    <hostdev mode='subsystem' type='pci' managed='yes'>
      <driver name='vfio'/>
      <source>
        <!-- Put the BDF reported by "lspci |grep -i nvidia" on the host here. -->
        <address domain='0x0000' bus='0xab' slot='0x00' function='0x0'/>
      </source>
      <address type='pci' domain='0x0000' bus='0x02' slot='0x00' function='0x0'/>
    </hostdev>

    <!-- virtio network interface attached to the host bridge virbr0; appears on bus 0x03. -->
    <interface type='bridge'>
      <mac address='52:54:00:d0:a3:be'/>
      <source bridge='virbr0'/>
      <model type='virtio-non-transitional'/>
      <driver queues='4' iommu='on'/>
      <address type='pci' domain='0x0000' bus='0x03' slot='0x00' function='0x0'/>
    </interface>
  </devices>

  <!-- Launch-security type "cvm"; presumably this is what marks the domain as a confidential VM (confirm against the platform documentation). -->
  <launchSecurity type='cvm'/>

  <!-- Extra arguments appended to the QEMU command line: a tmm-guest object with id tmm0. -->
  <qemu:commandline>
    <qemu:arg value='-object'/>
    <qemu:arg value='tmm-guest,id=tmm0,sve-vector-length=128,num-pmu-counters=1'/>
  </qemu:commandline>
</domain>
virsh define cvm-gpu.xml # 定义机密虚机。
virsh start cvm-gpu --console # 输入账号密码后,进入机密虚机。
五 机密虚机内vLLM推理环境搭建
1. 驱动下载、安装
2. 计算平台下载、安装
3. conda 下载、安装
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh
bash Miniconda3-latest-Linux-aarch64.sh
4. 安装python3.10
conda create -n py310 python=3.10
conda activate py310
5. vLLM 安装
wget https://download.pytorch.org/whl/nightly/cu126/torch-2.6.0.dev20250104%2Bcu126-cp310-cp310-linux_aarch64.whl
pip install torch-2.6.0.dev20250104+cu126-cp310-cp310-linux_aarch64.whl
pip install compressed_tensors
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout v0.6.6
python use_existing_torch.py
pip install -r requirements-build.txt
pip install -e . --no-build-isolation
六 模型推理测试
python3 benchmarks/benchmark_throughput.py --dataset /home/test/ShareGPT_V3_unfiltered_cleaned_split.json --model /home/test/Llama-2-7B-hf/ -tp 1