PyTorch使能HPCKit进行编译安装
发表于 2025/09/23
0
众所周知,PyTorch除了可以直接下载软件包进行安装外,为获取最优性能往往使用源码进行编译安装,其性能强依赖于编译器、数学库等软件的性能。在鲲鹏平台上,使用HPCKit进行编译安装,通过内置毕昇编译器、鲲鹏数学库(Kunpeng Math Library,KML)加速PyTorch,以获得在鲲鹏处理器上的最优性能。
1、环境信息
硬件环境:鲲鹏服务器
操作系统 :openEuler 22.03 LTS SP4/SP3/SP2、openEuler 20.03 LTS SP3、麒麟V10 SP2/HPC版、麒麟信安V3.5.2 /V3.5.3等操作系统
HPCKit版本:25.1.RC1及以上
2、前置环境配置
2.1 HPCKit安装
HPCKit安装包下载链接:https://www.hikunpeng.com/developer/hpc/hpckit-download
安装过程参考HPCKit文档:https://www.hikunpeng.com/document/detail/zh/kunpenghpcs/hpckit/instg/KunpengHPCKit_install_002.html
2.2、PyTorch源码下载
在github上下载Pytorch:
git clone -b v2.5.0 --depth=1 --recursive https://github.com/pytorch/pytorch.git
cd pytorch
git submodule update --init --recursive
3、Pytorch链接KML步骤
3.1 PyTorch使能毕昇编译器
毕昇编译器升级4.0.0以后对代码要求变严格,原版的Pytorch脚本中部分代码书写不规范,新版LLVM会导致报错,因此需要将非ASCII字符清理掉,并避免编译告警。
在
tools/setup_helpers/cmake.py
文件的def get_cmake_cache_variables(self) -> dict[str, CMakeValue]
函数中添加以下代码:with open(self._cmake_cache_file, 'rb') as f:
content = f.read()
content = bytes(i for i in content if i < 128)
with open(self._cmake_cache_file, 'wb') as f:
f.write(content)
在
tools/setup_helpers/env.py
的def __init__(self, cmake_build_type_env: str | None = None) -> None
函数里,针对if os.path.isfile(cmake_cache_txt)
分支判断逻辑下添加以下代码:with open(cmake_cache_txt, 'rb') as f:
content = f.read()
content = bytes(i for i in content if i < 128)
with open(cmake_cache_txt, 'wb') as f:
f.write(content)
3.2、Pytorch使能KML
3.2.1、kblas替换openblas
1. 在
cmake/Modules/
新建FindKML.cmake
文件,文件内容如下:
# FindKML.cmake - locate the KBLAS header and library of the Kunpeng Math
# Library (KML).
#
# Input:
#   KMLROOT (environment variable, read at configure time) - KML install root.
# Result variables:
#   KML_FOUND       - ON when both kblas.h and the kblas library were found
#   KML_INCLUDE_DIR - directory that contains kblas.h
#   KML_LIB         - full path of the kblas library

# With KMLROOT unset the search paths below collapse to "/include" and
# "/lib/sme/kblas/multi/", so tell the user why the lookup is likely to fail.
if(NOT DEFINED ENV{KMLROOT} AND NOT KML_FIND_QUIETLY)
  message(STATUS "KMLROOT environment variable is not set")
endif()

set(KML_BLAS_INCLUDE_SEARCH_PATHS
  $ENV{KMLROOT}/include
)
# NOTE(review): only the lib/sme/kblas/multi directory is searched; confirm
# this is the kblas variant that matches the target CPU.
set(KML_BLAS_LIB_SEARCH_PATHS
  $ENV{KMLROOT}/lib/sme/kblas/multi/
)

find_path(KML_INCLUDE_DIR NAMES kblas.h PATHS ${KML_BLAS_INCLUDE_SEARCH_PATHS})
find_library(KML_LIB NAMES kblas PATHS ${KML_BLAS_LIB_SEARCH_PATHS})

# Assume success, then clear the flag for each missing piece.
set(KML_FOUND ON)

# Check include files
if(NOT KML_INCLUDE_DIR)
  set(KML_FOUND OFF)
  message(STATUS "Could not find KBLAS include. Turning KML_FOUND off")
endif()

# Check libraries
if(NOT KML_LIB)
  set(KML_FOUND OFF)
  message(STATUS "Could not find KBLAS lib. Turning KML_FOUND off")
endif()

if(KML_FOUND)
  if(NOT KML_FIND_QUIETLY)
    message(STATUS "Found KBLAS libraries: ${KML_LIB}")
    message(STATUS "Found KBLAS include: ${KML_INCLUDE_DIR}")
  endif()
elseif(KML_FIND_REQUIRED)
  # find_package(KML REQUIRED) must abort the configure step on failure.
  message(FATAL_ERROR "Could not find KBLAS")
endif()

# Only the two cache entries created by find_path/find_library are listed.
mark_as_advanced(KML_INCLUDE_DIR KML_LIB)
2. 在
cmake/Dependencies.cmake
文件中添加BLAS对KML引用的索引,在if (BLAS STREQUAL "Eigen")
分支判断逻辑下添加以下代码:
elseif(BLAS STREQUAL "KML")
  # Resolve KML through cmake/Modules/FindKML.cmake; REQUIRED aborts the
  # configure step when the header or library is missing.
  find_package(KML REQUIRED)
  include_directories(SYSTEM ${KML_INCLUDE_DIR})
  list(APPEND Caffe2_DEPENDENCY_LIBS ${KML_LIB})
  # Lowercase on purpose: the FindLAPACK.cmake addition tests
  # BLAS_INFO STREQUAL "kml", and STREQUAL is case-sensitive.
  set(BLAS_INFO "kml")
  set(BLAS_FOUND 1)
  set(BLAS_LIBRARIES ${KML_LIB})
3.2.2、klapack替换lapack
在
cmake/Modules/FindLAPACK.cmake
中添加新KML后端,添加以下代码:
# KML backend: tried only when no LAPACK has been selected yet and the BLAS
# in use is KML. Search hints come from the KML_LAPACK_INCLUDE and
# KML_LAPACK_LIB environment variables, read at configure time.
# BLAS_INFO is lowercased first so that both "KML" and "kml" are accepted.
string(TOLOWER "${BLAS_INFO}" _kml_blas_info)
if((NOT LAPACK_INFO) AND (_kml_blas_info STREQUAL "kml"))
  set(_kml_include $ENV{KML_LAPACK_INCLUDE})
  set(_kml_lib $ENV{KML_LAPACK_LIB})
  find_path(LAPACK_INCLUDE_DIR
    NAMES lapacke.h
    HINTS ${_kml_include}
  )
  find_library(LAPACK_LIBRARY
    NAMES klapack_full km
    HINTS ${_kml_lib} ${_kml_lib}/../noarch/
  )
  # Report success only when both the header and a library were located.
  if(LAPACK_INCLUDE_DIR AND LAPACK_LIBRARY)
    set(LAPACK_LIBRARIES ${LAPACK_LIBRARY})
    set(LAPACK_INFO "kml")
    set(LAPACK_FOUND TRUE)
    message(STATUS "Found kml LAPACK at ${_kml_lib}")
  endif()
endif()
4、Pytorch编译步骤
4.1、加载HPCKit环境
假设HPCKit安装路径为 /path-to-HPCKit/,执行下述命令进行加载环境变量:
source /path-to-HPCKit/HPCKit/latest/modulefiles/kml/bisheng/env/setvars.sh
source /path-to-HPCKit/HPCKit/latest/modulefiles/compiler/bisheng/env/setvars.sh
4.2、下载PyTorch依赖项
pip install -r requirements.txt
4.3、编译PyTorch
编译指令如下:
# BLAS=KML selects the elseif(BLAS STREQUAL "KML") branch added to
# cmake/Dependencies.cmake; KML_LAPACK_LIB / KML_LAPACK_INCLUDE are the search
# hints read by the KML backend added to cmake/Modules/FindLAPACK.cmake.
SVE_VECBITS=512 DEBUG=0 USE_MKLDNN=0 USE_CUDA=0 BUILD_TEST=0 USE_FBGEMM=0 \
USE_NNPACK=1 USE_QNNPACK=0 USE_XNNPACK=0 USE_HBM=0 \
BLAS=KML USE_LAPACK=1 \
KML_LAPACK_LIB=/path/to/HPCKit/latest/kml/bisheng/lib/sve512/ \
KML_LAPACK_INCLUDE=/path/to/HPCKit/latest/kml/bisheng/include \
python ./setup.py develop
5、Pytorch性能测试
5.1、新增测试文件linear_test.py,代码如下:
import torch
import time
def measure_cpu_matmul_time(size: int = 9728, warmup: int = 3, repeats: int = 10):
    """Measure the average wall-clock time of a size x size matmul on the CPU.

    Two random square matrices are created with ``torch.randn``. ``torch.mm``
    is first run ``warmup`` times without timing, then ``repeats`` times with
    each call timed by ``time.perf_counter``. The mean is printed and returned.

    Args:
        size: Number of rows and columns of both operand matrices.
        warmup: Number of untimed multiplications executed first.
        repeats: Number of timed multiplications; must be at least 1.

    Returns:
        The mean duration of one multiplication, in seconds.

    Raises:
        ValueError: If ``repeats`` is smaller than 1.
    """
    # Without this guard an empty sample list fails later with an opaque
    # ZeroDivisionError when the average is computed.
    if repeats < 1:
        raise ValueError("repeats must be >= 1")

    # Random operands created on the CPU.
    A = torch.randn(size, size)
    B = torch.randn(size, size)

    # Warm-up runs, excluded from the measurement.
    for _ in range(warmup):
        torch.mm(A, B)

    # Timed runs.
    times = []
    for _ in range(repeats):
        t0 = time.perf_counter()
        torch.mm(A, B)
        t1 = time.perf_counter()
        times.append(t1 - t0)

    avg_time = sum(times) / len(times)
    print(f"[CPU] 矩阵 {size}×{size} 相乘平均耗时:{avg_time:.6f} 秒 (over {repeats} runs, warmup={warmup})")
    return avg_time


if __name__ == "__main__":
    measure_cpu_matmul_time()
5.3、HPCKit版本性能测试
按照以下流程执行程序:
开启大页内存:
for i in $(seq 16 31); do echo 2044 > /sys/devices/system/node/node${i}/hugepages/hugepages-2048kB/nr_hugepages; done
设置KML后端:
export KML_BLAS_THREAD_TYPE=OMP
限制线程数:
export OMP_NUM_THREADS=38
执行测试程序:
taskset -c 0-37 numactl -m 16 python linear_test.py
执行结果:
[CPU] 矩阵8192×8192 相乘平均耗时: 0.2189695秒 (over 10 runs, warmup=3)
至此,Pytorch使能HPCKit编译安装就完成了,大家可以对比软件包安装的版本进行性能对比测试。