前言
PyTorch
的官网地址为:https://pytorch.org/ 。
PyTorch Tutorials
的地址为:https://pytorch.org/tutorials/ 。
本篇博客代码仓库: https://github.com/LuYF-Lemon-love/susu-cuda-example 。
操作系统:Windows 10 专业版
参考文档
PyTorch自定义CUDA算子教程与运行时间分析
详解PyTorch编译并调用自定义CUDA算子的三种方式
三分钟教你如何PyTorch自定义反向传播
内容
文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 $ tree -L 2 . ├── 01-just-in-time.py ├── 02-setuptools.py ├── 03-train.py ├── env │ ├── bin │ ├── include │ ├── lib │ ├── lib64 -> lib │ ├── pyvenv.cfg │ └── share ├── include │ └── add2.h ├── kernel │ ├── add2.cpp │ └── add2_kernel.cu ├── LICENSE ├── README.md ├── run.sh └── setup.py 8 directories, 11 files $
安装
创建虚拟环境:
1 2 3 4 python -m venv env source env/bin/activate which python pip install --upgrade pip
使用 pip 安装依赖:
1 2 3 pip install torch -i https://pypi.tuna.tsinghua.edu.cn/simple pip install numpy -i https://pypi.tuna.tsinghua.edu.cn/simple pip install ninja -i https://pypi.tuna.tsinghua.edu.cn/simple
CUDA算子实现
新建 include/add2.h
。
1 2 3 4 void launch_add2 (float *c, const float *a, const float *b, int n) ;
新建 kernel/add2_kernel.cu
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 __global__ void add2_kernel (float * c, const float * a, const float * b, int n) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i < n; i += gridDim.x * blockDim.x) { c[i] = a[i] + b[i]; } } void launch_add2 (float * c, const float * a, const float * b, int n) { dim3 grid ((n + 1023 ) / 1024 ) ; dim3 block (1024 ) ; add2_kernel<<<grid, block>>>(c, a, b, n); }
Torch C++封装
新建 kernel/add2.cpp
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 #include <torch/extension.h> #include "add2.h" void torch_launch_add2 (torch::Tensor &c, const torch::Tensor &a, const torch::Tensor &b, int n) { launch_add2 ((float *)c.data_ptr (), (const float *)a.data_ptr (), (const float *)b.data_ptr (), n); } PYBIND11_MODULE (TORCH_EXTENSION_NAME, m) { m.def ("torch_launch_add2" , &torch_launch_add2, "add2 kernel warpper" ); }
Python调用
新建 01-just-in-time.py
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 import timeimport numpy as npimport torchfrom torch.utils.cpp_extension import loadcuda_module = load(name="add2" , extra_include_paths=["include" ], sources=["kernel/add2.cpp" , "kernel/add2_kernel.cu" ], verbose=True ) n = 1024 * 1024 a = torch.rand(n, device="cuda:0" ) b = torch.rand(n, device="cuda:0" ) cuda_c = torch.rand(n, device="cuda:0" ) ntest = 10 def show_time (func ): times = list () res = list () for _ in range (10 ): func() for _ in range (ntest): torch.cuda.synchronize(device="cuda:0" ) start_time = time.time() r = func() torch.cuda.synchronize(device="cuda:0" ) end_time = time.time() times.append((end_time-start_time)*1e6 ) res.append(r) return times, res def run_cuda (): cuda_module.torch_launch_add2(cuda_c, a, b, n) return cuda_c def run_torch (): a + b return None print ("Running cuda..." )cuda_time, _ = show_time(run_cuda) print ("Cuda time: {:.3f}us" .format (np.mean(cuda_time)))print ("Running torch..." )torch_time, _ = show_time(run_torch) print ("Torch time: {:.3f}us" .format (np.mean(torch_time)))
运行 01-just-in-time.py
。
1 2 3 4 5 6 7 8 9 10 11 12 13 $ python 01-just-in-time.py Using /home/luyanfeng/.cache/torch_extensions/py310_cu117 as PyTorch extensions root... Detected CUDA files, patching ldflags Emitting ninja build file /home/luyanfeng/.cache/torch_extensions/py310_cu117/add2/build.ninja... Building extension module add2... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) ninja: no work to do. Loading extension module add2... Running cuda... Cuda time: 14.591us Running torch... Torch time: 14.329us $
上述过程使用的是即时编译
(动态编译)编译C++文件,需要安装 ninja
。
新建 setup.py
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 from setuptools import setupfrom torch.utils.cpp_extension import BuildExtension, CUDAExtensionsetup( name="add2" , include_dirs=["include" ], ext_modules=[ CUDAExtension( "add2" , ["kernel/add2.cpp" , "kernel/add2_kernel.cu" ], ) ], cmdclass={ "build_ext" : BuildExtension } )
新建 02-setuptools.py
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 import timeimport numpy as npimport torchimport add2n = 1024 * 1024 a = torch.rand(n, device="cuda:0" ) b = torch.rand(n, device="cuda:0" ) cuda_c = torch.rand(n, device="cuda:0" ) ntest = 10 def show_time (func ): times = list () res = list () for _ in range (10 ): func() for _ in range (ntest): torch.cuda.synchronize(device="cuda:0" ) start_time = time.time() r = func() torch.cuda.synchronize(device="cuda:0" ) end_time = time.time() times.append((end_time-start_time)*1e6 ) res.append(r) return times, res def run_cuda (): add2.torch_launch_add2(cuda_c, a, b, n) return cuda_c def run_torch (): a + b return None print ("Running cuda..." )cuda_time, _ = show_time(run_cuda) print ("Cuda time: {:.3f}us" .format (np.mean(cuda_time)))print ("Running torch..." )torch_time, _ = show_time(run_torch) print ("Torch time: {:.3f}us" .format (np.mean(torch_time)))
运行 02-setuptools.py
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 $ python setup.py install running install /home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/setuptools/command/install.py:34: SetuptoolsDeprecationWarning: setup.py install is deprecated. Use build and pip and other standards-based tools. warnings.warn( /home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/setuptools/command/easy_install.py:144: EasyInstallDeprecationWarning: easy_install command is deprecated. Use build and pip and other standards-based tools. warnings.warn( running bdist_egg running egg_info creating add2.egg-info writing add2.egg-info/PKG-INFO writing dependency_links to add2.egg-info/dependency_links.txt writing top-level names to add2.egg-info/top_level.txt writing manifest file 'add2.egg-info/SOURCES.txt' reading manifest file 'add2.egg-info/SOURCES.txt' adding license file 'LICENSE' writing manifest file 'add2.egg-info/SOURCES.txt' installing library code to build/bdist.linux-x86_64/egg running install_lib running build_ext /home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/utils/cpp_extension.py:388: UserWarning: The detected CUDA version (11.5) has a minor version mismatch with the version that was used to compile PyTorch (11.7). Most likely this shouldn't be a problem. warnings.warn(CUDA_MISMATCH_WARN.format(cuda_str_version, torch.version.cuda)) building 'add2' extension creating /home/luyanfeng/my_code/github/susu-cuda-example/build creating /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310 creating /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310/kernel Emitting ninja build file /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310/build.ninja... Compiling objects... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) [1/2] /usr/bin/nvcc -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include/TH -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include/THC -I/home/luyanfeng/my_code/github/susu-cuda-example/include -I/home/luyanfeng/my_code/github/susu-cuda-example/env/include -I/home/luyanfeng/miniconda3/include/python3.10 -c -c /home/luyanfeng/my_code/github/susu-cuda-example/kernel/add2_kernel.cu -o /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310/kernel/add2_kernel.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=add2 -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -std=c++17 [2/2] c++ -MMD -MF /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310/kernel/add2.o.d -pthread -B /home/luyanfeng/miniconda3/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /home/luyanfeng/miniconda3/include -fPIC -O2 -isystem /home/luyanfeng/miniconda3/include -fPIC -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include/TH -I/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/include/THC -I/home/luyanfeng/my_code/github/susu-cuda-example/include -I/home/luyanfeng/my_code/github/susu-cuda-example/env/include -I/home/luyanfeng/miniconda3/include/python3.10 -c -c /home/luyanfeng/my_code/github/susu-cuda-example/kernel/add2.cpp -o /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310/kernel/add2.o -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=add2 -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++17 creating build/lib.linux-x86_64-cpython-310 g++ -pthread -B /home/luyanfeng/miniconda3/compiler_compat -shared -Wl,-rpath,/home/luyanfeng/miniconda3/lib -Wl,-rpath-link,/home/luyanfeng/miniconda3/lib -L/home/luyanfeng/miniconda3/lib -Wl,-rpath,/home/luyanfeng/miniconda3/lib -Wl,-rpath-link,/home/luyanfeng/miniconda3/lib -L/home/luyanfeng/miniconda3/lib /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310/kernel/add2.o /home/luyanfeng/my_code/github/susu-cuda-example/build/temp.linux-x86_64-cpython-310/kernel/add2_kernel.o -L/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/torch/lib -L/usr/lib64 -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-cpython-310/add2.cpython-310-x86_64-linux-gnu.so creating build/bdist.linux-x86_64 creating build/bdist.linux-x86_64/egg copying build/lib.linux-x86_64-cpython-310/add2.cpython-310-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg creating stub loader for add2.cpython-310-x86_64-linux-gnu.so byte-compiling build/bdist.linux-x86_64/egg/add2.py to add2.cpython-310.pyc creating build/bdist.linux-x86_64/egg/EGG-INFO copying add2.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO copying add2.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO copying add2.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO copying add2.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt zip_safe flag not set; analyzing archive contents... __pycache__.add2.cpython-310: module references __file__ creating dist creating 'dist/add2-0.0.0-py3.10-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it removing 'build/bdist.linux-x86_64/egg' (and everything under it) Processing add2-0.0.0-py3.10-linux-x86_64.egg removing '/home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/add2-0.0.0-py3.10-linux-x86_64.egg' (and everything under it) creating /home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/add2-0.0.0-py3.10-linux-x86_64.egg Extracting add2-0.0.0-py3.10-linux-x86_64.egg to /home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages add2 0.0.0 is already the active version in easy-install.pth Installed /home/luyanfeng/my_code/github/susu-cuda-example/env/lib/python3.10/site-packages/add2-0.0.0-py3.10-linux-x86_64.egg Processing dependencies for add2==0.0.0 Finished processing dependencies for add2==0.0.0 $ python 02-setuptools.py Running cuda... Cuda time: 10.347us Running torch... Torch time: 12.517us $
训练模型
之前我们实现了一个a + b a+b a + b 的tensor求和cuda算子,于是我们可以利用它来实现L = a 2 + b 2 \mathcal{L}=a^2+b^2 L = a 2 + b 2 。
最终训练收敛后a和b都会趋近于0,模型没有输入,只有两个可训练的参数a和b。
新建 03-train.py
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 import torchfrom torch import nnfrom torch.utils.cpp_extension import loadfrom torch.autograd import Functioncuda_module = load(name="add2" , extra_include_paths=["include" ], sources=["kernel/add2.cpp" , "kernel/add2_kernel.cu" ], verbose=True ) class AddModel (nn.Module): def __init__ (self, n ): super (AddModel, self).__init__() self.n = n self.a = nn.Parameter(torch.Tensor(self.n)) self.b = nn.Parameter(torch.Tensor(self.n)) self.a.data.normal_(mean=0.0 , std=1.0 ) self.b.data.normal_(mean=0.0 , std=1.0 ) def forward (self ): a2 = torch.square(self.a) b2 = torch.square(self.b) c = AddModelFunction.apply(a2, b2, self.n) return c class AddModelFunction (Function ): @staticmethod def forward (ctx, a, b, n ): c = torch.empty(n).to(device="cuda:0" ) cuda_module.torch_launch_add2(c, a, b, n) return c @staticmethod def backward (ctx, grad_output ): return (grad_output, grad_output, None ) n = 1000000 model = AddModel(n) model.to(device="cuda:0" ) opt = torch.optim.SGD(model.parameters(), lr=0.01 ) for epoch in range (50000 ): opt.zero_grad() output = model() loss = output.sum () loss.backward() opt.step() if epoch % 25 == 0 : print ("epoch {:>3d}: loss = {:>8.3f}" .format (epoch, loss))
训练模型:
其他教程
https://godweiyang.com/2021/03/28/nn-cuda-example/
https://godweiyang.com/2021/01/25/cuda-reading/
结语
第八十二篇博文写完,开心!!!!
今天,也是充满希望的一天。