Following a previous question (Performing high number of 4x4 matrix inversion - PyCuda) about inverting a large number of 4x4 matrices, I would like to do the same with 3x3 matrices.
Solution 1:
This answer will closely follow my answer on the 4x4 invert question, both in terms of answer layout and calculation method/kernel design. The formulas are described here.
First, as before, we will show a CUDA C++ version with comparison to cublas:
$ cat t432.cu
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>

#include <cublas_v2.h>

// 3x3 matrix inversion
// 9 threads per matrix to invert
// 32 matrices per 288 thread block
const unsigned block_size = 288;
typedef double mt;
// Reads (and clears) the last CUDA runtime error. If one is pending, prints
// `msg` together with the CUDA error string and the source location to
// stderr, then terminates the process with exit code 1.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Wall-clock helper based on gettimeofday().
// Returns the current time in microseconds minus `start`:
//   - dtime_usec(0) yields an absolute timestamp,
//   - dtime_usec(t0) yields the microseconds elapsed since timestamp t0.
long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
// Per-lane operand table used by inv3x3.
//
// Entry k belongs to the thread that produces output element k (0..8) of a
// matrix. It packs four 4-bit indices (0..8) into the matrix's 9 elements,
// consumed low nibble first by getoff() as i0, i1, i2, i3. The kernel then
// forms  m[i0]*m[i1] - m[i2]*m[i3].
// Example: 0x07584 -> 4, 8, 5, 7 -> m[4]*m[8] - m[5]*m[7].
//
// `pat` is the device-side copy; the host must fill it from `hpat`
// (cudaMemcpyToSymbol) before launching the kernel.
__device__ unsigned pat[9];
const unsigned hpat[9] = {0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140};
// Pops the lowest 4-bit field from a packed offset word.
//
// Returns the low nibble of `off` and shifts `off` right by 4 bits, so that
// successive calls walk through the packed fields from least to most
// significant. `off` is modified in place.
// Marked __host__ as well so the packed tables can be decoded on the host.
__host__ __device__ unsigned getoff(unsigned &off){
    unsigned ret = off & 0x0F;
    off >>= 4;
    return ret;
}
// Batched 3x3 matrix inversion: each output element is a 2x2 cofactor
// divided by the determinant.
//
// Layout: matrices are stored contiguously, 9 elements each; one thread
// handles one matrix element, so 9 consecutive threads cooperate on a matrix.
//
// Launch requirements:
//   - 1-D blocks of exactly block_size threads (the shared buffer `si` has
//     block_size entries and is indexed by threadIdx.x),
//   - a 1-D grid providing at least n*9 threads,
//   - `pat` must have been filled from `hpat` before the launch.
//
// Parameters:
//   in   input matrices, n*9 elements
//   out  inverted matrices, n*9 elements
//   n    number of matrices
//
// There is no singularity check: a zero determinant produces inf/NaN.
//
// in-place is acceptable (i.e. out == in)
// T = float or double only
template <typename T>
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n){
    __shared__ T si[block_size];

    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    // Tail threads past the data load a dummy 1 so that they can still take
    // part in every barrier below.
    T det = 1;
    if (idx < n*9)
        det = in[idx];
    unsigned sibase = (threadIdx.x / 9)*9;
    unsigned lane = threadIdx.x - sibase; // cheaper modulo
    si[threadIdx.x] = det;
    // The 9 threads of a matrix can belong to different warps, so a
    // block-wide barrier is needed before reading the other threads' elements.
    __syncthreads();

    // a = m[i0]*m[i1] - m[i2]*m[i3], with the indices unpacked from pat[lane].
    unsigned off = pat[lane];
    T a = si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    T b = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    a -= b;
    // Every thread must have finished reading the original elements before
    // slots 3..5 are recycled below.
    __syncthreads();

    // Lanes 0, 3 and 6 publish their cofactors, which pair with the first
    // row (slots 0..2) in the determinant expansion.
    if (lane == 0) si[sibase+3] = a;
    if (lane == 3) si[sibase+4] = a;
    if (lane == 6) si[sibase+5] = a;
    __syncthreads();

    det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
    if (idx < n*9)
        out[idx] = a / det;
}
// Number of matrices to invert; can be overridden by the first command-line
// argument.
size_t nr = 2048;

// Benchmark driver.
//
// Fills a batch of `nr` 3x3 matrices (alternating identity / test matrix),
// inverts it in place with the inv3x3 kernel, then inverts the same input
// with cublasDmatinvBatched. For each method it prints the first results
// next to the expected inverses, followed by the elapsed wall-clock time.
// Returns 0 on success, 1 on invalid arguments or allocation/cuBLAS failure;
// CUDA runtime errors abort through cudaCheckErrors.
int main(int argc, char *argv[]){
    if (argc > 1) {
        const long requested = atol(argv[1]);
        if (requested <= 0) {
            fprintf(stderr, "matrix count must be a positive integer\n");
            return 1;
        }
        nr = (size_t)requested;
    }

    const mt m2[] = {1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0};
    const mt i2[] = {2.0, 0.0, -1.0, -1.0, -0.33333334, 1.0, 0.0, 0.33333334, 0.0};
    const mt m1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};
    const mt i1[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};

    const size_t bytes = nr*9*sizeof(mt);
    // Never print more matrices than the batch actually holds.
    const size_t nshow = (nr < 2) ? nr : 2;

    mt *h_d, *d_d;
    h_d = (mt *)malloc(bytes);
    if (h_d == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    cudaMalloc(&d_d, bytes);
    cudaCheckErrors("cudaMalloc of matrix buffer failed");
    cudaMemcpyToSymbol(pat, hpat, 9*sizeof(unsigned));
    cudaCheckErrors("cudaMemcpyToSymbol of pattern table failed");

    // Even slots hold m1, odd slots hold m2; filling per matrix also covers
    // the last slot when nr is odd.
    for (size_t i = 0; i < nr; i++)
        memcpy(h_d+i*9, (i%2 == 0) ? m1 : m2, 9*sizeof(mt));
    cudaMemcpy(d_d, h_d, bytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("host-to-device copy failed");

    long long t = dtime_usec(0);
    inv3x3<<<(nr*9 + block_size - 1)/block_size, block_size>>>(d_d, d_d, nr);
    // Kernel launches are asynchronous: wait for completion so that the timer
    // covers the execution and not just the launch.
    cudaDeviceSynchronize();
    t = dtime_usec(t);
    cudaCheckErrors("inv3x3 kernel failed");

    cudaMemcpy(h_d, d_d, bytes, cudaMemcpyDeviceToHost);
    cudaCheckErrors("device-to-host copy failed");
    for (size_t i = 0; i < nshow; i++){
        for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
        std::cout << std::endl;
        for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
        std::cout << std::endl;}
    std::cout << "kernel time: " << t << " microseconds" << std::endl;

    // cublas
    // The kernel ran in place, so restore the input data first.
    for (size_t i = 0; i < nr; i++)
        memcpy(h_d+i*9, (i%2 == 0) ? m1 : m2, 9*sizeof(mt));
    cudaMemcpy(d_d, h_d, bytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("host-to-device copy failed");

    cublasHandle_t h;
    cublasStatus_t cs = cublasCreate(&h);
    if (cs != CUBLAS_STATUS_SUCCESS) {
        std::cout << "cublas create error" << std::endl;
        return 1;
    }

    // The batched API takes device arrays of per-matrix device pointers.
    mt **A, **Ai, *Aid, **Ap, **Aip;
    A = (mt **)malloc(nr*sizeof(mt *));
    Ai = (mt **)malloc(nr*sizeof(mt *));
    if (A == NULL || Ai == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    cudaMalloc(&Aid, bytes);
    cudaMalloc(&Ap, nr*sizeof(mt *));
    cudaMalloc(&Aip, nr*sizeof(mt *));
    cudaCheckErrors("cudaMalloc of cublas buffers failed");
    for (size_t i = 0; i < nr; i++) A[i] = d_d + 9*i;
    for (size_t i = 0; i < nr; i++) Ai[i] = Aid + 9*i;
    cudaMemcpy(Ap, A, nr*sizeof(mt *), cudaMemcpyHostToDevice);
    cudaMemcpy(Aip, Ai, nr*sizeof(mt *), cudaMemcpyHostToDevice);
    cudaCheckErrors("pointer array copy failed");
    int *info;
    cudaMalloc(&info, nr*sizeof(int));
    cudaCheckErrors("cudaMalloc of info array failed");

    t = dtime_usec(0);
    // Double-precision entry point: matches mt == double.
    cs = cublasDmatinvBatched(h, 3, Ap, 3, Aip, 3, info, nr);
    cudaDeviceSynchronize();  // same reasoning as for the kernel timing above
    t = dtime_usec(t);
    if (cs != CUBLAS_STATUS_SUCCESS) std::cout << "cublas matinv error" << std::endl;
    cudaCheckErrors("cublas matinv execution failed");

    cudaMemcpy(h_d, Aid, bytes, cudaMemcpyDeviceToHost);
    cudaCheckErrors("device-to-host copy failed");
    for (size_t i = 0; i < nshow; i++){
        for (int j = 0; j < 9; j++) std::cout << h_d[i*9 + j] << ",";
        std::cout << std::endl;
        for (int j = 0; j < 9; j++) std::cout << ((i==0)?i1[j]:i2[j]) << ",";
        std::cout << std::endl;}
    std::cout << "cublas time: " << t << " microseconds" << std::endl;

    // Release everything acquired above.
    cublasDestroy(h);
    cudaFree(info);
    cudaFree(Aip);
    cudaFree(Ap);
    cudaFree(Aid);
    cudaFree(d_d);
    free(Ai);
    free(A);
    free(h_d);
    return 0;
}
$ nvcc -o t432 t432.cu -lcublas
$ ./t432
kernel time: 59 microseconds
cublas time: 68 microseconds
So this is perhaps slightly faster than cublas but not much, for this 2048 matrix test case, CUDA 10.0, Tesla P100, linux.
Similar to the previous answer, here is a simplified (only 2 matrices) pycuda test case:
$ cat
import numpy as np
# import matplotlib.pyplot as plt
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit

# kernel
# CUDA source for batched 3x3 inversion: 9 threads cooperate on one matrix,
# 32 matrices per 288-thread block. The per-lane operand table is passed in
# through the `pat` kernel argument.
kernel = SourceModule("""

// Returns the low 4 bits of off and shifts off right by 4 (off is updated).
__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off >>= 4;
  return ret;
}

// in-place is acceptable (i.e. out == in)
// T = float or double only
const int block_size = 288;
typedef double T; // *** can set to float or double

// Must be launched with 1-D blocks of block_size threads and at least n*9
// threads in total; pat holds 9 packed operand patterns, one per lane.
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){

  __shared__ T si[block_size];

  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  T det = 1;
  if (idx < n*9)
    det = in[idx];
  unsigned sibase = (threadIdx.x / 9)*9;
  unsigned lane = threadIdx.x - sibase; // cheaper modulo
  si[threadIdx.x] = det;
  // the 9 threads of a matrix may span two warps: block-wide barrier needed
  __syncthreads();
  unsigned off = pat[lane];
  T a  = si[sibase + getoff(off)];
  a   *= si[sibase + getoff(off)];
  T b  = si[sibase + getoff(off)];
  b   *= si[sibase + getoff(off)];
  a -= b;
  // all reads of the original elements must finish before slots 3..5 are reused
  __syncthreads();
  if (lane == 0) si[sibase+3] = a;
  if (lane == 3) si[sibase+4] = a;
  if (lane == 6) si[sibase+5] = a;
  __syncthreads();
  det =  si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
  if (idx < n*9)
    out[idx] = a / det;
}

""")
# host code
def gpuinv3x3(inp, n):
    """Invert n 3x3 matrices on the GPU with the inv3x3 kernel.

    Args:
        inp: the matrices as n*9 values, 9 consecutive values per matrix.
        n: number of matrices.

    Returns:
        A 1-D float64 numpy array of n*9 values holding the inverses in the
        same layout as the input.

    Raises:
        ValueError: if inp does not contain exactly n*9 values.
    """
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
    # Convert parameters into numpy array
    # *** change next line between float32 and float64 to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # The kernel reads n*9 input values; a shorter buffer would be read out of bounds.
    if inpd.size != n*9:
        raise ValueError("expected %d input values for %d matrices, got %d" % (n*9, n, inpd.size))
    # *** change next line between float32 and float64 to match float or double
    output = np.empty((n*9), dtype= np.float64)
    if n == 0:
        # nothing to invert
        return output
    # Get kernel function
    matinv3x3 = kernel.get_function("inv3x3")
    # Define block, grid and compute
    blockDim = (288,1,1) # do not change
    # 32 matrices per block; integer ceiling division keeps the grid size an int
    gridDim = (int((n + 31)//32),1,1)
    # Kernel function
    matinv3x3 (
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
# Two test matrices, 9 values each: a general matrix followed by the identity.
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
# Show the inverses as n separate 3x3 matrices.
print(result.reshape(n, 3, 3))
$ python
[[[ 2.         -0.         -1.        ]
  [-1.         -0.33333333  1.        ]
  [-0.          0.33333333 -0.        ]]

 [[ 1.          0.          0.        ]
  [ 0.          1.          0.        ]
  [ 0.          0.          1.        ]]]
The above happens to be using double (i.e. float64 in pycuda). Changing it to float (i.e. float32 in pycuda) involves changing the same 3 lines as described in this answer.
