As I told you on your earlier, almost identical question, this matrix multiply code is only designed to do calculations on matrices whose dimensions are a round multiple of block_size. If you choose block_size=32, then it can only be used for 32x32, 64x64, 96x96, 128x128, etc. Nothing you have done with dynamically allocated shared memory changes this.
To verify that this is the case, let's start with a complete, compilable repro case which will run your kernel, check whether it executed, and compare its output to a simple reference calculation done on the host. This code is your posted kernel, plus the core of your launch parameter calculations. It takes a size from the command line and then runs the case. If the results differ by more than a small relative tolerance, an assert failure will be raised. Here is the code; it should compile on CUDA 3.0 or later and run on any CUDA compatible GPU:
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
    if (code != 0) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
__global__ void matrixMul( float* C, float* A, float* B, int wA, int wB, size_t block_size)
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int aBegin = wA * block_size * by;
    int aEnd = aBegin + wA - 1;
    int aStep = block_size;
    int bBegin = block_size * bx;
    int bStep = block_size * wB;
    float Csub = 0.f;
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        extern __shared__ float smem[];
        smem[ty*block_size+tx] = A[a + wA * ty + tx];
        smem[block_size*block_size+ty*block_size+tx] = B[b + wB * ty + tx];
        __syncthreads();
        for (int k = 0; k < block_size; ++k)
            Csub += smem[ty*block_size+k] * smem[block_size*block_size+k*block_size+tx];
        __syncthreads();
    }
    int c = wB * block_size * by + block_size * bx;
    C[c + wB * ty + tx] = Csub;
}
inline float frand(){
    return (float)rand()/(float)RAND_MAX;
}

void matmul(float *C, const float *A, const float *B, int wA, int wB)
{
    for(int k=0; k<wB; k++) {
        for(int j=0; j<wB; j++) {
            float dotp = 0.f;
            for(int i=0; i<wA; i++) {
                dotp += A[j*wA+i] * B[i*wB+k];
            }
            C[j*wB+k] = dotp;
        }
    }
}
int main(int argc, char ** argv)
{
    int val = 128;
    if ( argc == 2 ) {
        val = atoi(argv[1]);
    }
    int m = val, n = val, mn = m*n;
    size_t sz = size_t(mn) * sizeof(float);
    srand(time(NULL));
    float * A = new float[mn], * B = new float[mn], * C = new float[mn];
    float * A_, * B_, * C_;
    for(int i=0; i<mn; i++) {
        A[i] = frand(); B[i] = frand();
    }
    GPUerrchk( cudaMalloc((void **)&A_, sz) );
    GPUerrchk( cudaMalloc((void **)&B_, sz) );
    GPUerrchk( cudaMalloc((void **)&C_, sz) );
    GPUerrchk( cudaMemcpy(A_, A, sz, cudaMemcpyHostToDevice) );
    GPUerrchk( cudaMemcpy(B_, B, sz, cudaMemcpyHostToDevice) );
    // Launch configuration
    // Note that the input matrix sizes *must* be a round
    // multiple of blocksize for this code to work correctly.
    const int blocksize=16;
    const size_t shmsz = size_t(2*blocksize*blocksize) * sizeof(float);
    dim3 block=dim3(blocksize,blocksize), grid = dim3(m/block.x,m/block.y);
    matrixMul<<<grid,block,shmsz>>>(C_,A_,B_,m,n,blocksize);
    GPUerrchk( cudaPeekAtLastError() );
    GPUerrchk( cudaMemcpy(C, C_, sz, cudaMemcpyDeviceToHost) );
    // Verification on host
    float * Cref = new float[mn];
    matmul(Cref,A,B,m,n);
    const float tol = 5e-5f;
    for(int i=0; i<mn; i++) {
        assert(fabs(C[i]-Cref[i])/C[i] < tol);
    }
    GPUerrchk( cudaThreadExit() ); // CUDA 3.2 compatible
    return 0;
}
So now, let's run this code for different sizes. To verify that the code on the GPU didn't do anything wrong, I will run it under the cuda-memcheck utility, which can detect out-of-bounds memory accesses. All of the following tests were made on an OS X 10.6 machine with a compute capability 1.2 card and CUDA 3.2, using blocksize=16:
$ nvcc -arch=sm_12 -Xcompiler="-Wall" -Xptxas="-v" -o matmul2 matmul2.cu
ptxas info : Compiling entry function '_Z9matrixMulPfS_S_iim' for 'sm_12'
ptxas info : Used 16 registers, 32+16 bytes smem, 4 bytes cmem[1]
Let's first try a case where the matrices are smaller than blocksize:
$ cuda-memcheck ./matmul2 4
========= CUDA-MEMCHECK
GPUassert: invalid configuration argument matmul2.cu 101
========= ERROR SUMMARY: 0 errors
Here the kernel launch failed with an invalid configuration argument error. Why? Because of this:
dim3 block=dim3(blocksize,blocksize), grid = dim3(m/block.x,m/block.y);
which results in a grid dimension of 0 when m, n < blocksize, and a grid with a zero dimension is an invalid launch configuration.
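Just to make the arithmetic explicit, here is a tiny host-side snippet (purely illustrative, it is not part of the repro code) showing what that integer division produces for m = 4, alongside the rounded-up division a general-size launch configuration would normally use instead. Note that rounding the grid up on its own would not fix anything, because the kernel itself would then also need bounds checks:

// Illustrative only: what the launch configuration computes for m = 4
#include <cstdio>

int main()
{
    const int m = 4, blocksize = 16;
    int asPosted  = m / blocksize;                   // 0 -> dim3(0,0), an invalid grid
    int roundedUp = (m + blocksize - 1) / blocksize; // 1 -> the usual round-up a general
                                                     //      version would need
    printf("m/blocksize = %d, rounded up = %d\n", asPosted, roundedUp);
    return 0;
}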
Next let's try the smallest round multiple of blocksize, in this case 16:
$ cuda-memcheck ./matmul2 16
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
which runs without error or assert failure. Now let's increase the size to 17:
$ cuda-memcheck ./matmul2 17
========= CUDA-MEMCHECK
GPUassert: unspecified launch failure matmul2.cu 103
========= Invalid __global__ read of size 4
========= at 0x000001f8 in matrixMul
========= by thread (0,2,0) in block (0,0)
========= Address 0x001009c8 is out of bounds
=========
========= ERROR SUMMARY: 1 error
and we get an out-of-bounds memory access detected and a launch failure error, which is expected (a sketch of where the bad read comes from follows below).
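With m = n = 17 and blocksize = 16 the grid is still a single 16x16 block, but the tile loop takes a second trip: the second tile of B starts at offset bStep = 16 * 17 = 272 in a 17 * 17 = 289 element matrix, so the read B[b + wB*ty + tx] lands past the end of the allocation for every thread with ty >= 1. The little host program below is purely illustrative (it is not part of the repro code); it just replays the kernel's B index arithmetic to show the first offending read:

// Illustrative only: replay the kernel's B index arithmetic for m = n = 17,
// blocksize = 16, on the second trip of the tile loop
#include <cstdio>

int main()
{
    const int w = 17, blocksize = 16, mn = w * w;   // 289 valid elements in B
    const int b = blocksize * w;                    // 272: start of the second B tile
    for (int ty = 0; ty < blocksize; ty++) {
        for (int tx = 0; tx < blocksize; tx++) {
            int idx = b + w * ty + tx;              // B[b + wB*ty + tx] in the kernel
            if (idx >= mn) {
                printf("thread (%d,%d) would read B[%d], but B has only %d elements\n",
                       tx, ty, idx, mn);
                return 0;                           // stop at the first offending read
            }
        }
    }
    return 0;
}

Which thread cuda-memcheck happens to flag first depends on execution order, but the conclusion is the same: the loop assumes a full 16x16 tile that simply isn't there.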
Now let's try 64, 96, and 128:
$ cuda-memcheck ./matmul2 64
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./matmul2 96
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./matmul2 128
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
and finally let's try 129:
$ cuda-memcheck ./matmul2 129
========= CUDA-MEMCHECK
GPUassert: unspecified launch failure matmul2.cu 103
========= Invalid __global__ read of size 4
========= at 0x000001f8 in matrixMul
========= by thread (0,1,0) in block (0,0)
========= Address 0x00120904 is out of bounds
=========
========= ERROR SUMMARY: 1 error
Even if you don't want to work through why the out-of-bounds errors occur, are you at least willing to accept that this code really does only work correctly for matrices whose dimensions are round multiples of blocksize?
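For reference, this is roughly the shape the shared memory tile loads and the final store would have to take before the kernel could accept arbitrary sizes. It is only a sketch of the usual guard-and-zero-pad approach, not a drop-in replacement for your code: the extra hA argument (the height of A) is something the posted kernel does not have, and the grid would have to be computed with the rounded-up division shown earlier.

// Sketch only: a bounds-checked variant of the tile loads and the final store,
// for matrix sizes which are not round multiples of block_size
__global__ void matrixMulAnySize(float* C, float* A, float* B,
                                 int hA, int wA, int wB, int block_size)
{
    extern __shared__ float smem[];
    int tx = threadIdx.x, ty = threadIdx.y;
    int row = blockIdx.y * block_size + ty;      // global row this thread computes
    int col = blockIdx.x * block_size + tx;      // global column this thread computes
    float Csub = 0.f;
    for (int t = 0; t < (wA + block_size - 1) / block_size; t++) {
        int aCol = t * block_size + tx;          // column of A loaded by this thread
        int bRow = t * block_size + ty;          // row of B loaded by this thread
        // Zero-pad the shared tiles instead of reading out of bounds
        smem[ty*block_size + tx] =
            (row < hA && aCol < wA) ? A[row*wA + aCol] : 0.f;
        smem[block_size*block_size + ty*block_size + tx] =
            (bRow < wA && col < wB) ? B[bRow*wB + col] : 0.f;
        __syncthreads();
        for (int k = 0; k < block_size; ++k)
            Csub += smem[ty*block_size + k]
                  * smem[block_size*block_size + k*block_size + tx];
        __syncthreads();
    }
    if (row < hA && col < wB)                    // guard the final store as well
        C[row*wB + col] = Csub;
}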