I'm expanding njuffa's comment into a worked example. In that example, I'm simply adding two arrays in three different ways: loading the data as float
, float2
or float4
.
These are the timings on a GT540M and on a Kepler K20c card:
GT540M
float - Elapsed time: 74.1 ms
float2 - Elapsed time: 61.0 ms
float4 - Elapsed time: 56.1 ms
Kepler K20c
float - Elapsed time: 4.4 ms
float2 - Elapsed time: 3.3 ms
float4 - Elapsed time: 3.2 ms
As it can be seen, loading the data as float4
is the fastest approach.
Below are the disassembled codes for the three kernels (compilation for compute capability 2.1
).
add_float
Function : _Z9add_floatPfS_S_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0010*/ SHL R2, R2, 0x2; /* 0x6000c00008209c03 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHL R0, R0, 0x2; /* 0x6000c00008001c03 */
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc03 */
/*0038*/ @P0 BRA.U 0xd8; /* 0x40000002600081e7 */
/*0040*/ @!P0 ISCADD R2, R0, c[0x0][0x24], 0x2; /* 0x400040009000a043 */
/*0048*/ @!P0 ISCADD R10, R0, c[0x0][0x20], 0x2; /* 0x400040008002a043 */
/*0050*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x2; /* 0x40004000a0002043 */
/*0058*/ @!P0 LD R8, [R2]; /* 0x8000000000222085 */
/*0060*/ @!P0 LD R6, [R2+0x4]; /* 0x800000001021a085 */
/*0068*/ @!P0 LD R4, [R2+0x8]; /* 0x8000000020212085 */
/*0070*/ @!P0 LD R9, [R10]; /* 0x8000000000a26085 */
/*0078*/ @!P0 LD R7, [R10+0x4]; /* 0x8000000010a1e085 */
/*0080*/ @!P0 LD R5, [R10+0x8]; /* 0x8000000020a16085 */
/*0088*/ @!P0 LD R3, [R10+0xc]; /* 0x8000000030a0e085 */
/*0090*/ @!P0 LD R2, [R2+0xc]; /* 0x800000003020a085 */
/*0098*/ @!P0 FADD R8, R9, R8; /* 0x5000000020922000 */
/*00a0*/ @!P0 FADD R6, R7, R6; /* 0x500000001871a000 */
/*00a8*/ @!P0 FADD R4, R5, R4; /* 0x5000000010512000 */
/*00b0*/ @!P0 ST [R0], R8; /* 0x9000000000022085 */
/*00b8*/ @!P0 FADD R2, R3, R2; /* 0x500000000830a000 */
/*00c0*/ @!P0 ST [R0+0x4], R6; /* 0x900000001001a085 */
/*00c8*/ @!P0 ST [R0+0x8], R4; /* 0x9000000020012085 */
/*00d0*/ @!P0 ST [R0+0xc], R2; /* 0x900000003000a085 */
/*00d8*/ EXIT; /* 0x8000000000001de7 */
add_float2
Function : _Z10add_float2P6float2S0_S0_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0010*/ SHL R2, R2, 0x1; /* 0x6000c00004209c03 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHL R0, R0, 0x1; /* 0x6000c00004001c03 */
/*0028*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0030*/ ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc03 */
/*0038*/ @P0 BRA.U 0xa8; /* 0x40000001a00081e7 */
/*0040*/ @!P0 ISCADD R10, R0, c[0x0][0x20], 0x3; /* 0x400040008002a063 */
/*0048*/ @!P0 ISCADD R11, R0, c[0x0][0x24], 0x3; /* 0x400040009002e063 */
/*0050*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x3; /* 0x40004000a0002063 */
/*0058*/ @!P0 LD.64 R4, [R10]; /* 0x8000000000a120a5 */
/*0060*/ @!P0 LD.64 R8, [R11]; /* 0x8000000000b220a5 */
/*0068*/ @!P0 LD.64 R2, [R10+0x8]; /* 0x8000000020a0a0a5 */
/*0070*/ @!P0 LD.64 R6, [R11+0x8]; /* 0x8000000020b1a0a5 */
/*0078*/ @!P0 FADD R9, R5, R9; /* 0x5000000024526000 */
/*0080*/ @!P0 FADD R8, R4, R8; /* 0x5000000020422000 */
/*0088*/ @!P0 FADD R3, R3, R7; /* 0x500000001c30e000 */
/*0090*/ @!P0 FADD R2, R2, R6; /* 0x500000001820a000 */
/*0098*/ @!P0 ST.64 [R0], R8; /* 0x90000000000220a5 */
/*00a0*/ @!P0 ST.64 [R0+0x8], R2; /* 0x900000002000a0a5 */
/*00a8*/ EXIT; /* 0x8000000000001de7 */
add_float4
Function : _Z10add_float4P6float4S0_S0_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ NOP; /* 0x4000000000001de4 */
/*0010*/ MOV R3, c[0x0][0x2c]; /* 0x28004000b000dde4 */
/*0018*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0020*/ SHR.U32 R3, R3, 0x2; /* 0x5800c0000830dc03 */
/*0028*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0030*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0038*/ ISETP.GE.U32.AND P0, PT, R0, R3, PT; /* 0x1b0e00000c01dc03 */
/*0040*/ @P0 BRA.U 0x98; /* 0x40000001400081e7 */
/*0048*/ @!P0 ISCADD R2, R0, c[0x0][0x20], 0x4; /* 0x400040008000a083 */
/*0050*/ @!P0 ISCADD R3, R0, c[0x0][0x24], 0x4; /* 0x400040009000e083 */
/*0058*/ @!P0 ISCADD R0, R0, c[0x0][0x28], 0x4; /* 0x40004000a0002083 */
/*0060*/ @!P0 LD.128 R8, [R2]; /* 0x80000000002220c5 */
/*0068*/ @!P0 LD.128 R4, [R3]; /* 0x80000000003120c5 */
/*0070*/ @!P0 FADD R7, R11, R7; /* 0x500000001cb1e000 */
/*0078*/ @!P0 FADD R6, R10, R6; /* 0x5000000018a1a000 */
/*0080*/ @!P0 FADD R5, R9, R5; /* 0x5000000014916000 */
/*0088*/ @!P0 FADD R4, R8, R4; /* 0x5000000010812000 */
/*0090*/ @!P0 ST.128 [R0], R4; /* 0x90000000000120c5 */
/*0098*/ EXIT; /* 0x8000000000001de7 */
As it can be seen and as mentioned by njuffa, different load instructions are used for the three cases: LD
, LD.64
and LD.128
, respectively.
Finally, the code:
#include <thrust/device_vector.h>
#define BLOCKSIZE 256
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d
", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********************/
/* ADD_FLOAT KERNEL */
/********************/
__global__ void add_float(float *d_a, float *d_b, float *d_c, unsigned int N) {
const int tid = 4 * threadIdx.x + blockIdx.x * (4 * blockDim.x);
if (tid < N) {
float a1 = d_a[tid];
float b1 = d_b[tid];
float a2 = d_a[tid+1];
float b2 = d_b[tid+1];
float a3 = d_a[tid+2];
float b3 = d_b[tid+2];
float a4 = d_a[tid+3];
float b4 = d_b[tid+3];
float c1 = a1 + b1;
float c2 = a2 + b2;
float c3 = a3 + b3;
float c4 = a4 + b4;
d_c[tid] = c1;
d_c[tid+1] = c2;
d_c[tid+2] = c3;
d_c[tid+3] = c4;
//if ((tid < 1800) && (tid > 1790)) {
//printf("%i %i %i %f %f %f
", tid, threadIdx.x, blockIdx.x, a1, b1, c1);
//printf("%i %i %i %f %f %f
", tid+1, threadIdx.x, blockIdx.x, a2, b2, c2);
//printf("%i %i %i %f %f %f
", tid+2, threadIdx.x, blockIdx.x, a3, b3, c3);
//printf("%i %i %i %f %f %f
", tid+3, threadIdx.x, blockIdx.x, a4, b4, c4);
//}
}
}
/*********************/
/* ADD_FLOAT2 KERNEL */
/*********************/
__global__ void add_float2(float2 *d_a, float2 *d_b, float2 *d_c, unsigned int N) {
const int tid = 2 * threadIdx.x + blockIdx.x * (2 * blockDim.x);
if (tid < N) {
float2 a1 = d_a[tid];
float2 b1 = d_b[tid];
float2 a2 = d_a[tid+1];
float2 b2 = d_b[tid+1];
float2 c1;
c1.x = a1.x + b1.x;
c1.y = a1.y + b1.y;
float2 c2;
c2.x = a2.x + b2.x;
c2.y = a2.y + b2.y;
d_c[tid] = c1;
d_c[tid+1] = c2;
}
}
/*********************/
/* ADD_FLOAT4 KERNEL */
/*********************/
__global__ void add_float4(float4 *d_a, float4 *d_b, float4 *d_c, unsigned int N) {
const int tid = 1 * threadIdx.x + blockIdx.x * (1 * blockDim.x);
if (tid < N/4) {
float4 a1 = d_a[tid];
float4 b1 = d_b[tid];
float4 c1;
c1.x = a1.x + b1.x;
c1.y = a1.y + b1.y;
c1.z = a1.z + b1.z;
c1.w = a1.w + b1.w;
d_c[tid] = c1;
}
}
/********/
/* MAIN */
/********/
int main() {
const int N = 4*10000000;
const float a = 3.f;
const float b = 5.f;
// --- float
thrust::device_vector<float> d_A(N, a);
thrust::device_vector<float> d_B(N, b);
thrust::device_vector<float> d_C(N);
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
add_float<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_A.data()), thrust::raw_pointer_cast(d_B.data()), thrust::raw_pointer_cast(d_C.data()), N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.1f ms
", time); gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<float> h_float = d_C;
for (int i=0; i<N; i++) {
if (h_float[i] != (a+b)) {
printf("Error for add_float at %i: result is %f
",i, h_float[i]);
return -1;
}
}
// --- float2
thrust::device_vector<float> d_A2(N, a);
thrust::device_vector<float> d_B2(N, b);
thrust::device_vector<float> d_C2(N);
cudaEventCreate(&stop);