A kernel is executed by many threads in parallel.
Threads are grouped into blocks.
Each thread has a thread ID within its block.
Thread blocks form an execution grid.
Each block has a block ID within the grid.
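Thread and block IDs are exposed inside a kernel through the built-in variables threadIdx and blockIdx, and both may be multi-dimensional. A minimal sketch of combining them into a global 2D index (the kernel name and row-major layout are illustrative, not from the original):

__global__ void scale2d(float *data, int width, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x; // global column index
    int y = blockIdx.y * blockDim.y + threadIdx.y; // global row index
    if (x < width && y < height)                   // guard partial blocks at the edges
        data[y * width + x] *= 2.0f;               // row-major addressing
}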
A single thread executes on a single CUDA core.
One kernel, or multiple concurrent kernels on sm_20 and later devices, execute on the device.
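Concurrent kernels are expressed through CUDA streams. A minimal sketch, assuming hypothetical kernels kernel_a and kernel_b and pre-allocated device buffers:

__global__ void kernel_a(float *p) { p[threadIdx.x] += 1.0f; }
__global__ void kernel_b(float *p) { p[threadIdx.x] *= 2.0f; }

void launch_concurrent(float *d_a, float *d_b)
{
    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);
    kernel_a<<<1, 256, 0, s1>>>(d_a); // the stream is the 4th launch parameter
    kernel_b<<<1, 256, 0, s2>>>(d_b); // may overlap with kernel_a on sm_20+
    cudaDeviceSynchronize();          // wait for both streams to finish
    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
}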
Each thread has its own registers.
Each thread has its own local memory.
Threads of one block share on-chip shared memory:
__shared__ float buffer[CTA_SIZE];
All threads of the grid can access global and constant memory.
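A minimal sketch tying these memory spaces together (the kernel name, the CTA_SIZE value of 256, and the constant symbol scale are assumptions for illustration):

#define CTA_SIZE 256

__constant__ float scale; // constant memory, readable by the whole grid

__global__ void memory_spaces(const float *in, float *out)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x; // scalar: held in a register
    float scratch[32];                               // per-thread array: may be placed in local memory
    __shared__ float buffer[CTA_SIZE];               // shared by all threads of one block

    buffer[threadIdx.x] = in[tid] * scale;           // read global and constant memory
    __syncthreads();                                 // block-wide barrier before reading peers' values
    scratch[0] = buffer[CTA_SIZE - 1 - threadIdx.x]; // read a value written by another thread
    out[tid] = scratch[0];                           // write result to global memory
}

On the host, scale would be set with cudaMemcpyToSymbol(scale, &value, sizeof(float)) before the launch.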
__global__ void kernel(float *in, float *out)
{
    // Global thread index across the whole grid
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    out[tid] = in[tid]; // each thread copies one element
}
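The launch configuration below computes the grid as size / 256, so size must be a multiple of the block size. A common variant (a sketch, not part of the original) passes the element count and guards the access, which works for any size:

__global__ void kernel_guarded(float *in, float *out, int size)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) // threads past the end of the array do nothing
        out[tid] = in[tid];
}

It would be launched with a rounded-up grid: kernel_guarded<<<(size + 255) / 256, 256>>>(in, out, size).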
void execute_kernel(const float* host_in, float* host_out, int size)
{
    float* device_in, * device_out;
    cudaMalloc((void**)&device_in, size * sizeof(float));
    cudaMalloc((void**)&device_out, size * sizeof(float));
    // 1. Upload data into device memory
    cudaMemcpy(device_in, host_in, size * sizeof(float), cudaMemcpyHostToDevice);
    // 2. Configure kernel launch (assumes size is a multiple of 256)
    dim3 block(256);
    dim3 grid(size / 256);
    // 3. Execute kernel
    kernel<<<grid, block>>>(device_in, device_out);
    // 4. Wait till completion (cudaThreadSynchronize() is deprecated)
    cudaDeviceSynchronize();
    // 5. Download results into host memory
    cudaMemcpy(host_out, device_out, size * sizeof(float), cudaMemcpyDeviceToHost);
    // 6. Free device memory
    cudaFree(device_in);
    cudaFree(device_out);
}
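A minimal host-side usage sketch (the array size is chosen as a multiple of the fixed block size of 256):

#include <cstdlib>

int main()
{
    const int N = 1024; // multiple of the block size (256)
    float *h_in  = (float*)malloc(N * sizeof(float));
    float *h_out = (float*)malloc(N * sizeof(float));
    for (int i = 0; i < N; ++i)
        h_in[i] = (float)i;

    execute_kernel(h_in, h_out, N); // copies h_in to h_out through the GPU

    free(h_in);
    free(h_out);
    return 0;
}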