Cuda Toolkit -

// Allocate host memory float *h_a = new float[n]; float *h_b = new float[n]; float *h_c = new float[n];

run: $(TARGET) ./$(TARGET)

// Cleanup cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); delete[] h_a; delete[] h_b; delete[] h_c; cuda toolkit

// Allocate device memory float *d_a, *d_b, *d_c; cudaMalloc(&d_a, bytes); cudaMalloc(&d_b, bytes); cudaMalloc(&d_c, bytes); // Allocate host memory float *h_a = new

clean: rm -f $(TARGET)

// Copy result back to host cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost); float *h_b = new float[n]

.PHONY: all run clean | Operation | Function | |-----------|----------| | Allocate GPU memory | cudaMalloc(&ptr, size) | | Free GPU memory | cudaFree(ptr) | | Copy to GPU | cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice) | | Copy to CPU | cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost) | | Get GPU count | cudaGetDeviceCount(&count) | | Set active GPU | cudaSetDevice(device_id) | | Synchronize | cudaDeviceSynchronize() | | Error checking | cudaGetLastError() | Installation Check # Check CUDA version nvcc --version Check GPU driver & CUDA capability nvidia-smi Check available GPUs nvidia-smi -L This gives you a working starting point. Need a specific CUDA library example (cuBLAS for matrix multiplication, cuFFT for FFTs, or multi-GPU programming)?