#include <stdio.h>
#include <math.h>
#include <cuda.h>

// Define the function to integrate: f(x) = x^3
__device__ double f(double x) {
    return pow(x, 3); // pow(a,b) computes a^b
}

// CUDA kernel to compute the sum of function values at the interior points
__global__ void trapezoidal_kernel(double a, double b, double p, int n, double *sum) {
    // Compute the global thread ID
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Accumulate contributions from the interior points (idx = 1 .. n-1);
    // the end points are handled separately below with weight 0.5
    if (idx > 0 && idx < n) {
        double x = a + idx * p;
        atomicAdd(sum, f(x)); // Atomic add to safely update the shared sum
                              // (atomicAdd on double requires compute capability >= 6.0)
    }
    // Add the contributions from the end points (once, by thread 0)
    if (idx == 0) atomicAdd(sum, 0.5 * (f(a) + f(b)));
}
// Host function to compute the integral using the trapezoidal rule with CUDA
double trapezoidal_rule_cuda(double a, double b, int n) {
    double *d_sum, h_sum = 0.0;
    double p = (b - a) / n; // Width of each trapezoid
    // Allocate memory on the device for the sum and initialize it to zero
    cudaMalloc((void **)&d_sum, sizeof(double));
    cudaMemcpy(d_sum, &h_sum, sizeof(double), cudaMemcpyHostToDevice);
    // Define the number of threads and blocks
    int blockSize = 32;                              // Number of threads per block
    int numBlocks = (n + blockSize - 1) / blockSize; // Round up to cover all trapezoids
    // Launch the kernel
    trapezoidal_kernel<<<numBlocks, blockSize>>>(a, b, p, n, d_sum);
    // Copy the result back to the host (this implicitly synchronizes with the kernel)
    cudaMemcpy(&h_sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost);
    // Clean up
    cudaFree(d_sum);
    return h_sum * p;
}
int main() {
    double a = 0.0; // Lower limit of integration
    double b = 1.0; // Upper limit of integration
    int n = 1000;   // Number of trapezoids (higher n gives better accuracy)
    printf("This program performs numerical integration of f(x) = x^3 from a = %.2f to b = %.2f using %d trapezoids.\n", a, b, n);
    // Check that the number of trapezoids is valid
    if (n <= 0) {
        printf("Error: The number of trapezoids must be positive.\n");
        return -1;
    }
    // Perform numerical integration
    double result = trapezoidal_rule_cuda(a, b, n);
    printf("The integral of f(x) = x^3 from %.2f to %.2f is approximately: %.5f\n", a, b, result);
    return 0;
}
module trapezoidal_module
  use cudafor
contains

  ! Device function to define f(x) = x^3
  attributes(device) function f(x) result(res)
    real(8), value :: x
    real(8) :: res
    res = x ** 3
  end function

  ! CUDA kernel for the trapezoidal rule
  attributes(global) subroutine trapezoidal_kernel(a, b, p, n, sum)
    real(8), value :: a, b, p
    integer, value :: n
    real(8) :: sum
    integer :: idx
    real(8) :: x, old
    ! Compute the global thread ID (1-based in CUDA Fortran)
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x
    ! Add contributions from the interior points (idx = 1 .. n-1)
    if (idx < n) then
      x = a + idx * p
      old = atomicAdd(sum, f(x)) ! atomicAdd is a function returning the previous value
    end if
    ! Add the contributions from the end points (only by the first thread)
    if (idx == 1) old = atomicAdd(sum, 0.5d0 * (f(a) + f(b)))
  end subroutine

  ! Host function for the trapezoidal rule
  function trapezoidal_rule_cuda(a, b, n) result(integral)
    real(8), value :: a, b
    integer, value :: n
    real(8) :: integral
    real(8), device :: d_sum
    real(8) :: h_sum
    real(8) :: p
    integer :: blockSize, numBlocks, istat
    ! Initialize parameters
    h_sum = 0.0d0
    p = (b - a) / n
    ! Initialize the device-resident sum
    d_sum = 0.0d0
    ! Define grid and block dimensions
    blockSize = 32
    numBlocks = (n + blockSize - 1) / blockSize
    ! Launch the kernel
    call trapezoidal_kernel<<<numBlocks, blockSize>>>(a, b, p, n, d_sum)
    istat = cudaDeviceSynchronize()
    ! Copy the result back to the host
    h_sum = d_sum
    ! Finalize the sum
    integral = h_sum * p
  end function

end module
program trapezoidal_integration
  use trapezoidal_module
  implicit none
  real(8) :: a, b, result
  integer :: n
  ! Define integration limits and number of trapezoids
  a = 0.0
  b = 1.0
  n = 1000
  print *, "This program performs numerical integration of f(x) = x^3"
  print *, "from a = ", a, " to b = ", b, " using ", n, " trapezoids."
  ! Check for valid input
  if (n <= 0) then
    print *, "Error: The number of trapezoids must be positive."
    stop
  end if
  ! Perform numerical integration
  result = trapezoidal_rule_cuda(a, b, n)
  print *, "The integral of f(x) = x^3 from ", a, " to ", b, " is approximately: ", result
end program
cudaMalloc
C
It is a CUDA runtime API function used to allocate memory on the GPU device. It works similarly to malloc in standard C, but the allocated memory resides in the GPU’s global memory rather than the host’s memory. It is essential for preparing device buffers so that data can be transferred from the host (CPU) to the device (GPU) and computations can run on the GPU.
Syntax:
cudaError_t cudaMalloc(void **devPtr, size_t size);
- devPtr: A pointer to a pointer that will store the address of the allocated memory on the device.
- size: The number of bytes to allocate.
Example:
double *d_array;
size_t size = n * sizeof(double);
// Allocate memory on the GPU
cudaMalloc((void **)&d_array, size);
[!TIP]
Free the allocated memory on the device after use to avoid memory leaks!
cudaFree(d_array);
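Since cudaMalloc returns a cudaError_t, it is good practice to check the result before using the pointer. A minimal sketch of the check (the exit-on-failure handling here is just one common convention):
cudaError_t err = cudaMalloc((void **)&d_array, size);
if (err != cudaSuccess) {
    // cudaGetErrorString converts the error code to a readable message
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE); // requires <stdlib.h>
}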
Fortran
In CUDA Fortran, memory allocation on the device is typically managed through Fortran’s native syntax rather than using cudaMalloc explicitly. However, it’s essential to understand how memory is allocated in CUDA Fortran and how it compares to the use of cudaMalloc in CUDA C.
In CUDA Fortran, arrays or variables intended to reside on the GPU are declared with the device attribute. The memory allocation and deallocation are handled via Fortran’s standard allocate and deallocate statements.
Example of Device Allocation in CUDA Fortran
real(8), device, allocatable :: d_array(:)
integer :: n
n = 1000
! Allocate memory on the device
allocate(d_array(n))
! Deallocate memory when done
deallocate(d_array)
Here, the device attribute ensures that d_array resides in the GPU memory, and the allocate statement allocates memory on the device. This approach is more straightforward than using cudaMalloc.
Comparison with cudaMalloc in CUDA C
In CUDA C, cudaMalloc is explicitly used to allocate memory on the GPU:
double *d_array;
cudaMalloc((void **)&d_array, n * sizeof(double));
This is equivalent to the allocate statement in CUDA Fortran but requires explicit management with cudaMalloc and cudaFree.
Attributes in Fortran
In CUDA Fortran, attributes are used to define the memory location or scope of variables, specifying whether they reside on the host, device, or in special GPU memory spaces. These attributes allow developers to manage memory effectively and optimize performance when working with GPUs.
Here are the most commonly used attributes in CUDA Fortran:
device
- Marks variables that reside in the global memory of the GPU.
- These variables are accessible from device (GPU) code but not directly from the host (CPU) unless transferred explicitly.
- Example:
real(8), device, allocatable :: d_array(:)
allocate(d_array(1000)) ! Allocates memory in GPU global memory
constant
- Marks variables stored in the constant memory of the GPU.
- Constant memory is optimized for read-only data that is frequently accessed by many threads.
- Example:
real(8), constant :: c_array(10) = (/ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 /)
managed
- Marks variables allocated in unified memory, which is accessible by both the host and the device without explicit memory transfers.
- This allows variables to be accessed by both host and device without explicit memory allocation or data transfer.
- Example:
real(8), managed :: m_array(100)
[!TIP]
Best Practices
- Use device for arrays or variables requiring large storage and accessed frequently in kernels.
- Use constant for small read-only data shared across threads for better caching performance.
- Use managed for ease of development when performance is less critical or during prototyping.
cudaMemcpy
It is used to transfer data between the host (CPU) and the device (GPU). It can also transfer data between two devices if needed. Proper memory management using cudaMemcpy ensures that computations on the GPU use the correct data.
C
Syntax:
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind);
- dst: Destination pointer.
- src: Source pointer.
- count: Number of bytes to copy.
- kind: Direction of the transfer, specified as:
  - cudaMemcpyHostToDevice: Host to device.
  - cudaMemcpyDeviceToHost: Device to host.
  - cudaMemcpyDeviceToDevice: Device to device.
Example:
double *h_array = (double *)malloc(size); // Allocate host memory
double *d_array;
cudaMalloc((void **)&d_array, size);
// Copy data from host to device
cudaMemcpy(d_array, h_array, size, cudaMemcpyHostToDevice);
// Copy data from device to host
cudaMemcpy(h_array, d_array, size, cudaMemcpyDeviceToHost);
[!TIP]
- Always ensure the source and destination pointers point to valid memory regions.
- Minimize the number of memory transfers between host and device, as they can be a performance bottleneck.
- Use cudaMemcpyAsync for non-blocking transfers if overlapping computation with communication is necessary (see the sketch below).
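A minimal sketch of an asynchronous transfer on a user-created stream, assuming d_array, h_array, size, numBlocks, and blockSize from the examples above and a placeholder kernel myKernel. Note that for the copy to truly overlap with computation, the host buffer must be pinned (e.g., allocated with cudaMallocHost):
cudaStream_t stream;
cudaStreamCreate(&stream);
// Enqueue the copy and the kernel on the same stream; both calls return immediately
cudaMemcpyAsync(d_array, h_array, size, cudaMemcpyHostToDevice, stream);
myKernel<<<numBlocks, blockSize, 0, stream>>>(d_array, n);
// Block the host until all work on the stream has finished
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);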
Fortran
In CUDA Fortran, cudaMemcpy is a function that returns an integer error code:
istat = cudaMemcpy(destination, source, count, direction)
- destination: The memory location to which data will be copied.
- source: The memory location from which data will be copied.
- count: The amount of data to copy (in elements when using the typed interface).
- direction: The direction of data transfer, specified by one of the following constants:
  - cudaMemcpyHostToDevice – Copy data from the host (CPU) to the device (GPU).
  - cudaMemcpyDeviceToHost – Copy data from the device (GPU) to the host (CPU).
  - cudaMemcpyDeviceToDevice – Copy data between two device memory locations.
Implicit data transfer
However, in CUDA Fortran, it is possible to transfer data between the host and device without explicitly using cudaMemcpy. This is achieved by using the Fortran array assignment or automatic data movement features provided by CUDA Fortran. These features simplify memory management and data transfers, making the code more concise and easier to write.
When an array is declared with the device attribute in CUDA Fortran, you can directly assign values between host and device arrays. The CUDA Fortran runtime handles the memory transfers behind the scenes.
Examples of implicit data transfers
- Host to Device Transfer
program implicit_host_to_device
  use cudafor
  implicit none
  real(8), device :: d_array(100) ! Device array
  real(8) :: h_array(100)         ! Host array
  integer :: i
  ! Initialize the host array
  h_array = [(real(i, 8), i = 1, 100)]
  ! Assign the host array to the device array (host-to-device transfer)
  d_array = h_array
  print *, "Data transferred to device."
end program implicit_host_to_device
- Device to Host Transfer
program implicit_device_to_host
  use cudafor
  implicit none
  real(8), device :: d_array(100) ! Device array
  real(8) :: h_array(100)         ! Host array
  integer :: i
  ! Initialize the device array (the array constructor is built on the host
  ! and transferred to the device by the assignment)
  d_array = [(real(i, 8), i = 1, 100)]
  ! Assign the device array to the host array (device-to-host transfer)
  h_array = d_array
  print *, "Data transferred back to host:", h_array(1:10)
end program implicit_device_to_host
[!NOTE]
- The runtime automatically determines when to perform host-to-device or device-to-host transfers based on array assignments. This eliminates the need for explicit cudaMemcpy calls.
- The Fortran-style assignment syntax is more intuitive and concise than explicitly managing memory transfers.
- While convenient, implicit data transfers may introduce hidden performance costs, especially if used frequently in a loop. Explicit cudaMemcpy might be better for performance-critical code, as it provides finer control.
numBlocks and blockSize
CUDA uses a grid and block hierarchy to execute kernels in parallel.
- blockSize: The number of threads per block.
- numBlocks: The number of blocks in the grid.
This configuration determines how the workload is distributed across the GPU.
Choosing blockSize and numBlocks:
- The total number of threads is numBlocks * blockSize.
- blockSize must not exceed the maximum threads per block (1024 on current GPUs; 512 on some older ones).
- numBlocks is often calculated as:
  int numBlocks = (n + blockSize - 1) / blockSize; // Ceiling division
Example:
int n = 1000; // Total number of elements
int blockSize = 256; // Threads per block
int numBlocks = (n + blockSize - 1) / blockSize; // Number of blocks
[!TIP]
- A good choice of blockSize depends on the GPU hardware and the problem size. Common choices are 128, 256, or 512; the occupancy API sketched below can also suggest one.
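A sketch of letting the runtime suggest a block size via the occupancy API (myKernel is a placeholder for your own kernel; n is the workload size):
int minGridSize, blockSize;
// Ask the runtime for a block size that maximizes occupancy for myKernel
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, myKernel, 0, 0);
int numBlocks = (n + blockSize - 1) / blockSize; // Ceiling division as before
myKernel<<<numBlocks, blockSize>>>(/* arguments */);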
Multi-Dimensional Grids and Blocks
CUDA supports multi-dimensional grids and blocks, which are particularly useful for problems involving 2D or 3D data (e.g., images or volumes).
- Threads in a Block: Each block can be 1D, 2D, or 3D.
  - Defined by dim3 blockDim.
  - Total threads per block: blockDim.x * blockDim.y * blockDim.z.
- Blocks in a Grid: Grids can also be 1D, 2D, or 3D.
  - Defined by dim3 gridDim.
  - Total blocks: gridDim.x * gridDim.y * gridDim.z.
Example
dim3 blockSize(8, 8, 8); // Define a block with 8x8x8 threads (512 threads per block)
dim3 gridSize((Nx + blockSize.x - 1) / blockSize.x,
              (Ny + blockSize.y - 1) / blockSize.y,
              (Nz + blockSize.z - 1) / blockSize.z); // Define a grid to cover the 3D array
// Launch the kernel
myKernel<<<gridSize, blockSize>>>(deviceArray, Nx, Ny, Nz);
In this code:
- blockSize specifies the dimensions of each block (3D configuration).
- gridSize ensures enough blocks to cover the entire 3D problem size.
- The kernel myKernel is invoked with these grid and block configurations; a possible body is sketched below.
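Inside such a kernel, each thread computes its own 3D coordinates from the block and thread indices. A sketch of what myKernel might look like (deviceArray, Nx, Ny, Nz as above; the row-major linearization is one common layout choice):
__global__ void myKernel(double *deviceArray, int Nx, int Ny, int Nz) {
    // One (i, j, k) coordinate per thread
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int k = blockIdx.z * blockDim.z + threadIdx.z;
    // Guard against threads outside the problem domain
    if (i < Nx && j < Ny && k < Nz) {
        int idx = (k * Ny + j) * Nx + i; // Linearized 3D index
        deviceArray[idx] = 0.0;          // Example: initialize the array
    }
}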
In Fortran:
program cuda_3d_example
  use cudafor
  implicit none
  integer, parameter :: Nx = 64, Ny = 64, Nz = 64
  type(dim3) :: blockSize, gridSize
  ! Define block dimensions
  blockSize = dim3(8, 8, 8) ! Block of 8x8x8 threads
  ! Define grid dimensions (rounding up to cover all data)
  gridSize = dim3((Nx + blockSize%x - 1) / blockSize%x, &
                  (Ny + blockSize%y - 1) / blockSize%y, &
                  (Nz + blockSize%z - 1) / blockSize%z)
  ! Launch the kernel (myKernel must be defined in a module with attributes(global))
  call myKernel<<<gridSize, blockSize>>>(Nx, Ny, Nz)
end program cuda_3d_example
CUDA Kernels
CUDA kernels are functions executed on the GPU. Key features of kernels:
- Parallel Execution:
  - Each thread executes the kernel independently and concurrently.
- Special Attributes:
  - Kernels must be declared with __global__ in CUDA C and with attributes(global) in CUDA Fortran.
  - They can only return void (i.e., they do not have a return value).
- Thread and Block Indices:
  - Threads and blocks are identified using special variables:
    - threadIdx: Index of a thread within its block.
    - blockIdx: Index of the block within the grid.
    - blockDim: Number of threads in a block.
    - gridDim: Number of blocks in the grid.
  - These indices are used to compute the unique thread ID for assigning work.
C
Kernels appear like normal functions but are defined with the __global__ qualifier. They are invoked from the host code. A kernel launch specifies the grid and block dimensions.
Syntax:
kernelName<<<numBlocks, blockSize>>>(arguments);
- numBlocks: Number of blocks in the grid.
- blockSize: Number of threads per block.
Example Kernel:
__global__ void addArrays(double *a, double *b, double *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x; // Compute global thread index
    if (idx < n) {
        c[idx] = a[idx] + b[idx]; // Perform addition for this thread
    }
}
Kernel Invocation:
int n = 1000;        // Number of elements
int blockSize = 256; // Threads per block
int numBlocks = (n + blockSize - 1) / blockSize;
// Allocate memory and copy data as needed
addArrays<<<numBlocks, blockSize>>>(d_a, d_b, d_c, n);
[!TIP]
- Ensure the number of threads covers the entire workload (n in the above example).
- Synchronize if necessary to ensure all threads finish before proceeding: cudaDeviceSynchronize();
Fortran
Kernels are written with the attributes(global) specifier to indicate that they are callable from the host but execute on the device.
Example: A Simple Kernel in CUDA Fortran
Here’s an example of a kernel that adds two arrays element-wise:
module kernel_module
  use cudafor
contains
  attributes(global) subroutine add_arrays_kernel(a, b, c, n)
    real, device :: a(:), b(:), c(:)
    integer, value :: n
    integer :: idx
    ! Compute the unique thread index
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x
    ! Ensure we stay within array bounds
    if (idx <= n) then
      c(idx) = a(idx) + b(idx)
    end if
  end subroutine add_arrays_kernel
end module kernel_module
Steps to Launch a Kernel
- Declare Arrays on the Device:
real, device :: d_a(1000), d_b(1000), d_c(1000)
real :: h_a(1000), h_b(1000), h_c(1000)
- Transfer Data from Host to Device:
d_a = h_a
d_b = h_b
- Define Grid and Block Dimensions:
type(dim3) :: gridSize, blockSize
blockSize = dim3(256, 1, 1) ! 256 threads per block
gridSize = dim3((1000 + 255) / 256, 1, 1) ! Enough blocks to cover all elements
- Launch the Kernel:
use kernel_module
call add_arrays_kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, 1000)
- Copy Results Back to Host:
h_c = d_c
[!NOTE]
- Memory: Kernel arguments can include device arrays or scalars (passed by value!).
- Scope: Kernels can only access device memory and cannot directly access host memory.
- Concurrency: Each thread runs the same kernel code but works on different data, making it highly parallel.
Device functions
In CUDA programming, __device__ in C or attributes(device) in Fortran is used to declare functions that are executed on the GPU and can only be called by other GPU functions. These are referred to as device functions. Unlike kernels (which are marked with __global__ or attributes(global)), device functions cannot be launched directly from the host but are called within kernels or other device functions.
Key features:
- GPU-Only Execution:
- Device functions are compiled for GPU execution only.
- They are not accessible from the host.
- Calling Rules:
- Can be called from other device functions or kernels.
- Cannot be called directly from host functions.
- Inlined by Default:
- Device functions are often inlined by the compiler to reduce the overhead of function calls.
- Purpose:
- They help modularize GPU code by allowing reusable computations within kernels.
C
In CUDA C, __device__ is the keyword to define a device function.
Example:
#include <stdio.h>
#include <math.h>

// Device function to compute the cube of a number
__device__ double cube(double x) {
    return x * x * x;
}

// Kernel to use the device function
__global__ void compute_cubes(const double *input, double *output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        output[idx] = cube(input[idx]); // Calling the device function
    }
}

int main() {
    // Example host code would allocate arrays, transfer them to the device,
    // launch compute_cubes, and copy results back to the host.
    return 0;
}
Fortran
In CUDA Fortran, attributes(device) is used to define device functions.
Example:
module device_functions
  use cudafor
contains
  ! Device function to compute the cube of a number
  attributes(device) function cube(x) result(res)
    real, value :: x
    real :: res
    res = x ** 3
  end function cube

  ! Kernel to use the device function
  attributes(global) subroutine compute_cubes(input, output, n)
    real, device :: input(:), output(:)
    integer, value :: n
    integer :: idx
    ! Compute unique thread index
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x
    ! Ensure within bounds
    if (idx <= n) then
      output(idx) = cube(input(idx)) ! Calling the device function
    end if
  end subroutine compute_cubes
end module device_functions
Atomic Functions in CUDA
Atomic functions are special operations that allow safe updates to shared memory by ensuring that only one thread modifies a memory location at a time. They are essential in parallel programming to prevent race conditions when multiple threads access and modify shared data simultaneously.
General Considerations
- Atomicity: An operation is atomic if it is executed as a single, indivisible step. This guarantees that no other thread can interfere during the operation.
- Common Use Cases: incrementing counters, accumulating sums, and updating shared data structures such as histograms.
- Performance Impact: Atomic operations serialize access to shared data, which can reduce performance due to contention. However, they are often faster and easier to implement than more complex synchronization methods.
- Supported Operations: CUDA supports atomic addition, subtraction, exchange, min/max, and logical operations like AND/OR; a short sketch follows this list. These operations are hardware-dependent and optimized for GPU execution.
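Beyond atomicAdd, the other operations follow the same pattern: each takes a pointer to the shared location and a value, and returns the value previously stored there. A small illustrative sketch (the kernel name and arguments are hypothetical):
__global__ void atomic_ops_demo(const int *data, int *result, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        atomicMax(&result[0], data[idx]); // Running maximum over all elements
        atomicMin(&result[1], data[idx]); // Running minimum
        atomicOr(&result[2], data[idx]);  // Bitwise OR across all elements
    }
}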
C
In CUDA C, atomic functions are prefixed with atomic. For example, atomicAdd performs an atomic addition. These functions operate on integers and floating-point values in global or shared memory (atomicAdd on double requires compute capability 6.0 or higher).
Example: Atomic Addition (C)
#include <stdio.h>
__global__ void atomic_example(int *counter) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < 10) {
        atomicAdd(counter, 1); // Safely increment the counter
    }
}

int main() {
    int *d_counter, h_counter = 0;
    // Allocate memory and initialize
    cudaMalloc((void**)&d_counter, sizeof(int));
    cudaMemcpy(d_counter, &h_counter, sizeof(int), cudaMemcpyHostToDevice);
    // Launch kernel
    atomic_example<<<1, 32>>>(d_counter);
    // Copy back and print result
    cudaMemcpy(&h_counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Counter = %d\n", h_counter); // Should print 10
    cudaFree(d_counter);
    return 0;
}
Fortran
In CUDA Fortran, atomic operations are similarly supported with the atomic subroutines provided by the language. These subroutines operate on variables in device memory.
Example: Atomic Addition (Fortran)
module atomic_example
  use cudafor
contains
  attributes(global) subroutine increment_counter(counter)
    integer, device :: counter
    integer :: idx, old
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x
    ! atomicadd is a function that returns the previous value
    if (idx <= 10) old = atomicadd(counter, 1) ! Safely increment the counter
  end subroutine increment_counter
end module atomic_example

program main
  use atomic_example
  use cudafor
  implicit none
  integer, device :: d_counter
  integer :: h_counter
  ! Initialize the counter (host-to-device assignment)
  h_counter = 0
  d_counter = h_counter
  ! Launch kernel
  call increment_counter<<<1, 32>>>(d_counter)
  ! Copy back and print result
  h_counter = d_counter
  print *, "Counter =", h_counter ! Should print 10
end program main
Compiling CUDA Programs with nvcc and nvfortran
To compile CUDA programs, you use either nvcc for CUDA C/C++ programs or nvfortran for CUDA Fortran programs. Below, we explain how to use these compilers, including basic commands and options.
Compiling CUDA C/C++ with nvcc
nvcc is the NVIDIA CUDA Compiler for compiling C or C++ programs with CUDA kernels.
- Basic Command:
nvcc -o program_name source_file.cu
- source_file.cu: The CUDA source file.
- program_name: The name of the output binary.
- Common Options:
- -arch=sm_xx: Specify the GPU architecture (e.g., sm_70 for Volta, sm_80 for Ampere).
nvcc -arch=sm_80 -o program_name source_file.cu
- -O2, -O3: Enable optimization levels.
- -g: Include debugging information.
Compiling CUDA Fortran with nvfortran
nvfortran is part of the NVIDIA HPC SDK and is used to compile Fortran programs, including those with CUDA Fortran extensions.
- Basic Command:
nvfortran -o program_name source_file.f90
- source_file.f90: The CUDA Fortran source file.
- program_name: The name of the output binary.
- Common Options:
- -cuda: Enable CUDA Fortran features (this is implicit if the code includes CUDA Fortran constructs).
- -Mcuda=ccXX: Specify the compute capability of the target GPU (e.g., -Mcuda=cc80 for Ampere GPUs).
- -O2, -O3: Enable optimization levels.
- -g: Include debugging information.
[!TIP] To query device properties, like the compute capability, memory, maximum number of threads per block, etc., we can use the function
cudaGetDeviceProperties (see example code here and the sketch below)
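A minimal sketch of such a query for device 0; the fields printed are a small subset of the cudaDeviceProp structure:
#include <stdio.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // Query properties of device 0
    printf("Device name: %s\n", prop.name);
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    printf("Global memory: %zu bytes\n", prop.totalGlobalMem);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    return 0;
}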