#include <stdio.h>
#include <math.h>
#include <cuda.h>

// Define the function to integrate: f(x) = x^3
__device__ double f(double x) {
    return pow(x, 3); // pow(a,b) computes a^b
}

// CUDA kernel to compute the sum of function values at given points
__global__ void trapezoidal_kernel(double a, double b, double p, int n, double *sum) {

    // Compute the global thread ID
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // Accumulate contributions from the interior points (idx = 1 .. n-1)
    if (idx > 0 && idx < n) {
        double x = a + idx * p;
        atomicAdd(sum, f(x)); // Atomic add to safely update the shared sum
    }
    // Add the half-weighted contributions from the end points (thread 0 only)
    if (idx == 0) atomicAdd(sum, 0.5 * (f(a) + f(b)));
    
}

// Host function to compute the integral using the trapezoidal rule with CUDA
double trapezoidal_rule_cuda(double a, double b, int n) {
    double *d_sum, h_sum = 0.0;
    double p = (b - a) / n; // Width of each trapezoid

    // Allocate memory on the device for the sum
    cudaMalloc((void **)&d_sum, sizeof(double));
    cudaMemcpy(d_sum, &h_sum, sizeof(double), cudaMemcpyHostToDevice);

    // Define the number of threads and blocks
    int blockSize = 32; // Number of threads per block
    int numBlocks = (n + blockSize - 1) / blockSize; // Round up to cover all trapezoids

    // Launch the kernel
    trapezoidal_kernel<<<numBlocks, blockSize>>>(a, b, p, n, d_sum);

    // Copy the result back to the host and finalize the sum
    cudaMemcpy(&h_sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost);

    // Clean up
    cudaFree(d_sum);

    return h_sum * p;
}

int main() {
    double a = 0.0;  // Lower limit of integration
    double b = 1.0;  // Upper limit of integration
    int n = 1000;    // Number of trapezoids (higher n for better accuracy)

    printf("This program performs numerical integration of f(x) = x^3 from a = %.2f to b = %.2f using %d trapezoids.\n", a, b, n);

    // Check if n is a valid number of trapezoids
    if (n <= 0) {
        printf("Error: The number of trapezoids must be positive.\n");
        return -1;
    }

    // Perform numerical integration
    double result = trapezoidal_rule_cuda(a, b, n);

    printf("The integral of f(x) = x^3 from %.2f to %.2f is approximately: %.5f\n", a, b, result);

    return 0;
}
Fortran

module trapezoidal_module
  use cudafor
contains

  ! Device function to define f(x) = x^3
  attributes(device) function f(x) result(res)
    real(8), value :: x
    real(8) :: res
    res = x ** 3
  end function

  ! CUDA kernel for the trapezoidal rule
  attributes(global) subroutine trapezoidal_kernel(a, b, p, n, sum)
    real(8), value :: a, b, p
    integer, value :: n
    real(8)        :: sum
    integer        :: idx
    real(8)        :: x, old

    ! Compute global thread ID
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x

    ! Add contributions from each thread
    if (idx < n) then
      x = a + idx * p
      old = atomicAdd(sum, f(x))
    end if

    ! Add the contributions from the end points (only by thread 0)
    if (idx == 0) old = atomicAdd(sum, 0.5d0 * (f(a) + f(b)))
  end subroutine

  ! Host function for the trapezoidal rule
  function trapezoidal_rule_cuda(a, b, n) result(integral)
    real(8), value :: a, b
    integer, value :: n
    real(8) :: integral
    real(8), device :: d_sum
    real(8) :: h_sum
    real(8) :: p
    integer :: blockSize, numBlocks, iostat

    ! Initialize parameters
    h_sum = 0.0
    p = (b - a) / n

    ! Allocate device memory
    d_sum = 0.0

    ! Define grid and block dimensions
    blockSize = 32
    numBlocks = (n + blockSize - 1) / blockSize

    ! Launch the kernel
    call trapezoidal_kernel<<<numBlocks, blockSize>>>(a, b, p, n, d_sum)
    iostat = cudaDeviceSynchronize()

    ! Copy the result back to the host
    h_sum = d_sum

    ! Finalize the sum
    integral = h_sum * p

  end function

end module

program trapezoidal_integration
  use trapezoidal_module
  implicit none

  real(8) :: a, b, result
  integer :: n

  ! Define integration limits and number of trapezoids
  a = 0.0
  b = 1.0
  n = 1000

  print *, "This program performs numerical integration of f(x) = x^3"
  print *, "from a = ", a, " to b = ", b, " using ", n, " trapezoids."

  ! Check for valid input
  if (n <= 0) then
    print *, "Error: The number of trapezoids must be positive."
    stop
  end if

  ! Perform numerical integration
  result = trapezoidal_rule_cuda(a, b, n)

  print *, "The integral of f(x) = x^3 from ", a, " to ", b, " is approximately: ", result

end program

cudaMalloc

C

It is a CUDA runtime API function used to allocate memory on the GPU device. It works similarly to malloc in standard C, but the allocated memory resides in the GPU’s global memory rather than the host’s memory. It is essential for transferring data from the host (CPU) to the device (GPU) and enabling computations on the GPU.

Syntax:

cudaError_t cudaMalloc(void **devPtr, size_t size);
  • devPtr: A pointer to a pointer that will store the address of the allocated memory on the device.
  • size: The number of bytes to allocate.

Example:

double *d_array;
size_t size = n * sizeof(double);

// Allocate memory on the GPU
cudaMalloc((void **)&d_array, size);

[!TIP] Free the allocated memory on the device after use to avoid memory leaks!

cudaFree(d_array);


Fortran

In CUDA Fortran, memory allocation on the device is typically managed through Fortran’s native syntax rather than using cudaMalloc explicitly. However, it’s essential to understand how memory is allocated in CUDA Fortran and how it compares to the use of cudaMalloc in CUDA C.

In CUDA Fortran, arrays or variables intended to reside on the GPU are declared with the device attribute. The memory allocation and deallocation are handled via Fortran’s standard allocate and deallocate statements.

Example of Device Allocation in CUDA Fortran

real(8), device, allocatable :: d_array(:)
integer :: n

n = 1000

! Allocate memory on the device
allocate(d_array(n))

! Deallocate memory when done
deallocate(d_array)

Here, the device attribute ensures that d_array resides in the GPU memory, and the allocate statement allocates memory on the device. This approach is more straightforward than using cudaMalloc.

Comparison with cudaMalloc in CUDA C

In CUDA C, cudaMalloc is explicitly used to allocate memory on the GPU:

double *d_array;
cudaMalloc((void **)&d_array, n * sizeof(double));

This is equivalent to the allocate statement in CUDA Fortran but requires explicit management with cudaMalloc and cudaFree.

Attributes in Fortran

In CUDA Fortran, attributes are used to define the memory location or scope of variables, specifying whether they reside on the host, device, or in special GPU memory spaces. These attributes allow developers to manage memory effectively and optimize performance when working with GPUs.

Here are the most commonly used attributes in CUDA Fortran:

  1. device
  • Marks variables that reside in the global memory of the GPU.
  • These variables are accessible from device (GPU) code but not directly from the host (CPU) unless transferred explicitly.
  • Example:
real(8), device, allocatable :: d_array(:)
allocate(d_array(1000))  ! Allocates memory in GPU global memory
  2. constant
  • Marks variables stored in the constant memory of the GPU.
  • Constant memory is optimized for read-only data that is frequently accessed by many threads.
  • Example: real(8), constant :: c_array(10) = (/ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 /)
  3. managed
  • Marks variables allocated in unified (managed) memory, which is accessible by both the host and the device without explicit memory allocations or transfers.
  • Example: real(8), managed :: m_array(100)

[!TIP]

Best Practices

  • Use device for arrays or variables requiring large storage and accessed frequently in kernels.
  • Use constant for small read-only data shared across threads for better caching performance.
  • Use managed for ease of development when performance is less critical or during prototyping.

cudaMemcpy

It is used to transfer data between the host (CPU) and the device (GPU). It can also transfer data between two devices if needed. Proper memory management using cudaMemcpy ensures that computations on the GPU use the correct data.


C

Syntax:

cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind);
  • dst: Destination pointer.
  • src: Source pointer.
  • count: Number of bytes to copy.
  • kind: Direction of the transfer, specified as:
    • cudaMemcpyHostToDevice: Host to device.
    • cudaMemcpyDeviceToHost: Device to host.
    • cudaMemcpyDeviceToDevice: Device to device.

Example:

double *h_array = (double *)malloc(size); // Allocate host memory
double *d_array;
cudaMalloc((void **)&d_array, size);

// Copy data from host to device
cudaMemcpy(d_array, h_array, size, cudaMemcpyHostToDevice);

// Copy data from device to host
cudaMemcpy(h_array, d_array, size, cudaMemcpyDeviceToHost);

[!TIP]

  • Always ensure the source and destination pointers point to valid memory regions.
  • Minimize the number of memory transfers between host and device, as they can be a performance bottleneck.
  • Use cudaMemcpyAsync for non-blocking transfers when overlapping computation with communication is needed (see the sketch below).
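
For illustration, a minimal sketch of an asynchronous copy on a user-created stream (the array names, sizes, and the kernel myKernel are hypothetical; achieving real overlap also requires page-locked host memory, e.g. allocated with cudaMallocHost):

cudaStream_t stream;
cudaStreamCreate(&stream);

// Enqueue the copy without blocking the host; a kernel launched on the same
// stream runs after the copy completes, while the host is free to do other work
cudaMemcpyAsync(d_array, h_array, size, cudaMemcpyHostToDevice, stream);
myKernel<<<numBlocks, blockSize, 0, stream>>>(d_array, n);

// Wait for all work queued on the stream before using the results on the host
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);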

Fortran

The basic syntax for cudaMemcpy in CUDA Fortran is:

istat = cudaMemcpy(destination, source, count, direction)
  • destination: The variable or array into which data will be copied.
  • source: The variable or array from which data will be copied.
  • count: The number of elements to copy (in CUDA Fortran this is an element count, not a byte count).
  • direction: The direction of data transfer, specified by one of the following constants:
    • cudaMemcpyHostToDevice – Copy data from the host (CPU) to the device (GPU).
    • cudaMemcpyDeviceToHost – Copy data from the device (GPU) to the host (CPU).
    • cudaMemcpyDeviceToDevice – Copy data between two device memory locations.
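
A minimal explicit-copy sketch (array names are illustrative; istat receives the returned error code):

real(8), device :: d_array(100)
real(8) :: h_array(100)
integer :: istat

! Explicit host-to-device copy of 100 elements
istat = cudaMemcpy(d_array, h_array, 100, cudaMemcpyHostToDevice)

! Explicit device-to-host copy
istat = cudaMemcpy(h_array, d_array, 100, cudaMemcpyDeviceToHost)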

Implicit data transfer

In CUDA Fortran it is also possible to transfer data between the host and device without explicitly calling cudaMemcpy, using Fortran array assignment and the automatic data movement provided by the language. These features simplify memory management and data transfers, making the code more concise and easier to write.

When an array is declared with the device attribute in CUDA Fortran, you can directly assign values between host and device arrays. The CUDA Fortran runtime handles the memory transfers behind the scenes.

Examples of implicit data transfers

  1. Host to Device Transfer
program implicit_host_to_device
  use cudafor
  implicit none

  real(8), device :: d_array(100)  ! Device array
  real(8) :: h_array(100)          ! Host array
  integer :: i

  ! Initialize the host array
  h_array = [(real(i, 8), i = 1, 100)]

  ! Assign host array to device array (host to device transfer)
  d_array = h_array

  print *, "Data transferred to device."
end program implicit_host_to_device
  2. Device to Host Transfer
program implicit_device_to_host
  use cudafor
  implicit none

  real(8), device :: d_array(100)  ! Device array
  real(8) :: h_array(100)          ! Host array
  integer :: i

  ! Initialize the device array with values (on the host)
  d_array = [(real(i, 8), i = 1, 100)]

  ! Assign device array to host array (device to host transfer)
  h_array = d_array

  print *, "Data transferred back to host:", h_array(1:10)
end program implicit_device_to_host

[!NOTE]

  • The runtime automatically determines when to perform host-to-device or device-to-host transfers based on array assignments. This eliminates the need for explicit cudaMemcpy calls.
  • The Fortran-style assignment syntax is more intuitive and concise than explicitly managing memory transfers.
  • While convenient, implicit data transfers may introduce hidden performance costs, especially if used frequently in a loop. Explicit cudaMemcpy might be better for performance-critical code as it provides finer control.

numBlocks and blockSize

CUDA uses a grid and block hierarchy to execute kernels in parallel.

  • blockSize: The number of threads per block.
  • numBlocks: The number of blocks in the grid.

This configuration determines how the workload is distributed across the GPU.

Choosing blockSize and numBlocks:

  • The total number of threads is numBlocks * blockSize.
  • blockSize must not exceed the maximum number of threads per block (1024 on most current GPUs).
  • numBlocks is often calculated as: int numBlocks = (n + blockSize - 1) / blockSize; // Ceiling division

Example:

int n = 1000;              // Total number of elements
int blockSize = 256;       // Threads per block
int numBlocks = (n + blockSize - 1) / blockSize; // Number of blocks

[!TIP]

  • A good choice of blockSize depends on the GPU hardware and the problem size. Common choices are 128, 256, or 512.

Multi-Dimensional Grids and Blocks

CUDA supports multi-dimensional grids and blocks, which are particularly useful for problems involving 2D or 3D data (e.g., images or volumes).

  • Threads in a Block: Each block can be 1D, 2D, or 3D.
    • Defined by dim3 blockDim.
    • Total threads per block: blockDim.x * blockDim.y * blockDim.z.
  • Blocks in a Grid: Grids can also be 1D, 2D, or 3D.
    • Defined by dim3 gridDim.
    • Total blocks: gridDim.x * gridDim.y * gridDim.z

Example

dim3 blockSize(8, 8, 8);  // Define a block with 8x8x8 threads (512 threads per block)
dim3 gridSize((Nx + blockSize.x - 1) / blockSize.x, 
              (Ny + blockSize.y - 1) / blockSize.y, 
              (Nz + blockSize.z - 1) / blockSize.z);  // Define a grid to cover the 3D array

// Launch the kernel
myKernel<<<gridSize, blockSize>>>(deviceArray, Nx, Ny, Nz);

In this code:

  • blockSize specifies the dimensions of each block (3D configuration).
  • gridSize ensures enough blocks to cover the entire 3D problem size.
  • The kernel myKernel is invoked with these grid and block configurations.

In Fortran:

program cuda_3d_example
  use cudafor
  implicit none

  integer, parameter :: Nx = 64, Ny = 64, Nz = 64
  type(dim3) :: blockSize, gridSize

  ! Define block dimensions
  blockSize = dim3(8, 8, 8)  ! Block of 8x8x8 threads

  ! Define grid dimensions (rounding up to cover all data)
  gridSize = dim3((Nx + blockSize%x - 1) / blockSize%x, &
                  (Ny + blockSize%y - 1) / blockSize%y, &
                  (Nz + blockSize%z - 1) / blockSize%z)

  ! Launch the kernel
  call myKernel<<<gridSize, blockSize>>>(Nx, Ny, Nz)

end program cuda_3d_example

CUDA Kernels

CUDA kernels are functions executed on the GPU. Key Features of Kernels:

  1. Parallel Execution:
  • Each thread executes the kernel independently and concurrently.
  2. Special Attributes:
  • Kernels must be declared with __global__ in CUDA C and with attributes(global) in CUDA Fortran.
  • They can only return void (i.e., they do not have a return value).
  3. Thread and Block Indices:
  • Threads and blocks are identified using special variables:
    • threadIdx: Index of a thread within its block.
    • blockIdx: Index of the block within the grid.
    • blockDim: Number of threads in a block.
    • gridDim: Number of blocks in the grid.
  • These indices are used to compute the unique thread ID for assigning work.
Figure 1. CUDA grid, block, and thread indexing. Source: http://thebeardsage.com/cuda-dimensions-mapping-and-indexing/

C

Kernels appear like normal functions but are defined with the __global__ qualifier. They are invoked from the host code. A kernel launch specifies the grid and block dimensions.

Syntax:

kernelName<<<numBlocks, blockSize>>>(arguments);
  • numBlocks: Number of blocks in the grid.
  • blockSize: Number of threads per block.

Example Kernel:

__global__ void addArrays(double *a, double *b, double *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x; // Compute global thread index
    if (idx < n) {
        c[idx] = a[idx] + b[idx]; // Perform addition for this thread
    }
}

Kernel Invocation:

int n = 1000;            // Number of elements
int blockSize = 256;     // Threads per block
int numBlocks = (n + blockSize - 1) / blockSize;

// Allocate memory and copy data as needed
addArrays<<<numBlocks, blockSize>>>(d_a, d_b, d_c, n);

[!TIP]

  • Ensure the number of threads covers the entire workload (n in the above example).
  • Synchronize if necessary to ensure all threads finish before proceeding: cudaDeviceSynchronize();

Fortran

Kernels are written with the attributes(global) specifier to indicate that they are callable from the host but execute on the device.

Example: A Simple Kernel in CUDA Fortran

Here’s an example of a kernel that adds two arrays element-wise:

module kernel_module
  use cudafor
contains
  attributes(global) subroutine add_arrays_kernel(a, b, c, n)
    real, device :: a(:), b(:), c(:)
    integer, value :: n
    integer :: idx

    ! Compute the unique thread index
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x

    ! Ensure we stay within array bounds
    if (idx <= n) then
      c(idx) = a(idx) + b(idx)
    end if
  end subroutine add_arrays_kernel
end module kernel_module

Steps to Launch a Kernel

  1. Declare Arrays on the Device:
real, device :: d_a(1000), d_b(1000), d_c(1000)
real :: h_a(1000), h_b(1000), h_c(1000)
  2. Transfer Data from Host to Device:
d_a = h_a
d_b = h_b
  3. Define Grid and Block Dimensions:
type(dim3) :: gridSize, blockSize
blockSize = dim3(256, 1, 1)                 ! 256 threads per block
gridSize = dim3((1000 + 255) / 256, 1, 1)   ! Enough blocks to cover all elements
  4. Launch the Kernel:
use kernel_module   ! (the use statement goes at the top of the program unit)
call add_arrays_kernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, 1000)
  5. Copy Results Back to Host:
h_c = d_c

[!NOTE]

  • Memory: Kernel arguments can include device arrays or scalars (passed by value!).
  • Scope: Kernels can only access device memory and cannot directly access host memory.
  • Concurrency: Each thread runs the same kernel code but works on different data, making it highly parallel.

Device functions

In CUDA programming, __device__ in C or attributes(device) in Fortran is used to declare functions that are executed on the GPU and can only be called by other GPU functions. These are referred to as device functions. Unlike kernels (which are marked with __global__ or attributes(global)), device functions cannot be launched directly from the host but are called within kernels or other device functions.

Key features:

  1. GPU-Only Execution:
  • Device functions are compiled for GPU execution only.
  • They are not accessible from the host.
  2. Calling Rules:
  • Can be called from other device functions or kernels.
  • Cannot be called directly from host functions.
  3. Inlined by Default:
  • Device functions are often inlined by the compiler to reduce the overhead of function calls.
  4. Purpose:
  • They help modularize GPU code by allowing reusable computations within kernels.

C

In CUDA C, __device__ is the keyword to define a device function.

Example:

#include <stdio.h>
#include <math.h>

// Device function to compute the cube of a number
__device__ double cube(double x) {
    return x * x * x;
}

// Kernel to use the device function
__global__ void compute_cubes(const double *input, double *output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        output[idx] = cube(input[idx]); // Calling the device function
    }
}

int main() {
    // Example host code would allocate arrays, transfer them to the device,
    // launch compute_cubes, and copy results back to the host.
}

Fortran

In CUDA Fortran, attributes(device) is used to define device functions.

Example:

module device_functions
  use cudafor
contains
  ! Device function to compute the cube of a number
  attributes(device) function cube(x) result(res)
    real, value :: x
    real :: res
    res = x ** 3
  end function cube

  ! Kernel to use the device function
  attributes(global) subroutine compute_cubes(input, output, n)
    real, device :: input(:), output(:)
    integer, value :: n
    integer :: idx

    ! Compute unique thread index
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x

    ! Ensure within bounds
    if (idx <= n) then
      output(idx) = cube(input(idx))  ! Calling the device function
    end if
  end subroutine compute_cubes
end module device_functions

Atomic Functions in CUDA

Atomic functions are special operations that allow safe updates to shared memory by ensuring that only one thread modifies a memory location at a time. They are essential in parallel programming to prevent race conditions when multiple threads access and modify shared data simultaneously.

General Considerations

  1. Atomicity: An operation is atomic if it is executed as a single, indivisible step. This guarantees that no other thread can interfere during the operation.
  2. Common Use Cases: Incrementing counters. Accumulating sums. Updating shared data structures like histograms.
  3. Performance Impact: Atomic operations serialize access to shared data, which can reduce performance due to contention. However, they are often faster and easier to implement than more complex synchronization methods.
  4. Supported Operations: CUDA supports atomic addition, subtraction, exchange, min/max, and logical operations such as AND/OR. These operations are hardware-dependent and optimized for GPU execution (a short atomicMax sketch follows below).
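
For instance, a minimal sketch (hypothetical names; *result should be initialized on the host, e.g. to INT_MIN, before the launch) of a kernel that uses atomicMax to record the largest element of an integer array:

__global__ void max_kernel(const int *data, int *result, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        // Atomically replace *result with the larger of its current value and data[idx]
        atomicMax(result, data[idx]);
    }
}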

C

In CUDA C, atomic functions are prefixed with atomic. For example, atomicAdd performs an atomic addition. These functions operate on integers and floats in global or shared memory.

Example: Atomic Addition (C)

#include <stdio.h>

__global__ void atomic_example(int *counter) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < 10) {
        atomicAdd(counter, 1); // Safely increment the counter
    }
}

int main() {
    int *d_counter, h_counter = 0;

    // Allocate memory and initialize
    cudaMalloc((void**)&d_counter, sizeof(int));
    cudaMemcpy(d_counter, &h_counter, sizeof(int), cudaMemcpyHostToDevice);

    // Launch kernel
    atomic_example<<<1, 32>>>(d_counter);

    // Copy back and print result
    cudaMemcpy(&h_counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Counter = %d\n", h_counter); // Should print 10

    cudaFree(d_counter);
    return 0;
}

Fortran

In CUDA Fortran, atomic operations are similarly supported through atomic functions such as atomicadd, which operate on variables in device memory and return the value the variable held before the update.

Example: Atomic Addition (Fortran)

module atomic_example
  use cudafor
contains
  attributes(global) subroutine increment_counter(counter)
    integer, device :: counter

    integer :: idx, istat
    idx = threadIdx%x + (blockIdx%x - 1) * blockDim%x

    if (idx <= 10) istat = atomicadd(counter, 1)  ! Safely increment the counter

  end subroutine increment_counter
end module atomic_example

program main
  use atomic_example
  use cudafor
  implicit none
  integer, device, allocatable :: d_counter
  integer :: h_counter

  ! Allocate and initialize counter
  allocate(d_counter)
  h_counter = 0
  d_counter = h_counter

  ! Launch kernel
  call increment_counter<<<1, 32>>>(d_counter)

  ! Copy back and print result
  h_counter = d_counter
  print *, "Counter =", h_counter  ! Should print 10

  deallocate(d_counter)
end program main

Compiling CUDA Programs with nvcc and nvfortran

To compile CUDA programs, you use either nvcc for CUDA C/C++ programs or nvfortran for CUDA Fortran programs. Below, we explain how to use these compilers, including basic commands and options.

Compiling CUDA C/C++ with nvcc

nvcc is the NVIDIA CUDA Compiler for compiling C or C++ programs with CUDA kernels.

  1. Basic Command:
nvcc -o program_name source_file.cu
  • source_file.cu: The CUDA source file.
  • program_name: The name of the output binary.
  2. Common Options:
  • -arch=sm_xx: Specify the GPU architecture (e.g., sm_70 for Volta, sm_80 for Ampere).
nvcc -arch=sm_80 -o program_name source_file.cu
  • -O2, -O3: Enable optimization levels.
  • -g: Include debugging information.

Compiling CUDA Fortran with nvfortran

nvfortran is part of the NVIDIA HPC SDK and is used to compile Fortran programs, including those with CUDA Fortran extensions.

  1. Basic Command:
nvfortran -o program_name source_file.f90
  • source_file.f90: The CUDA Fortran source file.
  • program_name: The name of the output binary.
  2. Common Options (see the combined example below):
  • -cuda: Enable CUDA Fortran features (enabled automatically for source files with the .cuf extension).
  • -Mcuda=ccX: Specify the compute capability of the target GPU (e.g., -Mcuda=cc80 for Ampere GPUs).
  • -O2, -O3: Enable optimization levels.
  • -g: Include debugging information.
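
For example, a typical invocation combining these options (program_name and source_file.f90 are placeholders) might look like:

nvfortran -Mcuda=cc80 -O3 -o program_name source_file.f90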

[!TIP] To query device properties, such as the compute capability, available memory, or the maximum number of threads per block, we can use the function cudaGetDeviceProperties (see example code here).
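
As an illustration, a minimal sketch in CUDA C that prints a few properties of device 0 (error checking omitted):

#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // Query properties of device 0

    printf("Device name:           %s\n", prop.name);
    printf("Compute capability:    %d.%d\n", prop.major, prop.minor);
    printf("Global memory (bytes): %zu\n", prop.totalGlobalMem);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    return 0;
}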