Effect of optimisation

Modern compilers do a large part of the heavy lifting needed to produce performant code. The number of strategies a modern compiler uses to optimise code for speed can be quite impressive, and choosing the right optimisation flags at compile time can really make a difference. However, this is true only if the data layout and the basic structure of the code allow these strategies to be applied. Still, by performing some optimisations by hand and looking at the resulting performance and disassembled code, we can learn which optimisation strategies work best and why. We will show that, with a bit of patience, we can even outperform the compiler and get close to the theoretical maximum which, as we will see, is set by the cache bandwidth for this specific problem.

Basic performance optimisation

Let's start with a simple function that adds up floating-point numbers:

/* sum_std.c */
double sum(int N, double *a) {
    double s = 0.0;
    for (int i = 0; i < N; i++) {
        s += a[i];
    }
    return s;
}

We can test this by calling the function repeatedly and collecting some statistics, using this code:

/* runner.c */
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>

double sum(int N, double *a);

#define NUM_RUNS 200
void time_function(int N, double *a, double (*func)(int, double*), double *mean, double *std) {
    clock_t start, end;
    double time, m=0.0, s=0.0;
    for (int i = 0; i < NUM_RUNS; i++) {
        start = clock();
        func(N, a);
        end = clock();
        time = (double)(end - start) / CLOCKS_PER_SEC;
        m += time;
        s += time*time;
    }
    m/=NUM_RUNS;
    *mean = m;
    *std = sqrt(s/NUM_RUNS - m*m);
}

int main(void) {
    int N = 1<<20;
    double mean, std;
    /* Enforce 32-byte memory alignment, as required by the aligned AVX loads used later */
    double *a = (double*) aligned_alloc( (size_t)32, N * sizeof(double));

    for (int i = 0; i < N; i++) a[i] = (double)(i + 1);

    time_function(N, a, sum, &mean, &std);
    printf("Average execution time for sum: %g ± %g ms \n", 1e3*mean, 1e3*std);
    free(a);
    return 0;
}
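
The runner only measures time; when comparing the different implementations below it is also handy to verify that each one returns the right value. For a[i] = i + 1 the exact result is N(N+1)/2 (and, for this input, every partial sum is an integer below 2^53, so it is exactly representable in a double). A minimal sketch of such a check (check_sum is a hypothetical helper, not part of runner.c):

/* check.c (illustrative) */
#include <stdio.h>

double sum(int N, double *a);

int check_sum(int N, double *a) {
    double expected = 0.5 * (double)N * ((double)N + 1.0);
    double got = sum(N, a);
    /* allow a tiny relative error, in case a variant reorders the additions */
    double rel = (got - expected) / expected;
    if (rel < 0.0) rel = -rel;
    if (rel > 1e-12) {
        printf("sum mismatch: got %.17g, expected %.17g\n", got, expected);
        return 1;
    }
    return 0;
}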

We can compile this with different optimisation flags, for example:

> # no optimisation
> gcc  -O0 -lm  runner.c sum_std.c -o sum  && ./sum
Average execution time for sum: 4.11662 ± 0.0645093 ms

We're doing 2**20 = 1,048,576 additions in about 4 ms, which means we're crunching numbers at a rate of 0.26 Gflops (1 flops = one floating-point operation per second). This is far from the 16 Gflops theoretical peak of the EPYC 7713 @ 2 GHz on which this code has been run.
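
Throughout this section the rate is obtained simply by dividing the number of additions by the measured mean time. A trivial helper to keep this arithmetic in one place (illustrative only, not part of runner.c):

/* rate in Gflops: one addition per array element */
double gflops(int N, double mean_seconds) {
    return (double)N / mean_seconds / 1e9;
}
/* e.g. gflops(1<<20, 4e-3) is about 0.26 */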

Switching on -O3 improves the performance by a factor of 4,

> # -O3
> gcc  -O3  -lm  runner.c sum_std.c -o sum  && ./sum
Average execution time for sum: 1.03888 ± 0.0608442 ms 

bringing us to about 1 Gflops, still more than 10 times slower than the theoretical peak. First of all, where does this factor of 4 come from?

Difference in the compiled code

If we produce the assembly code for the first case with

gcc -O0 -S sum_std.c 

we see that the main loop, .L3, loads one double at a time and adds it to the running sum:

	.file	"sum_std.c"
	.text
	.globl	sum
	.type	sum, @function
sum:
.LFB0:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	movl	%edi, -20(%rbp)
	movq	%rsi, -32(%rbp)
	pxor	%xmm0, %xmm0
	movsd	%xmm0, -8(%rbp)
	movl	$0, -12(%rbp)
	jmp	.L2
.L3:
	movl	-12(%rbp), %eax
	cltq
	leaq	0(,%rax,8), %rdx
	movq	-32(%rbp), %rax
	addq	%rdx, %rax
	movsd	(%rax), %xmm0
	movsd	-8(%rbp), %xmm1
	addsd	%xmm1, %xmm0
	movsd	%xmm0, -8(%rbp)
	addl	$1, -12(%rbp)
.L2:
	movl	-12(%rbp), %eax
	cmpl	-20(%rbp), %eax
	jl	.L3
	movsd	-8(%rbp), %xmm0
	popq	%rbp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE0:
	.size	sum, .-sum
	.ident	"GCC: (GNU) 11.4.1 20231218 (Red Hat 11.4.1-3)"
	.section	.note.GNU-stack,"",@progbits

Here's an annotated version of .L3:

.L3:
	movl	-12(%rbp), %eax      # Load the loop index (stored at -12(%rbp)) into %eax
	cltq                       # Sign-extend %eax (32-bit) into %rax (64-bit)
	leaq	0(,%rax,8), %rdx     # Compute the offset by multiplying the loop index by 8 and store it in %rdx
	movq	-32(%rbp), %rax      # Load the base address of the array (or pointer) from -32(%rbp) into %rax
	addq	%rdx, %rax           # Add the computed offset to the base address to get the current element address
	movsd	(%rax), %xmm0        # Load the double-precision float at the current element address into %xmm0
	movsd	-8(%rbp), %xmm1      # Load the running sum (or previous value) from -8(%rbp) into %xmm1
	addsd	%xmm1, %xmm0         # Add the running sum (%xmm1) to the current element (%xmm0); the result is left in %xmm0
	movsd	%xmm0, -8(%rbp)      # Store the updated running sum back into -8(%rbp)
	addl	$1, -12(%rbp)        # Increment the loop index (stored at -12(%rbp)) by 1

The version compiled with -O3 is different: its main loop, .L4 (the other labels, such as .L3 here, handle special array-length cases), makes use of vectorised load operations such as

	vmovupd	(%rax), %ymm3   # loads 4 doubles at a time, note the 256-bit ymm register

Here is the full listing:

	.file	"sum_std.c"
	.text
	.p2align 4
	.globl	sum
	.type	sum, @function
sum:
.LFB0:
	.cfi_startproc
	movl	%edi, %edx
	movq	%rsi, %rcx
	testl	%edi, %edi
	jle	.L7
	leal	-1(%rdi), %eax
	cmpl	$2, %eax
	jbe	.L8
	movq	%rsi, %rax
	movl	%edi, %esi
	vxorpd	%xmm0, %xmm0, %xmm0
	shrl	$2, %esi
	decl	%esi
	salq	$5, %rsi
	leaq	32(%rcx,%rsi), %rsi
	.p2align 4
	.p2align 3
.L4:
	vmovupd	(%rax), %xmm1
	vmovupd	(%rax), %ymm3
	addq	$32, %rax
	vaddsd	%xmm1, %xmm0, %xmm0
	vunpckhpd	%xmm1, %xmm1, %xmm1
	vaddsd	%xmm1, %xmm0, %xmm0
	vextractf128	$0x1, %ymm3, %xmm1
	vaddsd	%xmm1, %xmm0, %xmm0
	vunpckhpd	%xmm1, %xmm1, %xmm1
	vaddsd	%xmm1, %xmm0, %xmm0
	cmpq	%rsi, %rax
	jne	.L4
	movl	%edx, %eax
	andl	$-4, %eax
	testb	$3, %dl
	je	.L12
	vzeroupper
.L3:
	movslq	%eax, %rsi
	vaddsd	(%rcx,%rsi,8), %xmm0, %xmm0
	leaq	0(,%rsi,8), %rdi
	leal	1(%rax), %esi
	cmpl	%esi, %edx
	jle	.L1
	addl	$2, %eax
	vaddsd	8(%rcx,%rdi), %xmm0, %xmm0
	cmpl	%eax, %edx
	jle	.L1
	vaddsd	16(%rcx,%rdi), %xmm0, %xmm0
	ret
	.p2align 4
	.p2align 3
.L7:
	vxorpd	%xmm0, %xmm0, %xmm0
.L1:
	ret
	.p2align 4
	.p2align 3
.L12:
	vzeroupper
	ret
.L8:
	xorl	%eax, %eax
	vxorpd	%xmm0, %xmm0, %xmm0
	jmp	.L3
	.cfi_endproc
.LFE0:
	.size	sum, .-sum
	.ident	"GCC: (GNU) 11.4.1 20231218 (Red Hat 11.4.1-3)"
	.section	.note.GNU-stack,"",@progbits

The remaining operations, though, involve unpacking the 4 doubles and adding them up with scalar operations like

	vaddsd	%xmm1, %xmm0, %xmm0

Optimising code by hand with AVX intrinsics

It seems that the code above is not making full use of the vectorised operations. Let's try to force the compiler's hand by coding the function directly with AVX intrinsics:

/* sum_avx.c */

// this include provides all the necessary definitions
// and constants to use the AVX extensions:
#include <immintrin.h>

double sum(int N, double *a) {
    // vsum uses a 256-bit register and is set to zero
    __m256d vsum = _mm256_set1_pd(0.0);

    // we move 4 doubles at a time (4 x 64 = 256 bits)
    for (int i = 0; i < N; i += 4) {
        // load 4 doubles and store them in an AVX register
        __m256d value = _mm256_load_pd(&a[i]);
        // add the 4 doubles, column by column, to vsum
        vsum = _mm256_add_pd(vsum, value);
    }
    // add the upper 128-bit lane of vsum to the lower one,
    vsum = _mm256_add_pd(vsum, _mm256_permute2f128_pd(vsum, vsum, 1));
    // then add the two doubles of the lower lane horizontally
    vsum = _mm256_hadd_pd(vsum, vsum);
    // the lowest double-precision element now holds the full sum
    return _mm256_cvtsd_f64(vsum);
}
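
Note that this version silently assumes that N is a multiple of 4 and that a is 32-byte aligned, as _mm256_load_pd requires; this is fine for our runner, which allocates an aligned array of 2**20 elements. For general inputs one would use unaligned loads and a scalar tail. A defensive sketch along those lines (illustrative only, not the version benchmarked below):

/* sum_avx_safe.c (illustrative) */
#include <immintrin.h>

double sum_safe(int N, double *a) {
    __m256d vsum = _mm256_set1_pd(0.0);
    int i = 0;
    // unaligned loads accept any pointer, possibly at a small cost
    for (; i + 4 <= N; i += 4)
        vsum = _mm256_add_pd(vsum, _mm256_loadu_pd(&a[i]));
    // reduce across lanes, as in the aligned version above
    vsum = _mm256_add_pd(vsum, _mm256_permute2f128_pd(vsum, vsum, 1));
    vsum = _mm256_hadd_pd(vsum, vsum);
    double s = _mm256_cvtsd_f64(vsum);
    // scalar tail for the last N % 4 elements
    for (; i < N; i++) s += a[i];
    return s;
}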

The intrinsics version turns out to be way faster (a factor of 16 over the -O0 version, and a factor of 4 over the -O3 version):

> gcc  -O3 -lm  -mavx2 runner.c sum_avx.c -o sum  && ./sum
Average execution time for sum: 0.264505 ± 0.0396432 ms 

That's much better: almost 4 Gflops, yet still only a quarter of the 16 Gflops theoretical peak.

How did we manage to get to this performance? Let's have a look at the corresponding assembly code generated with

gcc -O3 -march=native -mtune=native -S sum_avx.c

or, equivalently on this machine (an AMD EPYC 7713), with

gcc -O3 -mavx2 -S sum_avx.c

	.file	"sum_avx.c"
	.text
	.p2align 4
	.globl	sum
	.type	sum, @function
sum:
.LFB5484:
	.cfi_startproc
	testl	%edi, %edi
	jle	.L4
	decl	%edi
	vxorpd	%xmm0, %xmm0, %xmm0
	shrl	$2, %edi
	salq	$5, %rdi
	leaq	32(%rsi,%rdi), %rax
	.p2align 4
	.p2align 3
.L3:
	vaddpd	(%rsi), %ymm0, %ymm0
	addq	$32, %rsi
	cmpq	%rsi, %rax
	jne	.L3
	vhaddpd	%ymm0, %ymm0, %ymm0
	vzeroupper
	ret
	.p2align 4
	.p2align 3
.L4:
	vxorpd	%xmm0, %xmm0, %xmm0
	vhaddpd	%ymm0, %ymm0, %ymm0
	vzeroupper
	ret
	.cfi_endproc
.LFE5484:
	.size	sum, .-sum
	.ident	"GCC: (GNU) 11.4.1 20231218 (Red Hat 11.4.1-3)"
	.section	.note.GNU-stack,"",@progbits

where it is clear that the bulk of the work is done in .L3 by these lines:

.L3:
    vaddpd (%rsi), %ymm0, %ymm0   # Add 4 packed double-precision floats from memory (pointed to by %rsi)
                                   # to the values currently in the %ymm0 register. The result is stored back in %ymm0.
                                   # This is a vectorized addition, processing 4 doubles in parallel.

    addq    $32, %rsi              # Increment the memory pointer %rsi by 32 bytes (4 doubles x 8 bytes per double).
                                   # This moves to the next set of 4 double-precision floating-point numbers.

    cmpq    %rsi, %rax             # Compare the updated pointer %rsi with %rax (which holds the address 
                                   # marking the end of the array). This checks if we’ve processed all the elements.

    jne     .L3                    # If %rsi has not yet reached %rax (i.e., there are more elements left to process), 
                                   # jump back to label .L3 and repeat the loop.

This way we fully exploit the AVX instructions, adding packed double-precision floats while traversing memory in steps of 4 doubles.

Non-associative arithmetic is blocking further optimisations

Why is the -O3 flag alone not using the vaddpd instruction? Because using packed additions would change the order of the operations. Remember that in floating-point arithmetic (a+b)+c is not necessarily the same as a+(b+c); a small standalone demonstration is shown at the end of this subsection. In the code that uses vaddpd we first add up the values within each of the four columns, and only then add together the four partial sums. This is not what our initial C code asks for (summing all the doubles one after another). Let's quick-fix this by relaxing the constraint that prevents the compiler from reordering, passing -ffast-math at compile time:

> gcc  -O3 -mtune=native -march=native  -ffast-math -lm  runner.c sum_std.c -o sum  && ./sum
Average execution time for sum: 0.265075 ± 0.0677651 ms 

We have indeed recovered the same performance, and the assembly shows that the compiler now makes full use of the AVX instructions. For example, in the output of

gcc -O3 -march=native -mtune=native -ffast-math -S sum_std.c

we find:

.L4:
	vaddpd	(%rax), %ymm0, %ymm0
	addq	$32, %rax
	cmpq	%rcx, %rax
	jne	.L4
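
As an aside, the reordering that -ffast-math licenses really can change the result, because floating-point addition is not associative. A tiny standalone example (unrelated to the benchmark code):

/* assoc.c (illustrative) */
#include <stdio.h>

int main(void) {
    double a = 1e16, b = -1e16, c = 1.0;
    printf("(a+b)+c = %g\n", (a + b) + c);   /* prints 1 */
    printf("a+(b+c) = %g\n", a + (b + c));   /* prints 0: c is absorbed when added to b */
    return 0;
}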

Providing proper code to the compiler

Using -ffast-math might not be what we want in some cases. How can we convince the compiler to reach this level of optimisation without this "trick"? The answer is to provide code that is already arranged in four "columns", so that using the packed sum vaddpd does not change the order of the operations; we "unroll" the loop manually:

/* sum_unrolled.c */
double sum(int N, double *a) {
    double s[4];
    for (int j=0;j<4;j++) s[j]=0.0;

    for (int i = 0; i < N; i+=4) {   /* assumes N is a multiple of 4 */
        for (int j=0;j<4;j++) s[j] += a[i+j];
    }
    return s[0]+s[1]+s[2]+s[3];
}

This yields the desired performance without using -ffast-math:

> gcc  -O3 -mtune=native -march=native  -lm  runner.c sum_unrolled.c -o sum  && ./sum
Average execution time for sum: 0.26451 ± 0.00615288 ms 

Can we get closer to peak performance?

Considering that we're doing 2**20 = 1,048,576 additions in 0.27 ms, we're crunching numbers at a rate of 3.9 Gflops. The EPYC 7713 runs at a clock speed of 2 GHz. It's a Zen 3 architecture which, for the packed add (VADDPS/D) instructions, has a latency of three clock cycles and a reciprocal throughput of 0.5 cycles per instruction, meaning that every cycle it can issue two such additions, each operating on 4 double-precision floats.
Under ideal conditions (an effortless, continuous stream of data from the L1 cache to the SIMD units), this totals 8 flops x 2 GHz = 16 Gflops. This is how we arrived at the theoretical peak performance mentioned before.

[!TIP] You can find the information on latency and CPI of the instructions of many modern processors on Agner Fog's Software optimization resources, in particular in the instruction tables. For Intel processors, you can also refer to the intrinsics guide.

The fact that the CPI is 0.5 suggests that we could pack more than one vaddpd instruction into the loop body, because at the moment, as is also apparent from the assembly code

.L4:
	vaddpd	(%rax), %ymm0, %ymm0
	addq	$32, %rax
	cmpq	%rcx, %rax
	jne	.L4

only one vaddpd is executed before the pointer is moved forward. Let's try loading two blocks of 4 doubles in each iteration, accumulating them into two independent registers:

/* sum_avx.c */
#include <immintrin.h>
double sum(int N, double *a) {
    // two independent accumulators of 4 doubles each
    __m256d vsum1 = _mm256_set1_pd(0.0);
    __m256d vsum2 = _mm256_set1_pd(0.0);
    for (int i = 0; i < N; i += 8) {
        vsum1 = _mm256_add_pd(vsum1, _mm256_load_pd(&a[i]) );
        vsum2 = _mm256_add_pd(vsum2, _mm256_load_pd(&a[i+4]) );
    }
    // combine the two accumulators, then reduce across lanes as before
    __m256d vsum = _mm256_add_pd(vsum1, vsum2);
    vsum = _mm256_add_pd(vsum, _mm256_permute2f128_pd(vsum, vsum, 1));
    vsum = _mm256_hadd_pd(vsum, vsum);
    return _mm256_cvtsd_f64(vsum);
}

> gcc  -lm -O3 -mavx2  runner.c sum_avx.c -o sum  && ./sum
Average execution time for sum:  0.132805 ± 0.0319479 ms

This is a noticeable performance boost, to 7.9 Gflops! However, this is still only about half of the theoretical peak. Where is the remaining time being "wasted"?

Let's consider the data size: 2**20 doubles = 8 MB, which is too large for the L1 and L2 caches and fits only in L3 on the EPYC 7713 (L3: 32 MB per CCD, shared by its 8 cores; L2: 512 kB/core; L1d: 32 kB/core). This means that there is a bottleneck in moving data from L3 down to the registers. The peak cache bandwidth is about 32 bytes per clock cycle, i.e. 32 bytes / (8 bytes per flop) x 2 GHz = 8 Gflops, showing that the code above, crunching numbers at 7.9 Gflops, has essentially saturated the bandwidth and reached the practical limit for this kind of operation on this CPU.

[!NOTE] One cache line on the 7713 is exactly 64 bytes, i.e. the 8 double-precision floats we load in each iteration of the code above. In this sense, the code makes the most efficient use of the cache bandwidth.

[!IMPORTANT] It's always important to understand whether your calculation is compute-limited, with many operations to be performed on data residing in the registers or close by, or memory-limited, where the work is less computationally intensive but has to be performed on a large set of data that must be streamed continuously from RAM/cache to the registers. In the latter case the bottleneck is the memory bandwidth, which usually prevents you from reaching the theoretical peak flops.
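
As a back-of-the-envelope check using the numbers from this section (a rough estimate in the spirit of a roofline model, not a formal analysis):

/* attainable rate of a streaming reduction = bandwidth x arithmetic intensity */
double attainable_gflops(double bytes_per_cycle, double ghz, double flops_per_byte) {
    /* bytes/cycle x cycles/ns = GB/s; GB/s x flop/byte = Gflop/s */
    return bytes_per_cycle * ghz * flops_per_byte;
}
/* attainable_gflops(32.0, 2.0, 1.0/8.0) == 8.0, in line with the measured 7.9 Gflops */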