This is an elementary comparison of “loop speeds” in some classical “scientific” languages. See summary at the end.

This experiment was conducted on MacOSX 10.14.5 on

lcambier$ sysctl -n machdep.cpu.brand_string
Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz

C++ version

lcambier$ gcc --version
Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr --with-gxx-include-dir=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.14.sdk/usr/include/c++/4.2.1
Apple LLVM version 10.0.1 (clang-1001.0.46.4)
Target: x86_64-apple-darwin18.6.0
Thread model: posix
InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin

The code

#include <stdio.h>
#include <chrono>
#include <iostream>

const int N = 100000000;

using namespace std;

int main() {
    auto start = std::chrono::system_clock::now();
    double prod = 1.0;
    double inc = 1.0 / ((double)N);
    for(int i = 0; i < N; i++) {
        prod *= (1.0 + inc);
    }
    auto end = std::chrono::system_clock::now();
    printf("Prod (~ e?): %e\n", prod);
    cout << "C++ took " << chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us.\n";
    return 0;
}
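The loop computes prod = (1 + 1/N)^N, which tends to e as N grows, hence the “Prod (~ e?)” label. As a quick sanity check (a standalone sketch, not part of the benchmark), the same value can be obtained in closed form and compared against exp(1):

#include <cmath>
#include <cstdio>

int main() {
    const int N = 100000000;
    // Closed form of what the benchmark loop computes: (1 + 1/N)^N.
    double closed = std::pow(1.0 + 1.0 / (double)N, (double)N);
    std::printf("closed form (1 + 1/N)^N: %.15f\n", closed);
    std::printf("exp(1):                  %.15f\n", std::exp(1.0));
    // The two agree to roughly 8 significant digits for N = 1e8,
    // since (1 + 1/N)^N ~ e * (1 - 1/(2N)).
    return 0;
}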

Using -O0

lcambier$ g++ -O0 loop.cpp -std=c++11
lcambier$ ./a.out
Prod (~ e?): 2.718282e+00
C++ took 316146us.

So it took about 0.3 seconds. Part of the assembly is shown below. The core of the loop is at 0000000100000fee (the addition computing 1.0 + 1/N) and 0000000100000ff3 (the multiplication). Note that at -O0 every value is reloaded from and stored back to the stack (the movsd and movl instructions addressing %rbp offsets) on every iteration.

0000000100000fb4    movsd   0xefc(%rip), %xmm0
0000000100000fbc    movsd   0xefc(%rip), %xmm1
0000000100000fc4    movq    %rax, -0x10(%rbp)
0000000100000fc8    movsd   %xmm1, -0x18(%rbp)
0000000100000fcd    movsd   %xmm0, -0x20(%rbp)
0000000100000fd2    movl    $0x0, -0x24(%rbp)
0000000100000fd9    cmpl    $0x5f5e100, -0x24(%rbp)
0000000100000fe0    jge 0x10000100b
0000000100000fe6    movsd   0xed2(%rip), %xmm0
0000000100000fee    addsd   -0x20(%rbp), %xmm0
0000000100000ff3    mulsd   -0x18(%rbp), %xmm0
0000000100000ff8    movsd   %xmm0, -0x18(%rbp)
0000000100000ffd    movl    -0x24(%rbp), %eax
0000000100001000    addl    $0x1, %eax
0000000100001003    movl    %eax, -0x24(%rbp)
0000000100001006    jmp 0x100000fd9

Using -O3

lcambier$ g++ -O3 loop.cpp -std=c++11
lcambier$ ./a.out
Prod (~ e?): 2.718282e+00
C++ took 128346us.

So about 0.12 seconds. The (partial) assembly is shown below; compare with the above. The loop unrolling is clear from 0000000100000a80 through 0000000100000aa4: the multiplication is repeated ten times per pass, with the factor (1.0 + inc) kept in a register, so the counter update and branch are paid only once per ten multiplications. This, together with the removal of the per-iteration stack traffic, likely explains the performance difference with the -O0 version. A C++ sketch of the equivalent unrolled loop is given after the listing.

0000000100000a65    movsd   0x493(%rip), %xmm1
0000000100000a6d    movq    %rax, %r14
0000000100000a70    movsd   0x490(%rip), %xmm0
0000000100000a78    nopl    (%rax,%rax)
0000000100000a80    mulsd   %xmm0, %xmm1
0000000100000a84    mulsd   %xmm0, %xmm1
0000000100000a88    mulsd   %xmm0, %xmm1
0000000100000a8c    mulsd   %xmm0, %xmm1
0000000100000a90    mulsd   %xmm0, %xmm1
0000000100000a94    mulsd   %xmm0, %xmm1
0000000100000a98    mulsd   %xmm0, %xmm1
0000000100000a9c    mulsd   %xmm0, %xmm1
0000000100000aa0    mulsd   %xmm0, %xmm1
0000000100000aa4    mulsd   %xmm0, %xmm1
0000000100000aa8    addl    $-0xa, %ebx
0000000100000aab    jne 0x100000a80
0000000100000aad    movsd   %xmm1, -0x18(%rbp)
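For illustration, the unrolled loop above corresponds roughly to the following C++ (a sketch of what the compiler is effectively doing, not the code that was benchmarked):

// Rough equivalent of the -O3 loop: the factor (1.0 + inc) is computed once and
// kept in a register, and the counter/branch overhead is paid once per ten multiplies.
// (The compiler may also emit a remainder loop; omitted here since N is a multiple of 10.)
#include <cstdio>

int main() {
    const int N = 100000000;
    const double inc = 1.0 / (double)N;
    const double factor = 1.0 + inc;
    double prod = 1.0;
    for (int i = 0; i < N; i += 10) {
        prod *= factor; prod *= factor; prod *= factor; prod *= factor; prod *= factor;
        prod *= factor; prod *= factor; prod *= factor; prod *= factor; prod *= factor;
    }
    std::printf("Prod (~ e?): %e\n", prod);
    return 0;
}

Note that the ten multiplications still form a single dependency chain (each needs the previous prod), so the speedup over -O0 likely comes from eliminating the per-iteration stack traffic and loop overhead, not from executing the multiplies in parallel.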

Julia

lcambier$ julia --version
julia version 1.1.0

The code (loop.jl)

using Printf

function main()
    @time begin
        N = 100000000
        inc = 1.0 / N
        prod = 1.0
        for i = 1:N
            prod = prod * (1.0 + inc)
        end
    end
    @printf("Julia Prod (~ e?): %e\n", prod)
end
Then, from the REPL

julia> include("loop.jl")
main (generic function with 1 method)

julia> main()
  0.131840 seconds
Julia Prod (~ e?): 2.718282e+00

julia> main()
  0.147879 seconds
Julia Prod (~ e?): 2.718282e+00

The second timing should be taken as the reference; the first call may include JIT compilation overhead.

Python

lcambier$ python3 --version
Python 3.6.8 :: Anaconda custom (x86_64)

The code (loop.py)

import time

def main():
    t0 = time.time()
    N = 100000000
    inc = 1.0 / N
    prod = 1.0
    for i in range(N):
        prod = prod * (1.0 + inc)
    t1 = time.time()  # stop the clock before printing, so the I/O is not timed
    print("Python Prod (~ e?): {}\n".format(prod))
    print("Time: {} s.".format(t1 - t0))

main()

lcambier$ python3 loop.py
Python Prod (~ e?): 2.71828179834636

Time: 6.2512829303741455 s.

Matlab

The code (loop.m)

tic();
N = 100000000;
inc = 1/N;
prod = 1.0;
i = 1;
while i <= N  % run the loop N times, matching the other languages
    prod = prod * (1 + inc);
    i = i+1;
end
fprintf('Matlab Prod (~ e?): %e\n', prod);
toc();

From the Matlab prompt

>> loop
Matlab Prod (~ e?): 2.718282e+00
Elapsed time is 125.433666 seconds.

Go

The code

package main

import "fmt"

func main() {
	N := 100000000
	inc := 1.0 / float64(N)
	prod := 1.0
	for i := 0; i < N; i++ {
		prod = prod * (1.0 + inc)
	}
	// Printf (not Println) so that the %e format verb is actually applied.
	fmt.Printf("Go Prod (~ e?): %e\n", prod)
}

lcambier$ time ./comp_loop
Go Prod (~ e?): 2.718282e+00

real	0m0.146s
user	0m0.140s
sys	0m0.003s

Summary

  • C++ (-O0): 0.3 secs
  • C++ (-O3): 0.12 secs
  • Julia: 0.14 secs
  • Python: 6.2 secs
  • Matlab: 125 secs
  • Go: 0.14 secs

We can safely assume that roughly 0.12-0.15 seconds is about as good as we can do for this loop in a compiled (or JIT-compiled) language. The -O0 build is slower simply because of the lack of any optimization. Python, as expected, is significantly slower (it’s interpreted). And Matlab, well…
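As a rough back-of-the-envelope check (assuming the nominal 2.7 GHz clock and ignoring turbo boost), the -O3 run works out to only a few cycles per loop iteration:

\[
\frac{0.128\,\mathrm{s} \times 2.7 \times 10^{9}\,\mathrm{cycles/s}}{10^{8}\,\mathrm{iterations}} \approx 3.5\ \mathrm{cycles/iteration},
\]

which is on the order of the latency of a single double-precision multiply. Since each multiplication depends on the previous one, the loop is essentially latency-bound, and there is little room to go faster without restructuring the computation.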