Hello,
I’m very surprised by some benchmarks I did recently. Hopefully someone here has an idea of what is going on. To get my main point over quickly, here are the results of running a quick informal benchmark on two different systems, a laptop from 2011 and a modern beefy desktop.
% lscpu | grep 'Model name'
Model name: Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz
% python3 -m timeit -s 'def fib(n): return fib(n-2) + fib(n-1) if n > 1 else 1' 'fib(23)'
50 loops, best of 5: 6.96 msec per loop
% lscpu | grep 'Model name'
Model name: 12th Gen Intel(R) Core(TM) i7-12700K
% python3 -m timeit -s 'def fib(n): return fib(n-2) + fib(n-1) if n > 1 else 1' 'fib(23)'
50 loops, best of 5: 5.13 msec per loop
Both machines are running Debian stable, and the system Python is used (Python 3.11). I have also compiled 3.12.4 from source (using `./configure --enable-optimizations`) and it gives the same results.
Now the single-thread speed difference between both CPUs (i5-2520M, 12700K) should amount to a factor of about 3. This is confirmed by a non-Python benchmark: the command `sysbench cpu run` produces results ("CPU speed") of, respectively, 883 and 2452.
Perhaps the Fibonacci “benchmark” is flawed? Executing the script listed below (which is the implementation of a real-world tensor network contraction algorithm that is mostly juggling dicts and sets of tuples of ints) outputs (respectively):
cwg@drac:/tmp% python3 contract.py
...
time: 8.352014541625977
cwg@smok:/tmp% python3 contract.py
...
time: 7.0084943771362305
Again, the results are the same with Debian’s system Python and with CPython 3.12.4 compiled from source.
But perhaps CPython integer code simply does not profit much from modern CPUs (because of its memory access patterns, say)? I asked a colleague to execute the listed script on his laptop with i7-1265U CPU and he gets
time: 2.9437015056610107
both with Ubuntu’s system Python and the one from micromamba. So much better integer performance is possible. (I also measure a running time of 6.40 on a server with an EPYC 7452 processor and Python 3.9.)
Note that the CPU of my colleague’s laptop belongs to the same generation from the same manufacturer (Alder Lake) as the desktop. So one could expect both to perform similarly, but the desktop should be slightly faster.
Any ideas?
# Implementations of algorithms from
# "Faster identification of optimal contraction sequences for tensor networks"
# by Robert N. C. Pfeifer, Jutho Haegeman, Frank Verstraete
# https://arxiv.org/abs/1304.6112
#
# Written in 2024 by Christoph Groth <christoph.groth@cea.fr>
#
# This is work in progress.
from math import prod, inf
from itertools import combinations
import collections
import collections.abc
import time
# The tensor network is made up by interconnected *simple* tensors.
# Any number of simple tensors can be contracted into a *composite* tensor.
# We call that number the *level* of the composite tensor.
class Network(collections.abc.Sequence):
    """An immutable tensor network.

    Each tensor is stored as a tuple of integer index (edge) ids, and
    ``dims[i]`` is the dimension associated with index ``i``.  Indices
    shared by two tensors represent the edges of the network.

    Note: ``Sequence`` already subclasses ``Iterable``, so listing both
    bases (as an earlier version did) is redundant.
    """

    def __init__(self, tensors, dims):
        # Freeze everything: tuples make the network hashable-friendly
        # and safe against accidental mutation.
        self.tensors = tuple(tuple(t) for t in tensors)
        self.dims = tuple(dims)

    def __len__(self):
        return len(self.tensors)

    def __getitem__(self, index):
        return self.tensors[index]

    def __iter__(self):
        # Sequence would synthesize iteration via repeated __getitem__;
        # delegating to the underlying tuple iterator is faster.
        return iter(self.tensors)

    def cost(self, inds):
        """Return the product of the dimensions of the given indices."""
        return prod(self.dims[ind] for ind in inds)

    @classmethod
    def make_rect(cls, width, height, val):
        """Build the tensor network of a rectangular lattice.

        Nodes lie on a ``width`` x ``height`` grid; every edge between
        horizontally or vertically adjacent nodes becomes an index of
        dimension ``val`` shared by the two corresponding tensors.

        Returns a ``Network`` instance.  (The previous docstring wrongly
        described the return value as an iterable of edge triples.)
        """
        assert width >= 1
        assert height >= 1
        tensors = collections.defaultdict(list)
        dims = []
        # Horizontal edges
        for y in range(height):
            for x in range(width - 1):
                eid = len(dims)  # next fresh edge/index id
                dims.append(val)
                tensors[x, y].append(eid)
                tensors[x + 1, y].append(eid)
        # Vertical edges
        for y in range(height - 1):
            for x in range(width):
                eid = len(dims)
                dims.append(val)
                tensors[x, y].append(eid)
                tensors[x, y + 1].append(eid)
        return cls(tensors.values(), dims)
# The cache is used to remember the best (so far) way to contract a given
# composite tensor. The key is a tuple of sorted node ids that uniquely
# identifies the composite tensor. The value is a 3-tuple of the following
# values:
#
# - construction cost
# - set of indices
# - construction sequence: If the sequence has length one it is represented
# by the index of the corresponding tensor. Otherwise it's a 2-tuple of
# tensor ids.
def _init_cache(network):
"""Create cache entries for the simple tensors."""
return {(t,): (0, set(inds), t) for t, inds in enumerate(network)}
def _extract_seq(cache, ten):
def seq(ten):
_, _, s = cache[ten]
if not isinstance(s, tuple):
return s
a, b = s
# TODO: b does not need to be stored, it can be extracted:
assert b == tuple(sorted(set(ten) - set(a)))
return seq(a), seq(b)
return seq(ten)
def _any_common(seq0, seq1):
i0, i1 = iter(seq0), iter(seq1)
e0, e1 = next(i0), next(i1)
try:
while True:
if e0 == e1:
return True
elif e0 < e1:
e0 = next(i0)
else:
e1 = next(i1)
except StopIteration:
return False
def seq_bf_cap(network):
    """Exhaustive breadth-first search with cost capping

    Find a cheapest contraction sequence for `network` by enumerating
    composite tensors level by level (level = number of simple tensors
    merged), only admitting candidates whose construction cost stays
    below a cap that is raised geometrically until the full contraction
    is found.  Returns ``(cost, sequence)`` where `sequence` is a nested
    tuple of simple tensor ids (see _extract_seq).
    """
    n = len(network)
    simple = tuple(range(n))
    # caches[lev] maps sorted tensor-id tuples of length `lev` to
    # (cost, indices, construction) entries; caches[0] is unused padding
    # so that the list is indexable by level directly.
    caches = [None]
    caches.append(_init_cache(network))
    for i in range(n - 1):
        caches.append(dict())
    min_dims = min(network.dims)
    cost_cap = 1  # Current cost cap
    cost_old = 0  # Previous cost cap
    # Iterate until the level-n cache holds the full contraction.
    while not caches[n]:
        # Tensors that have been updated in this round.
        new = set()
        # Lowest encountered cost that lies above the cost cap
        cost_next = inf
        # Loop over level of tensor to be constructed.
        for lev in range(2, n + 1):
            # Loop over level of source tensor a. The level of source tensor
            # b follows.
            for lev_a in range(1, lev // 2 + 1):
                lev_b = lev - lev_a
                for ten_a in caches[lev_a]:
                    for ten_b in caches[lev_b]:
                        # Avoid generating (c, d), (a, b) together with
                        # (a, b), (c, d).
                        if lev_a == lev_b and ten_a[0] > ten_b[0]:
                            continue
                        # Sources must be disjoint sets of simple tensors.
                        if _any_common(ten_a, ten_b):
                            continue
                        cost_a, inds_a, _ = caches[lev_a][ten_a]
                        cost_b, inds_b, _ = caches[lev_b][ten_b]
                        # Ignore outer products (not part of the original algorithm).
                        if not (inds_a & inds_b):
                            continue
                        cost = cost_a + cost_b + network.cost(inds_a | inds_b)
                        if cost > cost_cap:
                            # Too expensive this round; remember the cheapest
                            # rejected cost as the basis for the next cap.
                            if cost < cost_next:
                                cost_next = cost
                        elif cost_old < cost or ten_a in new or ten_b in new:
                            # Only (re)consider pairs not already fully explored
                            # in a previous round (cost newly under the cap, or
                            # one of the sources was improved this round).
                            ten = list(ten_a)
                            ten.extend(ten_b)
                            ten.sort()
                            ten = tuple(ten)
                            # Query best known way to contract current tensor.
                            known = caches[lev].get(ten)
                            if known is None:
                                # Contracted indices cancel; the symmetric
                                # difference is what remains open.
                                inds = inds_a ^ inds_b
                            else:
                                known_cost, inds, _ = known
                                assert inds == inds_a ^ inds_b
                                if cost >= known_cost:
                                    continue
                            caches[lev][ten] = cost, inds, (ten_a, ten_b)
                            new.add(ten)
        cost_old = cost_cap
        # Raise the cap at least geometrically (factor min_dims) but jump
        # straight to the cheapest rejected candidate if that is higher.
        cost_cap = max(cost_next, min_dims * cost_cap)
    cost = caches[n][simple][0]
    # Merge all per-level caches for sequence extraction.
    cache = {}
    for c in caches[1:]:
        cache.update(c)
    return cost, _extract_seq(cache, simple)
def main():
    """Build a 5x5 lattice of bond dimension 2 and time its contraction search."""
    network = Network.make_rect(5, 5, 2)
    print(network.tensors)
    print(network.dims)
    print()
    start = time.time()
    result = seq_bf_cap(network)
    print(result)
    print("time:", time.time() - start)


if __name__ == "__main__":
    main()