It seems to actually be about the location of the raise. Check these out:
def before():
    """Benchmark variant: the large never-executed block precedes the raise.

    The ``if 0 == 1`` body never runs; it only places a lot of code ahead of
    the ``try``/``raise``.  Keep the statement order exactly as is, because
    the position of that block relative to the ``raise`` is what is measured.
    """
    if 0 == 1:
        # Filler: 100 bare global loads of ``u``; never reached.
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True
def after():
    """Benchmark variant: the large never-executed block follows the raise.

    Same statements as ``before``, but the ``if 0 == 1`` filler comes after
    the ``try``/``except`` instead of ahead of it.  Keep the statement order
    exactly as is; the placement of the filler is what is measured.
    """
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    if 0 == 1:
        # Filler: 100 bare global loads of ``u``; never reached.
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True
They do exactly the same thing and differ only in whether the large unreached code block comes before or after the error. But their runtimes still differ a lot:
344.4 ± 1.3 ns after
1773.3 ± 3.5 ns before
Python: 3.11.4 (main, Jun 24 2023, 10:18:04) [GCC 13.1.1 20230429]
Benchmark code
from timeit import timeit
from time import perf_counter as time
from statistics import mean, stdev
import sys
def before():
    """Benchmark variant: the large never-executed block precedes the raise.

    The ``if 0 == 1`` body never runs; it only places a lot of code ahead of
    the ``try``/``raise``.  Keep the statement order exactly as is, because
    the position of that block relative to the ``raise`` is what is measured.
    """
    if 0 == 1:
        # Filler: 100 bare global loads of ``u``; never reached.
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True
def after():
    """Benchmark variant: the large never-executed block follows the raise.

    Same statements as ``before``, but the ``if 0 == 1`` filler comes after
    the ``try``/``except`` instead of ahead of it.  Keep the statement order
    exactly as is; the placement of the filler is what is measured.
    """
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    if 0 == 1:
        # Filler: 100 bare global loads of ``u``; never reached.
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True
# Functions to compare; each is timed in turn inside every round.
funcs = before, after

# Three independent rounds, each printing its own result table.
for _ in range(3):
    # Per-function list of measured seconds per call for this round.
    times = {f: [] for f in funcs}

    def stats(f):
        """Format mean ± stdev (in ns) of the 5 fastest samples for *f*."""
        ts = [t * 1e9 for t in sorted(times[f])[:5]]
        return f'{mean(ts):6.1f} ± {stdev(ts):4.1f} ns '

    # Interleave the functions so each sample set spans the whole round.
    for _ in range(100):
        for f in funcs:
            # timeit returns total seconds for 10**4 calls; divide for per-call.
            t = timeit(f, number=10**4) / 1e4
            times[f].append(t)

    # Sorting by the formatted string lists the rows in text order.
    for f in sorted(funcs, key=stats):
        print(stats(f), f.__name__)
    print()

print('Python:', sys.version)
So putting it anywhere before the raise makes it slow, and putting it anywhere after the raise makes it fast.
Code
from timeit import timeit
from time import perf_counter as time
from statistics import mean, stdev
import sys
def before_try():
    """Benchmark variant: the never-executed filler sits before the ``try``.

    The ``if 0 == 1`` body never runs; only its position relative to the
    ``raise`` differs between the variants, so keep the statement order.
    """
    if 0 == 1:
        # Filler: 100 bare global loads of ``u``; never reached.
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True
def between_try_and_raise():
    """Benchmark variant: the filler sits inside ``try``, ahead of the raise.

    The ``if 0 == 1`` body never runs; only its position relative to the
    ``raise`` differs between the variants, so keep the statement order.
    """
    try:
        if 0 == 1:
            # Filler: 100 bare global loads of ``u``; never reached.
            u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
        raise RuntimeError
    except RuntimeError:
        pass
    end = True
def between_raise_and_except():
    """Benchmark variant: the filler sits inside ``try``, after the raise.

    The code following ``raise`` is unreachable; only its position relative
    to the ``raise`` differs between the variants, so keep the statement
    order.
    """
    try:
        raise RuntimeError
        if 0 == 1:
            # Filler: 100 bare global loads of ``u``; never reached.
            u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    except RuntimeError:
        pass
    end = True
def in_except():
    """Benchmark variant: the filler sits inside the ``except`` handler.

    The ``if 0 == 1`` body never runs; only its position relative to the
    ``raise`` differs between the variants, so keep the statement order.
    """
    try:
        raise RuntimeError
    except RuntimeError:
        pass
        if 0 == 1:
            # Filler: 100 bare global loads of ``u``; never reached.
            u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True
def after_except():
    """Benchmark variant: the filler sits after the whole ``try``/``except``.

    The ``if 0 == 1`` body never runs; only its position relative to the
    ``raise`` differs between the variants, so keep the statement order.
    """
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    if 0 == 1:
        # Filler: 100 bare global loads of ``u``; never reached.
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True
# Functions to compare, listed in the order the filler moves through the code.
funcs = before_try, between_try_and_raise, between_raise_and_except, in_except, after_except

# Three independent rounds, each printing its own result table.
for _ in range(3):
    # Per-function list of measured seconds per call for this round.
    times = {f: [] for f in funcs}

    def stats(f):
        """Format mean ± stdev (in ns) of the 5 fastest samples for *f*."""
        ts = [t * 1e9 for t in sorted(times[f])[:5]]
        return f'{mean(ts):6.1f} ± {stdev(ts):4.1f} ns '

    # Interleave the functions so each sample set spans the whole round.
    for _ in range(500):
        for f in funcs:
            # timeit returns total seconds for 10**3 calls; divide for per-call.
            t = timeit(f, number=10**3) / 1e3
            times[f].append(t)

    # Rows are printed in definition order rather than sorted by result.
    for f in funcs: # sorted(funcs, key=stats):
        print(stats(f), f.__name__)
    print()

print('Python:', sys.version)
I get similar runtimes (well, a similar pattern) in Python 3.10.12 (Clang 14.0.6) (all ok) and Python 3.11.4 (before_try and between_try_and_raise slow) on an Apple M2.
In 3.11.4:
The main difference that I see in the disassembly of the original before_try function is that "LOAD_GLOBAL" in 3.11 always increases the opcode offset by 12, whereas in 3.10 it always increases it by only 2. (There are some diffs in the exception handling opcodes as well, but I guess they are minor?)
[ Not true: When u is defined as some local const variable, so that LOAD_GLOBAL becomes LOAD_CONST, the functions all seem to run again as fast as in 3.10. So, ]
I'm guessing the change is that the LOAD_GLOBAL implementation changed in 3.11:
LOAD_GLOBAL( namei)
Loads the global named co_names[namei>>1] onto the stack. Changed in version 3.11: If the low bit of namei is set, then a NULL is pushed to the stack before the global variable.
Ha - I guess I've been doing too much Rust and C lately.
I meant just define
u = 1
as a local variable in the before_try function (in order to change the opcode). But I think I messed that up - it doesn't really make a difference in runtime:
def before_try():
    """Variant of ``before_try`` in which ``u`` is a local, not a global.

    Assigning ``u = 1`` first makes every ``u`` in the filler refer to the
    local variable.  The ``if 0 == 1`` body still never runs; keep the
    statement order, since the filler's position ahead of the ``raise`` is
    what is being measured.
    """
    u = 1
    if 0 == 1:
        # Filler: 100 bare loads of the local ``u``; never reached.
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True
Still, something changed with LOAD_GLOBAL. But why that would cause a slowdown stumps me. Is it because the stack frame of the function is larger now?