Why does unreached code take linear time?!?

It seems to actually be about the location of the raise. Check these out:

def before():
    if 0 == 1:
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True

def after():
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    if 0 == 1:
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True

They do the exact same things, only differ in whether they have the large unreached code block before or after the error. But their runtimes still differ a lot:

 344.4 ±  1.3 ns  after
1773.3 ±  3.5 ns  before

Python: 3.11.4 (main, Jun 24 2023, 10:18:04) [GCC 13.1.1 20230429]
Benchmark code
from timeit import timeit
from time import perf_counter as time
from statistics import mean, stdev
import sys


def before():
    if 0 == 1:
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True


def after():
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    if 0 == 1:
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True


funcs = before, after

for _ in range(3):
    times = {f: [] for f in funcs}
    def stats(f):
        ts = [t * 1e9 for t in sorted(times[f])[:5]]
        return f'{mean(ts):6.1f} ± {stdev(ts):4.1f} ns '
    for _ in range(100):
        for f in funcs:
            t = timeit(f, number=10**4) / 1e4
            times[f].append(t)
    for f in sorted(funcs, key=stats):
        print(stats(f), f.__name__)
    print()

print('Python:', sys.version)

Attempt This Online!

Trying more locations of the unreached code:

1631.8 ± 18.0 ns  before_try
1710.2 ± 29.6 ns  between_try_and_raise
 294.1 ±  0.1 ns  between_raise_and_except
 315.6 ±  0.2 ns  in_except
 311.6 ±  2.2 ns  after_except

So putting it anywhere before the raise makes it slow, and putting it anywhere after the raise makes it fast.

Code
from timeit import timeit
from time import perf_counter as time
from statistics import mean, stdev
import sys


def before_try():
    if 0 == 1:
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True


def between_try_and_raise():
    try:
        if 0 == 1:
            u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
        raise RuntimeError
    except RuntimeError:
        pass
    end = True


def between_raise_and_except():
    try:
        raise RuntimeError
        if 0 == 1:
            u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    except RuntimeError:
        pass
    end = True


def in_except():
    try:
        raise RuntimeError
    except RuntimeError:
        pass
        if 0 == 1:
            u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True


def after_except():
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    if 0 == 1:
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    end = True


funcs = before_try, between_try_and_raise, between_raise_and_except, in_except, after_except

for _ in range(3):
    times = {f: [] for f in funcs}
    def stats(f):
        ts = [t * 1e9 for t in sorted(times[f])[:5]]
        return f'{mean(ts):6.1f} ± {stdev(ts):4.1f} ns '
    for _ in range(500):
        for f in funcs:
            t = timeit(f, number=10**3) / 1e3
            times[f].append(t)
    for f in funcs:  # sorted(funcs, key=stats):
        print(stats(f), f.__name__)
    print()

print('Python:', sys.version)

Attempt This Online!

I get similar runtimes (well, a similar pattern) in Python 3.10.12 (Clang 14.0.6) (all ok) and Python 3.11.4 (before_try and between_try_and_raise slow) on an Apple M2.
In 3.11.4:

 694.7 ±  0.5 ns  before_try
 708.4 ±  0.6 ns  between_try_and_raise
  94.3 ±  0.0 ns  between_raise_and_except
 103.0 ±  0.1 ns  in_except
 102.1 ±  0.2 ns  after_except

But try this variation:

def norwegian_blue():
    u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u


def before_try():
    if 0 == 1:
        norwegian_blue()
        # [u for _ in range(100)]
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True

Now I get:

 115.8 ±  0.3 ns  before_try
 692.9 ±  5.3 ns  between_try_and_raise
  93.9 ±  0.4 ns  between_raise_and_except
 101.8 ±  0.1 ns  in_except
 101.1 ±  0.2 ns  after_except

And when using the commented out line with the list comprehension instead of the dead parrot line, I get a similar result:

 125.1 ±  0.0 ns  before_try
 684.1 ±  0.2 ns  between_try_and_raise
  92.7 ±  0.1 ns  between_raise_and_except
 100.6 ±  0.1 ns  in_except
  99.9 ±  0.1 ns  after_except

The main difference that I see in the disassembly of the original before_try function is that ā€œLOAD_GLOBALā€ in 3.11 always increases the opcode offset with 12 whereas in 3.10 it always increases with only 2. (There are some diffs in the exception handling opcodes as well, but I guess they are minor?).

See: Compiler Explorer

[ Not true: When u is defined as some local const variable, so that LOAD_GLOBAL becomes LOAD_CONST, the functions all seem to run again as fast as in 3.10. So, ]
I’m guessing the change is that the LOAD_GLOBAL implementation changed in 3.11:

LOAD_GLOBAL( namei)
Loads the global named co_names[namei>>1] onto the stack.
Changed in version 3.11: If the low bit of namei is set, then a NULL is pushed to the stack before the global variable.

What’s a ā€œconst variableā€? That sounds like an oxymoron. How do you create that?

Ha - I guess, I’ve been doing too much Rust and C, lately :slight_smile:
I meant just define

u = 1

as local variable in the before_try function (in order to change the opcode). But I think I messed that up - it doesn’t really make a difference in runtime:

def before_try():
    u = 1
    if 0 == 1:
        u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u;u
    try:
        raise RuntimeError
    except RuntimeError:
        pass
    end = True

Still, sth changed with the LOAD_GLOBAL. But why that would cause a slowdown stumps me. Is it because the stack frame of the function is larger now?

Created a GitHub issue now: Exceptions slow in 3.11, depending on location Ā· Issue #109181 Ā· python/cpython Ā· GitHub