Add recursion guard for `hash()` function like `reprlib.recursive_repr()` to avoid infinite recursion

Creating custom container-like types is a common practice in the real world. Sometimes developers overwrite some magic methods for their special needs. Here is a case to overwrite the __repr__ and __hash__ methods. Both methods will traverse the object recursively which may run into an infinite recursion.

A minimal snippet to demonstrate the use case. There is a self-reference in the container-like object.

from __future__ import annotations

import reprlib
from typing import Iterable, Iterator, Sequence, SupportsIndex, TypeVar, overload


T_co = TypeVar('T_co', covariant=True)


class FrozenList(Sequence[T_co]):
    def __init__(self, data: Iterable[T_co] = (), freeze: bool = False) -> None:
        self.data: list[T_co] = list(data)
        self._freeze = bool(freeze)

    def freeze(self) -> None:
        self._freeze = True

    def clear(self) -> None:
        if self._freeze:
            raise ValueError('The list has been set frozen.')
        self.data.clear()

    def append(self, value: T_co) -> None:
        if self._freeze:
            raise ValueError('The list has been set frozen.')
        self.data.append(value)

    def extend(self, values: Iterable[T_co]) -> None:
        if self._freeze:
            raise ValueError('The list has been set frozen.')
        self.data.extend(values)

    @overload
    def __getitem__(self, item: SupportsIndex) -> T_co:
        ...

    @overload
    def __getitem__(self, item: slice) -> FrozenList[T_co]:
        ...

    def __getitem__(self, item):
        if isinstance(item, slice):
            return FrozenList(self.data[item], freeze=self._freeze)
        return self.data[item]

    @overload
    def __setitem__(self, item: SupportsIndex, value: T_co) -> None:
        ...

    @overload
    def __setitem__(self, item: slice, value: Iterable[T_co]) -> None:
        ...

    def __setitem__(self, item, value):
        if self._freeze:
            raise ValueError('The list has been set frozen.')
        self.data[item] = value

    def __len__(self) -> int:
        return len(self.data)

    def __iter__(self) -> Iterator[T_co]:
        return iter(self.data)

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, FrozenList)
            and self.data == other.data
            and self._freeze == other._freeze
        )

    def __hash__(self) -> int:
        if not self._freeze:
            raise ValueError('The list has not been set frozen.')
        return hash(tuple(self.data))

    @reprlib.recursive_repr()
    def __repr__(self) -> str:
        return f'FrozenList({self.data!r}, freeze={self._freeze})'


if __name__ == '__main__':
    fl = FrozenList()
    fl.append(0)
    fl.append(1)
    print(fl)  # -> FrozenList([0, 1], freeze=False)
    fl.append(fl)  # self-reference
    print(fl)  # -> FrozenList([0, 1, ...], freeze=False)
    fl.freeze()
    print(fl)  # -> FrozenList([0, 1, ...], freeze=True)
    print(hash(fl))  # -> RecursionError

The reprlib.recursive_repr() decorator handles the recursion case for the __repr__ method well.

I wonder if it is worth adding a similar decorator for hash. E.g.:

from _thread import get_ident

def recursive_hash(fillvalue=0):
    'Decorator to make a hash function return fillvalue for a recursive call'

    def decorating_function(user_function):
        hash_running = set()

        def wrapper(self):
            key = id(self), get_ident()
            if key in hash_running:
                return fillvalue
            hash_running.add(key)
            try:
                result = user_function(self)
            finally:
                hash_running.discard(key)
            return result

        # Can't use functools.wraps() here because of bootstrap issues
        wrapper.__module__ = getattr(user_function, '__module__')
        wrapper.__doc__ = getattr(user_function, '__doc__')
        wrapper.__name__ = getattr(user_function, '__name__')
        wrapper.__qualname__ = getattr(user_function, '__qualname__')
        wrapper.__annotations__ = getattr(user_function, '__annotations__', {})
        return wrapper

    return decorating_function

Ref:

2 Likes

Seems reasonable. Weird thought: I know it’s not the best name for a hash decorator, but can’t recursive_repr itself be used here? As far as I can tell, the only change is the default fillvalue, so using @recursive_repr(0) should work. It seems to in your example.

It is a dangerous idea, because you can get different hashes for equal objects. For example, the following recursive lists (only with FrozenList instead of list):

a = []; a.append(a)
b = [[]]; b[0].append(a)

Yes. @recursive_repr(0) would work. The only concern is the function name. We can wrap around the repr.recursive_repr function like:

def recursive_hash(fillvalue: int = 0):
     import reprlib
     return reprlib.recursive_repr(fillvalue)

We cannot compare two objects with self-reference. It will also raise RecursionError. For example:

a = []
b = []
a[:] = [b]
b[:] = [a]

print(a)  # -> [[[...]]]
print(b)  # -> [[[...]]]
a == b    # -> RecursionError
a != b    # -> RecursionError
import copy

a = []
a[:] = [a]
b = copy.deepcopy(a)

print(a)  # -> [[...]]
print(b)  # -> [[...]]
a == b    # -> RecursionError
a != b    # -> RecursionError

Variables a and b exhibit symmetry. In this case, we can calculate the hash for both lists (if we convert it to FrozenList) and it would be the same.

In example:

a = []; a.append(a)
b = [[]]; b[0].append(a)

I don’t think they are equivalent because the topology is different.

a:   [ ]
    /   \  # cycle
    \___/

b:   [ ]
      |
      v
     [ ]
      |
      v
     [ ]
    /   \  # cycle
    \___/

Also, We cannot check the equality for a very deeply nested container. For example:

a = [[[...[[[0]]]...]]]  # beyond the maximum recursion depth
b = [[[...[[[1]]]...]]]  # beyond the maximum recursion depth
a == b  # -> RecursionError

Thanks for the code example - very interesting. Do you have a use case, that isn’t trying to construct an immutable wrapper for an iterable, that contains a reference to itself?

If that’s all you need, this might be a work around for that:

Also, We cannot check the equality for a very deeply nested container.

An iterable way of calculating the hash is preferable to a recursive one. Otherwise, if you know your code will or could include nested containers, more than sys.getrecursionlimit() levels deep, than you can do something about that, e.g. warn the user, or call sys.setrecursionlimit and own the consequences.

Isn’t the Python native recursion limit you’re hitting, a good enough recursion guard as it is?

But a == [a] and b == [[b]].

The use case only happens for user-defined custom types. If they are written in pure Python, they are always mutable. The developers should be aware that if they overwrite the __hash__ method, the object should be immutable. This is usually not true in practice. In common practice, developers only want to overwrite __eq__ then they have to overwrite the __hash__ method as well.

A minimal snippet to raise RecursionError:

import reprlib


class ValueHolder:
    def __init__(self, value: object) -> None:
        self.value = value

    def __eq__(self, other: object) -> bool:
        return isinstance(other, ValueHolder) and self.value == other.value

    def __hash__(self) -> int:
        return hash(self.value)

    @reprlib.recursive_repr()
    def __repr__(self) -> str:
        return f'ValueHolder({self.value!r})'


if __name__ == '__main__':
    holder = ValueHolder(0)
    holder.value = holder
    print(holder)        # -> ValueHolder(...)
    print(hash(holder))  # -> RecursionError

I agree. But in the self-referential case, iterating over a graph with cycles will lead to an infinite loop and cause the program to hang forever.


Yes. But I wonder if we could return a reasonable hash value rather than raise a RecursionError (same as repr(...)). Note that if we accept this proposal, it would be a breaking change that is not backward-compatible.

It should be a == [a] and b == [[a]]. Although both a and b are list of list of list ... with infinite depth. But if we track the id(...) of each level, they are not the same.

a  # list of list of list ...
b  # list of list of list ...
a[0] is a     # -> True
a[0][0] is a  # -> True
b[0] is b     # -> False
b[0][0] is b  # -> False

Note that self == other will return True immediately if self is other. So we have:

a[0][0] is a  # -> True
b[0][0] is a  # -> True
a == b        # -> True without RecursionError

And:

a = []
a[:] = [a]
a == [a]  # -> True without RecursionError

But in the case:

a = []
a[:] = [a]
b = []
b[:] = [b]
a == b  # -> RecursionError

it will always raise RecursionError because they do not share the same object.

It seems strange to me that

a = []
a.append(a)
b == []
b.append(b)
c == []
c.append(a)

leads to

a == a # True
b == b # True
c == a # True
a == b # Recursion Error
a
[[...]]  
c
[[[...]]]  # the recursive repr of c != recursive repr of a

I would have preferred that a == a also generated a recursion error! The fact that it doesn’t not seems merely caused by an implementation detail (using id rather than equality which in this case should either give True - as it does -, since you can sort of “see” that the objects are equal, but which in general should perhaps be undefined - as I’d have preferred)?

In @storchaka’s example a == b evaluates to True. So shouldn’t any hash function – according the normal definition of hash functions – also preserve hash(a) == hash(b) ? I mean, the danger in the proposal seems to me that __hash__ and __eq__ become sort of unmoored from each other.

1 Like

Yes, it should - either that, or it should fail to hash. There are two rules:

  1. Two objects which compare equal must have the same hash
  2. An object’s hash must never change

And if you can’t comply with both, the object should be unhashable.

Don’t forget

from math import nan
nan is nan  # True
nan == nan # False

:slight_smile:

I’m sorry for the misleading here. I revisited the relevant source code of CPython. The argument for is shortcut in __eq__ is true for PyObject_RichCompareBool, not PyObject_RichCompare. The a == b statement calls PyObject_RichCompare.

/* Perform a rich comparison with integer result.  This wraps
   PyObject_RichCompare(), returning -1 for error, 0 for false, 1 for true. */
int
PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
{
    PyObject *res;
    int ok;

    /* Quick result when objects are the same.
       Guarantees that identity implies equality. */
    if (v == w) {
        if (op == Py_EQ)
            return 1;
        else if (op == Py_NE)
            return 0;
    }

    res = PyObject_RichCompare(v, w, op);
    if (res == NULL)
        return -1;
    if (PyBool_Check(res))
        ok = (res == Py_True);
    else
        ok = PyObject_IsTrue(res);
    Py_DECREF(res);
    return ok;
}

Then I found another interesting problem here:

from math import nan

nan is nan      # -> True
nan == nan      # -> False  # calls PyObject_RichCompare
[nan] == [nan]  # -> True   # calls PyObject_RichCompareBool in list.__eq__()
1 Like

+1 for left comparing equality for infinite recursive containers as “undefined”, even if theirs ids are the same. This can be done by removing the shortcut in PyObject_RichCompareBool:

int
PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
{
    ...

    /* Quick result when objects are the same.
       Guarantees that identity implies equality. */
    if (v == w) {
        if (op == Py_EQ)
            return 1;
        else if (op == Py_NE)
            return 0;
    }

    ...
}

After removing this, comparing any self-referenced objects will always raise RecursionError.

a = []
a[:] = [a]
a == a  # RecursionError after removing the `is` shortcut
b = []
b[:] = [b]
a == b  # RecursionError like before

Also, it will fix the case for [nan] == [nan].

from math import nan

nan is nan      # -> True
nan == nan      # -> False
[nan] == [nan]  # -> False after removing the `is` shortcut
1 Like

Yep, that’s because RichCompareBool PyEQ is the “x is y or x == y” check used for containment checks. It’s a subtlety that is a little hard to explain sometimes, but prevents utterly bizarre behaviour like iterating over something and having those things not be in the collection.

1 Like

Thanks for the clarification! I wrote a snippet to demonstrate this for better discussion.

from collections import UserList


class Node:
    def __init__(self, value):
        self.value = value
        self.parent = None

    def __repr__(self) -> str:
        return f'Node({self.value})'

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Node):
            if self.value == other.value:
                if self.parent is not None:
                    self.parent.remove(self)
                    self.parent = None
                return True
            return False
        return False


class MyList(UserList):
    def __init__(self, initlist=None):
        super().__init__(initlist)
        for item in self:
            item.parent = self


if __name__ == '__main__':
    print(Node(1) == Node(1))  # -> True
    n1 = Node(1)
    print(n1 == n1)            # -> True

    ml1 = MyList([Node(1), Node(2), Node(3)])
    ml2 = MyList([Node(1), Node(2), Node(3)])
    print(ml1 == ml2)  # -> False
    print(ml1)         # -> [Node(2), Node(3)]
    print(ml2)         # -> [Node(1), Node(2), Node(3)]

    n1 = Node(1)
    n2 = Node(2)
    n3 = Node(3)
    ml1 = MyList([n1, n2, n3])
    ml2 = MyList([n1, n2, n3])
    print(ml1 == ml2)  # -> True
    print(ml1)         # -> [Node(1), Node(2), Node(3)]
    print(ml2)         # -> [Node(1), Node(2), Node(3)]

    n1 = Node(1)
    n2 = Node(2)
    n3 = Node(3)
    ml1 = MyList([n1, n2, n3])
    ml2 = MyList([n1, Node(2), n3])
    print(ml1 == ml2)  # -> False
    print(ml1)         # -> [Node(1), Node(3)]
    print(ml2)         # -> [Node(1), Node(2), Node(3)]