"""Hash utilities."""
from __future__ import annotations
import logging
from typing import Any, Callable, Literal, Optional
from rwskit.collections_ import recursive_sort
from rwskit.numeric import to_signed
import xxhash
from icontract import require
from msgspec import msgpack
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# The hash widths, in bits, that ``ObjectHasher`` supports.
HashSize = Literal[32, 64, 128]


class ObjectHasher:
    """Hash objects using xxHash.

    Objects are serialized with a ``msgspec`` MessagePack encoder and the
    resulting bytes are hashed, so the only requirement is that the object
    be ``msgpack`` serializable.

    Parameters
    ----------
    hash_size : {32, 64, 128}, default = 128
        The width, in bits, of the hash values to produce.
    signed : bool, default = False
        The value used for the ``signed`` argument of :meth:`hash` and
        :meth:`hash_sorted` when a call does not pass one explicitly.
    """

    # Maps each supported hash width (in bits) to its xxHash constructor.
    _hashers: dict[HashSize, Callable] = {
        32: xxhash.xxh32,
        64: xxhash.xxh3_64,
        128: xxhash.xxh3_128,
    }

    @require(lambda hash_size: hash_size in (32, 64, 128))
    def __init__(self, hash_size: HashSize = 128, signed: bool = False):
        self._hash_size = hash_size
        self._encoder = msgpack.Encoder()
        self._hasher = self._hashers[hash_size]
        self._signed = signed

    @property
    def hash_size(self) -> HashSize:
        """The width, in bits, of the hash values this hasher produces."""
        return self._hash_size

    def _resolve_signed(self, signed: Optional[bool]) -> bool:
        """Return the effective signedness for a single call.

        ``None`` means "use the default given to the constructor". An
        explicit ``True`` or ``False`` always wins, so a caller can request
        an unsigned value from a hasher that is signed by default.
        """
        return self._signed if signed is None else signed

    # The precondition checks the *effective* signedness, so a 128-bit hasher
    # constructed with ``signed=True`` is rejected here as well.
    @require(
        lambda self, signed: (
            not self._resolve_signed(signed) or self._hash_size < 128
        )
    )
    def hash(self, obj: Any, signed: Optional[bool] = None) -> int:
        """
        Hash the object using the hash size specified in the constructor.

        ``xxHash`` returns an unsigned value, but it can be converted to a
        signed value if the hash size is less than 128 bits.

        .. note::
            Only values supported by
            `msgpack <https://jcristharif.com/msgspec/supported-types.html>`__
            can be hashed.

        .. note::
            This can only hash integers up to 64-bits.

        Parameters
        ----------
        obj : Any
            The object to hash
        signed : bool, optional
            Whether to convert the value to a signed integer using
            :meth:`~rwskit.numeric.to_signed`. If ``None`` (the default),
            the ``signed`` value passed to the constructor is used.

        Returns
        -------
        int
            An integer representing the hash of the object.

        Raises
        ------
        OverflowError
            If the data contains an integer that is outside the range
            -2^63 to 2^64-1.
        """
        use_signed = self._resolve_signed(signed)

        data = self._encoder.encode(obj)
        unsigned_value = self._hasher(data).intdigest()

        # NOTE(review): ``to_signed`` is called without an explicit bit
        # width; confirm it treats 32-bit digests as intended.
        return to_signed(unsigned_value) if use_signed else unsigned_value

    # Same precondition as ``hash``; checked here so an invalid request
    # fails before the input is sorted.
    @require(
        lambda self, signed: (
            not self._resolve_signed(signed) or self._hash_size < 128
        )
    )
    def hash_sorted(self, obj: Any, signed: Optional[bool] = None) -> int:
        """Sort a collection using :func:`~rwskit.collections_.recursive_sort` and hash the result.

        This should provide a more robust hash that should return the same
        value for collections containing the same data, but in a different
        order.

        Parameters
        ----------
        obj : Any
            The object to sort and then hash.
        signed : bool, optional
            Whether to return a signed integer. If ``None`` (the default),
            the ``signed`` value passed to the constructor is used.

        Returns
        -------
        int
            An integer representing the hash of the sorted object.
        """
        # Pass the resolved flag on so ``hash`` sees the same value that
        # was validated by the precondition above.
        return self.hash(
            recursive_sort(obj), signed=self._resolve_signed(signed)
        )