ValueError: too many file descriptors in select()
при ~350-400 потоках.
from math import log
from array import array
from struct import unpack
from hashlib import sha512
from time import monotonic_ns
from statistics import median
from functools import lru_cache
class Bits:
    """Fixed-size bit field packed into an array of unsigned 64-bit words.

    ``bits[i]`` reads a single bit as 0 or 1; ``bits[i] = x`` sets the bit
    when ``x`` is truthy and clears it otherwise.
    """

    __slots__ = ('data',)

    def __init__(self, cap):
        """Allocate zero-filled storage of ``cap // 64 + 1`` 64-bit words."""
        # Repeating a one-element array avoids feeding the constructor
        # through a Python-level generator, word by word.
        self.data = array('Q', [0]) * ((cap // 64) + 1)

    def __setitem__(self, i, x):
        """Set bit ``i`` if ``x`` is truthy, otherwise clear it."""
        word, bit = divmod(i, 64)
        if x:
            self.data[word] |= 1 << bit
        else:
            # Previously the value was ignored and the bit was always set,
            # which made it impossible to clear a bit.
            self.data[word] &= ~(1 << bit)

    def __getitem__(self, i):
        """Return bit ``i`` as the integer 0 or 1."""
        word, bit = divmod(i, 64)
        return (self.data[word] >> bit) & 1
class Bloom:
    """Bloom filter keyed on the string form of items, hashed with SHA-512.

    Attributes (declared in ``__slots__``):
        bit_num: number of bit positions that probes are reduced modulo.
        probes:  number of bit positions checked/set per item.
        bits:    the underlying ``Bits`` field.
        stored:  how many ``add`` calls found the item not already present.
    """

    __slots__ = ('bit_num', 'probes', 'bits', 'stored')

    def __init__(self, capacity, error=0.001):
        """Size the filter for ``capacity`` items at false-positive rate ``error``.

        Raises:
            ValueError: if ``capacity`` is not positive or ``error`` is not
                strictly between 0 and 1.
        """
        if capacity <= 0:
            raise ValueError('capacity must be positive')
        if not 0 < error < 1:
            raise ValueError('error must be strictly between 0 and 1')
        # 0.4804 approximates (ln 2)**2 and 0.6931 approximates ln 2.
        # Both results are clamped to at least 1: with zero probes every
        # membership test would trivially succeed, and a zero bit count
        # would make the modulo in _hash divide by zero.
        self.bit_num = max(1, int(-(capacity * log(error)) / 0.4804))
        self.probes = max(1, int(0.6931 * (self.bit_num / capacity)))
        self.bits = Bits(self.bit_num)
        self.stored = 0

    def _hash(self, obj):
        """Yield ``self.probes`` bit positions for ``obj``.

        Non-string objects are converted with ``str()`` first. The SHA-512
        digest is split into eight 64-bit integers; once all eight have been
        used they are reused shifted right by one more bit per full pass.

        This is deliberately NOT wrapped in ``lru_cache``: caching a generator
        function hands back the same, already consumed generator on repeat
        calls, so probes were silently skipped and membership tests ran over
        an empty sequence.
        """
        if not isinstance(obj, str):
            obj = str(obj)
        hashes = unpack('8Q', sha512(obj.encode()).digest())
        count = len(hashes)
        for i in range(self.probes):
            # i // count is the number of completed passes over the digest
            yield (hashes[i % count] >> (i // count)) % self.bit_num

    def add(self, item):
        """Insert ``item``; bump ``stored`` unless it already appears present."""
        # Hash once and reuse the positions for both the check and the update
        positions = tuple(self._hash(item))
        bits = self.bits
        if all(bits[p] for p in positions):
            return
        self.stored += 1
        for p in positions:
            bits[p] = 1

    def __contains__(self, item):
        """Return True if every probed bit for ``item`` is set."""
        return all(self.bits[h] for h in self._hash(item))
if __name__ == '__main__':
    # Benchmark: feed every line of a file to both the Bloom filter and a
    # plain set, timing each insertion, then compare counts, memory and
    # median insertion time.
    b = Bloom(10**6)
    set_for_making_sure = set()
    actual_lines_count = 0
    times_bloom = []
    times_set = []
    # Context manager so the file handle is closed even if an insert raises
    with open("...") as source:
        for line in source:
            start_bloom = monotonic_ns()
            b.add(line)
            times_bloom.append(monotonic_ns() - start_bloom)
            start_set = monotonic_ns()
            set_for_making_sure.add(line)
            times_set.append(monotonic_ns() - start_set)
            actual_lines_count += 1
    print(b.stored, "in the bloom filter")
    print(len(set_for_making_sure), "in the set")
    print(actual_lines_count, "non-unique lines in the file")
    print(b.bits.data.__sizeof__() / 2**20, "megabytes for the bit field")
    print(set_for_making_sure.__sizeof__() / 2**20, "megabytes for the set (only hash table, not actual values)")
    # median() raises StatisticsError on an empty sequence, so skip the
    # timing report when the file had no lines.
    if times_bloom:
        print(median(times_bloom), "nanoseconds to add in the bloom filter")
        print(median(times_set), "nanoseconds to add in the set")
На дерьмовую архитектуру и BS
Можно. Написать нормально
Увы, говнокод на то и говнокод
Вместо того, чтобы брызгать тут слюной, ты мог бы загуглить ровно одно слово "producer-consumer" и уже переписать свой код нормально. Но, видимо, мне ещё много лулзов от тебя прилетит :)