# Source code for pydistinct.ensemble_estimators
from pydistinct.stats_estimators import *
from pydistinct.utils import _compute_birthday_problem_probability
def median_estimator(sequence=None, attributes=None):
    """
    Takes the median result from the faster and generally more reliable statistical estimators.

    Either the raw sample or precomputed counts must be supplied. If both are
    given, ``sequence`` is used and ``attributes`` is ignored.

    :param sequence: sample sequence of integers
    :type sequence: array of ints
    :param attributes: dictionary with keys as the unique elements and values as
                       counts of those elements
    :type attributes: dictionary where keys can be any type, values must be integers
    :return: median value of all estimators
    :rtype: float
    :raises ValueError: if neither ``sequence`` nor ``attributes`` is provided
    """
    if sequence is None and attributes is None:
        # ValueError is a subclass of Exception, so callers that already
        # catch Exception keep working.
        raise ValueError("Must provide a sequence, or a dictionary of attribute counts ")

    if sequence is not None:
        n, d, attribute_counts, frequency_dictionary = precompute_from_seq(sequence)
    else:
        n, d, attribute_counts, frequency_dictionary = precompute_from_attr(attributes)

    if n == d:
        # NOTE(review): n == d presumably means no value was repeated in the
        # sample; in that case the estimators are skipped entirely and the
        # birthday-problem helper decides the result - confirm against utils.
        return _compute_birthday_problem_probability(d)

    # Precomputed statistics handed to every estimator through its ``cache`` argument.
    cache = {"n": n, "d": d, "attr": attribute_counts, "freq": frequency_dictionary}

    def try_wrap(func, **kwargs):
        """Run a single estimator, falling back to ``d`` if it raises."""
        try:
            return func(**kwargs)
        except Exception:
            # Best effort: one failing estimator must not break the ensemble.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate to the caller.
            return d

    def scale_by(factor):
        """Build a population estimator that multiplies its argument by ``factor``."""
        # Binding ``factor`` here keeps the callable independent of the loop
        # variable, even if an estimator were to invoke it later.
        return lambda x: x * factor

    estimators = [
        ("chao_estimator", try_wrap(chao_estimator, cache=cache)),
        ("chao_lee_estimator", try_wrap(chao_lee_estimator, cache=cache)),
        ("jackknife_estimator", try_wrap(jackknife_estimator, cache=cache)),
        ("bootstrap_estimator", try_wrap(bootstrap_estimator, cache=cache)),
    ]

    # Estimators that take a population estimator are evaluated once per multiplier.
    for n_pop in (1000, 100000):
        estimators.append(("horvitz_thompson_estimator_{}".format(n_pop),
                           try_wrap(horvitz_thompson_estimator,
                                    pop_estimator=scale_by(n_pop), cache=cache)))
        estimators.append(("smoothed_jackknife_estimator_{}".format(n_pop),
                           try_wrap(smoothed_jackknife_estimator,
                                    pop_estimator=scale_by(n_pop), cache=cache)))
        estimators.append(("method_of_moments_v2_estimator_{}".format(n_pop),
                           try_wrap(method_of_moments_v2_estimator,
                                    pop_estimator=scale_by(n_pop), cache=cache)))

    # The names are kept only for readability; the median is taken over the values.
    return np.median([estimate for _, estimate in estimators])
def full_median_estimator(sequence=None, attributes=None):
    """
    Takes the median result from all statistical estimators.

    Stated by the original author to be about 10 times slower than the
    normal median estimator.

    Either the raw sample or precomputed counts must be supplied. If both are
    given, ``sequence`` is used and ``attributes`` is ignored.

    :param sequence: sample sequence of integers
    :type sequence: array of ints
    :param attributes: dictionary with keys as the unique elements and values as
                       counts of those elements
    :type attributes: dictionary where keys can be any type, values must be integers
    :return: median value of all estimators
    :rtype: float
    :raises ValueError: if neither ``sequence`` nor ``attributes`` is provided
    """
    if sequence is None and attributes is None:
        # ValueError is a subclass of Exception, so callers that already
        # catch Exception keep working.
        raise ValueError("Must provide a sequence, or a dictionary of attribute counts ")

    if sequence is not None:
        n, d, attribute_counts, frequency_dictionary = precompute_from_seq(sequence)
    else:
        n, d, attribute_counts, frequency_dictionary = precompute_from_attr(attributes)

    if n == d:
        # NOTE(review): n == d presumably means no value was repeated in the
        # sample; in that case the estimators are skipped entirely and the
        # birthday-problem helper decides the result - confirm against utils.
        return _compute_birthday_problem_probability(d)

    # Precomputed statistics handed to every estimator through its ``cache`` argument.
    cache = {"n": n, "d": d, "attr": attribute_counts, "freq": frequency_dictionary}

    def try_wrap(func, **kwargs):
        """Run a single estimator, falling back to ``d`` if it raises."""
        try:
            return func(**kwargs)
        except Exception:
            # Best effort: one failing estimator must not break the ensemble.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate to the caller.
            return d

    def scale_by(factor):
        """Build a population estimator that multiplies its argument by ``factor``."""
        # Binding ``factor`` here keeps the callable independent of the loop
        # variable, even if an estimator were to invoke it later.
        return lambda x: x * factor

    estimators = [
        ("chao_estimator", try_wrap(chao_estimator, cache=cache)),
        ("chao_lee_estimator", try_wrap(chao_lee_estimator, cache=cache)),
        ("jackknife_estimator", try_wrap(jackknife_estimator, cache=cache)),
        ("bootstrap_estimator", try_wrap(bootstrap_estimator, cache=cache)),
        ("method_of_moments_estimator",
         try_wrap(method_of_moments_estimator, cache=cache)),
        ("sichel_estimator", try_wrap(sichel_estimator, cache=cache)),
        ("shlosser_estimator", try_wrap(shlossers_estimator, cache=cache)),
        ("hybrid_estimator", try_wrap(hybrid_estimator, cache=cache)),
    ]

    # Estimators that take a population estimator are evaluated once per multiplier.
    for n_pop in (1000, 100000):
        estimators.append(("horvitz_thompson_estimator_{}".format(n_pop),
                           try_wrap(horvitz_thompson_estimator,
                                    pop_estimator=scale_by(n_pop), cache=cache)))
        estimators.append(("method_of_moments_v2_estimator_{}".format(n_pop),
                           try_wrap(method_of_moments_v2_estimator,
                                    pop_estimator=scale_by(n_pop), cache=cache)))
        estimators.append(("smoothed_jackknife_estimator_{}".format(n_pop),
                           try_wrap(smoothed_jackknife_estimator,
                                    pop_estimator=scale_by(n_pop), cache=cache)))
        estimators.append(("method_of_moments_v3_estimator_{}".format(n_pop),
                           try_wrap(method_of_moments_v3_estimator,
                                    pop_estimator=scale_by(n_pop), cache=cache)))

    # The names are kept only for readability; the median is taken over the values.
    return np.median([estimate for _, estimate in estimators])