Source code for pydistinct.ensemble_estimators

from pydistinct.stats_estimators import *
from pydistinct.utils import _compute_birthday_problem_probability


def median_estimator(sequence=None, attributes=None):
    """
    Take the median result from the faster and generally more reliable
    statistical estimators.

    If both ``sequence`` and ``attributes`` are supplied, ``sequence`` takes
    precedence and ``attributes`` is ignored.

    :param sequence: sample sequence of integers
    :type sequence: array of ints
    :param attributes: dictionary with keys as the unique elements and values as
        counts of those elements
    :type attributes: dictionary where keys can be any type, values must be integers
    :return: median value of all estimators; when ``n == d`` the result of
        ``_compute_birthday_problem_probability(d)`` is returned instead
    :rtype: float
    :raises ValueError: if neither ``sequence`` nor ``attributes`` is provided
    """
    if sequence is None and attributes is None:
        # ValueError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working.
        raise ValueError("Must provide a sequence, or a dictionary of attribute counts ")

    if sequence is not None:
        n, d, attribute_counts, frequency_dictionary = precompute_from_seq(sequence)
    else:
        n, d, attribute_counts, frequency_dictionary = precompute_from_attr(attributes)

    # n == d short-circuits to the birthday-problem helper instead of running
    # the estimators (presumably: every sampled element was distinct).
    if n == d:
        return _compute_birthday_problem_probability(d)

    # Precomputed statistics handed to every estimator through ``cache``.
    cache = {"n": n, "d": d, "attr": attribute_counts, "freq": frequency_dictionary}

    def try_wrap(func, **kwargs):
        """Run one estimator, falling back to ``d`` if it raises."""
        try:
            return func(**kwargs)
        except Exception:
            # Best-effort fallback. ``Exception`` (not a bare ``except``) so
            # KeyboardInterrupt / SystemExit still propagate to the caller.
            return d

    estimators = [
        ("chao_estimator", try_wrap(chao_estimator, cache=cache)),
        ("chao_lee_estimator", try_wrap(chao_lee_estimator, cache=cache)),
        ("jackknife_estimator", try_wrap(jackknife_estimator, cache=cache)),
        ("bootstrap_estimator", try_wrap(bootstrap_estimator, cache=cache)),
    ]

    for n_pop in [1000, 100000]:
        # Bind the current multiplier as a default argument so the callable
        # does not depend on the loop variable's later value.
        pop_estimator = lambda x, scale=n_pop: x * scale  # noqa: E731
        estimators.append((
            "horvitz_thompson_estimator_{}".format(n_pop),
            try_wrap(horvitz_thompson_estimator, pop_estimator=pop_estimator, cache=cache),
        ))
        estimators.append((
            "smoothed_jackknife_estimator_{}".format(n_pop),
            try_wrap(smoothed_jackknife_estimator, pop_estimator=pop_estimator, cache=cache),
        ))
        estimators.append((
            "method_of_moments_v2_estimator_{}".format(n_pop),
            try_wrap(method_of_moments_v2_estimator, pop_estimator=pop_estimator, cache=cache),
        ))

    return np.median([estimate for _, estimate in estimators])
def full_median_estimator(sequence=None, attributes=None):
    """
    Take the median result from all statistical estimators.

    10 times slower than the normal median estimator. If both ``sequence`` and
    ``attributes`` are supplied, ``sequence`` takes precedence and
    ``attributes`` is ignored.

    :param sequence: sample sequence of integers
    :type sequence: array of ints
    :param attributes: dictionary with keys as the unique elements and values as
        counts of those elements
    :type attributes: dictionary where keys can be any type, values must be integers
    :return: median value of all estimators; when ``n == d`` the result of
        ``_compute_birthday_problem_probability(d)`` is returned instead
    :rtype: float
    :raises ValueError: if neither ``sequence`` nor ``attributes`` is provided
    """
    if sequence is None and attributes is None:
        # ValueError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working.
        raise ValueError("Must provide a sequence, or a dictionary of attribute counts ")

    if sequence is not None:
        n, d, attribute_counts, frequency_dictionary = precompute_from_seq(sequence)
    else:
        n, d, attribute_counts, frequency_dictionary = precompute_from_attr(attributes)

    # n == d short-circuits to the birthday-problem helper instead of running
    # the estimators (presumably: every sampled element was distinct).
    if n == d:
        return _compute_birthday_problem_probability(d)

    # Precomputed statistics handed to every estimator through ``cache``.
    cache = {"n": n, "d": d, "attr": attribute_counts, "freq": frequency_dictionary}

    def try_wrap(func, **kwargs):
        """Run one estimator, falling back to ``d`` if it raises."""
        try:
            return func(**kwargs)
        except Exception:
            # Best-effort fallback. ``Exception`` (not a bare ``except``) so
            # KeyboardInterrupt / SystemExit still propagate to the caller.
            return d

    estimators = [
        ("chao_estimator", try_wrap(chao_estimator, cache=cache)),
        ("chao_lee_estimator", try_wrap(chao_lee_estimator, cache=cache)),
        ("jackknife_estimator", try_wrap(jackknife_estimator, cache=cache)),
        ("bootstrap_estimator", try_wrap(bootstrap_estimator, cache=cache)),
        ("method_of_moments_estimator", try_wrap(method_of_moments_estimator, cache=cache)),
        ("sichel_estimator", try_wrap(sichel_estimator, cache=cache)),
        ("shlosser_estimator", try_wrap(shlossers_estimator, cache=cache)),
        ("hybrid_estimator", try_wrap(hybrid_estimator, cache=cache)),
    ]

    for n_pop in [1000, 100000]:
        # Bind the current multiplier as a default argument so the callable
        # does not depend on the loop variable's later value.
        pop_estimator = lambda x, scale=n_pop: x * scale  # noqa: E731
        estimators.append((
            "horvitz_thompson_estimator_{}".format(n_pop),
            try_wrap(horvitz_thompson_estimator, pop_estimator=pop_estimator, cache=cache),
        ))
        estimators.append((
            "method_of_moments_v2_estimator_{}".format(n_pop),
            try_wrap(method_of_moments_v2_estimator, pop_estimator=pop_estimator, cache=cache),
        ))
        estimators.append((
            "smoothed_jackknife_estimator_{}".format(n_pop),
            try_wrap(smoothed_jackknife_estimator, pop_estimator=pop_estimator, cache=cache),
        ))
        estimators.append((
            "method_of_moments_v3_estimator_{}".format(n_pop),
            try_wrap(method_of_moments_v3_estimator, pop_estimator=pop_estimator, cache=cache),
        ))

    return np.median([estimate for _, estimate in estimators])