import math
import warnings
import numpy as np
from pydistinct.utils import _get_attribute_counts, _get_frequency_dictionary, _compute_birthday_problem_probability, \
h_x, _check_dict, memoized_h_x, _check_iterable, precompute_from_attr, precompute_from_seq
from scipy.optimize import broyden1, broyden2
from scipy.stats import chi2
[docs]def goodmans_estimator(sequence=None, attributes=None, cache=None):
"""
Implementation of goodmans estimator from Goodman 1949 : throws an error if N is too high due to
numerical complexity
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, frequency_dictionary = cache["n"], cache["d"], cache["freq"]
elif sequence is not None:
n, d, _, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, _, frequency_dictionary = precompute_from_attr(attributes)
if n == d: # all values are distinct
return _compute_birthday_problem_probability(d)
memo_fact_dict = {}
def memo_fact(x, memo_dict): # memoize factorial function to make it faster
if x not in memo_dict:
memo_dict[x] = np.math.factorial(x)
return memo_dict[x], memo_dict
sum_goodman = 0
N = n * 2
for i in frequency_dictionary.keys():
num_1, memo_fact_dict = memo_fact(N - n + i - 1, memo_fact_dict)
num_2, memo_fact_dict = memo_fact(n - i, memo_fact_dict)
denom_1, memo_fact_dict = memo_fact(N - n - 1, memo_fact_dict)
denom_2, memo_fact_dict = memo_fact(n, memo_fact_dict)
sum_goodman += ((-1) ** (i + 1)) * num_1 * num_2 * frequency_dictionary[i] / (denom_1 * denom_2)
d_goodman = d + sum_goodman
return d_goodman
[docs]def chao_estimator(sequence=None, attributes=None, cache=None):
"""
Implementation of Chao's estimator from Chao 1984, using counts of values that appear exactly once and twice
d_chao = d + (f_1)^2/(2*(f_2))
returns birthday problem solution if there are no sequences observed of frequency 2
(ie each distinct value observed is never seen again)
also makes insane bets (10x) when every point observed is almost unique. could be good or bad
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, frequency_dictionary = cache["n"], cache["d"], cache["freq"]
elif sequence is not None:
n, d, _, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, _, frequency_dictionary = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if 1 not in frequency_dictionary:
return d # d_chao will return d + 0 anyway
f1 = frequency_dictionary[1]
if 2 not in frequency_dictionary:
return _compute_birthday_problem_probability(d)
f2 = frequency_dictionary[2]
d_chao = d + np.square(f1) / (2 * f2)
return d_chao
[docs]def jackknife_estimator(sequence=None, attributes=None, cache=None):
"""
Jackknife scheme for estimating D (Ozsoyoglu et al., 1991)
good at regimes where sample size is close to actual number of points
D^hat_c_j = d_n - (n - l)(d_(n-1)- d_n).
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, frequency_dictionary = cache["n"], cache["d"], cache["freq"]
elif sequence is not None:
n, d, _, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, _, frequency_dictionary = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
sum_d_n_minus_1 = d * n - frequency_dictionary[1]
d_n_minus_1 = sum_d_n_minus_1 / n
d_jackknife = d - (n - 1) * (d_n_minus_1 - d)
return d_jackknife
[docs]def chao_lee_estimator(sequence=None, attributes=None, cache=None):
"""
Implementation of Chao and Lee's estimator (Chao and Lee, 1984) using a natural estimator of coverage
gamma hat is an estimator for the squared coefficient of variation of the frequencies
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, attribute_counts, frequency_dictionary = cache["n"], cache["d"], cache["attr"], cache["freq"]
elif sequence is not None:
n, d, attribute_counts, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, attribute_counts, frequency_dictionary = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if 1 not in frequency_dictionary:
return _compute_birthday_problem_probability(d)
f1 = frequency_dictionary[1]
c_hat = 1 - f1 / n # throws an error if f1 = n (all values observed are unique.)
if c_hat == 0:
return _compute_birthday_problem_probability(d)
gamma_hat_squared = (1 / d) * np.var(list(attribute_counts.values())) / ((n / d) ** 2)
d_cl = (d / c_hat) + ((n * (1 - c_hat) * gamma_hat_squared) / c_hat)
return d_cl
[docs]def shlossers_estimator(sequence=None, attributes=None, pop_estimator=lambda x: x * 2, n_pop=None, cache=None):
"""
Implementation of Shlosser's Estimator (Shlosser 1981) using a Bernoulli Sampling scheme
Note : Hard to determine q (probability of being included)
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:param pop_estimator: function to estimate population size if possible
:type pop_estimator: function that takes in the length of sequence (int) and outputs
the estimated population size (int)
:param n_pop: estimate of population size if available, will be used over pop_estimator function
:type n_pop: int
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, frequency_dictionary = cache["n"], cache["d"], cache["freq"]
elif sequence is not None:
n, d, _, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, _, frequency_dictionary = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if not n_pop:
n_pop = pop_estimator(n)
f1 = frequency_dictionary[1]
q = n / n_pop # placeholder q determination - TODO
numerator = 0
denominator = 0
for i in range(1, n):
if i in frequency_dictionary:
result = frequency_dictionary[i]
else:
result = 0
numerator += ((1 - q) ** i) * result
denominator += (i * q * (1 - q) ** (i - 1)) * result
d_shlosser = d + f1 * numerator / denominator
return d_shlosser
[docs]def sichel_estimator(sequence=None, attributes=None, cache=None):
"""
Implementation of Sichel’s Parametric Estimator (Sichel 1986a, 1986b and 1992)
which uses a zero-truncated generalized inverse Gaussian-Poisson to estimate D
implementation uses broyden 2 to solve and search linear space for good solution
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:return: estimated distinct count
:rtype: float
"""
if not (sequence is not None or attributes or cache):
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, frequency_dictionary = cache["n"], cache["d"], cache["freq"]
elif sequence is not None:
n, d, _, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, _, frequency_dictionary = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
f1 = frequency_dictionary[1]
a = ((2 * n) / d) - np.log(n / f1) # does not depend on g
b = ((2 * f1) / d) + np.log(n / f1) # does not depend on g
def diff_eqn(g):
result = (1 + g) * np.log(g) - a * g + b
return result
d_sichel_set = set()
warnings.filterwarnings('ignore')
for value in np.linspace((f1 / n) + 0.00001, 0.999999, 20):
try:
g = broyden2(diff_eqn, value)
if 1 > g > (f1 / n) and ((n * g) / f1) > 0:
b_hat = g * np.log((n * g) / f1) / (1 - g)
c_hat = (1 - g ** 2) / (n * (g ** 2))
d_sichel = 2 / (b_hat * c_hat)
d_sichel_set.add(d_sichel)
except Exception as e:
continue
warnings.resetwarnings()
if not d_sichel_set:
return d
else:
return min(d_sichel_set)
[docs]def method_of_moments_estimator(sequence=None, attributes=None, cache=None):
"""
Simple Method-of-Moments Estimator to estimate D (Haas et al, 1995)
can be optimised (training rate, stopping value)
d = d_moments(l - e^(-n))/d_moments)
solve for d_moments in d = d_moments(l - e^(-n))/d_moments)
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, frequency_dictionary = cache["n"], cache["d"], cache["freq"]
elif sequence is not None:
n, d, _, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, _, frequency_dictionary = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
def diff_eqn(D):
return D * (1 - np.exp((-n / D))) - d
warnings.filterwarnings('ignore')
try:
d_moments_1 = float(broyden1(diff_eqn, d))
except Exception as e:
print(e)
d_moments_1 = 1000000000000000000000000
try:
d_moments_2 = float(broyden2(diff_eqn, d))
except:
d_moments_2 = 1000000000000000000000000
warnings.resetwarnings()
result = min(d_moments_1, d_moments_2)
if result == 1000000000000000000000000:
return d
else:
return result
[docs]def bootstrap_estimator(sequence=None, attributes=None, cache=None):
"""
Implementation of a bootstrap estimator to estimate D (Smith and Van Bell 1984; Haas et al, 1995)
DBoot = d + sigma (1-nj/n)^n
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, attribute_counts = cache["n"], cache["d"], cache["attr"]
elif sequence is not None:
n, d, attribute_counts, _ = precompute_from_seq(sequence)
else:
n, d, attribute_counts, _ = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
bootstrap_sum = 0
for j, n_j in attribute_counts.items():
final_val = (1 - n_j / n) ** n
bootstrap_sum += final_val
d_bootstrap = d + bootstrap_sum
return d_bootstrap
[docs]def horvitz_thompson_estimator(sequence=None, attributes=None, pop_estimator=lambda x: x * 10000000, n_pop=None,
cache=None):
"""
Implementation of the Horvitz-Thompson Estimator to estimate D
(Sarndal, Swensson, and Wretman 1992; Haas et al, 1995)
n_j = attribute count of value j
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:param pop_estimator: function to estimate population size if possible
:type pop_estimator: function that takes in the length of sequence (int) and outputs
the estimated population size (int)
:param n_pop: estimate of population size if available, will be used over pop_estimator function
:type n_pop: int
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, attribute_counts = cache["n"], cache["d"], cache["attr"]
elif sequence is not None:
n, d, attribute_counts, _ = precompute_from_seq(sequence)
else:
n, d, attribute_counts, _ = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if not n_pop:
n_pop = pop_estimator(n)
d_horvitz_thompson = 0
memo_dict_instance = {}
for j, n_j in attribute_counts.items():
n_j_hat = (n_j * n_pop) / n
h_n_j_hat, memo_dict_instance = memoized_h_x(n_j_hat, n, n_pop, memo_dict_instance)
d_horvitz_thompson += (1 / (1 - h_n_j_hat))
return d_horvitz_thompson
[docs]def method_of_moments_v2_estimator(sequence=None, attributes=None, pop_estimator=lambda x: x * 1000000, n_pop=None,
cache=None):
"""
Method-of-Moments Estimator with equal frequency assumption while still sampling
from a finite relation (Haas et al, 1995)
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:param pop_estimator: function to estimate population size if possible
:type pop_estimator: function that takes in the length of sequence (int) and outputs
the estimated population size (int)
:param n_pop: estimate of population size if available, will be used over pop_estimator function
:type n_pop: int
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, attribute_counts = cache["n"], cache["d"], cache["attr"]
elif sequence is not None:
n, d, attribute_counts, _ = precompute_from_seq(sequence)
else:
n, d, attribute_counts, _ = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if not n_pop:
n_pop = pop_estimator(n)
# need to implement gamma memoized function and h_x again here to enable use of global variables in broydens
memo_dict = {}
def memoized_gamma(x):
"""
:param x:value to evaluate gamma function at
:type x: float
:param memo_dict: memoized dictionary for precomputed gamma values
:type memo_dict: dict
:return: value of gamma function evaluated at x, and memoized results
:rtype: int, dict
"""
x = int(x)
if x in memo_dict:
return memo_dict[x]
else:
result = math.lgamma(x)
memo_dict[x] = result
return result
def h_x(x, n, n_pop):
"""
:param x: h function evaluated at point x
:type x: int
:param n: length of sequence seen
:type n: int
:param n_pop: estimate of total number of tuples in Relation
:type n_pop: int
:return: value of h function evaluated at x
:rtype: float
"""
gamma_num_1 = memoized_gamma(n_pop - x + 1)
gamma_num_2 = memoized_gamma(n_pop - n + 1)
gamma_denom_1 = memoized_gamma(n_pop - x - n + 1)
gamma_denom_2 = memoized_gamma(n_pop + 1)
result = np.exp(gamma_num_1 + gamma_num_2 - gamma_denom_1 - gamma_denom_2)
return result
def diff_eqn(D):
return D * (1 - h_x((n_pop / D), n, n_pop)) - d
warnings.filterwarnings('ignore')
try:
d_moments_1 = float(broyden1(diff_eqn, d))
except Exception as e:
d_moments_1 = 1000000000000000000000000
try:
d_moments_2 = float(broyden2(diff_eqn, d))
except:
d_moments_2 = 1000000000000000000000000
warnings.resetwarnings()
result = min(d_moments_1, d_moments_2)
if result == 1000000000000000000000000:
return d
else:
return result
[docs]def method_of_moments_v3_estimator(sequence=None, attributes=None, pop_estimator=lambda x: x * 10000000, n_pop=None,
cache=None):
"""
Method-of-Moments Estimator without equal frequency assumption (Haas et al, 1995)
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:param pop_estimator: function to estimate population size if possible
:type pop_estimator: function that takes in the length of sequence (int) and outputs
the estimated population size (int)
:param n_pop: estimate of population size if available, will be used over pop_estimator function
:type n_pop: int
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, attribute_counts = cache["n"], cache["d"], cache["attr"]
elif sequence is not None:
n, d, attribute_counts, _ = precompute_from_seq(sequence)
else:
n, d, attribute_counts, _ = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if not n_pop:
n_pop = pop_estimator(n)
gamma_hat_squared = (1 / d) * np.var(list(attribute_counts.values())) / ((n_pop / d) ** 2)
def compute_g_n(_x, _n, _n_pop):
"""
See implementation paper for details
:param _x: g function evaluated at point x
:type _x: int
:param _n: length of sequence seen
:type _n: int
:param _n_pop: estimate of total number of tuples in relation
:type _n_pop: int
:return: value of g function evaluated at x
:rtype: float
"""
return_sum = 0
for k in range(_n):
return_sum += 1 / (_n_pop - _x - _n + k)
return return_sum
d_mm_1 = method_of_moments_v2_estimator(attributes=attribute_counts)
n_pop_tilda = n_pop / d_mm_1
d_mm_2_huge_term = 0.5 * (n_pop_tilda ** 2) * gamma_hat_squared * d_mm_1 * h_x(n_pop_tilda, n, n_pop) * (
compute_g_n(n_pop_tilda, n, n_pop) - compute_g_n(n_pop_tilda, n, n_pop) ** 2)
d_moments_v3 = d * (1 - h_x(n_pop_tilda, n, n_pop) + d_mm_2_huge_term) ** -1
return d_moments_v3
[docs]def smoothed_jackknife_estimator(sequence=None, attributes=None, pop_estimator=lambda x: x * 10000000, n_pop=None,
cache=None):
"""
Jackknife scheme for estimating D that accounts for true bias structures (Haas et al, 1995)
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: attribute count -> dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:param pop_estimator: function to estimate population size if possible
:type pop_estimator: function that takes in the length of sequence (int) and outputs
the estimated population size (int)
:param n_pop: estimate of population size if available, will be used over pop_estimator function
:type n_pop: int
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, attribute_counts, frequency_dictionary = cache["n"], cache["d"], cache["attr"], cache["freq"]
elif sequence is not None:
n, d, attribute_counts, frequency_dictionary = precompute_from_seq(sequence)
else:
n, d, attribute_counts, frequency_dictionary = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if not n_pop:
n_pop = pop_estimator(n)
gamma_hat_squared = (1 / d) * np.var(list(attribute_counts.values())) / ((n_pop / d) ** 2)
def compute_g_n_minus_one(x, n, n_pop):
return_sum = 0
for k in range(n - 1):
return_sum += 1 / (n_pop - x - n + k)
return return_sum
if 1 not in frequency_dictionary:
return d
d_n = d
d_sjk_zero_hat = (d_n - frequency_dictionary[1] / n) * (
1 - (n_pop - n + 1) * frequency_dictionary[1] / (n * n_pop)) ** -1
n_pop_tilda = n_pop / d_sjk_zero_hat
d_sjk = (1 - (n_pop - n_pop_tilda - n + 1) * frequency_dictionary[1] / (n * n_pop)) ** -1 * (
d_n + n_pop * h_x(n_pop_tilda, n, n_pop) * compute_g_n_minus_one(n_pop_tilda, n,
n_pop) * gamma_hat_squared * d_sjk_zero_hat)
return d_sjk
[docs]def hybrid_estimator(sequence=None, attributes=None, pop_estimator=lambda x: x * 10000000, n_pop=None, cache=None):
"""
hybrid_estimator : Hybrid Estimator that uses Shlosser's estimator when data is skewed and Smooth jackknife
estimator when data is not. Skew is computed by using an approximate chi square test for uniformity
:param sequence: sample sequence of integers
:type sequence: array of ints
:param attributes: dictionary with keys as the unique elements and values as
counts of those elements
:type attributes: dictionary where keys can be any type, values must be integers
:param cache: argument used by median methods to avoid recomputation of variables
:type cache: dictionary with 4 elements
{"n":no_elements,"d":no_unique_elements,"attr": attribute_counts,
"freq":frequency_dictionary}
:param pop_estimator: function to estimate population size if possible
:type pop_estimator: function that takes in the length of sequence (int) and outputs
the estimated population size (int)
:param n_pop: estimate of population size if available, will be used over pop_estimator function
:type n_pop: int
:return: estimated distinct count
:rtype: float
"""
if sequence is None and attributes is None and cache is None:
raise Exception("Must provide a sequence, or a dictionary of attribute counts ")
if cache is not None:
n, d, attribute_counts = cache["n"], cache["d"], cache["attr"]
elif sequence is not None:
n, d, attribute_counts, _ = precompute_from_seq(sequence)
else:
n, d, attribute_counts, _ = precompute_from_attr(attributes)
if n == d:
return _compute_birthday_problem_probability(d)
if not n_pop:
n_pop = pop_estimator(n)
n_bar = n / d
mu = sum((((i - n_bar) ** 2) / n_bar) for i in attribute_counts.values())
chi_critical = chi2.isf(0.975, n - 1, loc=n_bar, scale=n_bar) # set alpha is 0.975
if mu <= chi_critical:
return smoothed_jackknife_estimator(attributes=attribute_counts, pop_estimator=pop_estimator, n_pop=n_pop)
else:
return shlossers_estimator(attributes=attribute_counts, n_pop=n_pop)