Source code for ineqpy._statistics

"""Low level desciptive statistics.

References
----------
1. http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
2. https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
   #Weighted_sample_variance
3. https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
   #Weighted_incremental_algorithm
"""

import numpy as np
from numba import guvectorize

from . import utils


def c_moment(variable=None, weights=None, order=2, param=None, ddof=0):
    """Calculate the central moment.

    Calculate the central moment of `variable` with respect to `param` of
    order `order`, given the weights `weights`.

    Parameters
    ----------
    variable : 1d-array
        Variable
    weights : 1d-array
        Weights
    order : int, optional
        Moment order, 2 by default (variance)
    param : int or array, optional
        Parameter for which the moment is calculated. The default is None,
        which implies using the mean.
    ddof : int, optional
        Degrees of freedom, zero by default.

    Returns
    -------
    central_moment : float

    Notes
    -----
    - The central moment of order 1 is 0.
    - The central moment of order 2 is the variance.

    Source : https://en.wikipedia.org/wiki/Moment_(mathematics)

    Todo
    ----
    Implement : https://en.wikipedia.org/wiki/L-moment#cite_note-wang:96-6

    """
    # return np.sum((x - c) ** order * counts) / np.sum(counts)
    variable = variable.copy()
    weights = utils.not_empty_weights(weights, like=variable)

    if param is None:
        param = mean(variable=variable, weights=weights)
    elif not isinstance(param, (np.ndarray, int, float)):
        raise NotImplementedError

    return np.sum((variable - param) ** order * weights) / (
        np.sum(weights) - ddof
    )


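# Illustrative sketch, not part of ineqpy's API: `_example_c_moment` is a
# hypothetical helper showing `c_moment` on a made-up sample. The first
# central moment is ~0; the second is the weighted population variance.
def _example_c_moment():
    x = np.array([1.0, 2.0, 3.0, 4.0])
    w = np.array([1.0, 1.0, 2.0, 1.0])
    first = c_moment(variable=x, weights=w, order=1)   # approximately 0
    second = c_moment(variable=x, weights=w, order=2)  # weighted variance
    return first, second

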
def percentile(
    variable, weights, percentile=50, interpolation="lower"
) -> float:
    """Calculate the weighted percentile.

    Parameters
    ----------
    variable : str or array
    weights : str or array
    percentile : int or list
        Percentile level; passing 50 returns the median.
    interpolation : {'lower', 'higher', 'midpoint'}, optional
        Select the interpolation method.

    Returns
    -------
    percentile : float

    """
    sorted_idx = np.argsort(variable)
    cum_weights = np.cumsum(weights[sorted_idx])
    lower_percentile_idx = np.searchsorted(
        cum_weights, (percentile / 100.0) * cum_weights[-1]
    )

    if interpolation == "midpoint":
        res = np.interp(
            lower_percentile_idx + 0.5,
            np.arange(len(variable)),
            variable[sorted_idx],
        )
    elif interpolation == "lower":
        res = variable[sorted_idx[lower_percentile_idx]]
    elif interpolation == "higher":
        res = variable[sorted_idx[lower_percentile_idx + 1]]
    else:
        raise NotImplementedError

    return float(res)


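# Illustrative sketch, not part of ineqpy's API: `_example_percentile` is a
# hypothetical helper showing that the weighted median moves toward the
# observations that carry more weight. Sample values are made up.
def _example_percentile():
    x = np.array([10.0, 20.0, 30.0, 40.0])
    unweighted = percentile(x, np.ones(4))                     # 20.0 ('lower')
    weighted = percentile(x, np.array([1.0, 1.0, 1.0, 5.0]))   # 40.0
    return unweighted, weighted

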
def std_moment(variable=None, weights=None, param=None, order=3, ddof=0):
    """Calculate the standardized moment.

    Calculate the standardized moment of order `order` for the variable
    `variable` with respect to `param`.

    Parameters
    ----------
    variable : 1d-array
        Random variable
    weights : 1d-array, optional
        Weights or probability
    order : int, optional
        Order of the moment, three by default
    param : int or float or array, optional
        Central tendency, default is the mean.
    ddof : int, optional
        Degrees of freedom.

    Returns
    -------
    std_moment : float
        Returns the standardized moment of order `order`.

    References
    ----------
    - https://en.wikipedia.org/wiki/Moment_(mathematics)#Significance_of_the_moments
    - https://en.wikipedia.org/wiki/Standardized_moment

    Todo
    ----
    It is the general case of the raw and central moments. Review
    implementation.

    """
    if param is None:
        param = mean(variable=variable, weights=weights)

    res = c_moment(
        variable=variable, weights=weights, order=order, param=param, ddof=ddof
    )
    res /= var(variable=variable, weights=weights, ddof=ddof) ** (order / 2)
    return res


def mean(variable=None, weights=None):
    """Calculate the mean of `variable` given `weights`.

    Parameters
    ----------
    variable : array-like or str
        Variable on which the mean is estimated.
    weights : array-like or str
        Weights of `variable`.

    Returns
    -------
    mean : array-like or float

    """
    # If a DataFrame is passed, separate the variables.
    variable = variable.copy()
    weights = utils.not_empty_weights(weights, like=variable)
    variable, weights = utils._clean_nans_values(variable, weights)
    return np.average(a=variable, weights=weights, axis=0)


def var(variable=None, weights=None, ddof=0):
    """Calculate the population variance of `variable` given `weights`.

    Parameters
    ----------
    variable : 1d-array or pd.Series or pd.DataFrame
        Variable on which the quasivariance is estimated
    weights : 1d-array or pd.Series or pd.DataFrame
        Weights of `variable`.

    Returns
    -------
    variance : 1d-array or pd.Series or float
        Estimation of the quasivariance of `variable`.

    References
    ----------
    Moment (mathematics). (2017, May 6). In Wikipedia, The Free Encyclopedia.
    Retrieved 14:40, May 15, 2017, from
    https://en.wikipedia.org/w/index.php?title=Moment_(mathematics)

    Notes
    -----
    For a stratified sample, apply this function per stratum (e.g. with a
    groupby).

    """
    return c_moment(variable=variable, weights=weights, order=2, ddof=ddof)


def coef_variation(variable=None, weights=None):
    """Calculate the coefficient of variation.

    Calculate the coefficient of variation of `variable` given `weights`.
    The coefficient of variation is the square root of the variance of the
    incomes divided by the mean income. It has the advantages of being
    mathematically tractable and subgroup decomposable, but it is not
    bounded from above.

    Parameters
    ----------
    variable : array-like or str
    weights : array-like or str

    Returns
    -------
    coefficient_variation : float

    References
    ----------
    Coefficient of variation. (2017, May 5). In Wikipedia, The Free
    Encyclopedia. Retrieved 15:03, May 15, 2017, from
    https://en.wikipedia.org/w/index.php?title=Coefficient_of_variation

    """
    # todo complete docstring
    return var(variable=variable, weights=weights) ** 0.5 / abs(
        mean(variable=variable, weights=weights)
    )


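# Illustrative sketch, not part of ineqpy's API: `_example_coef_variation` is
# a hypothetical helper making the formula explicit: the coefficient of
# variation equals sqrt(var) / |mean| for the same weighted sample.
def _example_coef_variation():
    x = np.array([10.0, 12.0, 15.0, 20.0])
    w = np.array([2.0, 1.0, 1.0, 1.0])
    cv = coef_variation(variable=x, weights=w)
    manual = var(x, w) ** 0.5 / abs(mean(x, w))  # same value
    return cv, manual

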
def kurt(variable=None, weights=None):
    """Calculate the kurtosis coefficient.

    Parameters
    ----------
    variable : 1d-array
    weights : 1d-array

    Returns
    -------
    kurt : float
        Kurtosis coefficient.

    References
    ----------
    Moment (mathematics). (2017, May 6). In Wikipedia, The Free Encyclopedia.
    Retrieved 14:40, May 15, 2017, from
    https://en.wikipedia.org/w/index.php?title=Moment_(mathematics)

    Notes
    -----
    It is an alias of the standardized fourth-order moment.

    """
    return std_moment(variable=variable, weights=weights, order=4)


def skew(variable=None, weights=None):
    """Return the asymmetry coefficient of a sample.

    Parameters
    ----------
    variable : array-like, str
    weights : array-like, str

    Returns
    -------
    skew : float

    References
    ----------
    Moment (mathematics). (2017, May 6). In Wikipedia, The Free Encyclopedia.
    Retrieved 14:40, May 15, 2017, from
    https://en.wikipedia.org/w/index.php?title=Moment_(mathematics)

    Notes
    -----
    It is an alias of the standardized third-order moment.

    """
    return std_moment(variable=variable, weights=weights, order=3)


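# Illustrative sketch, not part of ineqpy's API: `_example_shape_moments` is
# a hypothetical helper showing that `skew` and `kurt` are thin aliases of
# the standardized third- and fourth-order moments. Sample values are made up.
def _example_shape_moments():
    x = np.array([1.0, 2.0, 2.0, 3.0, 9.0])
    w = np.array([1.0, 2.0, 1.0, 1.0, 1.0])
    # skew(x, w) equals std_moment(x, w, order=3); kurt(x, w) uses order=4.
    return skew(x, w), kurt(x, w)

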
@guvectorize(
    "float64[:], float64[:], int64, float64[:]",
    "(n),(n),()->()",
    nopython=True,
    cache=True,
)
def wvar(x, w, kind, out):
    """Calculate the weighted variance of `x`.

    Calculate the weighted variance of `x` according to the kind of weights.

    Parameters
    ----------
    x : np.ndarray
        Main variable.
    w : np.ndarray
        Weights.
    kind : int
        Controls which variance is calculated:

        * 1 : population variance
        * 2 : sample frequency variance
        * 3 : sample reliability variance.
    out : np.ndarray

    Returns
    -------
    weighted_variance : float

    References
    ----------
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Weighted_incremental_algorithm

    """
    wSum = wSum2 = mean = S = 0
    for i in range(len(x)):  # Alternatively "for x, w in zip(data, weights):"
        wSum = wSum + w[i]
        wSum2 = wSum2 + w[i] * w[i]
        meanOld = mean
        mean = meanOld + (w[i] / wSum) * (x[i] - meanOld)
        S = S + w[i] * (x[i] - meanOld) * (x[i] - mean)

    if kind == 1:
        # population_variance
        out[0] = S / wSum
    elif kind == 2:
        # Bessel's correction for weighted samples
        # Frequency weights
        # sample_frequency_variance
        out[0] = S / (wSum - 1)
    elif kind == 3:
        # Reliability weights
        # sample_reliability_variance
        out[0] = S / (wSum - wSum2 / wSum)


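# Illustrative sketch, not part of ineqpy's API: `_example_wvar` is a
# hypothetical helper exercising the three `kind` modes. With kind=1 the
# result matches the two-pass estimate `var(x, w)` (population variance).
def _example_wvar():
    x = np.array([2.0, 3.0, 5.0, 7.0])
    w = np.array([1.0, 2.0, 1.0, 1.0])
    population = wvar(x, w, 1)   # == var(x, w) up to rounding
    frequency = wvar(x, w, 2)    # divides by (sum(w) - 1)
    reliability = wvar(x, w, 3)  # divides by (sum(w) - sum(w**2) / sum(w))
    return population, frequency, reliability

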
@guvectorize(
    "float64[:], float64[:], float64[:], int64, float64[:]",
    "(n),(n),(n),()->()",
    nopython=True,
    cache=True,
)
def wcov(x, y, w, kind, out):
    """Compute the weighted covariance between `x` and `y`.

    Compute the weighted covariance between two variables, choosing which
    kind of covariance is returned.

    Parameters
    ----------
    x : np.array
        Main variable.
    y : np.array
        Second variable.
    w : np.array
        Weights.
    kind : int
        Kind of weighted covariance that is returned:

        * 1 : population covariance
        * 2 : sample frequency covariance
        * 3 : sample reliability covariance.
    out : np.array

    Returns
    -------
    weighted_covariance : float

    References
    ----------
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online

    """
    meanx = meany = 0
    wsum = wsum2 = 0
    C = 0
    for i in range(len(x)):
        wsum += w[i]
        wsum2 += w[i] * w[i]
        dx = x[i] - meanx
        meanx += (w[i] / wsum) * dx
        meany += (w[i] / wsum) * (y[i] - meany)
        C += w[i] * dx * (y[i] - meany)

    if kind == 1:
        # population_covar
        out[0] = C / wsum
    elif kind == 2:
        # sample_frequency_covar (same correction as in `wvar`)
        out[0] = C / (wsum - 1)
    elif kind == 3:
        # sample_reliability_covar (same correction as in `wvar`)
        out[0] = C / (wsum - wsum2 / wsum)


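# Illustrative sketch, not part of ineqpy's API: `_example_wcov` is a
# hypothetical helper checking the population covariance (kind=1) against
# the direct two-pass formula sum(w * (x - mx) * (y - my)) / sum(w).
def _example_wcov():
    x = np.array([1.0, 2.0, 3.0, 4.0])
    y = np.array([2.0, 1.0, 4.0, 3.0])
    w = np.array([1.0, 1.0, 2.0, 1.0])
    mx = np.average(x, weights=w)
    my = np.average(y, weights=w)
    direct = np.sum(w * (x - mx) * (y - my)) / np.sum(w)
    return wcov(x, y, w, 1), direct  # should agree up to rounding

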
@guvectorize(
    "float64[:], float64[:], float64[:]",
    "(n),(n)->()",
    nopython=True,
    cache=True,
)
def online_kurtosis(x, w, out):
    """Calculate the weighted excess kurtosis of `x` in a single pass.

    Accumulates the weighted central moments M2, M3 and M4 incrementally and
    returns ``n * M4 / M2 ** 2 - 3``.
    """
    n = 0
    mean = 0
    M2 = 0
    M3 = 0
    M4 = 0

    for i in range(len(x)):
        n1 = w[i]
        n = n + w[i]
        delta = x[i] - mean
        delta_n = delta / n
        delta_n2 = delta_n * delta_n
        term1 = delta * delta_n * n1
        mean = mean + w[i] * delta_n / n
        M4 = (
            M4
            + term1 * delta_n2 * (n * n - 3 * n + 3)
            + 6 * delta_n2 * M2
            - 4 * delta_n * M3
        )
        M3 = M3 + term1 * delta_n * (n - 2) - 3 * delta_n * M2
        M2 = M2 + term1

    out[0] = (n * M4) / (M2 * M2) - 3


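# Illustrative sketch, not part of ineqpy's API: `_example_online_kurtosis`
# is a hypothetical helper showing a plain call; the function consumes the
# whole sample in one pass and returns a weighted excess-kurtosis estimate.
def _example_online_kurtosis():
    x = np.array([1.0, 2.0, 2.0, 3.0, 9.0])
    w = np.array([1.0, 1.0, 2.0, 1.0, 1.0])
    return online_kurtosis(x, w)

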
@guvectorize(
    "float64[:], float64[:], int64, float64[:]",
    "(n),(n),()->()",
    nopython=True,
    cache=True,
)
def Mk(x, w, k, out):
    """Calculate the weighted raw moment of order `k`.

    Returns ``sum(w * x ** k) / sum(w)``.
    """
    w_sum = wx_sum = 0
    for i in range(len(x)):
        wx_sum += w[i] * (x[i] ** k)
        w_sum += w[i]
    out[0] = wx_sum / w_sum


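# Illustrative sketch, not part of ineqpy's API: `_example_Mk` is a
# hypothetical helper showing that the first raw moment (k=1) is just the
# weighted mean, while k=2 gives the weighted mean of the squares.
def _example_Mk():
    x = np.array([1.0, 2.0, 3.0, 4.0])
    w = np.array([1.0, 1.0, 2.0, 1.0])
    first_raw = Mk(x, w, 1)   # == np.average(x, weights=w)
    second_raw = Mk(x, w, 2)  # == np.average(x ** 2, weights=w)
    return first_raw, second_raw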