# Source code for ineqpy.grouped.stats

"""Stats' module."""
from .. import utils
from .._statistics import c_moment, mean, std_moment


def variance_hat_group(data=None, variable="x", weights="w", group="h"):
    """Estimate the variance contribution of every stratum.

    For each stratum `h` found in `group`, the sample variance of `variable`
    is scaled by the squared stratum weight total and a finite population
    correction. The per-stratum results can then be combined to obtain the
    variance of an estimator, e.g. the variance of the mean.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Dataframe containing the series needed for the calculation. When it
        is None, a frame is built from `variable`, `weights` and `group`
        through ``utils._to_df``.
    variable : str
        Name of the variable `x` in the DataFrame.
    weights : str
        Name of the weights `w` in the DataFrame.
    group : str
        Name of the stratum variable `h` in the DataFrame.

    Returns
    -------
    vhat_h : pandas.Series
        A series with the values of the variance of each `h` stratum.

    Todo
    ----
    Review improvements.

    Examples
    --------
    Variance of the mean, combining the per-stratum values:

    >>> data = pd.DataFrame(
    ...     {"renta": renta, "peso": peso, "estrato": estrato}
    ... )  # doctest: +SKIP
    >>> v = variance_hat_group(
    ...     data, variable="renta", weights="peso", group="estrato"
    ... )  # doctest: +SKIP
    >>> v_total = v.sum() / peso.sum() ** 2  # doctest: +SKIP
    """
    if data is None:
        data = utils._to_df(x=variable, weights=weights, group=group)
        # From here on the columns are addressed by the keyword names used
        # in the call above.
        variable, weights, group = "x", "weights", "group"

    def _stratum_variance(stratum):
        r"""Return the variance term of a single stratum `h`.

        Parameters
        ----------
        stratum : pandas.DataFrame
            Rows of `data` that belong to one stratum.

        Returns
        -------
        vhat : float
            Value of the variance term for the stratum `h`.

        Notes
        -----
        .. math:: N_h^2 \cdot fpc \cdot \frac{\hat{S}_h^2}{n_h}

        where :math:`fpc = 1 - n_h / N_h`, :math:`n_h` is the number of rows
        of the stratum and :math:`N_h` the sum of its weights.
        """
        values = stratum[variable].copy().values
        weight_total = stratum[weights].sum()
        n_rows = len(stratum)
        correction = 1 - (n_rows / weight_total)
        # A single observation has no degrees of freedom left to remove.
        dof = 0 if n_rows <= 1 else 1
        sample_var = c_moment(variable=values, order=2, ddof=dof)
        return (weight_total ** 2) * correction * sample_var / n_rows

    grouped = data.groupby(group)
    return grouped.apply(_stratum_variance)


def moment_group(data=None, variable="x", weights="w", group="h", order=2):
    """Calculate a scaled standardized moment of each `h` stratum.

    For every stratum the standardized moment of the requested `order` is
    computed with ``std_moment`` and then multiplied by ``Nh ** order`` and a
    finite population correction, and divided by the number of rows of the
    stratum (``Nh`` being the sum of the stratum weights).

    Parameters
    ----------
    data : pd.DataFrame, optional
        Frame holding the columns. When it is None, a frame is built from
        `variable`, `weights` and `group` through ``utils._to_df``.
    variable : array or str
        Column name in `data`, or the values themselves when `data` is None.
    weights : array or str
        Column name in `data`, or the weights themselves when `data` is None.
    group : array or str
        Column name in `data`, or the stratum labels when `data` is None.
    order : int, optional
        Order of the moment. Defaults to 2.

    Returns
    -------
    moment_of_order : pandas.Series
        One value per stratum, indexed by the stratum label.

    TODO
    ----
    Review calculations, it does not appear to be correct.
    Attempt to make a generalization of vhat_group, for any estimator.

    .. warning:: The statistical formula has not been validated; treat the
       results as experimental.
    """
    if data is None:
        data = utils._to_df(x=variable, weights=weights, group=group)
        # From here on the columns are addressed by the keyword names used
        # in the call above.
        variable = "x"
        weights = "weights"
        group = "group"

    def mh(df):
        """Return the scaled standardized moment of one stratum."""
        x = df[variable].copy().values
        # `weights` is a column label: resolve it once to the stratum's
        # weight values and use those both for the total and the moment.
        # (Previously the label itself was handed to the array helpers and
        # their result was reused as a column label.)
        w = df[weights].copy().values
        Nh = w.sum()
        n_h = len(df)
        fpc = 1 - (n_h / Nh)
        # A single observation has no degrees of freedom left to remove.
        ddof = 1 if n_h > 1 else 0
        stdm = std_moment(variable=x, weights=w, order=order, ddof=ddof)
        return (Nh ** order) * fpc * stdm / n_h

    return data.groupby(group).apply(mh)


def quasivariance_hat_group(
    data=None, variable=None, weights=None, group=None
):
    """Calculate quasivariance.

    Sample variance of `variable`, calculated as the second-order central
    moment, evaluated separately for every group.

    Parameters
    ----------
    data : pd.DataFrame, optional
        pd.DataFrame that contains all variables needed.
    variable : array or str
        variable `x` apply the statistic. If `data` is None then must pass this
        argument as array, else as string name in `data`
    weights : array or str, optional
        weights can be interpreted as frequency, probability,
        density function of `x`, each element in `x`. If `data` is None then
        must pass this argument as array, else as string name in `data`.
        When None, no weights are forwarded to ``c_moment``.
    group : array or str
        group is a categorical variable to calculate the statistical by each
        group. If `data` is None then must pass this argument as array, else as
        string name in `data`

    Returns
    -------
    shat2_group : pd.Series
        One value per group, indexed by the group label.

    References
    ----------
    Moment (mathematics). (2017, May 6). In Wikipedia, The Free Encyclopedia.
    Retrieved 14:40, May 15, 2017, from
    https://en.wikipedia.org/w/index.php?title=Moment_(mathematics)&oldid=778996402

    Notes
    -----
    This function is useful to calculate the variance of the mean.

    TODO
    ----
    Review function
    """
    if data is None:
        data = utils._to_df(x=variable, weights=weights)
        variable = "x"
        if weights is not None:
            weights = "weights"
        # `group` is left as given: ``groupby`` accepts an array-like key
        # whose length matches the number of rows.

    def sd(df):
        """Return the second-order central moment of one group."""
        # Work on this group's rows. Previously the column *names* were
        # passed to the statistics helpers and `df` was never read.
        x = df[variable].copy().values
        w = None if weights is None else df[weights].copy().values
        # NOTE(review): the moment is centred on the unweighted mean, as in
        # the original call; confirm whether the weighted mean is intended.
        return c_moment(variable=x, weights=w, param=mean(x))

    return data.groupby(group).apply(sd)