Source code for tfields.lib.stats

#!/usr/bin/env
# encoding: utf-8
"""
Author:     Daniel Boeckenhoff
Mail:       daniel.boeckenhoff@ipp.mpg.de

part of tfields library
"""
import numpy as np
import scipy
import scipy.stats
import logging


[docs]def mode(array, axis=0, bins='auto', range=None): """ generalisation of the scipy.stats.mode function for floats with binning Examples: Forwarding usage: >>> import tfields # NOQA >>> import numpy as np >>> tfields.lib.stats.mode([[2,2,3], [4,5,3]]) array([[2, 2, 3]]) >>> tfields.lib.stats.mode([[2,2,3], [4,5,3]], axis=1) array([[2], [3]]) Float usage: >>> np.random.seed(seed=0) # deterministic random >>> n = np.random.normal(3.1, 2., 1000) >>> assert np.isclose(tfields.lib.stats.mode(n), [ 2.30838613]) >>> assert np.isclose(tfields.lib.stats.mode(n, bins='sturges'), ... [ 2.81321206]) >>> assert np.allclose(tfields.lib.stats.mode(np.array([n, n]), axis=1), ... [[ 2.30838613], ... [ 2.30838613]]) >>> tfields.lib.stats.mode(np.array([n, n]), axis=0).shape (1000, 1) >>> tfields.lib.stats.mode(np.array([n, n]), axis=1).shape (2, 1) >>> assert np.isclose(tfields.lib.stats.mode(np.array([n, n]), ... axis=None), ... [ 2.81321206]) """ array = np.asarray(array) if issubclass(array.dtype.type, np.integer): return scipy.stats.mode(array, axis=axis).mode # hack only works for 2 dimensional arrays if len(array.shape) > 2: raise NotImplementedError("Only dimensions <= 2 allowed") if len(array.shape) == 2: if axis is None: array = array.reshape(array.size) return mode(array, axis=0, bins=bins, range=range) if axis == 0: array = array.T return np.array([mode(a, axis=0, bins=bins, range=range) for a in array]) # only 1 d arrays remaining if not (axis is None or axis == 0): raise NotImplementedError("Axis is not None or 0 but {0}".format(axis)) hist, binEdges = np.histogram(array, bins) maxIndex = hist.argmax(axis=axis) return np.array([0.5 * (binEdges[maxIndex] + binEdges[maxIndex + 1])])
mean = np.mean median = np.median def _chk_asarray(a, axis): """ copied from scipy.stats """ if axis is None: a = np.ravel(a) outaxis = 0 else: a = np.asarray(a) outaxis = axis if a.ndim == 0: a = np.atleast_1d(a) return a, outaxis def _contains_nan(a, nan_policy='propagate'): """ copied from scipy.stats """ policies = ['propagate', 'raise', 'omit'] if nan_policy not in policies: raise ValueError("nan_policy must be one of {%s}" % ', '.join("'%s'" % s for s in policies)) try: # Calling np.sum to avoid creating a huge array into memory # e.g. np.isnan(a).any() with np.errstate(invalid='ignore'): contains_nan = np.isnan(np.sum(a)) except TypeError: # If the check cannot be properly performed we fallback to omitting # nan values and raising a warning. This can happen when attempting to # sum things that are not numbers (e.g. as in the function `mode`). contains_nan = False nan_policy = 'omit' logging.warning("The input array could not be properly checked for nan" " values. nan values will be ignored.") if contains_nan and nan_policy == 'raise': raise ValueError("The input contains nan values") return (contains_nan, nan_policy)
[docs]def moment(a, moment=1, axis=0, weights=None, nan_policy='propagate'): r""" Calculate the nth moment about the mean for a sample. A moment is a specific quantitative measure of the shape of a set of points. It is often used to calculate coefficients of skewness and kurtosis due to its close relationship with them. Parameters ---------- a : array_like data moment : int or array_like of ints, optional order of central moment that is returned. Default is 1. axis : int or None, optional Axis along which the central moment is computed. Default is 0. If None, compute over the whole array `a`. nan_policy : {'propagate', 'raise', 'omit'}, optional Defines how to handle when input contains nan. 'propagate' returns nan, 'raise' throws an error, 'omit' performs the calculations ignoring nan values. Default is 'propagate'. Returns ------- n-th central moment : ndarray or float The appropriate moment along the given axis or over all values if axis is None. The denominator for the moment calculation is the number of observations, no degrees of freedom correction is done. See also -------- kurtosis, skew, describe Notes ----- The k-th weighted central moment of a data sample is: .. math:: m_k = \frac{1}{\sum_{j = 1}^n w_i} \sum_{i = 1}^n w_i (x_i - \bar{x})^k Where n is the number of samples and x-bar is the mean. This function uses exponentiation by squares [1]_ for efficiency. References ---------- .. [1] http://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms Examples -------- >>> from tfields.lib.stats import moment >>> moment([1, 2, 3, 4, 5], moment=0) 1.0 >>> moment([1, 2, 3, 4, 5], moment=1) 0.0 >>> moment([1, 2, 3, 4, 5], moment=2) 2.0 Expansion of the scipy.stats moment function by weights: >>> moment([1, 2, 3, 4, 5], moment=1, weights=[-2, -1, 20, 1, 2]) 0.5 >>> moment([1, 2, 3, 4, 5], moment=2, weights=[5, 4, 3, 2, 1]) 2.0 >>> moment([1, 2, 3, 4, 5], moment=2, weights=[5, 4, 3, 2, 1]) 2.0 >>> assert moment([1, 2, 3, 4, 5], moment=2, ... weights=[0.25, 1, 17.5, 1, 0.25]) == 0.2 >>> moment([1, 2, 3, 4, 5], moment=2, weights=[0, 0, 1, 0, 0]) 0.0 """ a, axis = _chk_asarray(a, axis) contains_nan, nan_policy = _contains_nan(a, nan_policy) if contains_nan and nan_policy == 'omit': a = a.masked_invalid(a) return scipy.mstats_basic.moment(a, moment, axis) if a.size == 0: # empty array, return nan(s) with shape matching `moment` if np.isscalar(moment): return np.nan else: return np.ones(np.asarray(moment).shape, dtype=np.float64) * np.nan # for array_like moment input, return a value for each. if not np.isscalar(moment): mmnt = [_moment(a, i, axis, weights=weights) for i in moment] return np.array(mmnt) else: return _moment(a, moment, axis, weights=weights)
def _moment(a, moment, axis, weights=None): if np.abs(moment - np.round(moment)) > 0: raise ValueError("All moment parameters must be integers") if moment == 0: # When moment equals 0, the result is 1, by definition. shape = list(a.shape) del shape[axis] if shape: # return an actual array of the appropriate shape return np.ones(shape, dtype=float) else: # the input was 1D, so return a scalar instead of a rank-0 array return 1.0 elif weights is None and moment == 1: # By definition the first moment about the mean is 0. shape = list(a.shape) del shape[axis] if shape: # return an actual array of the appropriate shape return np.zeros(shape, dtype=float) else: # the input was 1D, so return a scalar instead of a rank-0 array return np.float64(0.0) else: # Exponentiation by squares: form exponent sequence n_list = [moment] current_n = moment while current_n > 2: if current_n % 2: current_n = (current_n - 1) / 2 else: current_n /= 2 n_list.append(current_n) # Starting point for exponentiation by squares a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis) if n_list[-1] == 1: s = a_zero_mean.copy() else: s = a_zero_mean**2 # Perform multiplications for n in n_list[-2::-1]: s = s**2 if n % 2: s *= a_zero_mean return np.average(s, axis, weights=weights) if __name__ == '__main__': import doctest import tfields # NOQA: F401 doctest.testmod()