Source code for astroML.density_estimation.histtools

"""
Tools for working with distributions
"""
import numpy as np
from scipy.special import gammaln

from astropy import stats as astropy_stats

from astroML.utils import deprecated
from astroML.utils.exceptions import AstroMLDeprecationWarning


[docs]@deprecated('0.4', alternative='astropy.stats.scott_bin_width', warning_type=AstroMLDeprecationWarning) def scotts_bin_width(data, return_bins=False): r"""Return the optimal histogram bin width using Scott's rule: Parameters ---------- data : array-like, ndim=1 observed (one-dimensional) data return_bins : bool (optional) if True, then return the bin edges Returns ------- width : float optimal bin width using Scott's rule bins : ndarray bin edges: returned if `return_bins` is True Notes ----- The optimal bin width is .. math:: \Delta_b = \frac{3.5\sigma}{n^{1/3}} where :math:`\sigma` is the standard deviation of the data, and :math:`n` is the number of data points. See Also -------- knuth_bin_width freedman_bin_width astroML.plotting.hist """ return astropy_stats.scott_bin_width(data, return_bins)
[docs]@deprecated('0.4', alternative='astropy.stats.freedman_bin_width', warning_type=AstroMLDeprecationWarning) def freedman_bin_width(data, return_bins=False): r"""Return the optimal histogram bin width using the Freedman-Diaconis rule Parameters ---------- data : array-like, ndim=1 observed (one-dimensional) data return_bins : bool (optional) if True, then return the bin edges Returns ------- width : float optimal bin width using Scott's rule bins : ndarray bin edges: returned if `return_bins` is True Notes ----- The optimal bin width is .. math:: \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}} where :math:`q_{N}` is the :math:`N` percent quartile of the data, and :math:`n` is the number of data points. See Also -------- knuth_bin_width scotts_bin_width astroML.plotting.hist """ return astropy_stats.freedman_bin_width(data, return_bins)
@deprecated('0.4', warning_type=AstroMLDeprecationWarning) class KnuthF: r"""Class which implements the function minimized by knuth_bin_width Parameters ---------- data : array-like, one dimension data to be histogrammed Notes ----- the function F is given by .. math:: F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2}) - M\log\Gamma(\frac{1}{2}) - \log\Gamma(\frac{2n+M}{2}) + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2}) where :math:`\Gamma` is the Gamma function, :math:`n` is the number of data points, :math:`n_k` is the number of measurements in bin :math:`k`. See Also -------- knuth_bin_width astroML.plotting.hist """ def __init__(self, data): self.data = np.array(data, copy=True) if self.data.ndim != 1: raise ValueError("data should be 1-dimensional") self.data.sort() self.n = self.data.size def bins(self, M): """Return the bin edges given a width dx""" return np.linspace(self.data[0], self.data[-1], int(M) + 1) def __call__(self, M): return self.eval(M) def eval(self, M): """Evaluate the Knuth function Parameters ---------- dx : float Width of bins Returns ------- F : float evaluation of the negative Knuth likelihood function: smaller values indicate a better fit. """ M = int(M) if M <= 0: return np.inf bins = self.bins(M) nk, bins = np.histogram(self.data, bins) return -(self.n * np.log(M) + gammaln(0.5 * M) - M * gammaln(0.5) - gammaln(self.n + 0.5 * M) + np.sum(gammaln(nk + 0.5)))
[docs]@deprecated('0.4', alternative='astropy.stats.knuth_bin_width', warning_type=AstroMLDeprecationWarning) def knuth_bin_width(data, return_bins=False, disp=True): r"""Return the optimal histogram bin width using Knuth's rule [1]_ Parameters ---------- data : array-like, ndim=1 observed (one-dimensional) data return_bins : bool (optional) if True, then return the bin edges Returns ------- dx : float optimal bin width. Bins are measured starting at the first data point. bins : ndarray bin edges: returned if `return_bins` is True Notes ----- The optimal number of bins is the value M which maximizes the function .. math:: F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2}) - M\log\Gamma(\frac{1}{2}) - \log\Gamma(\frac{2n+M}{2}) + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2}) where :math:`\Gamma` is the Gamma function, :math:`n` is the number of data points, :math:`n_k` is the number of measurements in bin :math:`k`. References ---------- .. [1] Knuth, K.H. "Optimal Data-Based Binning for Histograms". arXiv:0605197, 2006 See Also -------- KnuthF freedman_bin_width scotts_bin_width """ return astropy_stats.knuth_bin_width(data, return_bins)
[docs]@deprecated('0.4', alternative='astropy.stats.histogram', warning_type=AstroMLDeprecationWarning) def histogram(a, bins=10, range=None, **kwargs): """Enhanced histogram This is a histogram function that enables the use of more sophisticated algorithms for determining bins. Aside from the `bins` argument allowing a string specified how bins are computed, the parameters are the same as numpy.histogram(). Parameters ---------- a : array_like array of data to be histogrammed bins : int or list or str (optional) If bins is a string, then it must be one of: 'blocks' : use bayesian blocks for dynamic bin widths 'knuth' : use Knuth's rule to determine bins 'scotts' : use Scott's rule to determine bins 'freedman' : use the Freedman-diaconis rule to determine bins range : tuple or None (optional) the minimum and maximum range for the histogram. If not specified, it will be (x.min(), x.max()) other keyword arguments are described in numpy.hist(). Returns ------- hist : array The values of the histogram. See `normed` and `weights` for a description of the possible semantics. bin_edges : array of dtype float Return the bin edges ``(length(hist)+1)``. See Also -------- numpy.histogram astroML.plotting.hist """ a = np.asarray(a) # if range is specified, we need to truncate the data for # the bin-finding routines if (range is not None and (bins in ['blocks', 'knuth', 'scotts', 'freedman'])): a = a[(a >= range[0]) & (a <= range[1])] if isinstance(bins, str): if bins == 'blocks': bins = astropy_stats.bayesian_blocks(a) elif bins == 'knuth': da, bins = astropy_stats.knuth_bin_width(a, True) elif bins == 'scotts': da, bins = astropy_stats.scott_bin_width(a, True) elif bins == 'freedman': da, bins = astropy_stats.freedman_bin_width(a, True) else: raise ValueError("unrecognized bin code: '{}'".format(bins)) return np.histogram(a, bins, range, **kwargs)