Source code for astroML.correlation

"""
Tools for computing two-point correlation functions.
"""

import numpy as np

from sklearn.neighbors import KDTree
from sklearn.utils import check_random_state


def uniform_sphere(RAlim, DEClim, size=1):
    """Draw a uniform sample on a sphere

    Parameters
    ----------
    RAlim : tuple
        select Right Ascension between RAlim[0] and RAlim[1]
        units are degrees
    DEClim : tuple
        select Declination between DEClim[0] and DEClim[1]
    size : int (optional)
        the size of the random arrays to return (default = 1)

    Returns
    -------
    RA, DEC : ndarray
        the random sample on the sphere within the given limits.
        arrays have shape equal to size.
    """
    zlim = np.sin(np.pi * np.asarray(DEClim) / 180.)

    z = zlim[0] + (zlim[1] - zlim[0]) * np.random.random(size)
    DEC = (180. / np.pi) * np.arcsin(z)
    RA = RAlim[0] + (RAlim[1] - RAlim[0]) * np.random.random(size)

    return RA, DEC


def ra_dec_to_xyz(ra, dec):
    """Convert ra & dec to Euclidean points

    Parameters
    ----------
    ra, dec : ndarrays

    Returns
    x, y, z : ndarrays
    """
    sin_ra = np.sin(ra * np.pi / 180.)
    cos_ra = np.cos(ra * np.pi / 180.)

    sin_dec = np.sin(np.pi / 2 - dec * np.pi / 180.)
    cos_dec = np.cos(np.pi / 2 - dec * np.pi / 180.)

    return (cos_ra * sin_dec,
            sin_ra * sin_dec,
            cos_dec)


def angular_dist_to_euclidean_dist(D, r=1):
    """convert angular distances to euclidean distances"""
    return 2 * r * np.sin(0.5 * D * np.pi / 180.)


[docs]def two_point(data, bins, method='standard', data_R=None, random_state=None): """Two-point correlation function Parameters ---------- data : array_like input data, shape = [n_samples, n_features] bins : array_like bins within which to compute the 2-point correlation. shape = Nbins + 1 method : string "standard" or "landy-szalay". data_R : array_like (optional) if specified, use this as the random comparison sample random_state : integer, np.random.RandomState, or None specify the random state to use for generating background Returns ------- corr : ndarray the estimate of the correlation function within each bin shape = Nbins """ data = np.asarray(data) bins = np.asarray(bins) rng = check_random_state(random_state) if method not in ['standard', 'landy-szalay']: raise ValueError("method must be 'standard' or 'landy-szalay'") if bins.ndim != 1: raise ValueError("bins must be a 1D array") if data.ndim == 1: data = data[:, np.newaxis] elif data.ndim != 2: raise ValueError("data should be 1D or 2D") n_samples, n_features = data.shape # shuffle all but one axis to get background distribution if data_R is None: data_R = data.copy() for i in range(n_features - 1): rng.shuffle(data_R[:, i]) else: data_R = np.asarray(data_R) if (data_R.ndim != 2) or (data_R.shape[-1] != n_features): raise ValueError('data_R must have same n_features as data') factor = len(data_R) * 1. / len(data) # Fast two-point correlation functions added in scikit-learn v. 0.14 KDT_D = KDTree(data) KDT_R = KDTree(data_R) counts_DD = KDT_D.two_point_correlation(data, bins) counts_RR = KDT_R.two_point_correlation(data_R, bins) DD = np.diff(counts_DD) RR = np.diff(counts_RR) # check for zero in the denominator RR_zero = (RR == 0) RR[RR_zero] = 1 if method == 'standard': corr = factor ** 2 * DD / RR - 1 elif method == 'landy-szalay': counts_DR = KDT_R.two_point_correlation(data, bins) DR = np.diff(counts_DR) corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR corr[RR_zero] = np.nan return corr
[docs]def bootstrap_two_point(data, bins, Nbootstrap=10, method='standard', return_bootstraps=False, random_state=None): """Bootstrapped two-point correlation function Parameters ---------- data : array_like input data, shape = [n_samples, n_features] bins : array_like bins within which to compute the 2-point correlation. shape = Nbins + 1 Nbootstrap : integer number of bootstrap resamples to perform (default = 10) method : string "standard" or "landy-szalay". return_bootstraps: bool if True, return full bootstrapped samples random_state : integer, np.random.RandomState, or None specify the random state to use for generating background Returns ------- corr, corr_err : ndarrays the estimate of the correlation function and the bootstrap error within each bin. shape = Nbins """ data = np.asarray(data) bins = np.asarray(bins) rng = check_random_state(random_state) if method not in ['standard', 'landy-szalay']: raise ValueError("method must be 'standard' or 'landy-szalay'") if bins.ndim != 1: raise ValueError("bins must be a 1D array") if data.ndim == 1: data = data[:, np.newaxis] elif data.ndim != 2: raise ValueError("data should be 1D or 2D") if Nbootstrap < 2: raise ValueError("Nbootstrap must be greater than 1") n_samples, n_features = data.shape # get the baseline estimate corr = two_point(data, bins, method=method, random_state=rng) bootstraps = np.zeros((Nbootstrap, len(corr))) for i in range(Nbootstrap): indices = rng.randint(0, n_samples, n_samples) bootstraps[i] = two_point(data[indices, :], bins, method=method, random_state=rng) # use masked std dev in case of NaNs corr_err = np.asarray(np.ma.masked_invalid(bootstraps).std(0, ddof=1)) if return_bootstraps: return corr, corr_err, bootstraps else: return corr, corr_err
[docs]def two_point_angular(ra, dec, bins, method='standard', random_state=None): """Angular two-point correlation function A separate function is needed because angular distances are not euclidean, and random sampling needs to take into account the spherical volume element. Parameters ---------- ra : array_like input right ascention, shape = (n_samples,) dec : array_like input declination bins : array_like bins within which to compute the 2-point correlation. shape = Nbins + 1 method : string "standard" or "landy-szalay". random_state : integer, np.random.RandomState, or None specify the random state to use for generating background Returns ------- corr : ndarray the estimate of the correlation function within each bin shape = Nbins """ ra = np.asarray(ra) dec = np.asarray(dec) rng = check_random_state(random_state) if method not in ['standard', 'landy-szalay']: raise ValueError("method must be 'standard' or 'landy-szalay'") if bins.ndim != 1: raise ValueError("bins must be a 1D array") if (ra.ndim != 1) or (dec.ndim != 1) or (ra.shape != dec.shape): raise ValueError('ra and dec must be 1-dimensional ' 'arrays of the same length') # draw a random sample with N points ra_R, dec_R = uniform_sphere((min(ra), max(ra)), (min(dec), max(dec)), 2 * len(ra)) data = np.asarray(ra_dec_to_xyz(ra, dec), order='F').T data_R = np.asarray(ra_dec_to_xyz(ra_R, dec_R), order='F').T # convert spherical bins to cartesian bins bins_transform = angular_dist_to_euclidean_dist(bins) return two_point(data, bins_transform, method=method, data_R=data_R, random_state=rng)
[docs]def bootstrap_two_point_angular(ra, dec, bins, method='standard', Nbootstraps=10, random_state=None): """Angular two-point correlation function A separate function is needed because angular distances are not euclidean, and random sampling needs to take into account the spherical volume element. Parameters ---------- ra : array_like input right ascention, shape = (n_samples,) dec : array_like input declination bins : array_like bins within which to compute the 2-point correlation. shape = Nbins + 1 method : string "standard" or "landy-szalay". Nbootstraps : int number of bootstrap resamples random_state : integer, np.random.RandomState, or None specify the random state to use for generating background Returns ------- corr : ndarray the estimate of the correlation function within each bin shape = Nbins dcorr : ndarray error estimate on dcorr (sample standard deviation of bootstrap resamples) bootstraps : ndarray The full sample of bootstraps used to compute corr and dcorr """ ra = np.asarray(ra) dec = np.asarray(dec) rng = check_random_state(random_state) if method not in ['standard', 'landy-szalay']: raise ValueError("method must be 'standard' or 'landy-szalay'") if bins.ndim != 1: raise ValueError("bins must be a 1D array") if (ra.ndim != 1) or (dec.ndim != 1) or (ra.shape != dec.shape): raise ValueError('ra and dec must be 1-dimensional ' 'arrays of the same length') data = np.asarray(ra_dec_to_xyz(ra, dec), order='F').T # convert spherical bins to cartesian bins bins_transform = angular_dist_to_euclidean_dist(bins) bootstraps = [] for i in range(Nbootstraps): # draw a random sample with N points ra_R, dec_R = uniform_sphere((min(ra), max(ra)), (min(dec), max(dec)), 2 * len(ra)) data_R = np.asarray(ra_dec_to_xyz(ra_R, dec_R), order='F').T if i > 0: # random sample of the data ind = np.random.randint(0, data.shape[0], data.shape[0]) data_b = data[ind] else: data_b = data bootstraps.append(two_point(data_b, bins_transform, method=method, data_R=data_R, random_state=rng)) bootstraps = np.asarray(bootstraps) corr = np.mean(bootstraps, 0) corr_err = np.std(bootstraps, 0, ddof=1) return corr, corr_err, bootstraps