"""Functions for calculating correlation."""
import numpy as np
import scipy.stats as sstats
import scipy.special as sspecial
from sklearn.utils.validation import check_random_state
try: # scipy >= 1.8.0
from scipy.stats._stats_py import _chk2_asarray
except ImportError: # scipy < 1.8.0
from scipy.stats.stats import _chk2_asarray
[docs]
def efficient_pearsonr(a, b, ddof=1, nan_policy='propagate'):
"""
Compute correlation of matching columns in `a` and `b`.
Parameters
----------
a,b : array_like
Sample observations. These arrays must have the same length and either
an equivalent number of columns or be broadcastable
ddof : int, optional
Degrees of freedom correction in the calculation of the standard
deviation. Default: 1
nan_policy : bool, optional
Defines how to handle when input contains nan. 'propagate' returns nan,
'raise' throws an error, 'omit' performs the calculations ignoring nan
values. Default: 'propagate'
Returns
-------
corr : float or numpy.ndarray
Pearson's correlation coefficient between matching columns of inputs
pval : float or numpy.ndarray
Two-tailed p-values
Notes
-----
If either input contains nan and nan_policy is set to 'omit', both arrays
will be masked to omit the nan entries.
Examples
--------
>>> from netneurotools import stats
Generate some not-very-correlated and some highly-correlated data:
>>> np.random.seed(12345678) # set random seed for reproducible results
>>> x1, y1 = stats.make_correlated_xy(corr=0.1, size=100)
>>> x2, y2 = stats.make_correlated_xy(corr=0.8, size=100)
Calculate both correlations simultaneously:
>>> stats.efficient_pearsonr(np.c_[x1, x2], np.c_[y1, y2])
(array([0.10032565, 0.79961189]), array([3.20636135e-01, 1.97429944e-23]))
"""
a, b, _ = _chk2_asarray(a, b, 0)
if len(a) != len(b):
raise ValueError('Provided arrays do not have same length')
if a.size == 0 or b.size == 0:
return np.nan, np.nan
if nan_policy not in ('propagate', 'raise', 'omit'):
raise ValueError(f'Value for nan_policy "{nan_policy}" not allowed')
a, b = a.reshape(len(a), -1), b.reshape(len(b), -1)
if (a.shape[1] != b.shape[1]):
a, b = np.broadcast_arrays(a, b)
mask = np.logical_or(np.isnan(a), np.isnan(b))
if nan_policy == 'raise' and np.any(mask):
raise ValueError('Input cannot contain NaN when nan_policy is "omit"')
elif nan_policy == 'omit':
# avoid making copies of the data, if possible
a = np.ma.masked_array(a, mask, copy=False, fill_value=np.nan)
b = np.ma.masked_array(b, mask, copy=False, fill_value=np.nan)
with np.errstate(invalid='ignore'):
corr = (sstats.zscore(a, ddof=ddof, nan_policy=nan_policy)
* sstats.zscore(b, ddof=ddof, nan_policy=nan_policy))
sumfunc, n_obs = np.sum, len(a)
if nan_policy == 'omit':
corr = corr.filled(np.nan)
sumfunc = np.nansum
n_obs = np.squeeze(np.sum(np.logical_not(np.isnan(corr)), axis=0))
corr = sumfunc(corr, axis=0) / (n_obs - 1)
corr = np.squeeze(np.clip(corr, -1, 1)) / 1
# taken from scipy.stats
ab = (n_obs / 2) - 1
prob = 2 * sspecial.betainc(ab, ab, 0.5 * (1 - np.abs(corr)))
return corr, prob
[docs]
def weighted_pearsonr():
"""Calculate weighted Pearson correlation coefficient."""
pass