Source code for netneurotools.datasets.generators


# -*- coding: utf-8 -*-
"""Functions for making "random" datasets."""

import numpy as np
from sklearn.utils.validation import check_random_state


[docs]def make_correlated_xy(corr=0.85, size=10000, seed=None, tol=0.001): """ Generate random vectors that are correlated to approximately `corr`. Parameters ---------- corr : [-1, 1] float or (N, N) numpy.ndarray, optional The approximate correlation desired. If a float is provided, two vectors with the specified level of correlation will be generated. If an array is provided, it is assumed to be a symmetrical correlation matrix and ``len(corr)`` vectors with the specified levels of correlation will be generated. Default: 0.85 size : int or tuple, optional Desired size of the generated vectors. Default: 1000 seed : {int, np.random.RandomState instance, None}, optional Seed for random number generation. Default: None tol : [0, 1] float, optional Tolerance of correlation between generated `vectors` and specified `corr`. Default: 0.001 Returns ------- vectors : numpy.ndarray Random vectors of size `size` with correlation specified by `corr` Examples -------- >>> from netneurotools import datasets By default two vectors are generated with specified correlation >>> x, y = datasets.make_correlated_xy() >>> np.corrcoef(x, y) # doctest: +SKIP array([[1. , 0.85083661], [0.85083661, 1. ]]) >>> x, y = datasets.make_correlated_xy(corr=0.2) >>> np.corrcoef(x, y) # doctest: +SKIP array([[1. , 0.20069953], [0.20069953, 1. ]]) You can also provide correlation matrices to generate more than two vectors if desired. Note that this makes it more difficult to ensure the actual correlations are close to the desired values: >>> corr = [[1, 0.5, 0.3], [0.5, 1, 0], [0.3, 0, 1]] >>> out = datasets.make_correlated_xy(corr=corr) >>> out.shape (3, 10000) >>> np.corrcoef(out) # doctest: +SKIP array([[1. , 0.50965273, 0.30235686], [0.50965273, 1. , 0.01089107], [0.30235686, 0.01089107, 1. ]]) """ rs = check_random_state(seed) # no correlations outside [-1, 1] bounds if np.any(np.abs(corr) > 1): raise ValueError('Provided `corr` must (all) be in range [-1, 1].') # if we're given a single number, assume two vectors are desired if isinstance(corr, (int, float)): covs = np.ones((2, 2)) * 0.111 covs[(0, 1), (1, 0)] *= corr # if we're given a correlation matrix, assume `N` vectors are desired elif isinstance(corr, (list, np.ndarray)): corr = np.asarray(corr) if corr.ndim != 2 or len(corr) != len(corr.T): raise ValueError('If `corr` is a list or array, must be a 2D ' 'square array, not {}'.format(corr.shape)) if np.any(np.diag(corr) != 1): raise ValueError('Diagonal of `corr` must be 1.') covs = corr * 0.111 means = [0] * len(covs) # generate the variables count = 0 while count < 500: vectors = rs.multivariate_normal(mean=means, cov=covs, size=size).T flat = vectors.reshape(len(vectors), -1) # if diff between actual and desired correlations less than tol, break if np.all(np.abs(np.corrcoef(flat) - (covs / 0.111)) < tol): break count += 1 return vectors