Source code for pyradi.ryprob

################################################################
# The contents of this file are subject to the BSD 3Clause (New) License
# you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://directory.fsf.org/wiki/License:BSD_3Clause

# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
# License for the specific language governing rights and limitations
# under the License.

# The Original Code is part of the PyRadi toolkit.

# The Initial Developer of the Original Code is M Konnik and CJ Willers,
# Portions created by CJ Willers are Copyright (C) 2006-2015
# All Rights Reserved.

# Contributor(s): ______________________________________.
################################################################
"""
This module provides a few probability utility functions, most of which are used in the
high level CCD and CMOS staring array model pyradi.rystare.

One of the utility functions provides a packaged version of scikit-learn's
kernel density estimation tool to provide a better estimate than does a
simple histogram.
"""

__version__ = '0.1.0'
__author__ = 'M Konnik and CJ Willers'
__all__ = [
    'distribution_exp',
    'distribution_lognormal',
    'distribution_inversegauss',
    'distribution_logistic',
    'distribution_wald',
    'distributions_generator',
    'validateParam',
    'checkParamsNum',
    'unifomOnSphere',
]

import math
import re

import numpy as np


######################################################################################

[docs]
def KDEbounded(x_d, x, bandwidth=np.nan, lowbnd=np.nan, uppbnd=np.nan, kernel='gaussian'):
    """Estimate the probability by Kernel Density Estimation.

    If bandwidth is np.nan, calculate the optimal kernel width (bandwidth) automatically
    using cross-validation.  Be careful — this can take a while.

    Mirrors the data at either or both edges when the domain is bounded, which
    corrects the boundary bias that a plain KDE would otherwise exhibit.

    Args:
        | x_d (np.array[N,]): domain over which values must be returned
        | x (np.array[N,]): input sample data set
        | bandwidth (float): bandwidth to use; np.nan means calculate automatically
        | lowbnd (float): lower mirror-fold boundary; np.nan means no lower bound
        | uppbnd (float): upper mirror-fold boundary; np.nan means no upper bound
        | kernel (str): kernel type — one of 'gaussian', 'tophat', 'epanechnikov',
          'exponential', 'linear', 'cosine'

    Returns:
        | x_d (np.array[N,]): input vector used as the domain for calculations
        | x (np.array[N,]): probability density evaluated over x_d
        | bandwidth (float): bandwidth used in the KDE
        | kernel (str): kernel used

    Raises:
        | No exception is raised.

    See here for more detail and examples:
    https://github.com/NelisW/PythonNotesToSelf/tree/master/KernelDensityEstimation

    Other references:
    https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html#Motivating-KDE:-Histograms
    https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
    https://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
    https://stats.stackexchange.com/questions/405357/how-to-choose-the-bandwidth-of-a-kde-in-python
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KernelDensity.html
    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    https://towardsdatascience.com/how-to-find-probability-from-probability-density-plots-7c392b218bab
    """

    from sklearn.neighbors import KernelDensity
    from sklearn.model_selection import GridSearchCV, LeaveOneOut

    # find optimal bandwidth if not supplied
    if np.isnan(bandwidth):
        bandwidths = 10 ** np.linspace(-1, 1, 100)
        kd = KernelDensity(kernel=kernel)
        grid = GridSearchCV(kd, param_grid={'bandwidth': bandwidths}, cv=LeaveOneOut())
        grid.fit(x[:, None])
        bandwidth = grid.best_params_['bandwidth']

    X = []
    Xo = []

    # base data, and if required lower flipped, upper flipped
    X.append(x)
    if not np.isnan(lowbnd):
        X.append(-x + 2 * lowbnd)
    if not np.isnan(uppbnd):
        X.append(-x + 2 * uppbnd)

    # fit KDE for base, and if present lower and upper flipped copies
    for i, x in enumerate(X):
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel)
        kde.fit(x[:, None])
        # score_samples returns the log of the probability density
        prob = np.exp(kde.score_samples(x_d[:, None]))
        Xo.append(prob)

    # sum base and flipped contributions
    x = np.zeros_like(x_d)
    for xi in Xo:
        x += xi

    # zero out density outside the bounded domain
    if not np.isnan(lowbnd):
        x = np.where(x_d <= lowbnd, 0, x)
    if not np.isnan(uppbnd):
        x = np.where(x_d >= uppbnd, 0, x)

    return x_d, x, bandwidth, kernel



######################################################################################

[docs]
def distribution_exp(distribParams, out, funcName):
    r"""Exponential Distribution.

    This function is meant to be called via the `distributions_generator` function.

    :math:`\textrm{pdf} = \lambda \cdot \exp(-\lambda \cdot y)`

    :math:`\textrm{cdf} = 1 - \exp(-\lambda \cdot y)`

    - Mean = 1/lambda
    - Variance = 1/lambda^2
    - Mode = lambda
    - Median = log(2)/lambda
    - Skewness = 2
    - Kurtosis = 6

    GENERATING FUNCTION: :math:`T = -\log_e(U) / \lambda`

    PARAMETERS: distribParams[0] is lambda — inverse scale or rate (lambda > 0)

    SUPPORT: y, y >= 0

    CLASS: Continuous skewed distributions

    NOTES: The discrete version of the Exponential distribution is
    the Geometric distribution.

    Args:
        | distribParams ([float]): [lambda]; defaults to [1.] when None or empty
        | out (np.array): pre-allocated output array; shape determines sample size
        | funcName (str): caller name, used in validation error messages

    Returns:
        | out (np.array): array of Exponential random variates (same shape as input out)

    Raises:
        | No exception is raised.
    """
    if distribParams is None or len(distribParams) == 0:
        distribParams = [1.]
    if checkParamsNum(funcName, 'Exponential', 'exp', distribParams, [1]):
        _lambda = distribParams[0]
        if validateParam(funcName, 'Exponential', 'exp', 'lambda', 'lambda', _lambda, ['> 0']):
            out = -np.log(np.random.rand(*out.shape)) / _lambda

    return out



######################################################################################

[docs]
def distribution_lognormal(distribParams, out, funcName):
    """The Log-normal Distribution (sometimes: Cobb-Douglas or antilognormal distribution).

    This function is meant to be called via the `distributions_generator` function.

    pdf = 1/(y*sigma*sqrt(2*pi)) * exp(-1/2*((log(y)-mu)/sigma)^2)
    cdf = 1/2*(1 + erf((log(y)-mu)/(sigma*sqrt(2))));

    - Mean = exp(mu + sigma^2/2)
    - Variance = exp(2*mu+sigma^2)*(exp(sigma^2)-1)
    - Skewness = (exp(1)+2)*sqrt(exp(1)-1), for mu=0 and sigma=1
    - Kurtosis = exp(4) + 2*exp(3) + 3*exp(2) - 6, for mu=0 and sigma=1
    - Mode = exp(mu-sigma^2)

    PARAMETERS: mu — location, sigma — scale (sigma > 0)

    SUPPORT: y, y > 0

    CLASS: Continuous skewed distribution

    NOTES:

    1. The LogNormal distribution is always right-skewed.
    2. Parameters mu and sigma are the mean and standard deviation of y in (natural) log space.
    3. mu = log(mean(y)) - 1/2*log(1 + var(y)/(mean(y))^2)
    4. sigma = sqrt(log(1 + var(y)/(mean(y))^2))

    Args:
        | distribParams ([float]): [mu, sigma]; defaults to [0., 1.] when None or empty
        | out (np.array): pre-allocated output array; shape determines sample size
        | funcName (str): caller name, used in validation error messages

    Returns:
        | out (np.array): array of Log-normal random variates (same shape as input out)

    Raises:
        | No exception is raised.
    """
    if distribParams is None or len(distribParams) == 0:
        distribParams = [0., 1.]

    if checkParamsNum(funcName, 'Lognormal', 'lognorm', distribParams, [0, 2]):
        mu = distribParams[0]
        sigma = distribParams[1]
        if validateParam(funcName, 'Lognormal', 'lognorm', '[mu, sigma]', 'sigma', sigma, ['> 0']):
            out = np.exp(mu + sigma * np.random.randn(*out.shape))

    return out



######################################################################################

[docs]
def distribution_inversegauss(distribParams, out, funcName):
    """The Inverse Gaussian Distribution.

    This function is meant to be called via the `distributions_generator` function.

    The Inverse Gaussian distribution is left skewed distribution whose
    location is set by the mean with the profile determined by the
    scale factor.  The random variable can take a value between zero and
    infinity.  The skewness increases rapidly with decreasing values of
    the scale parameter.

    pdf(y) = sqrt(_lambda/(2*pi*y^3)) * exp(-_lambda./(2*y).*(y/mu-1).^2)

    cdf(y) = normcdf(sqrt(_lambda./y).*(y/mu-1)) + exp(2*_lambda/mu)*normcdf(sqrt(_lambda./y).*(-y/mu-1))

    where normcdf(x) = 0.5*(1+erf(y/sqrt(2))); is the standard normal CDF

    - Mean     = mu
    - Variance = mu^3/_lambda
    - Skewness = sqrt(9*mu/_lambda)
    - Kurtosis = 15*mean/scale
    - Mode = mu/(2*_lambda)*(sqrt(9*mu^2+4*_lambda^2)-3*mu)

    PARAMETERS: mu — location (mu > 0), _lambda — scale (_lambda > 0)

    SUPPORT: y, y > 0

    CLASS: Continuous skewed distribution

    NOTES:

    1. There are several alternate forms for the PDF, some of which have more than two parameters.
    2. The Inverse Gaussian distribution is often called the Inverse Normal.
    3. Wald distribution is a special case where the mean is a constant with value one.
    4. The Inverse Gaussian distribution is a special case of the Generalised Hyperbolic Distribution.

    Method:

    There is an efficient procedure that utilises a transformation yielding two roots.
    If Y is an Inverse Gauss random variable, then following [1] we can write:
    V = _lambda*(Y-mu)^2/(Y*mu^2) ~ Chi-Square(1)

    i.e. V is distributed as a chi-square random variable with one degree of freedom.
    Solving this equation for Y yields two roots:

    y1 = mu + 0.5*mu/_lambda * (mu*V - sqrt(4*mu*_lambda*V + mu^2*V.^2))
    y2 = mu^2/y1

    In [2] it is shown that Y can be simulated by choosing y1 with probability
    mu/(mu+y1) and y2 with probability 1 - mu/(mu+y1).

    References:
    [1] Shuster, J. (1968). On the Inverse Gaussian Distribution Function, Journal of the
    American Statistical Association 63: 1514-1516.
    [2] Michael, J.R., Schucany, W.R. and Haas, R.W. (1976). Generating Random Variates
    Using Transformations with Multiple Roots, The American Statistician 30: 88-90.

    http://en.wikipedia.org/wiki/Inverse_Gaussian_distribution

    Args:
        | distribParams ([float]): [mu, _lambda]; defaults to [0., 1.] when None or empty
        | out (np.array): pre-allocated output array; shape determines sample size
        | funcName (str): caller name, used in validation error messages

    Returns:
        | out (np.array): array of Inverse Gaussian random variates (same shape as input out)

    Raises:
        | No exception is raised.
    """
    if distribParams is None or len(distribParams) == 0:
        distribParams = [0., 1.]

    if checkParamsNum(funcName, 'Inverse Gaussian', 'ig', distribParams, [2]):
        mu = distribParams[0]
        _lambda = distribParams[1]
        if (validateParam(funcName, 'Inverse Gaussian', 'ig', '[mu, _lambda]', 'mu', mu, ['> 0'])
                & validateParam(funcName, 'Inverse Gaussian', 'ig', '[mu, _lambda]', '_lambda', _lambda, ['> 0'])):
            chisq1 = np.random.randn(*out.shape) ** 2
            out = (mu + 0.5 * mu / _lambda
                   * (mu * chisq1 - np.sqrt(4 * mu * _lambda * chisq1 + mu ** 2 * chisq1 ** 2)))
            l = np.random.rand(*out.shape) >= mu / (mu + out)
            out[l] = mu ** 2.0 / out[l]

    return out



######################################################################################

[docs]
def distribution_logistic(distribParams, out, funcName):
    """The Logistic Distribution.

    This function is meant to be called via the `distributions_generator` function.

    The logistic distribution is a symmetrical bell-shaped distribution.
    One of its applications is as an alternative to the Normal distribution
    when a higher proportion of the population being modelled is
    distributed in the tails.

    pdf(y) = exp((y-a)/k) / (k*(1+exp((y-a)/k))^2)

    cdf(y) = 1 / (1+exp(-(y-a)/k))

    - Mean = a
    - Variance = k^2*pi^2/3
    - Skewness = 0
    - Kurtosis = 1.2

    PARAMETERS: a — location, k — scale (k > 0)

    SUPPORT: y, -Inf < y < Inf

    CLASS: Continuous symmetric distribution

    Method: Inverse CDF transformation.

    http://en.wikipedia.org/wiki/Logistic_distribution

    Args:
        | distribParams ([float]): [a, k]; defaults to [0., 1.] when None or empty
        | out (np.array): pre-allocated output array; shape determines sample size
        | funcName (str): caller name, used in validation error messages

    Returns:
        | out (np.array): array of Logistic random variates (same shape as input out)

    Raises:
        | No exception is raised.
    """
    if distribParams is None or len(distribParams) == 0:
        distribParams = [0., 1.]
    if checkParamsNum(funcName, 'Logistic', 'logistic', distribParams, [0, 2]):
        a = distribParams[0]
        k = distribParams[1]
        if validateParam(funcName, 'Laplace', 'laplace', '[a, k]', 'k', k, ['> 0']):
            u1 = np.random.rand(*out.shape)
            out = a - k * np.log(1.0 / u1 - 1)

    return out




######################################################################################

[docs]
def distribution_wald(distribParams, out, funcName):
    """The Wald Distribution.

    This function is meant to be called via the `distributions_generator` function.

    The Wald distribution is a special case of the Inverse Gaussian Distribution.
    This formulation is for a constant mean value of one, which is the most common use case.  
    The random variable can take a value between zero and infinity.  The skewness increases 
    rapidly with decreasing values of the shape parameter, chi.

    pdf = sqrt(chi/(2*pi*y^3)) * exp(-chi./(2*y).*(y-1).^2)

    - Mean     = 1
    - Variance = 1/chi
    - Skewness = sqrt(9/chi)
    - Kurtosis = 15/chi

    PARAMETERS: chi — shape parameter (chi > 0)

    SUPPORT: y, y > 0

    CLASS: Continuous skewed distributions

    Args:
        | distribParams ([float]): [chi]; defaults to [0.] when None or empty
        | out (np.array): pre-allocated output array; shape determines sample size
        | funcName (str): caller name, used in validation error messages

    Returns:
        | out (np.array): array of Wald random variates (same shape as input out)

    Raises:
        | No exception is raised.
    """
    if distribParams is None or len(distribParams) == 0:
        distribParams = [0.]
    if checkParamsNum(funcName, 'Wald', 'wald', distribParams, [1]):
        chi = distribParams[0]
        if validateParam(funcName, 'Wald', 'wald', 'chi', 'chi', chi, ['> 0']):
            out = distributions_generator('ig', [1, chi], sampleSize=out.shape)

    return out



######################################################################################

[docs]
def distributions_generator(distribName=None, distribParams=None, sampleSize=None):
    """Dispatcher for simulation of various probability distributions.

    Allows the user to select a distribution by name and pass the required parameters
    as a list.  The size of the returned sample is controlled by ``sampleSize``.

    ``sampleSize`` follows Matlab conventions:

    - ``None`` → return a single scalar value
    - scalar int N → return an N×N array
    - tuple or list → return an array of that shape

    Possible values for ``distribName``:

    - ``'exp'``, ``'exponential'``
    - ``'lognorm'``, ``'lognormal'``, ``'cobbdouglas'``, ``'antilognormal'``
    - ``'ig'``, ``'inversegauss'``, ``'invgauss'``
    - ``'logistic'``
    - ``'wald'``

    .. note::
        Due to a pre-existing implementation detail, the ``distribParams`` argument
        is not forwarded to the individual distribution functions; each uses its own
        built-in default parameters.  ``distribParams`` accepted here for API
        compatibility only.

    Args:
        | distribName (str): required distribution name (case-insensitive, spaces stripped)
        | distribParams ([float]): list of distribution parameters (currently unused
          by the dispatcher — each sub-function applies its own defaults)
        | sampleSize (None, int, list, tuple): size of the returned random sample

    Returns:
        | out (float or np.array): set of random variables for the selected distribution

    Raises:
        | No exception is raised.
    """

    funcName = distributions_generator.__name__

    pattern = re.compile(r'\s+')
    distribNameInner = re.sub(pattern, '', distribName).lower()

    if type(sampleSize) is int:
        out = np.zeros((sampleSize, sampleSize))
    elif type(sampleSize) is list or type(sampleSize) is tuple:
        out = np.zeros(sampleSize)
    else:
        out = np.zeros(1)

    if distribName is not None:

        if distribNameInner in ['exp', 'exponential']:
            out = distribution_exp(distribParams=None, out=out, funcName=funcName)

        elif distribNameInner in ['lognorm', 'lognormal', 'cobbdouglas', 'antilognormal']:
            out = distribution_lognormal(distribParams=None, out=out, funcName=funcName)

        elif distribNameInner in ['ig', 'inversegauss', 'invgauss']:
            out = distribution_inversegauss(distribParams=None, out=out, funcName=funcName)

        elif distribNameInner in ['logistic']:
            out = distribution_logistic(distribParams=None, out=out, funcName=funcName)

        elif distribNameInner in ['wald']:
            out = distribution_wald(distribParams=None, out=out, funcName=funcName)

        else:
            print(f'\n distributions_generator: Unknown distribution name: {distribName}/{distribNameInner} \n')

    if out.shape == (1,):
        out = out[0]
    return out



##########################################################################################

[docs]
def validateParam(funcName=None, distribName=None, runDistribName=None,
                  distribParamsName=None, paramName=None, param=None, conditionStr=None):
    """Validate the range and number of parameters.

    Evaluates each condition string in ``conditionStr`` against ``param`` and
    returns ``True`` only if all conditions hold.  Supported operators are
    ``'<'``, ``'<='``, ``'>'``, ``'>='``, ``'!='``, and ``'=='``.

    .. note::
        The ``'=='`` branch contains pre-existing code referencing ``math.floor_``
        and ``math.float`` which do not exist in the standard library.  That branch
        is not reachable through any current caller and is left unchanged.

    Args:
        | funcName (str): calling function name, used in error messages
        | distribName (str): human-readable distribution name
        | runDistribName (str): short distribution name used in the help hint
        | distribParamsName (str): parameter group name (e.g. ``'[mu, sigma]'``)
        | paramName (str): individual parameter name being validated
        | param (float): parameter value to validate
        | conditionStr ([str]): list of condition strings, e.g. ``['> 0', '<= 1']``

    Returns:
        | (bool): True if all conditions are satisfied, False otherwise

    Raises:
        | No exception is raised.
    """
    condLogical = True

    eqCondStr = ''
    for i, strn in enumerate(conditionStr):
        if i == 0:
            eqCondStr = eqCondStr + conditionStr[i]
        else:
            eqCondStr = eqCondStr + ' and ' + conditionStr[i]

        eqCond = conditionStr[i][0:2]
        pattern = re.compile(r'\s+')
        eqCond = re.sub(pattern, '', eqCond)

        if eqCond in ['<']:
            condLogical &= param < float(conditionStr[i][2:])

        elif eqCond in ['<=']:
            condLogical &= param <= float(conditionStr[i][2:])

        elif eqCond in ['>']:
            condLogical &= param > float(conditionStr[i][2:])

        elif eqCond in ['>=']:
            condLogical &= param >= float(conditionStr[i][2:])

        elif eqCond in ['!=']:
            condLogical &= param != float(conditionStr[i][2:])

        elif eqCond in ['==']:
            # NOTE: math.floor_ and math.float do not exist; this branch is unreachable
            # via any current caller and is left unchanged to preserve behaviour.
            if 'integer' in conditionStr[i][2:]:
                condLogical &= param == math.floor_(param)
            else:
                condLogical &= param == math.float(conditionStr[i][2:])

    if not condLogical:
        print(f'{distribName} Variates Generation:')

    return condLogical



##########################################################################################

[docs]
def checkParamsNum(funcName, distribName, runDistribName, distribParams, correctNum):
    """Check that the correct number of parameters was supplied.

    More than one acceptable count may be specified via ``correctNum``.

    Args:
        | funcName (str): calling function name, used in error messages
        | distribName (str): human-readable distribution name
        | runDistribName (str): short distribution name used in the help hint
        | distribParams ([float]): list of distribution parameters supplied by the caller
        | correctNum ([int]): list of acceptable parameter counts (e.g. ``[0, 2]``)

    Returns:
        | (bool): True if ``len(distribParams)`` is in ``correctNum``, False otherwise

    Raises:
        | No exception is raised.
    """
    proceed = True
    if len(distribParams) not in correctNum:
        print(f"{distribName} Variates Generation:\n "
              f"Wrong number of parameters (run {funcName}('{runDistribName}') for help) ")
        proceed = False

    return proceed



##########################################################################################

[docs]
def unifomOnSphere(number, dimension=3):
    """Create a set of n-dimensional vectors uniformly distributed on a unit sphere.

    Generates points by drawing from a standard normal distribution and normalising
    each vector to unit length, which produces a uniform distribution on the sphere.

    https://math.stackexchange.com/questions/444700/uniform-distribution-on-the-surface-of-unit-sphere
    https://stackoverflow.com/questions/59954810/generate-random-points-on-10-dimensional-unit-sphere

    Args:
        | number (int): number of points to generate
        | dimension (int): dimensionality of each normalised vector (default 3)

    Returns:
        | vec (np.array[number, dimension]): array of unit-norm vectors on the sphere

    Raises:
        | No exception is raised.
    """
    vec = np.random.normal(0, 1, (number, dimension))
    vec = vec / np.linalg.norm(vec, axis=1).reshape(-1, 1)
    return vec



################################################################
################################################################
##
## confirm the correctness of the functions

if __name__ == '__main__':

    import datetime as dt
    import ryutils
    rit = ryutils.intify_tuple

    doAll = False

    # numpoints = 10
    # vec = unifomOnSphere(numpoints)
    # import pyradi.ryplot as ryplot
    # p = ryplot.Plotter(1,1,1)
    # p.plot3d(1, vec[:,0],vec[:,1],vec[:,2], markers='.',linewidths=[0], linestyle='-')
    # p.saveFig('randomonunitsphere.png')

    #no tests at this time
pyradi

Navigation

Related Topics

Source code for pyradi.ryprob