# Source code for dbldatagen.distributions.gamma

# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This file defines the classes related to the Gamma statistical distribution.

"""

import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType

from .data_distribution import DataDistribution


class Gamma(DataDistribution):
    """ Specify Gamma distribution with specific shape and scale

    :param shape: shape parameter (k); must be int-like or float-like
    :param scale: scale parameter (theta); must be int-like or float-like

    See https://en.wikipedia.org/wiki/Gamma_distribution

    Scaling is performed to normalize values between 0 and 1

    """

    def __init__(self, shape, scale):
        """ Validate and store the distribution parameters.

        :param shape: shape parameter (k)
        :param scale: scale parameter (theta)
        :raises AssertionError: if either parameter is not a Python or NumPy integer / float
        """
        super().__init__()

        # `isinstance` also admits subclasses and every NumPy scalar width (e.g. np.float32).
        # bool is a subclass of int, so it is excluded explicitly to keep rejecting it.
        numeric_types = (int, float, np.integer, np.floating)
        for name, value in (("shape", shape), ("scale", scale)):
            assert isinstance(value, numeric_types) and not isinstance(value, bool), \
                f"{name} must be int-like or float-like"

        self._shape = shape
        self._scale = scale

    @property
    def shape(self):
        """ Return shape parameter."""
        return self._shape

    @property
    def scale(self):
        """ Return scale parameter."""
        return self._scale

    def __str__(self):
        """ Return string representation of object """
        return f"GammaDistribution(shape(`k`)={self._shape}, scale(`theta`)={self._scale}, seed={self.randomSeed})"

    @staticmethod
    def gamma_func(shape_series: pd.Series, scale_series: pd.Series, random_seed: pd.Series) -> pd.Series:
        """ Pandas / Numpy based function to generate gamma samples

        The samples are min-max normalized over the batch that is passed in, so the smallest
        sample of the batch maps to 0 and the largest maps to 1.

        :param shape_series: pandas series of shape (k) values
        :param scale_series: pandas series of scale (theta) values
        :param random_seed:  pandas series of random seed values; only the first entry is used

        :return: Samples scaled from 0 .. 1. An empty batch yields an empty series, and a batch whose
                 samples are all identical (for example a single-row batch) yields zeros.
        """
        # An empty batch has no seed entry to read and no min / max to compute.
        if len(shape_series) == 0:
            return pd.Series([], dtype=np.float64)

        shape = shape_series.to_numpy()
        scale = scale_series.to_numpy()
        seed = random_seed.to_numpy()[0]

        rng = DataDistribution.get_np_random_generator(seed)

        results = rng.gamma(shape, scale)

        # scale results to range [0, 1]
        amin = float(np.amin(results))
        amax = float(np.amax(results))
        scaling_factor = amax - amin

        # With a zero range the division below would be 0 / 0 and produce NaN values.
        if scaling_factor == 0:
            return pd.Series(np.zeros(len(results), dtype=np.float64))

        return pd.Series((results - amin) / scaling_factor)

    def generateNormalizedDistributionSample(self):
        """ Generate sample of data for distribution

        Wraps `gamma_func` in a non-deterministic pandas UDF returning `FloatType` and applies it to
        literal columns holding the shape, scale and random seed.

        :return: random samples from distribution scaled to values between 0 and 1
        """
        gamma_sample = F.pandas_udf(self.gamma_func, returnType=FloatType()).asNondeterministic()

        # A literal -1.0 is passed when no seed is set; how that value is interpreted is decided by
        # `DataDistribution.get_np_random_generator`, which is defined outside this class.
        seed_column = F.lit(self.randomSeed) if self.randomSeed is not None else F.lit(-1.0)

        return gamma_sample(F.lit(self._shape), F.lit(self._scale), seed_column)