# Source code for dbldatagen.nrange

# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This module defines the `NRange` class used to specify data ranges
"""

import math

from pyspark.sql.types import LongType, FloatType, IntegerType, DoubleType, ShortType, \
    ByteType

from .datarange import DataRange

_OLD_MIN_OPTION = 'min'
_OLD_MAX_OPTION = 'max'


class NRange(DataRange):
    """ Ranged numeric interval representing the interval minValue .. maxValue inclusive

    A ranged object can be used as an alternative to the `minValue`, `maxValue`, `step` parameters
    of the DataGenerator `withColumn` method.
    Specify by passing an instance of `NRange` to the `dataRange` parameter.

    :param minValue: Minimum value of range. May be integer / long / float
    :param maxValue: Maximum value of range. May be integer / long / float
    :param step: Step value for range. May be integer / long / float
    :param until: Exclusive upper bound for range (i.e. `maxValue` + 1)
    :param kwArgs: Only the legacy keywords `min` and `max` are accepted, as older
                   spellings of `minValue` and `maxValue`

    You may only specify a `maxValue` or `until` value not both.

    For a decreasing sequence, use a negative step value.
    """

    def __init__(self, minValue=None, maxValue=None, step=None, until=None, **kwArgs):
        # Accept the legacy `min` / `max` keywords, but never together with
        # their modern equivalents.
        if _OLD_MIN_OPTION in kwArgs:
            assert minValue is None, \
                "Only one of `min` and `minValue` can be specified. Use of `minValue` is preferred"
            minValue = kwArgs.pop(_OLD_MIN_OPTION)

        if _OLD_MAX_OPTION in kwArgs:
            assert maxValue is None, \
                "Only one of `max` and `maxValue` can be specified. Use of `maxValue` is preferred"
            maxValue = kwArgs.pop(_OLD_MAX_OPTION)

        # Anything still left in kwArgs is an unsupported option.
        assert not kwArgs, "no keyword options other than `min` and `max` allowed"

        assert maxValue is None or until is None, "Only one of maxValue or until can be specified"

        self.minValue = minValue
        # `until` is documented as `maxValue + 1`, so the inclusive maximum
        # is one less than `until`.
        self.maxValue = maxValue if until is None else until - 1
        self.step = step

    def __str__(self):
        return f"NRange({self.minValue}, {self.maxValue}, {self.step})"

    def isEmpty(self):
        """Check if object is empty (i.e. all instance vars of note are `None`)

        :returns: `True` if empty, `False` otherwise
        """
        return self.minValue is None and self.maxValue is None and self.step is None

    def isFullyPopulated(self):
        """Check if all instance vars are populated

        :returns: `True` if fully populated, `False` otherwise
        """
        return self.minValue is not None and self.maxValue is not None and self.step is not None

    def adjustForColumnDatatype(self, ctype):
        """ Adjust default values for column output type

        Fills in any of `minValue`, `maxValue` and `step` that are still `None`
        with defaults appropriate for `ctype`, and asserts that `maxValue`
        is within bounds for short and byte columns.

        :param ctype: Spark SQL type instance to adjust range for
        :returns: No return value - executes for effect only
        :raises AssertionError: if `maxValue` exceeds the bound checked for a short or byte column
        """
        if ctype.typeName() == 'decimal':
            if self.minValue is None:
                self.minValue = 0.0
            if self.maxValue is None:
                # largest whole number that fits in the integer digits of the decimal
                self.maxValue = math.pow(10, ctype.precision - ctype.scale) - 1.0
            if self.step is None:
                self.step = 1.0

        # exact type match (not isinstance) so subclasses are not treated as their parent
        ctypeClass = type(ctype)

        # NOTE(review): Spark documents ShortType and ByteType as signed (max 32767 and 127);
        # the bounds asserted below are looser than that - confirm whether this is intentional.
        if self.maxValue is not None:
            if ctypeClass is ShortType:
                assert self.maxValue <= 65536, "`maxValue` must be in range of short"
            elif ctypeClass is ByteType:
                assert self.maxValue <= 256, "`maxValue` must be in range of byte (0 - 256)"

        if self.step is None:
            if ctypeClass in (DoubleType, FloatType):
                self.step = 1.0
            elif ctypeClass in (ByteType, ShortType, IntegerType, LongType):
                self.step = 1

    def getDiscreteRange(self):
        """Convert range to discrete range

        :returns: number of discrete values in range, computed as
                  `floor((maxValue - minValue) / step)`.
                  For example `NRange(1, 5, 0.5)` has 8 discrete values

        .. note::
           A range of 0, 4, 0.5 has 8 discrete values not 9 as the `maxValue` value is not part of the range

        The result is an `int` only when `minValue` and `maxValue` are both ints and `step` equals 1;
        otherwise it is a `float`.

        TODO: check range of values

        """
        if type(self.minValue) is int and type(self.maxValue) is int and self.step == 1:
            return self.maxValue - self.minValue

        # when any component is a float, we will return a float for the discrete range
        # to simplify computations
        return float(math.floor((self.maxValue - self.minValue) * (1.0 / self.step)))

    def getContinuousRange(self):
        """Convert range to continuous range

        :returns: float value for size of interval from `minValue` to `maxValue`
        """
        return (self.maxValue - self.minValue) * 1.0

    def getScale(self):
        """Get scale of range

        :returns: the largest number of fractional digits found among those of
                  `minValue`, `maxValue` and `step` that are set; 0 if none are set
        """
        scales = [self._precision_and_scale(value)[1]
                  for value in (self.minValue, self.maxValue, self.step)
                  if value is not None]

        # return maximum scale of components
        return max(scales, default=0)

    def _precision_and_scale(self, x):
        """Compute the decimal precision and scale of a number

        Only the first 14 significant digits are considered; the fractional
        part is rounded to fit within that budget.

        :param x: numeric value (int or float)
        :returns: tuple `(precision, scale)` where `scale` is the number of fractional digits
                  and `precision` is the number of integer digits plus `scale`
        """
        max_digits = 14
        int_part = int(abs(x))

        # count integer digits exactly; a floating point log10 can round up for
        # values just below a power of ten
        magnitude = len(str(int_part))
        if magnitude >= max_digits:
            return (magnitude, 0)

        frac_part = abs(x) - int_part
        multiplier = 10 ** (max_digits - magnitude)

        # the leading `multiplier` acts as a sentinel '1' digit so that leading
        # zeros of the fraction survive once it is turned into an integer
        frac_digits = multiplier + int(multiplier * frac_part + 0.5)

        # strip trailing zeros using integer arithmetic
        while frac_digits % 10 == 0:
            frac_digits //= 10

        # digits remaining after the sentinel are the significant fractional digits
        scale = len(str(frac_digits)) - 1
        return (magnitude + scale, scale)