Source code for dbldatagen.column_spec_options

# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
.. title::Column Spec Options

This file defines the `ColumnSpecOptions` class
"""

import copy

from .utils import ensure


[docs]class ColumnSpecOptions(object):
    """ Column spec options object - manages options for column specs.

    This class has limited functionality - mainly used to validate and document the options,
    and the class is meant for internal use only.

    :param props: Used to pass list of properties for column generation spec property checking.

    The following options are permitted on data generator `withColumn`, `withColumnSpec` and `withColumnSpecs` methods:

    :param name: Column name

    :param type: Data type of column. Can be either instance of Spark SQL Datatype such as `IntegerType()` \
                 or string containing SQL name of type

    :param minValue: Minimum value for range of generated value. \
                     As an alternative, you may use the `dataRange` parameter

    :param maxValue: Maximum value for range of generated value. \
                     As an alternative, you may use the `dataRange` parameter

    :param step: Step to use for range of generated value. As an alternative, you may use the `dataRange` parameter

    :param numColumns: generate `n` columns numbered from 1 .. n-1 with same definition

    :param numFeatures: generate `n` columns numbered from 0 .. n-1 with same definition. Alias for `numColumns`

    :param structType: If specified as "array" and used with numColumns / numFeatures, will combine columns as array

    :param random: If True, will generate random values for column value. Defaults to `False`

    :param baseColumn: Either the string name of the base column, or a list of columns to use to
                        control data generation. The option ``baseColumns`` is an alias for ``baseColumn``.

    :param baseColumnType: Determines how the value is derived from the base column. Possible values are 'auto',
                           'hash', 'raw_values', 'values'

    :param values: List of discrete values for the colummn. Discrete values for the column can be strings, numbers
                   or constants conforming to type of column

    :param weights: List of discrete weights for the colummn. Should be integer values.
                    For example, you might declare a column for status values with a weighted distribution with
                    the following statement:
                    `withColumn("status", StringType(), values=['online', 'offline', 'unknown'], weights=[3,2,1])`

    :param percentNulls: Specifies numeric percentage of generated values to be populated with SQL `null`.
                          Value is fraction representing percentage between 0.0 and 1.0
                          For example: `percentNulls=0.12` will give approximately 12% nulls for this field in the
                          output.

    :param uniqueValues: Number of unique values for column.
                          If the unique values are specified for a timestamp or date field, the values will be chosen
                          working back from the end of the previous month,
                          unless `begin`, `end` and `interval` parameters are specified

    :param begin: Beginning of range for date and timestamp fields.
                   For dates and timestamp fields, use the `begin`, `end` and `interval`
                   or `dataRange` parameters instead of `minValue`, `maxValue` and `step`

    :param end: End of range for date and timestamp fields.
                   For dates and timestamp fields, use the `begin`, `end` and `interval`
                   or `dataRange` parameters instead of `minValue`, `maxValue` and `step`

    :param interval: Interval of range for date and timestamp fields.
                   For dates and timestamp fields, use the `begin`, `end` and `interval`
                   or `dataRange` parameters instead of `minValue`, `maxValue` and `step`

    :param dataRange: An instance of an `NRange` or `DateRange` object. This can be used in place of `minValue`,
                       `maxValue`, `step` or `begin`, `end`, `interval`.

    :param template: template controlling how text should be generated

    :param textSeparator: string specifying separator to be used when constructing strings with prefix and suffix

    :param prefix: string specifying prefix text to construct field from prefix and numeric value. Both `prefix` and
                   `suffix` can be used together

    :param suffix: string specifying suffix text to construct field from suffix and numeric value. Both `prefix` and
                   `suffix` can be used together

    :param omit: if True, column is omitted from the output. Used to use column for interim effect only.

    :param expr: SQL expression to control data generation. Ignores column base value if present.

    :param implicit: Used by system to mark that column has been inferred from a schema.
                     Allows definition to be explicitly overridden.

    :param precision: Used for rounding to specific decimal layout.

    :param scale: Used for rounding to specific decimal layout.

    :param distribution: Distribution for random number. Ignored if column is not random.

    :param escapeSpecialChars: if True, require escape for all special chars in template

    When a column's value is derived from the value of another column, the `baseColumn` and `baseColumnType` options
    can be used to control how the value is derived. The `baseColumn` option can be used to specify the name of the
    base column, and the `baseColumnType` option can be used to specify how the value is derived from the base column.

    The following values are permitted for the `baseColumnType` option:

    - 'auto': Automatically determine the base column type based on the column type of the base column.
    - 'hash': Use a hash of the base column(s) value to derive the value of the new column.
    - 'raw_values': Use the raw values of the base column to derive the value of the new column.
    - 'values': Use the values of the base column to derive the value of the new column.

    The `baseColumn` option can be used to specify the name of the base column. If the `baseColumn` option is not
    specified, the value of the new column will be derived from the seed or `id` column.

    The `baseColumnType` option is optional. If it is not specified, the value of the new column will be derived
    based on the column type of the base column.

    The derivation from `raw_values` differs from `values` in that the `raw_values` option will use the raw values
    of the base column to derive the value of the new column, while the `values` option will use the values of the
    base column to derive the value of the new column after scaling to the range or implied range of the new column.

    For example a column with four categorical values , 'A', 'B', 'C', 'D' has an implied range of 0 .. 3.

    .. note::
        If the `dataRange` parameter is specified as well as the `minValue`, `maxValue` or `step`,
        the results are undetermined.

        For more information, see :doc:`/reference/api/dbldatagen.daterange`
        or :doc:`/reference/api/dbldatagen.nrange`.

    """

    #: the set of attributes that must be present for any columns
    _REQUIRED_PROPERTIES = {'name', 'type'}

    _PROPERTY_ALIASES = {
        'data_range': 'dataRange',
        'base_column': 'baseColumn',
        'base_column_type': 'baseColumnType',
        'base_columns': 'baseColumn',
        'baseColumns': 'baseColumn',
        'percent_nulls': 'percentNulls',
        'unique_values': 'uniqueValues',
        'random_seed_method': 'randomSeedMethod',
        'random_seed': 'randomSeed',
        'text_separator': 'textSeparator',

    }
    #: the set of attributes that are permitted for any call to data generator `withColumn` or `withColumnSpec`
    _ALLOWED_PROPERTIES = {'name', 'type', 'minValue', 'maxValue', 'step',
                           'prefix', 'random', 'distribution',
                           'range', 'baseColumn', 'baseColumnType', 'values',
                           'numColumns', 'numFeatures', 'structType',
                           'begin', 'end', 'interval', 'expr', 'omit',
                           'weights', 'description', 'continuous',
                           'percentNulls', 'template', 'format',
                           'uniqueValues', 'dataRange', 'text',
                           'precision', 'scale',
                           'randomSeedMethod', 'randomSeed',
                           'nullable', 'implicit', 'escapeSpecialChars',
                           'suffix', 'textSeparator'
                           }

    #: the set of disallowed column attributes for any call to data generator `withColumn` or `withColumnSpec`
    _FORBIDDEN_PROPERTIES = {
        'range'
    }

    #: maxValue values for each column type, only if where value is intentionally restricted
    _MAX_TYPE_RANGE = {
        'byte': 256,
        'short': 65536,
        'int': 4294967296
    }

    def __init__(self, props, aliases=None):  # TODO: check if additional options are needed here as `**kwArgs`
        self._options = props

        self._aliases = aliases if aliases is not None else self._PROPERTY_ALIASES

        # translate aliases
        # need to copy options dictionary as you cant directly change a
        # dictionary that you are iterating over
        updated_options = copy.copy(props)
        for k in props.keys():
            if k in self._aliases:
                v = props[k]
                alias_name = self._aliases[k]
                updated_options[alias_name] = v
                del updated_options[k]

        self._options = updated_options

    @property
    def options(self):
        """ Get options dictionary for object

            :return: options dictionary for object

        """
        return self._options

[docs]    def getOrElse(self, key, default=None):
        """ Get val for key if it exists or else return default"""
        assert key is not None, "key must be valid key string"

        if key in self._options:
            return self._options.get(key, default)
        if key in self._aliases:
            return self._options.get(self._aliases[key], default)
        return default

    def __getitem__(self, key):
        """ implement the built in dereference by key behavior """
        ensure(key is not None, "key should be non-empty")
        return self._options.get(key, None)

[docs]    def checkBoolOption(self, v, name=None, optional=True):
        """ Check that option is either not specified or of type boolean

        :param v: value to test
        :param name: name of value to use in any reported errors or exceptions
        :param optional: If True (default), indicates that value is optional and
                         that `None` is a valid value for the option
        """
        assert name is not None, "`name` must be specified"
        if optional:
            ensure(v is None or type(v) is bool,
                   f"Option `{name}` must be boolean if specified - value: {v}, type: {type(v)}")
        else:
            ensure(type(v) is bool,
                   f"Option `{name}` must be boolean  - value: {v}, type: {type(v)}")

[docs]    def checkExclusiveOptions(self, options):
        """check if the options are exclusive - i.e only one is not None

        :param options: list of options that will be mutually exclusive
        """
        assert options is not None, "options must be non empty"
        assert type(options) is list, "`options` must be list"
        assert len([self[x] for x in options if self[x] is not None]) <= 1, \
            f" only one of of the options: {options} may be specified "

[docs]    def checkOptionValues(self, option, option_values):
        """check if option value is in list of values

        :param option: list of options that will be mutually exclusive
        :param option_values: list of possible option values that will be mutually exclusive
        """
        assert option is not None and len(option.strip()) > 0, "option must be non empty"
        assert type(option_values) is list, "`option_values` must be list"
        assert self[option] in option_values, f"option: `{option}` must have one of the values {option_values}"

[docs]    def checkValidColumnProperties(self, columnProps):
        """
            check that column definition properties are recognized
            and that the column definition has required properties

            :param columnProps:
        """
        ensure(columnProps is not None, "columnProps should be non-empty")

        col_type = self['type']
        if col_type.typeName() in self._MAX_TYPE_RANGE:
            minValue = self['minValue']
            maxValue = self['maxValue']

            if minValue is not None and maxValue is not None:
                effective_range = maxValue - minValue
                if effective_range > self._MAX_TYPE_RANGE[col_type.typeName()]:
                    raise ValueError("Effective range greater than range of type")

        for k in columnProps.keys():
            ensure(k in ColumnSpecOptions._ALLOWED_PROPERTIES or k in ColumnSpecOptions._PROPERTY_ALIASES,
                   f"invalid column option {k}")

        for arg in self._REQUIRED_PROPERTIES:
            ensure(columnProps.get(arg) is not None, f"missing column option {arg}")

        for arg in self._FORBIDDEN_PROPERTIES:
            ensure(arg not in columnProps, f"forbidden column option {arg}")

        # check weights and values
        if 'weights' in columnProps:
            ensure('values' in columnProps,
                   f"weights are only allowed for columns with values - column '{columnProps['name']}' ")
            ensure(columnProps['values'] is not None and len(columnProps['values']) > 0,
                   f"weights must be associated with non-empty list of values - column '{columnProps['name']}' ")
            ensure(len(columnProps['values']) == len(columnProps['weights']),
                   f"length(list of weights) must equal length(list of values) - column '{columnProps['name']}' ")