Source code for dbldatagen.column_spec_options

# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
.. title::Column Spec Options

This file defines the `ColumnSpecOptions` class
"""

import copy

from .utils import ensure


class ColumnSpecOptions(object):
    """ Column spec options object - manages options for column specs.

    This class has limited functionality - mainly used to validate and document the options,
    and the class is meant for internal use only.

    :param props: Used to pass list of properties for column generation spec property checking.
    :param aliases: Optional mapping of alternative option names to canonical option names.
                    When not supplied, the class level `_PROPERTY_ALIASES` mapping is used.

    The following options are permitted on data generator `withColumn`, `withColumnSpec` and
    `withColumnSpecs` methods:

    :param name: Column name

    :param type: Data type of column. Can be either instance of Spark SQL Datatype such as `IntegerType()`
                 or string containing SQL name of type

    :param minValue: Minimum value for range of generated value.
                     As an alternative, you may use the `dataRange` parameter

    :param maxValue: Maximum value for range of generated value.
                     As an alternative, you may use the `dataRange` parameter

    :param step: Step to use for range of generated value.
                 As an alternative, you may use the `dataRange` parameter

    :param numColumns: generate `n` columns numbered from 1 .. n-1 with same definition

    :param numFeatures: generate `n` columns numbered from 0 .. n-1 with same definition.
                        Alias for `numColumns`

    :param structType: If specified as "array" and used with numColumns / numFeatures,
                       will combine columns as array

    :param random: If True, will generate random values for column value. Defaults to `False`

    :param baseColumn: Either the string name of the base column, or a list of columns to use to
                       control data generation. The option ``baseColumns`` is an alias for ``baseColumn``.

    :param values: List of discrete values for the column. Discrete values for the column can be strings,
                   numbers or constants conforming to type of column

    :param weights: List of discrete weights for the column. Should be integer values.
                    For example, you might declare a column for status values with a weighted
                    distribution with the following statement:
                    `withColumn("status", StringType(), values=['online', 'offline', 'unknown'], weights=[3,2,1])`

    :param percentNulls: Specifies numeric percentage of generated values to be populated with SQL `null`.
                         Value is fraction representing percentage between 0.0 and 1.0
                         For example: `percentNulls=0.12` will give approximately 12% nulls for this field
                         in the output.

    :param uniqueValues: Number of unique values for column.
                         If the unique values are specified for a timestamp or date field, the values will
                         be chosen working back from the end of the previous month,
                         unless `begin`, `end` and `interval` parameters are specified

    :param begin: Beginning of range for date and timestamp fields.
                  For dates and timestamp fields, use the `begin`, `end` and `interval`
                  or `dataRange` parameters instead of `minValue`, `maxValue` and `step`

    :param end: End of range for date and timestamp fields.
                For dates and timestamp fields, use the `begin`, `end` and `interval`
                or `dataRange` parameters instead of `minValue`, `maxValue` and `step`

    :param interval: Interval of range for date and timestamp fields.
                     For dates and timestamp fields, use the `begin`, `end` and `interval`
                     or `dataRange` parameters instead of `minValue`, `maxValue` and `step`

    :param dataRange: An instance of an `NRange` or `DateRange` object. This can be used in place of
                      `minValue`, `maxValue`, `step` or `begin`, `end`, `interval`.

    :param template: template controlling how text should be generated

    :param textSeparator: string specifying separator to be used when constructing strings with
                          prefix and suffix

    :param prefix: string specifying prefix text to construct field from prefix and numeric value.
                   Both `prefix` and `suffix` can be used together

    :param suffix: string specifying suffix text to construct field from suffix and numeric value.
                   Both `prefix` and `suffix` can be used together

    :param omit: if True, column is omitted from the output. Used to use column for interim effect only.

    :param expr: SQL expression to control data generation. Ignores column base value if present.

    :param implicit: Used by system to mark that column has been inferred from a schema.
                     Allows definition to be explicitly overridden.

    :param precision: Used for rounding to specific decimal layout.

    :param scale: Used for rounding to specific decimal layout.

    :param distribution: Distribution for random number. Ignored if column is not random.

    :param escapeSpecialChars: if True, require escape for all special chars in template

    .. note::
        If the `dataRange` parameter is specified as well as the `minValue`, `maxValue` or `step`,
        the results are undetermined.

        For more information, see :doc:`/reference/api/dbldatagen.daterange`
        or :doc:`/reference/api/dbldatagen.nrange`.

    """

    #: the set of attributes that must be present for any columns
    _REQUIRED_PROPERTIES = {'name', 'type'}

    #: mapping of alternative option names to the canonical option name they are stored under
    _PROPERTY_ALIASES = {
        'data_range': 'dataRange',
        'base_column': 'baseColumn',
        'base_column_type': 'baseColumnType',
        'base_columns': 'baseColumn',
        'baseColumns': 'baseColumn',
        'percent_nulls': 'percentNulls',
        'unique_values': 'uniqueValues',
        'random_seed_method': 'randomSeedMethod',
        'random_seed': 'randomSeed',
        'text_separator': 'textSeparator',
    }

    #: the set of attributes that are permitted for any call to data generator `withColumn` or `withColumnSpec`
    _ALLOWED_PROPERTIES = {'name', 'type', 'minValue', 'maxValue', 'step',
                           'prefix', 'random', 'distribution',
                           'range', 'baseColumn', 'baseColumnType', 'values',
                           'numColumns', 'numFeatures', 'structType',
                           'begin', 'end', 'interval', 'expr', 'omit',
                           'weights', 'description', 'continuous',
                           'percentNulls', 'template', 'format',
                           'uniqueValues', 'dataRange', 'text',
                           'precision', 'scale',
                           'randomSeedMethod', 'randomSeed',
                           'nullable', 'implicit', 'escapeSpecialChars',
                           'suffix', 'textSeparator'
                           }

    #: the set of disallowed column attributes for any call to data generator `withColumn` or `withColumnSpec`
    _FORBIDDEN_PROPERTIES = {
        'range'
    }

    #: maxValue values for each column type, only if where value is intentionally restricted
    # NOTE(review): keys are compared with `typeName()` of the column type. PySpark's
    # `IntegerType.typeName()` appears to return 'integer' rather than 'int', in which case
    # the 'int' entry would never match - confirm against the PySpark version in use.
    _MAX_TYPE_RANGE = {
        'byte': 256,
        'short': 65536,
        'int': 4294967296
    }

    def __init__(self, props, aliases=None):
        # TODO: check if additional options are needed here as `**kwArgs`
        self._aliases = aliases if aliases is not None else self._PROPERTY_ALIASES

        # Translate aliases on a shallow copy: the caller's dictionary is left untouched and
        # the dictionary being iterated over is never the one being modified.
        updated_options = copy.copy(props)
        for k in props.keys():
            if k in self._aliases:
                v = props[k]
                alias_name = self._aliases[k]
                updated_options[alias_name] = v
                del updated_options[k]

        self._options = updated_options

    @staticmethod
    def _require(condition, message):
        """ Raise `AssertionError` with `message` when `condition` is false.

        Used instead of the `assert` statement so that argument validation is still performed
        when Python runs with optimizations enabled (`-O`), while keeping the exception type
        that the `assert` statements raised.
        """
        if not condition:
            raise AssertionError(message)

    @property
    def options(self):
        """ Get options dictionary for object

            :return: options dictionary for object

        """
        return self._options

    def getOrElse(self, key, default=None):
        """ Get val for key if it exists or else return default

        The key is looked up as given first; if it is not present but is a known alias,
        the canonical name for the alias is looked up instead.

        :param key: option name (or alias of an option name) to look up
        :param default: value returned when neither the key nor its canonical name is present
        :return: the option value, or `default`
        :raises AssertionError: if `key` is None
        """
        self._require(key is not None, "key must be valid key string")

        if key in self._options:
            return self._options[key]

        if key in self._aliases:
            return self._options.get(self._aliases[key], default)

        return default

    def __getitem__(self, key):
        """ implement the built in dereference by key behavior

        Returns `None` when the key is not present. Aliases are not resolved here.
        """
        ensure(key is not None, "key should be non-empty")
        return self._options.get(key, None)

    def checkBoolOption(self, v, name=None, optional=True):
        """ Check that option is either not specified or of type boolean

        :param v: value to test
        :param name: name of value to use in any reported errors or exceptions
        :param optional: If True (default), indicates that value is optional and
                         that `None` is a valid value for the option
        :raises AssertionError: if `name` is None
        """
        self._require(name is not None, "`name` must be specified")
        if optional:
            ensure(v is None or isinstance(v, bool),
                   f"Option `{name}` must be boolean if specified - value: {v}, type: {type(v)}")
        else:
            ensure(isinstance(v, bool),
                   f"Option `{name}` must be boolean - value: {v}, type: {type(v)}")

    def checkExclusiveOptions(self, options):
        """check if the options are exclusive - i.e only one is not None

        :param options: list of options that will be mutually exclusive
        :raises AssertionError: if `options` is not a list, or more than one of the named
                                options has a value other than None
        """
        self._require(options is not None, "options must be non empty")
        self._require(isinstance(options, list), "`options` must be list")

        specified_count = sum(1 for x in options if self[x] is not None)
        self._require(specified_count <= 1,
                      f" only one of of the options: {options} may be specified ")

    def checkOptionValues(self, option, option_values):
        """check if option value is in list of values

        :param option: name of the option whose current value is to be checked
        :param option_values: list of values that are permitted for the option
        :raises AssertionError: if `option` is empty, `option_values` is not a list, or the
                                current value of the option is not one of `option_values`
        """
        self._require(option is not None and len(option.strip()) > 0, "option must be non empty")
        self._require(isinstance(option_values, list), "`option_values` must be list")
        self._require(self[option] in option_values,
                      f"option: `{option}` must have one of the values {option_values}")

    def checkValidColumnProperties(self, columnProps):
        """
            check that column definition properties are recognized
            and that the column definition has required properties

        :param columnProps: dictionary of column definition properties to validate
        :raises ValueError: if `maxValue - minValue` exceeds the limit recorded for the column type
        """
        ensure(columnProps is not None, "columnProps should be non-empty")

        # The type may be absent, or may be a SQL type name string rather than a datatype
        # object. Only objects exposing `typeName()` take part in the range check, so a
        # missing type is reported by the required-property check below instead of failing
        # here with an AttributeError.
        col_type = self['type']
        type_name = col_type.typeName() if hasattr(col_type, 'typeName') else None

        if type_name in self._MAX_TYPE_RANGE:
            minValue = self['minValue']
            maxValue = self['maxValue']

            if minValue is not None and maxValue is not None:
                effective_range = maxValue - minValue
                if effective_range > self._MAX_TYPE_RANGE[type_name]:
                    raise ValueError("Effective range greater than range of type")

        for k in columnProps.keys():
            ensure(k in ColumnSpecOptions._ALLOWED_PROPERTIES or k in ColumnSpecOptions._PROPERTY_ALIASES,
                   f"invalid column option {k}")

        for arg in self._REQUIRED_PROPERTIES:
            ensure(columnProps.get(arg) is not None, f"missing column option {arg}")

        for arg in self._FORBIDDEN_PROPERTIES:
            ensure(arg not in columnProps, f"forbidden column option {arg}")

        # check weights and values
        if 'weights' in columnProps:
            ensure('values' in columnProps,
                   f"weights are only allowed for columns with values - column '{columnProps['name']}' ")
            ensure(columnProps['values'] is not None and len(columnProps['values']) > 0,
                   f"weights must be associated with non-empty list of values - column '{columnProps['name']}' ")
            ensure(len(columnProps['values']) == len(columnProps['weights']),
                   f"length(list of weights) must equal length(list of values) - column '{columnProps['name']}' ")