# Source code for dbldatagen.utils

# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This file defines the `DataGenError` class and general-purpose utility functions.

These are meant for internal use only.
"""

import functools
import json
import re
import time
import warnings
from datetime import timedelta

import jmespath


def deprecated(message=""):
    """
    Define a deprecated decorator without dependencies on 3rd party libraries

    Note there is a 3rd party library called `deprecated` that provides this feature but goal is to only have
    dependencies on packages already used in the Databricks runtime

    Each call of the decorated function emits a `DeprecationWarning` and then installs a ``'default'`` filter
    for `DeprecationWarning`, before delegating to the wrapped function.

    :param message: additional text appended to the deprecation warning
    :returns: decorator that wraps a function or method so that calling it issues the warning
    """

    # create closure around function that follows use of the decorator
    def deprecated_decorator(func):
        @functools.wraps(func)
        def deprecated_func(*args, **kwargs):
            # stacklevel=2 attributes the warning to the code calling the deprecated function,
            # rather than to this wrapper, so the reported file and line are actionable
            warnings.warn(f"`{func.__name__}` is a deprecated function or method. \n{message}",
                          category=DeprecationWarning, stacklevel=2)
            # NOTE: this changes the process-wide warning filters on every call
            warnings.simplefilter('default', DeprecationWarning)
            return func(*args, **kwargs)

        return deprecated_func

    return deprecated_decorator


class DataGenError(Exception):
    """Exception raised to signal a problem during data generation

        :param msg: human readable description of the error
        :param baseException: optional underlying exception that led to this error
    """

    def __init__(self, msg, baseException=None):
        """ Store the message and the optional underlying exception
        """
        super().__init__(msg)
        self._msg = msg
        self._underlyingException = baseException

    def __repr__(self):
        # same rendering as `__str__`: message plus the underlying exception (or None)
        return "DataGenError(msg='{}', baseException={})".format(self._msg, self._underlyingException)

    def __str__(self):
        return "DataGenError(msg='{}', baseException={})".format(self._msg, self._underlyingException)


def coalesce_values(*args):
    """Return the first supplied argument that is not `None`

    Falsy values such as `0`, `""` or `[]` count as valid values; only `None` is skipped.

    :param args: candidate values, examined in the order given
    :returns: the first argument that is not `None`, or `None` if there is no such argument
    """
    return next((candidate for candidate in args if candidate is not None), None)


def ensure(cond, msg="condition does not hold true"):
    """Raise a `DataGenError` carrying `msg` unless `cond` is truthy

    Before the exception is raised, margin markers in the message (a newline followed by optional
    spaces or tabs and a `|`) are replaced by a plain newline.

    :param cond: condition to test
    :param msg: message for the exception raised when the condition does not hold
    :raises: `DataGenError` exception if condition does not hold true
    :returns: nothing; the function only has an effect when the condition does not hold
    """
    if cond:
        return

    # drop the indentation and `|` marker that follow each line break in the message
    message_without_margins = re.sub(r'\n[ \t]*\|', '\n', msg)
    raise DataGenError(message_without_margins)


def mkBoundsList(x, default):
    """ Build a two element bounds list from the supplied value, falling back to a default

        :param x: integer, or iterable of 2 values, defining the bounds; `None` selects the default
        :param default: integer or iterable used to build the bounds when `x` is `None`
        :returns: tuple `(used_default, bounds)` where `used_default` is `True` only when `x` was `None`
                  and `bounds` is a list. An integer is expanded to `[value, value]`.
        :raises: `AssertionError` if `x` is an iterable that does not contain exactly 2 values
    """
    if x is None:
        # note the default is not length checked when it is supplied as an iterable
        if type(default) is int:
            return True, [default, default]
        return True, list(default)

    if type(x) is int:
        # a single integer is used as both bounds
        return False, [x, x]

    bounds = list(x)
    assert len(bounds) == 2, "bounds list must be of length 2"
    return False, bounds


def topologicalSort(sources, initial_columns=None, flatten=True):
    """ Order names so that every name appears after the names it depends on

    Used to compute the column test data generation order of the column generation dependencies.

    The column generation dependencies are based on the value of the `baseColumn` attribute for `withColumn` or
    `withColumnSpec` statements in the data generator specification.

    :arg sources: list of ``(name, set(names of dependencies))`` pairs
    :arg initial_columns: names treated as already available; they form the first build phase
    :arg flatten: if true, return a single flat list of names instead of a list of build phases
    :returns: list of names in dependency order, grouped into build phases unless `flatten` is true
    :raises: `ValueError` if a pass over the pending entries makes no progress (cyclic or missing dependency)

    .. note::
       The algorithm will give preference to retaining order of inbound sequence
       over modifying order to produce a lower number of build phases.

       Overall the effect is that the input build order should be retained unless there are forward references
    """
    # copy the dependency sets so the caller's data is never modified
    remaining = [(col, set(col_deps)) for col, col_deps in sources]

    if initial_columns is None:
        resolved = []
        phases = []
    else:
        resolved = initial_columns[:]
        phases = [initial_columns]

    while remaining:
        carried_over = []
        phase = []
        progressed = False
        deferring = False

        for col, col_deps in remaining:
            # forget dependencies satisfied by earlier phases
            col_deps.difference_update(resolved)

            if col_deps:
                carried_over.append((col, set(col_deps)))

                # everything still needed is produced in this phase, so entries that follow
                # must wait for the next phase to keep the inbound order
                if not col_deps.difference(phase):
                    deferring = True
            elif deferring:
                carried_over.append((col, set(col_deps)))
            elif col in resolved:
                # already available from an earlier phase - nothing to emit
                progressed = True
            else:
                phase.append(col)
                progressed = True

        resolved.extend(phase)
        phases.append(phase)

        if not progressed:
            raise ValueError(f"cyclic or missing dependency detected [{carried_over}]")

        remaining = carried_over

    if not flatten:
        return phases

    return [col for build_phase in phases for col in build_phase]


PATTERN_NAME_EQUALS_VALUE = re.compile(r"(\w+)\s*\=\s*([0-9]+)")
PATTERN_VALUE_SPACE_NAME = re.compile(r"([0-9]+)\s+(\w+)")
_WEEKS_PER_YEAR = 52

# maps each accepted (lower case) unit name to the canonical unit it contributes to
_TIME_UNIT_ALIASES = {
    "years": "years", "year": "years",
    "weeks": "weeks", "week": "weeks",
    "days": "days", "day": "days",
    "hours": "hours", "hour": "hours",
    "minutes": "minutes", "minute": "minutes",
    "seconds": "seconds", "second": "seconds",
    "milliseconds": "milliseconds", "millisecond": "milliseconds",
    "microseconds": "microseconds", "microsecond": "microseconds",
}


def parse_time_interval(spec):
    """Parse a time interval from a string

    The specification is a comma separated list of elements, each written either as ``name=value``
    (for example ``hours=12``) or as ``value name`` (for example ``12 hours``). Supported units, in
    singular or plural form and in any letter case, are years, weeks, days, hours, minutes, seconds,
    milliseconds and microseconds.

    If a unit is given more than once the last value wins. Elements with an unrecognized unit name
    are ignored. A year is counted as 52 weeks.

    :param spec: string containing the time interval specification
    :returns: `timedelta` for the parsed interval
    :raises: `AssertionError` if `spec` is `None`; `IndexError` if an element matches neither syntax
    """
    assert spec is not None, "Must have valid time interval specification"

    amounts = dict.fromkeys(
        ("years", "weeks", "days", "hours", "minutes", "seconds", "milliseconds", "microseconds"), 0)

    for ts in (element.strip() for element in spec.strip().split(",")):
        # allow both 'days=1' and '1 day' syntax
        named = PATTERN_NAME_EQUALS_VALUE.findall(ts)
        if named:
            time_type, raw_value = named[0]
        else:
            # indexing raises IndexError when the element matches neither syntax
            raw_value, time_type = PATTERN_VALUE_SPACE_NAME.findall(ts)[0]

        unit = _TIME_UNIT_ALIASES.get(time_type.lower())
        if unit is not None:
            amounts[unit] = int(raw_value)

    # timedelta has no notion of years, so fold them into weeks
    years = amounts.pop("years")
    amounts["weeks"] += years * _WEEKS_PER_YEAR

    return timedelta(**amounts)


def strip_margins(s, marginChar):
    r"""
    Remove the margin from each line of a string, in the spirit of the Scala stripMargin method

    Every line that contains `marginChar` loses all characters up to the first occurrence of the marker
    and one further character at the marker position. Lines without the marker are left unchanged.
    Used to control the formatting of generated text

    `strip_margins("one\n    |two\n    |three", '|')`

    will produce

    ``
    one
    two
    three
    ``

    :param s: string (potentially multiline) to strip margins from
    :param marginChar: marker whose first occurrence on a line ends the margin
    :return: modified string
    """
    assert s is not None and type(s) is str
    assert marginChar is not None and type(marginChar) is str

    def without_margin(line):
        marker_pos = line.find(marginChar)
        if marker_pos < 0:
            return line
        return line[marker_pos + 1:]

    return '\n'.join(without_margin(line) for line in s.split('\n'))


def split_list_matching_condition(lst, cond):
    """
    Split a list on elements that match a condition

    This will find all matches of a specific condition in the list and split the list into sub lists around the
    element that matches this condition. Each matching element is returned as a sub list of its own, and each
    run of consecutive non-matching elements is returned as one sub list. Empty sub lists are never returned.

    It will handle multiple matches performing splits on each match.

    For example, the following code will produce the results below:

    x = ['id', 'city_name', 'id', 'city_id', 'city_pop', 'id', 'city_id', 'city_pop','city_id', 'city_pop','id']
    split_list_matching_condition(x, lambda el: el == 'id')

    Result:
    `[['id'], ['city_name'], ['id'], ['city_id', 'city_pop'],
      ['id'], ['city_id', 'city_pop', 'city_id', 'city_pop'], ['id']]`

    :arg lst: list of items to perform condition matches against. `None` is treated as an empty list
    :arg cond: lambda function or function taking single argument and returning True or False
    :returns: list of sublists
    """
    if lst is None:
        return []

    retval = []
    run_start = 0  # index of the first element of the current run of non-matching elements

    # a single pass avoids recursion, whose depth would grow with the number of matches
    for ix, el in enumerate(lst):
        if cond(el):
            if ix > run_start:
                retval.append(lst[run_start:ix])
            retval.append(lst[ix:ix + 1])
            run_start = ix + 1

    # trailing run of non-matching elements, if any
    if run_start < len(lst):
        retval.append(lst[run_start:])

    return retval


def json_value_from_path(searchPath, jsonData, defaultValue):
    """ Look up a value inside JSON text using a search path

    searchPath should be a JSON path as supported by the `jmespath` package
    (see https://jmespath.org/)

    :param searchPath: A `jmespath` compatible JSON search path
    :param jsonData: The json data to search (string representation of the JSON data)
    :param defaultValue: The value to return when the search produces `None`
    :return: Returns the value found by the search, or the default value if the search result is `None`
    """
    assert searchPath is not None and len(searchPath) > 0, "search path cannot be empty"
    assert jsonData is not None and len(jsonData) > 0, "JSON data cannot be empty"

    # parse first so that malformed JSON is reported before any search is attempted
    parsed = json.loads(jsonData)
    found = jmespath.search(searchPath, parsed)

    return defaultValue if found is None else found


def system_time_millis():
    """ return system time as milliseconds since start of epoch

    :return: system time in milliseconds, rounded to the nearest integer
    """
    # time.time() reports seconds, so scale up (not down) to get milliseconds
    curr_time = round(time.time() * 1000)
    return curr_time