# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
This file defines the text generator plugin class `PyfuncText`
"""
import importlib
import logging
from .text_generators import TextGenerator
from .utils import DataGenError
class PyfuncText(TextGenerator):  # lgtm [py/missing-equals]
    """ Text generator that supports generating text from arbitrary Python function

    :param fn: function to call to generate text.
    :param init: function to call to initialize context
    :param initPerBatch: if init per batch is set to True, initialization of context is performed on every Pandas udf
                         call. Default is False.
    :param name: String representing name of text generator when converted to string via ``repr`` or ``str``
    :param rootProperty: optional name of a context attribute. If set, the value of that attribute is passed
                         to ``fn`` instead of the context and column value.

    The two functions define the plugin model

    The first function, ``fn`` is called whenever text should be generated for a single column of a single row

    It is called with the signature ``fn(context, value)`` unless a root property is set, in which the signature is
    ``fn(rootProperty)`` with rootProperty having the value of the root property of the context.

    Context is the stored context containing instances of random number generators, 3rd party
    client library objects etc.

    The ``init`` function is called to initialize the function call context. The plugin code can store arbitrary
    properties in the context following normal Python object rules.

    The context is initialized with the property `textGenerator` prior to being initialized which is a reference to the
    enclosing text generator.

    .. note::
              There are no expectations of repeatability of data generation when using external code
              or external libraries to generate text.

              However, custom code can call the base class method to get a Numpy random
              number generator instance. This will have been seeded using the ``dbldatagen``
              random number seed if one was specified, so random numbers generated from this will be repeatable.

              The custom code may call the property ``randomSeed`` on the text generator object to get the random seed
              which may be used to seed library specific initialization.

              This random seed property may have the values ``None`` or ``-1`` which should be treated as meaning dont
              use a random seed.

              The code does not guarantee thread or cross process safety. If a new instance of the random number
              generator is needed, you may call the base class method with the argument `forceNewInstance` set to True.
    """

    class _FnCallContext:
        """ inner class to support storage of context between calls

        initial instances of random number generators, clients for services etc here during execution
        of the ``init`` function calls

        :param txtGen: - reference to outer PyfuncText object
        """

        def __init__(self, txtGen):
            self.textGenerator = txtGen

    def __init__(self, fn, init=None, initPerBatch=False, name=None, rootProperty=None):
        super().__init__()

        # `callable(None)` is False, so this single check rejects both a missing function
        # and a non-callable value (the previous `is not None or callable` let any non-None value through)
        assert callable(fn), "Function must be provided wiith signature fn(context, oldValue)"
        assert init is None or callable(init), "Init function must be a callable function or lambda if passed"

        # if root property is provided, root property will be passed to generate text function
        self._rootProperty = rootProperty

        self._pyFn = fn  # generate text function
        self._initFn = init  # context initialization function
        self._context = None  # context used to hold library root object and other properties

        # if init per batch is True, initialization of context will be per UDF call
        assert initPerBatch in [True, False], "initPerBatch must evaluate to boolean True or False"
        self._initPerBatch = initPerBatch

        self._name = name if name is not None else "PyfuncText"

    def __str__(self):
        """ Get string representation of object

        ``name`` property is used to provide user friendly name for text generator
        """
        return f"{self._name}({self._pyFn!r}, init={self._initFn})"

    def _getContext(self, forceNewInstance=False):
        """ Get the context for plugin function calls

        :param forceNewInstance: if True, forces each call to create a new context. The new context is
                                 returned to the caller but is not stored on the text generator.
        :return: existing or newly created context.
        """
        # reuse the stored context unless a fresh one was explicitly requested
        if self._context is not None and not forceNewInstance:
            return self._context

        context = PyfuncText._FnCallContext(self)

        # init context using context creator if any provided
        if self._initFn is not None:
            self._initFn(context)

        # save context for later use unless forceNewInstance is set
        if not forceNewInstance:
            self._context = context

        return context

    def pandasGenerateText(self, v):
        """ Called to generate text via Pandas UDF mechanism

        :param v: base value of column as Pandas Series
        :return: result of applying the text generation function to each element of ``v``
        """
        # save object properties in local vars to avoid overhead of object dereferences
        # on every call
        context = self._getContext(self._initPerBatch)
        evalFn = self._pyFn
        rootProperty = getattr(context, self._rootProperty) if self._rootProperty is not None else None

        # define functions to call with context and with root property
        def _valueFromFn(originalValue):
            return evalFn(context, originalValue)

        def _valueFromFnWithRoot(originalValue):  # pylint: disable=unused-argument
            # the root-property form of the plugin function does not receive the column value
            return evalFn(rootProperty)

        # the root-property form is only used when the root property resolved to a non-None value
        if rootProperty is not None:
            return v.apply(_valueFromFnWithRoot)
        return v.apply(_valueFromFn)
class PyfuncTextFactory:
    """PyfuncTextFactory applies syntactic wrapping around creation of PyfuncText objects

    :param name: name of generated object (when converted to string via ``str``)

    It allows the use of the following constructs:

    .. code-block:: python

       # initialization (for Faker for example)

       # setup use of Faker
       def initFaker(ctx):
         ctx.faker = Faker(locale="en_US")
         ctx.faker.add_provider(internet)

       FakerText = (PyfuncTextFactory(name="FakerText")
                   .withInit(initFaker)        # determines how context should be initialized
                   .withRootProperty("faker")  # determines what context property is passed to fn
                   )

       # later use ...
       .withColumn("fake_name", text=FakerText("name") )
       .withColumn("fake_sentence", text=FakerText("sentence", ext_word_list=my_word_list) )

       # translates to generation of lambda function with keyword arguments
       # or without as needed
       .withColumn("fake_name",
                   text=PyfuncText( (lambda faker: faker.name( )),
                                    init=initFaker,
                                    rootProperty="faker",
                                    name="FakerText"))
       .withColumn("fake_sentence",
                   text=PyfuncText( (lambda faker:
                                       faker.sentence( **{ "ext_word_list" : my_word_list} )),
                                    init=initFaker,
                                    rootProperty="faker",
                                    name="FakerText"))

    """

    def __init__(self, name=None):
        """

        :param name: name of generated object (when converted to string via ``str``)

        """
        self._initFn = None
        self._rootProperty = None
        self._name = "PyfuncText" if name is None else name
        self._initPerBatch = False

    def withInit(self, fn):
        """ Specifies context initialization function

        :param fn: function pointer or lambda function for initialization
                   signature should ``initFunction(context)``
        :return: this factory, to allow call chaining

        .. note::
           This variation initializes the context once per worker process per text generator
           instance.
        """
        self._initFn = fn
        return self

    def withInitPerBatch(self, fn):
        """ Specifies context initialization function

        :param fn: function pointer or lambda function for initialization
                   signature should ``initFunction(context)``
        :return: this factory, to allow call chaining

        .. note::
           This variation initializes the context once per internal pandas UDF call.
           The UDF call will be called once per 10,000 rows if system is configured using defaults.
           Setting the pandas batch size as an argument to the DataSpec creation will change the default
           batch size.
        """
        self._initPerBatch = True
        return self.withInit(fn)

    def withRootProperty(self, prop):
        """ If called, specifies the property of the context to be passed to the text generation function.
        If not called, the context object itself will be passed to the text generation function.

        :param prop: name of the context attribute to pass to the text generation function
        :return: this factory, to allow call chaining
        """
        self._rootProperty = prop
        return self

    def __call__(self, evalFn, *args, isProperty=False, **kwargs):
        """ Internal function call mechanism that implements the syntax expansion

        :param evalFn: text generation function or lambda, or the string name of a method or property
                       of the root property object
        :param args: optional args to be passed by position (only used when ``evalFn`` is a string)
        :param kwargs: optional keyword args following Python keyword passing mechanism
                       (only used when ``evalFn`` is a string)
        :param isProperty: if true, interpret evalFn as string name of property, not a function or method
        :return: ``PyfuncText`` instance configured with this factory's init function, name,
                 root property and init-per-batch setting
        """
        assert evalFn is not None and (isinstance(evalFn, str) or callable(evalFn)), "Function must be provided"

        if isinstance(evalFn, str):
            assert self._rootProperty is not None and len(self._rootProperty.strip()) > 0, \
                "string named functions can only be used on text generators with root property"
            fnName = evalFn

            if isProperty:
                # property access takes no arguments, so supplying any is a usage error
                assert not args and not kwargs, "isProperty cannot be true if using arguments"
                evalFn = lambda root: getattr(root, fnName)  # noqa: E731
            else:
                # unpacking empty args / kwargs is a plain no-argument call, so a single
                # lambda covers the positional, keyword, mixed and no-argument cases
                evalFn = lambda root: getattr(root, fnName)(*args, **kwargs)  # noqa: E731

        # returns the actual PyfuncText text generator object.
        # Note all syntax expansion is performed once only
        # initPerBatch must be forwarded, otherwise `withInitPerBatch` has no effect on the generator
        return PyfuncText(evalFn, init=self._initFn, initPerBatch=self._initPerBatch,
                          name=self._name, rootProperty=self._rootProperty)
class FakerTextFactory(PyfuncTextFactory):
    """ Factory object for Faker text generator flavored ``PyfuncText`` objects

    :param locale: locale string or list of locales. If not supplied, the root object is constructed without
                   a locale argument so the library's own default applies
    :param providers: list of providers
    :param name: name of generated objects. Defaults to ``FakerText``
    :param lib: library import name of Faker library. If none passed, uses ``faker``
    :param rootClass: name of root object class If none passed, uses ``Faker``

    .. note::
       Both the library name and root object class can be overridden - this is primarily for internal
       testing purposes.
    """

    _FAKER_LIB = "faker"

    # lazily created shared factory returned by `_getDefaultFactory`
    _defaultFakerTextFactory = None

    def __init__(self, locale=None, providers=None, name="FakerText", lib=None,
                 rootClass=None):

        super().__init__(name)

        # set up the logger
        self._logger = logging.getLogger("FakerTextFactory")
        self._logger.setLevel(logging.WARNING)

        # setup Faker library to use
        if lib is None:
            lib = self._FAKER_LIB

        # allow overriding the root object class for test purposes
        self._rootObjectClass = "Faker" if rootClass is None else rootClass

        # load the library
        fakerModule = self._loadLibrary(lib)

        # make the initialization function
        initFn = self._mkInitFn(fakerModule, locale, providers)

        self.withInit(initFn)
        self.withRootProperty("faker")

    @classmethod
    def _getDefaultFactory(cls, lib=None, rootClass=None):
        """Class method to get default faker text factory

        Not intended for general use

        :param lib: library import name, only used when the default factory is first created
        :param rootClass: root class name, only used when the default factory is first created
        :return: the shared default ``FakerTextFactory`` instance
        """
        # the first call fixes `lib` and `rootClass`; later calls return the cached instance unchanged
        if cls._defaultFakerTextFactory is None:
            cls._defaultFakerTextFactory = FakerTextFactory(lib=lib, rootClass=rootClass)
        return cls._defaultFakerTextFactory

    def _mkInitFn(self, libModule, locale, providers):
        """ Make Faker initialization function

        :param libModule: loaded library module containing the root object class
        :param locale: locale string or list of locale strings
        :param providers: providers to load
        :return: function taking a context and storing the created root object as ``context.faker``
        """
        assert libModule is not None, "must have a valid loaded Faker library module"

        fakerClass = getattr(libModule, self._rootObjectClass)

        # define the initialization function for Faker
        def fakerInitFn(ctx):
            # only pass a locale when one was given so the library default applies otherwise
            if locale is not None:
                ctx.faker = fakerClass(locale=locale)
            else:
                ctx.faker = fakerClass()

            if providers is not None:
                for provider in providers:
                    ctx.faker.add_provider(provider)

        return fakerInitFn

    def _loadLibrary(self, lib):
        """ Load faker library if not already loaded

        :param lib: library name of Faker library. If none passed, nothing is loaded
        :return: loaded module, or None if ``lib`` is None
        :raises DataGenError: if the library cannot be imported or fails during initialization
        """
        if lib is None:
            return None

        assert type(lib) is str and len(lib.strip()), f"Library ``{lib}`` must be a valid library name"

        # a module global of the same name is reused in preference to importing again
        if lib in globals():
            return globals()[lib]

        try:
            fakerModule = importlib.import_module(lib)
        except (ImportError, RuntimeError) as err:
            # import_module signals a missing or broken library with ImportError / ModuleNotFoundError,
            # so catching only RuntimeError left this wrapper unreachable for the common failure
            raise DataGenError("Could not load or initialize Faker library", err) from err

        globals()[lib] = fakerModule
        return fakerModule
def fakerText(mname, *args, _lib=None, _rootClass=None, **kwargs):
    """Build a Faker-backed text generator using the shared default ``FakerTextFactory``

    :param mname: method name to invoke
    :param args: positional args to be passed to underlying Faker instance
    :param _lib: internal only param - library to load
    :param _rootClass: internal only param - root class to create
    :param kwargs: keyword args forwarded to the factory call along with ``args``
    :returns : instance of PyfuncText for use with Faker

    ``fakerText("sentence")`` is same as ``FakerTextFactory()("sentence")``
    """
    # obtain the shared factory, then let it expand the method name into a generator
    factory = FakerTextFactory._getDefaultFactory(lib=_lib, rootClass=_rootClass)
    textGenerator = factory(mname, *args, **kwargs)  # pylint: disable=not-callable
    return textGenerator