databricks.labs.dqx.profiler.generator
DQGenerator Objects
class DQGenerator(DQEngineBase)
__init__
def __init__(workspace_client: WorkspaceClient,
spark: SparkSession | None = None,
llm_model_config: LLMModelConfig | None = None,
custom_check_functions: dict[str, Callable] | None = None)
Initializes the DQGenerator with optional Spark session and LLM model configuration.
Arguments:
- workspace_client - Databricks WorkspaceClient instance.
- spark - Optional SparkSession instance. If not provided, a new session will be created.
- llm_model_config - Optional LLM model configuration for AI-assisted rule generation.
- custom_check_functions - Optional dictionary of custom check functions.
generate_dq_rules
@telemetry_logger("generator", "generate_dq_rules")
def generate_dq_rules(profiles: list[DQProfile] | None = None,
level: str = "error") -> list[dict]
Generates a list of data quality rules based on the provided dq profiles.
Arguments:
- profiles - A list of data quality profiles to generate rules for.
- level - The criticality level of the rules (default is "error").
Returns:
A list of dictionaries representing the data quality rules.
generate_dq_rules_ai_assisted
@telemetry_logger("generator", "generate_dq_rules_ai_assisted")
def generate_dq_rules_ai_assisted(user_input: str,
table_name: str = "") -> list[dict]
Generates data quality rules using LLM based on natural language input.
Arguments:
- user_input - Natural language description of data quality requirements.
- table_name - Optional fully qualified table name. If not provided, the LLM will be used to guess the table schema.
Returns:
A list of dictionaries representing the generated data quality rules.
Raises:
- MissingParameterError - If the DSPy compiler is not available.
dq_generate_is_in
@staticmethod
def dq_generate_is_in(column: str, level: str = "error", **params: dict)
Generates a data quality rule to check if a column's value is in a specified list.
Arguments:
- column - The name of the column to check.
- level - The criticality level of the rule (default is "error").
- params - Additional parameters, including the list of values to check against.
Returns:
A dictionary representing the data quality rule.
dq_generate_min_max
@staticmethod
def dq_generate_min_max(column: str, level: str = "error", **params: dict)
Generates a data quality rule to check if a column's value is within a specified range.
Arguments:
- column - The name of the column to check.
- level - The criticality level of the rule (default is "error").
- params - Additional parameters, including the minimum and maximum values.
Returns:
A dictionary representing the data quality rule, or None if no limits are provided.
dq_generate_is_not_null
@staticmethod
def dq_generate_is_not_null(column: str, level: str = "error", **params: dict)
Generates a data quality rule to check if a column's value is not null.
Arguments:
- column - The name of the column to check.
- level - The criticality level of the rule (default is "error").
- params - Additional parameters.
Returns:
A dictionary representing the data quality rule.
dq_generate_is_not_null_or_empty
@staticmethod
def dq_generate_is_not_null_or_empty(column: str,
level: str = "error",
**params: dict)
Generates a data quality rule to check that a column's value is neither null nor empty.
Arguments:
- column - The name of the column to check.
- level - The criticality level of the rule (default is "error").
- params - Additional parameters, including whether to trim strings.
Returns:
A dictionary representing the data quality rule.