databricks.labs.dqx.base
DQEngineBase Objects
class DQEngineBase(abc.ABC)
ws
@cached_property
def ws() -> WorkspaceClient
Return a verified WorkspaceClient configured for DQX.
Ensures workspace connectivity and sets the product info used for telemetry so that requests are attributed to dqx.
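For illustration, the verification and attribution can be reproduced with the Databricks SDK directly. This is a minimal sketch, not DQX's actual initialization code; the product version string is a placeholder assumption:

from databricks.sdk import WorkspaceClient

# Sketch only: attach product info so API requests are attributed in telemetry.
# "0.1.0" is a placeholder, not the real DQX version.
ws = WorkspaceClient(product="dqx", product_version="0.1.0")
ws.current_user.me()  # raises if workspace connectivity or auth is broken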
spark
@cached_property
def spark() -> SparkSession
Return the SparkSession associated with this engine.
The session is created during initialization using SparkSession.builder.getOrCreate().
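In other words, the engine reuses any active session, following the standard PySpark pattern:

from pyspark.sql import SparkSession

# Returns the active session if one exists, otherwise creates a new one.
spark = SparkSession.builder.getOrCreate()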
DQEngineCoreBase Objects
class DQEngineCoreBase(DQEngineBase)
apply_checks
@abc.abstractmethod
def apply_checks(df: DataFrame,
checks: list[DQRule],
ref_dfs: dict[str, DataFrame] | None = None) -> DataFrame
Apply data quality checks to the given DataFrame.
Arguments:
df
- Input DataFrame to check.
checks
- List of checks to apply to the DataFrame. Each check must be a DQRule instance.
ref_dfs
- Optional reference DataFrames to use in the checks.
Returns:
DataFrame that includes errors and warnings result columns.
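As a usage sketch (assuming the concrete DQEngine implementation, the DQRowRule class, and the is_not_null check function shipped with dqx; exact import paths may differ between versions):

from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.rule import DQRowRule
from databricks.labs.dqx import check_funcs

dq_engine = DQEngine(WorkspaceClient())
checks = [
    # Flag rows with a null user_id as errors.
    DQRowRule(criticality="error", check_func=check_funcs.is_not_null, column="user_id"),
]
# input_df is any DataFrame to validate; the output carries extra result columns.
checked_df = dq_engine.apply_checks(input_df, checks)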
apply_checks_and_split
@abc.abstractmethod
def apply_checks_and_split(
df: DataFrame,
checks: list[DQRule],
ref_dfs: dict[str, DataFrame] | None = None
) -> tuple[DataFrame, DataFrame]
Apply data quality checks to the given DataFrame and split the results into two DataFrames ("good" and "bad").
Arguments:
df
- Input DataFrame to check.
checks
- List of checks to apply to the DataFrame. Each check must be a DQRule instance.
ref_dfs
- Optional reference DataFrames to use in the checks.
Returns:
A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows with errors or warnings and the corresponding result columns).
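Building on the previous sketch, the split variant separates clean rows from quarantined ones (the table names here are hypothetical):

good_df, bad_df = dq_engine.apply_checks_and_split(input_df, checks)
good_df.write.saveAsTable("main.silver.events")      # clean rows, no result columns
bad_df.write.saveAsTable("main.quarantine.events")   # rows with errors/warnings and result columns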
apply_checks_by_metadata
@abc.abstractmethod
def apply_checks_by_metadata(
df: DataFrame,
checks: list[dict],
custom_check_functions: dict[str, Callable] | None = None,
ref_dfs: dict[str, DataFrame] | None = None) -> DataFrame
Apply data quality checks defined as metadata to the given DataFrame.
Arguments:
df
- Input DataFrame to check.
checks
- List of dictionaries describing checks. Each check dictionary supports the following keys:
  - check - Required; the check definition, including the check function and its arguments.
  - name - Optional name for the resulting column. Auto-generated if not provided.
  - criticality - Optional; either error (rows go only to the "bad" DataFrame) or warn (rows appear in both DataFrames).
custom_check_functions
- Optional dictionary with custom check functions (e.g., globals() of the calling module).
ref_dfs
- Optional reference DataFrames to use in the checks.
Returns:
DataFrame that includes errors and warnings result columns.
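A sketch of the metadata form, reusing the hypothetical engine from above (the is_in_list function and its allowed argument are assumptions; consult the dqx check-function reference for your version):

metadata_checks = [
    {
        "criticality": "error",
        "check": {"function": "is_not_null", "arguments": {"column": "user_id"}},
    },
    {
        "name": "country_code_valid",
        "criticality": "warn",
        "check": {"function": "is_in_list", "arguments": {"column": "country", "allowed": ["US", "DE"]}},
    },
]
checked_df = dq_engine.apply_checks_by_metadata(input_df, metadata_checks)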
apply_checks_by_metadata_and_split
@abc.abstractmethod
def apply_checks_by_metadata_and_split(
df: DataFrame,
checks: list[dict],
custom_check_functions: dict[str, Callable] | None = None,
ref_dfs: dict[str, DataFrame] | None = None
) -> tuple[DataFrame, DataFrame]
Apply data quality checks defined as metadata to the given DataFrame and split the results into two DataFrames ("good" and "bad").
Arguments:
df
- Input DataFrame to check.
checks
- List of dictionaries describing checks. Each check dictionary supports the following keys:
  - check - Required; the check definition, including the check function and its arguments.
  - name - Optional name for the resulting column. Auto-generated if not provided.
  - criticality - Optional; either error (rows go only to the "bad" DataFrame) or warn (rows appear in both DataFrames).
custom_check_functions
- Optional dictionary with custom check functions (e.g., globals() of the calling module).
ref_dfs
- Optional reference DataFrames to use in the checks.
Returns:
A tuple of two DataFrames: "good" (may include rows with warnings but no result columns) and "bad" (rows with errors or warnings and the corresponding result columns).
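The split variant accepts the same metadata; passing globals() makes any module-level custom check functions resolvable by name (sketch reusing metadata_checks from the previous example; none of the checks there actually require a custom function):

good_df, bad_df = dq_engine.apply_checks_by_metadata_and_split(
    input_df, metadata_checks, custom_check_functions=globals()
)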
validate_checks
@staticmethod
@abc.abstractmethod
def validate_checks(
checks: list[dict],
custom_check_functions: dict[str, Callable] | None = None,
validate_custom_check_functions: bool = True
) -> ChecksValidationStatus
Validate checks defined as metadata to ensure they conform to the expected structure and types.
This method validates the presence of required keys, the existence and callability of functions, and the types of arguments passed to those functions.
Arguments:
checks
- List of checks to validate. Each check must be a dictionary.
custom_check_functions
- Optional dictionary with custom check functions (e.g., globals() of the calling module).
validate_custom_check_functions
- If True (the default), also validate the custom check functions.
Returns:
ChecksValidationStatus indicating the validation result.
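A validation sketch; has_errors and to_string on ChecksValidationStatus are assumptions based on typical status objects, so verify them against the class itself:

status = DQEngine.validate_checks(metadata_checks)
if status.has_errors:
    # Fail fast before applying malformed checks.
    raise ValueError(status.to_string())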
get_invalid
@abc.abstractmethod
def get_invalid(df: DataFrame) -> DataFrame
Return records that violate data quality checks (rows with warnings or errors).
Arguments:
df
- Input DataFrame.
Returns:
DataFrame with rows that have errors or warnings and the corresponding result columns.
get_valid
@abc.abstractmethod
def get_valid(df: DataFrame) -> DataFrame
Return records that do not violate data quality checks (rows with no errors; warnings are allowed).
Arguments:
df
- Input DataFrame.
Returns:
DataFrame with rows that have no errors (warnings are allowed), without the result columns.
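The two accessors are typically paired on a DataFrame that already carries result columns, e.g. the output of apply_checks (sketch reusing the names from the earlier examples):

checked_df = dq_engine.apply_checks(input_df, checks)
valid_df = dq_engine.get_valid(checked_df)      # no errors; result columns dropped
invalid_df = dq_engine.get_invalid(checked_df)  # errors or warnings; result columns kept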
load_checks_from_local_file
@staticmethod
@abc.abstractmethod
def load_checks_from_local_file(filepath: str) -> list[dict]
Load DQ rules (checks) from a local JSON or YAML file.
The returned checks can be used as input to apply_checks_by_metadata.
Arguments:
filepath
- Path to a file containing check definitions.
Returns:
List of DQ rules (checks).
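For example, given a hypothetical checks.yml like the one sketched in the comments below, the loaded list can feed apply_checks_by_metadata directly:

# checks.yml (hypothetical contents):
# - criticality: error
#   check:
#     function: is_not_null
#     arguments:
#       column: user_id

checks = DQEngine.load_checks_from_local_file("checks.yml")
checked_df = dq_engine.apply_checks_by_metadata(input_df, checks)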
save_checks_in_local_file
@staticmethod
@abc.abstractmethod
def save_checks_in_local_file(checks: list[dict], filepath: str)
Save DQ rules (checks) to a local YAML or JSON file.
Arguments:
checks
- List of DQ rules (checks) to save.
filepath
- Path to a file where the check definitions will be saved.
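A one-line sketch; whether the file extension selects YAML versus JSON serialization is an assumption:

# Persist the metadata checks from the earlier sketch for reuse across jobs.
DQEngine.save_checks_in_local_file(metadata_checks, "checks.yml")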