Skip to main content

Third-Party Library Integration

Learn how to integrate GeoBrix with popular Python geospatial libraries like rasterio, xarray, and PDAL for extended functionality.

Overview

While GeoBrix provides comprehensive spatial processing on Spark, you may want to integrate with specialized Python libraries for:

  • Advanced array operations (xarray)
  • Flexible raster I/O (rasterio)
  • Point cloud processing (PDAL)
  • NumPy-based operations

GeoBrix works seamlessly with these libraries because it uses standard formats (WKB, GeoTIFF bytes) that can be easily converted to library-specific objects.


Rasterio Integration

Rasterio is a Python library for reading and writing geospatial raster data. It provides a Pythonic API for GDAL with excellent NumPy integration.

Why Use Rasterio with GeoBrix?

  • Pythonic API: More intuitive than raw GDAL bindings
  • NumPy Integration: Direct access to raster data as NumPy arrays
  • Flexible I/O: Advanced reading/writing capabilities
  • Metadata Access: Easy access to coordinate systems, bounds, transforms
  • Window Operations: Read/write specific regions efficiently

Installation

Rasterio is typically available in Databricks ML runtimes, or install via:

Install Rasterio
%pip install rasterio

Basic Pattern: GeoBrix Tile to Rasterio

Convert GeoBrix raster tiles to rasterio datasets for processing:

Understanding Tiles

GeoBrix tiles are structured types with three fields: cellid, raster, and metadata. The raster field contains either a file path (String) or binary content (Binary). See Tile Structure for details.

# Sample-data Volumes path (used by all rasterio examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH

import rasterio
from rasterio.io import MemoryFile
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(StructType([
    StructField("mean", DoubleType()),
    StructField("std", DoubleType()),
    StructField("min", DoubleType()),
    StructField("max", DoubleType())
]))
def compute_statistics_rasterio(raster_binary):
    """Compute per-tile statistics (mean/std/min/max of band 1) using rasterio."""
    if raster_binary is None:
        return None

    # Convert to bytes (Spark may pass bytearray)
    tile_data = bytes(raster_binary)

    # Open binary raster as rasterio dataset
    with MemoryFile(tile_data) as memfile:
        with memfile.open() as dataset:
            # Read first band as NumPy array
            data = dataset.read(1)

            # Use NumPy for statistics; cast to plain floats so Spark can
            # serialize the struct fields as DoubleType.
            import numpy as np
            return {
                "mean": float(np.mean(data)),
                "std": float(np.std(data)),
                "min": float(np.min(data)),
                "max": float(np.max(data))
            }

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
stats = tiles_df.select("path", compute_statistics_rasterio(f.col("tile.raster")).alias("stats"))
stats.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+--------------------+
|path |stats |
+--------------------------------------------------+--------------------+
|/Volumes/.../nyc_sentinel2_red.tif |{mean=..., std=...} |
+--------------------------------------------------+--------------------+

Reading Raster Data with Rasterio

Access raster data and metadata within UDFs:

# Sample-data Volumes path (used by all rasterio examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH

import rasterio
from rasterio.io import MemoryFile
import json
import numpy as np
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(StringType())
def extract_metadata_rasterio(tile_bytes):
    """Extract comprehensive metadata using rasterio.

    Returns a JSON string so the result fits in a single StringType column;
    parse downstream (e.g. with from_json) if structured access is needed.
    """
    if tile_bytes is None:
        return None
    tile_data = bytes(tile_bytes)  # Spark may pass bytearray
    with MemoryFile(tile_data) as memfile:
        with memfile.open() as src:
            metadata = {
                "driver": src.driver,
                "width": src.width,
                "height": src.height,
                "count": src.count,
                "dtype": str(src.dtypes[0]),
                "crs": str(src.crs) if src.crs else None,
                "bounds": src.bounds._asdict(),
                # Affine transform: first 6 coefficients (a, b, c, d, e, f)
                "transform": list(src.transform)[:6],
                "nodata": src.nodata,
                "colorinterp": [ci.name for ci in src.colorinterp]
            }
            return json.dumps(metadata)

@f.udf(ArrayType(IntegerType()))
def get_valid_pixel_count(tile_bytes):
    """Count valid (non-nodata) pixels in band 1.

    Returns [valid_count, total_count].
    """
    if tile_bytes is None:
        return None
    tile_data = bytes(tile_bytes)
    with MemoryFile(tile_data) as memfile:
        with memfile.open() as src:
            data = src.read(1)
            nodata = src.nodata

            if nodata is not None:
                valid_count = int(np.sum(data != nodata))
            else:
                # No nodata declared: treat every pixel as valid.
                valid_count = int(data.size)

            return [valid_count, int(data.size)]

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = df.select(
    "path",
    extract_metadata_rasterio(f.col("tile.raster")).alias("metadata_json"),
    get_valid_pixel_count(f.col("tile.raster")).alias("pixel_counts")
)
result.limit(2).show(truncate=40)
Example output
+--------------------------------------------------+----------+-------------+
|path |metadata_ |pixel_counts |
+--------------------------------------------------+----------+-------------+
|/Volumes/.../nyc_sentinel2_red.tif |{"driver" |[120398000, |
+--------------------------------------------------+----------+-------------+

Array Operations with NumPy

Perform advanced array operations using rasterio + NumPy:

Normalize Raster Values:

# Sample-data Volumes path (used by all rasterio examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH

import rasterio
from rasterio.io import MemoryFile
import numpy as np
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(BinaryType())
def normalize_raster(tile_bytes):
    """Normalize each band of the raster to the 0-255 range (uint8 output)."""
    if tile_bytes is None:
        return None
    tile_data = bytes(tile_bytes)
    with MemoryFile(tile_data) as memfile:
        with memfile.open() as src:
            data = src.read()  # all bands: (count, height, width)
            profile = src.profile.copy()
        normalized = np.zeros_like(data, dtype=np.uint8)
        for i in range(data.shape[0]):
            band = data[i]
            band_min, band_max = band.min(), band.max()
            # Constant bands would divide by zero; leave them as zeros.
            if band_max > band_min:
                normalized[i] = ((band - band_min) / (band_max - band_min) * 255).astype(np.uint8)
        profile.update(dtype=rasterio.uint8, nodata=None)
        # Write the result to a new in-memory GeoTIFF. Use a context manager
        # so the output MemoryFile is released (it was previously leaked).
        with MemoryFile() as output:
            with output.open(**profile) as dst:
                dst.write(normalized)
            return bytes(output.getbuffer())

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = tiles_df.withColumn("normalized", normalize_raster(f.col("tile.raster"))).select(
    "path", f.length("normalized").alias("normalized_bytes")
)
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+------------------+
|path |normalized_bytes |
+--------------------------------------------------+------------------+
|/Volumes/.../nyc_sentinel2_red.tif |120398000 |
+--------------------------------------------------+------------------+

Compute NDVI:

# Sample-data Volumes path (used by all rasterio examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH

import rasterio
from rasterio.io import MemoryFile
import numpy as np
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(BinaryType())
def compute_ndvi(tile_bytes):
    """Compute NDVI from multispectral tile (assuming bands 4=NIR, 3=Red)"""
    if tile_bytes is None:
        return None
    tile_data = bytes(tile_bytes)
    with MemoryFile(tile_data) as memfile:
        with memfile.open() as src:
            # NDVI needs both NIR (band 4) and Red (band 3).
            if src.count < 4:
                return None
            nir = src.read(4).astype(float)
            red = src.read(3).astype(float)
            # Where NIR + Red == 0 the ratio is undefined; fill with the
            # declared nodata value (-9999) so it matches the output profile.
            ndvi = np.where(
                (nir + red) != 0,
                (nir - red) / (nir + red),
                -9999.0
            )
            profile = src.profile.copy()
            profile.update(count=1, dtype=rasterio.float32, nodata=-9999)
        # Write single-band float32 NDVI to a new in-memory GeoTIFF; the
        # context manager releases the MemoryFile (it was previously leaked).
        with MemoryFile() as output:
            with output.open(**profile) as dst:
                dst.write(ndvi.astype(np.float32), 1)
            return bytes(output.getbuffer())

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = tiles_df.withColumn("ndvi", compute_ndvi(f.col("tile.raster"))).select(
    "path", f.length("ndvi").alias("ndvi_bytes")
)
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+------------+
|path |ndvi_bytes |
+--------------------------------------------------+------------+
|/Volumes/.../nyc_sentinel2_red.tif |null |
+--------------------------------------------------+------------+

Window Operations

Process large rasters in windows for memory efficiency:

# Sample-data Volumes path (used by all rasterio examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH

import rasterio
from rasterio.io import MemoryFile
from rasterio.windows import Window
import numpy as np
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(ArrayType(StructType([
    StructField("window_id", IntegerType()),
    StructField("col_off", IntegerType()),
    StructField("row_off", IntegerType()),
    StructField("width", IntegerType()),
    StructField("height", IntegerType()),
    StructField("mean", DoubleType())
])))
def process_windows(tile_bytes, window_size=256):
    """Process raster in windows and compute statistics per window.

    Demo limits: scans at most the top-left 512x512 region and returns at
    most 4 windows so the example output stays small.
    """
    if tile_bytes is None:
        return None
    tile_data = bytes(tile_bytes) if isinstance(tile_bytes, bytearray) else tile_bytes
    results = []
    with MemoryFile(tile_data) as memfile:
        with memfile.open() as src:
            window_id = 0
            for col_off in range(0, min(src.width, 512), window_size):
                for row_off in range(0, min(src.height, 512), window_size):
                    # Clamp window extents at the raster edges.
                    width = min(window_size, src.width - col_off)
                    height = min(window_size, src.height - row_off)
                    window = Window(col_off, row_off, width, height)
                    data = src.read(1, window=window)
                    results.append({
                        "window_id": window_id,
                        "col_off": col_off,
                        "row_off": row_off,
                        "width": width,
                        "height": height,
                        "mean": float(np.mean(data))
                    })
                    window_id += 1
                    if window_id >= 4:
                        # Demo cap: stop after 4 windows.
                        return results
    return results

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = tiles_df.withColumn(
    "windows", process_windows(f.col("tile.raster"), f.lit(256))
).select("path", "windows")
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+--------------------+
|path |windows |
+--------------------------------------------------+--------------------+
|/Volumes/.../nyc_sentinel2_red.tif |[{window_id=0, ...}]|
+--------------------------------------------------+--------------------+

XArray Integration

XArray provides powerful N-dimensional array operations with labeled axes, making it ideal for multi-temporal and multi-band raster analysis.

Why Use XArray with GeoBrix?

  • Labeled Dimensions: Work with named dimensions (time, x, y, band)
  • Multi-Temporal: Stack and analyze time series naturally
  • Broadcasting: Automatic alignment of coordinates
  • NetCDF Support: Native support for NetCDF/HDF formats

Installation

Install XArray
%pip install xarray rioxarray

Basic XArray Integration

# Sample-data Volumes path (used by all xarray examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH
import json
from databricks.labs.gbx.rasterx import functions as rx
from rasterio.io import MemoryFile

@f.udf(StringType())
def to_xarray_summary(tile_bytes):
    """Convert tile to xarray and return summary statistics as a JSON string."""
    if tile_bytes is None:
        return None
    # Check the optional dependency before doing any I/O.
    try:
        import rioxarray
    except ImportError:
        return json.dumps({"error": "rioxarray not available"})
    tile_data = bytes(tile_bytes)
    with MemoryFile(tile_data) as memfile:
        # rioxarray opens the MemoryFile directly; no separate
        # rasterio dataset handle is needed.
        da = rioxarray.open_rasterio(memfile)
        stats = {
            "mean": float(da.mean().values),
            "std": float(da.std().values),
            "min": float(da.min().values),
            "max": float(da.max().values)
        }
        return json.dumps(stats)

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = tiles_df.select("path", to_xarray_summary(f.col("tile.raster")).alias("xarray_stats"))
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+------------------+
|path |xarray_stats |
+--------------------------------------------------+------------------+
|/Volumes/.../nyc_sentinel2_red.tif |{"mean": ...} |
+--------------------------------------------------+------------------+

Multi-Temporal Analysis

Stack and analyze multiple time periods:

# Sample-data Volumes path (used by all xarray examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(StructType([
    StructField("mean_change", DoubleType()),
    StructField("max_change", DoubleType()),
    StructField("min_change", DoubleType())
]))
def compute_temporal_change(before_bytes, after_bytes):
    """Compute change between two time periods using xarray.

    Placeholder implementation: returns zero change so the pipeline shape
    can be demonstrated; substitute real xarray differencing in production.
    """
    if before_bytes is None or after_bytes is None:
        return None
    return {"mean_change": 0.0, "max_change": 0.0, "min_change": 0.0}

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
# For the demo, the same tile is passed as both "before" and "after".
result = tiles_df.withColumn(
    "change",
    compute_temporal_change(f.col("tile.raster"), f.col("tile.raster"))
).select("path", "change")
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+------------------+
|path |change |
+--------------------------------------------------+------------------+
|/Volumes/.../nyc_sentinel2_red.tif |{mean_change=0.0..|
+--------------------------------------------------+------------------+

Spatial Resampling and Aggregation

# Sample-data Volumes path (used by all xarray examples on this page)
raster_path = SAMPLE_RASTER_VOLUMES_PATH
from rasterio.io import MemoryFile
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(DoubleType())
def resampled_mean(tile_bytes, factor=2):
    """Open tile as xarray, coarsen by factor, return mean (demo of resampling)."""
    if tile_bytes is None:
        return None
    try:
        import rioxarray
        import numpy as np
        tile_data = bytes(tile_bytes)
        with MemoryFile(tile_data) as memfile:
            # rioxarray opens the MemoryFile directly; no extra
            # rasterio dataset handle is needed.
            da = rioxarray.open_rasterio(memfile)
            # Aggregate factor x factor pixel blocks by their mean;
            # "trim" drops any ragged edge that doesn't fill a block.
            coarsened = da.coarsen(x=factor, y=factor, boundary="trim").mean()
            return float(np.nanmean(coarsened.values))
    except Exception:
        # Best-effort demo: return null on any failure (missing rioxarray,
        # unreadable tile, ...).
        return None

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = tiles_df.select(
    "path",
    resampled_mean(f.col("tile.raster"), f.lit(2)).alias("resampled_mean")
)
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+---------------+
|path |resampled_mean |
+--------------------------------------------------+---------------+
|/Volumes/.../nyc_sentinel2_red.tif |123.45 |
+--------------------------------------------------+---------------+

PDAL Integration

PDAL (Point Data Abstraction Library) processes point cloud data from lidar, photogrammetry, and other sources.

Why Use PDAL with GeoBrix?

  • Point Cloud Processing: LAS/LAZ file support
  • Filtering: Remove noise, classify points
  • DEM Generation: Create rasters from point clouds
  • Format Conversion: Convert between point cloud formats
  • Integration: Combine lidar with imagery

Installation

Install PDAL
%pip install pdal python-pdal
PDAL native binaries

The Python packages above provide bindings only. You also need PDAL native libraries installed on the cluster (e.g. libpdal and PDAL command-line tools). On Databricks, install them via cluster-scoped init scripts (for example, a script that runs apt-get install -y pdal or equivalent for your environment).


Basic PDAL Integration

# Sample-data Volumes path for point cloud (use your LAS/LAZ path if different)
point_cloud_path = f"{SAMPLE_DATA_BASE}/nyc/pointcloud/sample.las"

@f.udf(StructType([
    StructField("point_count", IntegerType()),
    StructField("bounds", StringType()),
    StructField("has_classification", BooleanType())
]))
def extract_las_metadata(las_bytes):
    """Extract metadata from LAS/LAZ point cloud"""
    if las_bytes is None:
        return None

    try:
        import pdal
        import json

        # Create PDAL pipeline.
        # NOTE(review): "STDIN" is a placeholder — PDAL's readers.las takes a
        # real filename; write las_bytes to a temp file and point the reader
        # at it before executing for real.
        pipeline = pdal.Pipeline(json.dumps({
            "pipeline": [
                {
                    "type": "readers.las",
                    "filename": "STDIN"
                },
                {
                    "type": "filters.info"
                }
            ]
        }))

        # Execute would process the data
        # pipeline.execute()

        # Placeholder result demonstrating the output schema.
        return {
            "point_count": 0,
            "bounds": "{}",
            "has_classification": False
        }
    except ImportError:
        # pdal bindings not installed on this cluster
        return None

binary_df = spark.read.format("binaryFile").load(point_cloud_path)
result = binary_df.select(
    "path",
    extract_las_metadata(f.col("content")).alias("metadata")
)
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+--------------------+
|path |metadata |
+--------------------------------------------------+--------------------+
|/Volumes/.../sample.las |{point_count=0, ...}|
+--------------------------------------------------+--------------------+

PDAL-Raster Integration Pattern

Combine point cloud processing with raster analysis:

# Sample-data Volumes paths
point_cloud_path = f"{SAMPLE_DATA_BASE}/nyc/pointcloud/sample.las"
raster_path = SAMPLE_RASTER_VOLUMES_PATH

from databricks.labs.gbx.rasterx import functions as rx

@f.udf(StructType([
    StructField("raster_loaded", BooleanType()),
    StructField("point_cloud_path", StringType())
]))
def workflow_summary(raster_bytes, pc_path):
    """Summary for PDAL + raster integration pattern.

    Returns (raster_loaded, point_cloud_path); a tuple maps positionally
    onto the declared StructType fields.
    """
    return (raster_bytes is not None, pc_path or "")

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = tiles_df.select(
    "path",
    workflow_summary(f.col("tile.raster"), f.lit(point_cloud_path)).alias("workflow")
)
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+-----------------------+
|path |workflow |
+--------------------------------------------------+-----------------------+
|/Volumes/.../nyc_sentinel2_red.tif |{raster_loaded=true...}|
+--------------------------------------------------+-----------------------+

Advanced NumPy Operations

Apply complex array operations via rasterio:

# Sample-data Volumes path
raster_path = SAMPLE_RASTER_VOLUMES_PATH

import numpy as np
import rasterio
from rasterio.io import MemoryFile
from databricks.labs.gbx.rasterx import functions as rx

@f.udf(BinaryType())
def apply_numpy_operation(tile_bytes, operation="convolve"):
    """Apply NumPy/SciPy operations to raster data.

    Supported operations: "convolve" (3x3 mean filter), "gradient"
    (gradient magnitude); anything else passes the band through unchanged.
    """
    if tile_bytes is None:
        return None

    tile_data = bytes(tile_bytes)
    with MemoryFile(tile_data) as memfile:
        with memfile.open() as src:
            data = src.read(1)
            profile = src.profile.copy()
            out_dtype = src.dtypes[0]

        if operation == "convolve":
            # Apply 3x3 mean (box) convolution filter
            from scipy import ndimage
            kernel = np.ones((3, 3)) / 9
            filtered = ndimage.convolve(data, kernel)
        elif operation == "gradient":
            # Compute gradient magnitude
            gy, gx = np.gradient(data)
            filtered = np.sqrt(gx**2 + gy**2)
        else:
            # Unknown operation: pass the band through unchanged.
            filtered = data

        # Write result; the context manager releases the output MemoryFile
        # (it was previously leaked).
        with MemoryFile() as output:
            with output.open(**profile) as dst:
                dst.write(filtered.astype(out_dtype), 1)
            return bytes(output.getbuffer())

rx.register(spark)
binary_df = spark.read.format("binaryFile").load(raster_path)
tiles_df = binary_df.select(
    "path",
    rx.rst_fromcontent(f.col("content"), f.lit("GTiff")).alias("tile")
)
result = tiles_df.select(
    "path",
    apply_numpy_operation(f.col("tile.raster"), f.lit("convolve")).alias("filtered")
)
result.limit(2).show(truncate=50)
Example output
+--------------------------------------------------+---------+
|path |filtered |
+--------------------------------------------------+---------+
|/Volumes/.../nyc_sentinel2_red.tif |[B@...] |
+--------------------------------------------------+---------+

Best Practices

1. Memory Management

Be mindful of memory when loading rasters into Python libraries:

Memory Management
# ✅ Good: Process in chunks
@f.udf(...)
def process_in_chunks(tile_bytes, chunk_size=256):
with MemoryFile(bytes(tile_bytes)) as memfile:
with memfile.open() as src:
for window in get_windows(src, chunk_size):
data = src.read(1, window=window)
# Process chunk
yield process(data)

# ❌ Bad: Load entire large raster at once
def process_all(tile_bytes):
data = load_entire_raster(tile_bytes) # May cause OOM

2. Coordinate System Handling

Always check and handle coordinate systems:

Coordinate System Handling
with rasterio.open(...) as src:
if src.crs is None:
log.warning("No CRS defined")
if src.crs != target_crs:
# Reproject as needed
...

3. Type Conversions

Handle data type conversions carefully:

Type Conversions
# Ensure proper type for NumPy operations
data = src.read(1).astype(np.float64)

# Handle nodata values
if src.nodata is not None:
data = np.ma.masked_equal(data, src.nodata)

4. Resource Cleanup

Always close resources properly:

Resource Cleanup
# ✅ Good: Use context managers
with MemoryFile(tile_bytes) as memfile:
with memfile.open() as src:
# Process
pass
# Automatically cleaned up

# ❌ Bad: Manual management
memfile = MemoryFile(tile_bytes)
src = memfile.open()
# Process
src.close() # Easy to forget!

Next Steps