#!/usr/bin/env -S uv run --script
# /// script
# dependencies = [
#   "numpy",
#   "scipy",
#   "Pillow",
#   "imageio",
#   "rawpy",
#   "colour-science",
#   "torch",
#   "warp-lang",
# ]
# [tool.uv.sources]
# torch = [{ index = "pytorch-cu128" }]
# [[tool.uv.index]]
# name = "pytorch-cu128"
# url = "https://download.pytorch.org/whl/cu128"
# explicit = true
# ///

import argparse
import json
import math
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Optional

import colour
import imageio.v3 as iio
import numpy as np
import rawpy
import torch
import torch.nn.functional as F
import warp as wp
from colour.colorimetry import SDS_ILLUMINANTS
from scipy.integrate import quad
from scipy.interpolate import interp1d
from scipy.ndimage import (gaussian_filter, gaussian_filter1d,  # For LUTs
                           map_coordinates)
from scipy.signal.windows import gaussian  # For creating Gaussian kernel

# --- Configuration ---
EPSILON = 1e-10
### PERFORMANCE ###
# Size of the 3D LUTs. 33 is a common industry size (e.g., in .cube files).
# 17 is faster to generate, 65 is more accurate but much slower to generate.
LUT_SIZE = 65

GLOBAL_DEBUG = False


# --- Global variables to ensure all debug images for a single run go to the same folder ---
# This creates a unique, timestamped directory for each script execution.
RUN_TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H%M%S")
DEBUG_OUTPUT_DIR = Path(f"debug_outputs/{RUN_TIMESTAMP}")


def save_debug_image(image: np.ndarray, tag: str, output_format: str = "jpeg"):
    """
    Saves a debug image at any point in the processing pipeline.

    The function takes a NumPy array (assumed to be float data in the [0.0, 1.0] range),
    a descriptive tag, and an optional format. It saves the image to a timestamped
    sub-directory within a `debug_outputs` folder.

    Args:
        image (np.ndarray): The image data to save. Expected to be a floating-point
                            array with values scaled between 0.0 and 1.0.
        tag (str): A descriptive name for this processing stage (e.g.,
                   "after_exposure_lut", "final_linear_image"). This will be
                   part of the filename.
        output_format (str, optional): The desired output format. Can be 'jpeg' (or 'jpg')
                                       for 8-bit JPEG, or 'tiff' (or 'tif') for 16-bit
                                       TIFF. Defaults to 'jpeg'.
    """
    global GLOBAL_DEBUG, DEBUG_OUTPUT_DIR
    if not GLOBAL_DEBUG:
        # If global debug is off, do nothing
        return
    try:
        # 1. Ensure the debug directory for this run exists.
        DEBUG_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        # 2. Sanitize the tag to create a safe filename.
        # Removes invalid characters and replaces spaces with underscores.
        safe_tag = (
            "".join(c for c in tag if c.isalnum() or c in (" ", "_", "-"))
            .rstrip()
            .replace(" ", "_")
        )

        # 3. Create a unique filename with a precise timestamp.
        image_timestamp = datetime.now().strftime("%H%M%S_%f")

        # 4. Determine file extension and prepare the final image data.
        # Clip the float image data to the [0.0, 1.0] range to prevent wrap-around
        # errors or artifacts when converting to integer types.
        clipped_image = np.clip(image, 0.0, 1.0)

        if "tif" in output_format.lower():
            # Convert to 16-bit for TIFF
            output_image = (clipped_image * 65535.0).astype(np.uint16)
            extension = "tiff"
        else:
            # Convert to 8-bit for JPEG
            output_image = (clipped_image * 255.0).astype(np.uint8)
            extension = "jpg"

        filename = f"{image_timestamp}_{safe_tag}.{extension}"
        filepath = DEBUG_OUTPUT_DIR / filename

        # 5. Save the image and print a confirmation message.
        iio.imwrite(filepath, output_image)
        print(f"✅ DEBUG: Saved '{tag}' to '{filepath}'")

    except Exception as e:
        # If anything goes wrong, print a warning but don't crash the main script.
        print(
            f"⚠️ DEBUG WARNING: Could not save debug image for tag '{tag}'.\n   Reason: {e}"
        )


class Info:
    name: str
    description: str
    format_mm: int
    version: str

    def __init__(
        self, name: str, description: str, format_mm: int, version: str
    ) -> None:
        self.name, self.description, self.format_mm, self.version = (
            name,
            description,
            format_mm,
            version,
        )


class Balance:
    r_shift: float
    g_shift: float
    b_shift: float

    def __init__(self, r_shift: float, g_shift: float, b_shift: float) -> None:
        self.r_shift, self.g_shift, self.b_shift = r_shift, g_shift, b_shift


class Gamma:
    r_factor: float
    g_factor: float
    b_factor: float

    def __init__(self, r_factor: float, g_factor: float, b_factor: float) -> None:
        self.r_factor, self.g_factor, self.b_factor = r_factor, g_factor, b_factor


class Processing:
    gamma: Gamma
    balance: Balance

    def __init__(self, gamma: Gamma, balance: Balance) -> None:
        self.gamma, self.balance = gamma, balance


class Couplers:
    saturation_amount: float
    dir_amount_rgb: List[float]
    dir_diffusion_um: float
    dir_diffusion_interlayer: float

    def __init__(
        self,
        saturation_amount: float,
        dir_amount_rgb: List[float],
        dir_diffusion_um: float,
        dir_diffusion_interlayer: float,
    ) -> None:
        (
            self.saturation_amount,
            self.dir_amount_rgb,
            self.dir_diffusion_um,
            self.dir_diffusion_interlayer,
        ) = (
            saturation_amount,
            dir_amount_rgb,
            dir_diffusion_um,
            dir_diffusion_interlayer,
        )


class HDCurvePoint:
    d: Optional[float]
    r: float
    g: float
    b: float

    def __init__(self, d: Optional[float], r: float, g: float, b: float) -> None:
        self.d, self.r, self.g, self.b = d, r, g, b


class SpectralSensitivityCurvePoint:
    wavelength: float
    y: float
    m: float
    c: float

    def __init__(self, wavelength: float, y: float, m: float, c: float) -> None:
        self.wavelength, self.y, self.m, self.c = wavelength, y, m, c


class RGBValue:
    r: float
    g: float
    b: float

    def __init__(self, r: float, g: float, b: float) -> None:
        self.r, self.g, self.b = r, g, b


class Curves:
    hd: List[HDCurvePoint]
    spectral_sensitivity: List[SpectralSensitivityCurvePoint]

    def __init__(
        self,
        hd: List[HDCurvePoint],
        spectral_sensitivity: List[SpectralSensitivityCurvePoint],
    ) -> None:
        self.hd, self.spectral_sensitivity = hd, spectral_sensitivity


class Halation:
    strength: RGBValue
    size_um: RGBValue

    def __init__(self, strength: RGBValue, size_um: RGBValue) -> None:
        self.strength, self.size_um = strength, size_um


class Interlayer:
    diffusion_um: float

    def __init__(self, diffusion_um: float) -> None:
        self.diffusion_um = diffusion_um


class Calibration:
    iso: int
    middle_gray_logE: float

    def __init__(self, iso: int, middle_gray_logE: float) -> None:
        self.iso, self.middle_gray_logE = iso, middle_gray_logE


class Properties:
    halation: Halation
    couplers: Couplers
    interlayer: Interlayer
    curves: Curves
    calibration: Calibration

    def __init__(
        self,
        halation: Halation,
        couplers: Couplers,
        interlayer: Interlayer,
        curves: Curves,
        calibration: Calibration,
    ) -> None:
        self.halation, self.couplers, self.interlayer, self.curves, self.calibration = (
            halation,
            couplers,
            interlayer,
            curves,
            calibration,
        )


class FilmDatasheet:
    info: Info
    processing: Processing
    properties: Properties

    def __init__(
        self, info: Info, processing: Processing, properties: Properties
    ) -> None:
        self.info, self.processing, self.properties = info, processing, properties


### NEW: GPU-accelerated separable Gaussian blur helper
def _gpu_separable_gaussian_blur(
    image_tensor: torch.Tensor, sigma: float, device: str = "cpu"
) -> torch.Tensor:
    """
    Applies a fast, separable Gaussian blur to a tensor on a specified device.

    Args:
        image_tensor (torch.Tensor): The input image tensor, shape (B, C, H, W).
        sigma (float): The standard deviation of the Gaussian kernel.
        device (str): The device to run on ('cpu', 'cuda', 'mps').

    Returns:
        torch.Tensor: The blurred image tensor.
    """
    kernel_radius = int(math.ceil(3 * sigma))
    kernel_size = 2 * kernel_radius + 1

    # Create a 1D Gaussian kernel
    x = torch.arange(
        -kernel_radius, kernel_radius + 1, dtype=torch.float32, device=device
    )
    kernel_1d = torch.exp(-0.5 * (x / sigma) ** 2)
    kernel_1d /= kernel_1d.sum()

    # Get tensor shape
    B, C, H, W = image_tensor.shape

    # Prepare kernels for 2D convolution
    # To blur horizontally, the kernel shape is (C, 1, 1, kernel_size)
    kernel_h = kernel_1d.view(1, 1, 1, kernel_size).repeat(C, 1, 1, 1)

    # To blur vertically, the kernel shape is (C, 1, kernel_size, 1)
    kernel_v = kernel_1d.view(1, 1, kernel_size, 1).repeat(C, 1, 1, 1)

    # Apply horizontal blur
    # padding='same' requires PyTorch 1.9+. For older versions, calculate padding manually.
    padding_h = (kernel_size // 2, 0)
    blurred_tensor = F.conv2d(image_tensor, kernel_h, padding=padding_h, groups=C)

    # Apply vertical blur
    padding_v = (0, kernel_size // 2)
    blurred_tensor = F.conv2d(blurred_tensor, kernel_v, padding=padding_v, groups=C)

    return blurred_tensor


# --- Datasheet Parsing (unchanged) ---
def parse_datasheet_json(json_filepath) -> FilmDatasheet | None:
    # This function remains identical to your last version
    # ...
    if not os.path.exists(json_filepath):
        print(f"Error: Datasheet file not found at {json_filepath}", file=sys.stderr)
        return None
    try:
        with open(json_filepath, "r") as jsonfile:
            data = json.load(jsonfile)
            info = Info(
                name=data["info"]["name"],
                description=data["info"]["description"],
                format_mm=data["info"]["format_mm"],
                version=data["info"]["version"],
            )
            gamma = Gamma(
                r_factor=data["processing"]["gamma"]["r_factor"],
                g_factor=data["processing"]["gamma"]["g_factor"],
                b_factor=data["processing"]["gamma"]["b_factor"],
            )
            balance = Balance(
                r_shift=data["processing"]["balance"]["r_shift"],
                g_shift=data["processing"]["balance"]["g_shift"],
                b_shift=data["processing"]["balance"]["b_shift"],
            )
            processing = Processing(gamma=gamma, balance=balance)
            halation = Halation(
                strength=RGBValue(
                    r=data["properties"]["halation"]["strength"]["r"],
                    g=data["properties"]["halation"]["strength"]["g"],
                    b=data["properties"]["halation"]["strength"]["b"],
                ),
                size_um=RGBValue(
                    r=data["properties"]["halation"]["size_um"]["r"],
                    g=data["properties"]["halation"]["size_um"]["g"],
                    b=data["properties"]["halation"]["size_um"]["b"],
                ),
            )
            couplers = Couplers(
                saturation_amount=data["properties"]["couplers"]["saturation_amount"],
                dir_amount_rgb=data["properties"]["couplers"]["dir_amount_rgb"],
                dir_diffusion_um=data["properties"]["couplers"]["dir_diffusion_um"],
                dir_diffusion_interlayer=data["properties"]["couplers"][
                    "dir_diffusion_interlayer"
                ],
            )
            interlayer = Interlayer(
                diffusion_um=data["properties"]["interlayer"]["diffusion_um"]
            )
            calibration = Calibration(
                iso=data["properties"]["calibration"]["iso"],
                middle_gray_logE=data["properties"]["calibration"]["middle_gray_logE"],
            )
            curves = Curves(
                hd=[
                    HDCurvePoint(d=point["d"], r=point["r"], g=point["g"], b=point["b"])
                    for point in data["properties"]["curves"]["hd"]
                ],
                spectral_sensitivity=[
                    SpectralSensitivityCurvePoint(
                        wavelength=point["wavelength"],
                        y=point["y"],
                        m=point["m"],
                        c=point["c"],
                    )
                    for point in data["properties"]["curves"]["spectral_sensitivity"]
                ],
            )
            properties = Properties(
                calibration=calibration,
                halation=halation,
                couplers=couplers,
                interlayer=interlayer,
                curves=curves,
            )
            return FilmDatasheet(
                info=info, processing=processing, properties=properties
            )
    except Exception as e:
        print(f"Error parsing datasheet JSON '{json_filepath}': {e}", file=sys.stderr)
        return None


# --- Core Simulation Functions ---


### PERFORMANCE ###
def apply_3d_lut(image: np.ndarray, lut: np.ndarray, device="cpu") -> np.ndarray:
    """
    Applies a 3D LUT to an image using PyTorch's grid_sample.

    Args:
        image: Input image, shape (H, W, 3), values in [0, 1].
        lut: 3D LUT, shape (N, N, N, 3).
        device: 'cpu' or 'cuda' for GPU acceleration.

    Returns:
        Output image, shape (H, W, 3).
    """
    # Ensure data is float32, which is standard for torch
    if image.dtype != np.float32:
        image = image.astype(np.float32)
    if lut.dtype != np.float32:
        lut = lut.astype(np.float32)

    # Convert to torch tensors and move to the target device
    image_torch = torch.from_numpy(image).to(device)
    lut_torch = torch.from_numpy(lut).to(device)

    # Prepare image coordinates for grid_sample
    # grid_sample expects coordinates in the range [-1, 1]
    # The image is used as the grid of sampling coordinates
    # We add a batch and depth dimension to match the 5D input requirement
    # Shape: (H, W, 3) -> (1, 1, H, W, 3)
    grid = image_torch * 2 - 1  # Scale from [0, 1] to [-1, 1]
    grid = grid.unsqueeze(0).unsqueeze(0)  # Add batch and depth dims

    # Prepare LUT for grid_sample
    # grid_sample expects the input grid to be (N, C, D_in, H_in, W_in)
    # Our LUT is (N, N, N, 3), which is (D, H, W, C)
    # Permute to (C, D, H, W) -> (3, N, N, N) and add a batch dimension
    # Shape: (N, N, N, 3) -> (1, 3, N, N, N)
    lut_torch = lut_torch.permute(3, 0, 1, 2).unsqueeze(0)

    # Apply the LUT
    # align_corners=True ensures that the corners of the LUT cube map to -1 and 1
    result_torch = F.grid_sample(lut_torch, grid, mode="bilinear", align_corners=True)

    # Reshape the output and convert back to numpy
    # Shape: (1, 3, 1, H, W) -> (H, W, 3)
    result_numpy = result_torch.squeeze().permute(1, 2, 0).cpu().numpy()

    return result_numpy


### PERFORMANCE ###
def create_exposure_lut(
    spectral_data: list[SpectralSensitivityCurvePoint],
    ref_illuminant_spd,
    middle_gray_logE: float,
    size: int,
) -> np.ndarray:
    """Generates a 3D LUT for the linear sRGB -> Log Exposure conversion."""
    print(f"Generating {size}x{size}x{size} exposure LUT...")

    # Create a grid of sRGB input values
    grid_points = np.linspace(0.0, 1.0, size)
    r, g, b = np.meshgrid(grid_points, grid_points, grid_points, indexing="ij")
    # Reshape grid into an "image" for batch processing
    srgb_grid = np.stack([r, g, b], axis=-1).reshape(
        -1, 3
    )  # Shape: (size*size*size, 3)
    srgb_grid = srgb_grid.reshape(size, size * size, 3)  # Make it 2D for the function

    # Run the full spectral calculation on this grid
    lut_data = calculate_film_log_exposure(
        srgb_grid, spectral_data, ref_illuminant_spd, middle_gray_logE, EPSILON
    )

    # Reshape back into a 3D LUT
    return lut_data.reshape(size, size, size, 3)


### PERFORMANCE ###
def create_density_lut(
    processing: Processing,
    hd_curve: List[HDCurvePoint],
    middle_gray_logE: float,
    log_exposure_range: tuple,
    size: int,
) -> np.ndarray:
    """Generates a 3D LUT for the Log Exposure -> Density conversion."""
    print(
        f"Generating {size}x{size}x{size} density LUT for range {log_exposure_range}..."
    )
    le_min, le_max = log_exposure_range

    grid_points = np.linspace(le_min, le_max, size)
    le_r, le_g, le_b = np.meshgrid(grid_points, grid_points, grid_points, indexing="ij")
    log_exposure_grid = np.stack([le_r, le_g, le_b], axis=-1)

    lut_data = apply_hd_curves(
        log_exposure_grid, processing, hd_curve, middle_gray_logE
    )
    return lut_data


# --- (The rest of the core functions: compute_inhibitor_matrix, compute_uncoupled_hd_curves,
#       apply_dir_coupler_simulation, um_to_pixels, apply_hd_curves, apply_saturation_rgb,
#       apply_spatial_effects, calculate_film_log_exposure, remain UNCHANGED from the previous version) ---
def compute_inhibitor_matrix(
    amount_rgb: List[float], diffusion_interlayer: float
) -> np.ndarray:
    matrix = np.eye(3)
    if diffusion_interlayer > 0:
        matrix = gaussian_filter1d(
            matrix, sigma=diffusion_interlayer, axis=0, mode="constant", cval=0
        )
        matrix = gaussian_filter1d(
            matrix, sigma=diffusion_interlayer, axis=1, mode="constant", cval=0
        )
    row_sums = matrix.sum(axis=1)
    row_sums[row_sums == 0] = 1  # Avoid division by zero
    matrix = matrix / row_sums[:, np.newaxis]
    return matrix * np.array(amount_rgb)[:, np.newaxis]


def compute_uncoupled_hd_curves(
    hd_curve_data: List[HDCurvePoint], inhibitor_matrix: np.ndarray
) -> List[HDCurvePoint]:
    log_E_values = np.array([p.d for p in hd_curve_data if p.d is not None])
    density_r, density_g, density_b = (
        np.array([p.r for p in hd_curve_data]),
        np.array([p.g for p in hd_curve_data]),
        np.array([p.b for p in hd_curve_data]),
    )
    source_density_matrix = np.tile(density_g[:, np.newaxis], (1, 3))
    inhibitor_effect = source_density_matrix @ inhibitor_matrix
    uncoupled_r = np.interp(
        log_E_values, log_E_values - inhibitor_effect[:, 0], density_r
    )
    uncoupled_g = np.interp(
        log_E_values, log_E_values - inhibitor_effect[:, 1], density_g
    )
    uncoupled_b = np.interp(
        log_E_values, log_E_values - inhibitor_effect[:, 2], density_b
    )
    return [
        HDCurvePoint(d=log_e, r=uncoupled_r[i], g=uncoupled_g[i], b=uncoupled_b[i])
        for i, log_e in enumerate(log_E_values)
    ]


def apply_dir_coupler_simulation(
    log_exposure_rgb,
    naive_density_rgb,
    inhibitor_matrix,
    diffusion_um,
    film_format_mm,
    image_width_px,
):
    diffusion_pixels = um_to_pixels(diffusion_um, image_width_px, film_format_mm)
    inhibitor_signal_diffused = (
        gaussian_filter(
            naive_density_rgb, sigma=(diffusion_pixels, diffusion_pixels, 0)
        )
        if diffusion_pixels > EPSILON
        else naive_density_rgb
    )
    inhibitor_effect = np.einsum(
        "...s, sm -> ...m", inhibitor_signal_diffused, inhibitor_matrix
    )
    return log_exposure_rgb - inhibitor_effect


def um_to_pixels(sigma_um, image_width_px, film_format_mm):
    if film_format_mm <= 0 or image_width_px <= 0:
        return 0
    microns_per_pixel = (film_format_mm * 1000.0) / image_width_px
    return sigma_um / microns_per_pixel


def apply_hd_curves(
    log_exposure_rgb,
    processing: Processing,
    hd_curve: List[HDCurvePoint],
    middle_gray_logE: float,
) -> np.ndarray:
    density_rgb, gamma_factors, balance_shifts = (
        np.zeros_like(log_exposure_rgb),
        [
            processing.gamma.r_factor,
            processing.gamma.g_factor,
            processing.gamma.b_factor,
        ],
        [
            processing.balance.r_shift,
            processing.balance.g_shift,
            processing.balance.b_shift,
        ],
    )
    log_e_points = [p.d for p in hd_curve if p.d is not None]
    min_logE, max_logE = log_e_points[0], log_e_points[-1]
    for i, channel in enumerate(["R", "G", "B"]):
        gamma_factor = gamma_factors[i]
        log_exposure_adjusted = middle_gray_logE + (
            log_exposure_rgb[..., i] - middle_gray_logE
        ) / (gamma_factor if abs(gamma_factor) > EPSILON else EPSILON)
        y_points = (
            [p.r for p in hd_curve]
            if channel == "R"
            else [p.g for p in hd_curve] if channel == "G" else [p.b for p in hd_curve]
        )
        interp_func = interp1d(
            log_e_points,
            y_points,
            kind="linear",
            bounds_error=False,
            fill_value=(y_points[0], y_points[-1]),  # type: ignore
        )
        density_rgb[..., i] = np.maximum(
            interp_func(np.clip(log_exposure_adjusted, min_logE, max_logE))
            + balance_shifts[i],
            0,
        )
    return density_rgb


def apply_saturation_rgb(image_linear, saturation_factor):
    if saturation_factor == 1.0:
        return image_linear
    luminance = np.einsum(
        "...c, c -> ...", image_linear, np.array([0.2126, 0.7152, 0.0722])
    )
    return np.clip(
        np.expand_dims(luminance, axis=-1)
        + saturation_factor * (image_linear - np.expand_dims(luminance, axis=-1)),
        0.0,
        1.0,
    )


def white_balance_to_d65(image_linear_srgb: np.ndarray) -> np.ndarray:
    """
    Attempts to white balance a linear sRGB image to a D65 illuminant.

    This function is for non-RAW images where the white balance is already baked in.
    It uses the "Gray World" assumption to estimate the current illuminant.

    Args:
        image_linear_srgb: A numpy array representing the image in linear sRGB space.

    Returns:
        A numpy array of the white-balanced image, also in linear sRGB space.
    """
    print("Attempting to force D65 white balance using Gray World estimation...")

    # 1. Estimate the illuminant of the source image.
    # The Gray World assumption states that the average pixel color is the illuminant.
    source_illuminant_rgb = np.mean(image_linear_srgb, axis=(0, 1))

    # Avoid division by zero if the image is black
    if np.all(source_illuminant_rgb < EPSILON):
        return image_linear_srgb

    # 2. Define the input and output color spaces and illuminants.
    # We assume the image is sRGB, which also uses a D65 white point *by definition*.
    # However, if the image has a color cast, its *actual* illuminant is not D65.
    # We are adapting from the *estimated* illuminant back to the *ideal* D65.
    srgb_cs = colour.models.RGB_COLOURSPACE_sRGB

    # The target illuminant is D65. We get its XYZ value from colour-science.
    target_illuminant_xyz = colour.CCS_ILLUMINANTS[
        "CIE 1931 2 Degree Standard Observer"
    ]["D65"]

    # Convert our estimated RGB illuminant to XYZ.
    # source_illuminant_rgb are assumed to be in the sRGB colourspace.
    # srgb_cs is the sRGB colourspace definition, which includes its D65 whitepoint and conversion matrix.
    source_illuminant_xyz = colour.RGB_to_XYZ(
        source_illuminant_rgb,  # RGB values to convert
        srgb_cs,  # Definition of the RGB colourspace these values are in (sRGB)
        srgb_cs.whitepoint,  # CIE XYZ of the illuminant for the sRGB colourspace (D65)
        # This ensures conversion without further chromatic adaptation at this step,
        # as the input RGB values are already adapted to srgb_cs.whitepoint.
    )

    # 3. Perform the chromatic adaptation.
    # We use the Von Kries transform, which is a common and effective method.
    image_adapted_linear_srgb = colour.adaptation.chromatic_adaptation(
        image_linear_srgb,
        source_illuminant_xyz,
        target_illuminant_xyz,
        method="Von Kries",
        transform="CAT02",  # CAT02 is a well-regarded choice
    )

    # 4. Clip to ensure values remain valid.
    return np.clip(image_adapted_linear_srgb, 0.0, 1.0)


def apply_spatial_effects(
    image,
    film_format_mm,
    couplerData: Couplers,
    interlayerData: Interlayer,
    halationData: Halation,
    image_width_px,
):
    sigma_pixels_diffusion = um_to_pixels(
        interlayerData.diffusion_um, image_width_px, film_format_mm
    )
    if sigma_pixels_diffusion > EPSILON:
        image = gaussian_filter(
            image,
            sigma=[sigma_pixels_diffusion, sigma_pixels_diffusion, 0],
            mode="nearest",
        )
    halation_applied, blurred_image_halation = False, np.copy(image)
    strengths, sizes_um = [
        halationData.strength.r,
        halationData.strength.g,
        halationData.strength.b,
    ], [halationData.size_um.r, halationData.size_um.g, halationData.size_um.b]
    for i in range(3):
        sigma_pixels_halation = um_to_pixels(
            sizes_um[i], image_width_px, film_format_mm
        )
        if strengths[i] > EPSILON and sigma_pixels_halation > EPSILON:
            halation_applied = True
            channel_blurred = gaussian_filter(
                image[..., i], sigma=sigma_pixels_halation, mode="nearest"
            )
            blurred_image_halation[..., i] = (
                image[..., i] * (1.0 - strengths[i]) + channel_blurred * strengths[i]
            )
    return (
        np.clip(blurred_image_halation, 0.0, 1.0)
        if halation_applied
        else np.clip(image, 0.0, 1.0)
    )


def calculate_film_log_exposure(
    image_linear_srgb,
    spectral_data: list[SpectralSensitivityCurvePoint],
    ref_illuminant_spd,
    middle_gray_logE,
    EPSILON,
):
    common_shape, common_wavelengths = (
        colour.SpectralShape(380, 780, 5),
        colour.SpectralShape(380, 780, 5).wavelengths,
    )
    sensitivities = np.stack(
        [
            interp1d(
                [p.wavelength for p in spectral_data],
                [p.c for p in spectral_data],
                bounds_error=False,
                fill_value=0,
            )(common_wavelengths),
            interp1d(
                [p.wavelength for p in spectral_data],
                [p.m for p in spectral_data],
                bounds_error=False,
                fill_value=0,
            )(common_wavelengths),
            interp1d(
                [p.wavelength for p in spectral_data],
                [p.y for p in spectral_data],
                bounds_error=False,
                fill_value=0,
            )(common_wavelengths),
        ],
        axis=-1,
    )
    illuminant_aligned = ref_illuminant_spd.copy().align(common_shape)
    mallett_basis_aligned = (
        colour.recovery.MSDS_BASIS_FUNCTIONS_sRGB_MALLETT2019.copy().align(common_shape)
    )
    spectral_reflectance = np.einsum(
        "...c, kc -> ...k", image_linear_srgb, mallett_basis_aligned.values
    )
    light_from_scene = spectral_reflectance * illuminant_aligned.values
    film_exposure_values = np.einsum(
        "...k, ks -> ...s", light_from_scene, sensitivities
    )
    gray_reflectance = np.einsum(
        "c, kc -> k", np.array([0.184, 0.184, 0.184]), mallett_basis_aligned.values
    )
    gray_light = gray_reflectance * illuminant_aligned.values
    exposure_18_gray_film = np.einsum("k, ks -> s", gray_light, sensitivities)
    log_shift = middle_gray_logE - np.log10(exposure_18_gray_film[1] + EPSILON)
    return np.log10(film_exposure_values + EPSILON) + log_shift


def apply_spatial_effects_new(
    image: np.ndarray,
    film_format_mm: int,
    interlayerData: Interlayer,
    halationData: Halation,
    image_width_px: int,
    device: str = "cpu",
    halation_threshold: float = 0.8,
) -> np.ndarray:
    """
    Applies realistic, performant spatial effects (interlayer diffusion and halation).

    Args:
        image (np.ndarray): Input linear image array (H, W, 3).
        film_format_mm (int): Width of the film format in mm.
        interlayerData (Interlayer): Object with interlayer diffusion data.
        halationData (Halation): Object with halation strength and size data.
        image_width_px (int): Width of the input image in pixels.
        device (str): Device for torch operations ('cpu', 'cuda', 'mps').
        halation_threshold (float): Linear brightness value above which halation occurs.

    Returns:
        np.ndarray: Image with spatial effects applied.
    """
    # 1. Setup: Convert numpy image to a torch tensor for GPU processing
    # Tensor shape convention is (Batch, Channels, Height, Width)
    image_tensor = (
        torch.from_numpy(image.astype(np.float32))
        .permute(2, 0, 1)
        .unsqueeze(0)
        .to(device)
    )

    # 2. Interlayer Diffusion: A subtle, overall blur simulating light scatter.
    sigma_pixels_diffusion = um_to_pixels(
        interlayerData.diffusion_um, image_width_px, film_format_mm
    )
    if sigma_pixels_diffusion > EPSILON:
        diffused_tensor = _gpu_separable_gaussian_blur(
            image_tensor, sigma_pixels_diffusion, device
        )
    else:
        diffused_tensor = image_tensor

    # 3. Halation: A more physically-based model
    # a. Create a "halation source" mask from the brightest parts of the image.
    luminance = torch.einsum(
        "bchw, c -> bhw",
        [diffused_tensor, torch.tensor([0.2126, 0.7152, 0.0722]).to(device)],
    ).unsqueeze(1)

    # Apply threshold: only bright areas contribute to the glow.
    halation_source = torch.relu(luminance - halation_threshold)

    # b. Create the colored glow by blurring the source mask with per-channel settings.
    halation_strengths = torch.tensor(
        [halationData.strength.r, halationData.strength.g, halationData.strength.b],
        device=device,
    ).view(1, 3, 1, 1)

    halation_sizes_px = [
        um_to_pixels(halationData.size_um.r, image_width_px, film_format_mm),
        um_to_pixels(halationData.size_um.g, image_width_px, film_format_mm),
        um_to_pixels(halationData.size_um.b, image_width_px, film_format_mm),
    ]

    # Use a downsampling-upsampling pyramid for a very fast, high-quality large blur.
    # This is much faster than a single large convolution.
    halation_glow = torch.zeros_like(diffused_tensor)

    # Initial source for the pyramid
    mip_source = halation_source

    for i in range(4):  # 4 levels of downsampling for a wide, soft glow
        # Blur the current mip level
        mip_blurred = _gpu_separable_gaussian_blur(
            mip_source, 2.0, device
        )  # Small blur at each level

        # Upsample to full size to be added to the final glow
        # Note: 'bilinear' upsampling ensures a smooth result
        upsampled_glow = F.interpolate(
            mip_blurred,
            size=diffused_tensor.shape[2:],
            mode="bilinear",
            align_corners=False,
        )

        # Add this layer's contribution to the total glow
        halation_glow += upsampled_glow

        # Downsample for the next iteration, if not the last level
        if i < 3:
            mip_source = F.max_pool2d(mip_source, kernel_size=2, stride=2)

    # c. Apply color tint and strength, then add the final glow to the image
    # The glow is tinted by the halation strength colors and added to the diffused image
    final_image_tensor = diffused_tensor + (halation_glow * halation_strengths)

    # 4. Finalize: Clip values and convert back to a numpy array
    final_image_tensor = torch.clamp(final_image_tensor, 0.0, 1.0)
    result_numpy = final_image_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy()

    return result_numpy


def calculate_and_apply_exposure_correction(final_image_to_save):
    patch_ratio = 0.8

    # --- Target Patch Extraction ---
    h, w, _ = final_image_to_save.shape
    patch_h = int(h * patch_ratio)
    patch_w = int(w * patch_ratio)

    # Calculate top-left corner of the central patch
    start_h = (h - patch_h) // 2
    start_w = (w - patch_w) // 2

    # Extract the patch using numpy slicing
    patch = final_image_to_save[
        start_h : start_h + patch_h, start_w : start_w + patch_w
    ]

    # --- Colorimetric Measurement of the Patch ---
    # Calculate the mean sRGB value of the patch
    avg_rgb_patch = np.mean(patch, axis=(0, 1))
    print(f"  - Average linear sRGB of patch: {np.round(avg_rgb_patch, 4)}")

    # Define our target: In linear space, middle gray is typically around 0.18 (18%)
    # This corresponds to L*≈46.6 in CIELAB space, not 50
    target_luminance_Y = 0.18  # Standard photographic middle gray

    # For reference, calculate what L* value this corresponds to
    target_lab_L = colour.XYZ_to_Lab(
        np.array([target_luminance_Y, target_luminance_Y, target_luminance_Y])
    )[0]
    print(f"  - Target middle gray L*: {target_lab_L:.1f}")

    # --- Calculating the Scaling Factor ---
    # Convert the average patch sRGB to CIE XYZ
    input_xyz = colour.sRGB_to_XYZ(avg_rgb_patch)
    input_luminance_Y = input_xyz[1]

    print(f"  - Input patch luminance (Y): {input_luminance_Y:.4f}")
    print(f"  - Target middle gray luminance (Y): {target_luminance_Y:.4f}")

    # The scale factor is the ratio of target luminance to input luminance
    # Avoid division by zero for black patches
    if input_luminance_Y < EPSILON:
        scale_factor = 1.0  # Cannot correct a black patch, so do nothing
    else:
        scale_factor = target_luminance_Y / input_luminance_Y

    print(f"  - Calculated exposure scale factor: {scale_factor:.4f}")
    ev_change = np.log2(scale_factor)
    print(f"  - Equivalent EV change: {ev_change:+.2f} EV")
    # cleamp scale_factor to a ev change of +/- 1.5
    if ev_change < -1.5:
        scale_factor = 2**-1.5
    elif ev_change > 1.5:
        scale_factor = 2**1.5
    ev_change = np.log2(scale_factor)
    print(
        f"  - Clamped exposure scale factor: {scale_factor:.4f} (EV change: {ev_change:+.2f})"
    )

    # --- Applying the Correction ---
    # Apply the calculated scale factor to the entire image
    final_image_to_save = final_image_to_save * scale_factor

    # Clip the result to the valid [0.0, 1.0] range
    final_image_to_save = np.clip(final_image_to_save, 0.0, 1.0)
    return final_image_to_save


def apply_shades_of_grey_wb(final_image_to_save):
    print("Applying Shades of Gray white balance...")

    # Parameters for the white balance algorithm
    p = 6  # Minkowski norm parameter (p=1 is Gray World, p=∞ is White Patch)
    clip_percentile = 5  # Percentage of pixels to clip (for robustness)
    epsilon = 1e-10  # Small value to avoid division by zero

    # Helper functions for linearization
    def _linearize_srgb(x):
        # Convert from sRGB to linear RGB
        return np.where(x <= 0.04045, x / 12.92, ((x + 0.055) / 1.055) ** 2.4)

    def _delinearize_srgb(x):
        # Convert from linear RGB to sRGB
        return np.where(x <= 0.0031308, 12.92 * x, 1.055 * (x ** (1 / 2.4)) - 0.055)

        # Work with a copy of the image

    img_float = final_image_to_save.copy()

    # Extract RGB channels
    r = img_float[..., 0]
    g = img_float[..., 1]
    b = img_float[..., 2]

    # Handle clipping of dark and bright pixels for robustness
    if clip_percentile > 0:
        lower = np.percentile(img_float, clip_percentile)
        upper = np.percentile(img_float, 100 - clip_percentile)
        img_float = np.clip(img_float, lower, upper)
        r = img_float[..., 0]
        g = img_float[..., 1]
        b = img_float[..., 2]

        # Calculate illuminant estimate using the Minkowski p-norm
    if p == float("inf"):
        # White Patch (max RGB) case
        illum_r = np.max(r) + epsilon
        illum_g = np.max(g) + epsilon
        illum_b = np.max(b) + epsilon
    else:
        # Standard Minkowski norm
        illum_r = np.power(np.mean(np.power(r, p)), 1 / p) + epsilon
        illum_g = np.power(np.mean(np.power(g, p)), 1 / p) + epsilon
        illum_b = np.power(np.mean(np.power(b, p)), 1 / p) + epsilon

        # The illuminant is the estimated color of the light source
    illuminant = np.array([illum_r, illum_g, illum_b])

    # The "gray" value is typically the average of the illuminant channels
    gray_val = np.mean(illuminant)

    # Calculate scaling factors
    scale_r = gray_val / illum_r
    scale_g = gray_val / illum_g
    scale_b = gray_val / illum_b

    # Apply scaling factors to the original (non-clipped) image
    r_orig = final_image_to_save[..., 0]
    g_orig = final_image_to_save[..., 1]
    b_orig = final_image_to_save[..., 2]

    # Apply the white balance correction
    balanced_r = r_orig * scale_r
    balanced_g = g_orig * scale_g
    balanced_b = b_orig * scale_b

    # Merge channels
    balanced_img = np.stack([balanced_r, balanced_g, balanced_b], axis=-1)

    # Clip to valid range
    final_image_to_save = np.clip(balanced_img, 0.0, 1.0)
    return final_image_to_save


def apply_darktable_color_calibration(final_image_to_save):
    print("Applying DarkTable-style Color Calibration white balance...")

    # --- Illuminant Estimation ---
    # Get original shape and prepare for sampling
    height, width, _ = final_image_to_save.shape

    # Sample the central 80% of the image area to estimate the illuminant
    # This is more robust than using the whole image, which might have black borders etc.
    area_ratio = 0.8
    scale = np.sqrt(area_ratio)
    sample_width = int(width * scale)
    sample_height = int(height * scale)
    x_offset = (width - sample_width) // 2
    y_offset = (height - sample_height) // 2

    # Extract the central patch for sampling
    patch = final_image_to_save[
        y_offset : y_offset + sample_height, x_offset : x_offset + sample_width
    ]

    # Estimate the source illuminant using the Gray World assumption on the patch
    # The average color of the scene is assumed to be the illuminant color.
    source_rgb_avg = np.mean(patch, axis=(0, 1))

    # Avoid issues with pure black patches
    if np.all(source_rgb_avg < EPSILON):
        print("  - Patch is black, skipping white balance.")
        return final_image_to_save

    # Convert the average RGB value to CIE XYZ. This represents our source illuminant.
    # We use the sRGB colorspace definition, which assumes a D65 reference white for the RGB values.
    source_illuminant_xyz = colour.RGB_to_XYZ(
        source_rgb_avg,
        colour.models.RGB_COLOURSPACE_sRGB,
        colour.models.RGB_COLOURSPACE_sRGB.whitepoint,
    )

    # --- Chromatic Adaptation ---
    # Define the target illuminant. For sRGB output, this should be D65.
    # Using D65 ensures that neutral colors in the scene are mapped to neutral
    # colors in the final sRGB image.
    target_illuminant_xyz = colour.CCS_ILLUMINANTS[
        "CIE 1931 2 Degree Standard Observer"
    ]["D65"]

    # Ensure target_illuminant_xyz has 3 components (sometimes returns only x,y)
    if len(target_illuminant_xyz) == 2:
        # Convert xyY to XYZ assuming Y=1
        x, y = target_illuminant_xyz
        X = x / y
        Y = 1.0
        Z = (1 - x - y) / y
        target_illuminant_xyz = np.array([X, Y, Z])

    print(f"  - Source Illuminant (XYZ): {np.round(source_illuminant_xyz, 3)}")
    print(f"  - Target Illuminant (XYZ): {np.round(target_illuminant_xyz, 3)}")

    # Apply chromatic adaptation to the entire image.
    # This transforms the image colors as if the scene was lit by the target illuminant.
    # CAT16 is a modern and accurate Chromatic Adaptation Transform.
    final_image_to_save = colour.adaptation.chromatic_adaptation(
        final_image_to_save,
        source_illuminant_xyz,  # Source illuminant XYZ
        target_illuminant_xyz,  # Target illuminant (D65)
        method="Von Kries",
        transform="CAT16",
    )

    # Clip to valid range and return
    final_image_to_save = np.clip(final_image_to_save, 0.0, 1.0)
    return final_image_to_save


def main():
    parser = argparse.ArgumentParser(
        description="Simulate film stock color characteristics using a datasheet JSON."
    )
    parser.add_argument(
        "input_image",
        help="Path to the input RGB image (e.g., PNG, TIFF). Assumed linear RGB.",
    )
    parser.add_argument("datasheet_json", help="Path to the film datasheet JSON file.")
    parser.add_argument("output_image", help="Path to save the output emulated image.")
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Force regeneration of LUTs and do not save them.",
    )
    parser.add_argument(
        "--force-d65",
        action="store_true",
        help="Force white balance when loading RAW files.",
    )
    # --- Modified argument for border ---
    parser.add_argument(
        "--border",
        action="store_true",
        help="Add a 10%% border with the film base color and stock info.",
    )
    parser.add_argument(
        "--gpu", action="store_true", help="Use GPU for LUT processing if available."
    )
    parser.add_argument(
        "--print-film-base-color",
        help="Outputs the film base color determined from HD curve as a CIEXYZ value (D65 illuminant).",
        action="store_true",
    )
    parser.add_argument(
        "--perform-negative-correction",
        action="store_true",
        help="Apply negative film correction based on the datasheet base color.",
    )
    parser.add_argument(
        "--perform-white-balance",
        action="store_true",
        help="Apply white balance to the final result. This is only effective when also using --perform-negative-correction.",
    )
    parser.add_argument(
        "--perform-exposure-correction",
        action="store_true",
        help="Apply exposure correction to the final result. This is only effective when also using --perform-negative-correction. Exposure correction is applied before white balance (if white balancing).",
    )
    parser.add_argument(
        "--raw-auto-exposure",
        action="store_true",
        help="Automatically adjust exposure for RAW images on load.",
    )
    parser.add_argument(
        "--simulate-grain",
        action="store_true",
        help="Simulate film grain in the output image.",
    )
    parser.add_argument(
        "--mono-grain",
        action="store_true",
        help="Simulate monochrome film grain in the output image.",
    )
    args = parser.parse_args()

    datasheet: FilmDatasheet | None = parse_datasheet_json(args.datasheet_json)
    if datasheet is None:
        sys.exit(1)
    print(
        f"Simulating: {datasheet.info.name} ({datasheet.info.format_mm}mm) (v{datasheet.info.version})\n\t{datasheet.info.description}"
    )

    lut_device = "cuda" if args.gpu and torch.cuda.is_available() else "cpu"
    # Check for Apple ARM support
    if lut_device == "cuda" and torch.backends.mps.is_available():
        print("Using Apple MPS backend for GPU acceleration.")
        lut_device = "mps"

    # --- LUT Generation and Pre-computation ---
    cache_dir = Path.home() / ".filmsim"
    # Sanitize datasheet name for use in a filename
    safe_name = "".join(
        c for c in datasheet.info.name if c.isalnum() or c in (" ", "_")
    ).rstrip()
    base_filename = (
        f"{safe_name.replace(' ', '_')}_v{datasheet.info.version}_size{LUT_SIZE}"
    )
    exposure_lut_path = cache_dir / f"{base_filename}_exposure.cube"
    naive_density_lut_path = cache_dir / f"{base_filename}_naive_density.cube"
    uncoupled_density_lut_path = cache_dir / f"{base_filename}_uncoupled_density.cube"
    user_consent_to_save = None

    middle_gray_logE = float(datasheet.properties.calibration.middle_gray_logE)

    exposure_lut: Optional[np.ndarray] = None  # Initialize exposure_lut

    if not args.no_cache and exposure_lut_path.exists():
        try:
            print(f"Loading exposure LUT from cache: {exposure_lut_path}")
            loaded_obj = colour.read_LUT(str(exposure_lut_path))
            if isinstance(loaded_obj, colour.LUT3D):
                exposure_lut = loaded_obj.table
                print(f"Successfully loaded exposure LUT from {exposure_lut_path}")
            else:
                # Log a warning and fall through to regeneration
                print(
                    f"Warning: Cached exposure LUT {exposure_lut_path} is not a LUT3D (type: {type(loaded_obj)}). Will regenerate."
                )
        except Exception as e:
            # Log a warning and fall through to regeneration
            print(
                f"Warning: Error loading exposure LUT from {exposure_lut_path}: {e}. Will regenerate."
            )

    # If LUT wasn't loaded from cache (or cache was disabled, or loading failed), generate it
    if exposure_lut is None:
        print("Generating new exposure LUT...")
        generated_exposure_lut_table = create_exposure_lut(
            datasheet.properties.curves.spectral_sensitivity,
            SDS_ILLUMINANTS["D65"],
            middle_gray_logE,
            size=LUT_SIZE,
        )
        exposure_lut = (
            generated_exposure_lut_table  # Assign to the main variable used later
        )

        # Save the newly generated LUT if caching is enabled
        if not args.no_cache:
            if (
                user_consent_to_save is None
            ):  # Ask for consent only once per run if needed
                response = input(
                    f"Save generated LUTs to {cache_dir} for future use? [Y/n]: "
                ).lower()
                user_consent_to_save = response in ("y", "yes", "")

            if user_consent_to_save:
                try:
                    cache_dir.mkdir(parents=True, exist_ok=True)
                    colour.write_LUT(
                        colour.LUT3D(
                            table=generated_exposure_lut_table,
                            name=f"{base_filename}_exposure",
                        ),
                        str(exposure_lut_path),
                    )
                    print(f"Saved exposure LUT to {exposure_lut_path}")
                except Exception as e:
                    print(
                        f"Error saving exposure LUT to {exposure_lut_path}: {e}",
                        file=sys.stderr,
                    )

    # Ensure exposure_lut is now populated. If not, something went wrong.
    if exposure_lut is None:
        # This case should ideally not be reached if create_exposure_lut always returns a valid LUT or raises an error.
        print(
            "Critical error: Exposure LUT could not be loaded or generated. Exiting.",
            file=sys.stderr,
        )
        sys.exit(1)

    # For density LUTs, we need to know the typical range of log exposure values
    # We can estimate this from the H&D curve data itself.
    log_e_points = [p.d for p in datasheet.properties.curves.hd if p.d is not None]
    log_exposure_range = (log_e_points[0], log_e_points[-1])

    density_lut_naive: Optional[np.ndarray] = None  # Initialize naive density LUT
    if not args.no_cache and naive_density_lut_path.exists():
        try:
            print(f"Loading naive density LUT from cache: {naive_density_lut_path}")
            loaded_obj = colour.read_LUT(str(naive_density_lut_path))
            if isinstance(loaded_obj, colour.LUT3D):
                density_lut_naive = loaded_obj.table
                print(
                    f"Successfully loaded naive density LUT from {naive_density_lut_path}"
                )
            else:
                # Log a warning and fall through to regeneration
                print(
                    f"Warning: Cached naive density LUT {naive_density_lut_path} is not a LUT3D (type: {type(loaded_obj)}). Will regenerate."
                )
        except Exception as e:
            # Log a warning and fall through to regeneration
            print(
                f"Warning: Error loading naive density LUT from {naive_density_lut_path}: {e}. Will regenerate."
            )

    if density_lut_naive is None:
        print("Generating new naive density LUT...")
        density_lut_naive = create_density_lut(
            datasheet.processing,
            datasheet.properties.curves.hd,
            middle_gray_logE,
            log_exposure_range,
            size=LUT_SIZE,
        )
        if not args.no_cache:
            if user_consent_to_save is None:
                response = input(
                    f"Save generated naive density LUT to {cache_dir} for future use? [Y/n]: "
                ).lower()
                user_consent_to_save = response in ("y", "yes", "")
            if user_consent_to_save:
                try:
                    cache_dir.mkdir(parents=True, exist_ok=True)
                    colour.write_LUT(
                        colour.LUT3D(
                            table=density_lut_naive,
                            name=f"{base_filename}_naive_density",
                        ),
                        str(naive_density_lut_path),
                    )
                    print(f"Saved naive density LUT to {naive_density_lut_path}")
                except Exception as e:
                    print(
                        f"Error saving naive density LUT to {naive_density_lut_path}: {e}",
                        file=sys.stderr,
                    )

    if density_lut_naive is None:
        # This case should ideally not be reached if create_density_lut always returns a valid LUT or raises an error.
        print(
            "Critical error: Naive density LUT could not be loaded or generated. Exiting.",
            file=sys.stderr,
        )
        sys.exit(1)

    inhibitor_matrix = compute_inhibitor_matrix(
        datasheet.properties.couplers.dir_amount_rgb,
        datasheet.properties.couplers.dir_diffusion_interlayer,
    )

    density_lut_uncoupled: Optional[np.ndarray] = None  # Initialize
    if not args.no_cache and uncoupled_density_lut_path.exists():
        try:
            print(
                f"Loading uncoupled density LUT from cache: {uncoupled_density_lut_path}"
            )
            loaded_obj = colour.read_LUT(str(uncoupled_density_lut_path))
            if isinstance(loaded_obj, colour.LUT3D):
                density_lut_uncoupled = loaded_obj.table
                print(
                    f"Successfully loaded uncoupled density LUT from {uncoupled_density_lut_path}"
                )
            else:
                # Log a warning and fall through to regeneration
                print(
                    f"Warning: Cached uncoupled density LUT {uncoupled_density_lut_path} is not a LUT3D (type: {type(loaded_obj)}). Will regenerate."
                )
        except Exception as e:
            # Log a warning and fall through to regeneration
            print(
                f"Warning: Error loading uncoupled density LUT from {uncoupled_density_lut_path}: {e}. Will regenerate."
            )

    if density_lut_uncoupled is None:
        uncoupled_hd_curve = compute_uncoupled_hd_curves(
            datasheet.properties.curves.hd, inhibitor_matrix
        )
        density_lut_uncoupled = create_density_lut(
            datasheet.processing,
            uncoupled_hd_curve,
            middle_gray_logE,
            log_exposure_range,
            size=LUT_SIZE,
        )
        if not args.no_cache:
            if user_consent_to_save is None:
                response = input(
                    f"Save generated uncoupled density LUT to {cache_dir} for future use? [Y/n]: "
                ).lower()
                user_consent_to_save = response in ("y", "yes", "")
            if user_consent_to_save:
                try:
                    cache_dir.mkdir(parents=True, exist_ok=True)
                    colour.write_LUT(
                        colour.LUT3D(
                            table=density_lut_uncoupled,
                            name=f"{base_filename}_uncoupled_density",
                        ),
                        str(uncoupled_density_lut_path),
                    )
                    print(
                        f"Saved uncoupled density LUT to {uncoupled_density_lut_path}"
                    )
                except Exception as e:
                    print(
                        f"Error saving uncoupled density LUT to {uncoupled_density_lut_path}: {e}",
                        file=sys.stderr,
                    )

    if density_lut_uncoupled is None:
        # This case should ideally not be reached if create_density_lut always returns a valid LUT or raises an error.
        print(
            "Critical error: Uncoupled density LUT could not be loaded or generated. Exiting.",
            file=sys.stderr,
        )
        sys.exit(1)

    # --- Load and Prepare Input Image ---
    try:
        if any(
            args.input_image.lower().endswith(ext) for ext in [".dng", ".raw", ".arw"]
        ):
            with rawpy.imread(args.input_image) as raw:
                wb_params = {}
                if args.force_d65:
                    # Use a neutral white balance if forcing D65, as post-process will handle it.
                    # Note: rawpy's 'camera' wb is often the best starting point. Forcing D65 might
                    # be better handled by a different method if colors seem off.
                    wb_params = {"user_wb": (1.0, 1.0, 1.0, 1.0)}
                else:
                    wb_params = {"use_camera_wb": True}

                image_raw = raw.postprocess(
                    demosaic_algorithm=rawpy.DemosaicAlgorithm.AHD,  # type: ignore
                    output_bps=16,
                    gamma=(1, 1),  # Linear gamma
                    no_auto_bright=args.raw_auto_exposure,  # type: ignore
                    output_color=rawpy.ColorSpace.sRGB,  # type: ignore
                    **wb_params,
                )
                image_linear = image_raw.astype(np.float64) / 65535.0
                print("RAW file processed to linear floating point directly.")
        else:
            image_raw = iio.imread(args.input_image)
            image_float = image_raw.astype(np.float64) / (
                65535.0 if image_raw.dtype == np.uint16 else 255.0
            )
            image_linear = (
                colour.cctf_decoding(image_float, function="sRGB")
                if image_raw.dtype in (np.uint8, np.uint16)
                else image_float
            )
            if args.force_d65:
                image_linear = white_balance_to_d65(image_linear)
    except Exception as e:
        print(f"Error reading input image: {e}", file=sys.stderr)
        sys.exit(1)

    if image_linear.shape[-1] > 3:
        image_linear = image_linear[..., :3]
    image_linear, image_width_px = np.maximum(image_linear, 0.0), image_linear.shape[1]

    save_debug_image(image_linear, "01_input_linear_sRGB")

    # --- Pipeline Steps ---
    ### PERFORMANCE ###
    # 1. Convert Linear RGB to Log Exposure using the LUT
    print("Applying exposure LUT...")
    log_exposure_rgb = apply_3d_lut(image_linear, exposure_lut, device=lut_device)
    save_debug_image(log_exposure_rgb, "02_log_exposure_RGB")

    # 2. Apply DIR Coupler Simulation
    print("Applying DIR coupler simulation...")
    # 2a. Calculate "naive" density using its LUT
    naive_density_rgb = apply_3d_lut(
        (log_exposure_rgb - log_exposure_range[0])
        / (log_exposure_range[1] - log_exposure_range[0]),
        density_lut_naive,
        device=lut_device,
    )
    save_debug_image(naive_density_rgb, "03_naive_density_RGB")
    # naive_density_rgb = apply_hd_curves(log_exposure_rgb, datasheet.processing, datasheet.properties.curves.hd, middle_gray_logE)  # Apply H&D curves to naive density

    # 2b. Use this density to modify log exposure
    modified_log_exposure_rgb = apply_dir_coupler_simulation(
        log_exposure_rgb,
        naive_density_rgb,
        inhibitor_matrix,
        datasheet.properties.couplers.dir_diffusion_um,
        datasheet.info.format_mm,
        image_width_px,
    )
    save_debug_image(modified_log_exposure_rgb, "04_modified_log_exposure_RGB")

    # 3. Apply the *uncoupled* density LUT to the *modified* exposure
    print("Applying uncoupled density LUT...")
    # Normalize the modified log exposure to the [0,1] range for LUT lookup
    norm_log_exposure = (modified_log_exposure_rgb - log_exposure_range[0]) / (
        log_exposure_range[1] - log_exposure_range[0]
    )
    density_rgb = apply_3d_lut(
        np.clip(norm_log_exposure, 0, 1), density_lut_uncoupled, device=lut_device
    )
    save_debug_image(density_rgb, "05_uncoupled_density_RGB")

    # (Rest of the pipeline is unchanged)
    print("Converting density to linear transmittance...")
    linear_transmittance = np.clip(10.0 ** (-density_rgb), 0.0, 1.0)
    save_debug_image(linear_transmittance, "06_linear_transmittance_RGB")
    print("Applying spatial effects (diffusion, halation)...")
    linear_post_spatial = apply_spatial_effects_new(
        linear_transmittance,
        datasheet.info.format_mm,
        datasheet.properties.interlayer,
        datasheet.properties.halation,
        image_width_px,
    )

    save_debug_image(linear_post_spatial, "07_linear_post_spatial_RGB")
    print("Applying saturation adjustment...")
    linear_post_saturation = apply_saturation_rgb(
        linear_post_spatial, datasheet.properties.couplers.saturation_amount
    )
    save_debug_image(linear_post_saturation, "08_linear_post_saturation_RGB")

    final_image_to_save = linear_post_saturation

    # 1. Get Dmin (minimum density) from the first point of the H&D curve.
    # This represents the density of the unexposed film base (the orange mask).
    dmin_r = datasheet.properties.curves.hd[0].r
    dmin_g = datasheet.properties.curves.hd[0].g
    dmin_b = datasheet.properties.curves.hd[0].b

    # 2. The final density is also affected by the balance shifts.
    dmin_r += datasheet.processing.balance.r_shift
    dmin_g += datasheet.processing.balance.g_shift
    dmin_b += datasheet.processing.balance.b_shift

    base_density = np.array([dmin_r, dmin_g, dmin_b])

    # 3. Convert this final base density to a linear color value.
    film_base_color_linear_rgb = 10.0 ** (-base_density)

    if args.border or args.print_film_base_color:

        # 3. Convert this final base density to a linear color value.
        base_color_linear = 10.0 ** (-base_density)
        if args.print_film_base_color:
            # Convert the base color to CIEXYZ using D65 illuminant
            base_color_xyz = colour.RGB_to_XYZ(
                base_color_linear,
                colour.models.RGB_COLOURSPACE_sRGB,
                colour.models.RGB_COLOURSPACE_sRGB.whitepoint,
            )
            print(f"Film base color (D65): {base_color_xyz} (XYZ)")

        if args.border:
            print("Adding film base colored border...")
            # 4. Create the new image with a 2.25% border on all sides.
            h, w, _ = final_image_to_save.shape
            border_h = int(h * 0.0225)
            border_w = int(w * 0.0225)

            bordered_image = np.full(
                (h + 2 * border_h, w + 2 * border_w, 3),
                base_color_linear,
                dtype=final_image_to_save.dtype,
            )

            # 5. Place the original rendered image in the center.
            bordered_image[border_h : h + border_h, border_w : w + border_w, :] = (
                final_image_to_save
            )
            try:
                from PIL import Image, ImageDraw, ImageFont

                # Convert the float image (0.0-1.0) to a uint8 image (0-255) for Pillow
                bordered_image_uint8 = (np.clip(bordered_image, 0.0, 1.0) * 255).astype(
                    np.uint8
                )
                pil_image = Image.fromarray(bordered_image_uint8)
                draw = ImageDraw.Draw(pil_image)

                # Prepare the text
                text_to_draw = f"{datasheet.info.name.upper()} {datasheet.properties.calibration.iso}"
                text_to_repeat = f"  {text_to_draw}  "

                # Find a suitable monospaced font, with size relative to image height
                font_size = max(
                    4, int(border_h * 0.75)
                )  # At least 4px, 4% of original height otherwise
                font = None
                font_paths = [
                    "Menlo.ttc",
                    "Consolas.ttf",
                    "DejaVuSansMono.ttf",
                    "cour.ttf",
                ]
                for path in font_paths:
                    try:
                        font = ImageFont.truetype(path, size=font_size)
                        break
                    except IOError:
                        continue
                if font is None:
                    print(
                        "Warning: A common monospaced font not found. Using Pillow's default font."
                    )
                    try:
                        font = ImageFont.load_default(size=font_size)
                    except AttributeError:  # For older Pillow versions
                        font = ImageFont.load_default()

                # Calculate text size using the more accurate textbbox
                _, _, text_width, text_height = draw.textbbox(
                    (0, 0), text_to_repeat, font=font
                )

                # Calculate vertical positions to center text in the border
                y_top = (border_h - text_height) // 2
                y_bottom = (h + border_h) + (border_h - text_height) // 2

                # Start drawing with a 2% margin
                start_x = int(w * 0.02)

                # Repeat text across top and bottom borders
                for y_pos in [y_top, y_bottom]:
                    x = start_x
                    while x < pil_image.width:
                        draw.text(
                            (x, y_pos), text_to_repeat, font=font, fill=(200, 200, 200)
                        )  # Light gray fill
                        x += text_width

                # Convert back to float numpy array for saving
                final_image_to_save = np.array(pil_image).astype(np.float64) / 255.0

                save_debug_image(final_image_to_save, "09_bordered_image_RGB")
            except ImportError:
                print(
                    "Warning: Pillow library not found. Skipping text drawing on border.",
                    file=sys.stderr,
                )
                final_image_to_save = bordered_image
            except Exception as e:
                print(f"An error occurred during text drawing: {e}", file=sys.stderr)
                final_image_to_save = bordered_image

    # Apply Film Negative Correction if requested
    if args.perform_negative_correction:
        print("Applying film negative correction...")
        print("Film base color:", film_base_color_linear_rgb)
        masked_removed = final_image_to_save / film_base_color_linear_rgb
        inverted_image = 1.0 / (masked_removed + EPSILON)  # Avoid division by zero
        max_val = np.percentile(inverted_image, 99.9)
        final_image_to_save = np.clip(inverted_image / max_val, 0.0, 1.0)
        save_debug_image(final_image_to_save, "10_negative_corrected_image_RGB")

    # Apply White Balance Correction
    DO_SHADES_OF_GRAY = True  # Use Shades of Gray algorithm for white balance
    if args.perform_white_balance:
        if not args.perform_negative_correction:
            print(
                "Warning: White balance correction is only effective when using --perform-negative-correction. Ignoring flag."
            )
        else:
            if DO_SHADES_OF_GRAY:
                final_image_to_save = apply_shades_of_grey_wb(final_image_to_save)
                save_debug_image(
                    final_image_to_save, "11_shades_of_gray_corrected_image_RGB"
                )
            else:
                final_image_to_save = apply_darktable_color_calibration(
                    final_image_to_save
                )
                save_debug_image(
                    final_image_to_save,
                    "11_darktable_color_calibration_corrected_image_RGB",
                )

    # Apply Exposure Correction
    if args.perform_exposure_correction:
        print("Applying exposure correction...")
        if not args.perform_negative_correction:
            print(
                "Warning: Exposure correction is only effective when using --perform-negative-correction. Ignoring flag."
            )
        else:
            # Define the patch ratio for measurement
            final_image_to_save = calculate_and_apply_exposure_correction(
                final_image_to_save
            )
            save_debug_image(final_image_to_save, "12_exposure_corrected_image_RGB")

    # Apply Tone Curve Correction

    # Apply Film Grain
    if args.simulate_grain:
        print("Simulating film grain...")
        # Import warp if available for GPU acceleration
        try:
            wp_available = True
            wp.init()
            print("Using NVIDIA Warp for GPU-accelerated film grain simulation")
        except ImportError:
            wp_available = False
            print("Warp not available. Using CPU for film grain (slower)")

        # Get ISO from film datasheet
        iso = datasheet.properties.calibration.iso
        height, width = final_image_to_save.shape[:2]

        # Function to get grain parameters based on image dimensions and ISO
        def get_grain_parameters(width, height, iso):
            # --- Baseline Parameters (calibrated for a 24MP image @ ISO 400) ---
            PIXELS_BASE = 24_000_000.0
            ISO_BASE = 400.0
            MU_R_BASE = 0.075
            SIGMA_BASE = 0.25

            # --- Scaling Exponents (Artistically chosen for a natural feel) ---
            ISO_EXPONENT_MU = 0.4
            ISO_EXPONENT_SIGMA = 0.3

            # Clamp ISO to a reasonable range
            iso = max(64.0, min(iso, 8000.0))

            # Calculate the total number of pixels in the actual image
            pixels_actual = float(width * height)

            # Calculate the resolution scaler
            resolution_scaler = math.sqrt(pixels_actual / PIXELS_BASE)
            print(
                f"Resolution scaler: {resolution_scaler:.4f} (for {width}x{height} image)"
            )

            # Calculate the ISO scaler
            iso_ratio = iso / ISO_BASE
            iso_scaler_mu = iso_ratio**ISO_EXPONENT_MU
            iso_scaler_sigma = iso_ratio**ISO_EXPONENT_SIGMA
            print(
                f"ISO scaler: μ = {iso_scaler_mu:.4f}, σ = {iso_scaler_sigma:.4f} (for ISO {iso})"
            )

            # Calculate the final parameters by applying both scalers
            final_mu_r = MU_R_BASE * resolution_scaler * iso_scaler_mu
            final_sigma = SIGMA_BASE * resolution_scaler * iso_scaler_sigma

            return (final_mu_r, final_sigma)

        if wp_available:
            # Define Warp functions and kernels
            @wp.func
            def w_func(x: float):
                if x >= 2.0:
                    return 0.0
                elif x < 0.0:
                    return 1.0
                else:
                    acos_arg = x / 2.0
                    if acos_arg > 1.0:
                        acos_arg = 1.0
                    if acos_arg < -1.0:
                        acos_arg = -1.0

                    sqrt_term_arg = 1.0 - acos_arg * acos_arg
                    if sqrt_term_arg < 0.0:
                        sqrt_term_arg = 0.0

                    overlap_area_over_pi = (
                        2.0 * wp.acos(acos_arg) - x * wp.sqrt(sqrt_term_arg)
                    ) / wp.pi
                    return overlap_area_over_pi

            @wp.func
            def CB_const_radius_unit(u_pixel: float, x: float):
                safe_u = wp.min(u_pixel, 0.99999)
                if safe_u < 0.0:
                    safe_u = 0.0

                one_minus_u = 1.0 - safe_u
                if one_minus_u <= 1e-9:
                    return 0.0

                wx = w_func(x)
                exponent = 2.0 - wx

                term1 = wp.pow(one_minus_u, exponent)
                term2 = one_minus_u * one_minus_u
                return term1 - term2

            @wp.kernel
            def generate_noise_kernel(
                u_image: wp.array2d(dtype=float),
                variance_lut: wp.array(dtype=float),
                noise_out: wp.array2d(dtype=float),
                mu_r: float,
                sigma_filter: float,
                seed: int,
            ):
                ix, iy = wp.tid()
                height = u_image.shape[0]
                width = u_image.shape[1]
                if ix >= height or iy >= width:
                    return

                lut_size = variance_lut.shape[0]
                u_val = u_image[ix, iy]

                lut_pos = u_val * float(lut_size - 1)
                lut_index0 = int(lut_pos)
                lut_index0 = wp.min(wp.max(lut_index0, 0), lut_size - 2)
                lut_index1 = lut_index0 + 1
                t = lut_pos - float(lut_index0)
                if t < 0.0:
                    t = 0.0
                if t > 1.0:
                    t = 1.0

                integral_val = wp.lerp(
                    variance_lut[lut_index0], variance_lut[lut_index1], t
                )

                var_bp = 0.0
                if sigma_filter > 1e-6 and mu_r > 1e-6:
                    var_bp = wp.max(
                        0.0,
                        (mu_r * mu_r)
                        / (2.0 * sigma_filter * sigma_filter)
                        * integral_val,
                    )

                std_dev = wp.sqrt(var_bp)
                state = wp.rand_init(seed, ix * width + iy + seed)
                noise_sample = wp.randn(state) * std_dev
                noise_out[ix, iy] = noise_sample

            @wp.kernel
            def convolve_2d_kernel(
                input_array: wp.array2d(dtype=float),
                kernel: wp.array(dtype=float),
                kernel_radius: int,
                output_array: wp.array2d(dtype=float),
            ):
                ix, iy = wp.tid()
                height = input_array.shape[0]
                width = input_array.shape[1]
                if ix >= height or iy >= width:
                    return

                kernel_dim = 2 * kernel_radius + 1
                accum = float(0.0)

                for ky_offset in range(kernel_dim):
                    for kx_offset in range(kernel_dim):
                        k_idx = ky_offset * kernel_dim + kx_offset
                        weight = kernel[k_idx]

                        # Image coordinates to sample from
                        read_row = ix + (ky_offset - kernel_radius)
                        read_col = iy + (kx_offset - kernel_radius)

                        clamped_row = wp.max(0, wp.min(read_row, height - 1))
                        clamped_col = wp.max(0, wp.min(read_col, width - 1))

                        sample_val = input_array[clamped_row, clamped_col]
                        accum += weight * sample_val
                output_array[ix, iy] = accum

            @wp.kernel
            def add_rgb_noise_and_clip_kernel(
                r_in: wp.array2d(dtype=float),
                g_in: wp.array2d(dtype=float),
                b_in: wp.array2d(dtype=float),
                noise_r: wp.array2d(dtype=float),
                noise_g: wp.array2d(dtype=float),
                noise_b: wp.array2d(dtype=float),
                r_out: wp.array2d(dtype=float),
                g_out: wp.array2d(dtype=float),
                b_out: wp.array2d(dtype=float),
            ):
                ix, iy = wp.tid()

                height = r_in.shape[0]
                width = r_in.shape[1]
                if ix >= height or iy >= width:
                    return

                r_out[ix, iy] = wp.clamp(r_in[ix, iy] + noise_r[ix, iy], 0.0, 1.0)
                g_out[ix, iy] = wp.clamp(g_in[ix, iy] + noise_g[ix, iy], 0.0, 1.0)
                b_out[ix, iy] = wp.clamp(b_in[ix, iy] + noise_b[ix, iy], 0.0, 1.0)

            def integrand_variance(x, u_pixel):
                if x < 0:
                    return 0.0
                if x >= 2.0:
                    return 0.0

                safe_u = np.clip(u_pixel, 0.0, 0.99999)
                one_minus_u = 1.0 - safe_u
                if one_minus_u <= 1e-9:
                    return 0.0

                acos_arg = x / 2.0
                if acos_arg > 1.0:
                    acos_arg = 1.0
                if acos_arg < -1.0:
                    acos_arg = -1.0

                sqrt_term_arg = 1.0 - acos_arg * acos_arg
                if sqrt_term_arg < 0.0:
                    sqrt_term_arg = 0.0

                wx = (2.0 * np.arccos(acos_arg) - x * np.sqrt(sqrt_term_arg)) / np.pi

                cb = np.power(one_minus_u, 2.0 - wx) - np.power(one_minus_u, 2.0)
                return cb * x

            def precompute_variance_lut(num_u_samples=256):
                print(f"Precomputing variance LUT with {num_u_samples+1} entries...")
                u_values_for_lut = np.linspace(
                    0.0, 1.0, num_u_samples + 1, endpoint=True
                )
                lut = np.zeros(num_u_samples + 1, dtype=np.float32)

                for i, u in enumerate(u_values_for_lut):
                    result, error = quad(
                        integrand_variance, 0, 2, args=(u,), epsabs=1e-6, limit=100
                    )
                    if result < 0:
                        result = 0.0
                    lut[i] = result
                    if i % ((num_u_samples + 1) // 10) == 0:
                        print(f"  LUT progress: {i}/{num_u_samples+1}")
                print("Variance LUT computed.")
                return lut

            def create_gaussian_kernel_2d(sigma, radius):
                kernel_size = 2 * radius + 1
                g = gaussian(kernel_size, sigma, sym=True)
                kernel_2d = np.outer(g, g)
                sum_sq = np.sum(kernel_2d**2)
                if sum_sq > 1e-9:
                    kernel_2d /= np.sqrt(sum_sq)
                return kernel_2d.flatten().astype(np.float32)

            # Calculate grain parameters
            mu_r, sigma_filter = get_grain_parameters(width, height, iso)
            print(f"Film grain parameters: μr = {mu_r}, σ_filter = {sigma_filter}")

            # Use 256 u_samples for LUT
            variance_lut_np = precompute_variance_lut(num_u_samples=256)
            variance_lut_wp = wp.array(variance_lut_np, dtype=float, device="cuda")

            kernel_radius = max(1, int(np.ceil(3 * sigma_filter)))
            kernel_np = create_gaussian_kernel_2d(sigma_filter, kernel_radius)
            kernel_wp = wp.array(kernel_np, dtype=float, device="cuda")
            print(
                f"Using Gaussian filter with sigma={sigma_filter}, radius={kernel_radius}"
            )

            # Prepare image arrays on GPU
            img_r = final_image_to_save[:, :, 0]
            img_g = final_image_to_save[:, :, 1]
            img_b = final_image_to_save[:, :, 2]

            r_original_wp = wp.array(img_r, dtype=float, device="cuda")
            g_original_wp = wp.array(img_g, dtype=float, device="cuda")
            b_original_wp = wp.array(img_b, dtype=float, device="cuda")

            # Allocate noise arrays
            noise_r_unfiltered_wp = wp.empty_like(r_original_wp)
            noise_g_unfiltered_wp = wp.empty_like(g_original_wp)
            noise_b_unfiltered_wp = wp.empty_like(b_original_wp)

            noise_r_filtered_wp = wp.empty_like(r_original_wp)
            noise_g_filtered_wp = wp.empty_like(g_original_wp)
            noise_b_filtered_wp = wp.empty_like(b_original_wp)

            # Output arrays
            r_output_wp = wp.empty_like(r_original_wp)
            g_output_wp = wp.empty_like(g_original_wp)
            b_output_wp = wp.empty_like(b_original_wp)

            # Use a random seed based on the ISO value for consistency
            seed = args.seed if hasattr(args, "seed") else int(iso)

            # Generate and apply noise for each channel
            if args.mono_grain:
                # Create luminance image
                img_gray_np = 0.299 * img_r + 0.587 * img_g + 0.114 * img_b
                u_gray_wp = wp.array(img_gray_np, dtype=float, device="cuda")
                noise_image_wp = wp.empty_like(u_gray_wp)

                print("Generating monochromatic noise...")
                wp.launch(
                    kernel=generate_noise_kernel,
                    dim=(height, width),
                    inputs=[
                        u_gray_wp,
                        variance_lut_wp,
                        noise_image_wp,
                        mu_r,
                        sigma_filter,
                        seed,
                    ],
                    device="cuda",
                )

                noise_filtered_wp = wp.empty_like(u_gray_wp)
                wp.launch(
                    kernel=convolve_2d_kernel,
                    dim=(height, width),
                    inputs=[
                        noise_image_wp,
                        kernel_wp,
                        kernel_radius,
                        noise_filtered_wp,
                    ],
                    device="cuda",
                )

                # Copy the same noise to all channels
                noise_r_filtered_wp.assign(noise_filtered_wp)
                noise_g_filtered_wp.assign(noise_filtered_wp)
                noise_b_filtered_wp.assign(noise_filtered_wp)
            else:
                # Process each channel separately
                print("Processing R channel...")
                wp.launch(
                    kernel=generate_noise_kernel,
                    dim=(height, width),
                    inputs=[
                        r_original_wp,
                        variance_lut_wp,
                        noise_r_unfiltered_wp,
                        mu_r,
                        sigma_filter,
                        seed,
                    ],
                    device="cuda",
                )
                wp.launch(
                    kernel=convolve_2d_kernel,
                    dim=(height, width),
                    inputs=[
                        noise_r_unfiltered_wp,
                        kernel_wp,
                        kernel_radius,
                        noise_r_filtered_wp,
                    ],
                    device="cuda",
                )

                print("Processing G channel...")
                wp.launch(
                    kernel=generate_noise_kernel,
                    dim=(height, width),
                    inputs=[
                        g_original_wp,
                        variance_lut_wp,
                        noise_g_unfiltered_wp,
                        mu_r,
                        sigma_filter,
                        seed + 1,
                    ],
                    device="cuda",
                )
                wp.launch(
                    kernel=convolve_2d_kernel,
                    dim=(height, width),
                    inputs=[
                        noise_g_unfiltered_wp,
                        kernel_wp,
                        kernel_radius,
                        noise_g_filtered_wp,
                    ],
                    device="cuda",
                )

                print("Processing B channel...")
                wp.launch(
                    kernel=generate_noise_kernel,
                    dim=(height, width),
                    inputs=[
                        b_original_wp,
                        variance_lut_wp,
                        noise_b_unfiltered_wp,
                        mu_r,
                        sigma_filter,
                        seed + 2,
                    ],
                    device="cuda",
                )
                wp.launch(
                    kernel=convolve_2d_kernel,
                    dim=(height, width),
                    inputs=[
                        noise_b_unfiltered_wp,
                        kernel_wp,
                        kernel_radius,
                        noise_b_filtered_wp,
                    ],
                    device="cuda",
                )

                # Add the noise to the original image
                print("Adding noise to image...")
                wp.launch(
                    kernel=add_rgb_noise_and_clip_kernel,
                    dim=(height, width),
                    inputs=[
                        r_original_wp,
                        g_original_wp,
                        b_original_wp,
                        noise_r_filtered_wp,
                        noise_g_filtered_wp,
                        noise_b_filtered_wp,
                        r_output_wp,
                        g_output_wp,
                        b_output_wp,
                    ],
                    device="cuda",
                )

            # Copy results back to CPU
            img_with_grain = np.zeros((height, width, 3), dtype=np.float32)
            img_with_grain[:, :, 0] = r_output_wp.numpy()
            img_with_grain[:, :, 1] = g_output_wp.numpy()
            img_with_grain[:, :, 2] = b_output_wp.numpy()

            final_image_to_save = img_with_grain

        else:
            # CPU fallback implementation (simplified)
            print("Using CPU for film grain simulation (this will be slower)")
            print("Consider installing NVIDIA Warp for GPU acceleration")

            # Calculate grain parameters
            mu_r, sigma_filter = get_grain_parameters(width, height, iso)

            # Simple noise generation for CPU fallback
            rng = np.random.RandomState(
                args.seed if hasattr(args, "seed") else int(iso)
            )

            # Generate random noise for each channel
            noise_strength = mu_r * 0.15  # Simplified approximation

            if args.mono_grain:
                # Generate one noise channel and apply to all
                noise = rng.normal(0, noise_strength, (height, width))
                noise = gaussian_filter(noise, sigma=sigma_filter)

            # Apply same noise to all channels
            for c in range(3):
                final_image_to_save[:, :, c] = np.clip(
                    final_image_to_save[:, :, c] + noise, 0.0, 1.0
                )
            else:
                # Generate separate noise for each channel
                for c in range(3):
                    noise = rng.normal(0, noise_strength, (height, width))
                    noise = gaussian_filter(noise, sigma=sigma_filter)
                    final_image_to_save[:, :, c] = np.clip(
                        final_image_to_save[:, :, c] + noise, 0.0, 1.0
                    )

        save_debug_image(final_image_to_save, "13_final_image_with_grain_RGB")

    print("Converting to output format and saving...")
    # For some reason DNG files have the top 64 pixel rows black, so we crop them out.
    if args.input_image.lower().endswith((".dng")):
        final_image_to_save = final_image_to_save[64:, :, :]

    output_image = (
        np.clip(final_image_to_save, 0.0, 1.0)
        * (65535.0 if args.output_image.lower().endswith((".tiff", ".tif")) else 255.0)
    ).astype(
        np.uint16 if args.output_image.lower().endswith((".tiff", ".tif")) else np.uint8
    )
    iio.imwrite(args.output_image, output_image)
    print("Done.")


if __name__ == "__main__":
    main()