Source code for geoprior.utils.validator

# SPDX-License-Identifier: Apache-2.0
# Author: LKouadio <etanoyau@gmail.com>
# Adapted from: earthai-tech/gofast — https://github.com/earthai-tech/gofast
# Modified for GeoPrior-v3 API conventions.

"""
Provides a comprehensive set of functions and warnings
for validating and ensuring the integrity of data. This includes
utilities for checking data consistency, validating machine learning targets,
ensuring proper data types, and handling various validation scenarios.
"""

from __future__ import annotations

import inspect
import numbers
import operator
import re
import types
import warnings
from collections.abc import Iterable
from contextlib import suppress
from datetime import datetime
from functools import wraps
from inspect import Parameter, isclass, signature
from typing import (
    Any,
    Literal,
)

import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.typing import ArrayLike

from ..compat.numpy import ComplexWarning
from ._array_api import _asarray_with_order, get_namespace

FLOAT_DTYPES = (np.float64, np.float32, np.float16)

__all__ = [
    "DataConversionWarning",
    "PositiveSpectrumWarning",
    "array_to_frame",
    "array_to_frame2",
    "assert_all_finite",
    "assert_xy_in",
    "build_data_if",
    "check_X_y",
    "check_array",
    "check_classification_targets",
    "check_consistency_size",
    "check_consistent_length",
    "check_donut_inputs",
    "check_epsilon",
    "check_has_run_method",
    "check_is_fitted",
    "check_is_fitted2",
    "check_is_runned",
    "check_memory",
    "check_mixed_data_types",
    "check_random_state",
    "check_scalar",
    "check_symmetric",
    "check_y",
    "contains_nested_objects",
    "convert_array_to_pandas",
    "ensure_2d",
    "ensure_non_negative",
    "filter_valid_kwargs",
    "get_estimator_name",
    "handle_zero_division",
    "has_methods",
    "has_fit_parameter",
    "has_required_attributes",
    "is_binary_class",
    "is_categorical",
    "is_frame",
    "is_installed",
    "is_normalized",
    "is_square_matrix",
    "is_time_series",
    "is_valid_policies",
    "normalize_array",
    "parameter_validator",
    "process_y_pairs",
    "to_dtype_str",
    "validate_and_adjust_ranges",
    "validate_batch_size",
    "validate_comparison_data",
    "validate_data_types",
    "validate_dates",
    "validate_distribution",
    "validate_dtype_selector",
    "validate_estimator_methods",
    "validate_fit_weights",
    "validate_length_range",
    "validate_multiclass_target",
    "validate_multioutput",
    "validate_nan_policy",
    "validate_numeric",
    "validate_performance_data",
    "validate_positive_integer",
    "validate_sample_weights",
    "validate_sets",
    "validate_strategy",
    "validate_scores",
    "validate_square_matrix",
    "validate_weights",
    "validate_yy",
]



[docs]
def process_y_pairs(
    *ys: ArrayLike,
    error: Literal["raise", "warn", "ignore"] = "warn",
    solo_return: bool = False,
    ops: Literal["check_only", "validate"] = "check_only",
) -> (
    tuple[ArrayLike, ArrayLike]
    | tuple[list[ArrayLike], list[ArrayLike]]
):
    r"""
    Process and validate paired arrays of ground truth (``y_true``) and 
    predicted values (``y_pred``) for machine learning evaluation.

    Parameters
    ----------
    *ys : ArrayLike
        Variable-length sequence of array-likes containing alternating 
        (``y_true``, ``y_pred``) pairs. Must contain even number of inputs.
    error : {'raise', 'warn', 'ignore'}, default='warn'
        Handling strategy for validation errors:
        - ``'raise'``: Immediately raise ValueError
        - ``'warn'``: Issue UserWarning but continue processing
        - ``'ignore'``: Silently skip invalid pairs
    solo_return : bool, default=False
        When processing single pair, return as individual arrays instead of 
        length-1 lists.
    ops : {'check_only', 'validate'}, default='check_only'
        Processing mode:
        - ``'check_only'``: Verify pair lengths without modification
        - ``'validate'``: Clean data (remove NaNs) and validate dtypes

    Returns
    -------
    Tuple[List[ArrayLike], List[ArrayLike]] or Tuple[ArrayLike, ArrayLike]
        Processed pairs as (``y_trues``, ``y_preds``) tuple. Return type 
        depends on ``solo_return`` and number of valid pairs.

    Raises
    ------
    ValueError
        - If input count is odd and ``error='raise'``
        - Length mismatch in pairs when ``error='raise'``
        - Invalid ``error`` or ``ops`` values

    UserWarning
        - When odd input count and ``error='warn'``
        - Length mismatches when ``error='warn'``

    Examples
    --------
    Basic usage with valid pairs:

    >>> from geoprior.utils.validator  import process_y_pairs
    >>> y_true1 = [1.2, 2.3, 3.4]
    >>> y_pred1 = [1.1, 2.4, 3.3]
    >>> y_true2 = [4.5, 5.6]
    >>> y_pred2 = [4.4, 5.7]
    >>> process_y_pairs(y_true1, y_pred1, y_true2, y_pred2)
    ([[1.2, 2.3, 3.4], [4.5, 5.6]], [[1.1, 2.4, 3.3], [4.4, 5.7]])

    Handling mismatched pair with warnings:

    >>> y_bad = [1, 2, 3]
    >>> p_bad = [1, 2]
    >>> process_y_pairs(y_bad, p_bad, error='warn')  # doctest: +SKIP
    UserWarning: Length mismatch in pair 0: 3 vs 2
    ([], [])

    Full validation pipeline:

    >>> import numpy as np
    >>> y_clean, p_clean = process_y_pairs(
    ...     [1, np.nan, 3], [np.nan, 2.1, 3.2],
    ...     ops='validate', solo_return=True
    ... )
    >>> y_clean
    array([3.])
    >>> p_clean
    array([3.2])

    Notes
    -----
    Ensures input pairs meet requirements for downstream analysis through:
    
    .. math::
        \forall i \in \{0,2,4,...\},\ (y_{true}^i, y_{pred}^i) \rightarrow 
        (\tilde{y}_{true}^i, \tilde{y}_{true}^i)\ \text{where}
        
        \text{len}(\tilde{y}_{true}^i) = \text{len}(\tilde{y}_{pred}^i)
        
        \text{and}\ \tilde{y}_{true}^i \in \mathbb{R}^{n},\ 
        \tilde{y}_{pred}^i \in \mathbb{R}^{n}
        
    1. Uses ``drop_nan_in`` for NaN removal and index resetting during 
       validation
    2. Applies ``validate_yy`` for dtype consistency checks and array 
       flattening
    3. Forward references for ``ArrayLike`` allow flexibility - accepts any 
       array-like structure (list, numpy array, pandas Series, etc.)
    4. The type and array-handling conventions rely on the Python language
       reference and NumPy's array-programming model
       :cite:p:`PythonReferenceManual2001,NumPyNature2020`.

    See Also
    --------
    drop_nan_in : Core NaN removal and index resetting function
    validate_yy : Array validation and dtype consistency checker
    sklearn.utils.check_consistent_length : Scikit-learn's length validation
    """
    # from ..core.array_manager import drop_nan_in

    # # Validate error handling mode using direct string comparison for
    # # performance
    # if error not in ("raise", "warn", "ignore"):
    #     raise ValueError(
    #         f"Invalid error mode '{error}'. Valid options: 'raise', "
    #         "'warn', 'ignore'"
    #     )

    # # Check pair parity using bitwise AND for efficient even/odd check
    # if len(ys) % 2 != 0:
    #     msg = (
    #         f"Received {len(ys)} array-likes - requires even count for "
    #         "paired processing"
    #     )
    #     if error == "raise":
    #         raise ValueError(msg)
    #     elif error == "warn":
    #         warnings.warn(
    #             msg + ". Truncating to last even pair.",
    #             UserWarning,
    #             stacklevel=2,
    #         )
    #     ys = ys[
    #         : len(ys) // 2 * 2
    #     ]  # Floor division for index calculation

    # y_trues, y_preds = [], []
    # for i in range(
    #     0, len(ys), 2
    # ):  # Process pairs in steps of 2
    #     y_true, y_pred = ys[i], ys[i + 1]

    #     if ops == "validate":
    #         # Simultaneous NaN removal with index alignment
    #         y_true, y_pred = drop_nan_in(
    #             y_true, y_pred, error=error
    #         )

    #         # Type validation and array conversion
    #         y_true, y_pred = validate_yy(
    #             y_true,
    #             y_pred,
    #             expected_type="continuous",
    #             flatten="auto",
    #         )
    #     elif ops == "check_only":
    #         # Length check using exception-free comparison
    #         if len(y_true) != len(y_pred):
    #             msg = (
    #                 f"Pair {i // 2} length mismatch: "
    #                 f"{len(y_true)} vs {len(y_pred)}"
    #             )
    #             if error == "raise":
    #                 raise ValueError(msg)
    #             elif error == "warn":
    #                 warnings.warn(
    #                     msg, UserWarning, stacklevel=2
    #                 )
    #     else:  # Guard against invalid ops values
    #         raise ValueError(
    #             f"Invalid operation mode '{ops}'. "
    #             "Choose 'check_only' or 'validate'."
    #         )

    #     y_trues.append(y_true)
    #     y_preds.append(y_pred)

    # # Return type handling using boolean short-circuiting
    # # # Extract y_trues and y_preds from processed_pairs
    # # y_trues, y_preds = map(list, zip(*processed_pairs))

    # return (
    #     (y_trues[0], y_preds[0])
    #     if (solo_return and len(y_trues) == 1)
    #     else (y_trues, y_preds)
    # )

    # def process_y_pairs(
    #     *ys: ArrayLike,
    #     error: Literal["raise", "warn", "ignore"] = "warn",
    #     solo_return: bool = False,
    #     ops: Literal["check_only", "validate"] = "check_only",
    # ) -> (
    #     tuple[ArrayLike, ArrayLike]
    #     | tuple[list[ArrayLike], list[ArrayLike]]
    # ):
    from ..core.array_manager import drop_nan_in

    if error not in ("raise", "warn", "ignore"):
        raise ValueError(
            f"Invalid error mode '{error}'. Valid options: 'raise', "
            "'warn', 'ignore'"
        )

    if len(ys) % 2 != 0:
        msg = (
            f"Received {len(ys)} array-likes - requires even count for "
            "paired processing"
        )
        if error == "raise":
            raise ValueError(msg)
        elif error == "warn":
            warnings.warn(
                msg + ". Truncating to last even pair.",
                UserWarning,
                stacklevel=2,
            )
        ys = ys[: len(ys) // 2 * 2]

    y_trues, y_preds = [], []
    for i in range(0, len(ys), 2):
        y_true, y_pred = ys[i], ys[i + 1]

        if ops == "validate":
            y_true, y_pred = drop_nan_in(
                y_true, y_pred, error=error
            )

            # --- new: skip empty pairs after NaN removal ---
            if len(y_true) == 0 or len(y_pred) == 0:
                msg = f"Pair {i // 2} became empty after NaN removal."
                if error == "raise":
                    raise ValueError(msg)
                elif error == "warn":
                    warnings.warn(
                        msg,
                        UserWarning,
                        stacklevel=2,
                    )
                continue

            # --- new: normalize downstream validation failures ---
            try:
                y_true, y_pred = validate_yy(
                    y_true,
                    y_pred,
                    expected_type="continuous",
                    flatten="auto",
                )
            except (ValueError, TypeError, IndexError) as exc:
                msg = (
                    f"Pair {i // 2} failed validation: {exc}"
                )
                if error == "raise":
                    raise ValueError(msg) from exc
                elif error == "warn":
                    warnings.warn(
                        msg,
                        UserWarning,
                        stacklevel=2,
                    )
                continue

        elif ops == "check_only":
            if len(y_true) != len(y_pred):
                msg = (
                    f"Pair {i // 2} length mismatch: "
                    f"{len(y_true)} vs {len(y_pred)}"
                )
                if error == "raise":
                    raise ValueError(msg)
                elif error == "warn":
                    warnings.warn(
                        msg,
                        UserWarning,
                        stacklevel=2,
                    )
                else:
                    continue
        else:
            raise ValueError(
                f"Invalid operation mode '{ops}'. "
                "Choose 'check_only' or 'validate'."
            )

        y_trues.append(y_true)
        y_preds.append(y_pred)

    return (
        (y_trues[0], y_preds[0])
        if (solo_return and len(y_trues) == 1)
        else (y_trues, y_preds)
    )




[docs]
def check_donut_inputs(
    values=None,
    data=None,
    labels=None,
    ops="check",
    labels_as_index=True,
    index=None,
    origin_index="drop",
    value_name="auto",
):
    r"""
    Validate and/or build inputs for donut chart plotting.

    This function accepts inputs in various forms and returns a pair
    of numeric values and labels or builds a new :math:`n \\times 1`
    DataFrame from them. The function supports two modes:

    - In ``ops="check"``, it returns a tuple ``(values, labels)``
      after validating that the numeric values are appropriate for
      plotting.
    - In ``ops="build"``, it returns a pandas DataFrame constructed
      from the inputs. If ``labels_as_index`` is ``True``, the labels
      become the DataFrame index; otherwise, they form a separate
      column. If an ``index`` is provided, it is used to reset the
      DataFrame index and the original index is either dropped or
      kept based on ``origin_index``.

    The function also accepts inputs through a DataFrame or Series
    (``data``). In such cases, if ``values`` is a :math:`\\text{str}`,
    it is interpreted as a column name of the DataFrame. Similarly,
    if ``labels`` is a :math:`\\text{str}`, it is used to fetch the
    label column.

    .. math::
       S = \\{ x_i \\}_{i=1}^{n} \\quad \\text{and} \\quad L =
       \\{ l_i \\}_{i=1}^{n}

    where :math:`S` denotes the numeric values and :math:`L` denotes
    the corresponding labels.

    Parameters
    ----------
    values : array-like or ``str``, optional
        Numeric values for the donut slices. If ``data`` is a DataFrame
        and ``values`` is a double backtick string`` (``"colname"``),
        then the column ``"colname"`` is used. If ``data`` is a Series
        and ``values`` is not provided, the series values are used.
    data : pandas.Series or pandas.DataFrame, optional
        Data source from which to fetch ``values`` and ``labels``. If
        provided, the function extracts the corresponding numeric data.
        For a DataFrame, if ``values`` (or ``labels``) is a double
        backtick string`` (``"colname"``), the function fetches the
        column named ``"colname"``.
    labels : array-like or ``str``, optional
        Labels for the donut slices. If ``data`` is provided and
        ``labels`` is a double backtick string`` (``"colname"``), then
        the function uses the specified column as labels. If omitted,
        the function uses the index of the DataFrame or Series.
    ops : ``"check"`` or ``"build"``, optional
        Operation mode of the function. In ``"check"`` mode, the
        function returns a tuple ``(values, labels)`` after validation.
        In ``"build"`` mode, it returns a new DataFrame built from the
        inputs. The default is ``"check"``.
    labels_as_index : bool, optional
        If ``ops="build"``, this flag determines whether the labels are
        used as the DataFrame index. If ``True``, the labels become the
        index; if ``False``, they form a separate column. The default
        is ``True``.
    index : array-like or ``str``, optional
        New index to assign in ``"build"`` mode. If a double backtick
        string`` is provided, it must correspond to a column in the
        DataFrame and that column is used as the new index. If a list
        is provided, it directly replaces the DataFrame index. In case
        the original index is to be retained, see ``origin_index``.
    origin_index : ``"drop"`` or ``"keep"``, optional
        Specifies whether to drop or retain the original index when
        resetting the DataFrame index. If set to ``"keep"``, the
        original index is saved in a new column named ``origin_index``.
        The default is ``"drop"``.
    value_name : ``"auto"`` or ``str``, optional
        Name to use for the numeric values in the built DataFrame (when
        ``ops="build"``). If set to ``"auto"`` (or ``None``), the default
        name ``"Value"`` is used unless overridden by the source data.
        Otherwise, the provided double backtick string`` (e.g.,
        ``"Total"``) is used as the column name.

    Returns
    -------
    tuple of (ndarray, list) or pandas.DataFrame
        - If ``ops="check"``, returns a tuple ``(values, labels)``
          where ``values`` is a NumPy array of numeric values and
          ``labels`` is a list of labels.
        - If ``ops="build"``, returns a pandas DataFrame constructed
          from the inputs. If ``labels_as_index`` is ``True``, the
          DataFrame index is set to the provided labels (or the new
          index if ``index`` is specified). Otherwise, the DataFrame
          contains separate columns for the labels and numeric values.

    Examples
    --------
    Build inputs from a DataFrame with explicit column names:

    >>> from geoprior.utils.validator import check_donut_inputs
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     "Sales": [100, 200, 150],
    ...     "Country": ["USA", "Canada", "Mexico"]
    ... })
    >>> # Build a DataFrame using "Sales" as values and "Country" as index
    >>> new_df = check_donut_inputs(
    ...     values="Sales",
    ...     data=df,
    ...     labels="Country",
    ...     ops="build",
    ...     labels_as_index=True,
    ...     index="Country",
    ...     origin_index="drop"
    ... )
    >>> new_df
            Sales
    USA      100
    Canada   200
    Mexico   150

    Check inputs when only numeric values are provided:

    >>> values, labs = check_donut_inputs(
    ...     values=[10, 20, 30],
    ...     labels=["A", "B", "C"],
    ...     ops="check"
    ... )
    >>> values
    array([10., 20., 30.])
    >>> labs
    ['A', 'B', 'C']

    Notes
    -----
    The function internally calls the inline helper 
    ``check_numeric_dtype`` to ensure that the provided 
    numeric data satisfies the necessary type constraints.
    The function supports grouping or multiple donut charts by 
    using the input DataFrame directly. See also 
    :func:`~geoprior.core.checks.check_numeric_dtype` for numeric 
    type validation.

    Formulation
    -------------
    The function processes inputs :math:`S` and :math:`L` as follows:

    .. math::
       S = \\{x_i\\}_{i=1}^{n}, \\quad L = \\{l_i\\}_{i=1}^{n}

    and, in build mode, constructs a DataFrame :math:`D` such that

    .. math::
       D = \\begin{bmatrix} l_1 & x_1 \\\\ \\vdots & \\vdots \\\\
       l_n & x_n \\end{bmatrix}

    where :math:`l_i` is set as the index if ``labels_as_index`` is 
    ``True``. The new index may also be reset using the provided 
    ``index`` parameter.

    See Also
    --------
    geoprior.core.checks.check_numeric_dtype: 
        Validate numeric types in arrays.  
    geoprior.core.parameter_validator: Validate string parameters.

    """
    from ..core.checks import check_numeric_dtype

    ops = parameter_validator(
        "mode",
        target_strs={"check", "build", "make"},
        error_msg=f"Invalid mode {ops!r}. Use 'check' or 'build'.",
    )(ops)

    value_name = (
        "Value"
        if value_name in ("auto", None)
        else value_name
    )  # use in build mode

    # Helper: check if x is array-like (but not a string)
    def is_arraylike(x):
        return hasattr(x, "__iter__") and not isinstance(
            x, str
        )

    # -------------------- A) Data Provided --------------------
    if data is not None:
        # A.1) Data is a Series
        if isinstance(data, pd.Series):
            if values is None:
                values = data.values

                if labels is None:
                    labels = data.index
            else:
                raise ValueError(
                    "When 'data' is a Series and 'values' is specified, "
                    "usage is ambiguous. Remove 'values' or"
                    " convert to DataFrame."
                )
            if labels is None:
                labels = data.index

            if value_name == "Value":
                value_name = data.name

        # A.2) Data is a DataFrame
        elif isinstance(data, pd.DataFrame):
            # If values is a string, use it as column name
            if isinstance(values, str):
                if values not in data.columns:
                    raise ValueError(
                        f"Column '{values}' not found in DataFrame."
                    )
                if value_name == "Value":
                    value_name = (
                        values  # keep for build purpose
                    )

                values = data[values].values
            elif values is None:
                if ops == "check":
                    raise ValueError(
                        "When 'data' is a DataFrame, specify the column for "
                        "'values' (e.g., values='my_numeric_col')."
                    )
                else:
                    values = data.copy()
            else:
                if len(values) != len(data):
                    raise ValueError(
                        "'values' length does not match number of rows."
                    )

            # Process labels from DataFrame: if string, use as column name
            if isinstance(labels, str):
                if labels not in data.columns:
                    raise ValueError(
                        f"Column '{labels}' not found in DataFrame."
                    )
                labels = data[labels].values
            elif labels is None:
                labels = data.index
        else:
            raise TypeError(
                "The 'data' parameter must be a pandas Series or DataFrame."
            )
    # -------------------- B) No Data Provided --------------------
    else:
        if values is None:
            raise ValueError(
                "No 'data' provided and 'values' is None. Cannot plot."
            )
        if labels is None:
            labels = [
                f"Slice {i + 1}" for i in range(len(values))
            ]

    # -------------------- C) Numeric Check on Values --------------------
    values = check_numeric_dtype(
        values,
        ops="validate",
        param_names={"X": "Value"},
        coerce=True,
    )

    if not is_arraylike(labels):
        labels = [labels]
    else:
        labels = list(labels)

    # -------------------- D) Return Based on Mode --------------------
    if ops == "check":
        return values, labels

    elif ops == "build":
        # Case 1: Data provided and is a DataFrame.
        if data is not None and isinstance(
            data, pd.DataFrame
        ):
            df = data.copy()

            # If an index is provided, reset the DataFrame's index.
            if index is not None:
                if isinstance(index, str):
                    if index not in df.columns:
                        raise ValueError(
                            f"Index column '{index}' not found in DataFrame."
                        )
                    df = df.set_index(
                        index, drop=(origin_index == "drop")
                    )
                elif is_arraylike(index):
                    df.index = index
                    if origin_index == "keep":
                        df["origin_index"] = data.index
            else:
                # If no index is provided and labels_as_index is True,
                # reset index using labels.
                if labels_as_index:
                    df.index = labels
            try:
                # Build new DataFrame with numeric 'Value' and current index.
                new_df = pd.DataFrame(
                    {value_name: values}, index=df.index
                )
            except:
                # otherwise return df
                return df

            return new_df

        # Case 2: Data is None; build DataFrame from values and labels.
        else:
            if labels_as_index:
                df = pd.DataFrame(
                    {value_name: values}, index=labels
                )
            else:
                df = pd.DataFrame(
                    {"Label": labels, value_name: values}
                )
            return df




[docs]
def validate_strategy(
    strategy: str | dict[str, str] | None = None,
    error: str = "raise",
    ops: str = "validate",
    rename_key: bool = False,
    **kwargs,
) -> dict[str, str] | bool:
    """
    Validate and construct a strategy dictionary for imputing missing data.

    This function processes the input ``strategy`` to ensure it conforms to
    the expected format for imputing missing values in numerical and
    categorical features. It provides flexibility in handling different
    strategies and error management, making it suitable for integration
    with scikit-learn's imputation tools.

    Parameters
    ----------
    strategy : Optional[Union[str, Dict[str, str]]], default=None
        Defines the imputation strategy for numerical and categorical
        features. A string is parsed into a dictionary with keys
        ``"numeric"`` and ``"categorical"``, a dictionary is used directly,
        and ``None`` selects the default strategy.

    error : str, default='raise'
        Error handling behavior for invalid strategy tokens. Use ``"raise"``
        to raise a ``ValueError``, ``"warn"`` to emit a warning, or
        ``"ignore"`` to skip invalid tokens silently.

    ops: str, default='validate'
        Operation mode of the validator. Use ``"passthrough"`` to return the
        input strategy unchanged when it is already a dictionary,
        ``"check_only"`` to validate without modifying it, or ``"validate"``
        to validate and construct the strategy dictionary from the input.

    rename_key : bool, default=False
        If ``True``, rename aliases such as ``"num"``, ``"numeric"``, or
        ``"numerical"`` to ``"numeric"``, and aliases such as ``"cat"``,
        ``"categorical"``, or ``"categoric"`` to ``"categorical"``. Other
        keys remain unchanged.

    **kwargs
        Additional keyword arguments for future extensions.

    Returns
    -------
    Union[Dict[str, str], bool]
        Returns the input strategy dictionary for ``ops='passthrough'``,
        ``True`` or ``False`` for ``ops='check_only'``, and the validated or
        modified strategy dictionary for ``ops='validate'``.

    Raises
    ------
    ValueError
        If an invalid `error` or `ops` parameter is provided, or if the 
        strategy tokens are invalid and `error` is set to `'raise'`.

    Notes
    -----
    The function limits numerical strategies to ``"median"`` and
    ``"mean"`` while categorical strategies default to ``"constant"``. It
    also handles key aliasing to keep the returned dictionary consistent.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_strategy
    >>> validate_strategy('mean constant')
    {'numeric': 'mean', 'categorical': 'constant'}

    >>> validate_strategy({'num': 'mean', 'cat': 'constant'}, rename_key=True)
    {'numeric': 'mean', 'categorical': 'constant'}

    >>> validate_strategy('numeric categorical', ops='check_only')
    False

    >>> validate_strategy('invalid_strategy', error='warn')
    {'numeric': 'median', 'categorical': 'constant'}

    See Also
    --------
    sklearn.impute.SimpleImputer :
        Imputation transformer for completing missing values.

    """

    numeric_strategies = {"median", "mean"}
    categorical_strategies = {"constant"}
    default_strategy = {
        "numeric": "median",
        "categorical": "constant",
    }

    numeric_aliases = {"num", "numeric", "numerical"}
    categorical_aliases = {"cat", "categorical", "categoric"}

    valid_errors = {"raise", "warn", "ignore"}
    valid_ops = {"passthrough", "check_only", "validate"}

    if error not in valid_errors:
        raise ValueError(
            f"Invalid error handling option: '{error}'."
            f" Choose from {valid_errors}."
        )

    if ops not in valid_ops:
        raise ValueError(
            f"Invalid ops option: '{ops}'. Choose from {valid_ops}."
        )

    def rename_keys(
        input_dict: dict[str, Any],
    ) -> dict[str, Any]:
        renamed = {}
        for key, value in input_dict.items():
            key_lower = key.lower()
            if key_lower in numeric_aliases:
                renamed["numeric"] = value
            elif key_lower in categorical_aliases:
                renamed["categorical"] = value
            else:
                renamed[key] = value
        return renamed

    def parse_strategy_str(
        strategy_str: str,
    ) -> dict[str, str]:
        tokens = strategy_str.lower().split()
        strategy_dict = {}
        for token in tokens:
            if token in numeric_strategies:
                strategy_dict["numeric"] = token
            elif token in numeric_aliases:
                strategy_dict["numeric"] = "median"
            elif token in categorical_strategies:
                strategy_dict["categorical"] = token
            elif token in categorical_aliases:
                strategy_dict["categorical"] = "constant"
            elif token == "constant":
                strategy_dict["categorical"] = "constant"
            else:
                message = f"Unknown strategy token: '{token}'"
                if error == "raise":
                    raise ValueError(message)
                elif error == "warn":
                    warnings.warn(message, stacklevel=2)
                # 'ignore' does nothing
        # Fill defaults where necessary
        for key, val in default_strategy.items():
            strategy_dict.setdefault(key, val)
        return strategy_dict

    def validate_strategy_dict(
        strategy_dict: dict[str, str],
    ) -> bool:
        valid = True
        # Validate numeric strategy
        numeric = strategy_dict.get("numeric")
        if numeric not in numeric_strategies:
            valid = False
        # Validate categorical strategy
        categorical = strategy_dict.get("categorical")
        if categorical not in categorical_strategies:
            valid = False
        return valid

    if rename_key and isinstance(strategy, dict):
        strategy = rename_keys(strategy)

    if ops == "passthrough":
        if isinstance(strategy, dict):
            return strategy
        else:
            return (
                parse_strategy_str(strategy)
                if strategy
                else default_strategy.copy()
            )

    elif ops == "check_only":
        if not isinstance(strategy, dict):
            return False
        return validate_strategy_dict(strategy)

    elif ops == "validate":
        if isinstance(strategy, dict):
            is_valid = validate_strategy_dict(strategy)
            if not is_valid:
                message = "Invalid strategy dictionary."
                if error == "raise":
                    raise ValueError(message)
                elif error == "warn":
                    warnings.warn(message, stacklevel=2)
                # 'ignore' does nothing
            return (
                strategy
                if is_valid
                else default_strategy.copy()
            )
        elif isinstance(strategy, str):
            try:
                parsed = parse_strategy_str(strategy)
                if not validate_strategy_dict(parsed):
                    raise ValueError(
                        "Parsed strategy is invalid."
                    )
                return parsed
            except ValueError as ve:
                if error == "raise":
                    raise ve
                elif error == "warn":
                    warnings.warn(str(ve), stacklevel=2)
                    return default_strategy.copy()
                else:
                    return default_strategy.copy()
        else:
            return default_strategy.copy()

    # Fallback to default if ops is somehow not handled
    return default_strategy.copy()




[docs]
def has_methods(
    models,
    methods,
    strict=True,
    check_status="check_only",
    msg=None,
):
    """
    Validate that one or more model objects implement required methods.
    
    Parameters
    ----------
    models : object or list of objects
        Model instance or collection of model instances to validate.
    methods : list of str
        Public method names that each model must implement.
    strict : bool, optional
        If ``True``, raise an ``AttributeError`` when a required method is
        missing.
    check_status : {'validate', 'check_only'}, optional
        Return mode. Use ``'validate'`` to return validated models and
        ``'check_only'`` to return a boolean flag.
    msg : str or None, optional
        Optional custom error message using ``{model}`` and ``{methods}``
        placeholders.
    
    Returns
    -------
    list of objects or bool
        Validated models when ``check_status='validate'`` or a boolean flag
        when ``check_status='check_only'``.
    
    Raises
    ------
    AttributeError
        If a required method is missing and ``strict=True``.
    TypeError
        If ``methods`` is not a list of strings.
    ValueError
        If ``check_status`` is invalid.
    """
    if isinstance(models, dict):
        models = list(models.values())

    # Ensure 'models' is a list
    if not isinstance(models, list):
        models = [models]

    # Validate 'methods' parameter
    if not isinstance(methods, list) or not all(
        isinstance(m, str) for m in methods
    ):
        raise TypeError(
            "'methods' should be a list of method name strings."
        )

    # Validate 'check_status' parameter
    valid_check_status = {"validate", "check_only"}
    if check_status not in valid_check_status:
        raise ValueError(
            f"'check_status' must be one of {valid_check_status}, "
            f"got '{check_status}'."
        )

    missing_methods_report = {}
    validated_models = []

    for model in models:
        missing = []
        for method in methods:
            if not hasattr(model, method) or not callable(
                getattr(model, method)
            ):
                missing.append(method)
        if missing:
            model_name = getattr(
                model, "__name__", type(model).__name__
            )
            missing_methods_report[model_name] = missing
            if strict:
                # Use custom message if provided
                if msg:
                    error_message = msg.format(
                        model=model_name,
                        methods=", ".join(missing),
                    )
                else:
                    error_message = (
                        f"Model '{model_name}' is missing "
                        f"required methods: {', '.join(missing)}."
                    )
                raise AttributeError(error_message)
        else:
            validated_models.append(model)

    if check_status == "validate":
        if strict:
            # If strict and no exception was raised, return the list of models
            return models
        else:
            # Return only the validated models
            return validated_models

    elif check_status == "check_only":
        if strict:
            # If strict and no exception was raised, return True
            return True
        else:
            # Return True if no missing methods, else False
            return len(missing_methods_report) == 0




[docs]
def check_is_runned(
    estimator, attributes=None, *, msg=None, all_or_any=all
):
    """
    Validate if an estimator instance has been "runned" (executed) prior
    to invoking dependent methods. This check ensures that the estimator is in
    the appropriate operational state, allowing users to identify and address
    runtime issues effectively.

    If an estimator does not set "runned" attributes (such as ``_is_runned``),
    it may define a ``__gofast_is_runned__`` method. This method should return
    a boolean indicating whether the estimator is "runned" or not.

    Parameters
    ----------
    estimator : object
        The instance of the estimator or class being validated. This
        parameter represents the object in which dependent methods
        are validated to confirm that the "runned" state has been achieved.

        To determine the "runned" status, the function checks for specific
        attributes or, if defined, the ``__gofast_is_runned__`` method.

    attributes : str, list, or tuple of str, optional, default=None
        Specifies the name(s) of attributes that indicate the "runned" status,
        such as ``['_is_runned']`` or ``['_is_fitted']``. If these attributes
        are present and set to `True`, the estimator is considered to have
        been runned.

        If ``attributes`` is set to `None`, the function will default to
        checking for ``_is_runned``. This default provides flexibility for
        estimators that employ standard runned flags.

    msg : str, optional, default=None
        Custom error message to be displayed if the validation fails.
        By default, this error message uses the class name of the
        `estimator` in the format:

        "This %(name)s instance has not been 'runned' yet. Call 'run' with
        appropriate arguments before using this method."

        To customize the message, include `%(name)s` as a placeholder for
        the estimator's class name.

    all_or_any : callable, {all, any}, optional, default=all
        Determines whether all or any of the specified `attributes` must be
        present and set to `True`. By default, the function expects all
        attributes to be set to `True`. Set to `any` for greater flexibility
        with multiple attributes.

    Methods
    -------
    ``__gofast_is_runned__`` : optional, callable
        If defined within the `estimator`, this method should return a
        boolean indicating the "runned" status of the estimator. This
        provides an alternative to using attributes.

    Raises
    ------
    RuntimeError
        If none of the specified attributes are set to `True` or if the
        `__gofast_is_runned__` method (if present) returns `False`.

    Notes
    -----
    The `check_is_runned` function ensures that methods dependent on
    the "runned" status are only executed after the estimator has completed
    all required preliminary processes, like `fit` or `run`. This helper
    mirrors the fitted-state checks described in
    :cite:p:`SklearnCheckIsFittedDocs,PythonClassInstanceAttrsDocs`.

    Examples
    --------
    >>> from geoprior.utils.validator import check_is_runned
    >>> class ExampleClass:
    ...     def __init__(self):
    ...         self._is_runned = False
    ...
    ...     def run(self):
    ...         self._is_runned = True
    ...         print("Run completed.")
    ...
    ...     def process_data(self):
    ...         check_is_runned(self)
    ...         print("Processing data...")
    >>> model = ExampleClass()
    >>> model.process_data()  # Raises RuntimeError
    >>> model.run()
    >>> model.process_data()  # Now it works

    See Also
    --------
    check_is_fitted : Validates that an estimator has been "fitted" before
                      further use.
    validate_estimator_methods : Validates essential estimator methods.
    """
    from ..exceptions import NotRunnedError

    # Default attribute if none is provided
    if attributes is None:
        attributes = ["_is_runned"]
    elif not isinstance(attributes, list | tuple):
        attributes = [attributes]

    # Define default error message if not provided
    if msg is None:
        msg = (
            "This %(name)s instance has not been 'runned' yet. Call 'run' with "
            "appropriate arguments before using this method."
        )

    # First check if a custom `__gofast_is_runned__` method is available
    if hasattr(estimator, "__gofast_is_runned__"):
        is_runned = estimator.__gofast_is_runned__()
    else:
        # Verify attributes are present and set to True if no custom method is provided
        is_runned = all_or_any(
            [
                getattr(estimator, attr, False)
                for attr in attributes
            ]
        )

    if not is_runned:
        raise NotRunnedError(
            msg % {"name": type(estimator).__name__}
        )




[docs]
def check_has_run_method(
    estimator, msg=None, method_name="run"
):
    """
    Check if the given estimator has a callable `run` method or any other 
    specified method. This utility helps validate that an object can 
    execute the expected method before further actions are taken.

    Parameters
    ----------
    estimator : object
        The object (instance or class) to check for the presence of the 
        `run` method or another specified method.
    
    msg : str, optional
        Custom error message to display if the method is missing. If None, 
        a default message is generated based on the `method_name`.
    
    method_name : str, default="run"
        The method name to check for. This defaults to `run`, but you can 
        specify any method name. The method must be callable.
    
    Raises
    ------
    AttributeError
        Raised if the `run` method (or any specified method) does not 
        exist on the object or is not callable.
    
    Examples
    --------
    >>> from geoprior.utils.validator import check_has_run_method
    >>> class MyClass:
    ...     def run(self):
    ...         pass
    >>> check_has_run_method(MyClass())  # No error

    >>> class MyClassWithoutRun:
    ...     pass
    >>> check_has_run_method(MyClassWithoutRun())  # Raises AttributeError

    Notes
    -----
    This function performs several checks:
    
    1. **Existence check**: It checks whether the `run` method (or any 
       other specified method) exists in the `estimator` object.
    2. **Callable check**: It ensures that the method is callable, which 
       rules out attributes that might exist but aren't methods.
    3. **Static/class method check**: The function accepts static or 
       class methods as valid callable methods.
    4. **Bound method check**: It verifies that instance methods are 
       bound to an object when required, which ensures they can be called 
       properly in the given context.

    This function can be expressed as a validation function:

    .. math:: 
        \text{check\_has\_method}(estimator, method\_name) = 
        \begin{cases} 
        \text{valid}, & \text{if method exists and callable} \\
        \text{invalid}, & \text{if method is missing or not callable}
        \end{cases}

    It determines whether the method is callable or raises an error 
    otherwise. Callable-method validation here follows the Python
    documentation and the staticmethod overview in
    :cite:p:`Python3Docs,RealPythonStaticMethods`.

    See Also
    --------
    validate_estimator_methods : A helper function to validate multiple 
                                 methods on an estimator.

    """

    # Step 1: Check if the method exists
    if not hasattr(estimator, method_name):
        if msg is None:
            msg = f"'{estimator.__class__.__name__}' object has no attribute '{method_name}'"
        raise AttributeError(msg)

    method = getattr(estimator, method_name)

    # Step 2: Ensure the method is callable
    if not callable(method):
        if msg is None:
            msg = f"'{method_name}' attribute of '{estimator.__class__.__name__}' is not callable."
        raise AttributeError(msg)

    # Step 3: Check for static or class methods
    if isinstance(
        getattr(estimator.__class__, method_name, None),
        staticmethod | classmethod,
    ):
        return  # Valid if it's a static or class method

    # Step 4: If it's an instance method, ensure it's bound
    if not isinstance(
        method, staticmethod | classmethod
    ) and not hasattr(method, "__self__"):
        raise AttributeError(
            f"'{method_name}' method of '{estimator.__class__.__name__}' is unbound."
        )

    # If no errors were raised, the method exists and is callable
    return




[docs]
def validate_batch_size(
    batch_size,
    n_samples,
    min_batch_size=1,
    max_batch_size=None,
):
    """
    Validate the batch size against the number of samples.

    This function checks whether the provided `batch_size` is appropriate
    given the total number of samples `n_samples`. It ensures that the batch
    size meets specified minimum and maximum limits, raising appropriate
    errors if any constraints are violated.

    Parameters
    ----------
    batch_size : int
        The size of each batch. This must be a positive integer, as batches
        must contain at least one sample. A ValueError will be raised if this
        value is less than the minimum allowed batch size or exceeds the
        total number of samples.

    n_samples : int
        The total number of samples in the dataset. This value must be
        positive and greater than or equal to the `batch_size`. If `batch_size`
        is greater than `n_samples`, a ValueError is raised.

    min_batch_size : int, optional
        The minimum allowed batch size (default is 1). This parameter defines
        the smallest permissible batch size. A ValueError will be raised if
        the `batch_size` is less than this value.

    max_batch_size : int, optional
        The maximum allowed batch size (default is None, meaning no upper limit).
        This parameter can be used to restrict the size of the batch to a
        specified maximum value. If `max_batch_size` is provided, a ValueError
        will be raised if the `batch_size` exceeds this limit.

    Return
    ------
        batch_size: Validated number of batch size

    Raises
    ------
    ValueError
        If the `batch_size` is less than the `min_batch_size`, greater than the
        `n_samples`, or exceeds the `max_batch_size` if specified. Additionally,
        if `batch_size` is not a positive integer, a ValueError is raised.

    Notes
    ------
    Let `B` represent the `batch_size` and `N` represent the `n_samples`.
    The validation can be expressed mathematically as:

    .. math::
        \text{If } B < \text{min\_batch\_size} \text{ or } B > N \text{ or } B > \text{max\_batch\_size}:
        \quad \text{raise ValueError}

    This function is essential for managing data batching in machine learning
    workflows, where improper batch sizes can lead to inefficient training or
    runtime errors. The practical mini-batch constraint follows standard
    deep-learning training guidance
    :cite:p:`GoodfellowEtAl2016DeepLearning`.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_batch_size
    >>> validate_batch_size(32, 100)  # Valid case
    >>> validate_batch_size(0, 100)  # Raises ValueError
    >>> validate_batch_size(150, 100)  # Raises ValueError
    >>> validate_batch_size(32, 100, max_batch_size=32)  # Valid case
    >>> validate_batch_size(40, 100, max_batch_size=32)  # Raises ValueError
    """
    n_samples = validate_positive_integer(
        n_samples, "N-samples"
    )

    # Check if batch_size is a positive integer
    batch_size = validate_positive_integer(
        batch_size,
        "Batch size",
        msg=(
            f"Batch size must be a positive integer. Given: {batch_size}."
        ),
    )

    # Check if batch_size meets the minimum requirement
    if batch_size < min_batch_size:
        raise ValueError(
            f"Batch size ({batch_size}) cannot be less than"
            f" the minimum allowed ({min_batch_size})."
        )

    # Check if batch_size exceeds the maximum limit, if provided
    if (
        max_batch_size is not None
        and batch_size > max_batch_size
    ):
        raise ValueError(
            f"Batch size ({batch_size}) cannot exceed"
            f" the maximum allowed ({max_batch_size})."
        )

    # Check if batch_size exceeds the total number of samples
    if batch_size > n_samples:
        raise ValueError(
            f"Batch size ({batch_size}) cannot exceed"
            f" number of samples ({n_samples})."
        )
    return batch_size




[docs]
def validate_estimator_methods(estimator, methods, msg=None):
    """
    Validate that the specified methods exist and are callable on the
    given estimator.

    This utility function is designed to check whether an estimator (or
    any object) contains the required methods, such as ``fit`` or
    ``predict``, and ensures that those methods are callable. It helps
    prevent runtime errors by verifying the presence of expected methods.

    Parameters
    ----------
    estimator : object
        The object (instance or class) to check for the presence of the
        specified methods. The estimator can be an instance of a class or
        the class itself, and it should implement the required methods.

    methods : list of str
        List of method names (as strings) to validate. Each method name
        must exist on the estimator and be callable. Examples of methods
        might include ``fit``, ``run``, or ``predict``.

    msg : str, optional
        Custom error message to display if any method is missing or not
        callable. If None, a default message is generated for each missing
        or invalid method based on the method name.

    Raises
    ------
    AttributeError
        If any method in ``methods`` is not present or not callable on the
        estimator.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_estimator_methods
    >>> class MyClass:
    ...     def fit(self):
    ...         pass
    ...     def run(self):
    ...         pass
    >>> validate_estimator_methods(MyClass(), ['fit', 'run'])  # No error

    >>> class IncompleteClass:
    ...     def fit(self):
    ...         pass
    >>> validate_estimator_methods(IncompleteClass(), ['fit', 'run'])
    # Raises AttributeError for missing `run` method

    Notes
    -----
    This helper is useful when you want to ensure that an object, such as an
    estimator or a model, exposes several callable methods before proceeding.
    If any method is missing or not callable, the function raises an
    ``AttributeError``. Method-callability checks follow the Python
    documentation and the callable-object discussion in
    :cite:p:`Python3Docs,RealPythonCallable`.

    See Also
    --------
    check_has_run_method : Validate the presence of a single method, defaulting to ``run``.

    """
    if isinstance(methods, str):
        methods = [methods]

    for method_name in methods:
        # Step 1: Check if the method exists on the estimator
        if not hasattr(estimator, method_name):
            # If a custom message is provided, use it; otherwise, generate a default message
            if msg is None:
                msg = f"'{estimator.__class__.__name__}' object has no attribute '{method_name}'"
            raise AttributeError(msg)

        method = getattr(estimator, method_name)

        # Step 2: Ensure the method is callable
        if not callable(method):
            if msg is None:
                msg = f"'{method_name}' attribute of '{estimator.__class__.__name__}' is not callable."
            raise AttributeError(msg)

        # Step 3: Check if it's a valid static or class method
        if isinstance(
            getattr(estimator.__class__, method_name, None),
            staticmethod | classmethod,
        ):
            continue  # Static or class methods are valid and callable

        # Step 4: Ensure instance methods are properly bound
        if not isinstance(
            method, staticmethod | classmethod
        ) and not hasattr(method, "__self__"):
            raise AttributeError(
                f"'{method_name}' method of '{estimator.__class__.__name__}' is unbound."
            )

    # If all methods pass, the validation is successful
    return




[docs]
def filter_valid_kwargs(callable_obj, kwargs):
    """
    Filter and return only the valid keyword arguments for a given
    callable object.

    This function checks if the arguments in `kwargs` are valid for the
    provided callable object (function, lambda function, method, or class).
    If any argument is not valid, it is removed from `kwargs`. The function
    returns only the valid `kwargs`.

    Parameters
    ----------
    callable_obj : callable
        The callable object (function, lambda function, method, or class) for
        which the keyword arguments need to be validated.

    kwargs : dict
        Dictionary of keyword arguments to be validated against the callable object.

    Returns
    -------
    valid_kwargs : dict
        Dictionary containing only the valid keyword arguments for the callable object.

    Examples
    --------
    >>> def example_func(a, b, c=3):
    ...     pass
    >>> kwargs = {'a': 1, 'b': 2, 'd': 4}
    >>> filter_valid_kwargs(example_func, kwargs)
    {'a': 1, 'b': 2}

    >>> class ExampleClass:
    ...     def __init__(self, x, y, z=10):
    ...         pass
    >>> kwargs = {'x': 1, 'y': 2, 'a': 3}
    >>> filter_valid_kwargs(ExampleClass, kwargs)
    {'x': 1, 'y': 2}
    >>> filter_valid_kwargs(ExampleClass(), kwargs)
    {'x': 1, 'y': 2}

    Notes
    -----
    This function uses the `inspect` module to retrieve the signature of
    the given callable object and validate the keyword arguments.
    """
    # If the callable_obj is an instance, get its class
    if not inspect.isclass(callable_obj) and not callable(
        callable_obj
    ):
        callable_obj = callable_obj.__class__

    # Get the function signature
    signature = inspect.signature(callable_obj)

    # Extract parameter names from the function signature
    valid_params = set(signature.parameters.keys())

    # Filter kwargs to retain only valid parameters
    valid_kwargs = {
        k: v for k, v in kwargs.items() if k in valid_params
    }

    return valid_kwargs




[docs]
def validate_sets(
    data,
    mode: str = "base",
    allow_empty: bool = True,
    element_type: type = None,
    key_type: type = str,
):
    """
    Validates whether the input data is a set in 'base' mode or a dictionary
    of sets in 'deep' mode. Provides additional parameters for flexibility
    and versatility. Returns the data if it passes validation.

    Parameters
    ----------
    data : Union[set, Dict[str, set]]
        The input data to validate. It can be either a single set or a dictionary
        where keys are set names and values are sets.

        - `base mode` : A single set.
        - `deep mode` : A dictionary of sets.

    mode : str, optional
        The mode in which to validate the data. Options are 'base' for a single
        set and 'deep' for a dictionary of sets. Default is 'base'.

    allow_empty : bool, optional
        Whether to allow empty sets or dictionaries. Default is True.

    element_type : type, optional
        The expected type of elements in the set(s). If provided, the function
        checks whether all elements are of this type. Default is None (no type
        check).

    key_type : type, optional
        The expected type of keys in the dictionary when in 'deep' mode. Default
        is `str`.

    Returns
    -------
    Union[set, Dict[str, set]]
        The original data if it matches the specified mode and additional
        criteria. Raises ValueError if validation fails.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_sets
    >>> validate_sets({1, 2, 3}, mode='base')
    {1, 2, 3}

    >>> validate_sets({"Set1": {1, 2, 3}, "Set2": {3, 4, 5}}, mode='deep')
    {"Set1": {1, 2, 3}, "Set2": {3, 4, 5}}

    >>> validate_sets({"Set1": {1, 2, 3}, "Set2": [3, 4, 5]}, mode='deep')
    Traceback (most recent call last):
        ...
    ValueError: Data validation failed: expected all values to be sets

    >>> validate_sets(set(), mode='base', allow_empty=False)
    Traceback (most recent call last):
        ...
    ValueError: Data validation failed: empty set is not allowed

    >>> validate_sets({"Set1": set()}, mode='deep', allow_empty=False)
    Traceback (most recent call last):
        ...
    ValueError: Data validation failed: empty dictionary is not allowed

    >>> validate_sets({"Set1": {1, 2, 3}}, mode='deep', element_type=int)
    {"Set1": {1, 2, 3}}

    Notes
    -----
    This function checks the type of the input data based on the specified mode.
    In 'base' mode, it ensures the data is a set. In 'deep' mode, it ensures the
    data is a dictionary where all values are sets. Additional parameters allow
    for checking if sets are empty, if elements are of a specific type, and if
    dictionary keys are of a specific type. The core type test used here is
    documented in :cite:t:`PythonIsinstanceDocs`.

    See Also
    --------
    isinstance : Python built-in function to check an object's type.

    """
    if mode == "base":
        if not isinstance(data, set):
            raise ValueError(
                f"Data validation failed: expected a set, got {type(data).__name__!r}"
            )
        if not allow_empty and not data:
            raise ValueError(
                "Data validation failed: empty set is not allowed"
            )
        if element_type is not None and any(
            not isinstance(el, element_type) for el in data
        ):
            raise ValueError(
                "Data validation failed: all elements must"
                f" be of type {element_type.__name__!r}"
            )
    elif mode == "deep":
        if not isinstance(data, dict):
            raise ValueError(
                "Data validation failed: expected a dictionary,"
                f" got {type(data).__name__!r}"
            )
        if not allow_empty and not data:
            raise ValueError(
                "Data validation failed: empty dictionary is not allowed"
            )
        if any(
            not isinstance(k, key_type) for k in data.keys()
        ):
            raise ValueError(
                f"Data validation failed: all keys must be of type {key_type.__name__!r}"
            )
        if any(not isinstance(v, set) for v in data.values()):
            raise ValueError(
                "Data validation failed: expected all values to be sets"
            )
        if element_type is not None and any(
            not isinstance(el, element_type)
            for v in data.values()
            for el in v
        ):
            raise ValueError(
                "Data validation failed: all elements must"
                f" be of type {element_type.__name__!r}"
            )

    else:
        raise ValueError(
            "Mode must be either 'base' or 'deep'."
        )

    return data




[docs]
def validate_scores(
    scores,
    true_labels=None,
    mode="strict",
    accept_multi_output=False,
):
    """
    Validates that the scores represent valid probability distributions and
    checks consistency between scores and true labels in multi-output scenarios.

    Parameters
    ----------
    scores : list or np.ndarray
        A list of np.ndarrays for multi-output probabilities, or a single np.ndarray
        for single-output probabilities. Each ndarray should contain probability
        distributions where each row sums to approximately 1 and has
        non-negative values.
    true_labels : list or np.ndarray, optional
        The true labels corresponding to the scores. This parameter must
        be provided in multi-output scenarios to check the alignment of labels
        and scores. Each element or row in true_labels should correspond to
        the equivalent in scores.
    mode : str, optional (default "strict")
        Validation mode for checking probability distributions. Use
        ``"strict"`` to require each row to sum to ``1`` within numerical
        tolerance, ``"soft"`` to require non-negative scores with totals no
        greater than ``1``, or ``"passthrough"`` to only check that each
        score lies in the interval ``[0, 1]``.
    accept_multi_output : bool, default False
        Flag indicating whether scores with multiple outputs are accepted.
        If False and scores are provided as a list, a ValueError will be
        raised.

    Returns
    -------
    np.ndarray
        The validated scores as a NumPy array.

    Raises
    ------
    ValueError
        If multi-output scores are provided and not accepted.
        If there is a mismatch in the number of outputs between scores and
        true_labels.
        If scores or any subset of scores do not form valid probability
        distributions.
        If there is a mismatch in format expectations between scores and
        true_labels in terms of multi-output handling.

    Notes
    -----
    The function is designed to handle both single and multi-output probability
    distributions. For multi-output scenarios, both scores and true_labels
    should be lists of np.ndarrays.
    This function is particularly useful in scenarios involving machine learning
    models where output probabilities need to be validated before further
    processing or metrics calculations.

    Examples
    --------
    >>> import numpy as np
    >>> from geoprior.utils.validator import validate_scores
    >>> scores_single = np.array([[0.1, 0.9], [0.8, 0.2]])
    >>> print(validate_scores(scores_single))
    [[0.1, 0.9]
     [0.8, 0.2]]

    >>> scores_multi = [np.array([[0.1, 0.9]]), np.array([[0.8, 0.2]])]
    >>> true_labels_multi = [np.array([1]), np.array([0])]
    >>> print(validate_scores(scores_multi, true_labels_multi, accept_multi_output=True))
    [array([[0.1, 0.9]]), array([[0.8, 0.2]])]
    """
    # Check if scores are in a list for multi-output handling
    if isinstance(scores, list):
        if not accept_multi_output:
            raise ValueError(
                "Multi-output scores provided but not accepted."
            )
        if true_labels is not None and len(scores) != len(
            true_labels
        ):
            raise ValueError(
                "Mismatch in the number of outputs between"
                " scores and true_labels."
            )
        if any(
            not _is_probability_distribution(score, mode=mode)
            for score in scores
        ):
            raise ValueError(
                "Each set of scores must be a valid"
                " probability distribution."
            )
    else:
        if not _is_probability_distribution(
            scores, mode=mode
        ):
            raise ValueError(
                "Scores must be a valid probability distribution."
            )
        if true_labels is not None:
            if accept_multi_output and not isinstance(
                true_labels, list
            ):
                raise ValueError(
                    "Expected multi-output for true_labels"
                    " but got a single output."
                )
            if not accept_multi_output and isinstance(
                true_labels, list
            ):
                raise ValueError(
                    "Non-multi-output scores with multi-output"
                    " true_labels."
                )
    # Return scores as numpy array
    return np.asarray(scores)



def _is_probability_distribution(
    y, mode="strict", error="ignore"
):
    """
    Checks if `y` is a probability distribution across the last axis according
    to the specified mode.

    Parameters
    ----------
    y : np.ndarray
        Array containing score values which need to be validated as probability
        distributions.
    mode : str, optional
        Validation mode to use. ``"strict"`` requires the sum of scores to
        equal ``1`` within a tolerance, ``"soft"`` requires non-negative
        scores with totals no greater than ``1``, and ``"passthrough"``
        only checks that scores stay within ``[0, 1]``.
    error : str, optional
        Error handling behavior. Use ``"raise"`` to raise an error when the
        check fails, ``"warn"`` to emit a warning and return ``False``, or
        ``"ignore"`` to silently return ``False``. Default is ``"ignore"``.

    Returns
    -------
    bool
        True if `y` satisfies the conditions of the specified mode, False
        otherwise.

    Raises
    ------
    ValueError
        If an invalid mode is specified, or if `error` is set to 'raise' and
        the distribution check fails in strict mode.

    Examples
    --------
    >>> from geoprior.utils.validator import _is_probability_distribution
    >>> y = np.array([0.3, 0.7])
    >>> print(_is_probability_distribution(y, mode='strict'))
    True

    >>> y = np.array([0.5, 0.5, 0.2])
    >>> print(_is_probability_distribution(y, mode='soft'))
    False

    >>> y = np.array([0.2, 0.3, 0.4])
    >>> print(_is_probability_distribution(y, mode='passthrough'))
    True
    """
    y = np.asarray(y)

    mode_status = "."
    if mode == "strict":
        is_valid = np.all(
            np.isclose(np.sum(y, axis=-1), 1)
        ) and np.all(y >= 0)
        mode_status = (
            ": Requires that the sum of scores"
            " exactly equals 1 (within a tolerance)"
        )

    elif mode == "soft":
        is_valid = np.all(np.sum(y, axis=-1) <= 1) and np.all(
            y >= 0
        )
        mode_status = (
            ": Requires that the sum of scores does not"
            " exceed 1 and all scores are non-negative"
        )
    elif mode == "passthrough":
        is_valid = np.all(y <= 1) and np.all(y >= 0)
        mode_status = (
            ": Only checks that all scores are non-negative"
            " and do not exceed 1, without summing them."
        )
    else:
        raise ValueError(
            f"Invalid validation mode: '{mode}'. Valid modes"
            " are 'strict', 'soft', or 'passthrough'."
        )

    if not is_valid:
        if error == "raise":
            raise ValueError(
                f"Input array does not meet the {mode} mode "
                "requirements for a probability distribution"
                f"{mode_status}"
            )
        elif error == "warn":
            warnings.warn(
                f"Input array does not meet the {mode} mode "
                "requirements for a probability distribution"
                "{mode_status}",
                stacklevel=2,
            )
            return False
        elif error == "ignore":
            return False

    return is_valid



[docs]
def validate_square_matrix(
    data, align=False, align_mode="auto", message=""
):
    """
    Validate that the input data forms a square matrix and optionally aligns its
    indices and columns if specified.

    Parameters
    ----------
    data : DataFrame or array-like
        The input data to validate as a square matrix.
    align : bool, default False
        Whether to align the DataFrame's index with its columns.
    align_mode : str, default 'auto'
        Alignment mode if indices and columns do not match. Options are 'auto',
        'index_to_columns', and 'columns_to_index'.
    message : str, default ''
        Additional message to append to the error if validation fails.

    Returns
    -------
    data
        The validated or aligned square matrix.

    Raises
    ------
    ValueError
        If the input is not a square matrix.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_square_matrix
    >>> validate_square(np.array([[1, 2], [3, 4]]))
    array([[1, 2],
           [3, 4]])

    >>> validate_square(pd.DataFrame([[1, 2], [3, 4, 5]]))
    ValueError: Input must be a square matrix.

    Notes
    -----
    A square matrix is defined as having equal number of rows and columns.
    This function checks the dimensionality of the data and optionally aligns
    the index and columns if ``align`` is set to ``True``.
    """
    if not is_square_matrix(data):
        raise ValueError(
            f"Input must be a square matrix. {message}"
        )
    if align:
        data = validate_comparison_data(
            data, alignment=align_mode
        )
    return data




[docs]
def is_square_matrix(data, data_type=None):
    """
    Determine whether the input, either a DataFrame or an array-like
    structure, forms a square matrix.

    Automatically detects the data type unless specified. Supports data inputs
    that can be converted to a NumPy array.

    Parameters
    ----------
    data : DataFrame, array-like, or any object convertible to a numpy array
        The input data to check.
    data_type : str, optional
        The expected type of the input data. Valid options are 'array' or
        'dataframe'.
        If not specified, the data type is inferred. Default interpretation
        is as an 'array'.

    Returns
    -------
    bool
        Returns True if the data is a square matrix, otherwise False.

    Raises
    ------
    ValueError
        If `data_type` is neither 'array' nor 'dataframe'.
    TypeError
        If the input `data` does not match the expected format or
        cannot be processed.

    Examples
    --------
    >>> is_square_matrix(np.array([[1, 2], [3, 4]]))
    True

    >>> is_square_matrix(pd.DataFrame([[1, 2, 3], [4, 5, 6]]))
    False

    >>> is_square_matrix([[1, 2], [3, 4]], data_type='array')
    True

    Notes
    -----
    A square matrix has an equal number of rows and columns. This function
    checks the dimensionality and shape of the data to confirm if it meets
    this criterion.
    """
    # Determine the type based on the data provided
    if data_type is None:
        if isinstance(data, np.ndarray):
            data_type = "array"
        elif isinstance(data, pd.DataFrame):
            data_type = "dataframe"
        else:
            data = np.array(
                data
            )  # Attempt to convert to a numpy array
            data_type = "array"

    if data_type not in ["array", "dataframe"]:
        raise ValueError(
            "data_type must be either 'array' or 'dataframe'"
        )

    # Check if the data is a square matrix
    if data_type == "array":
        if data.ndim != 2 or data.shape[0] != data.shape[1]:
            return False
    elif data_type == "dataframe":
        if data.shape[0] != data.shape[1]:
            return False
    else:
        raise TypeError(
            f"Unsupported or mismatched data type: {data_type}"
        )

    return True




[docs]
def validate_multiclass_target(
    y, accept_multioutput=False, return_classes=False
):
    """
    Validates that the target data is suitable for multiclass classification.
    Optionally accepts multi-output targets and can return the unique classes.

    Parameters
    ----------
    y : array-like
        The target data to be validated, expected to contain class labels for
        multiclass classification. Can be a multi-output array if accept_multioutput
        is set to True.
    accept_multioutput : bool, optional
        Allows the target array to be multi-dimensional (default is False).
    return_classes : bool, optional
        If True, returns the unique classes instead of a validation boolean.

    Returns
    -------
    bool or array
        If return_classes is False, returns True if the target data is valid for
        multiclass classification, otherwise raises a ValueError.
        If return_classes is True, returns the unique classes in the target data.

    Raises
    ------
    ValueError
        If any of the following conditions are not met:
        - If accept_multioutput is False, the target data must be one-dimensional.
        - All elements in the target array must be non-negative integers.
        - The target array must contain at least two distinct classes.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_multiclass_target
    >>> validate_multiclass_target([0, 1, 2, 1, 0])
    array([0, 1, 2, 1, 0])
    >>> validate_multiclass_target([0, 0, 0])
    ValueError: Target array must contain at least two distinct classes.
    >>> validate_multiclass_target([0.5, 1.2, 2.3])
    ValueError: All elements in the target array must be non-negative integers.
    >>> validate_multiclass_target([[1, 2], [2, 3]], accept_multioutput=True,
    ...                              return_classes=True)
    (array([1, 2, 2, 3]), 3)
    True
    """
    # Convert input to a numpy array and create a copy if modifying data structure
    y = np.asarray(y)
    y_eval = y.copy() if accept_multioutput else y

    # Ensure the array is one-dimensional if multi-output is not accepted
    if not accept_multioutput and y.ndim > 1:
        raise ValueError(
            "Target array must be one-dimensional unless"
            " multi-output is accepted."
        )

    # Validate that all elements are non-negative integers
    if not (
        np.issubdtype(y_eval.dtype, np.integer)
        and np.all(y_eval >= 0)
    ):
        raise ValueError(
            "All elements in the target array must be non-negative integers."
        )

    # Flatten the array for unique class check if multi-output is accepted
    if accept_multioutput:
        y_eval = y_eval.flatten()

    # Ensure there are at least two distinct classes
    unique_classes = np.unique(y_eval)
    if unique_classes.size < 2:
        raise ValueError(
            "Target array must contain at least two distinct classes."
        )

    # Return the original array and the number of unique classes if requested
    if return_classes:
        return y, unique_classes.size

    return y




[docs]
def validate_sample_weights(weights, y, normalize=False):
    """
    Validates that the sample weights are suitable for use in calculations.

    This function checks that the sample weights are non-negative and match
    the length of the target array `y`. It raises an error if any conditions
    are not met. If a single number is provided as weights, it will be
    converted into an array with repeated values matching the length of `y`.

    Parameters
    ----------
    weights : array-like or number
        The sample weights to be validated. Each weight must be non-negative.
        A single number will be converted to an array with repeated values.
    y : array-like
        The target array that the weights should correspond to. The length
        of `weights` must match the length of `y`.
    normalize : bool, optional
        If True, weights will be normalized to sum to 1. Default is False.

    Returns
    -------
    numpy.ndarray
        The validated sample weights as a numpy array.

    Raises
    ------
    ValueError
        If `weights` are not one-dimensional, if any weight is negative,
        or if the length of `weights` does not match the length of `y`.

    Examples
    --------
    >>> frpm geoprior.utils.validator import validate_sample_weights
    >>> y = [0, 1, 2, 3]
    >>> weights = [0.1, 0.2, 0.3, 0.4]
    >>> validate_sample_weights(weights, y)
    array([0.1, 0.2, 0.3, 0.4])

    >>> weights = [-0.1, 0.2, 0.3, 0.4]
    >>> validate_sample_weights(weights, y)
    ValueError: Sample weights must be non-negative.

    >>> weights = [0.1, 0.2, 0.3]
    >>> validate_sample_weights(weights, y)
    ValueError: Length of sample weights must match length of y.
    """
    if isinstance(
        weights, int | float | np.integer | np.floating
    ):
        weights = np.full_like(
            y, fill_value=weights, dtype=np.float
        )

    weights = np.asarray(weights)
    y = np.asarray(y)

    # Check if weights are one-dimensional
    if weights.ndim != 1:
        raise ValueError(
            "Sample weights must be one-dimensional."
        )

    # Check if any weights are negative
    if np.any(weights < 0):
        raise ValueError(
            "Sample weights must be non-negative."
        )

    # Check if the length of weights matches the length of y
    if weights.size != y.size:
        raise ValueError(
            "Length of sample weights must match length of y."
        )

    weights = normalize_array(
        weights, normalize=normalize, method="sum"
    )

    return weights  # Return the validated weights as a numpy array




[docs]
def validate_weights(
    weights,
    min_value=None,
    max_value=None,
    normalize=False,
    allowed_dims=1,
):
    """
    Validates and optionally normalizes the given weights array to ensure all elements
    meet specified criteria and the structure is suitable for computations.

    Parameters
    ----------
    weights : array-like
        Weights to be validated. Can be a list, tuple, or numpy array.
    min_value : float, optional
        Minimum allowable value for weights (inclusive). If None, weights are
        expected to be non-negative. Explicitly set to a negative value if
        negative weights are allowed.
    max_value : float or None, optional
        Maximum allowable value for weights (inclusive). If None, no upper
        limit is enforced.
    normalize : bool, optional
        If True, weights will be normalized to sum to 1. Default is False.
    allowed_dims : int or tuple, optional
        Specifies the allowed dimensions of the weights array. Default is 1
        (one-dimensional). If a tuple is provided, weights must match one of
        the dimensions specified in the tuple.

    Returns
    -------
    np.ndarray
        A numpy array of the validated and optionally normalized weights.

    Raises
    ------
    ValueError
        If weights contain values outside the specified range, or if the
        format or dimensions are not suitable.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_weights

    >>> validate_weights([0.25, 0.75, 0.5], normalize=True)
    array([0.2, 0.6, 0.4])

    >>> validate_weights([-0.1, 0.9], min_value=0)
    ValueError: Weights must be non-negative.

    >>> validate_weights([0.1, 0.2, 0.7], max_value=0.5)
    ValueError: Weights must not exceed 0.5.

    >>> validate_weights([1, 2, 3], allowed_dims=(1, 2))
    ValueError: Weights dimensions not allowed.
    """
    try:
        weights_array = np.asarray(weights, dtype=float)
    except Exception as e:
        raise ValueError(
            "Weights must be provided in a format that can be"
            " converted to a numpy array."
        ) from e

    if isinstance(allowed_dims, int):
        allowed_dims = (allowed_dims,)
    if weights_array.ndim not in allowed_dims:
        raise ValueError(
            f"Weights must have dimensions in {allowed_dims}."
        )

    # Check if min_value is None and enforce non-negative weights by default
    if min_value is None:
        if np.any(weights_array < 0):
            raise ValueError(
                "Weights must be non-negative unless 'min_value'"
                " is explicitly set to allow negative values."
            )
        min_value = 0.0
    if np.any(weights_array < min_value) or (
        max_value is not None
        and np.any(weights_array > max_value)
    ):
        raise ValueError(
            f"Weights must be between {min_value} and"
            f" {max_value if max_value is not None else '∞'}."
        )

    if normalize:
        if np.sum(weights_array) == 0:
            raise ValueError(
                "Cannot normalize weights because their sum is zero."
            )

        if not is_normalized(weights_array, method="sum"):
            weights_array /= np.sum(weights_array)

    return weights_array




[docs]
def is_normalized(arr, method="sum"):
    """
    Checks if the provided array is normalized according to the specified method.

    Parameters
    ----------
    arr : array-like
        The array to check for normalization.
    method : str, optional
        The normalization method to check against. Use ``"01"`` to confirm
        values are within ``[0, 1]`` with minimum ``0`` and maximum ``1``,
        ``"zscore"`` to confirm mean ``0`` and standard deviation ``1``, or
        ``"sum"`` to confirm the array sums to ``1``. Default is ``"sum"``.

    Returns
    -------
    bool
        Returns True if the array is normalized according to the specified method,
        False otherwise.

    Examples
    --------
    >>> arr = np.array([0.25, 0.25, 0.25, 0.25])
    >>> is_normalized(arr, method='sum')
    True

    >>> arr = np.array([0, 0.5, 1])
    >>> is_normalized(arr, method='01')
    True

    >>> arr = np.array([1, -1, 1, -1])
    >>> is_normalized(arr, method='zscore')
    True
    """
    arr = np.asarray(arr, dtype=float)
    method = parameter_validator(
        "method", target_strs={"01", "zscore", "sum"}
    )(method)

    if method == "01":
        # Check if all elements are within [0, 1] and max is 1, min is 0
        return (
            np.all((arr >= 0) & (arr <= 1))
            and np.isclose(np.min(arr), 0)
            and np.isclose(np.max(arr), 1)
        )
    elif method == "zscore":
        # Check if mean is approximately 0 and std is approximately 1
        mean = np.mean(arr)
        std = np.std(arr)
        return np.isclose(mean, 0) and np.isclose(std, 1)
    elif method == "sum":
        # Check if the sum of the elements is approximately 1
        return np.isclose(np.sum(arr), 1)




[docs]
def normalize_array(arr, normalize="auto", method="01"):
    """
    Checks if an array is normalized according to the specified method and
    normalizes it if required based on the 'normalize' parameter.

    Parameters
    ----------
    arr : array-like
        The input array to check and potentially normalize.

    normalize : str, optional
        Controls whether normalization is applied. Use ``"auto"`` to
        normalize only when the array is not already normalized for the
        selected ``method``. Use ``True`` to always normalize and ``False``
        to return the array unchanged. Default is ``"auto"``.

    method : str, optional
        Normalization method to apply. Use ``"01"`` for min-max scaling,
        ``"zscore"`` for standardization, or ``"sum"`` to scale values so
        they sum to ``1``. Default is ``"01"``.

    Returns
    -------
    np.ndarray
        The normalized array, or the original array if no normalization was applied.

    Raises
    ------
    ValueError
        If an unknown normalization method is specified or if normalization
        cannot be performed due to data characteristics (e.g., zero variance).

    Examples
    --------
    >>> import numpy as np
    >>> from geoprior.utils.validator import normalize_array

    >>> data = np.array([1, 2, 3, 4, 5])
    >>> normalized_data = normalize_array(data, normalize=True, method='01')
    >>> print("Normalized between 0 and 1:", normalized_data)
    Normalized between 0 and 1: [0.   0.25 0.5  0.75 1.  ]

    >>> zscore_data = normalize_array(data, normalize=True, method='zscore')
    >>> print("Standardized (Z-score):", zscore_data)
    Standardized (Z-score): [-1.41421356 -0.70710678  0.          0.70710678  1.41421356]

    >>> sum_data = normalize_array(data, normalize=True, method='sum')
    >>> print("Normalized by sum:", sum_data)
    Normalized by sum: [0.06666667 0.13333333 0.2        0.26666667 0.33333333]
    """
    arr = np.asarray(arr, dtype=float)
    is_normed = is_normalized(arr, method=method)

    normalize = parameter_validator(
        "normalize", target_strs={True, False, "auto"}
    )(normalize)

    if normalize == "auto":
        normalize = not is_normed

    if normalize:
        if method == "01":
            min_val = np.min(arr)
            max_val = np.max(arr)
            if min_val == max_val:
                raise ValueError(
                    "Normalization impossible with zero variance."
                )
            arr = (arr - min_val) / (max_val - min_val)
        elif method == "zscore":
            mean = np.mean(arr)
            std = np.std(arr)
            if std == 0:
                raise ValueError(
                    "Standardization impossible with zero variance."
                )
            arr = (arr - mean) / std
        elif method == "sum":
            total = np.sum(arr)
            if total == 0:
                raise ValueError(
                    "Normalization by sum impossible with zero sum."
                )
            arr = arr / total

    # If normalization is not required, return the original array
    return arr




[docs]
def is_binary_class(y, accept_multioutput=False):
    """
    Check whether the target array represents binary classification. Optionally,
    handle multi-output arrays if each output is binary.

    Parameters
    ----------
    y : array-like
        The target array to be checked. This can be a 1D array for single output
        or a 2D array for multiple outputs if `accept_multioutput` is True.
    accept_multioutput : bool, default False
        If True, the function checks if each column in a multi-dimensional array
        is binary. If False, the function checks if the entire array is binary.

    Returns
    -------
    bool
        Returns True if `y` is binary (or each output is binary if multi-output
        is accepted), False otherwise.

    Examples
    --------
    >>> from geoprior.utils.validator import is_binary_class
    >>> is_binary_class([0, 1, 1, 0])
    True
    >>> is_binary_class([[0, 1], [1, 0], [0, 1], [1, 0]], accept_multioutput=True)
    True
    >>> is_binary_class([0, 1, 2, 3])
    False
    """
    y = np.asarray(y)
    y = check_y(y, multi_output=True, y_numeric=True)

    if not accept_multioutput:
        # Check if the entire array is binary
        unique_values = np.unique(y)
        return len(unique_values) == 2 and np.all(
            np.isin(unique_values, [0, 1])
        )

    if y.ndim == 1:
        # If the array is 1D and multioutput is expected, treat it as a single column
        y = y.reshape(-1, 1)

    if y.ndim > 1:
        # Check each column independently
        for column in y.T:
            unique_values = np.unique(column)
            if not (
                len(unique_values) == 2
                and np.all(np.isin(unique_values, [0, 1]))
            ):
                return False
        return True

    return False




[docs]
def handle_zero_division(
    y_true,
    zero_division="warn",
    metric_name="metric computation",
    epsilon=1e-15,
    replace_with=None,
):
    """
    Preprocess input arrays to handle cases where zero could cause division errors
    in subsequent metric computations.

    Parameters
    ----------
    y_true : array-like
        The input data array where zeros might cause division errors.
    zero_division : {'warn', 'raise', 'ignore'}, default 'warn'
        Determines the action to perform when a zero is encountered. Use
        ``"warn"`` to issue a warning and replace zeros with ``replace_with``
        or ``epsilon``, ``"raise"`` to raise an error, or ``"ignore"`` to
        leave zeros unchanged when the metric can handle them natively.
    metric_name : str, optional
        Name of the metric for which this preprocessing is being done, to be included
        in warnings or error messages for better context.
    epsilon : float, optional
        Small value to use as default replacement if `replace_with` is None,
        default is 1e-15.
    replace_with : float or None, optional
        A specific value to replace zeros with, if None, `epsilon` is used.

    Returns
    -------
    numpy.ndarray
        The processed array with modifications based on the zero_division strategy.

    Raises
    ------
    ValueError
        If `zero_division` is 'raise' and zero is found in `y_true`.

    Notes
    -----
    Using `replace_with` allows for custom behavior when handling zeros, which can
    be tailored to the specific requirements of different metric computations.

    Examples
    --------
    >>> from geoprior.utils.validator import handle_zero_division
    >>> y_true = [0, 1, 2, 3, 0]
    >>> processed_y_true = handle_zero_division(
    ...     y_true, replace_with=0.001, zero_division='warn'
    ... )
    >>> print(processed_y_true)
    [1.e-03 1.e+00 2.e+00 3.e+00 1.e-03]

    """
    y_true_processed = np.asarray(y_true, dtype=float)
    zero_division = parameter_validator(
        "zero_division",
        target_strs=["warn", "raise", "ignore"],
    )(zero_division)

    zeros_mask = y_true_processed == 0
    if np.any(zeros_mask):
        if zero_division == "warn":
            warnings.warn(
                f"Encountered zero in y_true, which may lead to"
                f" infinite values or NaNs in {metric_name}.",
                RuntimeWarning,
                stacklevel=2,
            )
            replacement_value = (
                replace_with
                if replace_with is not None
                else epsilon
            )
            y_true_processed[zeros_mask] = replacement_value
        elif zero_division == "raise":
            raise ValueError(
                f"Encountered zero in y_true, leading to division"
                f" by zero in {metric_name} computation."
            )
        elif zero_division == "ignore":
            pass  # Do nothing, let the calling function handle zeros natively.

    return y_true_processed



def convert_to_numeric(
    value, preserve_integers=True, context_description="Data"
):
    """
    Helper function to convert values to float. It ensures that integers are
    converted to floats (unless preserve_integers is True) and raises a detailed
    error for non-numeric values.

    Parameters
    ----------
    value : Any
        The value to be converted to float. Integer values are converted, while
        floats are returned as-is. Non-numeric types raise a ValueError.

    preserve_integers : bool, optional, default True
        If True, integer values are preserved as integers and not converted to floats
        and False otherwise.

    context_description : str, optional, default 'Data'
        A description of the type of data being processed, used in error messages
        to provide context (e.g., 'Performance data', 'Input data').

    Returns
    -------
    float or int
        The converted numeric value (float by default, or int if preserve_integers is True).

    Raises
    ------
    ValueError
        If the value cannot be converted to a numeric type
        (e.g., strings that do not represent numbers).

    Examples
    --------
    >>> from geoprior.utils.validator import convert_to_numeric
    >>> convert_to_numeric(5)
    5.0
    >>> convert_to_numeric(5, preserve_integers=True)
    5
    >>> convert_to_numeric(3.14)
    3.14
    >>> convert_to_numeric('0.85')
    0.85
    >>> convert_to_numeric('abc')
    ValueError: Data expected numeric values, but got str: 'abc'
    """
    try:
        # Check if the value is an integer
        if isinstance(value, int):
            if preserve_integers:
                return value  # Keep the integer as is
            else:
                return float(value)  # Convert to float
        # If the value is already a float, return it
        elif isinstance(value, float):
            return value
        # Attempt to convert any other type (like strings) to float
        else:
            return float(
                value
            )  # Handle strings that represent numbers
    except (ValueError, TypeError) as e:
        # Raise a clear error with context-specific description
        raise ValueError(
            f"{context_description} expected numeric values,"
            f" but got {type(value).__name__}: '{value}'"
        ) from e



[docs]
def validate_performance_data(
    model_performance_data=None,
    nan_policy="raise",
    convert_integers=True,
    check_performance_range=True,
    verbose=False,
):
    """
    Validates and preprocesses model performance data to ensure it conforms
    to the necessary structure and constraints for statistical and machine
    learning analysis. The function accepts either a dictionary or a
    DataFrame as input and performs the following tasks:

    1. Converts data to a DataFrame if it is provided as a dictionary.
    2. Converts integer values to floats, ensuring compatibility with
       statistical processing.
    3. Manages NaN values according to the specified `nan_policy`.
    4. Validates that performance data falls within a valid range, ensuring
       values lie within [0, 1].

    The function is adaptable, capable of being used directly or as a
    decorator, with or without configuration parameters.

    Parameters
    ----------
    model_performance_data : Union[Dict[str, List[float]], pd.DataFrame], optional
        The input model performance data to validate. Can be provided as
        either a dictionary (with model names as keys and performance
        metrics as lists) or a DataFrame where each column represents a model.

    nan_policy : str, default='raise'
        The policy to handle NaN values:
        * 'raise': Raises a ValueError if NaNs are detected.
        * 'omit': Drops rows with NaNs.
        * 'propagate': Ignores NaNs during performance range checks.

    convert_integers : bool, default=True
        Converts integer values within the data to floats if set to True,
        which is useful for consistency when computing metrics.

    check_performance_range : bool, default=True
        Ensures that performance values lie within the range [0, 1].
        If any value falls outside this range, an error is raised unless
        `nan_policy` is set to 'propagate'.

    verbose : bool, default=False
        If True, displays steps of the data validation process for tracking
        operations and debugging.

    Methods
    -------
    actual_validate_performance_data(data)
        Validates and processes the data according to specified policies
        and constraints.

    Usage
    -----
    This function can be utilized in three primary ways:

    1. **As a function**: Provide data directly to perform validation.

    >>> from geoprior.utils.validator import validate_performance_data
    >>> data = {'model1': [0.85, 0.90, 0.92], 'model2': [0.80, 0.87, 0.88]}
    >>> validate_performance_data(data)

    2. **As a decorator**: Use as a decorator to validate the first
       argument of a function. If used without parentheses, default values
       will be applied.

    >>> @validate_performance_data
    >>> def process_data(validated_data):
    >>>     print(validated_data)

    3. **As a decorator with parameters**: Customize validation by
       specifying parameters.

    >>> @validate_performance_data(nan_policy='omit', verbose=True)
    >>> def process_data(validated_data):
    >>>     print(validated_data)

    Notes
    -----
    The validation process includes statistical pre-checks, using custom
    modules to convert data and handle NaNs. For integer-to-float
    conversion, the `convert_to_numeric` function is utilized, while NaN
    policies are verified using `is_valid_policies`. The comparison
    framing for multiple models follows
    :cite:t:`Demsar2006Classifiers`.

    See Also
    --------
    DataFrameFormatter : Formatter for handling DataFrame structures.
    MultiFrameFormatter : Formatter for handling multiple DataFrames.

    """

    from ..api.formatter import (
        DataFrameFormatter,
        MultiFrameFormatter,
        formatter_validator,
    )
    from ..decorators import isdf

    @isdf
    def actual_validate_performance_data(data):
        # Convert to DataFrame if input is a dictionary
        if isinstance(data, dict):
            if verbose:
                print("Converting dictionary to DataFrame...")
            df = pd.DataFrame(data)
        elif isinstance(data, pd.DataFrame):
            df = data.copy()
        else:
            raise ValueError(
                "Input data must be either a dictionary or a DataFrame."
            )

        # Ensure all values are float, convert integers to floats if needed
        if convert_integers:
            if verbose:
                print(
                    "Converting integer values to floats where necessary..."
                )
            df = df.applymap(
                convert_to_numeric,
                preserve_integers=False,
                context_description="Performance data",
            )

        # Handle NaN values according to nan_policy
        is_valid_policies(
            nan_policy,
            allowed_policies=["raise", "omit", "propagate"],
        )

        if df.isna().any().any():  # Check for NaN values
            if nan_policy == "raise":
                raise ValueError(
                    "NaN values detected in the data. Set"
                    " `nan_policy='omit'` to drop them."
                )
            elif nan_policy == "omit":
                if verbose:
                    print("Dropping rows with NaN values...")
                df = df.dropna()

        # Ensure all values are float type
        df = df.astype(float)

        # Check if performance values are within the valid range [0, 1]
        if check_performance_range:
            if nan_policy == "propagate":
                df_checked = df.dropna()
            else:
                df_checked = df

            if (df_checked < 0).any().any():
                raise ValueError(
                    "Performance values cannot be negative."
                )
            if (df_checked > 1).any().any():
                raise ValueError(
                    "Performance values must be in the range [0, 1]."
                )

        if verbose:
            print(
                "Validation and conversion complete."
                " Data is ready for further processing."
            )

        return df

    if model_performance_data is not None and callable(
        model_performance_data
    ):
        # Used as a decorator without arguments
        func = model_performance_data

        @wraps(func)
        def wrapper(*args, **kwargs):
            data = args[0]
            validated_data = actual_validate_performance_data(
                data
            )
            return func(validated_data, *args[1:], **kwargs)

        return wrapper

    elif model_performance_data is not None:
        # Used as a normal function
        # Validate and extract DataFrame if data is a formatter instance
        if isinstance(
            model_performance_data,
            DataFrameFormatter | MultiFrameFormatter,
        ):
            model_performance_data = formatter_validator(
                model_performance_data,
                df_indices=[0],
                only_df=True,
            )

        return actual_validate_performance_data(
            model_performance_data
        )

    else:
        # Used as a decorator with arguments
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                data = args[0]
                validated_data = (
                    actual_validate_performance_data(data)
                )
                return func(
                    validated_data, *args[1:], **kwargs
                )

            return wrapper

        return decorator



def validate_sequences(
    sequences: np.ndarray,
    n_features: int | None = None,
    batch_size: int | None = None,
    sequence_length: int | None = None,
    check_shape: bool = ...,
) -> np.ndarray:
    """
    Validate and reshape sequences input for a neural network.

    Parameters
    ----------
    sequences : `numpy.ndarray`
        Array of input sequences with shape (batch_size, sequence_length, num_features).
        It must be a 3D numpy array.

    n_features : int, optional
        The number of features in the sequences. If provided, the function
        will reshape the sequence accordingly. If the sequence doesn't have
        `n_features`, an error will be raised.

    batch_size : int, optional
        The batch size to check or reshape the sequences. If provided, it will
        validate if the sequences align with this batch size.

    sequence_length : int, optional
        The length of each sequence to check or reshape.

    check_shape : bool, default=True
        Whether to check if the sequences are of valid shape. If set to False,
        shape validation will be skipped.

    Returns
    -------
    np.ndarray
        A validated and reshaped sequence array with shape
        (batch_size, sequence_length, num_features).

    Raises
    ------
    TypeError
        If `sequences` is not a `numpy.ndarray`.

    ValueError
        If the shape of the `sequences` array does not match the expected
        shape, or if `n_features` does not align with the sequence dimensions.

    Examples
    ---------
    >>> import numpy as np
    >>> from geoprior.utils.validator import validate_sequences

    >>> # Example 3D sequences array (batch_size=2, sequence_length=3, n_features=4)
    >>> sequences = np.random.rand(2, 3, 4)

    >>> # Validate and reshape sequences if necessary
    >>> validated_sequences = validate_sequences(sequences, n_features=5, check_shape=True)
    >>> print(validated_sequences.shape)

    """
    # Ensure that sequences is a numpy array
    try:
        sequences = np.asarray(
            sequences
        )  # just for consistency
    except Exception as e:
        raise TypeError(
            f"The sequences input must be a numpy.ndarray. {e}"
        )

    # Check if the sequences array is 3D
    if len(sequences.shape) != 3:
        raise ValueError(
            "The sequences array must have 3 dimensions: "
            "(batch_size, sequence_length, num_features)."
        )

    # Extract the current shape of the sequences array
    (
        current_batch_size,
        current_sequence_length,
        current_n_features,
    ) = sequences.shape

    # Validate the provided parameters if check_shape is True
    if check_shape:
        if (
            batch_size is not None
            and current_batch_size != batch_size
        ):
            raise ValueError(
                f"Expected batch size of {batch_size},"
                " but got {current_batch_size}."
            )
        if (
            sequence_length is not None
            and current_sequence_length != sequence_length
        ):
            raise ValueError(
                f"Expected sequence length of {sequence_length},"
                f" but got {current_sequence_length}."
            )
        if (
            n_features is not None
            and current_n_features != n_features
        ):
            raise ValueError(
                f"Expected {n_features} features,"
                f" but got {current_n_features}."
            )

    # Reshaping based on n_features if it's provided
    if n_features is not None:
        if current_n_features != n_features:
            # Reshape the sequences to match the provided n_features
            sequences = sequences.reshape(
                current_batch_size,
                current_sequence_length,
                n_features,
            )
            current_n_features = n_features

    # Optionally adjust the batch_size or sequence_length if needed
    if batch_size is not None or sequence_length is not None:
        sequences = sequences[
            :batch_size, :sequence_length, :current_n_features
        ]

    # Return the validated and possibly reshaped sequences
    return sequences



[docs]
def validate_comparison_data(df, alignment="auto"):
    """
    Validates a DataFrame to ensure it is a square matrix and that the index
    and column names match. Optionally aligns the index names to the column
    names or vice versa based on the alignment parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to validate.
    alignment : str, default 'auto'
        Controls how the DataFrame's index and columns are aligned if they d
        o not match.
        Options are 'auto', 'index_to_columns', and 'columns_to_index'.

    Returns
    -------
    pandas.DataFrame
        The validated and potentially modified DataFrame.

    Raises
    ------
    ValueError
        If the DataFrame is not square or if index and column names do not match
        and no suitable alignment option is specified.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_comparison_data
    >>> data = pd.DataFrame({
    ...     'A': [1, 0.9, 0.8],
    ...     'B': [0.9, 1, 0.85],
    ...     'C': [0.8, 0.85, 1]
    ... }, index=['A', 'B', 'X'])
    >>> print(validate_comparison_data(data, alignment='index_to_columns'))

    >>> data = pd.DataFrame({
    ...     1: [1, 0.9, 0.8],
    ...     2: [0.9, 1, 0.85],
    ...     3: [0.8, 0.85, 1]
    ... }, index=[1, 2, 'X'])
    >>> print(validate_comparison_data(data, alignment='auto'))
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"Performance data expects a DataFrame; got {type(df).__name__!r}"
        )
    # Check if DataFrame is square
    if df.shape[0] != df.shape[1]:
        raise ValueError(
            "DataFrame must be square (equal number of rows and columns)."
        )

    # Check if indices and columns match
    if not df.index.equals(df.columns):
        if alignment == "index_to_columns":
            df.index = df.columns
        elif alignment == "columns_to_index":
            df.columns = df.index
        elif alignment == "auto":
            # Automatically decide which one to use based on data types
            if (
                df.index.dtype == "object"
                and df.columns.dtype == "int64"
            ):
                df.index = df.columns
            elif (
                df.columns.dtype == "object"
                and df.index.dtype == "int64"
            ):
                df.columns = df.index
            else:
                raise ValueError(
                    "Automatic alignment failed. Index and column names do not match "
                    "and are of the same type. Please specify alignment explicitly."
                )
        else:
            raise ValueError(
                "Invalid alignment option provided. Please choose from 'index_to_columns', "
                "'columns_to_index', or 'auto'."
            )

    return df




[docs]
def validate_data_types(
    data,
    expected_type="numeric",
    nan_policy="omit",
    return_data=False,
    error="raise",
):
    """
    Checks for mixed data types in a pandas Series or DataFrame and handles
    according to the specified policies. This function is designed to ensure
    data consistency by verifying that data matches expected type criteria,
    offering options to manage and report any discrepancies.

    Parameters
    ----------
    data : pd.Series or pd.DataFrame
        The data to be checked. This can be a pandas Series or DataFrame.
    expected_type : {'numeric', 'categoric', 'both'}, default 'numeric'
        Specifies the type of data expected:

        - 'numeric': All data should be of numeric types (int, float).
        - 'categoric': All data should be categorical, typically strings
          or pandas Categorical datatype.
        - 'both': Any mix of numeric and categorical data is considered valid.

    nan_policy : {'raise', 'omit', 'propagate'}, default 'omit'
        Determines how NaN values are handled:

        - 'raise': Raises an error if NaN values are found.
        - 'warn': Issues a warning if NaN values are found but proceeds.
        - 'propagate': Continues execution without addressing NaNs.

    return_data : bool, default False
        If True, returns a DataFrame or Series (depending on the input) that
        only includes data rows that conform to the expected_type. If False,
        returns None.

    error : {'raise', 'warn'}, default 'raise'
        Configures the error handling behavior when data types do not conform
        to the expected_type:

        - 'raise': Raises a TypeError if mixed types are detected.
        - 'warn': Emits a warning but attempts to continue by filtering
          non-conforming data if `return_data` is True.

    Returns
    -------
    pd.Series or pd.DataFrame or None
        Depending on `return_data`, this function may return a filtered version
        of `data` that conforms to the `expected_type` or None if `return_data`
        is False.

    Raises
    ------
    ValueError
        If NaN values are present and `nan_policy` is set to 'error'.
    TypeError
        If data types do not conform to `expected_type` and `error` is set to 'raise'.

    Examples
    --------
    >>> import pandas as pd
    >>> from geoprior.utils.validator import validate_data_types
    >>> df = pd.DataFrame({'A': [1, 2, 'a', 3.5, np.nan], 'B': ['x', 'y', 'z', None, 't']})
    >>> validate_data_types(df, expected_type='numeric', nan_policy='warn',
    ...                  return_data=True, error='warn')
    UserWarning: NaN values found in the data, but processing will continue.
    UserWarning: Expected numeric types but found mixed types.
    Non-numeric data will be ignored.
       A
    0  1.0
    1  2.0
    3  3.5

    Notes
    -----
    The `check_data_types` function is useful in data preprocessing steps,
    particularly when you need to ensure that data fed into a machine learning
    algorithm meets certain type requirements. Handling mixed data types early
    on can prevent issues in model training and evaluation.
    """
    expected_type = parameter_validator(
        "expected_type",
        target_strs={"numeric", "categoric", "both"},
    )(expected_type)

    if not isinstance(data, pd.Series | pd.DataFrame):
        data = build_data_if(
            data,
            raise_exception=True,
            force=True,
            input_name="feature",
        )

    if isinstance(data, pd.Series):
        data = data.to_frame()

    # Handle NaN values according to the nan_policy
    nan_policy = is_valid_policies(nan_policy)
    if nan_policy == "raise" and data.isnull().any().any():
        raise ValueError("NaN values found in the data.")
    elif (
        nan_policy == "propagate"
        and data.isnull().any().any()
    ):
        warnings.warn(
            "NaN values found in the data, but processing will continue.",
            stacklevel=2,
        )

    def _handle_numeric(data, return_data):
        is_numeric = pd.to_numeric(
            data, errors="coerce"
        ).notna()
        if not is_numeric.all():
            if error == "raise":
                raise TypeError(
                    "Mixed types detected. Please encode categorical variables first."
                )
            elif error == "warn":
                warnings.warn(
                    "Expected numeric types but found mixed types."
                    " Non-numeric data will be ignored.",
                    stacklevel=2,
                )
                if return_data:
                    return data.loc[is_numeric]
        return data[is_numeric] if return_data else None

    def _handle_categoric(data, return_data):
        is_categoric = data.apply(
            lambda x: isinstance(x, str | pd.CategoricalDtype)
        )
        if not is_categoric.all():
            if error == "raise":
                raise TypeError(
                    "Mixed types detected with unexpected numeric data."
                )
            elif error == "warn":
                warnings.warn(
                    "Expected categoric types but found numeric data.",
                    stacklevel=2,
                )
                if return_data:
                    return data[is_categoric]
        return data[is_categoric] if return_data else None

    results = pd.DataFrame()

    for column in data.columns:
        col_data = data[column]
        if expected_type == "numeric":
            result = _handle_numeric(col_data, return_data)
        elif expected_type == "categoric":
            result = _handle_categoric(col_data, return_data)
        elif expected_type == "both":
            if error == "warn":
                warnings.warn(
                    "Mixed data types found. Be cautious of unintended data type issues.",
                    stacklevel=2,
                )
            result = col_data if return_data else None
        else:
            raise ValueError(
                "Unsupported expected_type provided. Choose"
                " 'numeric', 'categoric', or 'both'."
            )

        if return_data and result is not None:
            results[column] = result

    return results if not results.empty else None




[docs]
def ensure_2d(X, output_format="auto"):
    """
    Ensure that the input X is converted to a 2-dimensional structure.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        The input data to convert. Can be a list, numpy array, or DataFrame.
    output_format : str, optional
        The format of the returned object. Options are "auto", "array", or "frame".
        "auto" returns a DataFrame if X is a DataFrame, otherwise a numpy array.
        "array" always returns a numpy array.
        "frame" always returns a pandas DataFrame.

    Returns
    -------
    ndarray or DataFrame
        The converted 2-dimensional structure, either as a numpy array or DataFrame.

    Raises
    ------
    ValueError
        If the `output_format` is not one of the allowed values.

    Examples
    --------
    >>> import numpy as np
    >>> from geoprior.utils.validator import ensure_2d
    >>> X = np.array([1, 2, 3])
    >>> ensure_2d(X, output_format="array")
    array([[1],
           [2],
           [3]])
    >>> df = pd.DataFrame([1, 2, 3])
    >>> ensure_2d(df, output_format="frame")
       0
    0  1
    1  2
    2  3
    """
    # Check for allowed output_format values
    output_format = parameter_validator(
        "output_format",
        target_strs=["auto", "array", "frame"],
    )(output_format)

    # Detect if the input is a DataFrame
    is_dataframe = isinstance(X, pd.DataFrame)

    # Ensure X is at least 2-dimensional
    if isinstance(X, np.ndarray):
        if X.ndim == 1:
            X = X[:, np.newaxis]
    elif isinstance(X, pd.DataFrame):
        if (
            X.shape[1] == 0
        ):  # Implies an empty DataFrame or misshapen
            X = X.values.reshape(
                -1, 1
            )  # reshape and handle as array
            is_dataframe = False
    else:
        X = np.array(
            X
        )  # Convert other types like lists to np.array
        if X.ndim == 1:
            X = X[:, np.newaxis]

    # Decide on return type based on output_format
    if output_format == "array":
        return X if isinstance(X, np.ndarray) else X.values
    elif output_format == "frame":
        return pd.DataFrame(X) if not is_dataframe else X
    else:  # auto handling
        if is_dataframe:
            return X
        return pd.DataFrame(X) if is_dataframe else X




[docs]
def is_categorical(data, column, strict=False, error="raise"):
    """
    Checks if a specified column in a DataFrame or Series is of
    a categorical type.

    Parameters
    ----------
    data : DataFrame or Series
        The DataFrame or Series to check.
    column : str
        The name of the column to check.
    strict : bool, optional
        If True, only considers pandas CategoricalDtype as categorical. If False,
        also considers object dtype that often represents categorical data.
        Default is False.
    error : str, optional
        Specifies how to handle situations when the column does not exist.
        Options are 'raise', 'warn', or 'ignore'. Default is 'raise'.

    Returns
    -------
    bool
        True if the column is categorical, otherwise False.

    Raises
    ------
    ValueError
        If the column does not exist and error is set to 'raise'.

    Examples
    --------
    >>> import pandas as pd
    >>> from geoprior.utils.validator import is_categorical
    >>> df = pd.DataFrame({
    ...     'fruit': ['Apple', 'Banana', 'Cherry'],
    ...     'count': [10, 20, 15]
    ... })
    >>> df['fruit'] = df['fruit'].astype('category')
    >>> print(is_categorical(df, 'fruit'))
    True
    >>> print(is_categorical(df, 'count'))
    False
    >>> print(is_categorical(df, 'non_existent', error='warn'))
    Warning: Column 'non_existent' not found in the dataframe.
    False
    """
    if column not in data.columns:
        message = (
            f"Column '{column}' not found in the dataframe."
        )
        if error == "raise":
            raise ValueError(message)
        elif error == "warn":
            warnings.warn(message, stacklevel=2)
        return False  # Return False if error is 'ignore' or 'warn' and column is not found

    col_type = data[column].dtype
    if strict:
        return pd.api.types.is_categorical_dtype(col_type)
    else:
        return pd.api.types.is_categorical_dtype(
            col_type
        ) or pd.api.types.is_object_dtype(col_type)




[docs]
def parameter_validator(
    param_name,
    target_strs,
    match_method="contains",
    raise_exception=True,
    **kws,
):
    """
    Creates a validator function for ensuring a parameter's value matches one
    of the allowed target strings, optionally applying normalization.

    This higher-order function returns a validator that can be used to check
    if a given parameter value matches allowed criteria, optionally raising
    an exception or normalizing the input.

    Parameters
    ----------
    param_name : str
        Name of the parameter to be validated. Used in error messages to
        indicate which parameter failed validation.
    target_strs : list of str
        A list of acceptable string values for the parameter.
    match_method : str, optional
        The method used to match the input string against the target strings.
        The default method is 'contains', which checks if the input string
        contains any of the target strings.
    raise_exception : bool, optional
        Specifies whether an exception should be raised if validation fails.
        Defaults to True, raising an exception on failure.
    **kws: dict,
       Keyword arguments passed to :func:`geoprior.core.utils.normalize_string`.
    Returns
    -------
    function
        A closure that takes a single string argument (the parameter value)
        and returns a normalized version of it if the parameter matches the
        target criteria. If the parameter does not match and `raise_exception`
        is True, it raises an exception; otherwise, it returns the original value.

    Examples
    --------
    >>> from geoprior.utils.validator import parameter_validator
    >>> validate_outlier_method = parameter_validator(
    ...  'outlier_method', ['z_score', 'iqr'])
    >>> outlier_method = "z_score"
    >>> print(validate_outlier_method(outlier_method))
    'z_score'

    >>> validate_fill_missing = parameter_validator(
    ...  'fill_missing', ['median', 'mean', 'mode'], raise_exception=False)
    >>> fill_missing = "average"  # This does not match but won't raise an exception.
    >>> print(validate_fill_missing(fill_missing))
    'average'

    Notes
    -----
    - The function leverages a custom utility function `normalize_string`
      from a module named `geoprior.core.utils`. This utility is assumed to handle
      string normalization and matching based on the provided `match_method`.
    - If `raise_exception` is set to False and the input does not match any
      target string, the input string is returned unchanged. This behavior
      allows for optional enforcement of the validation rules.
    - The primary use case for this function is to validate and optionally
      normalize parameters for configuration settings or function arguments
      where only specific values are allowed.
    """
    from ..core.utils import normalize_string

    def validator(param_value):
        """Validate param value from :func:`~normalize_string`"""
        if param_value:
            return normalize_string(
                param_value,
                target_strs=target_strs,
                return_target_only=True,
                match_method=match_method,
                raise_exception=raise_exception,
                **kws,
            )
        return param_value  # Return the original value if it's None or empty

    return validator




[docs]
def validate_distribution(
    distribution,
    elements=None,
    kind=None,
    check_normalization=True,
):
    """
    Validates or generates distributions for given elements, ensuring the
    sum equals 1 if `check_normalization` is True.

    Parameters
    ----------
    distribution : str, tuple, list
        The distribution to be validated or generated. If 'auto',
        generates a random distribution for the specified number of elements.
        Can also be a tuple or list representing an explicit distribution.
    elements : int, list of str, optional
        Defines how many elements the distribution should be generated for
        when 'auto' is used. If a list of strings is provided, its length
        is used to determine the number of elements.
    kind : str, optional
        Specifies the kind of distribution. It can be ``{"probs"}`` for
        probability distributions, where the sum should equal 1 and
        values must be non-negative.
    check_normalization : bool, optional
        If True, ensures that the sum of the distribution equals 1.
        Default is True.

    Returns
    -------
    tuple
        A tuple representing the validated or generated distribution.

    Raises
    ------
    ValueError
        If the provided distribution does not meet the specified conditions.

    Examples
    ---------
    >>> from geoprior.utils.validator import validate_distribution
    >>> validate_distribution("auto", elements=['positive', 'neutral', 'negative'])
    (0.1450318690603951, 0.5660028611331361, 0.2889652698064687)
    """
    # Determine the number of elements if a list is provided
    distributed_elements = None
    if elements is not None:
        if isinstance(elements, list | tuple | np.ndarray):
            distributed_elements = len(elements)
        elif isinstance(
            elements, float | int | np.integer | np.floating
        ):
            distributed_elements = int(elements)
        else:
            raise ValueError(
                "'elements' must be an integer or a list of strings."
            )

    # Generate a random distribution if specified as 'auto'
    if str(distribution).lower() == "auto":
        if distributed_elements is None:
            raise ValueError(
                "'distributed_elements' must be specified when"
                " using 'auto' distribution."
            )
        # Generate a random distribution
        random_values = np.random.rand(distributed_elements)
        distribution = tuple(
            random_values / np.sum(random_values)
        )
    else:
        if not hasattr(
            distribution, "__iter__"
        ) or isinstance(distribution, str):
            raise ValueError(
                "Distribution must be 'auto', a tuple, or a list of values."
            )

        distribution = tuple(distribution)

        if (
            distributed_elements is not None
            and len(distribution) != distributed_elements
        ):
            raise ValueError(
                f"The distribution must have exactly {distributed_elements} elements."
            )

        validated_distribution = []
        for value in distribution:
            if not isinstance(value, int | float):
                raise ValueError(
                    "All distribution values must be numeric."
                )
            validated_distribution.append(float(value))

        # Check if the distribution is normalized
        if check_normalization and not np.isclose(
            sum(validated_distribution), 1
        ):
            raise ValueError(
                "The sum of the distribution values must be equal to 1."
            )

        distribution = tuple(validated_distribution)

    # Check if the distribution matches a probability distribution
    if kind == "probs":
        _is_probability_distribution(
            distribution, mode="strict", error="raise"
        )

    return distribution




[docs]
def validate_length_range(
    length_range, sorted_values=True, param_name=None
):
    """
    Validates the review length range ensuring it's a tuple with two integers
    where the first value is less than the second.

    Parameters
    ----------
    length_range : tuple
        A tuple containing two values that represent the minimum and maximum
        lengths of reviews.
    sorted_values: bool, default=True
        If True, the function expects the input length range to be sorted in
        ascending order and will automatically sort it if not. If False, the
        input length range is not expected to be sorted, and it will remain
        as provided.
    param_name : str, optional
        The name of the parameter being validated. If None, the default name
        'length_range' will be used in error messages.

    Returns
    -------
    tuple
        The validated length range.

    Raises
    ------
    ValueError
        If the length range does not meet the requirements.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_length_range
    >>> validate_length_range ( (202, 25) )
    (25, 202)
    >>> validate_length_range ( (202,) )
    ValueError: length_range must be a tuple with two elements.
    """
    param_name = param_name or "length_range"
    if (
        not isinstance(length_range, list | tuple)
        or len(length_range) != 2
    ):
        raise ValueError(
            f"{param_name} must be a tuple with two elements."
        )

    min_length, max_length = length_range

    if not all(
        isinstance(x, float | int | np.integer | np.floating)
        for x in length_range
    ):
        raise ValueError(
            f"Both elements in {param_name} must be numeric."
        )

    if sorted_values:
        length_range = tuple(sorted([min_length, max_length]))
        if length_range[0] >= length_range[1]:
            raise ValueError(
                f"The first element in {param_name} must be less than the second."
            )
    else:
        length_range = tuple([min_length, max_length])

    return length_range




[docs]
def contains_nested_objects(
    lst, strict=False, allowed_types=None
):
    """
    Determines whether a list contains nested objects.

    Parameters
    ----------
    lst : list
        The list to be checked for nested objects.
    strict : bool, optional
        If True, all items in the list must be nested objects. If False, the function
        returns True if any item is a nested object. Default is False.
    allowed_types : tuple of types, optional
        A tuple of types to consider as nested objects. If None, common nested types
        like list, set, dict, and tuple are checked. Default is None.

    Returns
    -------
    bool
        True if the list contains nested objects according to the given parameters,
        otherwise False.

    Notes
    -----
    A nested object is defined as any item within the list that is not a primitive
    data type (e.g., int, float, str) or is a complex structure like lists, sets,
    dictionaries, etc. The function can be customized to check for specific types
    using the `allowed_types` parameter.

    Examples
    --------
    >>> from geoprior.utils.validator import contains_nested_objects
    >>> example_list1 = [{1, 2}, [3, 4], {'key': 'value'}]
    >>> example_list2 = [1, 2, 3, [4]]
    >>> example_list3 = [1, 2, 3, 4]
    >>> contains_nested_objects(example_list1)
    True  # non-strict, contains nested objects
    >>> contains_nested_objects(example_list1, strict=True)
    True  # strict, all are nested objects
    >>> contains_nested_objects(example_list2)
    True  # non-strict, contains at least one nested object
    >>> contains_nested_objects(example_list2, strict=True)
    False  # strict, not all are nested objects
    >>> contains_nested_objects(example_list3)
    False  # non-strict, no nested objects
    >>> contains_nested_objects(example_list3, strict=True)
    False  # strict, no nested objects
    """
    if allowed_types is None:
        allowed_types = (
            list,
            set,
            dict,
            tuple,
            pd.Series,
            pd.DataFrame,
            np.ndarray,
        )  # Default nested types

    # Function to check if an item is a nested type
    def is_nested(item):
        return isinstance(item, allowed_types)

    if strict:
        # Check if all items are nested objects
        return all(is_nested(item) for item in lst)
    else:
        # Check if any item is a nested object
        return any(is_nested(item) for item in lst)




[docs]
def validate_nan_policy(
    nan_policy, *arrays, sample_weights=None
):
    """
    Validates and applies a specified nan_policy to input arrays and
    optionally to sample weights. This utility is essential for pre-processing
    data prior to statistical analyses or model training, where appropriate
    handling of NaN values is critical to ensure accurate and reliable outcomes.

    Parameters
    ----------
    nan_policy : {'propagate', 'raise', 'omit'}
        Defines how to handle NaNs in the input arrays. 'propagate' returns the
        input data without changes. 'raise' throws an error if NaNs are detected.
        'omit' removes rows with NaNs across all input arrays and sample weights.
    *arrays : array-like
        Variable number of input arrays to be validated and adjusted based on
        the specified nan_policy.
    sample_weights : array-like, optional
        Sample weights array to be validated and adjusted in tandem with the
        input arrays according to nan_policy. Defaults to None.

    Returns
    -------
    arrays : tuple of np.ndarray
        Adjusted input arrays, with modifications applied based on nan_policy.
        The order of arrays in the tuple corresponds to the order of input.
    sample_weights : np.ndarray or None
        Adjusted sample weights, modified according to nan_policy if provided.
        Returns None if no sample_weights were provided.

    Raises
    ------
    ValueError
        If `nan_policy` is not among the valid options ('propagate', 'raise',
        'omit') or if NaNs are detected when `nan_policy` is set to 'raise'.

    Notes
    -----
    Handling NaN values is a critical step in data preprocessing, especially
    in datasets with missing values. The choice of nan_policy can significantly
    impact subsequent statistical analysis or predictive modeling by either
    including, excluding, or signaling errors for observations with missing
    values. This function ensures consistent application of the chosen policy
    across multiple datasets, facilitating robust and error-free analyses.

    Examples
    --------
    >>> import numpy as np
    >>> from geoprior.utils.validator import validate_nan_policy
    >>> y_true = np.array([1, np.nan, 3])
    >>> y_pred = np.array([1, 2, 3])
    >>> sample_weights = np.array([0.5, 0.5, 1.0])
    >>> arrays, sw = validate_nan_policy('omit', y_true, y_pred,
    ...                                  sample_weights=sample_weights)
    >>> arrays
    (array([1., 3.]), array([1., 3.]))
    >>> sw
    array([0.5, 1. ])
    """
    nan_policy = str(nan_policy).lower()
    valid_policies = ["propagate", "raise", "omit"]
    if nan_policy not in valid_policies:
        raise ValueError(
            f"Invalid nan_policy: {nan_policy}. Valid options are {valid_policies}."
        )

    if nan_policy == "omit":
        # Find indices that are not NaN in all arrays
        not_nan_mask = ~np.isnan(np.column_stack(arrays)).any(
            axis=1
        )
        if sample_weights is not None:
            not_nan_mask &= ~np.isnan(sample_weights)

        # Filter out NaNs from all arrays and sample_weights
        arrays = tuple(
            array[not_nan_mask] for array in arrays
        )
        if sample_weights is not None:
            sample_weights = sample_weights[not_nan_mask]

    elif nan_policy == "raise":
        # Check for NaNs in any of the arrays or sample_weights
        if any(np.isnan(array).any() for array in arrays) or (
            sample_weights is not None
            and np.isnan(sample_weights).any()
        ):
            raise ValueError(
                "Input values contain NaNs and nan_policy is 'raise'."
            )

    # Return adjusted arrays and sample_weights
    if sample_weights is not None:
        return (*arrays, sample_weights)

    return arrays




[docs]
def validate_fit_weights(
    y, sample_weight=None, weighted_y=False
):
    """
    Validate and compute sample weights for fitting.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.

    weighted_y : bool, default=False
        If True, compute the weighted target values.

    Returns
    -------
    sample_weight : array-like of shape (n_samples,)
        Validated sample weights.

    weighted_y_values : array-like of shape (n_samples,), optional
        Weighted target values if `weighted_y` is True.

    Raises
    ------
    ValueError
        If `sample_weight` is not None and its length does not match the length of `y`.
        If any value in `sample_weight` is negative.

    Notes
    -----
    This function checks the input sample weights, ensuring they are consistent with
    the target values `y`. If `sample_weight` is None, it returns an array of ones
    indicating equal weighting. Otherwise, it validates and returns the given
    sample weights. If `weighted_y` is True, it also computes and returns the
    weighted target values.

    Examples
    --------
    >>> import numpy as np
    >>> y = np.array([0, 1, 1, 0, 1])
    >>> validate_fit_weights(y)
    array([1., 1., 1., 1., 1.])

    >>> sample_weight = np.array([1, 0.5, 1, 1.5, 1])
    >>> validate_fit_weights(y, sample_weight)
    array([1. , 0.5, 1. , 1.5, 1. ])

    >>> validate_fit_weights(y, sample_weight, weighted_y=True)
    (array([1. , 0.5, 1. , 1.5, 1. ]), array([0. , 0.5, 1. , 0. , 1. ]))

    >>> validate_fit_weights(y, weighted_y=True)
    (array([1., 1., 1., 1., 1.]), array([0., 1., 1., 0., 1.]))
    """
    y = check_array(y, ensure_2d=False)

    if sample_weight is None:
        sample_weight = np.ones_like(y, dtype=float)
    else:
        sample_weight = check_array(
            sample_weight, ensure_2d=False
        )
        check_consistent_length(y, sample_weight)

        if not np.all(sample_weight >= 0):
            raise ValueError(
                "Sample weights must be non-negative"
            )

    if weighted_y:
        weighted_y_values = y * sample_weight
        return sample_weight, weighted_y_values

    return sample_weight




[docs]
def is_valid_policies(nan_policy, allowed_policies=None):
    """
    Validates the `nan_policy` or any policy argument to ensure it is one
    of the acceptable options (`allowed_policies`).

    Function is used to enforce conformity to predefined NaN handling
    strategies in data processing tasks.

    Parameters
    ----------
    nan_policy : str
        The NaN handling policy to validate. Acceptable values are:
        'propagate' - NaN values are propagated, i.e., no action is taken.
        'omit' - NaN values are omitted before proceeding with the operation.
        'raise' - Raises an error if NaN values are present.

    allowed_policies : list of str, optional
        A list of allowable policy options. If None,
        defaults to ['propagate', 'omit', 'raise'].

    Raises
    ------
    ValueError
        If `nan_policy` is not one of the valid options in `allowed_policies`.

    Returns
    -------
    str
        The verified `nan_policy` value, confirming it is within
        allowed parameters.

    Examples
    --------
    >>> from geoprior.utils.validator import is_valid_policies
    >>> is_valid_policies('omit')  # This should pass without an error.
    >>> is_valid_policies('ignore')  # This should raise a ValueError.

    """
    # Set default policies if none provided
    if allowed_policies is None:
        allowed_policies = ["propagate", "omit", "raise"]

    # Ensure allowed_policies is a list even if a single string was provided
    if isinstance(allowed_policies, str):
        allowed_policies = [allowed_policies]

    # Normalize the input policy for comparison
    nan_policy = str(nan_policy).lower().strip()

    # Check if the provided nan_policy is in the list of allowed policies
    if nan_policy not in allowed_policies:
        raise ValueError(
            f"Invalid nan_policy {nan_policy!r}. Choose from {allowed_policies}."
        )

    return nan_policy




[docs]
def validate_multioutput(value, extra=""):
    """
    Validate the `multioutput` parameter value and handle special cases.

    This function checks if the provided `multioutput` value is one of the
    accepted strings ('raw_values', 'uniform_average', 'raise', 'warn'). It
    warns or raises an error based on the value if it's applicable.

    Parameters
    ----------
    value : str
        The value of the `multioutput` parameter to be validated. Accepted
        values are 'raw_values', 'uniform_average', 'raise', 'warn'.
    extra : str, optional
        Additional text to include in the warning or error message if
        `multioutput` is not applicable.

    Returns
    -------
    str
        The validated `multioutput` value in lowercase if it's one of the
        accepted values. If the value is 'warn' or 'raise', the function
        handles the case accordingly without returning a value.

    Raises
    ------
    ValueError
        If `value` is not one of the accepted strings and is not 'raise'.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_multioutput
    >>> validate_multioutput('raw_values')
    'raw_values'

    >>> validate_multioutput('warn', extra=' for Dice Similarity Coefficient')
    # This will warn that multioutput parameter is not applicable for Dice
    # Similarity Coefficient.

    >>> validate_multioutput('raise', extra=' for Gini Coefficient')
    # This will raise a ValueError indicating that multioutput parameter
    # is not applicable for Gini Coefficient.

    >>> validate_multioutput('average')
    # This will raise a ValueError indicating 'average' is an invalid value
    # for multioutput parameter.

    Note
    ----
    The function is designed to ensure API consistency across various metrics
    functions by providing a standard way to handle `multioutput` parameter
    values, especially in contexts where multiple outputs are not applicable.
    """
    valid_values = ["raw_values", "uniform_average"]
    value_lower = str(value).lower()
    if value_lower == "average_uniform":
        value_lower = "uniform_average"
    if value_lower in ["raise", "warn"]:
        warn_msg = (
            "The `multioutput` parameter is not applicable"
            + extra
            + " as it inherently combines outputs into a single score."
        )
        if value_lower == "warn":
            warnings.warn(warn_msg, UserWarning, stacklevel=2)
        elif value_lower == "raise":
            raise ValueError(warn_msg)
    elif value_lower not in valid_values:
        raise ValueError(
            "Invalid value for multioutput parameter. Expect 'raw_values' or "
            f"'uniform_average'. Got '{value}'."
        )

    return value_lower




[docs]
def ensure_non_negative(*arrays, err_msg=None):
    """
    Ensure that provided arrays contain only non-negative values.

    This function checks each provided array for non-negativity. If any negative
    values are found in any array, it raises a ValueError. This check is crucial
    for computations or algorithms where negative values are not permissible, such
    as logarithmic transformations.

    Parameters
    ----------
    *arrays : array-like
        One or more array-like structures (e.g., lists, numpy arrays). Each array
        is checked for non-negativity.
    err_msg: str, optional
        Specify a custom error message if negative values are found.

    Raises
    ------
    ValueError
        If any array contains negative values, a ValueError is raised with a message
        indicating that only non-negative values are expected.

    Examples
    --------
    >>> y_true = [0, 1, 2, 3]
    >>> y_pred = [0.5, 2.1, 3.5, -0.1]
    >>> ensure_non_negative(y_true, y_pred)
    ValueError: Negative value found. Expect only non-negative values.

    Note
    ----
    The function uses a variable number of arguments, allowing flexibility in the number
    of arrays checked in a single call.
    """
    for i, array in enumerate(arrays, start=1):
        if np.any(np.asarray(array) < 0):
            err_msg = err_msg or (
                f"Array at index {i} contains negative values."
                " Expect only non-negative values."
            )
            raise ValueError(err_msg)




[docs]
def check_epsilon(
    eps,
    y_true=None,
    y_pred=None,
    base_epsilon=1e-10,
    scale_factor=1e-5,
):
    """
    Dynamically determine or validate an epsilon value for numerical computations.

    This function either validates a provided epsilon if it is a numeric value, or
    calculates an appropriate epsilon dynamically based on the input data. The dynamic
    calculation aims to adjust epsilon based on the scale of the input data, providing
    flexibility and adaptability in algorithms where numerical stability is critical.

    Parameters
    ----------
    eps : {'auto', float}
        The epsilon value to use. If 'auto', the function dynamically determines an
        appropriate epsilon based on `y_true` and `y_pred`. If a float, it validates
        this as the epsilon value.
    y_true : array-like, optional
        True values array. Used in conjunction with `y_pred` to dynamically determine
        epsilon if `eps` is 'auto'. If `None`, this input is ignored.
    y_pred : array-like, optional
        Predicted values array. Used alongside `y_true` for epsilon determination.
        If `None`, this input is ignored.
    base_epsilon : float, optional
        Base epsilon value used as a starting point in dynamic determination. This
        value is adjusted based on the `scale_factor` and the input data to compute
        the final epsilon.
    scale_factor : float, optional
        Scaling factor applied to adjust the base epsilon in relation to the scale
        of the input data. Helps tailor the epsilon to the problem's numerical scale.

    Returns
    -------
    float
        The determined or validated epsilon value. Ensures numerical operations
        are conducted with an appropriate epsilon to avoid division by zero or
        other numerical instabilities.

    Examples
    --------
    >>> y_true = [1, 2, 3]
    >>> y_pred = [1.1, 1.9, 3.05]
    >>> check_epsilon('auto', y_true, y_pred)
    0.00001  # Example output, actual value depends on `determine_epsilon` implementation.

    >>> check_epsilon(1e-8)
    1e-8

    Notes
    -----
    Using 'auto' for `eps` allows algorithms to adapt to different scales of data,
    enhancing numerical stability without manually tuning the epsilon value.
    """
    from .mathext import determine_epsilon

    # Initialize a list to hold arrays for dynamic epsilon determination
    y_arrays = []

    # Convert inputs to numpy arrays and add to y_arrays if they are not None
    if y_true is not None:
        y_true = np.asarray(y_true, dtype=np.float64)
        y_arrays.append(y_true)
    if y_pred is not None:
        y_pred = np.asarray(y_pred, dtype=np.float64)
        y_arrays.append(y_pred)

    # Ensure y_true and y_pred have consistent lengths if both are provided
    if y_true is not None and y_pred is not None:
        check_consistent_length(y_true, y_pred)

    # If both arrays are provided, concatenate them for epsilon determination
    if len(y_arrays) == 2:
        y_arrays = [np.concatenate(y_arrays)]

    # Dynamically determine epsilon if 'auto', else ensure it's a float
    if str(eps).lower() == "auto" and y_arrays:
        eps = determine_epsilon(
            y_arrays[0],
            base_epsilon=base_epsilon,
            scale_factor=scale_factor,
        )
    else:
        try:
            eps = float(eps)
        except ValueError:
            raise ValueError(
                f"Epsilon must be 'auto' or convertible to float. Got '{eps}'"
            )

    return eps



def _ensure_y_is_valid(y_true, y_pred, **kwargs):
    """
    Validates that the true and predicted target arrays are suitable for further
    processing. This involves ensuring that both arrays are non-empty, of the
    same length, and meet any additional criteria specified by keyword arguments.

    Parameters
    ----------
    y_true : array-like
        The true target values.
    y_pred : array-like
        The predicted target values.
    **kwargs : dict
        Additional keyword arguments to pass to the check_y function for any
        extra validation criteria.

    Returns
    -------
    y_true : array-like
        Validated true target values.
    y_pred : array-like
        Validated predicted target values.

    Raises
    ------
    ValueError
        If the validation checks fail, indicating that the input arrays do not
        meet the required criteria for processing.

    Examples
    --------
    Suppose `check_y` validates that the input is a non-empty numpy array and
    `check_consistent_length` ensures the arrays have the same number of elements.
    Then, usage could be as follows:

    >>> y_true = np.array([1, 2, 3])
    >>> y_pred = np.array([1.1, 2.1, 3.1])
    >>> y_true_valid, y_pred_valid = _ensure_y_is_valid(y_true, y_pred)
    >>> print(y_true_valid, y_pred_valid)
    [1 2 3] [1.1 2.1 3.1]
    """
    # Convert y_true and y_pred to numpy arrays if they are not already
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Ensure individual array validity
    y_true = check_y(y_true, **kwargs)
    y_pred = check_y(y_pred, **kwargs)

    # Check if the arrays have consistent lengths
    check_consistent_length(y_true, y_pred)

    return y_true, y_pred



[docs]
def check_classification_targets(
    *y, target_type="numeric", strategy="auto", verbose=False
):
    """
    Validate that the target arrays are suitable for classification tasks.

    This function is designed to ensure that target arrays (`y`) contain only
    finite, categorical values, and it raises a ValueError if the targets do
    not meet the criteria necessary for classification tasks, such as the
    presence of continuous values, NaNs, or infinite values.

    This validation is crucial for preprocessing steps in machine learning
    pipelines to ensure that the data is appropriate for classification
    algorithms.

    Parameters
    ----------
    *y : array-like
        One or more target arrays to be validated. The input can be in the
        form of lists, numpy arrays, or pandas series. Each array is checked
        individually to ensure it  meets the criteria for classification targets.

    target_type : str, optional
        The expected data type of the target arrays. Supported values are
        'numeric' and 'object'. If 'numeric', the function attempts to
        convert the target arrays to integers, raising an error if conversion
        is not possible due to non-numeric values. If 'object', the target
        arrays are left as numpy arrays of dtype `object`, suitable for
        categorical classification without conversion. Default is 'numeric'.

    strategy : str, optional
        Defines the approach for evaluating if the target arrays are suitable
        for classification based on their unique values and data types. The
        'auto' strategy uses heuristic or automatic detection to decide whether
        target data should be treated as categorical, which is useful for most
        cases. Custom strategies can be defined to enforce specific validation
        rules or preprocessing steps based on the nature of the target data
        (e.g., 'continuous', 'multilabel-indicator', 'unknown'). These custom
        strategies should align with the outcomes of a predefined
        `type_of_target` function, allowing for nuanced handling of different
        target data scenarios. The default value is ``'auto'``, which applies
        general rules for categorization and numeric conversion where applicable.

        If a strategy other than ``'auto'`` is specified, it directly influences
        how the data is validated and potentially converted, based on the
        expected or detected type of target data:

        - If 'continuous', the function checks if the data can be used for
          regression tasks and raises an error for classification use without
          explicit binning.
        - If 'multilabel-indicator', it validates the data for multilabel
          classification tasks and ensures appropriate format.
        - If 'unknown', it attempts to validate the data with generic checks,
          raising errors for any unclear or unsupported data formats.

    verbose : bool, optional
        If set to True, the function prints a message for each target array
        checked, confirming that it is suitable for classification. This
        is helpful for debugging and when validating multiple target arrays
        simultaneously.

    Raises
    ------
    ValueError
        If any of the target arrays contain values unsuitable for classification. This
        includes arrays with continuous values, NaNs, infinite values, or arrays that do
        not represent categorical data properly.

    Examples
    --------
    Using the function with a single array of integer labels:

    >>> from geoprior.utils.validator import check_classification_targets
    >>> y = [1, 2, 3, 2, 1]
    >>> check_classification_targets(y)
    [array([1, 2, 3, 2, 1], dtype=object)]

    Using the function with multiple arrays, including a mix of integer and
    string labels:

    >>> y1 = [0, 1, 0, 1]
    >>> y2 = ["spam", "ham", "spam", "ham"]
    >>> check_classification_targets(y1, y2, verbose=True)
    Targets are suitable for classification.
    Targets are suitable for classification.
    [array([0, 1, 0, 1], dtype=object), array(['spam', 'ham', 'spam', 'ham'], dtype=object)]

    Attempting to use the function with an array containing NaN values:

    >>> y_with_nan = [1, np.nan, 2, 1]
    >>> check_classification_targets(y_with_nan)
    ValueError: Target values contain NaN or infinite numbers, which are not
    suitable for classification.

    Attempting to use the function with a continuous target array:

    >>> y_continuous = np.linspace(0, 1, 10)
    >>> check_classification_targets(y_continuous)
    ValueError: The number of unique values is too high for a classification task.
    Validating and converting a mixed-type target array to numeric:

    >>> y_mixed = [1, '2', 3.0, '4', 5]
    >>> check_classification_targets(y_mixed, target_type='numeric')
    ValueError: Target array at index 0 contains non-numeric values, which
    cannot be converted to integers: ['2', '4']...

    Validating object target arrays without attempting conversion:

    >>> y_str = ["apple", "banana", "cherry"]
    >>> check_classification_targets(y_str, target_type='object')
    [array(['apple', 'banana', 'cherry'], dtype=object)]

    """
    validated_targets = [
        _check_y(target, strategy=strategy) for target in y
    ]

    if target_type == "numeric":
        # Try to convert validated targets to numeric (integer), if possible
        for i, target in enumerate(validated_targets):
            if all(
                isinstance(
                    item,
                    int | float | np.integer | np.floating,
                )
                for item in target
            ):
                try:
                    # Attempt conversion to integer
                    validated_targets[i] = target.astype(
                        np.int64
                    )
                except ValueError as e:
                    raise ValueError(
                        f"Error converting target array at index {i} to integers. "
                        "Ensure all values are numeric and representable as integers. "
                        f"Original error: {e}"
                    )
            else:
                non_numeric = [
                    item
                    for item in target
                    if not isinstance(
                        item,
                        int
                        | float
                        | np.integer
                        | np.floating,
                    )
                ]
                raise ValueError(
                    f"Target array at index {i} contains non-numeric values, "
                    f"which cannot be converted to integers: {non_numeric[:5]}..."
                )
    elif target_type == "object":
        # If target_type is 'object', no conversion is needed
        # The function ensures they are numpy arrays, which might already suffice
        pass
    else:
        # In case an unsupported target_type is provided
        raise ValueError(
            f"Unsupported target_type '{target_type}'. Use 'numeric' or 'object'."
        )

    if verbose:
        print("Targets are suitable for classification.")

    return validated_targets



def _check_y(y, strategy="auto"):
    """
    Validates the target array `y`, ensuring it is suitable for classification
    or regression tasks based on its content and the specified strategy.

    Parameters
    ----------
    y : array-like
        Target array to validate.
    strategy : str, default="auto"
        Strategy used to determine whether ``y`` is categorical or continuous.
        Use ``"auto"`` for automatic detection based on unique values or use
        ``type_of_target`` semantics for more nuanced determination.
    """
    from ..core.utils import type_of_target

    # Convert y to a numpy array of objects to handle mixed types
    y = np.array(y, dtype=object)

    # Check for NaN or infinite values in numeric data
    numeric_types = "biufc"  # Numeric types
    if y.dtype.kind in numeric_types:
        numeric_y = y.astype(
            float, casting="safe"
        )  # Safely cast numeric types to float
        if not np.all(np.isfinite(numeric_y)):
            raise ValueError(
                "Numeric target values contain NaN or infinite numbers,"
                " not suitable for classification."
            )
    else:
        # For non-numeric data, ensure no elements are None or equivalent to np.nan
        if any(el is None or el is np.nan for el in y):
            raise ValueError(
                "Non-numeric target values contain None or NaN,"
                " not suitable for classification."
            )
    unique_values = np.unique(y)
    # Apply specific strategy for determining categorization
    if strategy != "auto":
        # Implement custom logic based on `type_of_target` outcomes
        target_type = type_of_target(y)
        if target_type == "continuous":
            raise ValueError(
                "Continuous data not suitable for classification"
                " without explicit binning."
            )
        elif target_type == "multilabel-indicator":
            raise ValueError(
                "Multilabel-indicator format detected,"
                " requiring different handling."
            )
        elif target_type == "unknown":
            raise ValueError(
                "Unable to determine the target type,"
                " please check the input data."
            )
    else:
        # Auto detection based on unique values count
        if unique_values.shape[0] > np.sqrt(len(y)):
            raise ValueError(
                "Automatic strategy detected too many unique values"
                " for a classification task."
            )

    # Check for non-numeric data convertibility to categorical if not already checked
    if y.dtype.kind not in numeric_types:
        if not all(
            isinstance(val, str | bool | int)
            for val in unique_values
        ):
            raise ValueError(
                "Target values must be categorical, numeric,"
                " or convertible to categories."
            )

    return y



[docs]
def validate_yy(
    y_true,
    y_pred,
    expected_type=None,
    *,
    validation_mode="strict",
    flatten=False,
):
    """
    Validates the shapes and types of actual and predicted target arrays,
    ensuring they are compatible for further analysis or metrics calculation.

    Parameters
    ----------
    y_true : array-like
        True target values.
    y_pred : array-like
        Predicted target values.
    expected_type : str, optional
        The expected sklearn type of the target ('binary', 'multiclass', etc.).
    validation_mode : str, optional
        Validation strictness. Currently, only 'strict' is implemented,
        which requires y_true and y_pred to have the same shape and match the
        expected_type.
    flatten : bool, optional
        If True, both y_true and y_pred are flattened to one-dimensional arrays.

    Raises
    ------
    ValueError
        If y_true and y_pred do not meet the validation criteria.

    Returns
    -------
    tuple
        The validated y_true and y_pred arrays, potentially flattened.
    """
    from ..core.utils import type_of_target

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if str(flatten) == "auto":
        # check whether is two and the second dimension is 1
        if y_pred.ndim == 2 and y_pred.shape[1] == 1:
            y_pred = y_pred.ravel()
        if y_true.ndim == 2 and y_true.shape[1] == 1:
            y_true = y_true.ravel()

    if flatten:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()

    if y_true.ndim != 1 or y_pred.ndim != 1:
        msg = (
            "Both y_true and y_pred must be one-dimensional arrays."
            f" Got {y_true.shape} and {y_pred.shape}. Set ``flatten=True``"
            " to raveling arrays back to one-dimensional."
        )

        raise ValueError(msg)

    check_consistent_length(y_true, y_pred)

    if expected_type is not None:
        actual_type_y_true = type_of_target(y_true)
        actual_type_y_pred = type_of_target(y_pred)
        if validation_mode == "strict" and (
            actual_type_y_true != expected_type
            or actual_type_y_pred != expected_type
        ):
            msg = (
                f"Validation failed in strict mode. Expected type '{expected_type}'"
                f" for both y_true and y_pred, but got '{actual_type_y_true}'"
                f" and '{actual_type_y_pred}' respectively."
            )
            raise ValueError(msg)

    return y_true, y_pred




[docs]
def check_mixed_data_types(data) -> bool:
    """
    Checks if the given data (DataFrame or numpy array) contains both numerical
    and categorical columns.

    Parameters
    ----------
    data : pd.DataFrame or np.ndarray
        The data to check. Can be a pandas DataFrame or a numpy array. If `data`
        is a numpy array, it is temporarily converted to a DataFrame for type
        checking.

    Returns
    -------
    bool
        True if the data contains both numerical and categorical columns, False
        otherwise.

    Examples
    --------
    Using with a pandas DataFrame:

    >>> import numpy as np
    >>> import pandas as pd
    >>> from geoprior.utils.validator import check_mixed_data_types
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
    >>> print(check_mixed_data_types(df))
    True

    Using with a numpy array:

    >>> array = np.array([[1, 'a'], [2, 'b'], [3, 'c']])
    >>> print(check_mixed_data_types(array))
    True

    With data containing only numerical values:

    >>> df_numeric_only = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    >>> print(check_mixed_data_types(df_numeric_only))
    False

    With data containing only categorical values:

    >>> df_categorical_only = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['d', 'e', 'f']})
    >>> print(check_mixed_data_types(df_categorical_only))
    False
    """
    # Convert numpy array to DataFrame if necessary
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)

    # Check for the presence of numerical and categorical data types
    has_numerical = any(
        data.dtypes.apply(
            lambda dtype: np.issubdtype(dtype, np.number)
        )
    )
    has_categorical = any(
        data.dtypes.apply(
            lambda dtype: (
                dtype == "object"
                or dtype.name == "category"
                or dtype == "bool"
            )
        )
    )

    return has_numerical and has_categorical




[docs]
def has_required_attributes(
    model: Any, attributes: list[str]
) -> bool:
    """
    Check if the model has all required Keras-specific attributes.

    This function is part of the deep validation process to ensure that the
    model not only inherits from Keras model classes but also implements
    essential methods.

    Parameters
    ----------
    model : Any
        The model object to inspect.
    attributes : list of str
        A list of strings representing the names of the attributes to check for
        in the model.

    Returns
    -------
    bool
        True if the model contains all specified attributes, False otherwise.
    """
    return all(hasattr(model, attr) for attr in attributes)




[docs]
def validate_dates(
    start_date,
    end_date,
    return_as_date_str=False,
    date_format="%Y-%m-%d",
):
    """
    Validates and parses start and end years/dates, with options for output formatting.

    This function ensures the validity of provided start and end years or dates, checks
    if they fall within a reasonable range, and allows the option to return the validated
    years or dates in a specified string format.

    Parameters
    ----------
    start_date : int, float, or str
        The starting year or date. Can be an integer, float (converted to integer),
        or string in "YYYY" or "YYYY-MM-DD" format.
    end_date : int, float, or str
        The ending year or date, with the same format options as `start_date`.
    return_as_date_str : bool, optional
        If True, returns the start and end dates as strings in the specified format.
        Default is False, returning years as integers.
    date_format : str, optional
        The format string for output dates if `return_as_date_str` is True.
        Default format is "%Y-%m-%d".

    Returns
    -------
    tuple
        A tuple of two elements, either integers (years) or strings (formatted dates),
        representing the validated start and end years or dates.

    Raises
    ------
    ValueError
        If the input years or dates are invalid, out of the acceptable range,
        or if the start year/date does not precede the end year/date.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_dates
    >>> validate_dates(1999, 2001)
    (1999, 2001)

    >>> validate_dates("1999/01/01", "2001/12/31", return_as_date_str=True)
    ('1999-01-01', '2001-12-31')

    >>> validate_dates("1999", "1998")
    ValueError: The start date/time must precede the end date/time.

    >>> validate_years("1899", "2001")
    ValueError: Years must be within the valid range: 1900 to [current year].

    Notes
    -----
    The function supports flexible input formats for years and dates, including
    handling both slash "/" and dash "-" separators in date strings. It enforces
    logical and chronological order between start and end inputs and allows
    customization of the output format for date strings.
    """

    def parse_year_input(year_input):
        if isinstance(year_input, int | float):
            return datetime(int(year_input), 1, 1)
        elif isinstance(year_input, str):
            year_input = year_input.replace("/", "-")
            try:
                return datetime.strptime(
                    year_input, date_format
                )
            except ValueError:
                try:
                    # Fallback to parsing as year only
                    return datetime(int(year_input), 1, 1)
                except TypeError as type_err:
                    raise TypeError(
                        "Expected int, float, or str for"
                        f" year, got {type(year_input)}."
                    ) from type_err
                except ValueError as value_err:
                    raise ValueError(
                        "Check your date data. For datetime value, set `date_format`"
                        " to '%Y-%m-%d %H:%M:%S'"
                    ) from value_err
        raise TypeError(
            f"Invalid input '{year_input}'."
            " Expected format: YYYY or YYYY-MM-DD."
        )

    start_date, end_date = map(
        parse_year_input, [start_date, end_date]
    )

    if start_date >= end_date:
        raise ValueError(
            "Start date/time must be earlier than end date/time."
        )

    if return_as_date_str:
        return start_date.strftime(
            date_format
        ), end_date.strftime(date_format)

    current_year = datetime.now().year
    for year in (start_date.year, end_date.year):
        if not 1900 <= year <= current_year:
            raise ValueError(
                f"Year {year} is out of the valid"
                f" range: 1900 to {current_year}."
            )

    # Additional validation for non-string return format
    if (
        start_date.year == end_date.year
        and start_date != end_date
        and not return_as_date_str
    ):
        raise ValueError(
            "Start and end dates are within the same year but not the same date. "
            "Consider using return_as_date_str=True or providing specific dates."
        )

    return start_date.year, end_date.year




[docs]
def validate_positive_integer(
    value,
    variable_name,
    include_zero=False,
    round_float=None,
    msg=None,
):
    """
    Validates whether the given value is a positive integer or zero based
    on the parameter and rounds float values according to the specified method.

    Parameters
    ----------
    value : int or float
        The value to validate.
    variable_name : str
        The name of the variable for error message purposes.
    include_zero : bool, optional
        If True, zero is considered a valid value. Default is False.
    round_float : str, optional
        If "ceil", rounds up float values; if "floor", rounds down float values;
        if None, truncates float values to the nearest whole number towards zero.
    msg : str, optional
        Error message when checking for proper type failed.

    Returns
    -------
    int
        The validated value converted to an integer.

    Raises
    ------
    ValueError
        If the value is not a positive integer or zero (based on `include_zero`),
        or if the `round_float` parameter is improperly specified.
    """
    import math

    # Determine the minimum acceptable value
    min_value = 0 if include_zero else 1

    if isinstance(value, str):
        # Try to convert it if possible
        try:
            value = int(value)
        except ValueError:
            # Raise a nice informative error message
            raise ValueError(
                f"Value {value} is not convertible to an integer."
            )

    # Check for proper type and round if necessary
    if not isinstance(
        value, int | float | np.integer | np.floating
    ):
        msg = (
            msg
            or f"{variable_name} must be an integer or float. Got {value}"
        )
        raise ValueError(msg)

    if isinstance(value, float):
        if round_float == "ceil":
            value = math.ceil(value)
        elif round_float == "floor":
            value = math.floor(value)
        elif round_float is None:
            value = int(value)
        else:
            raise ValueError(
                f"Invalid rounding method '{round_float}'."
                " Choose 'ceil', 'floor', or None."
            )
    # if isinstance(value, float) and not value.is_integer():
    #     raise ValueError(f"{variable_name} must be a whole number, got {value}.")
    if value < min_value:
        condition = (
            "a non-negative integer"
            if include_zero
            else "a positive integer"
        )
        raise ValueError(
            f"{variable_name} must be {condition}, got {value}."
        )

    return int(value)




[docs]
def validate_and_adjust_ranges(**kwargs):
    """
    Validates and adjusts the provided range tuples to ensure each is
    composed of two numerical values and is sorted in ascending order.

    This function takes multiple range specifications as keyword arguments,
    each expected to be a tuple of two numerical values (min, max). It validates
    the format and contents of each range, adjusting them if necessary to ensure
    that each tuple is ordered as (min, max).

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments where each key is the name of a range (e.g., 'lat_range')
        and its corresponding value is a tuple of two numerical values representing
        the minimum and maximum of that range.

    Returns
    -------
    dict
        A dictionary with the same keys as the input, but with each tuple value
        adjusted to ensure it is in the format (min, max).

    Raises
    ------
    ValueError
        If any provided range tuple does not contain exactly two values, contains
        non-numerical values, or if the min value is not less than the max value.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_and_adjust_ranges
    >>> validate_and_adjust_ranges(lat_range=(34.00, 36.00), lon_range=(-118.50, -117.00))
    {'lat_range': (34.00, 36.00), 'lon_range': (-118.50, -117.00)}

    >>> validate_and_adjust_ranges(time_range=(10.0, 0.01))
    {'time_range': (0.01, 10.0)}

    >>> validate_and_adjust_ranges(invalid_range=(1, 'a'))
    ValueError: invalid_range must contain numerical values.

    Notes
    -----
    This function is particularly useful for preprocessing input ranges for
    various analyses, ensuring consistency and correctness of range specifications.
    It automates the adjustment of provided ranges, simplifying the setup process
    for further data processing or modeling tasks.
    """
    adjusted_ranges = {}

    for range_name, range_tuple in kwargs.items():
        if (
            not isinstance(range_tuple, tuple)
            or len(range_tuple) != 2
        ):
            raise ValueError(
                f"{range_name} must be a tuple of two values."
            )

        if not all(
            isinstance(value, int | float)
            for value in range_tuple
        ):
            raise ValueError(
                f"{range_name} must contain numerical values."
            )

        # Ensure the range is in (min, max) format
        min_value, max_value = sorted(range_tuple)
        adjusted_ranges[range_name] = (min_value, max_value)

    return adjusted_ranges



def recheck_data_types(
    data: pd.DataFrame | pd.Series | list | dict,
    coerce_numeric: bool = True,
    coerce_datetime: bool = True,
    column_prefix: str = "col",
    return_as_numpy: bool | str = "auto",
) -> pd.DataFrame | pd.Series | np.ndarray:
    """
    Rechecks and coerces column data types in a DataFrame to the most appropriate
    numeric or datetime types if initially identified as objects. It can also handle
    non-DataFrame inputs by attempting to construct a DataFrame before processing.

    Parameters
    ----------
    data : pd.DataFrame, pd.Series, list, or dict
        The data to process. If not a DataFrame, an attempt will be made to convert it.
    coerce_numeric : bool, default=True
        If True, tries to convert object columns to numeric data types.
    coerce_datetime : bool, default=True
        If True, tries to convert object columns to datetime data types.
    column_prefix : str, default="col"
        Prefix for column names when constructing a DataFrame from non-DataFrame input.
    return_as_numpy : bool or str, default="auto"
        If True or "auto", converts the DataFrame to a NumPy array upon returning.
        If "auto", the output type matches the input type.

    Returns
    -------
    Union[pd.DataFrame, np.ndarray]
        The processed data, either as a DataFrame or a NumPy array.

    Examples
    --------
    >>> data = {'a': ['1', '2', '3'], 'b': ['2021-01-01', '2021-02-01', 'not a date'],
                'c': ['1.1', '2.2', '3.3']}
    >>> df = pd.DataFrame(data)
    >>> df = recheck_data_types(df)
    >>> print(df.dtypes)
    a             int64
    b            object  # remains object due to mixed valid and invalid dates
    c           float64
    """
    return_as_numpy = parameter_validator(
        "return_as_numpy", target_strs={"auto", True, False}
    )(return_as_numpy)
    is_frame = True
    if not isinstance(data, pd.DataFrame):
        is_frame = False
        try:
            data = pd.DataFrame(
                data,
                columns=[
                    column_prefix + str(i)
                    for i in range(len(data))
                ],
            )
        except Exception as e:
            raise ValueError(
                "Failed to construct a DataFrame from the provided data. "
                "Ensure that your input data is structured correctly, such as "
                "a list of lists or a dictionary with equal-length lists. "
                "Alternatively, provide a DataFrame directly."
            ) from e

    for column in data.columns:
        if data[column].dtype == "object":
            if coerce_datetime:
                try:
                    data[column] = pd.to_datetime(
                        data[column]
                    )
                    continue  # Skip further processing if datetime conversion is successful
                except (TypeError, ValueError):
                    pass  # Continue if datetime conversion fails

            if coerce_numeric:
                try:
                    data[column] = pd.to_numeric(data[column])
                except ValueError:
                    pass  # Keep as object if conversion fails

    if return_as_numpy == "auto" and not is_frame:
        return_as_numpy = True  # Automatically determine if output should be a NumPy array

    if (
        return_as_numpy is True
    ):  # Explicitly set to True since "auto" is True
        return data.to_numpy()

    return data



[docs]
def is_installed(module: str) -> bool:
    """
    Checks if TensorFlow is installed.

    This function attempts to find the TensorFlow package specification without
    importing the package. It's a lightweight method to verify the presence of
    TensorFlow in the environment.

    Returns
    -------
    bool
        True if TensorFlow is installed, False otherwise.

    Examples
    --------
    >>> from geoprior.utils.validator import is_installed
    >>> print(is_installed("tensorflow"))
    True  # Output will be True if TensorFlow is installed, False otherwise.
    """
    import importlib.util

    module_spec = importlib.util.find_spec(module)
    return module_spec is not None




[docs]
def is_time_series(data, time_col, check_time_interval=False):
    """
    Check if the provided DataFrame is time series data.

    Parameters
    ----------
    data : pandas.DataFrame
        The DataFrame to be checked.
    time_col : str
        The name of the column in `df` expected to represent time.

    Returns
    -------
    bool
        True if `df` is a time series, False otherwise.

    Example
    -------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
        'Date': ['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', '2021-01-05'],
        'Value': [1, 2, 3, 4, 5]
    })
    >>> # Should return True if Date column
    >>> # can be converted to datetime
    >>> print(is_time_series(df, 'Date'))

    """
    if time_col not in data.columns:
        print(
            f"Time column '{time_col}' not found in DataFrame."
        )
        return False

    # Check if the column is datetime type or can be converted to datetime
    if not pd.api.types.is_datetime64_any_dtype(
        data[time_col]
    ):
        try:
            pd.to_datetime(data[time_col])
        except ValueError:
            print(
                f"Column '{time_col}' does not contain datetime objects."
            )
            return False

    if check_time_interval:
        # Optional: Check for regular intervals (commented out by default)
        intervals = (
            pd.to_datetime(data[time_col]).diff().dropna()
        )
        if not intervals.nunique() == 1:
            print("Time intervals are not regular.")
            return False

    return True




[docs]
def check_is_fitted2(estimator, attributes, *, msg=None):
    """
    Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by looking for attributes set during fitting.
    Typically, these attributes end with an underscore ('_').

    Parameters
    ----------
    estimator : BaseEstimator
        An instance of a scikit-learn estimator.

    attributes : str or list of str
        The attributes to check for. These are typically set in the 'fit' method.

    msg : str, optional
        The message to raise in the NotFittedError. If not provided, a default
        message is used.

    Raises
    ------
    NotFittedError
        If the given attributes are not found in the estimator.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> clf = RandomForestClassifier()
    >>> check_is_fitted(clf, ['feature_importances_'])
    NotFittedError: This RandomForestClassifier instance is not fitted yet.
    """
    from ..exceptions import NotFittedError

    if not hasattr(estimator, "fit"):
        raise TypeError(
            f"{estimator} is not an estimator instance."
        )

    if not isinstance(attributes, list | tuple):
        attributes = [attributes]

    fitted = all(
        [hasattr(estimator, attr) for attr in attributes]
    )

    if not fitted:
        if msg is None:
            cls_name = estimator.__class__.__name__
            msg = (
                f"This {cls_name} instance is not fitted yet. Call 'fit' with appropriate "
                "arguments before using this estimator."
            )

        raise NotFittedError(msg)




[docs]
def assert_xy_in(
    x,
    y,
    *,
    data=None,
    asarray=True,
    to_frame=False,
    columns=None,
    xy_numeric=False,
    ignore=None,
    **kws,
):
    """
    Assert the name of x and y in the given data.

    Check whether string arguments passed to x and y are valid in the data,
    then retrieve the x and y array values.

    Parameters
    -----------
    x, y : Arraylike 1d or str, str
       One dimensional arrays. In principle if data is supplied, they must
       constitute series.  If `x` and `y` are given as string values, the
       `data` must be supplied. x and y names must be included in the
       dataframe otherwise an error raises.

    data: pd.DataFrame,
       Data containing x and y names. Need to be supplied when x and y
       are given as string names.
    asarray: bool, default =True
       Returns x and y as array rather than series.
    to_frame: bool, default=False,
       Convert data to a dataframe using either the columns names or
       the input_names when the keyword parameter ``force=True``.
    columns: list of str, Optional
       Name of columns to transform the array ( ``data``) to a dataframe.
    xy_numeric:bool, default=False
       Convert x and y to numeric values.
    ignore: str, optional
       It should be 'x' or 'y'. If set the array is ignored and not asserted.

    kws: dict,
       Keyword arguments passed to :func:`~.array_to_frame`.


    Returns
    --------
    x, y : Arraylike
       One dimensional array or pd.Series

    Examples
    ---------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from geoprior.utils.validator import assert_xy_in
    >>> x, y = np.random.rand(7 ), np.arange (7 )
    >>> data = pd.DataFrame ({'x': x, 'y':y} )
    >>> assert_xy_in (x='x', y='y', data = data )
    (array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
            0.15599452, 0.05808361]),
     array([0, 1, 2, 3, 4, 5, 6]))
    >>> assert_xy_in (x=x, y=y)
    (array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
            0.15599452, 0.05808361]),
     array([0, 1, 2, 3, 4, 5, 6]))
    >>> assert_xy_in (x=x, y=data.y) # y is a series
    (array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
            0.15599452, 0.05808361]),
     array([0, 1, 2, 3, 4, 5, 6]))
    >>> assert_xy_in (x=x, y=data.y, asarray =False ) # return y like it was
    (array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
            0.15599452, 0.05808361]),
    0    0
    1    1
    2    2
    3    3
    4    4
    5    5
    6    6
    Name: y, dtype: int32)
    """
    from ..core.checks import exist_features

    if to_frame:
        data = array_to_frame(
            data,
            to_frame=True,
            input_name="Data",
            columns=columns,
            **kws,
        )
    if data is not None:
        if not hasattr(data, "__array__") and not hasattr(
            data, "columns"
        ):
            raise TypeError(
                f"Expect a dataframe. Got {type(data).__name__!r}"
            )

    if (
        isinstance(x, str) or isinstance(y, str)
    ) and data is None:
        raise TypeError(
            "Data cannot be None when x and y have string"
            " arguments."
        )
    if (x is None or y is None) and data is None:
        raise TypeError(
            "Missing x and y. NoneType not supported."
        )

    if isinstance(x, str):
        exist_features(data, x)
        x = data[x]
    if isinstance(y, str):
        exist_features(data, y)
        y = data[y]

    if hasattr(x, "__len__") and not hasattr(x, "__array__"):
        x = np.array(x)
    if hasattr(y, "__len__") and not hasattr(y, "__array__"):
        y = np.array(y)

    _validate_input(ignore, x, y, _is_arraylike_1d)

    check_consistent_length(x, y)

    if xy_numeric:
        if not _is_numeric_dtype(
            x, to_array=True
        ) or not _is_numeric_dtype(y, to_array=True):
            raise ValueError(
                "x and y must be a numeric array."
            )

        x = x.astype(np.float64)
        y = y.astype(np.float64)

    return (np.array(x), np.array(y)) if asarray else (x, y)



def _validate_input(ignore: str, x, y, _is_arraylike_1d):
    """
    Validates that x and y are one-dimensional array-like structures based
    on the ignore parameter.

    Parameters
    ----------
    ignore : str
        Specifies which variable ('x' or 'y') to ignore during validation.
    x, y : array-like
        The variables to be validated.
    _is_arraylike_1d : function
        Function to check if the input is array-like and one-dimensional.

    Raises
    ------
    ValueError
        If the non-ignored variable(s) are not one-dimensional array-like structures.
    """
    validation_checks = {
        "x": lambda: _is_arraylike_1d(y),
        "y": lambda: _is_arraylike_1d(x),
        "both": lambda: (
            _is_arraylike_1d(x) and _is_arraylike_1d(y)
        ),
    }

    check = validation_checks.get(
        ignore, validation_checks["both"]
    )
    if not check():
        if ignore in ["x", "y"]:
            raise ValueError(
                f"Expected '{'y' if ignore == 'x' else 'x'}' to be"
                " a one-dimensional array-like structure."
            )
        else:
            raise ValueError(
                "Expected both 'x' and 'y' to be one-dimensional "
                "array-like structures."
            )



[docs]
def validate_numeric(
    value,
    convert_to="float",
    allow_negative=True,
    min_value=None,
    max_value=None,
    check_mode="soft",
):
    """
    Validates if a given value is numeric. It can accept numeric strings
    and numpy arrays of single values. Optionally converts the value to
    either float or integer.

    Parameters
    ----------
    value : Any
        The value to be validated as numeric. This can be of any type
        but is expected to be convertible to a numeric type. Accepted
        types include numeric strings (e.g., ``"42"``), single-element
        numpy arrays (e.g., `np.array([3.14])`), integers, and floats.
    convert_to : str, optional
        Type to convert the validated numeric value to. Use ``"float"`` for
        floating-point output or ``"int"`` for integer output. Defaults to
        ``"float"``.
    allow_negative : bool, optional
        Whether to allow negative values. If ``False``, negative values raise
        a ``ValueError``. Defaults to ``True``.
    min_value : float or int, optional
        The minimum value allowed. If ``None``, no minimum value check
        is applied. Defaults to ``None``.
    max_value : float or int, optional
        The maximum value allowed. If ``None``, no maximum value check
        is applied. Defaults to ``None``.
    check_mode : str, optional
        Validation mode. Use ``"soft"`` to accept single-element iterables and
        validate their single value, or ``"strict"`` to accept only non-iterable
        numeric inputs. Defaults to ``"soft"``.

    Returns
    -------
    float or int
        The validated and optionally converted numeric value. The type
        of the return value is determined by the `convert_to` parameter.

    Raises
    ------
    ValueError
        If the value is not numeric or does not meet the specified criteria.

    Notes
    -----
    The function can coerce single-element NumPy arrays, numeric strings, and,
    in ``soft`` mode, single-element iterables before validating the result.
    The validated value is then converted to ``float`` or ``int`` and checked
    against the sign and range constraints. Array coercion details are
    documented in :cite:t:`NumPyDocs`.

    Examples
    --------
    >>> from geoprior.utils.validator import validate_numeric
    >>> validate_numeric("42", convert_to='int')
    42
    >>> validate_numeric(np.array([3.14]), convert_to='float')
    3.14
    >>> validate_numeric([123], check_mode='soft')
    123.0
    >>> validate_numeric([123], check_mode='strict')
    Traceback (most recent call last):
        ...
    ValueError: Value '[123]' is not a numeric type.
    >>> validate_numeric("-123.45", allow_negative=False)
    Traceback (most recent call last):
        ...
    ValueError: Negative values are not allowed: -123.45

    See Also
    --------
    numpy.array : Numpy arrays, which can be validated by this function.
    """
    # Check if the value is a numpy array with a single element
    if isinstance(value, np.ndarray):
        if value.size != 1:
            raise ValueError(
                "Numpy array must contain exactly one element."
            )
        value = value.item()

    # If check_mode is 'soft', handle single-element iterables
    if (
        check_mode == "soft"
        and isinstance(value, list | tuple | set)
        and len(value) == 1
    ):
        value = next(iter(value))

    # Check if the value is a numeric string
    if isinstance(value, str):
        try:
            value = float(value)
        except ValueError:
            raise ValueError(
                f"Value '{value}' is not a valid numeric string."
            )

    # Check if the value is numeric
    if not isinstance(value, int | float):
        raise ValueError(
            f"Value '{value}' is not a numeric type."
        )

    # Convert the value to the desired type
    if convert_to == "int":
        value = int(value)
    else:
        value = float(value)

    # Check if negative values are allowed
    if not allow_negative and value < 0:
        raise ValueError(
            f"Negative values are not allowed: {value}"
        )

    # Check if the value is within the specified range
    if min_value is not None and value < min_value:
        raise ValueError(
            f"Value {value} is less than the minimum allowed value {min_value}."
        )
    if max_value is not None and value > max_value:
        raise ValueError(
            f"Value {value} is greater than the maximum allowed value {max_value}."
        )

    return value



def is_array_like(obj, numpy_check=False):
    """
    Check if an object is array-like (
        e.g., a list, tuple, numpy array, pandas Series).

    Parameters
    ----------
    obj : object
        The object to check.

    numpy_check : bool, optional, default=False
        If True, checks for numpy array-like objects (
            including numpy.ndarray, np.generic, list, and tuple).
        If False, checks for iterable objects (excluding strings).

    Returns
    -------
    bool
        True if the object is array-like, False otherwise.
    """
    if numpy_check:
        # Check for numpy array-like objects (ndarray, np.generic, list, tuple)
        return isinstance(
            obj, np.ndarray | list | tuple | np.generic
        )

    # Check for iterable objects, excluding strings
    return isinstance(obj, Iterable) and not isinstance(
        obj, str
    )


def _is_numeric_dtype(o, to_array=False):
    """Determine whether the argument has a numeric datatype, when
    converted to a NumPy array.

    Booleans, unsigned integers, signed integers, floats and complex
    numbers are the kinds of numeric datatype.

    :param o: object, arraylike
        Object presumed to be an array
    :param to_array: bool, default=False
        If `o` is passed as non-array like list or tuple or other iterable
        object. Setting `to_array` to ``True`` will convert `o` to array.
    :return: bool,
        ``True`` if `o` has a numeric dtype and ``False`` otherwise.
    """
    _NUMERIC_KINDS = set("buifc")
    if not hasattr(o, "__iter__"):
        raise TypeError(
            "'o' is expected to be an iterable object."
            f" got: {type(o).__name__!r}"
        )
    if to_array:
        o = np.array(o)
    if not hasattr(o, "__array__"):
        raise ValueError(
            f"Expect type array, got: {type(o).__name__!r}"
        )
    # use NUMERICKIND rather than # pd.api.types.is_numeric_dtype(arr)
    # for series and dataframes
    return (
        o.values.dtype.kind
        if (hasattr(o, "columns") or hasattr(o, "name"))
        else o.dtype.kind
    ) in _NUMERIC_KINDS


def _check_consistency_size(ar1, ar2, error="raise"):
    """Check consistency of two arrays and raises error if both sizes
    are differents.
    Returns 'False' if sizes are not consistent and error is set to 'ignore'.
    """
    if error == "raise":
        msg = "Array sizes must be consistent: '{}' and '{}' were given."
        assert len(ar1) == len(ar2), msg.format(
            len(ar1), len(ar2)
        )

    return len(ar1) == len(ar2)



[docs]
def check_consistency_size(*arrays):
    """Check consistency of array and raises error otherwise."""
    lengths = [len(X) for X in arrays if X is not None]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError(
            "Found input variables with inconsistent numbers of samples: %r"
            % [int(l) for l in lengths]
        )



def _is_buildin(o, mode="soft"):
    """Returns 'True' wether the module is a Python buidling function.

    If  `mode` is ``strict`` only assert the specific predifined-functions
    like 'str', 'len' etc, otherwise check in the whole predifined functions
    including the object with type equals to 'module'

    :param o: object
        Any object for verification
    :param mode: str , default='soft'
        mode for asserting object. Can also be 'strict' for the specific
        predifined build-in functions.
    :param module:
    """
    assert mode in {"strict", "soft"}, (
        f"Unsupports mode {mode!r}, expects 'strict'or 'soft'"
    )

    return (
        (
            isinstance(o, types.BuiltinFunctionType)
            and inspect.isbuiltin(o)
        )
        if mode == "strict"
        else type(o).__module__ == "builtins"
    )



[docs]
def get_estimator_name(estimator):
    """Get the estimator name whatever it is an instanciated object or not

    :param estimator: callable or instanciated object,
        callable or instance object that has a fit method.

    :return: str,
        name of the estimator.
    """
    if isinstance(estimator, str):
        return estimator

    name = " "
    if hasattr(estimator, "__qualname__") and hasattr(
        estimator, "__name__"
    ):
        name = estimator.__name__
    elif hasattr(estimator, "__class__") and not hasattr(
        estimator, "__name__"
    ):
        name = estimator.__class__.__name__
    return name



def _is_cross_validated(estimator):
    """Check whether the estimator has already passed the cross validation
     procedure.

    We assume it has the attributes 'best_params_' and 'best_estimator_'
    already populated.

    :param estimator: callable or instanciated object, that has a fit method.
    :return: bool,
        estimator has already passed the cross-validation procedure.

    """
    return hasattr(estimator, "best_estimator_") and hasattr(
        estimator, "best_params_"
    )


def _check_array_in(obj, arr_name):
    """Returns the array from the array name attribute. Note that the singleton
    array is not admitted.

    This helper function tries to return array from object attribute  where
    object attribute is the array name if exists. Otherwise raises an error.

    Parameters
    ----------
    obj : object
       Object that is expected to contain the array attribute.
    Returns
    -------
    X : array
       Array fetched from its name in `obj`.
    """

    type_ = type(obj)
    try:
        type_name = f"{obj.__module__}.{obj.__qualname__}"
        o_ = f" in {obj.__name__!r}"
    except AttributeError:
        type_name = type_.__qualname__
        o_ = ""

    message = (
        f"Unable to find the name {arr_name!r}"
        f"{o_} from {type_name!r}"
    )

    if not hasattr(obj, arr_name):
        raise TypeError(message)

    X = getattr(obj, f"{arr_name}")

    if not hasattr(X, "__len__") and not hasattr(X, "shape"):
        if not hasattr(X, "__array__"):
            raise TypeError(message)
        # Only convert X to a numpy array if there is no cheaper, heuristic
        # option.
        X = np.asarray(X)

    if hasattr(X, "shape"):
        if (
            not hasattr(X.shape, "__len__")
            or len(X.shape) <= 1
        ):
            warnings.warn(
                f"A singleton array {X!r} cannot be considered a valid collection.",
                stacklevel=2,
            )
            message += f" with shape {X.shape}"
            raise TypeError(message)

    return X


def _deprecate_positional_args(func=None, *, version="1.3"):
    """Decorator for methods that issues warnings for positional arguments.

    Using the keyword-only argument syntax in pep 3102, arguments after the
    * will issue a warning when passed as a positional argument.

    Parameters
    ----------
    func : callable, default=None
        Function to check arguments on.
    version : callable, default="1.3"
        The version when positional arguments will result in error.
    """

    def _inner_deprecate_positional_args(f):
        sig = signature(f)
        kwonly_args = []
        all_args = []

        for name, param in sig.parameters.items():
            if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
                all_args.append(name)
            elif param.kind == Parameter.KEYWORD_ONLY:
                kwonly_args.append(name)

        @wraps(f)
        def inner_f(*args, **kwargs):
            extra_args = len(args) - len(all_args)
            if extra_args <= 0:
                return f(*args, **kwargs)

            # extra_args > 0
            args_msg = [
                f"{name}={arg}"
                for name, arg in zip(
                    kwonly_args[:extra_args],
                    args[-extra_args:],
                    strict=False,
                )
            ]
            args_msg = ", ".join(args_msg)
            warnings.warn(
                f"Pass {args_msg} as keyword args. From version "
                f"{version} passing these as positional arguments "
                "will result in an error",
                FutureWarning,
                stacklevel=2,
            )
            kwargs.update(
                zip(sig.parameters, args, strict=False)
            )
            return f(**kwargs)

        return inner_f

    if func is not None:
        return _inner_deprecate_positional_args(func)

    return _inner_deprecate_positional_args



[docs]
def to_dtype_str(arr, return_values=False):
    """Convert numeric or object dtype to string dtype.

    This will avoid a particular TypeError when an array is filled by np.nan
    and at the same time contains string values.
    Converting the array to dtype str rather than keeping to 'object'
    will pass this error.

    :param arr: array-like
        array with all numpy datatype or pandas dtypes
    :param return_values: bool, default=False
        returns array values in string dtype. This might be usefull when a
        series with dtype equals to object or numeric is passed.
    :returns: array-like
        array-like with dtype str
        Note that if the dataframe or serie is passed, the object datatype
        will change only if `return_values` is set to ``True``, otherwise
        returns the same object.

    """
    if not hasattr(arr, "__array__"):
        raise TypeError(
            f"Expects an array, got: {type(arr).__name__!r}"
        )
    if return_values:
        if hasattr(arr, "name") or hasattr(arr, "columns"):
            arr = arr.values
    return arr.astype(str)



def _is_arraylike_1d(x):
    """Returns whether the input is arraylike one dimensional and not a scalar"""
    if not hasattr(x, "__array__"):
        raise TypeError(
            "Expects a one-dimensional array, "
            f"got: {type(x).__name__!r}"
        )
    _is_arraylike_not_scalar(x)
    return _is_arraylike_not_scalar(x) and (
        len(x.shape) < 2
        or (len(x.shape) == 2 and x.shape[1] == 1)
    )


def _is_arraylike(x):
    """Returns whether the input is array-like."""
    return (
        hasattr(x, "__len__")
        or hasattr(x, "shape")
        or hasattr(x, "__array__")
    )


def _is_arraylike_not_scalar(array):
    """Return True if array is array-like and not a scalar"""
    return _is_arraylike(array) and not np.isscalar(array)


def _num_features(X):
    """Return the number of features in an array-like X.

    This helper function tries hard to avoid to materialize an array version
    of X unless necessary. For instance, if X is a list of lists,
    this function will return the length of the first element, assuming
    that subsequent elements are all lists of the same length without
    checking.

    Parameters
    ----------
    X : array-like
        array-like to get the number of features.

    Returns
    -------
    features : int
        Number of features
    """
    type_ = type(X)
    if type_.__module__ == "builtins":
        type_name = type_.__qualname__
    else:
        type_name = f"{type_.__module__}.{type_.__qualname__}"
    message = f"Unable to find the number of features from X of type {type_name}"
    if not hasattr(X, "__len__") and not hasattr(X, "shape"):
        if not hasattr(X, "__array__"):
            raise TypeError(message)
        # Only convert X to a numpy array if there is no cheaper, heuristic
        # option.
        X = np.asarray(X)

    if hasattr(X, "shape"):
        if (
            not hasattr(X.shape, "__len__")
            or len(X.shape) <= 1
        ):
            message += f" with shape {X.shape}"
            raise TypeError(message)
        return X.shape[1]

    first_sample = X[0]

    # Do not consider an array-like of strings or dicts to be a 2D array
    if isinstance(first_sample, str | bytes | dict):
        message += f" where the samples are of type {type(first_sample).__qualname__}"
        raise TypeError(message)

    try:
        # If X is a list of lists, for instance, we assume that all nested
        # lists have the same length without checking or converting to
        # a numpy array to keep this function call as cheap as possible.
        return len(first_sample)
    except Exception as err:
        raise TypeError(message) from err


def _num_samples(x):
    """Return number of samples in array-like x."""
    message = (
        f"Expected sequence or array-like, got {type(x)}"
    )
    if hasattr(x, "fit") and callable(x.fit):
        # Don't get num_samples from an ensembles length!
        raise TypeError(message)

    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
        if hasattr(x, "__array__"):
            x = np.asarray(x)
        else:
            raise TypeError(message)

    if hasattr(x, "shape") and x.shape is not None:
        if len(x.shape) == 0:
            raise TypeError(
                f"Singleton array {x!r} cannot be considered a valid collection."
            )
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        if isinstance(x.shape[0], numbers.Integral):
            return x.shape[0]

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error



[docs]
def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface
        - If string, the location where to create the `joblib.Memory` interface.
        - If None, no caching is done and the Memory object is completely transparent.
    Returns
    -------
    memory : object with the joblib.Memory interface
        A correct joblib.Memory object.
    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """
    if memory is None or isinstance(memory, str):
        memory = joblib.Memory(location=memory, verbose=0)
    elif not hasattr(memory, "cache"):
        raise ValueError(
            "'memory' should be None, a string or have the same"
            " interface as joblib.Memory."
            f" Got memory='{memory}' instead."
        )
    return memory




[docs]
def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.
    """

    lengths = [
        _num_samples(X) for X in arrays if X is not None
    ]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError(
            "Found input variables with inconsistent numbers of samples: %r"
            % [int(l) for l in lengths]
        )




[docs]
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.
    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError(
        f"{seed!r} cannot be used to seed a numpy.random.RandomState instance"
    )




[docs]
def has_fit_parameter(estimator, parameter):
    """Check whether the estimator's fit method supports the given parameter.

    Parameters
    ----------
    estimator : object
        An estimator to inspect.
    parameter : str
        The searched parameter.

    Returns
    -------
    is_parameter : bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.utils.validation import has_fit_parameter
    >>> has_fit_parameter(SVC(), "sample_weight")
    True
    """
    return parameter in signature(estimator.fit).parameters




[docs]
def check_symmetric(
    array,
    *,
    tol=1e-10,
    raise_warning=True,
    raise_exception=False,
):
    """Make sure that array is 2D, square and symmetric.

    If the array is not symmetric, then a symmetrized version is returned.
    Optionally, a warning or exception is raised if the matrix is not
    symmetric.

    Parameters
    ----------
    array : {ndarray, sparse matrix}
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.
    tol : float, default=1e-10
        Absolute tolerance for equivalence of arrays. Default = 1E-10.
    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.
    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.
    Returns
    -------
    array_sym : {ndarray, sparse matrix}
        Symmetrized version of the input array, i.e. the average of array
        and array.transpose(). If sparse, then duplicate entries are first
        summed and zeros are eliminated.
    """
    if (array.ndim != 2) or (
        array.shape[0] != array.shape[1]
    ):
        raise ValueError(
            f"array must be 2-dimensional and square. shape = {array.shape}"
        )

    if sp.issparse(array):
        diff = array - array.T
        # only csr, csc, and coo have `data` attribute
        if diff.format not in ["csr", "csc", "coo"]:
            diff = diff.tocsr()
        symmetric = np.all(abs(diff.data) < tol)
    else:
        symmetric = np.allclose(array, array.T, atol=tol)

    if not symmetric:
        if raise_exception:
            raise ValueError("Array must be symmetric")
        if raise_warning:
            warnings.warn(
                "Array is not symmetric, and will be converted "
                "to symmetric by average with its transpose.",
                stacklevel=2,
            )
        if sp.issparse(array):
            conversion = "to" + array.format
            array = getattr(
                0.5 * (array + array.T), conversion
            )()
        else:
            array = 0.5 * (array + array.T)

    return array




[docs]
def check_scalar(
    x,
    name,
    target_type,
    *,
    min_val=None,
    max_val=None,
    include_boundaries="both",
):
    """Validate scalar parameters type and value.

    Parameters
    ----------
    x : object
        The scalar parameter to validate.
    name : str
        The name of the parameter to be printed in error messages.
    target_type : type or tuple
        Acceptable data types for the parameter.
    min_val : float or int, default=None
        The minimum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have a lower bound.
    max_val : float or int, default=None
        The maximum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have an upper bound.
    include_boundaries : {"left", "right", "both", "neither"}, default="both"
        Whether the interval defined by `min_val` and `max_val` should include
        the boundaries. Use ``"left"`` for ``[min_val, max_val)``,
        ``"right"`` for ``(min_val, max_val]``, ``"both"`` for
        ``[min_val, max_val]``, or ``"neither"`` for
        ``(min_val, max_val)``.
    Returns
    -------
    x : numbers.Number
        The validated number.
    Raises
    ------
    TypeError
        If the parameter's type does not match the desired type.
    ValueError
        If the parameter's value violates the given bounds.
        If `min_val`, `max_val` and `include_boundaries` are inconsistent.
    """

    def type_name(t):
        """Convert type into humman readable string."""
        module = t.__module__
        qualname = t.__qualname__
        if module == "builtins":
            return qualname
        elif t == numbers.Real:
            return "float"
        elif t == numbers.Integral:
            return "int"
        return f"{module}.{qualname}"

    if not isinstance(x, target_type):
        if isinstance(target_type, tuple):
            types_str = ", ".join(
                type_name(t) for t in target_type
            )
            target_type_str = f"{{{types_str}}}"
        else:
            target_type_str = type_name(target_type)

        raise TypeError(
            f"{name} must be an instance of {target_type_str}, not"
            f" {type(x).__qualname__}."
        )

    expected_include_boundaries = (
        "left",
        "right",
        "both",
        "neither",
    )
    if include_boundaries not in expected_include_boundaries:
        raise ValueError(
            f"Unknown value for `include_boundaries`: {repr(include_boundaries)}. "
            f"Possible values are: {expected_include_boundaries}."
        )

    if max_val is None and include_boundaries == "right":
        raise ValueError(
            "`include_boundaries`='right' without specifying explicitly `max_val` "
            "is inconsistent."
        )

    if min_val is None and include_boundaries == "left":
        raise ValueError(
            "`include_boundaries`='left' without specifying explicitly `min_val` "
            "is inconsistent."
        )

    comparison_operator = (
        operator.lt
        if include_boundaries in ("left", "both")
        else operator.le
    )
    if min_val is not None and comparison_operator(
        x, min_val
    ):
        raise ValueError(
            f"{name} == {x}, must be"
            f" {'>=' if include_boundaries in ('left', 'both') else '>'} {min_val}."
        )

    comparison_operator = (
        operator.gt
        if include_boundaries in ("right", "both")
        else operator.ge
    )
    if max_val is not None and comparison_operator(
        x, max_val
    ):
        raise ValueError(
            f"{name} == {x}, must be"
            f" {'<=' if include_boundaries in ('right', 'both') else '<'} {max_val}."
        )

    return x



def _get_feature_names(X):
    """Get feature names from X.

    Support for other array containers should place its implementation here.

    Parameters
    ----------
    X : {ndarray, dataframe} of shape (n_samples, n_features)
        Array container from which to extract feature names. For pandas
        DataFrames, columns are treated as feature names and ``None`` is
        returned when any feature name is not a string. All other array
        containers return ``None``.

    Returns
    -------
    names: ndarray or None
        Feature names of `X`. Unrecognized array containers will return `None`.
    """
    feature_names = None

    # extract feature names for support array containers
    if hasattr(X, "columns"):
        feature_names = np.asarray(X.columns, dtype=object)

    if feature_names is None or len(feature_names) == 0:
        return

    types = sorted(
        t.__qualname__
        for t in set(type(v) for v in feature_names)
    )

    # mixed type of string and non-string is not supported
    if len(types) > 1 and "str" in types:
        raise TypeError(
            "Feature names only support names that are all strings. "
            f"Got feature names with dtypes: {types}."
        )

    # Only feature names of all strings are supported
    if len(types) == 1 and types[0] == "str":
        return feature_names



[docs]
def check_is_fitted(
    estimator, attributes=None, *, msg=None, all_or_any=all
):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    fitted attributes (ending with a trailing underscore) and otherwise
    raises a NotFittedError with the given message.

    If an estimator does not set any attributes with a trailing underscore, it
    can define a ``__sklearn_is_fitted__`` or ``__fusionlab_is_fitted__`` method
    returning a boolean to specify if the estimator is fitted or not.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exist an
        attribute that ends with a underscore and does not start with double
        underscore.

    msg : str, default=None
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this
        estimator."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Raises
    ------
    TypeError
        If the estimator is a class or not an estimator instance

    NotFittedError
        If the attributes are not found.
    """
    from ..exceptions import NotFittedError

    if isclass(estimator):
        raise TypeError(
            f"{estimator} is a class, not an instance."
        )
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this estimator."
        )

    if not hasattr(estimator, "fit"):
        raise TypeError(
            f"{estimator} is not an estimator instance."
        )

    if attributes is not None:
        if not isinstance(attributes, list | tuple):
            attributes = [attributes]
        fitted = all_or_any(
            [hasattr(estimator, attr) for attr in attributes]
        )
    elif hasattr(estimator, "__sklearn_is_fitted__"):
        fitted = estimator.__sklearn_is_fitted__()
    elif hasattr(estimator, "__fusionlab_is_fitted__"):
        fitted = estimator.__gofast_is_fitted__()
    else:
        fitted = [
            v
            for v in vars(estimator)
            if v.endswith("_") and not v.startswith("__")
        ]

    if not fitted:
        raise NotFittedError(
            msg % {"name": type(estimator).__name__}
        )



def _check_feature_names_in(
    estimator, input_features=None, *, generate_names=True
):
    """Check `input_features` and generate names if needed.

    Commonly used in :term:`get_feature_names_out`.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input feature names. If ``input_features`` is ``None``, then
        ``feature_names_in_`` is used when available; otherwise names like
        ``["x0", "x1", ..., "x(n_features_in_ - 1)"]`` are generated.
        When an array-like is provided, it must match ``feature_names_in_``
        if that attribute is defined.
    generate_names : bool, default=True
        Whether to generate names when `input_features` is `None` and
        `estimator.feature_names_in_` is not defined. This is useful for transformers
        that validates `input_features` but do not require them in
        :term:`get_feature_names_out` e.g. `PCA`.

    Returns
    -------
    feature_names_in : ndarray of str or `None`
        Feature names in.
    """

    feature_names_in_ = getattr(
        estimator, "feature_names_in_", None
    )
    n_features_in_ = getattr(
        estimator, "n_features_in_", None
    )

    if input_features is not None:
        input_features = np.asarray(
            input_features, dtype=object
        )
        if (
            feature_names_in_ is not None
            and not np.array_equal(
                feature_names_in_, input_features
            )
        ):
            raise ValueError(
                "input_features is not equal to feature_names_in_"
            )

        if (
            n_features_in_ is not None
            and len(input_features) != n_features_in_
        ):
            raise ValueError(
                "input_features should have length equal to number of "
                f"features ({n_features_in_}), got {len(input_features)}"
            )
        return input_features

    if feature_names_in_ is not None:
        return feature_names_in_

    if not generate_names:
        return

    # Generates feature names if `n_features_in_` is defined
    if n_features_in_ is None:
        raise ValueError(
            "Unable to generate feature names without n_features_in_"
        )

    return np.asarray(
        [f"x{i}" for i in range(n_features_in_)], dtype=object
    )


def _pandas_dtype_needs_early_conversion(pd_dtype):
    """Return True if pandas extension pd_dtype need to be converted early."""
    # Check these early for pandas versions without extension dtypes
    from pandas.api.types import (
        is_bool_dtype,
        # is_sparse,
        is_float_dtype,
        is_integer_dtype,
    )

    if is_bool_dtype(pd_dtype):
        # bool and extension booleans need early converstion because __array__
        # converts mixed dtype dataframes into object dtypes
        return True

    if isinstance(pd_dtype, pd.SparseDtype):
        # Sparse arrays will be converted later in `check_array`
        return False

    try:
        from pandas.api.types import is_extension_array_dtype
    except ImportError:
        return False

    # if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype): # deprecated
    if isinstance(
        pd_dtype, pd.SparseDtype
    ) or not is_extension_array_dtype(pd_dtype):
        # Sparse arrays will be converted later in `check_array`
        # Only handle extension arrays for integer and floats
        return False
    elif is_float_dtype(pd_dtype):
        # Float ndarrays can normally support nans. They need to be converted
        # first to map pd.NA to np.nan
        return True
    elif is_integer_dtype(pd_dtype):
        # XXX: Warn when converting from a high integer to a float
        return True

    return False


def _ensure_no_complex_data(array):
    if (
        hasattr(array, "dtype")
        and array.dtype is not None
        and hasattr(array.dtype, "kind")
        and array.dtype.kind == "c"
    ):
        raise ValueError(
            f"Complex data not supported\n{array}\n"
        )


def _check_estimator_name(estimator):
    if estimator is not None:
        if isinstance(estimator, str):
            return estimator
        else:
            return estimator.__class__.__name__
    return None


def set_array_back(
    X, *, to_frame=False, columns=None, input_name="X"
):
    """Set array back to frame, reconvert the Numpy array to pandas series
    or dataframe.

    Parameters
    ----------
    X: Array-like
        Array to convert to frame.
    columns: str or list of str
        Series name or columns names for pandas.Series and DataFrame.

    to_frame: str, default=False
        If ``True`` , reconvert the array to frame using the columns ortherwise
        no-action is performed and return the same array.
    input_name : str, default=""
        The data name used to construct the error message.
    force: bool, default=False,
        Force columns creating using the combination ``input_name`` and
        columns range if `columns` is not supplied.
    Returns
    -------
    X, columns : Array-like
        columns if `X` is dataframe and  name if Series. Otherwwise returns None.

    """

    # set_back =('out', 'back','reconvert', 'to_frame',
    #            'export', 'step back')
    type_col_name = type(columns).__name__

    if not (hasattr(X, "__array__") or sp.issparse(X)):
        raise TypeError(
            f"{input_name + ' o' if input_name != '' else 'O'}nly "
            f"supports array, got: {type(X).__name__!r}"
        )

    if hasattr(X, "columns"):
        # keep the columns
        columns = X.columns
    elif hasattr(X, "name"):
        # keep the name of series
        columns = X.name

    if to_frame and not sp.issparse(X):
        if columns is None:
            raise ValueError(
                "Name or columns must be supplied for"
                " frame conversion."
            )
        # if not string is given as name
        # check whether the columns contains only one
        # value and use it as name to skip
        # TypeError: Series.name must be a hashable type
        if _is_arraylike_1d(X):
            if not isinstance(columns, str) and hasattr(
                columns, "__len__"
            ):
                if len(columns) > 1:
                    raise ValueError(
                        f"{input_name} is 1d-array, only pandas.Series "
                        "conversion can be performed while name must be a"
                        f" hashable type: got {type_col_name!r}"
                    )
                columns = columns[0]

            X = pd.Series(X, name=columns)
        else:
            # columns is str , reconvert to a list
            # and check whether the columns match
            # the shape [1]
            if isinstance(columns, str):
                columns = [columns]
            if not hasattr(columns, "__len__"):
                raise TypeError(
                    " Columns for {input_name!r} expects "
                    f"a list or tuple. Got {type_col_name!r}"
                )
            if X.shape[1] != len(columns):
                raise ValueError(
                    f"Shape of passed values for {input_name} is"
                    f" {X.shape}. Columns indices imply {X.shape[1]},"
                    f" got {len(columns)}"
                )

            X = pd.DataFrame(X, columns=columns)

    return X, columns



[docs]
def convert_array_to_pandas(
    X, *, to_frame=False, columns=None, input_name="X"
):
    """
    Converts an array-like object to a pandas DataFrame or Series, applying
    provided column names or series name.

    Parameters
    ----------
    X : array-like
        The array to convert to a DataFrame or Series.
    to_frame : bool, default=False
        If True, converts the array to a DataFrame. Otherwise, returns the array unchanged.
    columns : str or list of str, optional
        Name(s) for the columns of the resulting DataFrame or the name of the Series.
    input_name : str, default='X'
        The name of the input variable; used in constructing error messages.

    Returns
    -------
    pd.DataFrame or pd.Series
        The converted DataFrame or Series. If `to_frame` is False, returns `X` unchanged.
    columns : str or list of str
        The column names of the DataFrame or the name of the Series, if applicable.

    Raises
    ------
    TypeError
        If `X` is not array-like or if `columns` is neither a string nor a list of strings.
    ValueError
        If the conversion to DataFrame is requested but `columns` is not provided,
        or if the length of `columns` does not match the number of columns in `X`.
    """
    # Check if the input is string, which is a common mistake
    if isinstance(X, str):
        raise TypeError(
            f"The parameter '{input_name}' should be an array-like"
            " or sparse matrix, but a string was passed."
        )

    # Validate the type of X
    if not (
        hasattr(X, "__array__")
        or isinstance(X, np.ndarray | pd.Series | list)
        or sp.issparse(X)
    ):
        raise TypeError(
            f"The parameter '{input_name}' should be array-like"
            f" or a sparse matrix. Received: {type(X).__name__!r}"
        )

    # Preserve existing DataFrame or Series column names
    if hasattr(X, "columns"):
        columns = X.columns
    elif hasattr(X, "name"):
        columns = X.name

    if to_frame and not sp.issparse(X):
        if columns is None:
            raise ValueError(
                "Columns must be provided for DataFrame conversion."
            )

        # Ensure columns is list-like for DataFrame conversion, single string for Series
        if isinstance(columns, str):
            columns = [columns]

        if not hasattr(columns, "__len__") or isinstance(
            columns, str
        ):
            raise TypeError(
                f"Columns for {input_name} must be a list or a single string."
            )

        # Convert to Series or DataFrame based on dimensionality
        if (
            X.ndim == 1 or len(X) == len(columns) == 1
        ):  # 1D array or single-column DataFrame
            X = pd.Series(X, name=columns[0])
        elif X.ndim == 2:  # 2D array to DataFrame
            if X.shape[1] != len(columns):
                raise ValueError(
                    f"Shape of passed values is {X.shape},"
                    f" but columns implied {len(columns)}"
                )
            X = pd.DataFrame(X, columns=columns)
        else:
            raise ValueError(
                f"{input_name} cannot be converted to DataFrame with given columns."
            )

    return X, columns




[docs]
def is_frame(
    arr,
    df_only=False,
    raise_exception=False,
    objname=None,
    error="raise",
):
    r"""
    Check if `arr` is a pandas DataFrame or Series.

    If ``df_only=True``, the function checks strictly for a pandas
    DataFrame. Otherwise, it accepts either a pandas DataFrame or
    Series. This utility is often used to validate input data before
    processing, ensuring that the input conforms to expected types.

    Parameters
    ----------
    arr : object
        The object to examine. Typically a pandas DataFrame or Series,
        but can be any Python object.
    df_only : bool, optional
        If True, only verifies that `arr` is a DataFrame. If False,
        checks for either a DataFrame or a Series. Default is False.
    raise_exception : bool, optional
        If True, this will override `error="raise"`. This parameter
        is deprecated and will be removed soon. Default is False.
    error : str, optional
        Determines the action when `arr` is not a valid frame. Can be:
        - ``"raise"``: Raises a TypeError.
        - ``"warn"``: Issues a warning.
        - ``"ignore"``: Does nothing. Default is ``"raise"``.
    objname : str or None, optional
        A custom name used in the error message if `error` is set to
        ``"raise"``. If None, a generic name is used.

    Returns
    -------
    bool
        True if `arr` is a DataFrame or Series (or strictly a DataFrame
        if `df_only=True`), otherwise False.

    Raises
    ------
    TypeError
        If `error="raise"` and `arr` is not a valid frame. The error
        message guides the user to provide the correct type
        (`DataFrame` or `DataFrame or Series`).

    Notes
    -----
    This function does not convert or modify `arr`. It merely checks
    its compatibility with common DataFrame/Series interfaces by
    examining attributes such as `'columns'` or `'name'`. For a
    DataFrame, `arr.columns` should exist, and for a Series, a `'name'`
    attribute is often present. Both DataFrame and Series implement
    `__array__`, making them NumPy array-like.

    Examples
    --------
    >>> import pandas as pd
    >>> from geoprior.utils.validator import is_frame

    >>> df = pd.DataFrame({'A': [1,2,3]})
    >>> is_frame(df)
    True

    >>> s = pd.Series([4,5,6], name='S')
    >>> is_frame(s)
    True

    >>> is_frame(s, df_only=True)
    False

    If `error="raise"`:

    >>> is_frame(s, df_only=True, error="raise", objname='Input')
    Traceback (most recent call last):
        ...
    TypeError: 'Input' parameter expects a DataFrame. Got 'Series'
    """

    # Handle deprecation for `raise_exception`
    if raise_exception and error != "raise":
        warnings.warn(
            "'raise_exception' is deprecated and will be replaced by 'error'."
            " The 'error' parameter is now used for specifying error handling.",
            stacklevel=2,
            category=DeprecationWarning,
        )
        error = "raise"  # Fall back to 'raise' if raise_exception is True

    # Determine if arr qualifies as a frame based on df_only
    if df_only:
        obj_is_frame = hasattr(arr, "__array__") and hasattr(
            arr, "columns"
        )
    else:
        obj_is_frame = hasattr(arr, "__array__") and (
            hasattr(arr, "name") or hasattr(arr, "columns")
        )

    # If not valid and error is set to 'raise', raise TypeError
    if not obj_is_frame:
        if error == "raise":
            objname = objname or "Input"
            objname = f"{objname!r} parameter expects"
            expected = (
                "a DataFrame"
                if df_only
                else "a DataFrame or Series"
            )
            raise TypeError(
                f"{objname} {expected}. Got {type(arr).__name__!r}"
            )
        elif error == "warn":
            warning_msg = (
                f"Warning: {objname or 'Input'} expects "
                f"a DataFrame or Series. Got {type(arr).__name__!r}."
            )
            warnings.warn(
                warning_msg,
                stacklevel=2,
                category=UserWarning,
            )

    return obj_is_frame




[docs]
def check_array(
    array,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    accept_sparse=False,
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    estimator=None,
    input_name="",
    to_frame=True,
):
    """Input validation on an array, list, or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : str, bool or list/tuple of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.
    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.
    order : {'F', 'C'} or None, default=None
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.
    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.
    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on ``np.inf``, ``np.nan``, or ``pd.NA`` in
        ``array``. Use ``True`` to require all values to be finite, ``False``
        to allow ``np.inf``, ``np.nan``, and ``pd.NA``, or ``"allow-nan"`` to
        allow only ``np.nan`` and ``pd.NA`` while still rejecting infinite
        values. ``pd.NA`` is converted into ``np.nan``.
    ensure_2d : bool, default=True
        Whether to raise a value error if array is not 2D.
    ensure_min_samples : int, default=1
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.
    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.
    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.
    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    to_frame : bool, default=False
        Reconvert array back to pd.Series or pd.DataFrame if
        the original array is pd.Series or pd.DataFrame.

    Returns
    -------
    array_converted : object
        The converted and validated array.
    """
    if isinstance(array, np.matrix):
        raise TypeError(
            "np.matrix is not supported. Please convert to a numpy array with "
            "np.asarray. For more information see: "
            "https://numpy.org/doc/stable/reference/generated/numpy.matrix.html"
        )
    xp, is_array_api = get_namespace(array)

    # collect the name or series if
    # data is pandas series or dataframe.
    # and reconvert by to series or dataframe
    # array is series or dataframe.
    array, column_orig = convert_array_to_pandas(
        array, input_name=input_name
    )

    # store reference to original array to check if copy is needed when
    # function returns
    array_orig = array

    # store whether originally we wanted numeric dtype
    dtype_numeric = (
        isinstance(dtype, str) and dtype == "numeric"
    )

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, "kind"):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # check if the object contains several dtypes (typically a pandas
    # DataFrame), and store them. If not, store None.
    dtypes_orig = None
    pandas_requires_conversion = False

    # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    if hasattr(array, "dtypes") and hasattr(
        array.dtypes, "__array__"
    ):
        # throw warning if columns are sparse. If all columns are sparse, then
        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            # from pandas.api.types import is_sparse

            if not hasattr(array, "sparse") and isinstance(
                array, pd.SparseDtype
            ):
                warnings.warn(
                    "pandas.DataFrame with sparse columns found."
                    "It will be converted to a dense numpy array.",
                    stacklevel=2,
                )

        dtypes_orig = list(array.dtypes)
        pandas_requires_conversion = any(
            _pandas_dtype_needs_early_conversion(i)
            for i in dtypes_orig
        )
        if all(
            isinstance(dtype_iter, np.dtype)
            for dtype_iter in dtypes_orig
        ):
            dtype_orig = np.result_type(*dtypes_orig)
    # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = xp.float64
        else:
            dtype = None

    if isinstance(dtype, list | tuple):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if pandas_requires_conversion:
        # pandas dataframe requires conversion earlier to handle extension dtypes with
        # nans
        # Use the original dtype for conversion if dtype is None
        new_dtype = dtype_orig if dtype is None else dtype
        array = array.astype(new_dtype)
        # Since we converted here, we do not need to convert again later
        dtype = None

    if force_all_finite not in (True, False, "allow-nan"):
        raise ValueError(
            f'force_all_finite should be a bool or "allow-nan". Got {force_all_finite!r} instead'
        )
    estimator_name = _check_estimator_name(estimator)
    # context = " by %s" % estimator_name if estimator is not None else ""

    if sp.issparse(array):
        _ensure_no_complex_data(array)
        array = _ensure_sparse_format(
            array,
            accept_sparse=accept_sparse,
            dtype=dtype,
            copy=copy,
            force_all_finite=force_all_finite,
            accept_large_sparse=accept_large_sparse,
            estimator_name=estimator_name,
            input_name=input_name,
        )

    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("error", ComplexWarning)
                if (
                    dtype is not None
                    and np.dtype(dtype).kind in "iu"
                ):
                    # Conversion float -> int should not contain NaN or
                    # inf (numpy#14412). We cannot use casting='safe' because
                    # then conversion float -> int would be disallowed.
                    array = _asarray_with_order(
                        array, order=order, xp=xp
                    )
                    if array.dtype.kind == "f":
                        _assert_all_finite(
                            array,
                            allow_nan=False,
                            msg_dtype=dtype,
                            estimator_name=estimator_name,
                            input_name=input_name,
                        )
                    array = xp.astype(
                        array, dtype, copy=False
                    )
                else:
                    array = _asarray_with_order(
                        array, order=order, dtype=dtype, xp=xp
                    )
            except ComplexWarning as complex_warning:
                raise ValueError(
                    f"Complex data not supported\n{array}\n"
                ) from complex_warning

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)

        if len(array) == 0:
            raise ValueError(
                "Found array with 0 length while a minimum of 1 is required."
            )
        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    f"Expected 2D array, got scalar array instead:\narray={array}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample."
                )
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead. "
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample."
                )

        if (
            dtype_numeric
            and (
                array.values.dtype.kind
                if hasattr(array, "columns")
                else array.dtype.kind
            )
            in "USV"
        ):
            raise ValueError(
                "dtype='numeric' is not compatible with arrays of bytes/strings."
                "Convert your data to numeric values explicitly instead."
            )
        if not allow_nd and array.ndim >= 3:
            raise ValueError(
                f"Found array with dim {array.ndim}. "
                f"{estimator_name} expected <= 2."
            )
        if force_all_finite:
            _assert_all_finite(
                array,
                input_name=input_name,
                estimator_name=estimator_name,
                allow_nan=force_all_finite == "allow-nan",
            )

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError(
                f"Found array with {n_samples} sample(s) "
                f"(shape={array.shape}) while a minimum of "
                f"{ensure_min_samples} is required."
            )

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError(
                f"Found array with {n_features} feature(s) "
                f"(shape={array.shape}) while a minimum of "
                f"{ensure_min_features} is required."
            )

    if copy:
        if xp.__name__ in {"numpy", "numpy.array_api"}:
            # only make a copy if `array` and `array_orig` may share memory`
            if np.may_share_memory(array, array_orig):
                array = _asarray_with_order(
                    array,
                    dtype=dtype,
                    order=order,
                    copy=True,
                    xp=xp,
                )
        else:
            # always make a copy for non-numpy arrays
            array = _asarray_with_order(
                array,
                dtype=dtype,
                order=order,
                copy=True,
                xp=xp,
            )

    if to_frame:
        array = array_to_frame(
            array,
            to_frame=to_frame,
            columns=column_orig,
            input_name=input_name,
            raise_warning="silence",
        )

    return array




[docs]
def check_X_y(
    X,
    y,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    multi_output=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    y_numeric=False,
    estimator=None,
    to_frame=False,
):
    """Input validation for standard estimators.

    Checks X and y for consistent length, enforces X to be 2D and y 1D. By
    default, X is checked to be non-empty and containing only finite values.
    Standard input checks are also applied to y, such as checking that y
    does not have np.nan or np.inf targets. For multi-label y, set
    multi_output=True to allow 2D and sparse y. If the dtype of X is
    object, attempt converting to float, raising on failure.

    Parameters
    ----------
    X : {ndarray, list, sparse matrix}
        Input data.
    y : {ndarray, list, sparse matrix}
        Labels.
    accept_sparse : str, bool or list of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.
    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse will cause it to be accepted only
        if its indices are stored with a 32-bit dtype.
    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.
    order : {'F', 'C'}, default=None
        Whether an array will be forced to be fortran or c-style.
    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.
    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        Use ``True`` to require all values of ``X`` to be finite, ``False`` to
        allow ``np.inf``, ``np.nan``, and ``pd.NA``, or ``"allow-nan"`` to
        allow only ``np.nan`` and ``pd.NA`` while still rejecting infinite
        values. ``pd.NA`` is accepted and converted into ``np.nan``.
    ensure_2d : bool, default=True
        Whether to raise a value error if X is not 2D.
    allow_nd : bool, default=False
        Whether to allow X.ndim > 2.
    multi_output : bool, default=False
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.
    ensure_min_samples : int, default=1
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).
    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.
    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.
    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.
    y_converted : object
        The converted and validated y.
    """
    if y is None:
        if estimator is None:
            estimator_name = "estimator"
        else:
            estimator_name = _check_estimator_name(estimator)
        raise ValueError(
            f"{estimator_name} requires y to be passed, but the target y is None"
        )

    X = check_array(
        X,
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
        input_name="X",
        to_frame=to_frame,
    )

    y = check_y(
        y,
        multi_output=multi_output,
        y_numeric=y_numeric,
        estimator=estimator,
    )

    check_consistent_length(X, y)

    return X, y




[docs]
def check_y(
    y,
    multi_output=False,
    y_numeric=False,
    input_name="y",
    estimator=None,
    to_frame=False,
    allow_nan=False,
):
    """
    Validates the target array `y`, ensuring it is suitable for classification
    or regression tasks based on its content and the specified strategy.

    Parameters
    ----------
    y : array-like
        Target values to validate.
    multi_output : bool, default=False
        Whether to allow two-dimensional ``y`` values. If ``False``, ``y`` is
        validated as a vector. When ``multi_output=True``, ``y`` still cannot
        contain ``np.nan`` or ``np.inf`` values unless ``allow_nan`` permits
        NaNs.
    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.
    input_name : str, default="y"
        Data name used to construct the error message.
    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.
    allow_nan : bool, default=False
        If ``True``, do not raise an error when ``y`` contains NaN values.
    to_frame : bool, default=False
        Reconvert the validated array to its initial pandas type when the
        input was provided as a pandas Series or DataFrame.

    Returns
    -------
    y_converted : object
        The converted and validated y.
    """
    y, column_orig = convert_array_to_pandas(
        y, input_name=input_name
    )
    if multi_output:
        y = check_array(
            y,
            accept_sparse="csr",
            force_all_finite=True
            if not allow_nan
            else "allow-nan",
            ensure_2d=False,
            dtype=None,
            input_name=input_name,
            estimator=estimator,
        )
    else:
        estimator_name = _check_estimator_name(estimator)
        y = _check_y_1d(y, warn=True, input_name=input_name)
        _assert_all_finite(
            y,
            input_name=input_name,
            estimator_name=estimator_name,
            allow_nan=allow_nan,
        )
        _ensure_no_complex_data(y)
    if y_numeric and y.dtype.kind == "O":
        y = y.astype(np.float64)

    if to_frame:
        y = array_to_frame(
            y,
            to_frame=to_frame,
            columns=column_orig,
            input_name=input_name,
            raise_warning="mute",
        )

    return y




[docs]
def validate_dtype_selector(dtype_selector: str) -> str:
    """
    Validates and categorizes the dtype_selector using regex, including
    handling cases where 'only' is specifically included.

    Parameters
    ----------
    dtype_selector : str
        Input dtype selector string.

    Returns
    -------
    str
        Categorized ``dtype_selector`` based on predefined patterns. If
        ``"only"`` is included, the returned category reflects this so it can
        drive specific data-type handling.

    Raises
    ------
    ValueError
        If the input ``dtype_selector`` does not match any predefined category.
    """
    types = [
        "numeric",
        "numeric_only",
        "categoric",
        "categoric_only",
        "biselect",
        "biselector",
        "datetime",
    ]
    # Regex patterns for matching dtype_selector categories with an optional 'only'
    numeric_pattern = r"numeric(_only)?"
    categoric_pattern = r"categoric(al|_only)?|categorical"
    datetime_pattern = r"dt|datetime"
    biselect_pattern = r"bi[-_]?selector|biselect|biselector"

    # Check if 'only' is included and modify the category accordingly
    suffix = (
        "_only"
        if "only" in str(dtype_selector).lower()
        else ""
    )

    if re.match(
        numeric_pattern, dtype_selector, re.IGNORECASE
    ):
        return f"numeric{suffix}"
    elif re.match(
        categoric_pattern, dtype_selector, re.IGNORECASE
    ):
        return f"categoric{suffix}"
    elif re.match(
        datetime_pattern, dtype_selector, re.IGNORECASE
    ):
        return "datetime"
    elif re.match(
        biselect_pattern, dtype_selector, re.IGNORECASE
    ):
        return "biselect"

    raise ValueError(
        f"Invalid dtype_selector provided. Valid options are :{types}"
    )



def build_series_if(
    *arr,
    series_names=None,
    indexes=None,
    dtype=None,
    dropna=False,
    fill_value=None,
    inplace=False,
    transpose=False,
    reset_index=False,
    error_policy="raise",
):
    """
    Constructs one or more pandas Series from the provided input arrays.
    Handles various input cases, such as single values, arrays, and DataFrames.

    Parameters
    ----------
    *arr : array-like or DataFrame
        The input data(s). Can be a numpy array, pandas DataFrame, or
        single value. Each element will be processed individually to build
        a pandas Series.

    series_names : str or list of str, optional, default None
        The name(s) to assign to each resulting Series. If a string is provided,
        all Series will have the same name. If a list is provided, the list
        should have the same length as `arr`. If not provided, the Series will
        not be given a name.

    indexes : array-like, optional, default None
        The index to use for the resulting Series. If None, a default integer
        index will be used. If specified, it should match the length of the
        input data or the number of Series being created.

    dtype : dtype, optional, default None
        The data type to force on the resulting Series. If None, pandas will
        infer the appropriate type based on the data.

    dropna : bool, optional, default False
        If True, any NaN values will be dropped from the Series. If False,
        NaN values will remain in the Series.

    fill_value : scalar, optional, default None
        If specified, this value will replace any NaN values in the resulting
        Series. If None, no filling occurs.

    inplace : bool, optional, default False
        If True, modifications to the Series will be made in place and no
        new object will be returned. If False, a new Series is created and
        returned.

    transpose : bool, optional, default False
        If True, the Series will be transposed. This option is useful when
        working with DataFrames where each column needs to be converted into
        a Series.

    reset_index : bool, optional, default False
        If True, resets the index of the resulting Series. This will drop the
        current index and replace it with a new default integer index.

    error_policy : {'raise', 'warn', 'ignore'}, optional, default 'raise'
        Defines how to handle errors during Series construction. Use
        ``"raise"`` to raise an error, ``"warn"`` to emit a warning, or
        ``"ignore"`` to suppress errors.

    Returns
    -------
    list of pandas.Series or pandas.Series
        The constructed pandas Series objects. If only one Series is created,
        a single Series is returned; otherwise, a list of Series is returned.

    Examples
    --------
    >>> from geoprior.utils.validator import build_series_if
    >>> data = [1, 2, 3]
    >>> build_series_if(data)
    0    1
    1    2
    2    3
    dtype: int64

    >>> data1 = [1, 2, 3]
    >>> data2 = [4, 5, 6]
    >>> build_series_if(data1, data2, series_names=["A", "B"])
    [0    1
     1    2
     2    3
     dtype: int64,
     0    4
     1    5
     2    6
     dtype: int64]

    >>> build_series_if(data, fill_value=0, dropna=True)
    0    1
    1    2
    2    3
    dtype: int64

    Notes
    -----
    The function validates and converts each input in ``arr`` before building
    a pandas Series, then applies any optional dtype conversion, index
    assignment, missing-value handling, transposition, or index reset. The
    final result is either a single Series or a list of Series depending on
    how many inputs are provided.

    See Also
    --------
    pandas.Series : The pandas Series constructor used to generate Series objects.
    """
    series_list = []

    try:
        # Iterate over the input data
        for idx, data in enumerate(arr):
            # Convert input data to a Series if needed
            data = _validate_and_convert_data(data)

            # Handle series naming
            series_names = _check_series_names(
                series_names, len(arr), error_policy
            )

            # Create the Series with a name if available
            if series_names:
                series = pd.Series(
                    data,
                    name=series_names[idx]
                    if series_names[idx]
                    else None,
                )
            else:
                series = pd.Series(data)
            # Apply additional modifications based on parameters
            if dtype is not None:
                series = series.astype(dtype)

            if indexes is not None:
                # Check if indexes length matches the data length
                series = _check_series_indexes(
                    series, indexes, data, idx, error_policy
                )
            if dropna:
                series = series.dropna()

            if fill_value is not None:
                series = series.fillna(fill_value)

            if inplace:
                # If inplace is True, modify the series in place
                continue

            # Apply transpose if needed
            if transpose:
                series = series.T

            if reset_index:
                series = series.reset_index(drop=True)

            # Append series to the list of results
            series_list.append(series)

        # Return a single Series if only one is constructed
        if len(series_list) == 1:
            return series_list[0]

        return series_list

    except Exception as e:
        if error_policy == "raise":
            raise e
        elif error_policy == "warn":
            warnings.warn(f"{e}", stacklevel=2)
        elif error_policy == "ignore":
            pass

    return arr


def _check_series_names(series_names, data_len, error_policy):
    """
    Helper function to check if the length of series_names matches the
    length of data.
    """
    if (
        isinstance(series_names, list | tuple)
        and len(series_names) != data_len
    ):
        msg = "Length of series_names does not match the length of input data."
        if error_policy == "raise":
            raise ValueError(msg)
        elif error_policy == "warn":
            warnings.warn(f"{msg}", stacklevel=2)
        # Optionally extend series names if needed
        elif error_policy == "ignore":
            series_names += [None] * (
                data_len - len(series_names)
            )
            # return series_names
    return series_names


def _validate_and_convert_data(data):
    """
    Helper function to validate and convert input data to a compatible form.

    - Converts a list or tuple into a numpy array.
    - If a pandas DataFrame with a single column is passed, converts
      it to a Series.
    - If a numpy array with shape (1, N) is passed, squeezes to a 1D array.
    - If the input is a scalar, converts it to a 1D numpy array.
    - If the input is a 2D array (not a single-column DataFrame),
      raises a ValueError.

    Parameters
    ----------
    data : object
        Input data to validate and convert.

    Returns
    -------
    numpy.ndarray or pandas.Series
        A one-dimensional NumPy array or pandas Series.

    Raises
    ------
    ValueError
        If the input data is not one-dimensional when expected.
    """
    # Check if the data is a pandas DataFrame with a single column
    if isinstance(data, pd.DataFrame) and data.shape[1] == 1:
        return data.iloc[
            :, 0
        ]  # Return the single column as a Series

    # Check if the data is a numpy array, and squeeze it if necessary
    elif isinstance(data, np.ndarray):
        data = data.squeeze()  # Squeeze to ensure a 1D array

    # Check if the data is a scalar value (0-dimensional)
    if np.ndim(data) == 0:
        return np.array([data])  # Convert scalar to 1D array

    # Check if the data is 2D (and not a single-column DataFrame)
    if np.ndim(data) == 2:
        # Raise error for 2D array
        raise ValueError(
            "Expected 1D data for series construction, but got 2D array."
        )

    # Otherwise, convert the data to a numpy array
    return np.asarray(data)  # Return data as a 1D numpy array


def _check_series_indexes(
    series, indexes, data, idx, error_policy
):
    """Check if indexes length matches the data length"""
    if len(indexes) != len(data):
        if error_policy == "raise":
            raise ValueError(
                "Length of indexes does not match the length of input data."
            )
        elif error_policy == "warn":
            warnings.warn(
                "Length of indexes does not match the length of input data.",
                stacklevel=2,
            )
        # Use default index if error_policy is 'ignore'
    else:
        series.index = indexes[idx]

    return series



[docs]
def build_data_if(
    data,
    columns=None,
    to_frame=True,
    input_name="data",
    col_prefix="col_",
    force=False,
    error="warn",
    coerce_datetime=False,
    coerce_numeric=True,
    start_incr_at=0,
    **kw,
):
    """
    Validates and converts ``data`` into a pandas DataFrame
    if requested, optionally enforcing consistent column
    naming. Intended to standardize data structures for
    downstream analysis.

    See more in :func:`geoprior.utils.data_utils.build_df` for
    documentation details.

    """

    force = (
        True
        if (force == "auto" and columns is None)
        else force
    )

    # Attempt to ensure start_incr_at is an integer
    try:
        start_incr_at = int(start_incr_at)
    except ValueError:
        # If the user provided a non-integer, handle it
        # based on the value of `error`
        if error == "raise":
            raise TypeError(
                f"Expected integer for start_incr_at, got "
                f"{type(start_incr_at)} instead."
            )
        elif error == "warn":
            warnings.warn(
                f"Provided 'start_incr_at'={start_incr_at} is not "
                "an integer. Defaulting to 0.",
                UserWarning,
                stacklevel=2,
            )
        # Gracefully default to 0 if error='ignore' or we
        # just want to continue
        start_incr_at = 0

    # Convert from dict to DataFrame if needed. If it's a dict,
    # we can directly create a DataFrame from it
    if isinstance(data, dict):
        data = pd.DataFrame(data)
        # Overwrite columns if they come from dict's keys
        columns = list(data.columns)

    # Convert list or tuple to NumPy array for uniform handling
    elif isinstance(data, list | tuple):
        data = np.array(data)

    # If data is a Series, convert it to a DataFrame
    elif isinstance(data, pd.Series):
        data = data.to_frame()

    # Ensure data is 2D by using a helper function
    data = ensure_2d(data)

    # If user wants a DataFrame but we don't have one yet:
    if to_frame and not isinstance(data, pd.DataFrame):
        # If columns are not specified and force=False,
        # we warn or raise accordingly
        if columns is None and not force:
            msg = (
                f"Conversion of '{input_name}' to DataFrame requires "
                "column names. Provide `columns` or set `force=True` to "
                "auto-generate them."
            )
            if error == "raise":
                raise TypeError(msg)
            elif error == "warn":
                warnings.warn(msg, UserWarning, stacklevel=2)

        # If forced, generate column names automatically if not given
        if force and columns is None:
            columns = [
                f"{col_prefix}{i + start_incr_at}"
                for i in range(data.shape[1])
            ]

        # Perform final DataFrame conversion
        data = pd.DataFrame(data, columns=columns)

    # Perform an array-to-frame conversion with potential
    # re-checking of columns
    data = array_to_frame(
        data,
        columns=columns,
        to_frame=to_frame,
        input_name=input_name,
        force=force,
    )

    # Optionally apply data-type checks or conversions, like
    # datetime or numeric coercion
    if isinstance(data, pd.DataFrame):
        data = recheck_data_types(
            data,
            coerce_datetime=coerce_datetime,
            coerce_numeric=coerce_numeric,
            return_as_numpy=False,
            column_prefix=col_prefix,
        )

    # Convert integer column names to strings, if needed
    data = _convert_int_columns_to_str(
        data, col_prefix=col_prefix
    )

    # Return the final validated and (optionally) converted DataFrame
    return data



def _convert_int_columns_to_str(
    df: pd.DataFrame, col_prefix: str | None = "col_"
) -> pd.DataFrame:
    """
    Convert integer columns in a DataFrame to string form,
    optionally adding a prefix.
    """
    # If it's not a DataFrame, just return it as-is
    if not isinstance(df, pd.DataFrame):
        return df

    # Check if every column name is an integer
    if all(isinstance(col, int) for col in df.columns):
        # Copy to avoid mutating the original
        df_converted = df.copy()

        if col_prefix is None:
            # Convert to str without prefix
            df_converted.columns = (
                df_converted.columns.astype(str)
            )
        else:
            # Convert to str with user-provided prefix
            df_converted.columns = [
                f"{col_prefix}{col}"
                for col in df_converted.columns
            ]
        return df_converted
    else:
        # Return a copy of the original if columns are not all int
        return df.copy()



[docs]
def array_to_frame(
    X,
    *,
    to_frame=False,
    columns=None,
    raise_exception=False,
    raise_warning=True,
    input_name="",
    force=False,
):
    """
    Validates and optionally converts an array-like object to a pandas DataFrame,
    applying specified column names if provided or generating them if the `force`
    parameter is set.

    Parameters
    ----------
    X : array-like
        The array to potentially convert to a DataFrame.
    columns : str or list of str, optional
        The names for the resulting DataFrame columns or the Series name.
    to_frame : bool, default=False
        If True, converts `X` to a DataFrame if it isn't already one.
    input_name : str, default=''
        The name of the input variable, used for error and warning messages.
    raise_warning : bool, default=True
        If True and `to_frame` is True but `columns` are not provided,
        a warning is issued unless `force` is True.
    raise_exception : bool, default=False
        If True, raises an exception when `to_frame` is True but columns
        are not provided and `force` is False.
    force : bool, default=False
        Forces the conversion of `X` to a DataFrame by generating column names
        based on `input_name` if `columns` are not provided.

    Returns
    -------
    pd.DataFrame or pd.Series
        The potentially converted DataFrame or Series, or `X` unchanged.

    Examples
    --------
    >>> from geoprior.utils.validator import array_to_frame
    >>> from sklearn.datasets import load_iris
    >>> data = load_iris()
    >>> X = data.data
    >>> array_to_frame(X, to_frame=True, columns=['sepal_length', 'sepal_width',
                                                  'petal_length', 'petal_width'])
    """
    # Determine if conversion to frame is needed
    if to_frame and not isinstance(
        X, pd.DataFrame | pd.Series
    ):
        # Handle force conversion without provided column names
        if columns is None and force:
            columns = [
                f"{input_name}_{i}" for i in range(X.shape[1])
            ]
        elif columns is None:
            msg = (
                f"Array '{input_name}' requires column names for conversion to a DataFrame. "
                "Provide `columns` or set `force=True` to auto-generate column names."
            )
            if raise_exception:
                raise ValueError(msg)
            if raise_warning and raise_warning not in (
                "silence",
                "ignore",
                "mute",
            ):
                warnings.warn(msg, stacklevel=2)
            return X  # Early return if no columns and not forcing

        # Proceed with conversion using the provided or generated column names
        X, _ = convert_array_to_pandas(
            X,
            to_frame=True,
            columns=columns,
            input_name=input_name,
        )

    return X




[docs]
def array_to_frame2(
    X,
    *,
    to_frame=False,
    columns=None,
    raise_exception=False,
    raise_warning=True,
    input_name="",
    force: bool = False,
):
    """Added part of `is_frame` dedicated to X and y frame reconversion
    validation.

    Parameters
    ------------
    X: Array-like
        Array to convert to frame.
    columns: str or list of str
        Series name or columns names for pandas.Series and DataFrame.

    to_frame: str, default=False
        If ``True`` , reconvert the array to frame using the columns orthewise
        no-action is performed and return the same array.
    input_name : str, default=""
        The data name used to construct the error message.

    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.
        If ``ignore``, warnings silence mode is triggered.
    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.

    force:bool, default=False
        Force conversion array to a frame is columns is not supplied.
        Use the combinaison, `input_name` and `X.shape[1]` range.

    Returns
    --------
    X: converted array

    Example
    ---------
    >>> from geoprior.datasets import fetch_data
    >>> from geoprior.utils.validator import array_to_frame
    >>> data = fetch_data ('hlogs').frame
    >>> array_to_frame (data.k.values ,
                        to_frame= True, columns =None, input_name= 'y',
                        raise_warning="silence"
                                )
    ... array([nan, nan, nan, ..., nan, nan, nan]) # mute

    """

    isf = to_frame
    isf = is_frame(X)

    if to_frame and not isf and columns is None:
        if force:
            columns = [
                f"{input_name + str(i)}"
                for i in range(X.shape[1])
            ]
            isf = True
        else:
            msg = (
                f"Array {input_name} is originally not a frame. Frame "
                "conversion cannot be performed with no column names."
            )
            if raise_exception:
                raise ValueError(msg)
            if raise_warning and raise_warning not in (
                "silence",
                "ignore",
                "mute",
            ):
                warnings.warn(msg, stacklevel=2)

            isf = False

    elif to_frame and columns is not None:
        isf = True

    X, _ = convert_array_to_pandas(
        X,
        to_frame=isf,
        columns=columns,
        input_name=input_name,
    )

    return X



def _check_y_1d(y, *, warn=False, input_name="y"):
    """Ravel column or 1d numpy array, else raises an error.

    and Isolated part of check_X_y dedicated to y validation.

    Parameters
    ----------
    y : array-like
       Input data.
    warn : bool, default=False
       To control display of warnings.
    Returns
    -------
    y : ndarray
       Output data.
    Raises
    ------
    ValueError
        If `y` is not a 1D array or a 2D array with a single row or column.
    """
    xp, _ = get_namespace(y)
    y = xp.asarray(y)
    shape = y.shape
    if len(shape) == 1:
        return _asarray_with_order(
            xp.reshape(y, -1), order="C", xp=xp
        )
    if len(shape) == 2 and shape[1] == 1:
        if warn:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples, ), for example using ravel().",
                DataConversionWarning,
                stacklevel=2,
            )
        return _asarray_with_order(
            xp.reshape(y, -1), order="C", xp=xp
        )

    raise ValueError(
        f"{input_name} should be a 1d array, got"
        f" an array of shape {shape} instead."
    )


def _check_large_sparse(X, accept_large_sparse=False):
    """Raise a ValueError if X has 64bit indices and accept_large_sparse=False"""
    if not accept_large_sparse:
        supported_indices = ["int32"]
        if X.getformat() == "coo":
            index_keys = ["col", "row"]
        elif X.getformat() in ["csr", "csc", "bsr"]:
            index_keys = ["indices", "indptr"]
        else:
            return
        for key in index_keys:
            indices_datatype = getattr(X, key).dtype
            if indices_datatype not in supported_indices:
                raise ValueError(
                    "Only sparse matrices with 32-bit integer"
                    f" indices are accepted. Got {indices_datatype} indices."
                )


def _ensure_sparse_format(
    spmatrix,
    accept_sparse,
    dtype,
    copy,
    force_all_finite,
    accept_large_sparse,
    estimator_name=None,
    input_name="",
):
    """Convert a sparse matrix to a given format.

    Checks the sparse format of spmatrix and converts if necessary.

    Parameters
    ----------
    spmatrix : sparse matrix
        Input to validate and convert.
    accept_sparse : str, bool or list/tuple of str
        String[s] representing allowed sparse matrix formats ('csc',
        'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
        not in the allowed format, it will be converted to the first listed
        format. True allows the input to be any format. False means
        that a sparse matrix input will raise an error.
    dtype : str, type or None
        Data type of result. If None, the dtype of the input is preserved.
    copy : bool
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.
    force_all_finite : bool or 'allow-nan'
        Whether to raise an error on ``np.inf``, ``np.nan``, or ``pd.NA`` in
        ``X``. Use ``True`` to require all values to be finite, ``False`` to
        allow ``np.inf``, ``np.nan``, and ``pd.NA``, or ``"allow-nan"`` to
        allow only ``np.nan`` and ``pd.NA`` while still rejecting infinite
        values. ``pd.NA`` is accepted and converted into ``np.nan``.
    estimator_name : str, default=None
        The estimator name, used to construct the error message.
    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Returns
    -------
    spmatrix_converted : sparse matrix.
        Matrix that is ensured to have an allowed type.
    """
    if dtype is None:
        dtype = spmatrix.dtype

    changed_format = False

    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # Indices dtype validation
    _check_large_sparse(spmatrix, accept_large_sparse)

    if accept_sparse is False:
        raise TypeError(
            "A sparse matrix was passed, but dense "
            "data is required. Use X.toarray() to "
            "convert to a dense numpy array."
        )
    elif isinstance(accept_sparse, list | tuple):
        if len(accept_sparse) == 0:
            raise ValueError(
                "When providing 'accept_sparse' "
                "as a tuple or list, it must contain at "
                "least one string value."
            )
        # ensure correct sparse format
        if spmatrix.format not in accept_sparse:
            # create new with correct sparse
            spmatrix = spmatrix.asformat(accept_sparse[0])
            changed_format = True
    elif accept_sparse is not True:
        # any other type
        raise ValueError(
            "Parameter 'accept_sparse' should be a string, "
            "boolean or list of strings. You provided "
            f"'accept_sparse={accept_sparse}'."
        )

    if dtype != spmatrix.dtype:
        # convert dtype
        spmatrix = spmatrix.astype(dtype)
    elif copy and not changed_format:
        # force copy
        spmatrix = spmatrix.copy()

    if force_all_finite:
        if not hasattr(spmatrix, "data"):
            warnings.warn(
                f"Can't check {spmatrix.format} sparse matrix for nan or inf.",
                stacklevel=2,
            )
        else:
            _assert_all_finite(
                spmatrix.data,
                allow_nan=force_all_finite == "allow-nan",
                estimator_name=estimator_name,
                input_name=input_name,
            )

    return spmatrix


def _object_dtype_isnan(X):
    return X != X


def _assert_all_finite(
    X,
    allow_nan=False,
    msg_dtype=None,
    estimator_name=None,
    input_name="",
):
    """Like assert_all_finite, but only for ndarray."""

    err_msg = (
        f"{input_name} does not accept missing values encoded as NaN"
        " natively. Alternatively, it is possible to preprocess the data,"
        " for instance by using the imputer transformer like the ufunc"
        " 'soft_imputer' in 'geoprior.utils.mlutils.soft_imputer'."
    )

    xp, _ = get_namespace(X)

    # if _get_config()["assume_finite"]:
    #     return
    X = xp.asarray(X)

    # for object dtype data, we only check for NaNs (GH-13254)
    if X.dtype == np.dtype("object") and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN. " + err_msg)

    # We need only consider float arrays, hence can early return for all else.
    if X.dtype.kind not in "fc":
        return

    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space `np.isinf/isnan` or custom
    # Cython implementation to prevent false positives and provide a detailed
    # error message.
    with np.errstate(over="ignore"):
        first_pass_isfinite = xp.isfinite(xp.sum(X))
    if first_pass_isfinite:
        return
    # Cython implementation doesn't support FP16 or complex numbers
    # use_cython = (
    #     xp is np and X.data.contiguous and X.dtype.type in {np.float32, np.float64}
    # )
    # if use_cython:
    #     out = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
    #     has_nan_error = False if allow_nan else out == FiniteStatus.has_nan
    #     has_inf = out == FiniteStatus.has_infinite
    # else:
    has_inf = np.isinf(X).any()
    has_nan_error = False if allow_nan else xp.isnan(X).any()
    if has_inf or has_nan_error:
        if has_nan_error:
            type_err = "NaN"
        else:
            msg_dtype = (
                msg_dtype
                if msg_dtype is not None
                else X.dtype
            )
            type_err = f"infinity or a value too large for {msg_dtype!r}"
        padded_input_name = (
            input_name + " " if input_name else ""
        )
        msg_err = (
            f"Input {padded_input_name}contains {type_err}."
        )
        if (
            estimator_name
            and input_name == "X"
            and has_nan_error
        ):
            # Improve the error message on how to handle missing values in
            # scikit-learn.
            msg_err += (
                f"\n{estimator_name} does not accept missing values"
                " encoded as NaN natively. For supervised learning, you might want"
                " to consider sklearn.ensemble.HistGradientBoostingClassifier and"
                " Regressor which accept missing values encoded as NaNs natively."
                " Alternatively, it is possible to preprocess the data, for"
                " instance by using an imputer transformer in a pipeline or drop"
                " samples with missing values. See"
                " https://scikit-learn.org/stable/modules/impute.html"
                " You can find a list of all estimators that handle NaN values"
                " at the following page:"
                " https://scikit-learn.org/stable/modules/impute.html"
                "#estimators-that-handle-nan-values"
            )
        elif estimator_name is None and has_nan_error:
            msg_err += f"\n{err_msg}"

        raise ValueError(msg_err)



[docs]
def assert_all_finite(
    X,
    *,
    allow_nan=False,
    estimator_name=None,
    input_name="",
):
    """Throw a ValueError if X contains NaN or infinity.

    Parameters
    ----------
    X : {ndarray, sparse matrix}
        The input data.
    allow_nan : bool, default=False
        If True, do not throw error when `X` contains NaN.
    estimator_name : str, default=None
        The estimator name, used to construct the error message.
    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.
    """
    _assert_all_finite(
        X.data if sp.issparse(X) else X,
        allow_nan=allow_nan,
        estimator_name=estimator_name,
        input_name=input_name,
    )



def _generate_get_feature_names_out(
    estimator, n_features_out, input_features=None
):
    """Generate feature names out for estimator using the estimator name as the prefix.

    The input_feature names are validated but not used. This function is useful
    for estimators that generate their own names based on `n_features_out`, i.e. PCA.

    Parameters
    ----------
    estimator : estimator instance
        Estimator producing output feature names.
    n_feature_out : int
        Number of feature names out.
    input_features : array-like of str or None, default=None
        Only used to validate feature names with `estimator.feature_names_in_`.

    Returns
    -------
    feature_names_in : ndarray of str or `None`
        Feature names in.
    """
    _check_feature_names_in(
        estimator, input_features, generate_names=False
    )
    estimator_name = estimator.__class__.__name__.lower()
    return np.asarray(
        [
            f"{estimator_name}{i}"
            for i in range(n_features_out)
        ],
        dtype=object,
    )



[docs]
class PositiveSpectrumWarning(UserWarning):
    """Warning raised when the eigenvalues of a PSD matrix have issues

    This warning is typically raised by ``_check_psd_eigenvalues`` when the
    eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix
    (kernel) present significant negative eigenvalues, or bad conditioning i.e.
    very small non-zero eigenvalues compared to the largest eigenvalue.

    .. versionadded:: 0.22
    """




[docs]
class DataConversionWarning(UserWarning):
    """Warning used to notify implicit data conversions happening in the code.

    This warning occurs when some input data needs to be converted or
    interpreted in a way that may not match the user's expectations. For
    example, this warning may occur when the user:

    - passes an integer array to a function that expects float input and will
      convert the input;
    - requests a non-copying operation, but a copy is required to meet the
      implementation's data-type expectations;
    - passes an input whose shape can be interpreted ambiguously.

    .. versionchanged:: 0.18
       Moved from ``sklearn.utils.validation``.
    """