# Source code for geoprior.scripts.summarize_hotspots

# SPDX-License-Identifier: Apache-2.0
# GeoPrior-v3 - https://github.com/earthai-tech/geoprior-v3
# Copyright (c) 2026-present
# Author: LKouadio <https://lkouadio.com>

r"""Summarise hotspot point clouds.

- subsidence level (mm/yr): `value`
- anomaly / delta metric (mm/yr): `metric_value`

Input
-----
Hotspot CSV produced by the Fig.6 spatial script
(e.g., make_figure6_spatial_forecasts.py) with columns:

  city, panel, kind, year, coord_x, coord_y, value,
  hotspot_mode, hotspot_quantile, metric_value,
  baseline_value, threshold

Only these are required:
  city, year, kind, value, metric_value

Output
------
A tidy summary grouped by (city, year, kind) with:

  n_hotspots
  value_min, value_mean, value_max
  metric_min, metric_mean, metric_max
  baseline_min, baseline_mean, baseline_max   (if present)
  threshold_min, threshold_max                (if present)

API conventions
---------------
- Use scripts.utils.resolve_out_out() for tables/artifacts.
- Use cfg.OUT_DIR as default output location.
- Expose a stable main(argv) wrapper.
- CLI has a program name (hyphenated).

Examples
--------
Write to scripts/out/ by default:
  python nat.com/summarize_hotspots.py \
    --hotspot-csv results/figs/fig6_hotspot_points.csv

Explicit output (relative -> scripts/out/):
  python nat.com/summarize_hotspots.py \
    --hotspot-csv results/figs/fig6_hotspot_points.csv \
    --out fig6_hotspot_summary.csv
"""

from __future__ import annotations

import argparse

# from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd

# from . import config as cfg
from . import utils


# ---------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------
def _parse_args(
    argv: list[str] | None, *, prog: str | None = None
) -> argparse.Namespace:
    p = argparse.ArgumentParser(
        prog=prog or "summarize-hotspots",
        description="Summarise hotspot clouds (stats by city/year/kind).",
    )

    p.add_argument(
        "--hotspot-csv",
        type=str,
        required=True,
        help="Hotspot CSV (points) from Fig.6 script.",
    )

    # API convention:
    # - tables go under scripts/out/ by default.
    # - if --out is relative, it resolves under cfg.OUT_DIR.
    p.add_argument(
        "--out",
        "-o",
        type=str,
        default="fig6_hotspot_summary.csv",
        help="Output CSV name/path (relative -> scripts/out/).",
    )

    # Optional: also print the pretty table
    p.add_argument(
        "--quiet",
        type=str,
        default="false",
        help="Do not print table to console (true/false).",
    )

    return p.parse_args(argv)


# ---------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------
def _require_columns(
    df: pd.DataFrame, req: list[str]
) -> None:
    """Raise ``KeyError`` if any column named in ``req`` is absent from ``df``.

    The error message lists the absent names in the order they
    appear in ``req``.
    """
    available = df.columns
    absent = []
    for name in req:
        if name not in available:
            absent.append(name)

    if not absent:
        return
    raise KeyError(f"Missing required columns: {absent}")


def _to_numeric_inplace(
    df: pd.DataFrame, cols: list[str]
) -> None:
    """Coerce the listed columns of ``df`` to numeric dtype, in place.

    Names in ``cols`` that are not columns of ``df`` are skipped.
    Values that cannot be parsed become NaN (``errors="coerce"``).
    """
    existing = (name for name in cols if name in df.columns)
    for name in existing:
        converted = pd.to_numeric(df[name], errors="coerce")
        df[name] = converted


def _nan_stats(
    values: pd.Series, prefix: str, *, with_mean: bool = True
) -> dict[str, float]:
    """Return NaN-aware min/max (and optionally mean) of ``values``.

    Keys are ``{prefix}_min``, ``{prefix}_max`` and, when
    ``with_mean`` is true, ``{prefix}_mean`` (in that order).
    Non-numeric entries are coerced to NaN. When there is no usable
    value at all (empty or all-NaN input) every statistic is NaN and
    the NumPy reductions are not called, so no RuntimeWarning is
    emitted and no ValueError is raised on empty input.
    """
    arr = pd.to_numeric(values, errors="coerce").to_numpy(
        dtype=float
    )
    has_data = arr.size > 0 and not bool(np.isnan(arr).all())
    nan = float("nan")

    stats: dict[str, float] = {
        f"{prefix}_min": float(np.nanmin(arr)) if has_data else nan,
        f"{prefix}_max": float(np.nanmax(arr)) if has_data else nan,
    }
    if with_mean:
        stats[f"{prefix}_mean"] = (
            float(np.nanmean(arr)) if has_data else nan
        )
    return stats


def _summary_per_group(g: pd.DataFrame) -> pd.Series:
    """
    Compute summary stats for one (city, year, kind) group.

    Parameters
    ----------
    g
        Rows of a single group. Must contain ``value`` and
        ``metric_value``; ``baseline_value`` and ``threshold`` are
        summarised only when those columns are present.

    Returns
    -------
    pd.Series
        ``n_hotspots`` (row count, NaN rows included) followed by
        ``value_*`` and ``metric_*`` min/max/mean, then
        ``baseline_*`` min/max/mean and ``threshold_*`` min/max when
        available. Statistics ignore NaN; a column with no valid
        value yields NaN for its statistics.
    """
    out: dict[str, Any] = {"n_hotspots": int(len(g))}

    # Hotspot subsidence (mm/yr)
    out.update(_nan_stats(g["value"], "value"))
    # Hotspot anomaly metric (mm/yr)
    out.update(_nan_stats(g["metric_value"], "metric"))

    if "baseline_value" in g.columns:
        out.update(_nan_stats(g["baseline_value"], "baseline"))

    if "threshold" in g.columns:
        # thresholds are typically constant per group,
        # but we keep min/max for robustness.
        out.update(
            _nan_stats(g["threshold"], "threshold", with_mean=False)
        )

    return pd.Series(out)


def summarize_hotspots(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize hotspot points into a group-level table.

    Groups: (city, year, kind)

    Parameters
    ----------
    df
        Hotspot points. Requires the columns ``city``, ``year``,
        ``kind``, ``value`` and ``metric_value``; ``baseline_value``
        and ``threshold`` are summarised when present. The input
        frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per group with stable column ordering: the group
        keys, ``n_hotspots`` (integer), the ``value_*`` and
        ``metric_*`` statistics, then any optional statistics.
        An input with no usable rows yields an empty frame with the
        same columns.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    _require_columns(
        df,
        ["city", "year", "kind", "value", "metric_value"],
    )

    # Normalize city names to canonical form (work on a copy so the
    # caller's frame is left untouched).
    df = df.copy()
    df["city"] = (
        df["city"].astype(str).map(utils.canonical_city)
    )

    # Types: rows whose year cannot be parsed are dropped.
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df = df.dropna(subset=["year"]).copy()
    df["year"] = df["year"].astype(int)

    stat_cols = [
        "value",
        "metric_value",
        "baseline_value",
        "threshold",
    ]
    _to_numeric_inplace(df, stat_cols)

    # Stable column ordering.
    base = [
        "city",
        "year",
        "kind",
        "n_hotspots",
        "value_min",
        "value_mean",
        "value_max",
        "metric_min",
        "metric_mean",
        "metric_max",
    ]

    if df.empty:
        # With zero groups there is nothing to aggregate; return an
        # empty table that still carries the expected header
        # (optional columns mirror what _summary_per_group emits).
        optional: list[str] = []
        if "baseline_value" in df.columns:
            optional += [
                "baseline_min",
                "baseline_max",
                "baseline_mean",
            ]
        if "threshold" in df.columns:
            optional += ["threshold_min", "threshold_max"]
        return pd.DataFrame(columns=base + optional)

    # Aggregate only the statistic columns: the per-group function
    # never reads the grouping keys, and applying over them is
    # deprecated in recent pandas releases.
    present = [c for c in stat_cols if c in df.columns]
    grouped = df.groupby(
        ["city", "year", "kind"], dropna=False
    )[present]
    out = grouped.apply(_summary_per_group).reset_index()

    # The per-group Series mixes an int count with float statistics
    # and is upcast to float; restore the integer count.
    out["n_hotspots"] = out["n_hotspots"].astype(int)

    extra = [c for c in out.columns if c not in base]
    return out[base + extra]
# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
def summarize_hotspots_main(
    argv: list[str] | None = None, *, prog: str | None = None
) -> None:
    """Run the hotspot summary CLI: read, summarise, write, report.

    Parameters
    ----------
    argv
        Command-line arguments; ``None`` lets argparse read
        ``sys.argv``.
    prog
        Optional program name for usage/help output.

    Raises
    ------
    FileNotFoundError
        If the ``--hotspot-csv`` path does not exist.
    """
    args = _parse_args(argv, prog=prog)

    src = utils.as_path(args.hotspot_csv)
    if not src.exists():
        raise FileNotFoundError(str(src))

    df = pd.read_csv(src)
    summary = summarize_hotspots(df)

    # Default: write to scripts/out/
    # (resolution of relative names is delegated to utils).
    out_path = utils.resolve_out_out(args.out)
    utils.ensure_dir(out_path.parent)
    summary.to_csv(out_path, index=False)

    quiet = utils.str_to_bool(args.quiet, default=False)
    if not quiet:
        # Scope the display tweaks to this print so that callers
        # invoking the CLI programmatically keep their own pandas
        # display options.
        with pd.option_context(
            "display.max_columns", None, "display.width", 120
        ):
            print(summary.to_string(index=False))

    print(f"\n[OK] summary -> {out_path}")
def main(
    argv: list[str] | None = None, *, prog: str | None = None
) -> None:
    """Stable entry point that forwards to ``summarize_hotspots_main``.

    Parameters
    ----------
    argv
        Command-line arguments, passed through unchanged.
    prog
        Optional program name, passed through unchanged.
    """
    summarize_hotspots_main(argv, prog=prog)
if __name__ == "__main__":
    # Script execution: run the CLI with its default arguments.
    main()