Skip to content

Summary module

Dataset summary and batch extraction helpers.

DatasetSummary dataclass

Describe a local hyperspectral dataset.

Parameters:

Name Type Description Default
path str

Dataset path.

required
exists bool

Whether the path exists.

required
sensor Optional[str]

Sensor name used to read the dataset.

None
variables list[str]

Data variable names.

<factory>
selected_variable Optional[str]

Selected data variable.

None
crs Optional[str]

Dataset CRS when discoverable.

None
bounds Optional[tuple[float, float, float, float]]

Dataset bounds when discoverable.

None
wavelength_count int

Number of spectral coordinates.

0
wavelength_min Optional[float]

Minimum wavelength or band coordinate.

None
wavelength_max Optional[float]

Maximum wavelength or band coordinate.

None
dimensions dict[str, int]

Dataset dimensions.

<factory>
default_rgb list[float]

Default RGB wavelengths from the sensor registry.

<factory>
warnings list[str]

Non-fatal summary warnings.

<factory>
Source code in hypercoast/summary.py
@dataclass
class DatasetSummary:
    """Metadata record describing a local hyperspectral dataset.

    Only ``path`` and ``exists`` must be supplied; every other field starts
    from an empty or ``None`` default.

    Attributes:
        path: Dataset path.
        exists: Whether the path exists.
        sensor: Sensor name used to read the dataset.
        variables: Data variable names.
        selected_variable: Selected data variable.
        crs: Dataset CRS when discoverable.
        bounds: Dataset bounds when discoverable.
        wavelength_count: Number of spectral coordinates.
        wavelength_min: Minimum wavelength or band coordinate.
        wavelength_max: Maximum wavelength or band coordinate.
        dimensions: Dataset dimensions.
        default_rgb: Default RGB wavelengths from the sensor registry.
        warnings: Non-fatal summary warnings.
    """

    # Required identification of the dataset on disk.
    path: str
    exists: bool

    # Sensor and data-variable selection.
    sensor: Optional[str] = None
    variables: list[str] = field(default_factory=list)
    selected_variable: Optional[str] = None

    # Spatial reference information.
    crs: Optional[str] = None
    bounds: Optional[tuple[float, float, float, float]] = None

    # Extent of the spectral axis.
    wavelength_count: int = 0
    wavelength_min: Optional[float] = None
    wavelength_max: Optional[float] = None

    # Mutable containers use factories so instances never share state.
    dimensions: dict[str, int] = field(default_factory=dict)
    default_rgb: list[float] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)

    def as_dict(self) -> dict[str, Any]:
        """Convert the summary into a plain dictionary.

        The result is intended to be JSON-serializable.

        Returns:
            dict: Summary fields keyed by field name.
        """
        summary_fields = asdict(self)
        return summary_fields

as_dict(self)

Return a JSON-serializable summary dictionary.

Returns:

Type Description
dict

Summary fields.

Source code in hypercoast/summary.py
def as_dict(self) -> dict[str, Any]:
    """Convert the summary into a plain dictionary.

    The result is intended to be JSON-serializable.

    Returns:
        dict: Summary fields keyed by field name.
    """
    summary_fields = asdict(self)
    return summary_fields

extract_spectra_to_csv(sensor, path, points_csv, output, x_column='x', y_column='y', crs='EPSG:4326')

Extract spectra for CSV point coordinates.

Parameters:

Name Type Description Default
sensor str

Sensor name or alias.

required
path str | Path

Input dataset path.

required
points_csv str | Path

CSV file containing point coordinates.

required
output str | Path

Path of the long-form output CSV (one row per point and wavelength).

required
x_column str

Name of the column holding X (or longitude) values.

'x'
y_column str

Name of the column holding Y (or latitude) values.

'y'
crs str

Coordinate reference system for point coordinates.

'EPSG:4326'

Returns:

Type Description
str

Output CSV path.

Source code in hypercoast/summary.py
def extract_spectra_to_csv(
    sensor: str,
    path: str | Path,
    points_csv: str | Path,
    output: str | Path,
    x_column: str = "x",
    y_column: str = "y",
    crs: str = "EPSG:4326",
) -> str:
    """Extract spectra for CSV point coordinates.

    Each point in ``points_csv`` yields one output row per wavelength sample,
    so the result is a long-form table with the columns ``feature_id``,
    ``x``, ``y``, ``crs``, ``wavelength``, ``value``, ``layer`` and
    ``variable``. The header is written even when no rows are produced.

    Args:
        sensor: Sensor name or alias.
        path: Input dataset path.
        points_csv: CSV file containing point coordinates.
        output: Output long-form CSV path. Missing parent directories are
            created.
        x_column: X or longitude column.
        y_column: Y or latitude column.
        crs: Coordinate reference system for point coordinates.

    Returns:
        str: Output CSV path.

    Raises:
        ValueError: If ``points_csv`` lacks ``x_column`` or ``y_column``.
    """
    from .registry import extract_sensor

    points = pd.read_csv(points_csv)
    if x_column not in points.columns or y_column not in points.columns:
        raise ValueError(f"Point CSV must contain '{x_column}' and '{y_column}'.")

    # Fixed output schema: passing it to the DataFrame guarantees a header
    # row even when there are no points or no spectral samples.
    columns = [
        "feature_id",
        "x",
        "y",
        "crs",
        "wavelength",
        "value",
        "layer",
        "variable",
    ]
    layer = str(path)  # loop-invariant, computed once

    dataset = read_sensor(sensor, path)
    rows: list[dict[str, Any]] = []
    try:
        # ``feature_id`` is the DataFrame index label of the point row.
        for feature_id, row in points.iterrows():
            x_value = row[x_column]
            y_value = row[y_column]
            # NOTE(review): _to_lon_lat presumably converts from ``crs`` to
            # geographic lon/lat -- confirm against its definition.
            lon, lat = _to_lon_lat(float(x_value), float(y_value), crs)
            spectrum = extract_sensor(sensor, dataset, lat=lat, lon=lon)
            wavelengths, values = _spectrum_values(spectrum)
            variable_name = getattr(spectrum, "name", None)
            rows.extend(
                {
                    "feature_id": feature_id,
                    "x": x_value,
                    "y": y_value,
                    "crs": crs,
                    "wavelength": wavelength,
                    "value": value,
                    "layer": layer,
                    "variable": variable_name,
                }
                for wavelength, value in zip(wavelengths, values)
            )
    finally:
        # Release the dataset even if extraction fails part-way through.
        close = getattr(dataset, "close", None)
        if callable(close):
            close()

    output_path = Path(output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(rows, columns=columns).to_csv(output_path, index=False)
    return str(output_path)

subset_dataset(path, output, bbox, sensor=None, variable=None)

Subset a rectilinear local dataset by bounding box and write NetCDF.

Parameters:

Name Type Description Default
path str | Path

Input dataset path.

required
output str | Path

Output NetCDF path.

required
bbox tuple[float, float, float, float]

Bounding box as (xmin, ymin, xmax, ymax).

required
sensor Optional[str]

Optional sensor name or alias.

None
variable Optional[str]

Optional data variable to keep.

None

Returns:

Type Description
str

Output path.

Source code in hypercoast/summary.py
def subset_dataset(
    path: str | Path,
    output: str | Path,
    bbox: tuple[float, float, float, float],
    sensor: Optional[str] = None,
    variable: Optional[str] = None,
) -> str:
    """Subset a rectilinear local dataset by bounding box and write NetCDF.

    The dataset is opened with ``read_sensor`` when ``sensor`` is given and
    with ``xr.open_dataset`` otherwise. The dataset opened here is always
    closed before returning, whether or not the subset succeeds.

    Args:
        path: Input dataset path.
        output: Output NetCDF path. Missing parent directories are created.
        bbox: Bounding box as ``(xmin, ymin, xmax, ymax)``.
        sensor: Optional sensor name or alias.
        variable: Optional data variable to keep.

    Returns:
        str: Output path.

    Raises:
        ValueError: If ``variable`` is given but is not the variable that
            ``_select_variable`` resolves to.
    """
    # Keep a dedicated reference to the dataset opened here. The subset is
    # bound to a separate name so the ``finally`` block closes the original
    # rather than a derived object.
    source = read_sensor(sensor, path) if sensor else xr.open_dataset(path)
    try:
        selected = _select_variable(source, variable)
        if variable and selected != variable:
            raise ValueError(
                f"Variable {variable!r} not found. "
                f"Available: {list(source.data_vars)}"
            )

        # Past the check above, a truthy ``variable`` equals ``selected``.
        subset = source[[selected]] if variable else source

        xmin, ymin, xmax, ymax = bbox
        subset = _subset_by_bbox(subset, xmin, ymin, xmax, ymax)

        output_path = Path(output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        subset.to_netcdf(output_path)
        return str(output_path)
    finally:
        close = getattr(source, "close", None)
        if callable(close):
            close()

summarize_dataset(path, sensor=None, variable=None)

Summarize a local dataset.

Parameters:

Name Type Description Default
path str | Path

Local dataset path.

required
sensor Optional[str]

Optional sensor name or alias.

None
variable Optional[str]

Optional selected data variable.

None

Returns:

Type Description
DatasetSummary

Dataset metadata and warnings.

Source code in hypercoast/summary.py
def summarize_dataset(
    path: str | Path,
    sensor: Optional[str] = None,
    variable: Optional[str] = None,
) -> DatasetSummary:
    """Build a metadata summary for a local dataset.

    A missing file or a dataset that cannot be opened does not raise; the
    problem is recorded in ``DatasetSummary.warnings`` and the partially
    filled summary is returned.

    Args:
        path: Local dataset path.
        sensor: Optional sensor name or alias.
        variable: Optional selected data variable.

    Returns:
        DatasetSummary: Dataset metadata and warnings.
    """
    dataset_path = Path(path)
    summary = DatasetSummary(path=str(dataset_path), exists=dataset_path.exists())

    # Guard: nothing more can be learned about a path that is not there.
    if not summary.exists:
        summary.warnings.append(f"File not found: {dataset_path}")
        return summary

    # Sensor details come from the registry and do not need the file open.
    if sensor:
        sensor_handler = get_sensor(sensor)
        summary.sensor = sensor_handler.name
        summary.default_rgb = list(map(float, sensor_handler.default_rgb))

    # Opening is best-effort: any failure becomes a warning.
    try:
        if sensor:
            ds = read_sensor(sensor, dataset_path)
        else:
            ds = xr.open_dataset(dataset_path)
    except Exception as exc:
        summary.warnings.append(f"Could not open dataset: {exc}")
        return summary

    try:
        summary.variables = list(ds.data_vars)
        summary.dimensions = {name: int(size) for name, size in ds.sizes.items()}

        chosen = _select_variable(ds, variable)
        summary.selected_variable = chosen
        summary.crs = _dataset_crs(ds)
        summary.bounds = _dataset_bounds(ds)

        spectral = _wavelength_values(ds, chosen)
        has_spectral = spectral is not None and spectral.size > 0
        if has_spectral:
            # The count covers every coordinate; min/max use finite ones only.
            finite_only = spectral[np.isfinite(spectral)]
            summary.wavelength_count = int(spectral.size)
            if finite_only.size > 0:
                summary.wavelength_min = float(np.nanmin(finite_only))
                summary.wavelength_max = float(np.nanmax(finite_only))
    finally:
        closer = getattr(ds, "close", None)
        if callable(closer):
            closer()

    # A requested variable that did not end up selected is only a warning.
    if variable and summary.selected_variable != variable:
        summary.warnings.append(f"Variable not found or not selectable: {variable}")
    return summary