Source code for coonfit.helper
"""
Utility functions for the coonfit regression workflow.
This module provides supporting functions used during predictor validation and
data quality assessment. It includes tools for detecting rank-deficient
predictor matrices (which would cause the normal equations to be singular) and
for counting the number of usable pixels within a boolean selector mask.
"""
from __future__ import annotations
import numpy as np
[docs]
def check_rank_deficiency(array: np.ndarray, return_by_issue_type: bool = False,
                          ) -> dict[int, str] | dict[str, list[int]]:
    """Detect rank deficiency and report the columns responsible for it.

    The rank of ``array`` is obtained from :func:`numpy.linalg.matrix_rank`.
    When it equals the number of columns, an empty dictionary is returned,
    independent of ``return_by_issue_type``. Otherwise every column is
    classified:

    * a column whose entries all compare equal to zero is reported as
      ``"All zero column"``;
    * any other column whose removal leaves the rank unchanged is reported
      as ``"Linear dependent column"``.

    Parameters
    ----------
    array : NDArray
        Two-dimensional matrix to check for rank deficiency (its ``shape``
        is unpacked into exactly two values).
    return_by_issue_type : bool, optional
        If True, the result groups column indices by issue type:
        ``{"linear_dependent": [...], "all_zero": [...]}``.
        If False (default), the result maps each problematic column index
        to its issue description.

    Returns
    -------
    dict[int, str] or dict[str, list[int]]
        Problematic columns and their issues. An empty dictionary means no
        rank deficiency was detected.

    See Also
    --------
    :func:`~coonfit.parallel.get_XT_X_dependency` : Check predictors for linear dependency.
    """
    _, n_cols = array.shape
    full_rank = np.linalg.matrix_rank(array)

    # Full column rank: nothing to report.
    if full_rank == n_cols:
        return {}

    zero_columns = []
    dependent_columns = []
    for idx in range(n_cols):
        if np.all(array[:, idx] == 0):
            zero_columns.append(idx)
            continue

        # A column that is a linear combination of the remaining ones can be
        # dropped without lowering the rank of the matrix.
        reduced = np.delete(array, idx, axis=1)
        if np.linalg.matrix_rank(reduced) == full_rank:
            dependent_columns.append(idx)

    if return_by_issue_type:
        return {"linear_dependent": dependent_columns, "all_zero": zero_columns}

    # Linearly dependent columns are listed first, then all-zero columns.
    report = {idx: "Linear dependent column" for idx in dependent_columns}
    report.update({idx: "All zero column" for idx in zero_columns})
    return report
[docs]
def usable_pixels_info(all_pixels: int, data_pixels: int) -> None:
    """Report on stdout which share of the pixels is usable.

    The share is ``100 * data_pixels / all_pixels`` rounded to two decimals.
    With plain Python integers, ``all_pixels == 0`` raises
    ``ZeroDivisionError`` before anything is printed.

    Parameters
    ----------
    all_pixels : int
        Total number of pixels in the dataset
    data_pixels : int
        Number of pixels that contain usable data

    See Also
    --------
    :func:`usable_pixels_count` : Count the number of usable pixels.

    Examples
    --------
    >>> usable_pixels_info(1000, 750)
    Of all_pixels=1000 there are data_pixels=750, i.e. 75.0% are usable
    """
    usable_percent = round(100 * data_pixels / all_pixels, 2)
    message = (
        f"Of {all_pixels=} there are {data_pixels=}, i.e. "
        f"{usable_percent}% are usable"
    )
    print(message)
[docs]
def usable_pixels_count(selector: np.ndarray) -> int:
    """Count the number of usable pixels determined by the selector.

    Parameters
    ----------
    selector : NDArray
        Boolean array where True indicates a usable pixel and False
        indicates a pixel to be excluded. Arrays of any dimensionality are
        accepted; all elements are counted. Non-boolean input (e.g. a 0/1
        integer mask) is handled by counting its non-zero elements.

    Returns
    -------
    int
        Number of True (non-zero) values in the selector array, i.e. the
        count of usable pixels. Returns 0 for an empty selector or one
        without any True value.
        Uses :func:`numpy.count_nonzero` to count occurrences.

    See Also
    --------
    :func:`usable_pixels_info` : Print the fraction of usable pixels.

    Examples
    --------
    >>> selector = np.array([True, True, False, True, False])
    >>> usable_pixels_count(selector)
    3
    """
    # count_nonzero counts truthy elements directly. Unlike indexing the
    # counts of np.unique with its values, it does not depend on the selector
    # having a boolean dtype and needs no sort or exception handling for the
    # "no True present" case.
    return int(np.count_nonzero(selector))