# Source code for stepsel.binning.helper

"""
Helper functions for binning. Smaller functions that are used in multiple binning functions.
"""
import numpy as np
from pandas import Series
from numpy.typing import ArrayLike
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


def get_tree_cut_points(clf: DecisionTreeRegressor | DecisionTreeClassifier, feature_names: ArrayLike | None = None):
    """Get the cut points of a decision tree.

    Parameters
    ----------
    clf : DecisionTreeRegressor or DecisionTreeClassifier
        The decision tree to get the cut points from. Its ``tree_.feature``
        and ``tree_.threshold`` arrays are read.
    feature_names : array-like, optional
        The feature names of the decision tree, looked up by feature position.
        Must be a list, numpy array or pandas Series.
        If None, the feature indices themselves are used as keys.

    Returns
    -------
    feature_cut_points : dict
        A dictionary with the feature names as keys and the sorted, unique
        cut points of that feature as values. Only features that are used
        in at least one split are present.

    Raises
    ------
    ValueError
        If ``clf`` is not a decision tree or ``feature_names`` is neither
        None nor a list / numpy array / pandas Series.
    """
    # Test if clf is a decision tree
    if not isinstance(clf, (DecisionTreeRegressor, DecisionTreeClassifier)):
        raise ValueError("clf should be a DecisionTreeRegressor or DecisionTreeClassifier.")

    # Test if feature_names is None or array-like
    if feature_names is not None and not isinstance(feature_names, (list, np.ndarray, Series)):
        raise ValueError("feature_names should be None or array-like.")

    features = clf.tree_.feature
    cut_points = clf.tree_.threshold

    # Materialise the names as a list so the lookup below is always positional.
    # Indexing a pandas Series with an integer is label-based and would pick the
    # wrong name (or fail) when the Series does not carry a default index.
    names = None if feature_names is None else list(feature_names)

    # Negative numbers are nodes without split. dict.fromkeys de-duplicates the
    # split features while keeping their first-seen order, so each feature is
    # processed once instead of once per split node.
    split_features = dict.fromkeys(features[features >= 0])

    feature_cut_points = {}
    for f in split_features:
        key = f if names is None else names[f]
        # np.unique already returns its result sorted.
        feature_cut_points[key] = np.unique(cut_points[features == f])

    return feature_cut_points
def bin_values(data: ArrayLike, thresholds: ArrayLike, right=True) -> np.ndarray:
    """Bin data into bins based on thresholds.

    Parameters
    ----------
    data : array-like
        The input values to be binned. May have any number of dimensions;
        the result has the same shape.
    thresholds : array-like
        The thresholds to use for binning. They are sorted from smallest
        to largest before use.
    right : bool, optional
        Whether the intervals should be closed on the right (default) or left.

    Returns
    -------
    binned_values : np.ndarray
        The binned values as an array of strings.
        String format is "(a, b]" if right=True, "[a, b)" if right=False.
        The outermost bins are labelled with "-Inf" and "Inf".

    Raises
    ------
    ValueError
        If ``thresholds`` is empty.

    TODO
    ----
    * Add option to return pd.Categorical ordered by thresholds.
    """
    thresholds = np.asarray(thresholds)

    # Raise error if thresholds are empty
    if thresholds.size == 0:
        raise ValueError("Thresholds should not be empty.")

    # Sort the thresholds
    thresholds = np.sort(thresholds)

    # Bin the data; every value gets an index in 0..len(thresholds)
    # NOTE(review): np.digitize appears to place NaN values in the last bin, so
    # they would be labelled as the open upper interval - confirm this is intended.
    bins = np.digitize(data, thresholds, right=right)

    # Brackets of the finite interval ends depend on which side is closed
    if right:
        lower, upper = "(", "]"
    else:
        lower, upper = "[", ")"

    # One label per bin index: open lower tail, interior intervals, open upper tail
    bin_labels = [f"(-Inf, {thresholds[0]}{upper}"]
    bin_labels.extend(
        f"{lower}{low}, {high}{upper}"
        for low, high in zip(thresholds[:-1], thresholds[1:])
    )
    bin_labels.append(f"{lower}{thresholds[-1]}, Inf)")

    # Vectorised lookup: works for any data shape and keeps a string dtype
    # even when data is empty.
    return np.array(bin_labels)[bins]