"""
Helper functions for binning. Smaller functions that are used in multiple binning functions.
"""
import numpy as np
from pandas import Series
from numpy.typing import ArrayLike
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
[docs]
def get_tree_cut_points(clf: DecisionTreeRegressor | DecisionTreeClassifier, feature_names: ArrayLike | None = None):
"""Get the cut points of a decision tree.
Parameters
----------
clf : DecisionTreeRegressor or DecisionTreeClassifier
The decision tree to get the cut points from.
feature_names : array-like, optional
The feature names of the decision tree. If None, the features are assumed to be integers.
Returns
-------
feature_cut_points : dict
A dictionary with the feature names as keys and the cut points as values.
"""
# Test if clf is a decision tree
if not isinstance(clf, DecisionTreeRegressor) and not isinstance(clf, DecisionTreeClassifier):
raise ValueError("clf should be a DecisionTreeRegressor or DecisionTreeClassifier.")
# Test if feature_names is None or array-like
if feature_names is not None and not isinstance(feature_names, (list, np.ndarray, Series)):
raise ValueError("feature_names should be None or array-like.")
features = clf.tree_.feature
cut_points = clf.tree_.threshold
if feature_names is not None:
feature_cut_points = {feature_names[f]: np.sort(np.unique(cut_points[np.where(features == f)])) for f in features if f >= 0} # negative numbers are nodes without split
else:
feature_cut_points = {f: np.sort(np.unique(cut_points[np.where(features == f)])) for f in features if f >= 0} # negative numbers are nodes without split
return feature_cut_points
[docs]
def bin_values(data: ArrayLike, thresholds: ArrayLike, right=True) -> np.ndarray:
"""Bin data into bins based on thresholds.
Parameters
----------
data : array-like
The input values to be binned.
thresholds : array-like
The thresholds to use for binning, ordered from smallest to largest.
right : bool, optional
Whether the intervals should be closed on the right (default) or left.
Returns
-------
binned_values : array-like
The binned values. String format is "(a, b]" if right=True, "[a, b)" if right=False.
TODO
----
* Add option to return pd.Categorical ordered by thresholds.
"""
# Raise error if thresholds are empty
if len(thresholds) == 0:
raise ValueError("Thresholds should not be empty.")
# Sort the thresholds
thresholds = np.sort(thresholds)
# Bin the data
bins = np.digitize(data, thresholds, right=right)
# Create string variable based on the bin intervals with brackets based on right parameter
if right:
bin_labels = [f"(-Inf, {thresholds[0]}]"] + [f'({thresholds[i-1]}, {thresholds[i]}]' for i in range(1, len(thresholds))] + [f"({thresholds[-1]}, Inf)"]
else:
bin_labels = [f"(-Inf, {thresholds[0]})"] + [f'[{thresholds[i-1]}, {thresholds[i]})' for i in range(1, len(thresholds))] + [f"[{thresholds[-1]}, Inf)"]
result = np.array([bin_labels[i] for i in bins])
return result