Source code for stepsel.modeling.prep.model_matrix

import pandas as pd
import numpy as np
from stepsel.modeling.prep import (
    parse_model_formula, recognize_variable_types, get_interaction_type,
    interaction_categorical_numerical, interaction_categorical_categorical,
    interaction_numerical_numerical
)


def prepare_model_matrix(
    formula: str,
    data: pd.DataFrame,
    intercept: bool = True,
    drop_first: bool = True,
    omit_left_side_variables: bool = False,
):
    """
    Prepare a model matrix based on a formula and a data set.

    TODO: If intercept = False, keep all the levels of the first categorical variable.

    Parameters
    ----------
    formula : str
        The formula for the model.
    data : pandas.DataFrame
        The data set.
    intercept : bool, optional
        Whether to include an intercept in the model matrix. Default is True.
    drop_first : bool, optional
        Whether to drop the first level of each categorical variable. Default is True.
    omit_left_side_variables : bool, optional
        Whether to omit the left side variables from the output. Default is False.
        If True, the function will return only the model matrix and the feature IDs.

    Returns
    -------
    y : pandas.Series
        The response variable, selected as ``data[left_side_variables]``.
        If omit_left_side_variables is True, the function won't return y.
    model_matrix : pandas.DataFrame
        The model matrix.
    feature_ids : list
        The feature IDs.

    Raises
    ------
    ValueError
        If interaction type is not supported.

    Notes
    -----
    The function will create a model matrix based on the formula and the data set.
    Categories will be dummy-encoded.
    Interaction terms will be created and dummy-encoded if necessary.
    The feature IDs will be a list of strings of the variable names corresponding
    to the columns of the model matrix.
    The intercept column is built on the same row index as the rest of the model
    matrix, so data sets with a non-default index (e.g. filtered or shuffled
    frames) are handled correctly.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from stepsel.modeling.prep import prepare_model_matrix
    >>> data = pd.DataFrame({"y": np.random.normal(size=100),
    ...                      "x1": np.random.normal(size=100),
    ...                      "x2": np.random.choice(["A", "B", "C"], size=100),
    ...                      "x3": np.random.choice(["A", "B", "C"], size=100)})
    >>> data[["x2", "x3"]] = data[["x2", "x3"]].astype("category")
    >>> y, model_matrix, feature_ids = prepare_model_matrix("y ~ x1 + x2 + x3 + x1*x2 + x1*x3", data)
    """
    # Parse formula
    left_side_variables, interaction_variables, non_interaction_variables = parse_model_formula(formula)

    # Recognize variable types of parsed formula
    variables_by_type = recognize_variable_types(data, interaction_variables, non_interaction_variables)
    non_interaction_numerical_variables = variables_by_type["non_interaction_numerical_variables"]
    non_interaction_categorical_variables = variables_by_type["non_interaction_categorical_variables"]
    interaction_variables = variables_by_type["interaction_variables"]
    interaction_numerical_variables = variables_by_type["interaction_numerical_variables"]
    interaction_categorical_variables = variables_by_type["interaction_categorical_variables"]

    # Create model matrix: non-interaction numerical variables
    model_matrix = pd.DataFrame()
    feature_ids = []
    for variable in non_interaction_numerical_variables:
        model_matrix = pd.concat([model_matrix, data[variable]], axis=1)
        feature_ids.append(variable)

    # Create model matrix: non-interaction categorical variables
    for variable in non_interaction_categorical_variables:
        # `columns=` is only meaningful for DataFrame input, so it is not passed
        # when encoding a single Series.
        dummies = pd.get_dummies(data[variable], prefix=variable, prefix_sep=": ",
                                 drop_first=drop_first, dtype=int)
        model_matrix = pd.concat([model_matrix, dummies], axis=1)
        # One feature ID per newly added dummy column
        feature_ids.extend([variable] * (model_matrix.shape[1] - len(feature_ids)))

    # Create model matrix: interaction variables
    for interaction in interaction_variables:
        # Type of interaction; validated before touching the data so that an
        # unsupported type always surfaces as ValueError.
        interaction_type = get_interaction_type(interaction, interaction_numerical_variables,
                                                interaction_categorical_variables)
        if interaction_type not in ("numerical_categorical", "categorical_numerical",
                                    "categorical_categorical", "numerical_numerical"):
            raise ValueError(f"""Interaction type not recognized. Interaction: {interaction} Interaction type: {interaction_type} """)

        interaction_split = [x.strip() for x in interaction.split("*")]
        series1 = data[interaction_split[0]]
        series2 = data[interaction_split[1]]

        if interaction_type == "categorical_categorical":
            series_interaction = interaction_categorical_categorical(series1, series2)
            new_columns = pd.get_dummies(series_interaction, prefix=series_interaction.name,
                                         prefix_sep=": ", drop_first=drop_first, dtype=int)
        elif interaction_type == "numerical_numerical":
            new_columns = interaction_numerical_numerical(series1, series2)
        else:
            new_columns = interaction_categorical_numerical(series1, series2)

        model_matrix = pd.concat([model_matrix, new_columns], axis=1)
        feature_ids.extend([interaction] * (model_matrix.shape[1] - len(feature_ids)))

    # Add intercept
    if intercept:
        # pd.concat(axis=1) aligns on the index, so the intercept must carry the
        # same row labels as the other columns. Fall back to the data's index
        # when no other column exists (intercept-only model).
        row_index = model_matrix.index if model_matrix.shape[1] else data.index
        intercept_column = pd.Series(1, index=row_index, name="Intercept")
        model_matrix = pd.concat([intercept_column, model_matrix], axis=1)
        feature_ids.insert(0, "Intercept")

    # Target variable
    if omit_left_side_variables:
        return model_matrix, feature_ids

    y = data[left_side_variables]
    return y, model_matrix, feature_ids
def adjust_model_matrix(model_matrices: list, adjusted_coeffs: dict, offsets: list = None):
    """
    Adjust model matrix (and offset) based on adjusted coefficients dictionary.

    Parameters
    ----------
    model_matrices : list (of data frames)
        The model matrices.
    adjusted_coeffs : dict
        The adjusted coefficients dictionary. The format of the dictionary is as follows:
        {variable_name: adjusted_coefficient}
        Variable_name is the name of the variable (column) in the model matrices.
        Example: {"ts_new9_g: 06": 0.20, "drpou_cpp_dop3: H": -1.74}
    offsets : list (of numpy arrays or pandas Series), optional
        The offsets, one per model matrix. Default is None, in which case
        zero offsets are created.

    Returns
    -------
    tuple
        A flat tuple holding the adjusted model matrices followed by the
        adjusted offsets, i.e. ``(matrix_0, ..., matrix_n, offset_0, ..., offset_n)``.
        Offsets are numpy arrays when ``offsets`` was None and pandas Series
        otherwise.

    Raises
    ------
    ValueError
        If the number of offsets is not equal to the number of model matrices.
        If the number of rows in the model matrix is not equal to the number of offset values.
    KeyError
        If a variable from ``adjusted_coeffs`` is not a column of a model matrix.

    Notes
    -----
    For every entry of the adjusted coefficients dictionary the function adds
    ``coefficient * column`` to the offset and removes the column from the
    model matrix.
    The function works on copies: the model matrices and offsets passed in are
    left unchanged, so the returned values must be used.
    Offsets passed as pandas Series are combined with the model matrix columns
    by index label. Any other array-like offset is treated as positional and is
    given the row index of its model matrix.
    """
    # Copy the model matrices to avoid changing the original data
    model_matrices = [model_matrix.copy() for model_matrix in model_matrices]

    if offsets is None:
        # Create zero offsets if not provided
        offsets = [np.zeros(model_matrix.shape[0]) for model_matrix in model_matrices]
    elif len(offsets) != len(model_matrices):
        raise ValueError("The number of offsets must be equal to the number of model matrices.")
    else:
        prepared_offsets = []
        for i, (offset, model_matrix) in enumerate(zip(offsets, model_matrices)):
            if len(offset) != model_matrix.shape[0]:
                raise ValueError(f"The number of rows in the model matrix {i} is not equal to the number of offset values.")
            if isinstance(offset, pd.Series):
                prepared_offsets.append(offset.copy())
            else:
                # A plain array has no labels; attach the matrix's row index so the
                # label-aligned addition below matches rows positionally instead of
                # producing NaN for matrices with a non-default index.
                # np.array() copies, keeping the caller's array untouched.
                prepared_offsets.append(pd.Series(np.array(offset), index=model_matrix.index))
        offsets = prepared_offsets

    # Adjust the model matrices and offsets
    for variable, coefficient in adjusted_coeffs.items():
        for i, model_matrix in enumerate(model_matrices):
            contribution = coefficient * model_matrix[variable]
            if isinstance(offsets[i], pd.Series):
                offsets[i] += contribution
            else:
                # Auto-created numpy offsets are purely positional
                offsets[i] += contribution.to_numpy()
            model_matrix.drop(columns=variable, inplace=True)

    return tuple(model_matrices + offsets)