Source code for breton_cretenet.algorithm

# algorithm

# This module contains all the function in order to build the machine learning algorithm of the project

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor


[docs]def linear_regression_algorithm(X_train, y_train, X_train_labels):
    """
    Fit a linear regression model to the training data.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training input data of shape (n_samples, n_features).
    y_train : numpy.ndarray
        Target values of shape (n_samples,).
    X_train_labels : list
        List of strings representing the feature names.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        A fitted linear regression model.
    """

    # assert type(X_train) == np.ndarray # Check if data input is an acceptable format i.e {array-like, sparse matrix} of shape (n_samples, n_features)
    # assert type(y_train) == np.ndarray # Check if data input is an acceptable format i.e array-like of shape (n_samples,) or (n_samples, n_targets)
    # assert len(X_train) == len(y_train) # Check if data input have the same amount of samples

    # Switching to DataFrame so X train labels are stored in model
    df_X_train = pd.DataFrame(X_train, columns=X_train_labels)

    regressor = LinearRegression()
    regressor.fit(df_X_train, y_train)

    return regressor


[docs]def decision_tree_regressor_algorithm(X_train, y_train, X_train_labels, max_depth=2):
    """
    Fit a decision tree regression model to the training data.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training input data of shape (n_samples, n_features).
    y_train : numpy.ndarray
        Target values of shape (n_samples,).
    X_train_labels : list
        List of strings representing the feature names.
    max_depth : int, optional (default=2)
        The maximum depth of the decision tree.

    Returns
    -------
    sklearn.tree.DecisionTreeRegressor
        A fitted decision tree regression model.
    """

    # assert type(X_train) == np.ndarray # Check if data input is an acceptable format i.e {array-like, sparse matrix} of shape (n_samples, n_features)
    # assert type(y_train) == np.ndarray # Check if data input is an acceptable format i.e array-like of shape (n_samples,) or (n_samples, n_targets)
    # assert len(X_train) == len(y_train) # Check if data input have the same amount of samples

    # Switching to DataFrame so X train labels are stored in model
    df_X_train = pd.DataFrame(X_train, columns=X_train_labels)

    regressor = DecisionTreeRegressor(
        max_depth=max_depth, random_state=0
    )  # random_state = 0 to stick to the same random seed
    regressor.fit(df_X_train, y_train)

    return regressor


[docs]def predict_from_regressor(model, X, X_labels):
    """
    Predict the target values for new input data using a given regression model.

    Parameters
    ----------
    model : sklearn estimator
        A fitted regression model.
    X : numpy.ndarray
        Input data of shape (n_samples, n_features).
    X_labels : list
        List of strings representing the feature names.

    Returns
    -------
    numpy.ndarray
        Predicted target values of shape (n_samples,).
    """

    # Check if Input Data correspond to model parameters, i.e features numbers, order
    if all(feature in model.feature_names_in_ for feature in X_labels):
        return model.predict(X)
    else:
        df_X_predict = pd.DataFrame(X, columns=X_labels)
        df_X_predict = df_X_predict[model.feature_names_in_]
        return model.predict(df_X_predict)


[docs]def lasso_regression_feature_selection(X_train, y_train, X_train_labels):
    """
    Apply Lasso regression feature selection to the training data.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training input data of shape (n_samples, n_features).
    y_train : numpy.ndarray
        Target values of shape (n_samples,).
    X_train_labels : list
        List of strings representing the feature names.

    Returns
    -------
    tuple
        A tuple containing the selected training input data of shape (n_samples, n_selected_features)
        and a list of strings representing the names of the selected features.
        If the number of training samples is less than or equal to 50, the function returns
        the original input data and feature names unchanged.
    """

    # assert type(X_train) == np.ndarray # Check if data input is an acceptable format i.e {array-like, sparse matrix} of shape (n_samples, n_features)
    # assert type(y_train) == np.ndarray # Check if data input is an acceptable format i.e array-like of shape (n_samples,) or (n_samples, n_targets)
    # assert len(X_train) == len(y_train) # Check if data input have the same amount of samples
    # assert len(X_train) == len(X_train_labels)  # Check if data input have the same amount of samples

    # Cannot apply cv on low amount of sample, just return input as output
    if len(X_train) > 500:
        cv = 5
    else:
        return X_train, X_train_labels

    cv = GridSearchCV(
        Lasso(),
        {"model__alpha": np.arange(0.1, 10, 0.1)},
        cv=cv,
        scoring="neg_mean_absolute_error",
        verbose=3,
    )

    cv.fit(X_train, y_train)

    # print(cv.best_params_)
    coefficients = cv.best_estimator_.named_steps["model"].coef_
    X_train_labels_selected = np.array(X_train_labels)[np.abs(coefficients) > 0]
    X_train_selected = np.array(X_train)[np.abs(coefficients) > 0]

    return X_train_selected, X_train_labels_selected


[docs]def score(y_true, y_predict):
    """Calculate the mean absolute error (MAE) between true and predicted values.

    Parameters
    ----------
    y_true : np.ndarray
        Correct target values.
    y_predict : np.ndarray
        Estimated target values.

    Returns
    -------
    float
        Mean absolute error between `y_true` and `y_predict`.

    Examples
    --------
    >>> y_true = np.array([3, -0.5, 2, 7])
    >>> y_predict = np.array([2.5, 0.0, 2, 8])
    >>> score(y_true, y_predict)
    0.5
    """
    return mean_absolute_error(y_true, y_predict)