import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import xgboost as xgb

import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

# save models
import joblib
import pickle

import os
import re
import json


# Load dataset
file = r"https://raw.githubusercontent.com/bouz1/Manipulation_of_second_hand_vehicles_data/refs/heads/main/datasets/used_cars_data_250.csv"
df=pd.read_csv(file)


len(df)

7250


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7250 entries, 0 to 7249
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model1             7250 non-null   object 
 1   model2             7250 non-null   object 
 2   version            7250 non-null   object 
 3   price              7250 non-null   float64
 4   km                 7250 non-null   float64
 5   fuel               7250 non-null   float64
 6   CV_fisc            7250 non-null   float64
 7   HorseP             7250 non-null   float64
 8   Gearbox_auto       7250 non-null   float64
 9   L_by_100km         7250 non-null   float64
 10  numbe_seats        7250 non-null   float64
 11  doors_nb           7250 non-null   float64
 12  Euro_stand         7250 non-null   float64
 13  Length             7250 non-null   float64
 14  Nb_option          7250 non-null   float64
 15  registration_date  7250 non-null   float64
dtypes: float64(13), object(3)
memory usage: 906.4+ KB


df.head(2)


df.isna().sum()

model1               0
model2               0
version              0
price                0
km                   0
fuel                 0
CV_fisc              0
HorseP               0
Gearbox_auto         0
L_by_100km           0
numbe_seats          0
doors_nb             0
Euro_stand           0
Length               0
Nb_option            0
registration_date    0
dtype: int64


df.columns

Index(['model1', 'model2', 'version', 'price', 'km', 'fuel', 'CV_fisc',
       'HorseP', 'Gearbox_auto', 'L_by_100km', 'numbe_seats', 'doors_nb',
       'Euro_stand', 'Length', 'Nb_option', 'registration_date'],
      dtype='object')


df.price.describe().to_frame().T


df2= df[df.price <40000]
len(df2)/len(df)

0.8137931034482758


df3= df2[['km', 'fuel', 'CV_fisc','HorseP', 'Gearbox_auto', 
       'L_by_100km', 'numbe_seats', 'doors_nb',
       'Euro_stand', 'Length', 'Nb_option', 'price']]


correlation_matrix= df3.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

plt.figure(figsize=(8, 3))
S= correlation_matrix["price"].drop("price")
S=S.abs().sort_values(ascending=False)
S.plot.bar()
plt.grid()
plt.xlabel("features")
plt.ylabel("Abs correlation")
plt.title("Feature correlation with Price")
plt.show()


colsx=['km', 'fuel', 'CV_fisc','HorseP', 'Gearbox_auto', 
       'L_by_100km', 'numbe_seats', 'doors_nb',
       'Euro_stand', 'Length', 'Nb_option']
coly= 'price'


X= df3[colsx].values
y= df3[ coly].values


# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define and train the model
model = xgb.XGBRegressor(
    n_estimators=100,   # Number of trees
    max_depth=3,        # Maximum tree depth
    eta=0.1,            # Learning rate
    objective='reg:squarederror'  # Regression objective
    ,random_state=42
)

model.fit(X_train, y_train)

importance = model.feature_importances_
S=pd.Series(importance, index = colsx)
S=S.sort_values(ascending=False)
plt.figure(figsize=(8, 3))
S.plot.bar()
plt.grid()
plt.xlabel("features")
plt.ylabel("Importance using XGboost")
plt.title("Feature impotances using XGboost")
plt.show()


print("The 4 important features: ", list(S.head(4).index))

The 4 important features:  ['Gearbox_auto', 'HorseP', 'Euro_stand', 'km']


NewColx= list(S.head(4).index)
NewColx

['Gearbox_auto', 'HorseP', 'Euro_stand', 'km']


# Persist the four selected features plus the target as float32.
# Forward slashes are used (like every other path in this file) so the path
# works on any OS and contains no accidental backslash escapes ("\d", "\p").
FileName = "../data/processed/df_price_4inputs.csv"
df3[NewColx + ["price"]].astype("float32").to_csv(FileName, index=False)


# Reload the reduced dataset from disk; the first four columns are the
# inputs and the fifth column is the price target.
FileName = "../data/processed/df_price_4inputs.csv"
dfnew = pd.read_csv(FileName).astype("float32")
print("Df columns: ", list(dfnew.columns))
X = dfnew.iloc[:, :4].values
y = dfnew.iloc[:, 4].values
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Df columns:  ['Gearbox_auto', 'HorseP', 'Euro_stand', 'km', 'price']


from sklearn.linear_model import LinearRegression


LR = LinearRegression()


_=LR.fit(X_train, y_train)


# Save the model
joblib.dump(LR, r'../models/LinearReg/LR_model.pkl')

['../models/LinearReg/LR_model.pkl']


# Load the model
LR_model = joblib.load(r'../models/LinearReg/LR_model.pkl')


y_pred_test= LR_model.predict(X_test)
y_pred_train= LR_model.predict(X_train)


# Evaluate the model (the metric is the mean ABSOLUTE error, so label it as such)
maeTrain = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error Train: {maeTrain:.2f}")

maeTest = mean_absolute_error(y_test, y_pred_test)
print(f"Mean Absolute Error Test: {maeTest:.2f}")

## Plot 
plt.figure(figsize=(8, 4))
plt.scatter(y_train, y_pred_train, s= 4, label="Train")
plt.scatter(y_test, y_pred_test, s=4 , label="Test")

plt.plot([y_test.min(),y_test.max()], 
         [y_test.min(),y_test.max()],
         c="r", 
        label = "Equal")

plt.xlabel("price: Real")
plt.ylabel("price: prediction")
plt.grid()
plt.legend()
plt.show()

Mean Squared Error Train: 3759.17
Mean Squared Error Test: 3758.83


DTR=DecisionTreeRegressor(max_depth=10, random_state=0)


_=DTR.fit(X_train, y_train)


# Save the model
joblib.dump(DTR, r'../models/Trees/dtr_model.pkl')

['../models/Trees/dtr_model.pkl']


# Load the model
dtr_model = joblib.load(r'../models/Trees/dtr_model.pkl')


y_pred_test= dtr_model.predict(X_test)
y_pred_train= dtr_model.predict(X_train)


# Evaluate the model (the metric is the mean ABSOLUTE error, so label it as such)
maeTrain = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error Train: {maeTrain:.2f}")

maeTest = mean_absolute_error(y_test, y_pred_test)
print(f"Mean Absolute Error Test: {maeTest:.2f}")

## Plot 
plt.figure(figsize=(8, 4))
plt.scatter(y_train, y_pred_train, s= 4, label="Train")
plt.scatter(y_test, y_pred_test, s=4 , label="Test")

plt.plot([y_test.min(),y_test.max()], 
         [y_test.min(),y_test.max()],
         c="r", 
        label = "Equal")

plt.xlabel("price: Real")
plt.ylabel("price: prediction")
plt.grid()
plt.legend()
plt.show()

Mean Squared Error Train: 2135.29
Mean Squared Error Test: 3099.48


RF= RandomForestRegressor(n_estimators=3, 
                      max_depth=8, random_state=0)


_=RF.fit(X_train, y_train)


import joblib

# Save the model to a file
joblib.dump(RF, '../models/RF/random_forest_model.pkl')

['../models/RF/random_forest_model.pkl']


# Load the model from the file
RF_model = joblib.load('../models/RF/random_forest_model.pkl')


y_pred= RF_model .predict(X_test)


y_pred_test= RF_model.predict(X_test)
y_pred_train= RF_model.predict(X_train)


# Evaluate the model (the metric is the mean ABSOLUTE error, so label it as such)
maeTrain = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error Train: {maeTrain:.2f}")

maeTest = mean_absolute_error(y_test, y_pred_test)
print(f"Mean Absolute Error Test: {maeTest:.2f}")

## Plot 
plt.figure(figsize=(8, 4))
plt.scatter(y_train, y_pred_train, s= 4, label="Train")
plt.scatter(y_test, y_pred_test, s=4 , label="Test")

plt.plot([y_test.min(),y_test.max()], 
         [y_test.min(),y_test.max()],
         c="r", 
        label = "Equal")

plt.xlabel("price: Real")
plt.ylabel("price: prediction")
plt.grid()
plt.legend()
plt.show()

Mean Squared Error Train: 2712.65
Mean Squared Error Test: 3124.63


# XGBoost's base_score is the initial prediction for every sample, so it must
# come from the target (price), not from the mean of the input features.
base_score = float(y_train.mean())
base_score

18476.805


# Define and train the model
model = xgb.XGBRegressor(
    n_estimators=100,   # Number of trees
    max_depth=3,        # Maximum tree depth
    eta=0.1,            # Learning rate
    objective='reg:squarederror'  # Regression objective
    ,random_state=42
    ,base_score=base_score
)

_=model.fit(X_train, y_train)


# Save the model
joblib.dump(model, r'../models/xgboost/xgb_model.pkl')

['../models/xgboost/xgb_model.pkl']


# Load the model
xgb_model = joblib.load(r'../models/xgboost/xgb_model.pkl')


y_pred_test= xgb_model.predict(X_test)
y_pred_train= xgb_model.predict(X_train)


# Evaluate the model (the metric is the mean ABSOLUTE error, so label it as such)
maeTrain = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error Train: {maeTrain:.2f}")

maeTest = mean_absolute_error(y_test, y_pred_test)
print(f"Mean Absolute Error Test: {maeTest:.2f}")

## Plot 
plt.figure(figsize=(8, 4))
plt.scatter(y_train, y_pred_train, s= 4, label="Train")
plt.scatter(y_test, y_pred_test, s=4 , label="Test")

plt.plot([y_test.min(),y_test.max()], 
         [y_test.min(),y_test.max()],
         c="r", 
        label = "Equal")

plt.xlabel("price: Real")
plt.ylabel("price: prediction")
plt.grid()
plt.legend()
plt.show()

Mean Squared Error Train: 3011.73
Mean Squared Error Test: 3161.42


X_train.shape, X_test.shape

((4720, 4), (1180, 4))


# Define the DNN model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train.shape[1],)),  # Input and 1st hidden layer
    tf.keras.layers.Dense(8, activation='relu'),  # 2nd hidden layer
    tf.keras.layers.Dense(4, activation='relu'),  # 3rd hidden layer
    tf.keras.layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])


# Train the model
history = model.fit(X_train, y_train, 
                    validation_split=0.2, epochs=30,
                    batch_size=32, verbose=0)


hist=history.history


# Save the dictionary
with open("../data/processed/tf_hist.pkl", "wb") as file:
    pickle.dump(hist, file)

model.save('../models/DNN/tf_model.keras', include_optimizer=False)


# Load the dictionary
with open("../data/processed/tf_hist.pkl", "rb") as file:
    load_hist= pickle.load(file)
    
tf_model=tf.keras.models.load_model('../models/DNN/tf_model.keras')


print(list(load_hist.keys()))
for c in load_hist.keys():
    if 'loss' in c:
        plt.plot(load_hist[c], label = c)
plt.legend()
plt.xlabel("epochs")
plt.ylabel("loss")
plt.grid()
plt.show()

['loss', 'mae', 'val_loss', 'val_mae']


y_pred_test= tf_model.predict(X_test,batch_size=32,verbose=0)
y_pred_train= tf_model.predict(X_train,batch_size=32,verbose=0)


# Evaluate the model (the metric is the mean ABSOLUTE error, so label it as such)
maeTrain = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error Train: {maeTrain:.2f}")

maeTest = mean_absolute_error(y_test, y_pred_test)
print(f"Mean Absolute Error Test: {maeTest:.2f}")

## Plot 
plt.figure(figsize=(8, 4))
plt.scatter(y_train, y_pred_train, s= 4, label="Train")
plt.scatter(y_test, y_pred_test, s=4 , label="Test")

plt.plot([y_test.min(),y_test.max()], 
         [y_test.min(),y_test.max()],
         c="r", 
        label = "Equal")

plt.xlabel("price: Real")
plt.ylabel("price: prediction")
plt.grid()
plt.legend()
plt.show()

Mean Squared Error Train: 4090.79
Mean Squared Error Test: 4334.73


def array_to_arduino(x):
    """
    Convert a Python list or NumPy array into a C/Arduino array initializer.

    The input is turned into its (possibly nested) list representation and
    every square bracket is replaced by the matching curly brace, e.g.
    ``[1.5, 2.0]`` becomes ``{1.5, 2.0}``.

    Input:
    - x: list, tuple or NumPy array to be converted

    Output:
    - Formatted string that can be pasted into Arduino code
    """
    # np.asarray accepts plain Python sequences as well as arrays; calling
    # `.tolist()` directly on `x` only worked for NumPy arrays.
    text = str(np.asarray(x).tolist())
    return text.replace('[', '{').replace(']', '}')


# Load the model
LR_model = joblib.load(r'../models/LinearReg/LR_model.pkl')


# Subset of inputs/outputs used to test the Arduino model: 10 samples
sub_X=X_train[:10]
sub_y=LR_model.predict(sub_X)


# Get linear regression parameters 
coef = LR_model.coef_
bias = LR_model.intercept_
print("coef: ", coef.tolist())
print("bias: ", bias )

coef:  [3661.1865234375, 119.34064483642578, 2962.884521484375, -0.05416186898946762]
bias:  -7844.6016


sub_X.shape, coef.shape

((10, 4), (4,))


# Understand the linear regression algorithm
print("y with model predict\n" ,sub_y)
print("y with matrix calculation: Y = X.coef + bias \n", 
    (sub_X.dot(coef.reshape(-1,1))+bias).flatten())
print("the result is the same")

y with model predict
 [19074.861 22590.434 18458.254 20624.408 20219.445 29240.262 32525.
 27160.86  25408.605 26429.555]
y with matrix calculation: Y = X.coef + bias 
 [19074.861 22590.434 18458.254 20624.408 20219.445 29240.262 32525.
 27160.86  25408.605 26429.555]
the result is the same


def LinearRegToC(model, X, y):
    """Build an Arduino (C++) sketch that evaluates a fitted linear regression.

    model : fitted estimator exposing ``coef_`` and ``intercept_``
    X     : 2-D array of test inputs embedded in the sketch (rows are samples)
    y     : expected outputs embedded in the sketch for comparison

    Returns the sketch source as a string. The sketch prints, for every
    embedded sample, the value computed on the board, the expected value and
    the elapsed time in microseconds.
    """
    sketch = """

const int Nv = NvReplace;
const int dimX = dimXReplace;

/////// Xy ////// 
const float X [] PROGMEM  = Xreplace;

const float y[] PROGMEM  = yreplace;




////////////////// Model
const float coef[] PROGMEM = coefreplace; 
const float Bias = Biasreplace; 
float LinearReg ( float X[] ) {
float Out=Bias;
for(int j = 0; j<dimX;j++){
    Out+=X[j]*pgm_read_float_near(&coef[j]);
}

return Out;
}




void setup() {
    Serial.begin(115200);
}

void loop() {
unsigned long timestart;
unsigned long timeend;
float Xi[dimX];
float yc;


Serial.println("Cal_Ardui,Expected,Delta_time(us)");
for (int l=0;l<Nv;l++){
for(int j = 0; j<dimX;j++){
    Xi[j]=pgm_read_float_near(&X[l*dimX+j]);
}
timestart=micros();
yc=LinearReg(Xi);
timeend=micros();
Serial.print(yc);
Serial.print(",");
Serial.print(pgm_read_float_near(&y[l]),6);
Serial.print(",");
Serial.println(timeend-timestart);
}
Serial.println("====The End=====");
while(1);
}
"""

    n_samples, n_features = X.shape
    # Placeholders are substituted one after another, in this exact order.
    substitutions = (
        ("NvReplace", str(n_samples)),
        ("dimXReplace", str(n_features)),
        ("Xreplace", array_to_arduino(X.flatten())),
        ("yreplace", array_to_arduino(y)),
        ("coefreplace", array_to_arduino(model.coef_)),
        ("Biasreplace", str(model.intercept_)),
    )
    for placeholder, text in substitutions:
        sketch = sketch.replace(placeholder, text)

    return sketch


# Convert the model 
arduino_code= LinearRegToC (LR_model, sub_X, sub_y)


# Save the Arduino code as <name>/<name>.ino
ino_file = "../ArduinoCode/LinearReg.ino"  # Path of the file
ino_file = ino_file.replace(".ino", "")
current_directory = os.getcwd()
new_directory_path = os.path.join(current_directory, ino_file)
# exist_ok=True tolerates an already existing folder while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs(new_directory_path, exist_ok=True)

path = ino_file + "/" + ino_file.split("/")[-1] + ".ino"
with open(path, 'w+') as f:
    f.write(arduino_code)

print(path, "saved")

../ArduinoCode/LinearReg/LinearReg.ino saved


# The arduino serial print result 
serialPrint="""
Cal_Ardui,Expected,Delta_time(us)
19074.86,19074.861328,68
22590.43,22590.433593,76
18458.25,18458.253906,80
20624.41,20624.408203,76
20219.45,20219.445312,76
29240.26,29240.261718,80
32525.00,32525.000000,84
27160.86,27160.859375,80
25408.60,25408.605468,80
26429.56,26429.554687,88
====The End====="""


# Convert the serial result to DF 
data = serialPrint.split("\n")[1:-1]
data=[x.split(",") for x in data]
DF_serial= pd.DataFrame( data[1:], columns= data[0]).astype("float32")
DF_serial


print("The AVG prediction time of one input is", 
      (DF_serial['Delta_time(us)'].mean()/1000).round(2), 
      "ms"
     )

The AVG prediction time of one input is 0.08 ms


# Plotting
DF_serial.plot.scatter(x='Expected', y='Cal_Ardui',  marker='o', label="Arduino calculation")
xx=[DF_serial['Expected'].min(), DF_serial['Expected'].max()]
plt.plot(xx,xx, c='r', label="equal")
plt.legend()
plt.xlabel("Python model prediction")
plt.ylabel("Arduino model prediction")
plt.grid()
plt.show()


# The arduino and python model have the same result.


# Load the model
dtr_model = joblib.load(r'../models/Trees/dtr_model.pkl')


# Subset of inputs/outputs used to test the Arduino model: 10 samples
sub_X=X_train[:10]
sub_y=dtr_model.predict(sub_X)


# Export and print the tree structure
# export_text is not among the imports at the top of the file, so bring it in here.
from sklearn.tree import export_text

tree_text = export_text(dtr_model)
print("Example of tree txt")
print(tree_text[:300])

Example of tree txt
|--- feature_1 <= 127.50
|   |--- feature_1 <= 90.50
|   |   |--- feature_2 <= 5.50
|   |   |   |--- feature_3 <= 146700.00
|   |   |   |   |--- feature_2 <= 4.50
|   |   |   |   |   |--- feature_3 <= 105865.00
|   |   |   |   |   |   |--- feature_3 <= 92399.00
|   |   |   |   |   |   |   |--- featu


def get_cpp_code_from_tree(tree, feature_names):
    """
    Convert a fitted decision tree into nested C++ if/else statements.

    Input:
    - tree: fitted estimator exposing ``tree_`` with ``children_left``,
      ``children_right``, ``threshold``, ``feature`` and ``value`` arrays
    - feature_names: C++ expression to use for each input feature, indexed
      by feature number (e.g. ``["X[0]", "X[1]"]``)

    Output:
    - String with one ``if ( f <= t ) { ... } else { ... }`` per split node
      and one ``return v;`` per leaf
    """
    nodes = tree.tree_
    left = nodes.children_left
    right = nodes.children_right
    threshold = nodes.threshold
    feature = nodes.feature
    value = nodes.value
    lines = []

    def recurse(node):
        # A leaf is recognised by its missing child (-1). Testing the
        # threshold against the -2 placeholder would misclassify a genuine
        # split whose threshold happens to be exactly -2.0.
        if left[node] == -1:
            leaf = str(value[node]).replace("[", "").replace("]", "")
            lines.append("return " + leaf + ";\n")
            return

        # The feature name is looked up only for split nodes: leaves carry a
        # negative placeholder index that must not be used on feature_names.
        name = feature_names[feature[node]]
        lines.append("if ( " + name + " <= " + str(threshold[node]) + " ) {\n")
        recurse(left[node])
        lines.append("} else {\n")
        recurse(right[node])
        lines.append("}\n")

    recurse(0)
    return "".join(lines)


# Example of conversion 
TXT=get_cpp_code_from_tree(dtr_model, ["a","b","c","d"])
print(TXT[:250])

if ( b <= 127.5 ) {
if ( b <= 90.5 ) {
if ( c <= 5.5 ) {
if ( d <= 146700.0 ) {
if ( c <= 4.5 ) {
if ( d <= 105865.0 ) {
if ( d <= 92399.0 ) {
if ( d <= 85367.0 ) {
return 10060.;
} else {
return 10690.;
}
} else {
return 8360.;
}
} else {
if ( b <=


def convert_DecTree_To_C(model, X,y):
    """Build an Arduino (C++) sketch that evaluates a fitted decision tree.

    model : fitted tree, converted to if/else code by get_cpp_code_from_tree
    X     : 2-D array of test inputs embedded in the sketch (rows are samples)
    y     : expected outputs embedded in the sketch for comparison

    Returns the sketch source as a string.
    """
    sketch = """

const int Nv = NvReplace;
const int dimX = dimXReplace;

/////// Xy ////// 
const float X [] PROGMEM  = Xreplace;

const float y[] PROGMEM  = yreplace;



////////////////// TREE
float DecisionTreeReg ( float X[] ) {
IF_ELSE_CONDITION_replace
}




void setup() {
    Serial.begin(115200);
}

void loop() {
unsigned long timestart;
unsigned long timeend;
float Xi[dimX];
float yc;


Serial.println("Cal_Ardui,Expected,Delta_time(us)");
for (int l=0;l<Nv;l++){
for(int j = 0; j<dimX;j++){
    Xi[j]=pgm_read_float_near(&X[l*dimX+j]);
}
timestart=micros();
yc=DecisionTreeReg(Xi);
timeend=micros();
Serial.print(yc);
Serial.print(",");
Serial.print(pgm_read_float_near(&y[l]),6);
Serial.print(",");
Serial.println(timeend-timestart);
}
Serial.println("====The End=====");
while(1);
}
"""

    n_samples, n_features = X.shape
    inputs_c = array_to_arduino(X.flatten())
    outputs_c = array_to_arduino(y)

    # Each feature is addressed in C++ as an element of the argument array X.
    feature_exprs = [f"X[{col}]" for col in range(X.shape[1])]
    tree_body = get_cpp_code_from_tree(model, feature_exprs)

    # Placeholders are substituted one after another, in this exact order.
    substitutions = (
        ("NvReplace", str(n_samples)),
        ("dimXReplace", str(n_features)),
        ("Xreplace", inputs_c),
        ("yreplace", outputs_c),
        ("IF_ELSE_CONDITION_replace", tree_body),
    )
    for placeholder, text in substitutions:
        sketch = sketch.replace(placeholder, text)

    return sketch


# Embed only the 10-sample subset and the tree's own predictions for it:
# the full X/y would store every row in the sketch, and y holds the true
# prices rather than the model outputs the board is compared against.
arduino_code = convert_DecTree_To_C(dtr_model, sub_X, sub_y)


# Save the Arduino code as <name>/<name>.ino
ino_file = "../ArduinoCode/DecisionTree"
ino_file = ino_file.replace(".ino", "")
current_directory = os.getcwd()
new_directory_path = os.path.join(current_directory, ino_file)
# exist_ok=True tolerates an already existing folder while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs(new_directory_path, exist_ok=True)

path = ino_file + "/" + ino_file.split("/")[-1] + ".ino"
with open(path, 'w+') as f:
    f.write(arduino_code)

print(path, "saved")

../ArduinoCode/DecisionTree/DecisionTree.ino saved


# The arduino serial print result 
serialPrint="""
Cal_Ardui,Expected,Delta_time(us)
19870.97,19870.972656,40
26722.96,26722.962890,48
16522.86,16522.857421,48
18817.56,18817.560546,44
17535.56,17535.554687,48
17620.00,17620.000000,44
35083.11,35083.109375,48
32269.54,32269.535156,44
23814.67,23814.671875,40
31409.13,31409.130859,48
====The End====="""


# Convert the serial result to DF 
data = serialPrint.split("\n")[1:-1]
data=[x.split(",") for x in data]
DF_serial= pd.DataFrame( data[1:], columns= data[0]).astype("float32")
DF_serial


print("The AVG prediction time of one input is", 
      (DF_serial['Delta_time(us)'].mean()/1000).round(2), 
      "ms"
     )

The AVG prediction time of one input is 0.05 ms


# Ploting 
DF_serial.plot.scatter(x='Expected', y='Cal_Ardui',  marker='o', label="Arduino calculation")
xx=[DF_serial['Expected'].min(), DF_serial['Expected'].max()]
plt.plot(xx,xx, c='r', label="equal")
plt.legend()
plt.xlabel("Python model prediction")
plt.ylabel("Arduino model prediction")
plt.grid()
plt.show()


# Load the model from the file
RF_model = joblib.load('../models/RF/random_forest_model.pkl')


# Subset of inputs/outputs used to test the Arduino model: 10 samples
sub_X=X_train[:10]
sub_y=RF_model.predict(sub_X)


def convert_RandForest_To_C(model, X,y):
    """Build an Arduino (C++) sketch that evaluates a fitted random forest.

    Every estimator in ``model.estimators_`` becomes its own ``TreeN``
    function, and ``RandForestReg`` sums their outputs and divides by
    ``model.n_estimators``.

    model : fitted forest exposing ``estimators_`` and ``n_estimators``
    X     : 2-D array of test inputs embedded in the sketch (rows are samples)
    y     : expected outputs embedded in the sketch for comparison

    Returns the sketch source as a string.
    """
    sketch = """

const int Nv = NvReplace;
const int dimX = dimXReplace;

/////// Xy ////// 
const float X [] PROGMEM  = Xreplace;

const float y[] PROGMEM  = yreplace;



////////////////// TREES

TREES_replace 


////////////////// RANDOM FOREST 

RF_replace



void setup() {
Serial.begin(115200);
}

void loop() {
unsigned long timestart;
unsigned long timeend;
float Xi[dimX];
float yc;


Serial.println("Cal_Ardui,Expected,Delta_time(us)");
for (int l=0;l<Nv;l++){
for(int j = 0; j<dimX;j++){
Xi[j]=pgm_read_float_near(&X[l*dimX+j]);
}
timestart=micros();
yc=RandForestReg(Xi);
timeend=micros();
Serial.print(yc);
Serial.print(",");
Serial.print(pgm_read_float_near(&y[l]),6);
Serial.print(",");
Serial.println(timeend-timestart);
}
Serial.println("====The End=====");
while(1);
}
"""
    # Each feature is addressed in C++ as an element of the argument array X.
    feature_exprs = [f"X[{col}]" for col in range(X.shape[1])]

    tree_defs = []   # one C++ function per estimator
    tree_calls = []  # one accumulation statement per estimator
    for idx, estimator in enumerate(model.estimators_):
        body = get_cpp_code_from_tree(estimator, feature_exprs)
        tree_defs.append(f"\n\n\nfloat Tree{idx} ( float X[] ) {{\n{body}\n}}\n")
        tree_calls.append(f"out+=Tree{idx} (X);\n")

    forest_func = (
        "\n\n\nfloat RandForestReg ( float X[] ) {\nfloat out=0;\n"
        + "".join(tree_calls)
        + f"out=out/{model.n_estimators};\nreturn out;\n}}\n"
    )

    n_samples, n_features = X.shape
    inputs_c = array_to_arduino(X.flatten())
    outputs_c = array_to_arduino(y)

    # Placeholders are substituted one after another, in this exact order.
    substitutions = (
        ("NvReplace", str(n_samples)),
        ("dimXReplace", str(n_features)),
        ("Xreplace", inputs_c),
        ("yreplace", outputs_c),
        ("TREES_replace", "".join(tree_defs)),
        ("RF_replace", forest_func),
    )
    for placeholder, text in substitutions:
        sketch = sketch.replace(placeholder, text)

    return sketch


# Embed only the 10-sample subset and the forest's own predictions for it:
# the full X/y would store every row in the sketch, and y holds the true
# prices rather than the model outputs the board is compared against.
arduino_code = convert_RandForest_To_C(RF_model, sub_X, sub_y)


# Save the Arduino code as <name>/<name>.ino
ino_file = "../ArduinoCode/RandForest"
ino_file = ino_file.replace(".ino", "")
current_directory = os.getcwd()
new_directory_path = os.path.join(current_directory, ino_file)
# exist_ok=True tolerates an already existing folder while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs(new_directory_path, exist_ok=True)

path = ino_file + "/" + ino_file.split("/")[-1] + ".ino"
with open(path, 'w+') as f:
    f.write(arduino_code)

print(path, "saved")

../ArduinoCode/RandForest/RandForest.ino saved


# The arduino serial print result 
serialPrint="""
Cal_Ardui,Expected,Delta_time(us)
20217.69,20217.689453,120
24530.45,24530.447265,116
18560.16,18560.160156,124
19753.04,19753.039062,120
17726.35,17726.345703,120
20670.88,20670.882812,136
35056.59,35056.593750,132
31866.39,31866.384765,120
25756.68,25756.675781,120
28062.48,28062.480468,120
====The End====="""


# Convert the serial result to DF 
data = serialPrint.split("\n")[1:-1]
data=[x.split(",") for x in data]
DF_serial= pd.DataFrame( data[1:], columns= data[0]).astype("float32")
DF_serial


print("The AVG prediction time of one input is", 
      (DF_serial['Delta_time(us)'].mean()/1000).round(2), 
      "ms"
     )

The AVG prediction time of one input is 0.12 ms


# Ploting
DF_serial.plot.scatter(x='Expected', y='Cal_Ardui',  marker='o', label="Arduino calculation")
xx=[DF_serial['Expected'].min(), DF_serial['Expected'].max()]
plt.plot(xx,xx, c='r', label="equal")
plt.legend()
plt.xlabel("Python model prediction")
plt.ylabel("Arduino model prediction")
plt.grid()
plt.show()


# Load the model
xgb_model = joblib.load(r'../models/xgboost/xgb_model.pkl')


# Subset of inputs/outputs used to test the Arduino model: 10 samples
sub_X=X_train[:10]
sub_y=xgb_model .predict(sub_X)


# XGBoost's base_score is the initial prediction for every sample, so the
# fallback value must come from the target (price), not from the features.
base_score = float(y_train.mean())
base_score

18476.805


xgb_model.base_score

18476.805


# Function: TreesCode
# Description:
# This function generates C++ code representing the decision trees of an XGBoost model.
# It parses the model's JSON representation and recursively converts each tree into a C++ function.
def TreesCode(model):
    """
    Emit one C++ function per boosted tree of an XGBoost model.

    The booster is dumped as JSON (one document per tree) and each document
    is walked recursively. Split nodes become ``if (X[i] < t) {...} else {...}``
    where ``i`` is parsed from the split name with its first character
    dropped; leaf nodes become ``return <leaf>;``.

    Args:
        model: Trained model exposing ``get_booster().get_dump(...)``.

    Returns:
        str: C++ source defining ``tree0``, ``tree1``, ... each taking
        ``float X[]`` and returning a float.
    """
    dumped_trees = model.get_booster().get_dump(dump_format="json")
    pieces = []

    def emit(node, depth=0):
        """Append the C++ code for ``node`` and its subtree to ``pieces``.

        :param node: Dictionary representation of a tree node.
        :param depth: Nesting level, used only for indentation.
        """
        pad = "    " * depth

        # Leaf: the function returns the stored leaf value.
        if "leaf" in node:
            pieces.append(f"{pad}return {node['leaf']};\n")
            return

        cut = node['split_condition']
        feature_idx = int(node['split'][1:])
        pieces.append(f"{pad}if (X[{feature_idx}] < {cut}) {{\n")
        emit(node['children'][0], depth + 1)
        pieces.append(f"{pad}}} else {{\n")
        emit(node['children'][1], depth + 1)
        pieces.append(f"{pad}}}\n")

    for tree_index, tree_json in enumerate(dumped_trees):
        pieces.append(f"\n////////////////// TREE {tree_index}\n")
        pieces.append(f"float tree{tree_index}(float X[]) {{\n")
        emit(json.loads(tree_json))
        pieces.append("}\n\n")

    return "".join(pieces)



# Function: code_trees
# Description:
# Generates the cumulative summation of the predictions from all trees, formatted as C++ code.
# The summation depends on the learning rate and number of trees.
def code_trees(N, learning_rate):
    """Return the C++ statements that accumulate the output of N trees.

    One line is produced per tree index in ``range(N)``:
    ``out+= tree<i>(X);`` when the learning rate equals 1, otherwise
    ``out+= learning_rate*tree<i>(X);``.

    N             : number of tree functions to sum
    learning_rate : learning rate as a string or number; any value that is
                    numerically 1 ("1", "1.0", 1, 1.0) selects the unscaled form

    Returns the concatenated statements (an empty string when N is 0).
    """
    # Compare numerically so that "1.0" or a float 1.0 is recognised too;
    # fall back to the literal "1" test for values that are not numbers.
    try:
        unscaled = float(learning_rate) == 1.0
    except (TypeError, ValueError):
        unscaled = learning_rate == "1"

    # NOTE(review): XGBoost tree dumps presumably store leaf values that
    # already include the learning-rate shrinkage — confirm before relying
    # on the scaled form, which would then apply the factor twice.
    if unscaled:
        template = "out+= tree{i}(X);\n"
    else:
        template = "out+= learning_rate*tree{i}(X);\n"
    return "".join(template.format(i=index) for index in range(N))




# Function: XGBOOST_to_CPP
# Description:
# Converts an XGBoost model to a complete C++ implementation for predictions.
# This includes tree code, model initialization, and a prediction function.
def XGBOOST_to_CPP(model, X, y, base_score):
    """Build an Arduino (C++) sketch that evaluates a trained XGBoost model.

    model      : trained model exposing ``base_score``, ``learning_rate``,
                 ``n_estimators`` and ``get_booster()``
    X          : 2-D array of test inputs embedded in the sketch (rows are samples)
    y          : expected outputs embedded in the sketch for comparison
    base_score : fallback base score, used only when ``model.base_score`` is
                 None; when both are None, 0 is used

    Returns the sketch source as a string.
    """
    # Template for the C++ implementation
    codeInit="""

const int Nv = NvReplace;
const int dimX = dimXReplace;

float base_score =   base_score_Replace ;
float learning_rate = learning_rate_Replace ;

/////// Xy ////// 
const float X [] PROGMEM  = Xreplace;

const float y[] PROGMEM  = yreplace;



////////////////// TREES ////////////////////////////
////////////////////////////////////////////////////
TREES_CODE_replace

////////////////// XGBOOST MODEL //////////////////
///////////////////////////////////////////////////
float XGBpred(float X[]){
float out = 0;
XGBOOST_CODE_replace
out = out+base_score;
return out;}



void setup() {
Serial.begin(115200);
}

void loop() {
unsigned long timestart;
unsigned long timeend;
float Xi[dimX];
float yc;


Serial.println("Cal_Ardui,Expected,Delta_time(us)");
for (int l=0;l<Nv;l++){
for(int j = 0; j<dimX;j++){
Xi[j]=pgm_read_float_near(&X[l*dimX+j]);
}
timestart=micros();
yc=XGBpred(Xi);
timeend=micros();
Serial.print(yc);
Serial.print(",");
Serial.print(pgm_read_float_near(&y[l]),6);
Serial.print(",");
Serial.println(timeend-timestart);
}
Serial.println("====The End=====");
while(1);
}
"""

    # The model's own base score wins over the argument; default to 0.
    if model.base_score is not None:
        base_score = str(model.base_score)
    elif base_score is not None:
        base_score = str(base_score)
    else:
        base_score = "0"

    # Without an explicit learning rate on the model, trees are summed unscaled.
    # NOTE(review): XGBoost tree dumps presumably store leaf values that
    # already include the learning-rate shrinkage — confirm that scaling by
    # model.learning_rate here does not apply the factor twice.
    if model.learning_rate is not None:
        learning_rate = str(model.learning_rate)
    else:
        learning_rate = "1"

    N = model.n_estimators
    XGBOOST_CODE = code_trees(N, learning_rate)
    # Use the model passed in, not the notebook-level `xgb_model` global.
    TREES_CODE = TreesCode(model)

    Nv, dimX = X.shape
    Nv, dimX = str(Nv), str(dimX)
    Xs = array_to_arduino(X.flatten())
    ys = array_to_arduino(y)

    codeInit = codeInit.replace("NvReplace", Nv)
    codeInit = codeInit.replace("dimXReplace", dimX)
    codeInit = codeInit.replace("Xreplace", Xs)
    codeInit = codeInit.replace("yreplace", ys)
    codeInit = codeInit.replace("base_score_Replace", base_score)
    codeInit = codeInit.replace("learning_rate_Replace", learning_rate)
    codeInit = codeInit.replace("TREES_CODE_replace", TREES_CODE)
    codeInit = codeInit.replace("XGBOOST_CODE_replace", XGBOOST_CODE)
    return codeInit


arduino_code = XGBOOST_to_CPP(xgb_model, sub_X, sub_y, base_score)


# Save the Arduino code as <name>/<name>.ino
ino_file = "../ArduinoCode/Xgboost_Model2.ino"
ino_file = ino_file.replace(".ino", "")


current_directory = os.getcwd()
new_directory_path = os.path.join(current_directory, ino_file)
# exist_ok=True tolerates an already existing folder while still surfacing
# real failures (e.g. permission errors) that a bare `except: pass` would hide.
os.makedirs(new_directory_path, exist_ok=True)

path = ino_file + "/" + ino_file.split("/")[-1] + ".ino"
with open(path, 'w+') as f:
    f.write(arduino_code)

print(path, "saved")

../ArduinoCode/Xgboost_Model2/Xgboost_Model2.ino saved


# The arduino serial print result 
serialPrint="""
Cal_Ardui,Expected,Delta_time(us)
19879.69,19879.697265,2076
24714.27,24714.275390,1984
16844.20,16844.208984,1952
19767.67,19767.681640,1956
20115.12,20115.128906,1960
25036.53,25036.535156,1964
34841.58,34841.546875,1988
29939.02,29939.027343,1980
25749.97,25749.972656,1960
28292.76,28292.769531,1968
====The End====="""


# Convert the serial result to DF
data = serialPrint.split("\n")[1:-1]
data=[x.split(",") for x in data]
DF_serial= pd.DataFrame( data[1:], columns= data[0]).astype("float32")
DF_serial


print("The AVG prediction time of one input is", 
      (DF_serial['Delta_time(us)'].mean()/1000).round(2), 
      "ms"
     )

The AVG prediction time of one input is 1.98 ms


# Ploting
DF_serial.plot.scatter(x='Expected', y='Cal_Ardui',  marker='o', label="Arduino calculation")
xx=[DF_serial['Expected'].min(), DF_serial['Expected'].max()]
plt.plot(xx,xx, c='r', label="equal")
plt.legend()
plt.xlabel("Python model prediction")
plt.ylabel("Arduino model prediction")
plt.grid()
plt.show()


# Load the model
tf_model=tf.keras.models.load_model('../models/DNN/tf_model.keras')


# Subset of inputs/outputs used to test the Arduino model: 10 samples.
sub_X = X_train[:10]
# Predict on sub_X (not the full X) so that sub_y has one row per row of sub_X.
sub_y = tf_model.predict(sub_X, verbose=0)


def tf_model_to_arduino_code(inp_model, sub_X, sub_y, code=None):
    """
    Generate an Arduino sketch that runs the forward pass of a Keras model.

    For every layer of `inp_model` the weights (transposed and flattened),
    the biases and the activation name are read and written into the sketch
    as PROGMEM arrays, followed by one `propagation(...)` call per layer.
    The reference inputs and expected outputs are embedded as well, so the
    sketch can print its own result next to the expected one.

    Inputs:
    - inp_model: model whose `layers` each return exactly `(W, B)` from
                 `get_weights()` and expose `activation.__name__`.
    - sub_X: 2-D array-like of reference inputs (rows = samples); embedded in
             the sketch as `X`, its shape gives `Nv` and `dimX`.
    - sub_y: array-like of expected outputs; embedded in the sketch as `y`.
    - code: Template string containing the placeholders INIT_1, INIT_2,
            LOOP_ and __final. When None, the built-in template below is used.

    Outputs:
    - code2: the template with all placeholders substituted.

    Raises:
    - ValueError: if the model has no layers.

    The activation name is inserted as a C function name; only `tanh` is
    renamed (to `tanh_`). The built-in template defines sigmoid, relu, tanh_
    and linear; any other activation must be provided by the template.
    """
    init_code="""
#include <math.h>
#include <Arduino.h>
#include <avr/pgmspace.h> // Include the PROGMEM functions

INIT_1

// Activation function///////////////
float sigmoid (float x){
    return 1./(1.+exp(-x));
}

float relu (float x){
    return max(x,0.);
}

float tanh_ (float x){ 
// make difference between tanh of C++ and tanh_ the activation func
    return tanh(x);
}

float linear(float x){
    return x;
}
///// You can add other activation function ////

void print_arr(float arr[], int N) {
    Serial.print("[");
    for (int i = 0; i < N; i++) {
        Serial.print(arr[i],4);
        if (i < N-1) {
            Serial.print(",");
            }
    }
    Serial.print("]");
}


void propagation(const float *WTf,  float *VEC, const float *B,float *out,  int M, int N, float (*act_func)(float)) {

  // Perform matrix-vector multiplication and activation
  for (int i = 0; i < M; ++i) {
    out[i] = pgm_read_float_near(&B[i]);
    for (int j = 0; j < N; ++j) {
      out[i] += pgm_read_float_near(&WTf[i * N + j]) * VEC[j];
    }
    out[i] = act_func(out[i]);
  }
}

void setup() {
  Serial.begin(115200);
}

void loop() {
unsigned long timestart;
unsigned long timeend;
float Xi[dimX];
INIT_2

Serial.println("Cal_Ardui,Expected,Delta_time(us)");
for (int l=0;l<Nv;l++){
for(int j = 0; j<dimX;j++){
    Xi[j]=pgm_read_float_near(&X[l*dimX+j]);
}
LOOP_
for (int k=0;k<M__final;k++){
Serial.print(OUTPUT__final[k],6);
Serial.print(" , ");
Serial.print(pgm_read_float_near(&y[l]),6);
Serial.print(" , ");
Serial.println(timeend-timestart);
}
}
Serial.println("====The End=====");
while(1);
}
"""
    if code is None:
        code = init_code

    layers = list(inp_model.layers)
    if not layers:
        raise ValueError("inp_model has no layers to convert")

    def _c_array(values):
        # Python list repr -> C initializer list
        return str(values.tolist()).replace("[", "{").replace("]", "}")

    # Use the arguments, not the notebook globals X / y
    x_arr = np.asarray(sub_X)
    y_arr = np.asarray(sub_y)
    NvdimX = "const int Nv = " + str(x_arr.shape[0]) + ";\nconst int dimX = " + str(x_arr.shape[1]) + ";\n"

    # Reference data, stored in flash next to the model parameters
    Xystr = "\n/////// Xy ////// \nconst float X [] PROGMEM  = " + array_to_arduino(x_arr.flatten()) + ";\n\n" + \
            "const float y[] PROGMEM  = " + array_to_arduino(y_arr.flatten()) + " ;\n\n"

    layer_inits = []   # per-layer declarations (dimensions, weights, bias, output buffer)
    layer_calls = []   # per-layer propagation(...) statements
    for i, layer in enumerate(layers):
        W, B = layer.get_weights()
        # Row i of W.T holds the weights of output unit i, matching the
        # WTf[i * N + j] indexing used by propagation() in the template.
        WT = W.T
        M, N = WT.shape
        actfun = layer.activation.__name__
        print("Layer", i, "W shape", W.shape, "Bias shape", B.shape, "Activation Function", actfun)

        idx = str(i)
        layer_inits.append(
            "// Layer" + idx + " init \n"
            + "const int N" + idx + " = " + str(N) + " ;" + "\n"
            + "const int M" + idx + " = " + str(M) + " ;" + "\n"
            + "const float WTf" + idx + "[] PROGMEM  = " + _c_array(WT.flatten()) + " ;" + "\n"
            + "const float BIAS" + idx + "[] PROGMEM= " + _c_array(B) + " ;" + "\n"
            + "float OUTPUT" + idx + "[" + str(M) + "] ;"
            + "\n\n"
        )

        # `tanh` would clash with the C library function, the template names it tanh_
        c_act = actfun.replace('tanh', 'tanh_')
        # The first layer reads the input vector, every other one the previous output
        source = "Xi" if i == 0 else "OUTPUT" + str(i - 1)
        layer_calls.append(
            "propagation(WTf" + idx + ", " + source + ", BIAS" + idx + ", OUTPUT" + idx
            + ", M" + idx + ", N" + idx + ",  " + c_act + "); // Layer" + idx + "\n"
        )

    initstr = "".join(layer_inits)
    # The timing window covers the forward pass only
    prostr = "\n///////// Forward Propagation ////////////\ntimestart=micros();\n" \
             + "".join(layer_calls) + "timeend=micros();"

    # Fill the template placeholders
    code2 = code.replace("INIT_1", NvdimX + initstr + Xystr)
    code2 = code2.replace("INIT_2", "")
    code2 = code2.replace("LOOP_", prostr)
    # M__final / OUTPUT__final refer to the last layer
    code2 = code2.replace("__final", str(len(layers) - 1))

    return code2


# Build the sketch for the Keras model from the reference data X / y and the template
arduino_code = tf_model_to_arduino_code(
    inp_model=tf_model,
    sub_X=X,
    sub_y=y,
    code=init_code,
)

Layer 0 W shape (4, 16) Bias shape (16,) Activation Function relu
Layer 1 W shape (16, 8) Bias shape (8,) Activation Function relu
Layer 2 W shape (8, 4) Bias shape (4,) Activation Function relu
Layer 3 W shape (4, 1) Bias shape (1,) Activation Function linear


# save the arduino code
# The sketch is written to <name>/<name>.ino; a ".ino" suffix, if present,
# is dropped to obtain the folder path.
ino_file="../ArduinoCode/Tf_Model"
ino_file=ino_file.replace(".ino" ,"")

current_directory = os.getcwd()
new_directory_path = os.path.join(current_directory, ino_file)
# exist_ok=True tolerates an already existing folder without hiding real
# failures (e.g. permission errors), which the former bare `except: pass` did.
os.makedirs(new_directory_path, exist_ok=True)

path=ino_file+"/"+ino_file.split("/")[-1]+".ino"
with open(path,'w') as f:
    f.write(arduino_code)

print(path, "saved")

../ArduinoCode/Tf_Model/Tf_Model.ino saved


# Raw text captured from the Arduino serial monitor
serialPrint="""
Cal_Ardui,Expected,Delta_time(us)
17770.185546 , 17770.187500 , 4556
22208.960937 , 22208.962890 , 4420
16064.545898 , 16064.555664 , 4444
19372.082031 , 19372.083984 , 4412
19566.919921 , 19566.931640 , 4436
28578.988281 , 28578.988281 , 4528
33195.054687 , 33195.054687 , 4512
24006.271484 , 24006.269531 , 4508
22752.718750 , 22752.728515 , 4520
23988.726562 , 23988.726562 , 4536
====The End====="""


# Turn the capture into a DataFrame: skip the empty first line and the end
# marker, split every remaining line on commas, first row is the header.
kept_lines = serialPrint.split("\n")[1:-1]
data = [entry.split(",") for entry in kept_lines]
column_names = data[0]
DF_serial = pd.DataFrame(data[1:], columns=column_names).astype("float32")
DF_serial


# Mean duration of one prediction, converted from microseconds to milliseconds
mean_time_ms = (DF_serial['Delta_time(us)'].mean()/1000).round(2)
print("The AVG prediction time of one input is", mean_time_ms, "ms")

The AVG prediction time of one input is 4.49 ms


# Plotting: Arduino results against the Python predictions, with the
# identity line drawn between the smallest and largest expected value.
ax = DF_serial.plot.scatter(x='Expected', y='Cal_Ardui',  marker='o', label="Arduino calculation")
xx=[DF_serial['Expected'].min(), DF_serial['Expected'].max()]
ax.plot(xx,xx, c='r', label="equal")
ax.legend()
ax.set_xlabel("Python model prediction")
ax.set_ylabel("Arduino model prediction")
ax.grid()
plt.show()


# Restore the fitted XGBoost model from disk
xgb_model = joblib.load(r'../models/xgboost/xgb_model.pkl')


# First ten training rows and the model's predictions for them; the same
# objects are also bound to X / y.
sub_X = X_train[:10]
sub_y = xgb_model.predict(sub_X)
X, y = sub_X, sub_y


# Base score stored on the model
base_score = xgb_model.base_score
print(f"Base Score: {base_score}")

Base Score: 18476.805


# Text dump of the first tree of the underlying booster
booster = xgb_model.get_booster()
first_tree_dump = booster.get_dump(dump_format='text')[0]
print(first_tree_dump)

0:[f1<129] yes=1,no=2,missing=2
	1:[f1<91] yes=3,no=4,missing=4
		3:[f2<6] yes=7,no=8,missing=8
			7:leaf=-905.217285
			8:leaf=-422.910553
		4:[f3<47435] yes=9,no=10,missing=10
			9:leaf=442.025391
			10:leaf=-22.4555092
	2:[f1<177] yes=5,no=6,missing=6
		5:[f3<84687] yes=11,no=12,missing=12
			11:leaf=948.198547
			12:leaf=342.00119
		6:[f3<136640] yes=13,no=14,missing=14
			13:leaf=1401.38916
			14:leaf=803.22821


# Map every column of dfnew, in order, to the positional name 'f0', 'f1', ...
l1 = list(dfnew.columns)
l2 = list(range(len(l1)))
dic = {column: 'f' + str(position) for position, column in enumerate(l1)}
dic

{'Gearbox_auto': 'f0',
 'HorseP': 'f1',
 'Euro_stand': 'f2',
 'km': 'f3',
 'price': 'f4'}


# Rebuild the mapping with its keys in reverse alphabetical order, so that a
# name extending another one (e.g. 'evHvBatteryEnergyLevel_lag' vs
# 'evHvBatteryEnergyLevel') comes first.
l1 = sorted(dic.keys(), reverse=True)
d = {}
for x in l1:
    d[x] = dic[x]
dic = d
dic

{'price': 'f4',
 'km': 'f3',
 'HorseP': 'f1',
 'Gearbox_auto': 'f0',
 'Euro_stand': 'f2'}


# Print the text dump of the first tree of `booster`
print(booster.get_dump(dump_format='text')[0])

0:[f1<129] yes=1,no=2,missing=2
	1:[f1<91] yes=3,no=4,missing=4
		3:[f2<6] yes=7,no=8,missing=8
			7:leaf=-905.217285
			8:leaf=-422.910553
		4:[f3<47435] yes=9,no=10,missing=10
			9:leaf=442.025391
			10:leaf=-22.4555092
	2:[f1<177] yes=5,no=6,missing=6
		5:[f3<84687] yes=11,no=12,missing=12
			11:leaf=948.198547
			12:leaf=342.00119
		6:[f3<136640] yes=13,no=14,missing=14
			13:leaf=1401.38916
			14:leaf=803.22821


# Text dump of the first tree, with every key of dic substituted by its
# 'f<index>' value (plain substring replacement, in the dict's order)
txt = booster.get_dump(dump_format='text')[0]
for k, v in dic.items():
    txt = txt.replace(k, v)

print(txt)

0:[f1<129] yes=1,no=2,missing=2
	1:[f1<91] yes=3,no=4,missing=4
		3:[f2<6] yes=7,no=8,missing=8
			7:leaf=-905.217285
			8:leaf=-422.910553
		4:[f3<47435] yes=9,no=10,missing=10
			9:leaf=442.025391
			10:leaf=-22.4555092
	2:[f1<177] yes=5,no=6,missing=6
		5:[f3<84687] yes=11,no=12,missing=12
			11:leaf=948.198547
			12:leaf=342.00119
		6:[f3<136640] yes=13,no=14,missing=14
			13:leaf=1401.38916
			14:leaf=803.22821


def txt_c_nodes2(tree_string,dic):
    """
    Translate the text dump of one tree into labelled C statements.

    Every split line `N:[fK<V] yes=A,no=B,missing=C` becomes
    `nodeN: if (X [K] <V) goto nodeA ; else goto nodeB ;` and every leaf line
    `N:leaf=V` becomes `nodeN: return V ;`.

    Inputs:
    - tree_string: text dump of a single tree (tab-indented, one node per line).
    - dic: mapping {feature name: 'f<index>'}; each key is replaced by its
           value in `tree_string` with a plain substring replacement, in the
           dict's iteration order, before parsing.

    Outputs:
    - The C statements as one string, one statement per line, in dump order.

    Notes:
    - The `missing=` target is parsed but not used in the generated code.
    - Thresholds written in scientific notation (e.g. 1.5e-05) are accepted;
      they previously failed to match and the node was silently dropped.
    - Only a lone `=` is rewritten to `==`; `<=` / `>=` are kept as they are.
    - Anything after a comma on a leaf line (e.g. `,cover=...`) is ignored.
    """
    # node id, feature, comparison, threshold (optionally with exponent), targets
    split_pattern = re.compile(
        r'(\d+):\[(\w+)([<>=]+)(-?[\d.]+(?:[eE][-+]?\d+)?)\]\s+yes=(\d+),no=(\d+),missing=(\d+)'
    )
    for name, alias in dic.items():
        tree_string = tree_string.replace(name, alias)

    statements = []
    for line in tree_string.replace("\t", "").split('\n'):
        if "[" in line and "]" in line:
            # split node
            for node, feature, cond, value, yes, no, _missing in split_pattern.findall(line):
                index = feature.replace("f", "")
                if cond == "=":
                    cond = "=="
                statements.append(
                    "node" + node + ": if (X [" + index + "] " + cond + value
                    + ") goto node" + yes + " ; else goto node" + no + " ; \n"
                )
        elif 'leaf=' in line:
            parts = line.split(':leaf=')
            node = parts[0]
            # keep only the leaf value, dropping trailing statistics if present
            leaf = parts[1].split(",")[0]
            statements.append('node' + node + ': return ' + leaf + " ;\n")
    return "".join(statements)
# NOTE(review): `x` is not assigned in this cell; it is read from the notebook's
# global state — confirm it holds the text dump of the tree to convert.
print(txt_c_nodes2(x,dic))

node0: if (X [1] <132) goto node1 ; else goto node2 ; 
node1: if (X [1] <129) goto node3 ; else goto node4 ; 
node3: if (X [2] <5) goto node7 ; else goto node8 ; 
node7: return 67.2528915 ;
node8: return -4.22500849 ;
node4: if (X [3] <55480) goto node9 ; else goto node10 ; 
node9: return 125.454178 ;
node10: return -0.75947547 ;
node2: if (X [1] <150) goto node5 ; else goto node6 ; 
node5: if (X [3] <176320) goto node11 ; else goto node12 ; 
node11: return -50.4777794 ;
node12: return 346.403595 ;
node6: if (X [3] <23600) goto node13 ; else goto node14 ; 
node13: return -66.9801254 ;
node14: return 12.834815 ;


def trees_to_C2(booster,dic):
    """
    Emit one C function per tree of `booster`.

    Tree number i of the text dump becomes `float tree<i> ( float X[] ) {...}`
    whose body is produced by txt_c_nodes2; each function is preceded by a
    `TREE_<i+1>` comment line.

    Inputs:
    - booster: object whose get_dump(dump_format='text') returns one text
               dump per tree.
    - dic: feature-name mapping forwarded to txt_c_nodes2.

    Outputs:
    - The concatenated C source of all tree functions.
    """
    pieces = []
    for idx, tree_txt in enumerate(booster.get_dump(dump_format='text')):
        pieces.append("////////////////// TREE_" + str(idx + 1) + "\n")
        pieces.append("float tree" + str(idx) + " ( float X[] ) {" + "\n")
        pieces.append(txt_c_nodes2(tree_txt, dic))
        pieces.append("}" + "\n")
    return "".join(pieces)


# NOTE(review): this is the mean of X_train, while a later cell computes
# base_score from y_train.mean() — confirm which one is intended here.
base_score=X_train.mean()
base_score

18476.805


# First ten training rows and the model's predictions for them; the same
# objects are also bound to X / y.
sub_X = X_train[:10]
sub_y = xgb_model.predict(sub_X)
X, y = sub_X, sub_y


# Display the current base score
base_score

18476.805


def all_arduino_code4(model,X,y, dic, base_score):
    """
    Assemble a complete Arduino sketch for an XGBoost model.

    The sketch contains, in order: the reference data (`Nv`, `dimX`, `X`, `y`
    in PROGMEM), the `base_score` and `learning_rate` globals, one C function
    per tree (via trees_to_C2), `XGBpred` which sums all trees and adds
    `base_score`, and a `setup`/`loop` pair that runs `XGBpred` on every
    reference row and prints result, expected value and elapsed microseconds.

    Inputs:
    - model: object providing get_booster(), base_score and learning_rate.
    - X: 2-D array of reference inputs (needs .shape and .flatten()).
    - y: array of expected outputs (needs .flatten()).
    - dic: feature-name mapping forwarded to trees_to_C2.
    - base_score: value written to the sketch only when model.base_score is None.

    Outputs:
    - The sketch source as a string.

    Note: `learning_rate` is written to the sketch but the generated
    `XGBpred` does not use it.
    """
    booster = model.get_booster()

    # Dimensions and reference data
    shape = X.shape
    NvdimX = "const int Nv = " + str(shape[0]) + ";\nconst int dimX = " + str(shape[1]) + ";\n"
    Xystr = (
        "\n/////// Xy ////// \nconst float X [] PROGMEM  = "
        + array_to_arduino(X.flatten()) + ";\n\n"
        + "const float y[] PROGMEM  = "
        + array_to_arduino(y.flatten()) + " ;\n\n"
    )
    parts = ["\n    ", NvdimX, Xystr, "\n    "]

    # Model-level constants; the argument is only a fallback for base_score
    if model.base_score is None:
        parts.append("float base_score =  " + str(base_score) + " ;")
    else:
        parts.append("float base_score = " + str(model.base_score) + " ;")
    parts.append("\n")
    if model.learning_rate is None:
        parts.append("float learning_rate = 1 ;")
    else:
        parts.append("float learning_rate = " + str(model.learning_rate) + " ;")
    parts.append("\n")
    parts.append("\n")

    # One C function per tree
    parts.append(trees_to_C2(booster, dic))

    # Prediction function: sum of all trees plus the base score
    parts.append('/////////////////// XGBpredict' + "\n")
    parts.append('float XGBpred(float X[]){' + "\n")
    parts.append('float out = 0;' + "\n")
    n_trees = len(booster.get_dump(dump_format='text'))
    for tree_idx in range(n_trees):
        parts.append("out= tree" + str(tree_idx) + "(X)+out;" + "\n")
    parts.append("\n")
    parts.append("out = out+base_score;" + "\n")
    parts.append("return out;}" + "\n")
    parts.append("\n")

    # Fixed part of the sketch: run every reference row once and report
    sketch_main = """void setup() {
    Serial.begin(115200);
}

void loop() {
unsigned long timestart;
unsigned long timeend;
float Xi[dimX];
float yc;


Serial.println("Cal_Ardui,Expected,Delta_time(us)");
for (int l=0;l<Nv;l++){
for(int j = 0; j<dimX;j++){
    Xi[j]=pgm_read_float_near(&X[l*dimX+j]);
}
timestart=micros();
yc=XGBpred(Xi);
timeend=micros();
Serial.print(yc);
Serial.print(",");
Serial.print(pgm_read_float_near(&y[l]),6);
Serial.print(",");
Serial.println(timeend-timestart);
}
Serial.println("====The End=====");
while(1);
}
"""
    parts.append(sketch_main)

    return "".join(parts)


# Mean of y_train; handed to all_arduino_code4 below, which writes it to the
# sketch only when model.base_score is None.
base_score=y_train.mean()
base_score

22734.895


# Rebinds the global X to the first test row; the call below uses sub_X / sub_y.
X=X_test[0]
arduino_code=all_arduino_code4(xgb_model,sub_X, \
                    sub_y, dic,base_score)


# The sketch is written to <name>/<name>.ino
ino_file="../ArduinoCode/Xgboost_Model"


current_directory = os.getcwd()
new_directory_path = os.path.join(current_directory, ino_file)
# exist_ok=True tolerates an already existing folder without hiding real
# failures (e.g. permission errors), which the former bare `except: pass` did.
os.makedirs(new_directory_path, exist_ok=True)

path=ino_file+"/"+ino_file.split("/")[-1]+".ino"
with open(path,'w') as f:
    f.write(arduino_code)

print(path, "saved")

../ArduinoCode/Xgboost_Model/Xgboost_Model.ino saved


# Raw text captured from the Arduino serial monitor
serialPrint="""
Cal_Ardui,Expected,Delta_time(us)
20222.40,20222.404296,2088
24689.11,24689.107421,1976
16387.31,16387.304687,1952
19894.63,19894.634765,1940
20383.30,20383.296875,1960
25561.64,25561.638671,1964
34748.62,34748.605468,2000
29990.77,29990.771484,1980
26085.88,26085.880859,1948
28298.82,28298.818359,1956
====The End====="""


# Turn the capture into a DataFrame: skip the empty first line and the end
# marker, split every remaining line on commas, first row is the header.
data = [fields.split(",") for fields in serialPrint.split("\n")[1:-1]]
DF_serial = pd.DataFrame(columns=data[0], data=data[1:]).astype("float32")
DF_serial


# Column names of the parsed result
DF_serial.columns

Index(['Cal_Ardui', 'Expected', 'Delta_time(us)'], dtype='object')


# Mean duration of one prediction, converted from microseconds to milliseconds
avg_time_ms = (DF_serial['Delta_time(us)'].mean()/1000).round(2)
print("The AVG prediction time of one input is", avg_time_ms, "ms")

The AVG prediction time of one input is 1.98 ms


# Arduino results against the Python predictions, with the identity line
# drawn between the smallest and largest expected value.
ax = DF_serial.plot.scatter(x='Expected', y='Cal_Ardui',  marker='o', label="Arduino calculation")
xx=[DF_serial['Expected'].min(), DF_serial['Expected'].max()]
ax.plot(xx,xx, c='r', label="equal")
ax.legend()
ax.set_xlabel("Python model prediction")
ax.set_ylabel("Arduino model prediction")
ax.grid()
plt.show()

Feature	Arduino Uno	Arduino Nano	Arduino Micro	Arduino Mega 2560	ESP32
Microcontroller	ATmega328P	ATmega328P	ATmega32U4	ATmega2560	Tensilica Xtensa LX6
Operating Voltage	5V	5V	5V	5V	3.3V
Input Voltage	7-12V	7-12V	7-12V	7-12V	5V via USB or 7-12V
Digital I/O Pins	14 (6 PWM)	14 (6 PWM)	20 (7 PWM)	54 (15 PWM)	34
Analog Input Pins	6	8	12	16	18
Flash Memory	32 KB	32 KB	32 KB	256 KB	Up to 16 MB
SRAM	2 KB	2 KB	2.5 KB	8 KB	520 KB
EEPROM	1 KB	1 KB	1 KB	4 KB	None
Clock Speed	16 MHz	16 MHz	16 MHz	16 MHz	240 MHz (dual-core)
Connectivity	UART, I2C, SPI	UART, I2C, SPI	UART, I2C, SPI	UART, I2C, SPI	Wi-Fi, Bluetooth
USB Interface	USB-B	Mini USB	Micro USB	USB-B	Micro USB
Dimensions	68.6 x 53.4 mm	45 x 18 mm	48 x 18 mm	101.52 x 53.3 mm	51 x 25.5 mm
Power Consumption	~50 mA	~50 mA	~50 mA	~70 mA	Varies (~80-240 mA)
Special Features	Simple and robust	Compact	USB HID support	High I/O count	Wi-Fi and BLE
Price Range	Low	Low	Medium	Medium	Medium-High

	Cal_Ardui	Expected	Delta_time(us)
0	19074.859375	19074.861328	68.0
1	22590.429688	22590.433594	76.0
2	18458.250000	18458.253906	80.0
3	20624.410156	20624.408203	76.0
4	20219.449219	20219.445312	76.0
5	29240.259766	29240.261719	80.0
6	32525.000000	32525.000000	84.0
7	27160.859375	27160.859375	80.0
8	25408.599609	25408.605469	80.0
9	26429.560547	26429.554688	88.0

	Cal_Ardui	Expected	Delta_time(us)
0	19870.970703	19870.972656	40.0
1	26722.960938	26722.962891	48.0
2	16522.859375	16522.857422	48.0
3	18817.560547	18817.560547	44.0
4	17535.560547	17535.554688	48.0
5	17620.000000	17620.000000	44.0
6	35083.109375	35083.109375	48.0
7	32269.539062	32269.535156	44.0
8	23814.669922	23814.671875	40.0
9	31409.130859	31409.130859	48.0

	Cal_Ardui	Expected	Delta_time(us)
0	20217.689453	20217.689453	120.0
1	24530.449219	24530.447266	116.0
2	18560.160156	18560.160156	124.0
3	19753.039062	19753.039062	120.0
4	17726.349609	17726.345703	120.0
5	20670.880859	20670.882812	136.0
6	35056.589844	35056.593750	132.0
7	31866.390625	31866.384766	120.0
8	25756.679688	25756.675781	120.0
9	28062.480469	28062.480469	120.0

	Cal_Ardui	Expected	Delta_time(us)
0	19879.689453	19879.697266	2076.0
1	24714.269531	24714.275391	1984.0
2	16844.199219	16844.208984	1952.0
3	19767.669922	19767.681641	1956.0
4	20115.119141	20115.128906	1960.0
5	25036.529297	25036.535156	1964.0
6	34841.578125	34841.546875	1988.0
7	29939.019531	29939.027344	1980.0
8	25749.970703	25749.972656	1960.0
9	28292.759766	28292.769531	1968.0

Convert a Python Machine Learning Model to Arduino Code (C++)

Introduction¶

Motivation¶

Hardware¶

Table of contents¶

Libraries¶

Load dataset¶

All inputs¶

Dataset with 4 inputs¶

Example of ML models¶

Linear Regression¶

Decision Tree Regressor¶

Random forest regressor¶

XGBOOST¶

DNN¶

Model to C++ (Arduino Language)¶

Utils : for all models¶

Linear Regression¶

Decision tree Regressor¶

Random forest regressor¶

XGBoost¶

DNN¶

Make a PyPI package of all this project's functions¶

Annexes¶

Other solution for XGBoost¶

	model1	model2	version	price	km	fuel	CV_fisc	HorseP	Gearbox_auto	L_by_100km	numbe_seats	doors_nb	Euro_stand	Length	Nb_option	registration_date
0	RENAULT	MEGANE 4	IV 1.6 TCE 205 ENERGY GT EDC7	23440.0	78325.0	0.0	11.0	205.0	1.0	4.9	5.0	5.0	6.0	4.36	40.0	3183.0
1	RENAULT	CLIO 5	V 1.0 TCE 100 INTENS	19930.0	27008.0	0.0	5.0	101.0	0.0	5.3	5.0	5.0	6.0	4.05	11.0	4085.0

	Cal_Ardui	Expected	Delta_time(us)
0	17770.185547	17770.187500	4556.0
1	22208.960938	22208.962891	4420.0
2	16064.545898	16064.555664	4444.0
3	19372.082031	19372.083984	4412.0
4	19566.919922	19566.931641	4436.0
5	28578.988281	28578.988281	4528.0
6	33195.054688	33195.054688	4512.0
7	24006.271484	24006.269531	4508.0
8	22752.718750	22752.728516	4520.0
9	23988.726562	23988.726562	4536.0

	Cal_Ardui	Expected	Delta_time(us)
0	20222.400391	20222.404297	2088.0
1	24689.109375	24689.107422	1976.0
2	16387.310547	16387.304688	1952.0
3	19894.630859	19894.634766	1940.0
4	20383.300781	20383.296875	1960.0
5	25561.640625	25561.638672	1964.0
6	34748.621094	34748.605469	2000.0
7	29990.769531	29990.771484	1980.0
8	26085.880859	26085.880859	1948.0
9	28298.820312	28298.818359	1956.0