Sunday, September 17, 2023

TIME-SERIES SALES FORECASTING AND PREDICTION USING MACHINE LEARNING WITH TKINTER --- Vivian Siahaan and Rismon Hasiholan Sianipar (BALIGE ACADEMY)

Dataset


This is a product of Balige Academy, North Sumatera. If you find the source code useful, please subscribe to our YouTube channel.



Balige Academy Team

Vivian Siahaan

Rismon Hasiholan Sianipar

HORASSS!!!

FULL SOURCE CODE:


#main_sales.py
import os
import pandas as pd
import tkinter as tk
from tkinter import *
from main_window import Main_Window
from helper_plot import Helper_Plot
from process_data import Process_Data
from regression import Regression
from machine_learning import Machine_Learning
from form1 import Form1

class Main_Sales():
    def __init__(self, root):
        #Stores root here so initialize() does not rely on the global variable
        self.root = root
        self.initialize()

    def initialize(self):
        width = 1560
        height = 790
        self.root.geometry(f"{width}x{height}")
        self.root.title("TIME-SERIES SUPERSTORE SALES FORECASTING AND PREDICTION USING MACHINE LEARNING")

        #Creates necessary objects
        self.obj_window = Main_Window()
        self.obj_plot = Helper_Plot()
        self.obj_data = Process_Data()
        self.obj_reg = Regression()   
        self.obj_ML = Machine_Learning()
        
        #Places widgets in root
        self.obj_window.add_widgets(self.root)  

        #Reads dataset
        self.df_before_fill, self.df_after_fill = self.obj_data.preprocess()

        #Creates dummy dataset for visualization
        self.df_dummy = self.df_after_fill.copy()
        self.df_dummy = self.obj_data.create_dummy(self.df_dummy)

        #Normalizes year-wise data
        self.year_data_mean, self.year_data_ewm, self.year_norm = self.obj_data.normalize_year_wise_data(self.df_after_fill)

        #Normalizes month-wise data
        self.month_data_mean, self.month_data_ewm, self.month_norm = self.obj_data.normalize_month_wise_data(self.df_after_fill)       

        #Calculates RFM
        self.rfm_df, self.rank_df, self.merged_rank_dummy = self.obj_data.calculate_RFM(self.df_after_fill, self.df_dummy) 
        
        #Finds churn customer
        self.df_final = self.obj_data.find_churn_customer(self.df_dummy, self.rfm_df, self.rank_df)

        #For machine learning
        self.df_final, self.X1, self.y1, self.X2, self.y2 = self.obj_data.encode_df(self.df_final)

        #Extracts input and output variables for regression
        self.obj_reg.splitting_data_regression(self.X2, self.y2) 

        #Extracts input and output variables for prediction
        self.obj_ML.oversampling_splitting(self.df_final)

        #Disables combo_reg and combo_pred until the user clicks the split buttons
        self.obj_window.combo_reg['state'] = 'disabled'
        self.obj_window.combo_pred['state'] = 'disabled'
        
        #Binds events
        self.binds_event()
        self.obj_plot.binds_menu_open_dataset(self.df_before_fill, self.df_after_fill, self.root, self.obj_window)
        self.obj_plot.binds_features_distribution(self.obj_window, self.df_before_fill, self.df_after_fill, self.df_dummy)   
        self.obj_plot.binds_categories_distribution(self.obj_window, self.df_dummy)
        self.obj_plot.binds_year_wise(self.obj_window, self.df_dummy, self.year_data_mean, self.year_data_ewm)
        self.obj_plot.binds_month_wise(self.obj_window, self.df_dummy, self.month_data_mean, self.month_data_ewm)
        self.obj_plot.binds_rfm_distribution(self.obj_window, self.merged_rank_dummy)
        self.obj_plot.binds_feat_importance(self.obj_window, self.df_final, self.X1, self.y1)

    def binds_event(self):
        #Shows table if user clicks LOAD DATASET 
        self.obj_window.btn_load.config(command = lambda:self.obj_plot.shows_table(self.root, 
            self.merged_rank_dummy, 1250, 600, "Superstore Dataset"))  

        #Binds listbox to choose_list_widget() function
        self.obj_window.listbox.bind("<<ListboxSelect>>", self.choose_list_widget)

        # Binds combo_year to choose_combo_year()
        self.obj_window.combo_year.bind("<<ComboboxSelected>>", self.choose_combo_year)

        # Binds combo_month to choose_combobox_month()
        self.obj_window.combo_month.bind("<<ComboboxSelected>>", self.choose_combobox_month)

        # Binds combo_rfm to choose_combo_rfm()
        self.obj_window.combo_rfm.bind("<<ComboboxSelected>>", self.choose_combo_rfm)

        #Binds btn_reg to split_regression() function 
        self.obj_window.btn_reg.config(command = self.split_regression) 

        # Binds combo_reg to choose_combo_reg()
        self.obj_window.combo_reg.bind("<<ComboboxSelected>>", self.choose_combo_reg)

        #Binds combo_pred to split_prediction() function 
        self.obj_window.btn_pred.config(command = self.split_prediction)
        
        # Binds combo_pred to choose_combo_pred()
        self.obj_window.combo_pred.bind("<<ComboboxSelected>>", self.choose_combo_pred)
        
    def choose_list_widget(self, event):
        chosen = self.obj_window.listbox.get(self.obj_window.listbox.curselection())
        print(chosen)
        self.obj_plot.choose_plot(self.df_after_fill, self.df_dummy, chosen, 
            self.obj_window.figure1, self.obj_window.canvas1, 
            self.obj_window.figure2, self.obj_window.canvas2)

    def choose_combo_year(self, event):
        chosen = self.obj_window.combo_year.get()
        self.obj_plot.choose_year_wise(self.df_after_fill, self.year_data_mean, self.year_data_ewm, self.year_norm, chosen, 
            self.obj_window.figure1, self.obj_window.canvas1, 
            self.obj_window.figure2, self.obj_window.canvas2)

    def choose_combobox_month(self, event):
        chosen = self.obj_window.combo_month.get()
        self.obj_plot.choose_month_wise(self.df_dummy, 
            self.month_data_mean, self.month_data_ewm, self.month_norm, chosen, 
            self.obj_window.figure1, self.obj_window.canvas1, 
            self.obj_window.figure2, self.obj_window.canvas2)

    def choose_combo_rfm(self, event):
        chosen = self.obj_window.combo_rfm.get()
        self.obj_plot.choose_rfm_distribution(self.merged_rank_dummy, chosen, 
            self.obj_window.figure1, 
            self.obj_window.canvas1, self.obj_window.figure2, 
            self.obj_window.canvas2)

    def split_regression(self):
        file_path = os.getcwd()+"/X_final_reg.pkl"
        if os.path.exists(file_path):
            self.X_Ori, self.X_final_reg, self.X_train_reg, self.X_test_reg, \
            self.X_val_reg, self.y_final_reg, self.y_train_reg, \
            self.y_test_reg, self.y_val_reg = self.obj_reg.load_regression_files()
        else:
            self.obj_reg.splitting_data_regression(self.X2, self.y2)
            self.X_Ori, self.X_final_reg, self.X_train_reg, self.X_test_reg, \
            self.X_val_reg, self.y_final_reg, self.y_train_reg, \
            self.y_test_reg, self.y_val_reg = self.obj_reg.load_regression_files()

        print("Loading regression files done...")

        #turns on combo_reg after splitting is done
        self.obj_window.combo_reg['state'] = 'normal'

        self.obj_window.btn_reg.config(state="disabled")

    def choose_combo_reg(self, event):
        chosen = self.obj_window.combo_reg.get()
        
        self.obj_plot.choose_plot_regression(chosen, self.X_final_reg, 
            self.X_train_reg, self.X_test_reg, self.X_val_reg,
            self.y_final_reg, self.y_train_reg, self.y_test_reg,
            self.y_val_reg,
            self.obj_window.figure1, self.obj_window.canvas1, 
            self.obj_window.figure2, self.obj_window.canvas2)

    def split_prediction(self):
        file_path = os.getcwd()+"/X_train.pkl"
        if os.path.exists(file_path):
            self.X_train, self.X_test, self.y_train, self.y_test = self.obj_ML.load_files()
        else:
            self.obj_ML.oversampling_splitting(self.df_final)
            self.X_train, self.X_test, self.y_train, self.y_test = self.obj_ML.load_files()

        print("Loading files done...")

        #turns on combo_pred after splitting is done
        self.obj_window.combo_pred['state'] = 'normal'

        self.obj_window.btn_pred.config(state="disabled")

    def choose_combo_pred(self, event):
        chosen = self.obj_window.combo_pred.get()
        self.obj_plot.choose_plot_ML(self.root, chosen, self.X_train, self.X_test, 
            self.y_train, self.y_test, self.obj_window.figure1, 
            self.obj_window.canvas1, self.obj_window.figure2, 
            self.obj_window.canvas2)  
        
if __name__ == "__main__":
    root = tk.Tk()
    app = Main_Sales(root)
    root.mainloop()
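
A note on the caching in split_regression() and split_prediction() above: the splits are recomputed only when the pickled files are absent, then loaded from disk. The persistence itself lives in regression.py and machine_learning.py, which are not shown in this listing; a minimal standalone sketch of the same load-or-create pattern, assuming joblib-style serialization (the path and create_fn below are hypothetical stand-ins), looks like this:

#demo_cache_pattern.py (sketch, not part of the app)
import os
import joblib

def load_or_create(path, create_fn):
    # Recompute and persist only when the cached file is absent
    if not os.path.exists(path):
        joblib.dump(create_fn(), path)
    return joblib.load(path)

# Example: data = load_or_create("X_train.pkl", lambda: [1, 2, 3])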


#design_window.py
import tkinter as tk
from tkinter import ttk
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

class Design_Window:
    def add_widgets(self, root):
        #Set styles
        self.set_style(root)
        
        #Adds menu
        self.add_menu(root)
        
        #Adds button(s)
        self.add_buttons(root)

        #Adds canvasses
        self.add_canvas(root)

        #Adds labels
        self.add_labels(root)

        #Adds listbox widget
        self.add_listboxes(root)

        #Adds combobox widget
        self.add_comboboxes(root)

    def set_style(self, root):
        # variables created for colors
        ebg = '#404040'
        fg = '#FFFFFF'

        style = ttk.Style()

        # Be sure to include this or style.map() won't function as expected.
        style.theme_use('alt')

        # the following alters the Listbox
        root.option_add('*TCombobox*Listbox.Background', ebg)
        root.option_add('*TCombobox*Listbox.Foreground', fg)
        root.option_add('*TCombobox*Listbox.selectBackground', fg)
        root.option_add('*TCombobox*Listbox.selectForeground', ebg)

        # the following alters the Combobox entry field
        style.map('TCombobox', fieldbackground=[('readonly', ebg)])
        style.map('TCombobox', selectbackground=[('readonly', ebg)])
        style.map('TCombobox', selectforeground=[('readonly', fg)])
        style.map('TCombobox', background=[('readonly', ebg)])
        style.map('TCombobox', foreground=[('readonly', fg)])

    def add_menu(self, root):
        self.menu_bar = tk.Menu(root)
        
        #Creates a Dataset menu
        self.dataset_menu = tk.Menu(self.menu_bar, tearoff=0)
        self.dataset_menu.add_command(label="Open Dataset")
        self.dataset_menu.add_command(label="Save Dataset")
        self.menu_bar.add_cascade(label="File", menu=self.dataset_menu)

        #Creates a feature distribution menu
        self.dist_menu = tk.Menu(self.menu_bar, tearoff=0)
        self.dist_menu.add_command(label="Missing Values")
        self.dist_menu.add_command(label="Correlation Coefficient")
        self.menu_bar.add_cascade(label="Features Distribution", menu=self.dist_menu)
        
        root.config(menu=self.menu_bar)    
        
    def add_buttons(self, root):
        #Adds button
        self.btn_load = tk.Button(root, height=2, width=35, text="LOAD DATASET")
        self.btn_load.grid(row=0, column=0, padx=5, pady=5, sticky="w")

        self.btn_reg = tk.Button(root, height=2, width=35, text="SPLIT DATA FOR FORECASTING")
        self.btn_reg.grid(row=9, column=0, padx=5, pady=5, sticky="w")

        self.btn_pred = tk.Button(root, height=2, width=35, text="SPLIT DATA FOR PREDICTION")
        self.btn_pred.grid(row=12, column=0, padx=5, pady=5, sticky="w")

    def add_labels(self, root):
        #Adds labels
        self.label1 = tk.Label(root, text = "CHOOSE DISTRIBUTION")
        self.label1.grid(row=1, column=0, padx=5, pady=1, sticky="w")

        self.label2 = tk.Label(root, text = "YEAR-WISE TIME-SERIES PLOT")
        self.label2.grid(row=3, column=0, padx=5, pady=1, sticky="w")

        self.label3 = tk.Label(root, text = "MONTH-WISE TIME-SERIES PLOT")
        self.label3.grid(row=5, column=0, padx=5, pady=1, sticky="w")

        self.label4 = tk.Label(root, text = "TECHNICAL INDICATORS")
        self.label4.grid(row=7, column=0, padx=5, pady=1, sticky="w")

        self.label5 = tk.Label(root, text = "CHOOSE FORECASTING")
        self.label5.grid(row=10, column=0, padx=5, pady=1, sticky="w")

        self.label6 = tk.Label(root, text = "CHOOSE PREDICTION")
        self.label6.grid(row=13, column=0, padx=5, pady=1, sticky="w")

    def add_canvas(self, root):
        #Adds canvas1 widget to root to display results
        self.figure1 = Figure(figsize=(6.2, 7.6), dpi=100)
        self.figure1.patch.set_facecolor('#F0F0F0')
        self.canvas1 = FigureCanvasTkAgg(self.figure1, master=root)
        self.canvas1.get_tk_widget().grid(row=0, column=1, columnspan=1, 
            rowspan=25, padx=5, pady=5, sticky="n")

        #Adds canvas2 widget to root to display results
        self.figure2 = Figure(figsize=(6.2, 7.6), dpi=100)
        self.figure2.patch.set_facecolor('#F0F0F0')
        self.canvas2 = FigureCanvasTkAgg(self.figure2, master=root)
        self.canvas2.get_tk_widget().grid(row=0, column=2, columnspan=1, 
            rowspan=25, padx=5, pady=5, sticky="n")

    def add_listboxes(self, root):
        #Creates list widget
        self.listbox = tk.Listbox(root, selectmode=tk.SINGLE, width=40, 
            fg ="black", bg="#F0F0F0", 
            highlightcolor="black", selectbackground="red",relief="flat", 
            borderwidth=5, highlightthickness=0)
        self.listbox.grid(row=2, column=0, sticky='n', padx=5, pady=1)

        # Inserts items into the list widget; these names match the cases
        # handled by Helper_Plot.choose_plot()
        items = ["Missing Values", "Correlation Coefficient", "Year", "Day",
                 "Month", "Quarter", "Country and City",
                 "State and Region", "Customer Name and Customer ID",
                 "Ship Mode and Segment", "Product Name and Product ID",
                 "Category and Sub-Category", "Sales by Year and Quarter",
                 "Sales by Day and Month", "Sales by Ship Mode and Segment",
                 "Sales by Category and Sub-Category",
                 "Sales by Product Name and Product ID",
                 "Sales by Customer Name and Customer ID",
                 "Sales by City and State",
                 "Categorized Sales by Year and Quarter",
                 "Categorized Sales by Day and Month",
                 "Categorized Sales by Segment and Sub-Category",
                 "Categorized Sales by Region and State", "Correlation Matrix"]
        for item in items:
            self.listbox.insert(tk.END, item)

        self.listbox.config(height=len(items)) 
        
    def add_comboboxes(self, root):
        # Create ComboBoxes
        self.combo_year = ttk.Combobox(root, width=38)
        self.combo_year["values"] = ["Low and High", "Open and Close",
            "Adj Close and Close",  "Year-Wise Mean EWM Low and High",  
            "Year-Wise Mean EWM Open and Close",                 
            "Normalized Year-Wise Data", 
            "Adj Close by Year", "Volume by Year",
            "Open by Year", "Close by Year",
            "Low by Year", "High by Year"]
        self.combo_year.grid(row=4, column=0, padx=5, pady=1, sticky="n")

        self.combo_month = ttk.Combobox(root, width=38, style='TCombobox')
        self.combo_month["values"] = ["Quarter-Wise Low and High", "Quarter-Wise Open and Close",
            "Month-Wise Open and Adj Close", "Month-Wise Mean EWM Low and High",
            "Month-Wise Mean EWM Open and Close", "Month-Wise Adj Close",
            "Month-Wise Open", "Month-Wise Close", "Month-Wise Low",
            "Month-Wise High", "Month-Wise Volume", "Normalized Month-Wise Data",
            "Adj Close by Month", "Open by Month", "Close by Month",
            "Low by Month", "High by Month", "Volume by Month"]
        self.combo_month.grid(row=6, column=0, padx=5, pady=1, sticky="n")

        #Named combo_rfm to match the binding in Main_Sales.binds_event()
        self.combo_rfm = ttk.Combobox(root, width=38, style='TCombobox')
        self.combo_rfm["values"] = ["Adj Close versus Daily_Returns versus Year", 
            "Volume versus Daily_Returns versus Quarter", 
            "Low versus Daily_Returns versus Month",
            "High versus Daily_Returns versus Day",
            "Technical Indicators", "Differences"]
        self.combo_rfm.grid(row=8, column=0, padx=5, pady=1, sticky="n")

        self.combo_reg = ttk.Combobox(root, width=38, style='TCombobox')
        self.combo_reg["values"] = ["Linear Regression", "RF Regression",
            "Decision Trees Regression", "KNN Regression",
            "AdaBoost Regression", "Gradient Boosting Regression",
            "MLP Regression", "SVR Regression", "Lasso Regression", "Ridge Regression"]
        self.combo_reg.grid(row=11, column=0, padx=5, pady=1, sticky="n")

        self.combo_pred = ttk.Combobox(root, width=38, style='TCombobox')
        self.combo_pred["values"] = ["Logistic Regression", "Random Forest",
            "Decision Trees", "K-Nearest Neighbors",
            "AdaBoost", "Gradient Boosting",
            "Extreme Gradient Boosting", "Light Gradient Boosting", 
            "Multi-Layer Perceptron", "Support Vector Classifier"]
        self.combo_pred.grid(row=14, column=0, padx=5, pady=1, sticky="n")
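
One ttk quirk in set_style() deserves emphasis: the fieldbackground and foreground maps only take effect while a combobox is in the 'readonly' state, and only under a theme such as 'alt' that honors those options. A minimal, self-contained sketch of the same trick (the colors are the ones used above):

#demo_combo_style.py (sketch, not part of the app)
import tkinter as tk
from tkinter import ttk

root = tk.Tk()
style = ttk.Style()
style.theme_use('alt')  # without this, style.map() may be ignored
style.map('TCombobox',
          fieldbackground=[('readonly', '#404040')],
          foreground=[('readonly', '#FFFFFF')])
combo = ttk.Combobox(root, values=["a", "b"], state="readonly")
combo.grid(padx=10, pady=10)
root.mainloop()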


#process_data.py
import os
import pandas as pd 
from datetime import datetime
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

class Process_Data:
    def read_dataset(self, filename):
        #Reads dataset
        curr_path = os.getcwd()
        path = os.path.join(curr_path, filename) 
        df = pd.read_csv(path)

        return df
    
    def preprocess(self):
        df = self.read_dataset("train.csv")

        #Extracts day, month, week, quarter, and year from order date
        df['Date'] = pd.to_datetime(df['Order Date'], format="%d/%m/%Y")
        df['Day'] = df['Date'].dt.weekday
        df['Month'] = df['Date'].dt.month
        df['Year']  = df['Date'].dt.year
        df['Week'] = df['Date'].dt.isocalendar().week
        df['Quarter']= df['Date'].dt.quarter

        #Sets Date column as index
        df = df.set_index("Date")
        
        #Converts data types to datetime
        df['Order Date'] = pd.to_datetime(df['Order Date'], format="%d/%m/%Y")
        df['Ship Date'] = pd.to_datetime(df['Ship Date'], format="%d/%m/%Y")

        # Drop columns
        df = df.drop(['Row ID'],axis=1)

        # Sort values by order date
        df.sort_values('Order Date', ascending=True, inplace=True)

        # Fills missing 'Postal Code' values with the correct code, 5401 (Burlington, Vermont)
        df2 = df.copy()
        df2['Postal Code'] = df['Postal Code'].fillna(5401)

        return df, df2  
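
    # A quick sanity check of the date features above (comments only, not run
    # by the app); pandas counts Monday as weekday 0:
    #   toy = pd.DataFrame({"Order Date": ["13/04/2018", "01/01/2018"]})
    #   toy["Date"] = pd.to_datetime(toy["Order Date"], format="%d/%m/%Y")
    #   toy["Date"].dt.weekday.tolist()   # [4, 0] -> Friday, Monday
    #   toy["Date"].dt.quarter.tolist()   # [2, 1]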

    def create_dummy(self, df):
        #Creates a dummy dataframe for visualization
        df_dummy=df.copy()

        #Converts days and months from numerics to meaningful strings
        #(pandas dt.weekday counts Monday as 0 and Sunday as 6)
        days = {0:'Monday',1:'Tuesday',2:'Wednesday',3:'Thursday',
                4:'Friday',5:'Saturday',6:'Sunday'}
        df_dummy['Day'] = df_dummy['Day'].map(days)

        months={1:'January',2:'February',3:'March',4:'April',
                5:'May',6:'June',7:'July',8:'August',9:'September',
                10:'October',11:'November',12:'December'}
        df_dummy['Month']= df_dummy['Month'].map(months)

        quarters = {1:'Jan-March', 2:'April-June',3:'July-Sept',
                    4:'Oct-Dec'}
        df_dummy['Quarter'] = df_dummy['Quarter'].map(quarters)

        #Categorizes Sales feature; the ten labels match the ten bins, and
        #np.inf as the last edge keeps sales above 400 from becoming NaN
        labels = ['0-10', '10-20', '20-50', '50-100', '100-150',
                  '150-200', '200-250', '250-300', '300-400', '>400']
        df_dummy['Cat_Sales'] = pd.cut(df_dummy['Sales'],
            [0, 10, 20, 50, 100, 150, 200, 250, 300, 400, np.inf], labels=labels)
        
        return df_dummy
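
    # How the binning above behaves (comments only): pd.cut uses half-open
    # intervals (a, b], and np.inf as the last edge catches sales above 400:
    #   pd.cut(pd.Series([5.0, 120.0, 260.0, 999.0]),
    #          [0, 10, 20, 50, 100, 150, 200, 250, 300, 400, np.inf],
    #          labels=labels).tolist()
    #   # -> ['0-10', '100-150', '250-300', '>400']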

    def normalize_year_wise_data(self, df):
        #Normalizes year-wise data
        year_data_mean = df["Sales"].resample('y').mean()
        year_data_ewm = year_data_mean.ewm(span=5).mean()        
        year_norm = (year_data_mean - year_data_mean.min()) / (year_data_mean.max() - year_data_mean.min())

        return year_data_mean, year_data_ewm, year_norm

    def normalize_month_wise_data(self, df):
        #Normalizes month-wise data
        month_data_mean = df["Sales"].resample('m').mean()
        month_data_ewm = month_data_mean.ewm(span=5).mean()

        month_norm = (month_data_mean - month_data_mean.min()) / (month_data_mean.max() - month_data_mean.min())

        return month_data_mean, month_data_ewm, month_norm

    def calculate_RFM(self, df, df_dummy):
        # Calculating recency
        recency_df = df.groupby('Customer Name', as_index=False)['Order Date'].max()
        recent_date = recency_df['Order Date'].max()
        recency_df['Recency'] = recency_df['Order Date'].apply(
            lambda x: (recent_date - x).days)
        recency_df.rename(columns={'Order Date':'Last Purchase Date'}, inplace=True)

        # Calculating Frequency
        frequency_df = df.groupby('Customer Name', as_index=False)['Order Date'].count()
        frequency_df.rename(columns={'Order Date':'Frequency'}, inplace=True)

        # Calculating monetary
        monetary_df = df.groupby('Customer Name', as_index=False)['Sales'].sum()
        monetary_df.rename(columns={'Sales':'Monetary'}, inplace=True)

        # Merging all three df in one df
        rfm_df = recency_df.merge(frequency_df, on='Customer Name')
        rfm_df = rfm_df.merge(monetary_df, on='Customer Name')
        rfm_df['Monetary'] = rfm_df['Monetary'].round(2)
        rfm_df.drop(['Last Purchase Date'], axis=1, inplace=True)

        rank_df = rfm_df.copy() # We make copy of rfm_df because we will need RFM features later

        # Normalizing the rank of the customers
        rank_df['r_rank'] = rank_df['Recency'].rank(ascending=False)
        rank_df['f_rank'] = rank_df['Frequency'].rank(ascending=False)
        rank_df['m_rank'] = rank_df['Monetary'].rank(ascending=False)

        rank_df['r_rank_norm'] = (rank_df['r_rank'] / rank_df['r_rank'].max()) * 100
        rank_df['f_rank_norm'] = (rank_df['f_rank'] / rank_df['f_rank'].max()) * 100
        rank_df['m_rank_norm'] = (rank_df['m_rank'] / rank_df['m_rank'].max()) * 100

        rank_df.drop(['r_rank','f_rank','m_rank'], axis=1, inplace=True)

        # Calculating RFM scores
        rank_df['rfm_score'] = (0.15*rank_df['r_rank_norm']) + (0.28*rank_df['f_rank_norm']) + (0.57*rank_df['m_rank_norm'])
        rank_df = rank_df[['Customer Name','rfm_score']]
        rank_df['rfm_score'] = round(rank_df['rfm_score']*0.05, 2)

        # Masks customers' RFM scores by rating conditions to assign customer segments easily
        top_customer_mask = (rank_df['rfm_score'] >= 4.5)
        high_value_mask = ((rank_df['rfm_score']<4.5) & (rank_df['rfm_score']>=4))
        medium_value_mask = ((rank_df['rfm_score']<4) & (rank_df['rfm_score']>=3))
        low_value_mask = ((rank_df['rfm_score']<3) & (rank_df['rfm_score']>=1.6))
        lost_mask = (rank_df['rfm_score'] < 1.6)

        rank_df.loc[top_customer_mask, 'Customer Segment'] = 'Top Customer'
        rank_df.loc[high_value_mask, 'Customer Segment'] = 'High Value Customer'
        rank_df.loc[medium_value_mask, 'Customer Segment'] = 'Medium Value Customer'
        rank_df.loc[low_value_mask, 'Customer Segment'] = 'Low Value Customer'
        rank_df.loc[lost_mask, 'Customer Segment'] = 'Lost Customer'

        # Merge the DataFrames on 'Customer Name'
        merged_rank_dummy = rank_df.merge(df_dummy, on='Customer Name', how='inner')

        # Convert index to datetime
        merged_rank_dummy.index = pd.to_datetime(merged_rank_dummy.index)

        # Set 'Order Date' as the index
        merged_rank_dummy.set_index('Order Date', inplace=True)
        
        return rfm_df, rank_df, merged_rank_dummy
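
    # Worked example of the rfm_score arithmetic above (comments only):
    # normalized ranks r=100, f=50, m=80 give
    #   0.15*100 + 0.28*50 + 0.57*80 = 15.0 + 14.0 + 45.6 = 74.6
    # and, after scaling, round(74.6 * 0.05, 2) = 3.73, which the masks in
    # calculate_RFM() assign to 'Medium Value Customer' (3 <= score < 4).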

    def find_churn_customer(self, df, rfm_df, rank_df):
        #Find time since first purchase for every customer
        first_purchase_df = df.groupby('Customer Name', as_index=False)['Order Date'].min()
        first_purchase_df.rename(columns={'Order Date':'First Purchase Date'}, inplace=True)

        df_final = df.copy() # Copy df so the changes below don't affect the original
        df_final = df_final.merge(first_purchase_df, on='Customer Name',how='left')
        df_final['Time Since First Purchase'] = (df_final['Order Date'].max() -
                                        df_final['First Purchase Date']).dt.days        
        
        # Adds the recency, frequency, monetary, and segment columns found in the previous steps
        df_final = df_final.merge(rfm_df, on='Customer Name', how='left')
        df_final = df_final.merge(rank_df, on='Customer Name', how='left')        

        # Find churned and not churned customers
        churned = (df_final['Customer Segment'] == 'Lost Customer')
        not_churned = (df_final['Customer Segment'] != 'Lost Customer')

        df_final.loc[churned, 'Churned'] = 1
        df_final.loc[not_churned, 'Churned'] = 0
        df_final['Churned'] = df_final['Churned'].astype('int64')

        #Rename Churned column to Churn
        df_final.rename(columns={'Churned':'Churn'}, inplace=True)  

        # Convert index to datetime
        df_final.index = pd.to_datetime(df_final.index)

        # Set 'Order Date' as the index
        df_final.set_index('Order Date', inplace=True)
        
        return df_final
        
    def encode_df(self, df):         
        #Drops columns not needed for modeling
        df.drop(['Order ID', 'Ship Date', 'Customer ID', 'Cat_Sales', 'Day', 'Month', 'Week', 'Quarter'], axis=1, inplace=True)
        
        #Controls the size of the dataset for regression and prediction to suit your computing power
        df=df[df["Year"] == 2018]
        #df = df[(df["Year"] == 2016) & (df["Month"] >= 3) & (df["Month"] <= 7)]
        #Alternatively, select only years 2015-2016, since the full dataset is very large
        #df = df[df['Year'].isin([2015, 2016])]

        # Encodes all non-numeric columns
        non_numeric_columns = df.select_dtypes(exclude=['number']).columns
        label_encoder = LabelEncoder()
        for column in non_numeric_columns:
            df[column] = label_encoder.fit_transform(df[column])

        print(df.head().to_string())    

        #Extracts output and input variables for prediction
        y1 = df['Churn'] # Target for the model
        X1 = df.drop(['Churn'], axis = 1)
        
        y2 = df['Sales'] # Target for regression
        X2 = df.drop(['Sales'], axis = 1)
    
        return df, X1, y1, X2, y2
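
    # LabelEncoder behavior (comments only): classes are sorted, so integer
    # codes are assigned alphabetically within each column:
    #   le = LabelEncoder()
    #   le.fit_transform(['West', 'East', 'West', 'South'])  # -> [2, 0, 2, 1]
    #   le.classes_                              # -> ['East', 'South', 'West']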

    def feat_importance_rf(self, X, y):
        names = X.columns
        rf = RandomForestClassifier()
        rf.fit(X, y)

        result_rf = pd.DataFrame()
        result_rf['Features'] = X.columns
        result_rf ['Values'] = rf.feature_importances_
        result_rf.sort_values('Values', inplace = True, ascending = False)

        return result_rf
    
    def feat_importance_et(self, X, y):
        model = ExtraTreesClassifier()
        model.fit(X, y)

        result_et = pd.DataFrame()
        result_et['Features'] = X.columns
        result_et ['Values'] = model.feature_importances_
        result_et.sort_values('Values', inplace=True, ascending =False)

        return result_et    
    
    def feat_importance_rfe(self, X, y):
        model = LogisticRegression()
        #Creates the RFE model
        rfe = RFE(model)
        rfe = rfe.fit(X, y)

        result_lg = pd.DataFrame()
        result_lg['Features'] = X.columns
        result_lg ['Ranking'] = rfe.ranking_
        result_lg.sort_values('Ranking', inplace=True , ascending = False)

        return result_lg  
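
    # Note on RFE defaults (comments only): with n_features_to_select unset,
    # RFE keeps half of the features; ranking_ is 1 for every kept feature
    # and grows by 1 per elimination round, so sorting by 'Ranking' in
    # descending order lists the weakest features first.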
    
    def save_result(self, y_test, y_pred, fname):
        # Convert y_test and y_pred to pandas Series for easier handling
        y_test_series = pd.Series(y_test)
        y_pred_series = pd.Series(y_pred)
        
        # Calculate y_result_series
        y_result_series = pd.Series(y_pred - y_test == 0)
        y_result_series = y_result_series.map({True: 'True', False: 'False'})

        # Create a DataFrame to hold y_test, y_pred, and y_result
        data = pd.DataFrame({'y_test': y_test_series, 'y_pred': y_pred_series, 'result': y_result_series})

        # Save the DataFrame to a CSV file
        data.to_csv(fname, index=False)
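
Both normalize_* methods above share three steps: resample the datetime-indexed Sales column, smooth it with an exponentially weighted mean, and min-max scale it to [0, 1]. A self-contained sketch of those steps on synthetic data (the values are made up; in the app the index comes from Order Date):

#demo_resample_ewm.py (sketch, not part of the app)
import numpy as np
import pandas as pd

idx = pd.date_range("2018-01-01", periods=120, freq="D")
sales = pd.Series(np.random.default_rng(0).uniform(10, 500, size=120), index=idx)

month_mean = sales.resample('m').mean()    # one mean per month ('M' in newer pandas)
month_ewm = month_mean.ewm(span=5).mean()  # exponential smoothing
month_norm = (month_mean - month_mean.min()) / (month_mean.max() - month_mean.min())
print(month_norm.round(2))                 # four monthly values scaled to [0, 1]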


#helper_plot.py
import matplotlib.pyplot as plt
import tkinter as tk
from tkinter import *
import seaborn as sns
import numpy as np 
import pandas as pd
import sys
from pandastable import Table
from io import StringIO
from sklearn.metrics import confusion_matrix, roc_curve, accuracy_score
from sklearn.model_selection import learning_curve
from process_data import Process_Data
from main_window import Main_Window
from form1 import Form1
from form2 import Form2
from form3 import Form3
from machine_learning import Machine_Learning
from regression import Regression

class Helper_Plot:
    def __init__(self):
        self.obj_window = Main_Window()
        self.obj_data = Process_Data()
        self.obj_reg = Regression()
        self.obj_ml = Machine_Learning()
        

    def shows_table(self, root, df, width, height, title):
       frame = Toplevel(root) #new window
       self.table = Table(frame, dataframe=df, showtoolbar=True, showstatusbar=True)
       
       # Sets dimension of Toplevel
       frame.geometry(f"{width}x{height}")
       frame.title(title)
       self.table.show()

    def plot_missing_values(self, df, figure, canvas, title=""):
        figure.clear()
        ax = figure.add_subplot(1,1,1) 
        #Plots null values
        missing = df.isna().sum().reset_index()
        missing.columns = ['features', 'total_missing']
        missing['percent'] = (missing['total_missing'] / len(df)) * 100
        missing.index = missing['features']
        del missing['features']
        missing['total_missing'].plot(kind = 'bar', ax=ax)
        ax.set_title(title, fontsize = 12)
        ax.set_facecolor('#F0F0F0')
        
        # Set font for tick labels
        ax.tick_params(axis='both', which='major', labelsize=5)
        ax.tick_params(axis='both', which='minor', labelsize=5)        
        figure.tight_layout()
        canvas.draw()

    def plot_corr_coeffs(self, df, figure, canvas):
        figure.clear()
        ax = figure.add_subplot(1,1,1) 
        
        #correlation coefficient of every numeric column with the Sales column
        num_df = df.select_dtypes(include=['number'])
        all_corr = num_df.corr().abs()['Sales'].sort_values(ascending = False)

        # Filters correlations greater than 0.25
        filtered_corr = all_corr[all_corr > 0.25]
    
        # Define a custom color palette (replace with your preferred colors)
        custom_palette = sns.color_palette("Set1", len(filtered_corr))
        filtered_corr.plot(kind='barh', ax=ax, color=custom_palette)
        ax.set_title("Correlation Coefficient of Features with Sales (Threshold > 0.25)", fontsize = 10)
        ax.set_xlabel("Coefficient")
        ax.set_facecolor('#F0F0F0')
        
        # Set font for tick labels
        ax.tick_params(axis='both', which='major', labelsize=5)
        ax.tick_params(axis='both', which='minor', labelsize=5)
        
        ax.grid(True)
        figure.tight_layout()
        canvas.draw()

    # Defines function to create pie chart and bar plot as subplots   
    def plot_piechart(self, df, var, figure, canvas, title='', top_ten=False):
        figure.clear()

        # Optionally filter the DataFrame to consider only the top ten values
        if top_ten:
            value_counts = df[var].value_counts().nlargest(10)
        else:
            value_counts = df[var].value_counts()
        
        # Pie Chart (top subplot)
        ax1 = figure.add_subplot(2,1,1)        
        label_list = list(value_counts.index)
        colors = sns.color_palette("Set1", len(label_list))  
        _, _, autopcts = ax1.pie(value_counts, autopct="%1.1f%%", colors=colors,
            startangle=30, labels=label_list,
            wedgeprops={"linewidth": 2, "edgecolor": "white"},  # Add white edge
            shadow=True, textprops={'fontsize': 7})
        ax1.set_title(title, weight="bold", fontsize=12)

        # Bar Plot (bottom subplot)
        ax2 = figure.add_subplot(2,1,2)
        ax = value_counts.plot(kind="barh", color=colors, alpha=0.8, ax = ax2) 
        for i, j in enumerate(value_counts.values):
            ax.text(.7, i, j, weight="bold", fontsize=7)

        ax2.set_title(title, weight="bold", fontsize=12)
        ax2.set_xlabel("Count")
        ax2.set_facecolor('#F0F0F0')
        figure.tight_layout()
        
        # Autoscale the subplots
        ax1.autoscale()
        ax2.autoscale() 
        
        canvas.draw()

    def plot_piechart_group(self, df, figure, canvas, title="", label=""):
        figure.clear()

        # Pie Chart (top subplot)
        ax1 = figure.add_subplot(2,1,1)        
        label_list = list(df.index)
        print(label_list)
        colors = sns.color_palette("Set1", len(label_list))  
        _, _, autopcts = ax1.pie(df.values, autopct="%1.1f%%", colors=colors,
            startangle=30, labels=label_list,
            wedgeprops={"linewidth": 2, "edgecolor": "white"},  # Add white edge
            shadow=True, textprops={'fontsize': 7})
        ax1.set_title(title, fontsize=10)

        # Bar Plot (bottom subplot)
        ax2 = figure.add_subplot(2,1,2)
        ax = df.plot(kind="barh", color=colors, alpha=0.8, ax = ax2) 
        for i, j in enumerate(df.values):
            ax.text(.7, i, j, weight="bold", fontsize=7)

        ax2.set_title(title, fontsize=10)
        ax2.set_xlabel("Count")
        ax2.set_facecolor('#F0F0F0')
        
        # Set font for tick labels
        ax.tick_params(axis='both', which='major', labelsize=6)
        ax.tick_params(axis='both', which='minor', labelsize=6)          
        figure.tight_layout()
        canvas.draw()

    def plot_scatter(self, df, x, y, hue, figure, canvas):
        figure.clear()  
        ax = figure.add_subplot(1,1,1)    
        sns.scatterplot(data=df, x=x, y=y, hue=hue, palette="Set1", ax=ax)
        ax.set_title(x + " versus " + y + " by " + hue)
        ax.set_xlabel(x)
        ax.set_ylabel(y)
        ax.grid(True)
        ax.legend(facecolor='#E6E6FA', edgecolor='black')
        ax.set_facecolor('#F0F0F0')
        figure.tight_layout()
        canvas.draw()

    #Puts label inside stacked bar
    def put_label_stacked_bar(self, ax,fontsize):
        #patches is everything inside of the chart
        for rect in ax.patches:
            # Find where everything is located
            height = rect.get_height()
            width = rect.get_width()
            x = rect.get_x()
            y = rect.get_y()
    
            # The width of the bar is the data value and can be used as the label
            label_text = f'{width:.0f}'  
    
            # ax.text(x, y, text)
            label_x = x + width / 2
            label_y = y + height / 2

            # adds the label only when the bar width is greater than zero
            if width > 0:
                ax.text(label_x, label_y, label_text, \
                    ha='center', va='center', \
                    weight = "bold",fontsize=fontsize)
    
    #Plots one variable against another variable
    def dist_one_vs_another_plot(self, df, cat1, cat2, figure, canvas, title):
        figure.clear()
        ax1 = figure.add_subplot(1,1,1)

        group_by_stat = df.groupby([cat1, cat2]).size()
        colors = sns.color_palette("Set1", len(df[cat1].unique()))
        group_by_stat.unstack().plot(kind='barh', stacked=True, ax=ax1,color=colors)
        ax1.set_title(title, fontsize=12)
        ax1.set_xlabel('Number of Cases', fontsize=10)
        ax1.set_ylabel(cat1, fontsize=10)
        self.put_label_stacked_bar(ax1,7)
        
        # Set font for tick labels
        ax1.tick_params(axis='both', which='major', labelsize=8)
        ax1.tick_params(axis='both', which='minor', labelsize=8)    
        ax1.legend(facecolor='#E6E6FA', edgecolor='black', fontsize=8) 
        ax1.set_facecolor('#F0F0F0')
        figure.tight_layout()
        canvas.draw()

    def box_plot(self, df, x, y, hue, figure, canvas, title):
        figure.clear()
        ax1 = figure.add_subplot(1,1,1)

        sns.boxplot(data = df, x = x, y = y, hue = hue, ax=ax1)
        ax1.set_title(title, fontsize=14)
        ax1.set_xlabel(x, fontsize=10)
        ax1.set_ylabel(y, fontsize=10)
        ax1.set_facecolor('#F0F0F0')
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        figure.tight_layout()
        canvas.draw()

    def plot_corr_mat(self, df, figure, canvas):
        figure.clear()    
        ax = figure.add_subplot(1,1,1)  
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns 
        df_removed = df.drop(columns=categorical_columns) 
        corrdata = df_removed.corr()

        annot_kws = {"size": 5}
        # Filter correlations greater than 0.1
        mask = abs(corrdata) > 0.1
        filtered_corr = corrdata[mask]
        
        # Drops features that don't meet the threshold
        filtered_corr = filtered_corr.dropna(axis=0, how='all')
        filtered_corr = filtered_corr.dropna(axis=1, how='all')
               
        sns.heatmap(filtered_corr, ax = ax, lw=1, annot=True, cmap="Greens", annot_kws=annot_kws)
        ax.set_title('Correlation Matrix (Threshold > 0.1)', fontweight="bold", fontsize=10)

        # Set font for x and y labels
        ax.set_xlabel('Features', fontweight="bold", fontsize=12)
        ax.set_ylabel('Features', fontweight="bold", fontsize=12)

        # Set font for tick labels
        ax.tick_params(axis='both', which='major', labelsize=5)
        ax.tick_params(axis='both', which='minor', labelsize=5)

        figure.tight_layout()
        canvas.draw()

    def dataset_info(self, df):
        win = tk.Toplevel()
        form2 = Form2(win)  
        win.title("Dataset Information")

        # Capture sys.stdout
        original_stdout = sys.stdout
        sys.stdout = StringIO()

        # Get df.info() output
        df.info()

        # Get the string value
        info_string = sys.stdout.getvalue()

        # Reset sys.stdout
        sys.stdout = original_stdout

        # Insert the info string into the Text widget
        form2.text.insert(tk.END, info_string)
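
    # Simpler alternative (comments only): df.info() can write straight into
    # a buffer through its buf parameter, avoiding the sys.stdout swap above:
    #   buf = StringIO()
    #   df.info(buf=buf)
    #   info_string = buf.getvalue()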

    def dataset_describe(self, df):
        win = tk.Toplevel()
        form2 = Form2(win)  
        win.title("Statistical Description")

        # Capture df.describe() output as a string
        describe_string_io = StringIO()
        df.describe().to_string(buf=describe_string_io)

        # Get the string value
        describe_string = describe_string_io.getvalue()

        # Insert the info string into the Text widget
        form2.text.insert(tk.END, describe_string)

    def display_null_counts(self, df):
        win = tk.Toplevel()
        form2 = Form2(win)  
        win.title("Null Value Counts")

        # Capture sys.stdout
        original_stdout = sys.stdout
        sys.stdout = StringIO()

        # Get df.isnull().sum() output
        null_counts = df.isnull().sum()

        # Get the string value
        null_counts_string = null_counts.to_string()

        # Reset sys.stdout
        sys.stdout = original_stdout

        # Insert the info string into the Text widget
        form2.text.insert(tk.END, null_counts_string)

    def display_null_postal_code_rows(self, df):
        win = tk.Toplevel()
        form2 = Form2(win)  
        win.title("Postal Code Null Value Counts")
        
        # Filter rows where "Postal Code" is null
        null_postal_code_df = df[df["Postal Code"].isnull()]

        # Convert DataFrame to string representation
        null_postal_code_str = null_postal_code_df.to_string(index=False)

        # Insert the string into the ScrolledText widget
        form2.text.insert(tk.END, null_postal_code_str)
                
    def plot_missing_values_and_coeff(self, df0, df1):
        win = tk.Toplevel()
        form1 = Form1(win)
        win.title("Missing Values and Correlation Coefficients")
        self.plot_missing_values(df0, form1.figure1, form1.canvas1, "Before Filling Null Values")
        self.plot_missing_values(df1, form1.figure2, form1.canvas2, "After Filling Null Values")
   
    def plot_case_distribution(self, df, var1, var2, title="", label="", top_ten=False):
        win = tk.Toplevel()
        form1 = Form1(win)
        win.title(title)
        self.plot_piechart(df, var1, form1.figure1, form1.canvas1, "Case Distribution of " + label + var1, top_ten)
        self.plot_piechart(df, var2, form1.figure2, form1.canvas2, "Case Distribution of " + label + var2, top_ten)

    def binds_menu_open_dataset(self, df_before, df_after, root, window):        
        window.dataset_menu.entryconfigure("View Dataset",
            command =lambda:self.shows_table(root, df_after, 1250, 600, "Superstore Sales Dataset"))  

        window.dataset_menu.entryconfigure("Dataset Information",
            command =lambda:self.dataset_info(df_after)) 

        window.dataset_menu.entryconfigure("Statistical Description",
            command =lambda:self.dataset_describe(df_after)) 
        
        window.dataset_menu.entryconfigure("Null Values",
            command =lambda:self.display_null_counts(df_before))         
        
        window.dataset_menu.entryconfigure("Postal Code Null Values",
            command =lambda:self.display_null_postal_code_rows(df_before))  
        
    def binds_features_distribution(self, window, df0, df1, df2):        
        window.dist_menu.entryconfigure("Missing Values",
            command = lambda:self.plot_missing_values_and_coeff(df0, df1))  

        window.dist_menu.entryconfigure("Day and Month",
            command = lambda:self.plot_case_distribution(df2, "Day", "Month", 
                "The Case Distribution of Day and Month"))

        window.dist_menu.entryconfigure("Quarter and Year",
            command = lambda:self.plot_case_distribution(df2, "Quarter", "Year",
                "The Case Distribution of Quarter and Year"))

        window.dist_menu.entryconfigure("Country and City",
            command = lambda:self.plot_case_distribution(df2, "Country", "City", 
                "The Case Distribution of Top Ten Country and Year", " Top 10 ", top_ten=True))

        window.dist_menu.entryconfigure("State and Region",
            command = lambda:self.plot_case_distribution(df2, "State", "Region",
                "The Case Distribution of State and Year", " Top 10 ", top_ten=True))

        window.dist_menu.entryconfigure("Customer Name and Customer ID",
            command = lambda:self.plot_case_distribution(df2, "Customer Name", "Customer ID",
                "The Case Distribution of Customer Name and Customer ID", " Top 10 ", top_ten=True))

        window.dist_menu.entryconfigure("Ship Mode and Segment",
            command = lambda:self.plot_case_distribution(df2, "Ship Mode", "Segment",
                "The Case Distribution of Ship Mode and Segment"))

        window.dist_menu.entryconfigure("Product Name and Product ID",
            command = lambda:self.plot_case_distribution(df2, "Product Name", "Product ID",
                "The Case Distribution of Product Name and Product ID", " Top 10 ", top_ten=True))

        window.dist_menu.entryconfigure("Category and Sub-Category",
            command = lambda:self.plot_case_distribution(df2, "Category", "Sub-Category",
                "The Case Distribution of Category and Sub-Category"))

    def plot_box_distribution(self, df, var1="", var2="", var3="", title=""):        
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title(title)
        self.box_plot(df, var1, var2, var3, form3.figure1, form3.canvas1, title)
        
    def plot_categorized_distribution(self, df, var1="", var2="", var3="", 
        var4="", title1="", title2=""):        
        win = tk.Toplevel()
        form1 = Form1(win)
        
        self.dist_one_vs_another_plot(df, var1, var2, form1.figure1, form1.canvas1, title1)
        self.dist_one_vs_another_plot(df, var3, var4, form1.figure2, form1.canvas2, title2)

    def plot_grouped_distribution(self, df, var1="", var2="", var3="", 
        var4="", title1="", title2="", label="", mode=""):
        win = tk.Toplevel()
        form1 = Form1(win)
        win.title("Distribution of " + var2 + " by " + var1 + " and " + var3)
        sum_by_cat1 = df.groupby(var1)[var2].sum()
        sum_by_cat_top_ten1 = sum_by_cat1.nlargest(10)
        sum_by_cat2 = df.groupby(var3)[var4].sum()
        sum_by_cat_top_ten2 = sum_by_cat2.nlargest(10)        
        
        if mode == "":       
            self.plot_piechart_group(sum_by_cat1, form1.figure1, form1.canvas1, title1, label)
            self.plot_piechart_group(sum_by_cat2, form1.figure2, form1.canvas2, title2, label)
        else:
            self.plot_piechart_group(sum_by_cat_top_ten1, form1.figure1, form1.canvas1, title1, label)
            self.plot_piechart_group(sum_by_cat_top_ten2, form1.figure2, form1.canvas2, title2, label)        


    def binds_categories_distribution(self, window, df): 
        window.dist_cat.entryconfigure("Sales by Year and Quarter",
            command = lambda:self.plot_grouped_distribution(df, "Year", "Sales", 
            "Quarter", "Sales", "Sales by Year", "Sales by Quarter", "Sales")) 

        window.dist_cat.entryconfigure("Sales by Day and Month",
            command = lambda:self.plot_grouped_distribution(df, "Day", "Sales", 
            "Month", "Sales", "Sales by Day", "Sales by Month", "Sales")) 

        window.dist_cat.entryconfigure("Sales by Ship Mode and Segment",
            command = lambda:self.plot_grouped_distribution(df, "Ship Mode", "Sales", 
            "Segment", "Sales", "Sales by Ship Mode", "Sales by Segment", "Sales")) 
        
        window.dist_cat.entryconfigure("Sales by Category and Sub-Category",
            command = lambda:self.plot_grouped_distribution(df, "Category", "Sales", 
            "Sub-Category", "Sales", "Sales by Category", "Sales by Sub-Category", "Sales")) 

        window.dist_cat.entryconfigure("Sales by Product Name and Product ID",
            command = lambda:self.plot_grouped_distribution(df, "Product Name", "Sales", 
            "Product ID", "Sales", "Sales by Product Name", "Sales by Product ID", "Sales", "top-ten")) 

        window.dist_cat.entryconfigure("Sales by Customer Name and Customer ID",
            command = lambda:self.plot_grouped_distribution(df, "Customer Name", "Sales", 
            "Customer ID", "Sales", "Sales by Customer Name", "Sales by Customer ID", "Sales", "top-ten")) 

        window.dist_cat.entryconfigure("Sales by City and State",
            command = lambda:self.plot_grouped_distribution(df, "City", "Sales", 
            "State", "Sales", "Sales by City", "Sales by State", "Sales", "top-ten")) 

        window.dist_cat.entryconfigure("Categorized Sales",
            command = lambda:self.plot_case_distribution(df, "Cat_Sales", "Category",
                "The Case Distribution of Categorized Sales and Category"))

        window.dist_cat.entryconfigure("Categorized Sales by Year and Quarter",
            command = lambda:self.plot_categorized_distribution(df, "Year", "Cat_Sales",
                "Quarter", "Cat_Sales",                                                
                "The Case Distribution of Categorized Sales by Year",
                "The Case Distribution of Categorized Sales by Quarter"))

        window.dist_cat.entryconfigure("Categorized Sales by Day and Month",
            command = lambda:self.plot_categorized_distribution(df, "Day", "Cat_Sales",
                "Month", "Cat_Sales",                                                
                "The Case Distribution of Categorized Sales by Day",
                "The Case Distribution of Categorized Sales by Month"))

        window.dist_cat.entryconfigure("Categorized Sales by Segment and Sub-Category",
            command = lambda:self.plot_categorized_distribution(df, "Segment", "Cat_Sales",
                "Sub-Category", "Cat_Sales",                                                
                "The Case Distribution of Categorized Sales by Segment",
                "The Case Distribution of Categorized Sales by Sub-Category"))

        window.dist_cat.entryconfigure("Categorized Sales by Region and State",
            command = lambda:self.plot_categorized_distribution(df, "Region", "Cat_Sales",
                "State", "Cat_Sales",                                                
                "The Case Distribution of Categorized Sales by Region",
                "The Case Distribution of Categorized Sales by State"))

        window.dist_cat.entryconfigure("Day versus Sales Per Category",
            command = lambda:self.plot_box_distribution(df, "Day", "Sales", "Category",
                "Day versus Sales Per Category"))

        window.dist_cat.entryconfigure("Month versus Sales Per Segment",
            command = lambda:self.plot_box_distribution(df, "Month", "Sales", "Segment",
                "Month versus Sales Per Segment"))
        
        window.dist_cat.entryconfigure("Sub-Category versus Sales Per Year",
            command = lambda:self.plot_box_distribution(df, "Sub-Category", "Sales", "Year",
                "Sub-Category versus Sales Per Year"))        
        
        window.dist_cat.entryconfigure("Region versus Sales Per Quarter",
            command = lambda:self.plot_box_distribution(df, "Region", "Sales", "Quarter",
                "Region versus Sales Per Quarter"))         

    def choose_plot(self, df1, df2, chosen, figure1, canvas1, figure2, canvas2):
        print(chosen)
        if chosen == "Day":
            self.plot_piechart(df2, "Day", figure1, canvas1, "Case Distribution of Day")

        elif chosen == "Month":
            self.plot_piechart(df2, "Month", figure2, canvas2, "Case Distribution of Month")

        elif chosen == "Quarter":
            self.plot_piechart(df2, "Quarter", figure1, canvas1, "Case Distribution of Quarter")            

        elif chosen == "Year":
            self.plot_piechart(df2, "Year", figure2, canvas2, "Case Distribution of Year")              

        elif chosen == "Missing Values":
            self.plot_missing_values(df1, figure1, canvas1)

        elif chosen == "Correlation Coefficient":
            self.plot_corr_coeffs(df1, figure2, canvas2)

        elif chosen == "Country and City":
            self.plot_piechart(df2, "Country", figure1, canvas1, "Case Distribution of Country")
            self.plot_piechart(df2, "City", figure2, canvas2, "Case Distribution of City", top_ten=True)

        elif chosen == "State and Region":
            self.plot_piechart(df2, "State", figure1, canvas1, "Case Distribution of State", top_ten=True)
            self.plot_piechart(df2, "Region", figure2, canvas2, "Case Distribution of Region")       

        elif chosen == "Customer Name and Customer ID":
            self.plot_piechart(df2, "Customer Name", figure1, canvas1, "Case Distribution of Customer Name", top_ten=True)
            self.plot_piechart(df2, "Customer ID", figure2, canvas2, "Case Distribution of Customer ID", top_ten=True)  

        elif chosen == "Ship Mode and Segment":
            self.plot_piechart(df2, "Ship Mode", figure1, canvas1, "Case Distribution of Ship Mode")
            self.plot_piechart(df2, "Segment", figure2, canvas2, "Case Distribution of Segment")  

        elif chosen == "Product Name and Product ID":
            self.plot_piechart(df2, "Product Name", figure1, canvas1, "Case Distribution of Product Name", top_ten=True)
            self.plot_piechart(df2, "Product ID", figure2, canvas2, "Case Distribution of Product ID", top_ten=True) 

        elif chosen == "Category and Sub-Category":
            self.plot_piechart(df2, "Category", figure1, canvas1, "Case Distribution of Category", top_ten=True)
            self.plot_piechart(df2, "Sub-Category", figure2, canvas2, "Case Distribution of Sub-Category", top_ten=True) 

        elif chosen == "Sales by Year and Quarter":
            self.plot_piechart_group(df2.groupby('Year')['Sales'].sum(), figure1, canvas1, "Sales by Year")
            self.plot_piechart_group(df2.groupby('Quarter')['Sales'].sum(), figure2, canvas2, "Sales by Quarter")

        elif chosen == "Sales by Day and Month":
            self.plot_piechart_group(df2.groupby('Day')['Sales'].sum(), figure1, canvas1, "Sales by Day")
            self.plot_piechart_group(df2.groupby('Month')['Sales'].sum(), figure2, canvas2, "Sales by Month")            

        elif chosen == "Sales by Ship Mode and Segment":
            self.plot_piechart_group(df2.groupby('Ship Mode')['Sales'].sum(), figure1, canvas1, "Sales by Ship Mode")
            self.plot_piechart_group(df2.groupby('Segment')['Sales'].sum(), figure2, canvas2, "Sales by Segment")              

        elif chosen == "Sales by Category and Sub-Category":
            self.plot_piechart_group(df2.groupby('Category')['Sales'].sum(), figure1, canvas1, "Sales by Category")
            self.plot_piechart_group(df2.groupby('Sub-Category')['Sales'].sum(), figure2, canvas2, "Sales by Sub-Category") 

        elif chosen == "Sales by Product Name and Product ID":
            self.plot_piechart_group(df2.groupby('Product Name')['Sales'].sum().nlargest(10), figure1, canvas1, "Sales by Product Name")
            self.plot_piechart_group(df2.groupby('Product ID')['Sales'].sum().nlargest(10), figure2, canvas2, "Sales by Product ID") 

        elif chosen == "Sales by Customer Name and Customer ID":
            self.plot_piechart_group(df2.groupby('Customer Name')['Sales'].sum().nlargest(10), 
                figure1, canvas1, "Sales by Customer Name")
            self.plot_piechart_group(df2.groupby('Customer ID')['Sales'].sum().nlargest(10), 
                figure2, canvas2, "Sales by Customer ID") 

        elif chosen == "Sales by City and State":
            self.plot_piechart_group(df2.groupby('City')['Sales'].sum().nlargest(10), 
                figure1, canvas1, "Sales by City")
            self.plot_piechart_group(df2.groupby('State')['Sales'].sum().nlargest(10), 
                figure2, canvas2, "Sales by State") 

        elif chosen == "Categorized Sales by Year and Quarter":
            self.dist_one_vs_another_plot(df2, "Year", "Cat_Sales", figure1, canvas1, "Categorized Sales by Year")              
            self.dist_one_vs_another_plot(df2, "Quarter", "Cat_Sales", figure2, canvas2, "Categorized Sales by Quarter")

        elif chosen == "Categorized Sales by Day and Month":
            self.dist_one_vs_another_plot(df2, "Day", "Cat_Sales", figure1, canvas1, "Categorized Sales by Day")              
            self.dist_one_vs_another_plot(df2, "Month", "Cat_Sales", figure2, canvas2, "Categorized Sales by Month")         

        elif chosen == "Categorized Sales by Segment and Sub-Category":
            self.dist_one_vs_another_plot(df2, "Segment", "Cat_Sales", figure1, canvas1, "Categorized Sales by Segment")              
            self.dist_one_vs_another_plot(df2, "Sub-Category", "Cat_Sales", figure2, canvas2, "Categorized Sales by Sub-Category")

        elif chosen == "Categorized Sales by Region and State":
            self.dist_one_vs_another_plot(df2, "Region", "Cat_Sales", figure1, canvas1, "Categorized Sales by Region")              
            self.dist_one_vs_another_plot(df2, "State", "Cat_Sales", figure2, canvas2, "Categorized Sales by State")

        elif chosen == "Correlation Matrix":
            self.plot_corr_mat(df1, figure1, canvas1)
            
    def line_plot_year_wise(self, df, feat, year1, year2, figure, canvas):
        figure.clear()    
        ax1 = figure.add_subplot(2, 1, 1)  
        data1 = df[df["Year"]==year1]
        data2 = df[df["Year"]==year2]
        # Convert the column and index to NumPy arrays
        date_index1 = data1.index.to_numpy()
        date_index2 = data2.index.to_numpy()

        # Line plot
        ax1.plot(date_index1, data1[feat].to_numpy(), 
            color="red", marker='o', linestyle='-', linewidth=2, markersize=1, label=feat)
        ax1.set_xlabel('YEAR')
        ax1.set_title(feat + ' (YEAR = ' + str(year1) + ')', fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        ax2 = figure.add_subplot(2, 1, 2) 
        ax2.plot(date_index2, data2[feat].to_numpy(), 
            color="blue", marker='o', linestyle='-', linewidth=2, markersize=1, label=feat)
        ax2.set_xlabel('DATE')
        ax2.set_title(feat + ' (YEAR = ' + str(year2) + ')', fontsize=12)
        ax2.legend(facecolor='#E6E6FA', edgecolor='black')
        ax2.set_facecolor('#F0F0F0')
        ax2.grid(True)

        figure.tight_layout()
        canvas.draw()

    def line_plot_norm_data(self, norm_data, figure, canvas, label, title):
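        #Plots a single normalized series against its date index; 'label' captions
        #the x-axis and 'title' the chart, both supplied by the caller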
        figure.clear()    
        ax = figure.add_subplot(1, 1, 1)  
        
        # Convert the column and index to NumPy arrays
        date_index = norm_data.index.to_numpy()

        values = norm_data.to_numpy()
        ax.plot(date_index, values, marker='o', linestyle='-', 
                        linewidth=3, markersize=2, label="SALES")

        ax.set_xlabel(label)
        ax.set_ylabel("SALES")
        ax.set_title(title, fontsize=12)
        ax.legend(fontsize=7, facecolor='#E6E6FA', edgecolor='black')
        ax.set_facecolor('#F0F0F0')
        ax.grid(True)

        figure.tight_layout()
        canvas.draw()

    def line_plot_data_mean_ewm(self, data_mean, data_ewm, figure, canvas, xlabel, ylabel):
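        #Overlays the plain mean and the exponentially weighted mean (EWM) of sales.
        #The EWM weights recent observations more heavily, so it reacts to turns in
        #the data sooner than the plain mean. A minimal sketch of how such a pair
        #might be built (the real series come from Process_Data elsewhere in this
        #listing, so this is an assumption about their shape):
        #    data_mean = df['Sales'].resample('M').mean()
        #    data_ewm = data_mean.ewm(alpha=0.5).mean()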
        figure.clear()    
        ax1 = figure.add_subplot(1, 1, 1)  

        # Convert the column and index to NumPy arrays
        date_index = data_mean.index.to_numpy()

        # Line plot
        ax1.plot(date_index, data_mean.to_numpy(), 
            color="red", marker='o', linestyle='-', linewidth=2, markersize=1, label="Mean")
        ax1.plot(date_index,data_ewm.to_numpy(), 
            color="blue", marker='o', linestyle='-', linewidth=2, markersize=1, label="EWM")
        ax1.set_title("Year-Wise Mean/EWM of Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel(xlabel)
        ax1.set_ylabel(ylabel)
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()
        
    def plot_year_wise(self, df, feat, year1, year2):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.line_plot_year_wise(df, feat, year1, year2, form3.figure1, form3.canvas1)

    def plot_year_wise_mean_ewm(self, data_mean, data_ewm):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.line_plot_data_mean_ewm(data_mean, data_ewm, form3.figure1, form3.canvas1, "YEAR", "SALES")

    def plot_year_trends(self, df, var1, var2, title=""):        
        win = tk.Toplevel()
        form1 = Form1(win)
        self.box_violin_strip_heat(df, var1, var2, form1.figure1, form1.canvas1, 
            form1.figure2, form1.canvas2, title)

    def box_violin_strip_heat(self, data, filter, feat1, figure1, canvas1, figure2, canvas2, title):
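        #Draws four seaborn categorical plots: box and violin plots on the first
        #figure, strip and swarm plots on the second, each showing how feat1 is
        #distributed within every level of filter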
        figure1.clear()    
        ax1 = figure1.add_subplot(2, 1, 1)
        sns.boxplot(x = filter, y = feat1, data = data, ax=ax1)       
        ax1.set_title("Box Plot of " + feat1 + " by " + filter, fontsize=12)       
        # Set font for tick labels
        ax1.tick_params(axis='both', which='major', labelsize=6)
        ax1.tick_params(axis='both', which='minor', labelsize=6)
        ax1.grid(True)
        ax1.set_facecolor('#F0F0F0')

        ax2 = figure1.add_subplot(2, 1, 2)
        sns.violinplot(x = filter, y = feat1, data = data, ax=ax2)       
        ax2.set_title("Violin Plot of " + feat1 + " by " + filter, fontsize=12)  
        # Set font for tick labels
        ax2.tick_params(axis='both', which='major', labelsize=6)
        ax2.tick_params(axis='both', which='minor', labelsize=6)  
        ax2.grid(True)     
        ax2.set_facecolor('#F0F0F0')
        figure1.tight_layout()
        canvas1.draw()

        figure2.clear()    
        ax3 = figure2.add_subplot(2, 1, 1)
        sns.stripplot(x = filter, y = feat1, data = data, ax=ax3)       
        ax3.set_title("Strip Plot of " + feat1 + " by " + filter, fontsize=12)  
        # Set font for tick labels
        ax3.tick_params(axis='both', which='major', labelsize=6)
        ax3.tick_params(axis='both', which='minor', labelsize=6)
        ax3.set_facecolor('#F0F0F0')
        ax3.grid(True)

        ax4 = figure2.add_subplot(2, 1, 2)
        sns.swarmplot(x = filter, y = feat1, data = data, ax=ax4)
        ax4.set_title("Swarm Plot of " + feat1 + " by " + filter, fontsize=12)
        # Set font for tick labels
        ax4.tick_params(axis='both', which='major', labelsize=6)
        ax4.tick_params(axis='both', which='minor', labelsize=6) 
        ax4.grid(True)
        ax4.set_facecolor('#F0F0F0')
        figure2.tight_layout()
        canvas2.draw()
        
    def binds_year_wise(self, window, df, data_mean, data_ewm): 
        window.year_wise.entryconfigure("Year-Wise Sales Distribution 2017 and 2018",
            command = lambda:self.plot_year_wise(df, "Sales", 2017, 2018)) 

        window.year_wise.entryconfigure("Year-Wise Sales Distribution 2015 and 2016",
            command = lambda:self.plot_year_wise(df, "Sales", 2015, 2016)) 

        window.year_wise.entryconfigure("Year-Wise Sales Mean and EWM",
            command = lambda:self.plot_year_wise_mean_ewm(data_mean, data_ewm)) 

        window.year_wise.entryconfigure("Sales by Year",
            command = lambda:self.plot_year_trends(df, "Year", "Sales")) 

        window.year_wise.entryconfigure("Sales by Quarter",
            command = lambda:self.plot_year_trends(df, "Quarter", "Sales")) 

        window.year_wise.entryconfigure("Sales by Month",
            command = lambda:self.plot_year_trends(df, "Month", "Sales")) 

        window.year_wise.entryconfigure("Sales by Day",
            command = lambda:self.plot_year_trends(df, "Day", "Sales")) 

        window.year_wise.entryconfigure("Sales by Week",
            command = lambda:self.plot_year_trends(df, "Week", "Sales"))                 


    def choose_year_wise(self, df, data_mean, data_ewm, data_norm, chosen, figure1, canvas1, figure2, canvas2): 
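        #Dispatches the year-wise combo selection to the matching plot routine,
        #alternating between the two embedded figure/canvas pairs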
        if chosen == "Year-Wise Sales Distribution 2017 and 2018":
            self.line_plot_year_wise(df, "Sales", 2017, 2018, figure1, canvas1)

        if chosen == "Year-Wise Sales Distribution 2015 and 2016":
            self.line_plot_year_wise(df, "Sales", 2015, 2016, figure2, canvas2)   

        if chosen == "Year-Wise Sales Mean and EWM":
            self.line_plot_data_mean_ewm(data_mean, data_ewm, figure1, canvas1, "YEAR", "SALES") 

        if chosen == "Normalized Year-Wise Data":
            self.line_plot_norm_data(data_norm, figure2, canvas2, "YEAR", chosen) 

        if chosen == "Adj Close by Year":
            self.box_violin_strip_heat(df, "Year", "Adj Close", figure1, canvas1, figure2, canvas2, 
                "Year-Wise Normalized ")

    def line_plot_month_wise(self, df, feat1, year, filter, filter1, filter2, figure, canvas):
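        #Compares one feature across two values of a filter column (two quarters or
        #two months, say) within the same year, as two stacked line charts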
        figure.clear()    
        ax1 = figure.add_subplot(2, 1, 1)  

        data1 = df[(df["Year"]==year)&(df[filter]==filter1)]
        data2 = df[(df["Year"]==year)&(df[filter]==filter2)]
        
        # Convert the column and index to NumPy arrays
        date_index1 = data1.index.to_numpy()
        date_index2 = data2.index.to_numpy()

        # Line plot
        ax1.plot(date_index1, data1[feat1].to_numpy(), 
            color="red", marker='o', linestyle='-', linewidth=2, markersize=1, label=feat1)
        ax1.set_xlabel('DATE')
        ax1.set_ylabel(feat1)
        ax1.set_title(feat1 + " " + filter + " = " + filter1 + " " + str(year), fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        # Set font for tick labels
        ax1.tick_params(axis='both', which='major', labelsize=7)
        ax1.tick_params(axis='both', which='minor', labelsize=7)

        ax2 = figure.add_subplot(2, 1, 2) 
        ax2.plot(date_index2, data2[feat1].to_numpy(), 
            color="red", marker='o', linestyle='-', linewidth=2, markersize=1, label=feat1)
        ax2.set_xlabel('DATE')
        ax2.set_ylabel(feat1)
        ax2.set_title(feat1 + " " + filter + " = " + filter2 + " " + str(year), fontsize=12)
        ax2.legend(facecolor='#E6E6FA', edgecolor='black')
        ax2.set_facecolor('#F0F0F0')
        ax2.grid(True)

        # Set font for tick labels
        ax2.tick_params(axis='both', which='major', labelsize=7)
        ax2.tick_params(axis='both', which='minor', labelsize=7)

        figure.tight_layout()
        canvas.draw()

    def color_month(self, month):
        #Maps a month number (1-12) to its (name, line color) pair
        months = {1: ('January', 'blue'), 2: ('February', 'green'), 
            3: ('March', 'orange'), 4: ('April', 'yellow'), 
            5: ('May', 'red'), 6: ('June', 'violet'), 
            7: ('July', 'purple'), 8: ('August', 'black'), 
            9: ('September', 'brown'), 10: ('October', 'darkblue'), 
            11: ('November', 'grey'), 12: ('December', 'pink')}
        return months[month]

    def line_plot_month(self, month, data, ax):
        label, color = self.color_month(month)
        mdata = data[data.index.month == month]       
        date_index = mdata.index.to_numpy()

        ax.plot(date_index, mdata.to_numpy(), 
            marker='o', linestyle='-', 
            color=color, linewidth=2, markersize=1, label=label)

    def sns_plot_month(self, monthly_data, title, figure, canvas):
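        #Despite the 'sns_' prefix this uses plain matplotlib: it overlays the
        #twelve per-month series on one axis, one color per month via color_month()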
        figure.clear()    
        ax = figure.add_subplot(1, 1, 1)         
        ax.set_title(title, fontsize=12)
        ax.set_xlabel('YEAR', fontsize=10)
        ax.set_ylabel("SALES", fontsize=10)

        for i in range(1,13):
            self.line_plot_month(i, monthly_data, ax)

        ax.legend(facecolor='#E6E6FA', edgecolor='black')
        ax.grid()
        ax.set_facecolor('#F0F0F0')
        figure.tight_layout()
        canvas.draw()

    def plot_month_wise(self, df, feat1, year, filter, filter1, filter2):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.line_plot_month_wise(df, feat1, year, filter, 
            filter1, filter2, form3.figure1, form3.canvas1)

    def plot_month_wise_mean_ewm(self, data_mean, data_ewm):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.line_plot_data_mean_ewm(data_mean, data_ewm, form3.figure1, form3.canvas1, "YEAR", "SALES")

    def plot_month_wise_by_month(self, data_mean, title):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.sns_plot_month(data_mean, title, form3.figure1, form3.canvas1)

    def month_wise_region_based(self, df, figure, canvas):
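        #Splits the frame by Region, then resamples each slice's Sales into calendar
        #months; resample('M') labels every bucket with its month-end date (newer
        #pandas releases prefer the equivalent alias 'ME')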
        figure.clear()           

        west_df = df.loc[df['Region'] == 'West']
        east_df = df.loc[df['Region'] == 'East']
        south_df = df.loc[df['Region'] == 'South']
        central_df = df.loc[df['Region'] == 'Central']

        west_monthly_sales = west_df['Sales'].resample('M').sum()
        west_monthly_sales = west_monthly_sales.round(2)

        east_monthly_sales = east_df['Sales'].resample('M').sum()
        east_monthly_sales = east_monthly_sales.round(2)

        south_monthly_sales = south_df['Sales'].resample('M').sum()
        south_monthly_sales = south_monthly_sales.round(2)

        central_monthly_sales = central_df['Sales'].resample('M').sum()
        central_monthly_sales = central_monthly_sales.round(2)

        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(west_monthly_sales.index.to_numpy(), west_monthly_sales.values, color="red", marker='o', linestyle='-', label="West")
        ax1.plot(east_monthly_sales.index.to_numpy(), east_monthly_sales.values, color="blue", marker='o', linestyle='-', label="East")
        ax1.plot(south_monthly_sales.index.to_numpy(), south_monthly_sales.values, color="green", marker='o', linestyle='-', label="South")
        ax1.plot(central_monthly_sales.index.to_numpy(), central_monthly_sales.values, color="black", marker='o', linestyle='-', label="Central")

        ax1.set_title("Region-Based Monthly Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Sales")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def month_wise_category_based(self, df, figure, canvas):
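        #Monthly sums here count order rows rather than revenue: each row gets
        #Quantity = 1, so resample('M').sum() yields units sold per month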
        figure.clear()           

        #Creates the counting column on a copy, so the caller's frame is not mutated
        df = df.copy()
        df['Quantity'] = 1

        office_supplies_df = df.loc[df['Category'] == 'Office Supplies']
        technology_df = df.loc[df['Category'] == 'Technology']
        furniture_df = df.loc[df['Category'] == 'Furniture']

        # Find how many quantities sold per month for each category
        monthly_office = office_supplies_df['Quantity'].resample('M').sum()
        monthly_technology = technology_df['Quantity'].resample('M').sum()
        monthly_furniture = furniture_df['Quantity'].resample('M').sum()

        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(monthly_office.index.to_numpy(), monthly_office.values, color="red", marker='o', linestyle='-', label="Office Products Sales Quantities by Month")
        ax1.plot(monthly_technology.index.to_numpy(), monthly_technology.values, color="blue", marker='o', linestyle='-', label="Technology Products Sales Quantities by Month")
        ax1.plot(monthly_furniture.index.to_numpy(), monthly_furniture.values, color="green", marker='o', linestyle='-', label="Furniture Products Sales Quantities by Month")

        ax1.set_title("Category-Based Monthly Quantity", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Quantity")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def month_wise_segment_based(self, df, figure, canvas):
        figure.clear()           

        corp_df = df.loc[df['Segment'] == 'Corporate']
        office_df = df.loc[df['Segment'] == 'Home Office']
        cons_df = df.loc[df['Segment'] == 'Consumer']


        corp_monthly_sales = corp_df['Sales'].resample('M').sum()
        corp_monthly_sales = corp_monthly_sales.round(2)

        office_monthly_sales = office_df['Sales'].resample('M').sum()
        office_monthly_sales = office_monthly_sales.round(2)

        cons_monthly_sales = cons_df['Sales'].resample('M').sum()
        cons_monthly_sales = cons_monthly_sales.round(2)

        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(corp_monthly_sales.index.to_numpy(), corp_monthly_sales.values, color="red", marker='o', linestyle='-', label="Corporate")
        ax1.plot(office_monthly_sales.index.to_numpy(), office_monthly_sales.values, color="blue", marker='o', linestyle='-', label="Home Office")
        ax1.plot(cons_monthly_sales.index.to_numpy(), cons_monthly_sales.values, color="green", marker='o', linestyle='-', label="Consumer")

        ax1.set_title("Segment-Based Monthly Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Sales")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def month_wise_city_based(self, df, figure, canvas):
        figure.clear()           

        #Keeps the five plotted cities; the other city slices were never used
        los_df = df.loc[df['City'] == 'Los Angeles']
        york_df = df.loc[df['City'] == 'New York City']
        phil_df = df.loc[df['City'] == 'Philadelphia']
        san_df = df.loc[df['City'] == 'San Francisco']
        sea_df = df.loc[df['City'] == 'Seattle']
        
        los_monthly_sales = los_df['Sales'].resample('M').sum()
        los_monthly_sales = los_monthly_sales.round(2)

        york_monthly_sales = york_df['Sales'].resample('M').sum()
        york_monthly_sales = york_monthly_sales.round(2)

        phils_monthly_sales = phil_df['Sales'].resample('M').sum()
        phils_monthly_sales = phils_monthly_sales.round(2)

        san_monthly_sales = san_df['Sales'].resample('M').sum()
        san_monthly_sales = san_monthly_sales.round(2)

        sea_monthly_sales = sea_df['Sales'].resample('M').sum()
        sea_monthly_sales = sea_monthly_sales.round(2)
        
        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(los_monthly_sales.index.to_numpy(), los_monthly_sales.values, color="red", marker='o', linestyle='-', label="Los Angeles")
        ax1.plot(york_monthly_sales.index.to_numpy(), york_monthly_sales.values, color="blue", marker='o', linestyle='-', label="New York City")
        ax1.plot(phils_monthly_sales.index.to_numpy(), phils_monthly_sales.values, color="green", marker='o', linestyle='-', label="Philadelphia")
        ax1.plot(san_monthly_sales.index.to_numpy(), san_monthly_sales.values, color="black", marker='o', linestyle='-', label="San Francisco")
        ax1.plot(sea_monthly_sales.index.to_numpy(), sea_monthly_sales.values, color="cyan", marker='o', linestyle='-', label="Seattle")

        ax1.set_title("City-Based Monthly Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Sales")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def month_wise_shipmode_based(self, df, figure, canvas):
        figure.clear()           

        std_df = df.loc[df['Ship Mode'] == 'Standard Class']
        first_df = df.loc[df['Ship Mode'] == 'First Class']
        scd_df = df.loc[df['Ship Mode'] == 'Second Class']
        same_df = df.loc[df['Ship Mode'] == 'Same Day']

        std_monthly_sales = std_df['Sales'].resample('M').sum()
        std_monthly_sales = std_monthly_sales.round(2)

        first_monthly_sales = first_df['Sales'].resample('M').sum()
        first_monthly_sales = first_monthly_sales.round(2)

        scd_monthly_sales = scd_df['Sales'].resample('M').sum()
        scd_monthly_sales = scd_monthly_sales.round(2)

        same_monthly_sales = same_df['Sales'].resample('M').sum()
        same_monthly_sales = same_monthly_sales.round(2)
        
        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(std_monthly_sales.index.to_numpy(), std_monthly_sales.values, color="red", marker='o', linestyle='-', label="Standard Class")
        ax1.plot(first_monthly_sales.index.to_numpy(), first_monthly_sales.values, color="blue", marker='o', linestyle='-', label="First Class")
        ax1.plot(scd_monthly_sales.index.to_numpy(), scd_monthly_sales.values, color="green", marker='o', linestyle='-', label="Second Class")
        ax1.plot(same_monthly_sales.index.to_numpy(), same_monthly_sales.values, color="cyan", marker='o', linestyle='-', label="Same Day")

        ax1.set_title("Ship Mode-Based Monthly Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Sales")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def month_wise_productname_based(self, df, figure, canvas):
        figure.clear()           

        env_df = df.loc[df['Product Name'] == 'Staple envelope']
        stp_df = df.loc[df['Product Name'] == 'Staples']
        eas_df = df.loc[df['Product Name'] == 'Easy-staple paper']
        ave_df = df.loc[df['Product Name'] == 'Avery Non-Stick Binders']
        rem_df = df.loc[df['Product Name'] == 'Staple remover']
        
        env_monthly_sales = env_df['Sales'].resample('M').sum()
        env_monthly_sales = env_monthly_sales.round(2)

        stp_monthly_sales = stp_df['Sales'].resample('M').sum()
        stp_monthly_sales = stp_monthly_sales.round(2)

        eas_monthly_sales = eas_df['Sales'].resample('M').sum()
        eas_monthly_sales = eas_monthly_sales.round(2)

        ave_monthly_sales = ave_df['Sales'].resample('M').sum()
        ave_monthly_sales = ave_monthly_sales.round(2)

        rem_monthly_sales = rem_df['Sales'].resample('M').sum()
        rem_monthly_sales = rem_monthly_sales.round(2)
        
        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(env_monthly_sales.index.to_numpy(), env_monthly_sales.values, 
                 color="red", marker='o', linestyle='-', label="Staple envelope")
        ax1.plot(stp_monthly_sales.index.to_numpy(), stp_monthly_sales.values, 
                 color="blue", marker='o', linestyle='-', label="Staples")
        ax1.plot(eas_monthly_sales.index.to_numpy(), eas_monthly_sales.values, 
                 color="green", marker='o', linestyle='-', label="Easy-staple paper")
        ax1.plot(ave_monthly_sales.index.to_numpy(), ave_monthly_sales.values, 
                 color="cyan", marker='o', linestyle='-', label="Avery Non-Stick Binders")
        ax1.plot(rem_monthly_sales.index.to_numpy(), rem_monthly_sales.values, 
                 color="black", marker='o', linestyle='-', label="Staple remover")

        ax1.set_title("Product Name-Based Monthly Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Sales")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def month_wise_subcategory_based(self, df, figure, canvas):
        figure.clear()           

        bin_df = df.loc[df['Sub-Category'] == 'Binders']
        pap_df = df.loc[df['Sub-Category'] == 'Paper']
        pho_df = df.loc[df['Sub-Category'] == 'Phones']
        sto_df = df.loc[df['Sub-Category'] == 'Storage']
        fur_df = df.loc[df['Sub-Category'] == 'Furnishings']
        
        bin_monthly_sales = bin_df['Sales'].resample('M').sum()
        bin_monthly_sales = bin_monthly_sales.round(2)

        pap_monthly_sales = pap_df['Sales'].resample('M').sum()
        pap_monthly_sales = pap_monthly_sales.round(2)

        pho_monthly_sales = pho_df['Sales'].resample('M').sum()
        pho_monthly_sales = pho_monthly_sales.round(2)

        sto_monthly_sales = sto_df['Sales'].resample('M').sum()
        sto_monthly_sales = sto_monthly_sales.round(2)

        fur_monthly_sales = fur_df['Sales'].resample('M').sum()
        fur_monthly_sales = fur_monthly_sales.round(2)
        
        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(bin_monthly_sales.index.to_numpy(), bin_monthly_sales.values, 
                 color="red", marker='o', linestyle='-', label="Binders")
        ax1.plot(pap_monthly_sales.index.to_numpy(), pap_monthly_sales.values, 
                 color="blue", marker='o', linestyle='-', label="Paper")
        ax1.plot(pho_monthly_sales.index.to_numpy(), pho_monthly_sales.values, 
                 color="green", marker='o', linestyle='-', label="Phones")
        ax1.plot(sto_monthly_sales.index.to_numpy(), sto_monthly_sales.values, 
                 color="black", marker='o', linestyle='-', label="Storage")
        ax1.plot(fur_monthly_sales.index.to_numpy(), fur_monthly_sales.values, 
                 color="cyan", marker='o', linestyle='-', label="Furnishings")

        ax1.set_title("Sub-Category-Based Monthly Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Sales")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def plot_month_wise_subcategory_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_subcategory_based(df, form3.figure1, form3.canvas1)
        
    def plot_month_wise_productname_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_productname_based(df, form3.figure1, form3.canvas1)
        
    def plot_month_wise_shipmode_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_shipmode_based(df, form3.figure1, form3.canvas1)
        
    def plot_month_wise_city_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_city_based(df, form3.figure1, form3.canvas1)
        
    def plot_month_wise_region_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_region_based(df, form3.figure1, form3.canvas1)

    def plot_month_wise_category_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_category_based(df, form3.figure1, form3.canvas1)

    def plot_month_wise_segment_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_segment_based(df, form3.figure1, form3.canvas1)
            
    def binds_month_wise(self, window, df, data_mean, data_ewm): 
        window.month_wise.entryconfigure("Sales Quarter 1 and 2 Year 2018",
            command = lambda:self.plot_month_wise(df, "Sales", 2018, "Quarter", "Jan-March", "April-June")) 

        window.month_wise.entryconfigure("Sales Quarter 3 and 4 Year 2018",
            command = lambda:self.plot_month_wise(df, "Sales", 2018, "Quarter", "July-Sept", "Oct-Dec")) 

        window.month_wise.entryconfigure("Sales Quarter 1 and 2 Year 2017",
            command = lambda:self.plot_month_wise(df, "Sales", 2017, "Quarter", "Jan-March", "April-June")) 

        window.month_wise.entryconfigure("Sales Quarter 3 and 4 Year 2017",
            command = lambda:self.plot_month_wise(df, "Sales", 2017, "Quarter", "July-Sept", "Oct-Dec")) 

        window.month_wise.entryconfigure("Sales Quarter 1 and 2 Year 2016",
            command = lambda:self.plot_month_wise(df, "Sales", 2016, "Quarter", "Jan-March", "April-June")) 

        window.month_wise.entryconfigure("Sales Quarter 3 and 4 Year 2016",
            command = lambda:self.plot_month_wise(df, "Sales", 2016, "Quarter", "July-Sept", "Oct-Dec")) 

        window.month_wise.entryconfigure("Sales Quarter 1 and 2 Year 2015",
            command = lambda:self.plot_month_wise(df, "Sales", 2015, "Quarter", "Jan-March", "April-June")) 

        window.month_wise.entryconfigure("Sales Quarter 3 and 4 Year 2015",
            command = lambda:self.plot_month_wise(df, "Sales", 2015, "Quarter", "July-Sept", "Oct-Dec")) 

        window.month_wise.entryconfigure("Sales Month 1 and 2 Year 2018",
            command = lambda:self.plot_month_wise(df, "Sales", 2018, "Month", "January", "February")) 

        window.month_wise.entryconfigure("Sales Month 3 and 4 Year 2017",
            command = lambda:self.plot_month_wise(df, "Sales", 2017, "Month", "March", "April"))

        window.month_wise.entryconfigure("Sales Month 5 and 6 Year 2016",
            command = lambda:self.plot_month_wise(df, "Sales", 2016, "Month", "May", "June"))

        window.month_wise.entryconfigure("Sales Month 7 and 8 Year 2015",
            command = lambda:self.plot_month_wise(df, "Sales", 2015, "Month", "July", "August"))

        window.month_wise.entryconfigure("Month-Wise Sales Mean and EWM",
            command = lambda:self.plot_month_wise_mean_ewm(data_mean, data_ewm)) 

        window.month_wise.entryconfigure("Sales by Month",
            command = lambda:self.plot_month_wise_by_month(data_mean, "Sales by Month"))        
 
        window.month_wise.entryconfigure("Region-Based Monthly Sales",
            command = lambda:self.plot_month_wise_region_based(df))        

        window.month_wise.entryconfigure("Category-Based Monthly Quantities",
            command = lambda:self.plot_month_wise_category_based(df)) 

        window.month_wise.entryconfigure("Segment-Based Monthly Sales",
            command = lambda:self.plot_month_wise_segment_based(df))        

        window.month_wise.entryconfigure("City-Based Monthly Sales",
            command = lambda:self.plot_month_wise_city_based(df)) 

        window.month_wise.entryconfigure("Ship Mode-Based Monthly Sales",
            command = lambda:self.plot_month_wise_shipmode_based(df))

        window.month_wise.entryconfigure("Product Name-Based Monthly Sales",
            command = lambda:self.plot_month_wise_productname_based(df))

        window.month_wise.entryconfigure("Sub-Category-Based Monthly Sales",
            command = lambda:self.plot_month_wise_subcategory_based(df))
       
    def choose_month_wise(self, df, data_mean, data_ewm, 
            data_norm, chosen, figure1, canvas1, figure2, canvas2):  
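        #Dispatches the month-wise combo selection to the matching plot routine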
        if chosen == "Sales Quarter 1 and 2 Year 2018":
            self.line_plot_month_wise(df, "Sales", 2018, 
                "Quarter", "Jan-March", "April-June", figure1, canvas1)

        if chosen == "Sales Quarter 3 and 4 Year 2018":
            self.line_plot_month_wise(df, "Sales", 2018, 
                "Quarter", "July-Sept", "Oct-Dec", figure2, canvas2)

        if chosen == "Sales Quarter 1 and 2 Year 2017":
            self.line_plot_month_wise(df, "Sales", 2017, 
                "Quarter", "Jan-March", "April-June", figure1, canvas1)

        if chosen == "Sales Quarter 3 and 4 Year 2017":
            self.line_plot_month_wise(df, "Sales", 2017, 
                "Quarter", "July-Sept", "Oct-Dec", figure2, canvas2)

        if chosen == "Sales Quarter 1 and 2 Year 2016":
            self.line_plot_month_wise(df, "Sales", 2016, 
                "Quarter", "Jan-March", "April-June", figure1, canvas1)

        if chosen == "Sales Quarter 3 and 4 Year 2016":
            self.line_plot_month_wise(df, "Sales", 2016, 
                "Quarter", "July-Sept", "Oct-Dec", figure2, canvas2)

        if chosen == "Sales Quarter 1 and 2 Year 2015":
            self.line_plot_month_wise(df, "Sales", 2015, 
                "Quarter", "Jan-March", "April-June", figure1, canvas1)

        if chosen == "Sales Quarter 3 and 4 Year 2015":
            self.line_plot_month_wise(df, "Sales", 2015, 
                "Quarter", "July-Sept", "Oct-Dec", figure2, canvas2)

        if chosen == "Month-Wise Sales Mean and EWM":
            self.line_plot_data_mean_ewm(data_mean, data_ewm, figure1, canvas1, "YEAR", "SALES")

        if chosen == "Sales by Month":
            self.sns_plot_month(data_mean, "Sales by Month", figure2, canvas2)

        if chosen == "Region-Based Monthly Sales":
            self.month_wise_region_based(df, figure1, canvas1)

        if chosen == "Category-Based Monthly Quantities":
            self.month_wise_category_based(df, figure2, canvas2)

        if chosen == "Segment-Based Monthly Sales":
            self.month_wise_segment_based(df, figure1, canvas1)

        if chosen == "City-Based Monthly Sales":
            self.month_wise_city_based(df, figure2, canvas2)

        if chosen == "Ship Mode-Based Monthly Sales":
            self.month_wise_shipmode_based(df, figure1, canvas1)

        if chosen == "Product Name-Based Monthly Sales":
            self.month_wise_productname_based(df, figure2, canvas2)

        if chosen == "Sub-Category-Based Monthly Sales":
            self.month_wise_subcategory_based(df, figure1, canvas1)

    def plot_rfm_distribution(self, df, var1, title=""):
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title(title)
        self.plot_piechart(df, var1, form3.figure1, form3.canvas1, "Case Distribution of " + var1)

    def plot_grouped_rfm_distribution(self, df, var1="", var2="", title1="", label=""):
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title("Distribution of " + var2 + " by " + var1)
        sum_by_cat = df.groupby(var1)[var2].sum()       
        self.plot_piechart_group(sum_by_cat, form3.figure1, form3.canvas1, title1, label)

    def month_wise_rfm_based(self, df, figure, canvas):
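        #Plots monthly sales for each RFM-derived group, using the 'Customer
        #Segment' labels assigned during the RFM analysis step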
        figure.clear()        

        top_df = df.loc[df['Customer Segment'] == 'Top Customer']
        high_df = df.loc[df['Customer Segment'] == 'High Value Customer']
        med_df = df.loc[df['Customer Segment'] == 'Medium Value Customer']
        low_df = df.loc[df['Customer Segment'] == 'Low Value Customer']
        lost_df = df.loc[df['Customer Segment'] == 'Lost Customer']
        
        top_monthly_sales = top_df['Sales'].resample('M').sum()
        top_monthly_sales = top_monthly_sales.round(2)

        high_monthly_sales = high_df['Sales'].resample('M').sum()
        high_monthly_sales = high_monthly_sales.round(2)

        med_monthly_sales = med_df['Sales'].resample('M').sum()
        med_monthly_sales = med_monthly_sales.round(2)

        low_monthly_sales = low_df['Sales'].resample('M').sum()
        low_monthly_sales = low_monthly_sales.round(2)

        lost_monthly_sales = lost_df['Sales'].resample('M').sum()
        lost_monthly_sales = lost_monthly_sales.round(2)
        
        ax1 = figure.add_subplot(1, 1, 1)
        ax1.plot(top_monthly_sales.index.to_numpy(), top_monthly_sales.values, 
                 color="red", marker='o', linestyle='-', label="Top Customer")
        ax1.plot(high_monthly_sales.index.to_numpy(), high_monthly_sales.values, 
                 color="blue", marker='o', linestyle='-', label="High Value Customer")
        ax1.plot(med_monthly_sales.index.to_numpy(), med_monthly_sales.values, 
                 color="green", marker='o', linestyle='-', label="Medium Value Customer")
        ax1.plot(low_monthly_sales.index.to_numpy(), low_monthly_sales.values, 
                 color="black", marker='o', linestyle='-', label="Low Value Customer")
        ax1.plot(lost_monthly_sales.index.to_numpy(), lost_monthly_sales.values, 
                 color="orange", marker='o', linestyle='-', label="Lost Customer")

        ax1.set_title("RFM-Based Monthly Sales", fontsize=12)
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.set_xlabel("Date")
        ax1.set_ylabel("Sales")
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)

        figure.tight_layout()
        canvas.draw()

    def plot_month_wise_rfm_based(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        self.month_wise_rfm_based(df, form3.figure1, form3.canvas1)
        
    def binds_rfm_distribution(self, window, df):        
        window.rfm_analysis.entryconfigure("Customer Segment",
            command = lambda:self.plot_rfm_distribution(df, "Customer Segment",  
                "The Case Distribution of Customer Segment"))
        
        window.rfm_analysis.entryconfigure("Sales by Customer Segment",
            command = lambda:self.plot_grouped_rfm_distribution(df, "Customer Segment",  
                "Sales", "Sales Distribution by Customer Segment"))        
        
        window.rfm_analysis.entryconfigure("Customer Segment by Year and Quarter",
            command = lambda:self.plot_categorized_distribution(df, "Year", "Customer Segment",
                "Quarter", "Customer Segment",                                                
                "The Case Distribution of Customer Segment by Year",
                "The Case Distribution of Customer Segment by Quarter"))        
        
        window.rfm_analysis.entryconfigure("Customer Segment by Day and Month",
            command = lambda:self.plot_categorized_distribution(df, "Day", "Customer Segment",
                "Month", "Customer Segment",                                                
                "The Case Distribution of Customer Segment by Day",
                "The Case Distribution of Customer Segment by Month"))

        window.rfm_analysis.entryconfigure("Customer Segment by Segment and Sub-Category",
            command = lambda:self.plot_categorized_distribution(df, "Segment", "Customer Segment",
                "Sub-Category", "Customer Segment",                                                
                "The Case Distribution of Customer Segment by Segment",
                "The Case Distribution of Customer Segment by Sub-Category"))

        window.rfm_analysis.entryconfigure("Customer Segment by Region and State",
            command = lambda:self.plot_categorized_distribution(df, "Region", "Customer Segment",
                "State", "Customer Segment",                                                
                "The Case Distribution of Customer Segment by Region",
                "The Case Distribution of Customer Segment by State"))
        
        window.rfm_analysis.entryconfigure("RFM-Based Monthly Sales",
            command = lambda:self.plot_month_wise_rfm_based(df))        
        
    def choose_rfm_distribution(self, df, chosen, figure1, canvas1, figure2, canvas2):  
        if chosen == "Customer Segment":
            self.plot_piechart(df, chosen, figure1, canvas1, 
                "Case Distribution of " + chosen)        

        if chosen == "Sales by Customer Segment":
            sum_by_cat = df.groupby("Customer Segment")["Sales"].sum()
            self.plot_piechart_group(sum_by_cat, figure2, canvas2, chosen)

        if chosen == "Customer Segment by Year and Quarter":
            self.dist_one_vs_another_plot(df, "Year", "Customer Segment", 
                figure1, canvas1, "Customer Segment by Year")
            self.dist_one_vs_another_plot(df, "Quarter", "Customer Segment", 
                figure2, canvas2, "Customer Segment by Quarter")

        if chosen == "Customer Segment by Day and Month":
            self.dist_one_vs_another_plot(df, "Day", "Customer Segment", 
                figure1, canvas1, "Customer Segment by Day")
            self.dist_one_vs_another_plot(df, "Month", "Customer Segment", 
                figure2, canvas2, "Customer Segment by Month")

        if chosen == "Customer Segment by Segment and Sub-Category":
            self.dist_one_vs_another_plot(df, "Segment", "Customer Segment", 
                figure1, canvas1, "Customer Segment by Segment")
            self.dist_one_vs_another_plot(df, "Sub-Category", "Customer Segment", 
                figure2, canvas2, "Customer Segment by Sub-Category")

        if chosen == "Customer Segment by Region and State":
            self.dist_one_vs_another_plot(df, "Region", "Customer Segment", 
                figure1, canvas1, "Customer Segment by Region")
            self.dist_one_vs_another_plot(df, "State", "Customer Segment", 
                figure2, canvas2, "Customer Segment by State")

        if chosen == "RFM-Based Monthly Sales":
            self.month_wise_rfm_based(df, figure1, canvas1)

    def rf_importance(self, X, y, figure, canvas):
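        #feat_importance_rf is defined in Process_Data (earlier in this listing); it
        #is assumed to fit a random forest and return a frame with 'Features' and
        #'Values' columns, which is what the barplot below expects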
        result_rf = self.obj_data.feat_importance_rf(X, y)
        figure.clear()    
        ax1 = figure.add_subplot(1,1,1)  
        sns.set_color_codes("pastel")
        sns.barplot(x='Values', y='Features', data=result_rf, color="orange", ax=ax1)
        ax1.set_title('Random Forest Feature Importance', fontweight="bold", fontsize=14)

        ax1.set_xlabel('Feature Importance', fontsize=10) 
        ax1.set_ylabel('Feature Labels', fontsize=10) 
        # Set font for tick labels
        ax1.tick_params(axis='both', which='major', labelsize=10)
        ax1.tick_params(axis='both', which='minor', labelsize=10)
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)
        figure.tight_layout()
        canvas.draw()

    def plot_rf_importance(self, X, y):        
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title("Random Forest Feature Importance")
        self.rf_importance(X, y, form3.figure1, form3.canvas1)
        
    def et_importance(self, X, y, figure, canvas):
        result_rf = self.obj_data.feat_importance_et(X, y)
        figure.clear()    
        ax1 = figure.add_subplot(1,1,1)  
        sns.set_color_codes("pastel")
        sns.barplot(x='Values', y='Features', data=result_rf, color="red", ax=ax1)
        ax1.set_title('Extra Trees Feature Importance', fontweight="bold", fontsize=14)

        ax1.set_xlabel('Feature Importance', fontsize=10) 
        ax1.set_ylabel('Feature Labels', fontsize=10) 
        # Set font for tick labels
        ax1.tick_params(axis='both', which='major', labelsize=10)
        ax1.tick_params(axis='both', which='minor', labelsize=10)
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)        
        figure.tight_layout()
        canvas.draw()        

    def plot_et_importance(self, X, y):        
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title("Extra Trees Feature Importance")
        self.et_importance(X, y, form3.figure1, form3.canvas1)
        
    def rfe_importance(self, X, y, figure, canvas):
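        #RFE ranks features by recursively dropping the weakest one; unlike the two
        #importance plots above, a lower 'Ranking' value means a more important
        #feature (rank 1 is the best)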
        result_lg = self.obj_data.feat_importance_rfe(X, y)
        figure.clear()    
        ax1 = figure.add_subplot(1,1,1)  
        sns.set_color_codes("pastel")
        sns.barplot(x='Ranking', y='Features', data=result_lg, color="green", ax=ax1)
        ax1.set_title('RFE Feature Ranking', fontweight="bold", fontsize=14)

        ax1.set_xlabel('Feature Ranking (1 = most important)', fontsize=10) 
        ax1.set_ylabel('Feature Labels', fontsize=10) 
        # Set font for tick labels
        ax1.tick_params(axis='both', which='major', labelsize=10)
        ax1.tick_params(axis='both', which='minor', labelsize=10)
        ax1.set_facecolor('#F0F0F0')
        ax1.grid(True)        
        figure.tight_layout()
        canvas.draw() 

    def plot_rfe_importance(self, X, y):        
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title("RFE Feature Importance")
        self.rfe_importance(X, y, form3.figure1, form3.canvas1)

    def corr_coeffs(self, df, figure, canvas):
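        #Bar chart of every numeric feature's absolute correlation with Churn,
        #keeping only features whose coefficient exceeds the 0.01 threshold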
        figure.clear()
        ax = figure.add_subplot(1,1,1) 
        
        #Correlation coefficient of every numeric column with the Churn column
        all_corr = df.corr(numeric_only=True).abs()['Churn'].sort_values(ascending = False)

        # Filters correlations greater than 0.01
        filtered_corr = all_corr[all_corr > 0.01]
    
        # Define a custom color palette (replace with your preferred colors)
        custom_palette = sns.color_palette("Set1", len(filtered_corr))
        filtered_corr.plot(kind='barh', ax=ax, color=custom_palette)
        ax.set_title("Correlation Coefficient of Features with Churn (Threshold > 0.01)", fontsize = 9)
        ax.set_ylabel("Coefficient")
        ax.set_facecolor('#F0F0F0')
        
        # Set font for tick labels
        ax.tick_params(axis='both', which='major', labelsize=10)
        ax.tick_params(axis='both', which='minor', labelsize=10)
        
        ax.grid(True)
        figure.tight_layout()
        canvas.draw()

    def plot_corr_coeffs(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title("Correlation Coefficients")
        self.corr_coeffs(df, form3.figure1, form3.canvas1)
        
    def corr_mat(self, df, figure, canvas):
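        #Heatmap of pairwise correlations between the numeric columns, with
        #coefficients below the 0.01 threshold masked out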
        figure.clear()    
        ax = figure.add_subplot(1,1,1)  
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns 
        df_removed = df.drop(columns=categorical_columns) 
        corrdata = df_removed.corr()

        annot_kws = {"size": 8, "color":"black"}
        # Filter correlations greater than 0.01
        mask = abs(corrdata) > 0.01
        filtered_corr = corrdata[mask]
        
        # Drops features that don't meet the threshold
        filtered_corr = filtered_corr.dropna(axis=0, how='all')
        filtered_corr = filtered_corr.dropna(axis=1, how='all')
               
        sns.heatmap(filtered_corr, ax = ax, lw=1, annot=True, cmap="Set1", annot_kws=annot_kws)
        ax.set_title('Correlation Matrix (Threshold > 0.01)', fontweight="bold", fontsize=12)

        # Set font for x and y labels
        ax.set_xlabel('Features', fontweight="bold", fontsize=12)
        ax.set_ylabel('Features', fontweight="bold", fontsize=12)

        # Set font for tick labels
        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.tick_params(axis='both', which='minor', labelsize=8)

        figure.tight_layout()
        canvas.draw()

    def plot_corr_mat(self, df):        
        win = tk.Toplevel()
        form3 = Form3(win)
        win.title("Correlation Matrix")
        self.corr_mat(df, form3.figure1, form3.canvas1)
        
    def binds_feat_importance(self, window, df, X, y):  
        window.feat_eng.entryconfigure("Correlation Matrix",
            command = lambda:self.plot_corr_mat(df))

        window.feat_eng.entryconfigure("Correlation Coefficients",
            command = lambda:self.plot_corr_coeffs(df))
        
        window.feat_eng.entryconfigure("Random Forest Feature Importance",
            command = lambda:self.plot_rf_importance(X, y))

        window.feat_eng.entryconfigure("Extra Trees Feature Importance",
            command = lambda:self.plot_et_importance(X, y))

        window.feat_eng.entryconfigure("RFE Feature Importance",
            command = lambda:self.plot_rfe_importance(X, y))
            
    def scatter_train_test_regression(self, ytrain, ytest, predictions_train, predictions_test, figure, canvas, label):
        # Visualizes the training set results in a scatter plot
        figure.clear()    
        ax1 = figure.add_subplot(2, 1, 1)  
        ax1.scatter(x=ytrain, y=predictions_train, color='red', label='Training Data')
        ax1.set_title('The actual versus predicted (Training set): ' + label, fontweight='bold', fontsize=10)
        ax1.set_xlabel('Actual Train Set', fontsize=8)
        ax1.set_ylabel('Predicted Train Set', fontsize=8)
        ax1.plot([ytrain.min(), ytrain.max()], [ytrain.min(), ytrain.max()], 'b--', linewidth=2, label='Perfect Prediction')
        ax1.grid(True)
        ax1.set_facecolor('#F0F0F0')
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')

        ax2 = figure.add_subplot(2, 1, 2)  
        ax2.scatter(x=ytest, y=predictions_test, color='red', label='Test Data')
        ax2.set_title('The actual versus predicted (Test set): ' + label, fontweight='bold', fontsize=10)
        ax2.set_xlabel('Actual Test Set', fontsize=8)
        ax2.set_ylabel('Predicted Test Set', fontsize=8)
        ax2.plot([ytest.min(), ytest.max()], [ytest.min(), ytest.max()], 'b--', linewidth=2, label='Perfect Prediction')
        ax2.grid(True)
        ax2.set_facecolor('#F0F0F0')
        ax2.legend(facecolor='#E6E6FA', edgecolor='black')
        
        figure.tight_layout()
        canvas.draw()

    def lineplot_train_test_regression(self, ytrain, ytest, yval, yfinal,
            predictions_train, predictions_test, predictions_val, all_pred, figure, canvas, label):
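        #Four stacked panels compare actual and predicted sales on the training set,
        #the test set, the 90-day validation window, and the full series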
        figure.clear()    
        ax1 = figure.add_subplot(4, 1, 1)    
        ax1.plot(ytrain.index.to_numpy(), ytrain.to_numpy(), color="blue", linewidth=2, linestyle="-", label='Actual')
        ax1.plot(ytrain.index.to_numpy(), predictions_train, color="red", linewidth=2, linestyle="-", label='Predicted')
        ax1.set_title('Actual and Predicted Training Set: ' + label, fontsize=10)
        ax1.set_xlabel('Date', fontsize=8)
        ax1.set_ylabel("Sales", fontsize=8)
        ax1.legend(prop={'size': 8},facecolor='#E6E6FA', edgecolor='black') 
        ax1.grid(True)   
        ax1.set_facecolor('#F0F0F0')
        # Set font for tick labels
        ax1.tick_params(axis='both', which='major', labelsize=8)
        ax1.tick_params(axis='both', which='minor', labelsize=8)

        ax2 = figure.add_subplot(4, 1, 2)    
        ax2.plot(ytest.index.to_numpy(), ytest.to_numpy(), color="blue", linewidth=2, linestyle="-", label='Actual')
        ax2.plot(ytest.index.to_numpy(), predictions_test, color="red", linewidth=2, linestyle="-", label='Predicted')
        ax2.set_title('Actual and Predicted Test Set: ' + label, fontsize=10)
        ax2.set_xlabel('Date', fontsize=8)
        ax2.set_ylabel("Sales", fontsize=8)
        ax2.legend(prop={'size': 8}, facecolor='#E6E6FA', edgecolor='black') 
        ax2.grid(True) 
        ax2.set_facecolor('#F0F0F0')
        # Set font for tick labels
        ax2.tick_params(axis='both', which='major', labelsize=8)
        ax2.tick_params(axis='both', which='minor', labelsize=8)

        ax3 = figure.add_subplot(4, 1, 3)    
        ax3.plot(yval.index.to_numpy(), yval.to_numpy(), color="blue", linewidth=2, linestyle="-", label='Actual')
        ax3.plot(yval.index.to_numpy(), predictions_val, color="red", linewidth=2, linestyle="-", label='Predicted')
        ax3.set_title('Actual and Predicted Validation Set (90-day forecast): ' + label, fontsize=8)
        ax3.set_xlabel('Date', fontsize=8)
        ax3.set_ylabel("Sales", fontsize=8)
        ax3.legend(prop={'size': 8}, facecolor='#E6E6FA', edgecolor='black') 
        ax3.grid(True) 
        ax3.set_facecolor('#F0F0F0')
        # Set font for tick labels
        ax3.tick_params(axis='both', which='major', labelsize=8)
        ax3.tick_params(axis='both', which='minor', labelsize=8)

        ax4 = figure.add_subplot(4, 1, 4)    
        ax4.plot(yfinal.index.to_numpy(), yfinal.to_numpy(), color="blue", linewidth=2, linestyle="-", label='Actual')
        ax4.plot(yfinal.index.to_numpy(), all_pred, color="red", linewidth=2, linestyle="-", label='Predicted')
        ax4.set_title('Actual and Predicted All Set: ' + label, fontsize=8)
        ax4.set_xlabel('Date', fontsize=8)
        ax4.set_ylabel("Sales", fontsize=8)
        ax4.legend(prop={'size': 8}, facecolor='#E6E6FA', edgecolor='black') 
        ax4.grid(True) 
        ax4.set_facecolor('#F0F0F0')
        # Set font for tick labels
        ax4.tick_params(axis='both', which='major', labelsize=8)
        ax4.tick_params(axis='both', which='minor', labelsize=8)

        figure.tight_layout()
        canvas.draw()
    
    def choose_plot_regression(self, chosen, X_final_reg, X_train_reg, 
            X_test_reg, X_val_reg, y_final_reg, y_train_reg, y_test_reg, y_val_reg, 
            figure1, canvas1, figure2, canvas2):  
        if chosen == "Linear Regression":
            best_lin_reg = self.obj_reg.linear_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_lin_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "RF Regression":
            best_rf_reg = self.obj_reg.rf_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_rf_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)
            
        if chosen == "Decision Trees Regression":
            best_dt_reg = self.obj_reg.dt_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_dt_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "Gradient Boosting Regression":
            best_gb_reg = self.obj_reg.gb_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_gb_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "XGB Regression":
            best_xgb_reg = self.obj_reg.xgb_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_xgb_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "MLP Regression":
            best_mlp_reg = self.obj_reg.mlp_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_mlp_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "Lasso Regression":
            best_lasso_reg = self.obj_reg.lasso_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_lasso_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "Ridge Regression":
            best_ridge_reg = self.obj_reg.ridge_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_ridge_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "AdaBoost Regression":
            best_ada_reg = self.obj_reg.adaboost_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_ada_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)

        if chosen == "KNN Regression":
            best_knn_reg = self.obj_reg.knn_regression(X_train_reg, y_train_reg)
            predictions_test, predictions_train, predictions_val, all_pred = self.obj_reg.perform_regression(best_knn_reg, X_final_reg, y_final_reg, 
                X_train_reg, y_train_reg, X_test_reg, y_test_reg, X_val_reg, y_val_reg, chosen)

            self.scatter_train_test_regression(y_train_reg, y_test_reg, 
                predictions_train, predictions_test, figure1, canvas1, chosen)

            self.lineplot_train_test_regression(y_train_reg, 
                y_test_reg, y_val_reg, y_final_reg, predictions_train, predictions_test, predictions_val, all_pred,
                figure2, canvas2, chosen)                                  
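
    #The per-model branches above are identical except for the method that tunes
    #the estimator; a dispatch table is a compact equivalent. An added sketch
    #(labels matching the strings tested above, methods from regression.py):
    def build_regressor(self, chosen, X_train_reg, y_train_reg):
        builders = {
            "MLP Regression": self.obj_reg.mlp_regression,
            "Lasso Regression": self.obj_reg.lasso_regression,
            "Ridge Regression": self.obj_reg.ridge_regression,
            "AdaBoost Regression": self.obj_reg.adaboost_regression,
            "KNN Regression": self.obj_reg.knn_regression,
        }
        return builders[chosen](X_train_reg, y_train_reg)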

    def plot_cm_roc(self, model, X_test, y_test, ypred, name, figure, canvas):
        figure.clear()    
        
        #Plots confusion matrix
        ax1 = figure.add_subplot(2,1,1)  
        cm = confusion_matrix(y_test, ypred)
        sns.heatmap(cm, annot=True, linewidth=2, linecolor='black', fmt='g', cmap="cool", annot_kws={"size": 14}, ax=ax1)
        ax1.set_title('Confusion Matrix' + " of " + name, fontsize=12)
        ax1.set_xlabel('Y predict', fontsize=10)
        ax1.set_ylabel('Y test', fontsize=10)
        #confusion_matrix orders classes ascending, so index 0 is Churn = 0
        ax1.xaxis.set_ticklabels(['Churn = 0', 'Churn = 1'], fontsize=10)
        ax1.yaxis.set_ticklabels(['Churn = 0', 'Churn = 1'], fontsize=10)
        ax1.set_facecolor('#F0F0F0')
        
        #Plots ROC
        ax2 = figure.add_subplot(2,1,2)
        Y_pred_prob = model.predict_proba(X_test)
        Y_pred_prob = Y_pred_prob[:, 1]

        fpr, tpr, thresholds = roc_curve(y_test, Y_pred_prob)
        ax2.plot([0,1],[0,1], color='navy', linestyle='--', linewidth=3, label='Random Guess')
        ax2.plot(fpr,tpr, color='red', linewidth=3, label='ROC Curve')
        ax2.set_xlabel('False Positive Rate', fontsize=10)
        ax2.set_ylabel('True Positive Rate', fontsize=10)
        ax2.set_title('ROC Curve of ' + name , fontsize=12)
        ax2.grid(True)
        ax2.legend(facecolor='#E6E6FA', edgecolor='black')
        ax2.set_facecolor('#F0F0F0')

        figure.tight_layout()
        canvas.draw()   
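
    #The ROC panel above draws the curve but not its area; sklearn.metrics also
    #provides roc_auc_score, which computes the scalar AUC from the same inputs.
    #A sketch, assuming roc_auc_score is imported next to roc_curve:
    #    auc = roc_auc_score(y_test, Y_pred_prob)
    #    ax2.plot(fpr, tpr, color='red', linewidth=3,
    #        label='ROC Curve (AUC = %.3f)' % auc)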

    #Plots true values versus predicted values diagram and learning curve
    def plot_real_pred_val_learning_curve(self, model, X_train, y_train, X_test, y_test, ypred, name, figure, canvas):
        figure.clear()    
        
        #Plots true values versus predicted values diagram
        ax1 = figure.add_subplot(2,1,1)  
        acc=accuracy_score(y_test, ypred)
        ax1.scatter(range(len(ypred)),ypred,color="blue", lw=2,label="Predicted")
        ax1.scatter(range(len(y_test)), 
            y_test, color="red", label="Actual")
        ax1.set_title("Predicted Values vs True Values of " + name, fontsize=12)
        ax1.set_xlabel("Accuracy: " + str(round((acc*100),3)) + "%")
        ax1.legend(facecolor='#E6E6FA', edgecolor='black')
        ax1.grid(True, alpha=0.75, lw=1, ls='-.')
        ax1.set_facecolor('#F0F0F0')

        #Plots learning curve
        train_sizes=np.linspace(.1, 1.0, 5)
        train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(model, 
            X_train, y_train, cv=None, n_jobs=None, train_sizes=train_sizes, return_times=True)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        ax2 = figure.add_subplot(2,1,2)
        ax2.fill_between(train_sizes, train_scores_mean - train_scores_std,
            train_scores_mean + train_scores_std, alpha=0.1, color="r")
        ax2.fill_between(train_sizes, test_scores_mean - test_scores_std,
            test_scores_mean + test_scores_std, alpha=0.1, color="g")
        ax2.plot(train_sizes, train_scores_mean, 'o-', 
            color="b", label="Training score")
        ax2.plot(train_sizes, test_scores_mean, 'o-', 
            color="r", label="Cross-validation score")
        ax2.legend(loc="best", facecolor='#E6E6FA', edgecolor='black')
        ax2.set_title("Learning curve of " + name, fontsize=12)
        ax2.set_xlabel("fit_times")
        ax2.set_ylabel("Score")
        ax2.grid(True, alpha=0.75, lw=1, ls='-.')
        ax2.set_facecolor('#F0F0F0')

        figure.tight_layout()
        canvas.draw()  

    def choose_plot_ML(self, root, chosen, X_train, X_test, y_train, y_test, figure1, canvas1, figure2, canvas2):  
        if chosen == "Logistic Regression":
            best_model, y_pred = self.obj_ml.implement_LR(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)

            #Shows table of result
            df_lr = self.obj_data.read_dataset("results_LR.csv")
            self.shows_table(root, df_lr, 350, 750, "Y_test and Y_pred of Logistic Regression")

        if chosen == "Random Forest":
            best_model, y_pred = self.obj_ml.implement_RF(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_rf = self.obj_data.read_dataset("results_RF.csv")
            self.shows_table(root, df_rf, 350, 750, "Y_test and Y_pred of Random Forest")   

        if chosen == "K-Nearest Neighbors":
            best_model, y_pred = self.obj_ml.implement_KNN(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_knn = self.obj_data.read_dataset("results_KNN.csv")
            self.shows_table(root, df_knn, 350, 750, "Y_test and Y_pred of KNN")              

        if chosen == "Decision Trees":
            best_model, y_pred = self.obj_ml.implement_DT(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_dt = self.obj_data.read_dataset("results_DT.csv")
            self.shows_table(root, df_dt, 350, 750, "Y_test and Y_pred of Decision Trees")  

        if chosen == "Gradient Boosting":
            best_model, y_pred = self.obj_ml.implement_GB(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_gb = self.obj_data.read_dataset("results_GB.csv")
            self.shows_table(root, df_gb, 350, 750, "Y_test and Y_pred of Gradient Boosting") 

        if chosen == "Extreme Gradient Boosting":
            best_model, y_pred = self.obj_ml.implement_XGB(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_xgb = self.obj_data.read_dataset("results_XGB.csv")
            self.shows_table(root, df_xgb, 350, 750, "Y_test and Y_pred of Extreme Gradient Boosting") 

        if chosen == "Multi-Layer Perceptron":
            best_model, y_pred = self.obj_ml.implement_MLP(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_mlp = self.obj_data.read_dataset("results_MLP.csv")
            self.shows_table(root, df_mlp, 350, 750, "Y_test and Y_pred of Multi-Layer Perceptron") 

        if chosen == "Support Vector Classifier":
            best_model, y_pred = self.obj_ml.implement_SVC(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_svc = self.obj_data.read_dataset("results_SVC.csv")
            self.shows_table(root, df_svc, 350, 750, "Y_test and Y_pred of Support Vector Classifier")

        if chosen == "AdaBoost":
            best_model, y_pred = self.obj_ml.implement_ADA(chosen, X_train, X_test, y_train, y_test)

            #Plots confusion matrix and ROC
            self.plot_cm_roc(best_model, X_test, y_test, y_pred, chosen, figure1, canvas1)

            #Plots true values versus predicted values diagram and learning curve
            self.plot_real_pred_val_learning_curve(best_model, X_train, y_train, 
                X_test, y_test, y_pred, chosen, figure2, canvas2)
            
            #Shows table of result
            df_ada = self.obj_data.read_dataset("results_ADA.csv")
            self.shows_table(root, df_ada, 350, 750, "Y_test and Y_pred of AdaBoost Classifier")



#form1.py
import tkinter as tk
from tkinter import ttk
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

class Form1:
    def __init__(self, window):
        self.window = window
        width = 1520
        height = 760
        self.window.geometry(f"{width}x{height}")
    
        #Adds canvasses
        self.add_canvas(self.window)

    def add_canvas(self, master):    
        # Create a frame for canvas1 with a border
        frame1 = ttk.Frame(master, borderwidth=3, relief="groove")
        frame1.grid(row=0, column=0, columnspan=1, rowspan=25, padx=5, pady=5, sticky="n")

        # Adds canvas1 widget to frame1
        self.figure1 = Figure(figsize=(7.4, 7.4), dpi=100)
        self.figure1.patch.set_facecolor('#F0F0F0')
        self.canvas1 = FigureCanvasTkAgg(self.figure1, master=frame1)
        self.canvas1.get_tk_widget().pack(fill=tk.BOTH, expand=True)

        # Create a frame for canvas2 with a border
        frame2 = ttk.Frame(master, borderwidth=3, relief="groove")
        frame2.grid(row=0, column=1, columnspan=1, rowspan=25, padx=5, pady=5, sticky="n")

        # Adds canvas2 widget to frame2
        self.figure2 = Figure(figsize=(7.4, 7.4), dpi=100)
        self.figure2.patch.set_facecolor('#F0F0F0')
        self.canvas2 = FigureCanvasTkAgg(self.figure2, master=frame2)
        self.canvas2.get_tk_widget().pack(fill=tk.BOTH, expand=True)
        
if __name__ == "__main__":
    window = tk.Tk()
    Form1(window)
    window.mainloop()


#form2.py
import tkinter as tk
from tkinter import scrolledtext

class Form2:
    def __init__(self, window):
        self.window = window
        width = 1300
        height = 500
        self.window.geometry(f"{width}x{height}")

        self.text = scrolledtext.ScrolledText(self.window, wrap=tk.WORD, bg="#F0F0F0")
        self.text.pack(expand=True, fill='both')  
        
if __name__ == "__main__":
    window = tk.Tk()
    Form2(window)
    window.mainloop()


#form3.py
import tkinter as tk
from tkinter import ttk
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

class Form3:
    def __init__(self, window):
        self.window = window
        width = 1520
        height = 760
        self.window.geometry(f"{width}x{height}")
    
        #Adds canvasses
        self.add_canvas(self.window)

    def add_canvas(self, master):    
        # Create a frame for canvas1 with a border
        frame1 = ttk.Frame(master, borderwidth=3, relief="groove")
        frame1.grid(row=0, column=0, columnspan=1, rowspan=25, padx=5, pady=5, sticky="n")

        # Adds canvas1 widget to frame1
        self.figure1 = Figure(figsize=(15, 7.4), dpi=100)
        self.figure1.patch.set_facecolor('#F0F0F0')
        self.canvas1 = FigureCanvasTkAgg(self.figure1, master=frame1)
        self.canvas1.get_tk_widget().pack(fill=tk.BOTH, expand=True)

        
if __name__ == "__main__":
    window = tk.Tk()
    Form3(window)
    window.mainloop()


#regression.py
import pandas as pd
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import roc_auc_score,roc_curve, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor

class Regression:
    def splitting_data_regression(self, X, y_final): 
        #Normalizes data
        scaler = MinMaxScaler()
        X_minmax_data = scaler.fit_transform(X)
        X_final = pd.DataFrame(columns=X.columns, data=X_minmax_data, index=X.index)
        print('Shape of features : ', X_final.shape)
        print('Shape of target : ', y_final.shape)

        #Shifts the target one step ahead so each row predicts the next sample;
        #the shift leaves a NaN in the last row, hence the -1 bound below
        n = 90
        y_final = y_final.shift(-1)
        y_val = y_final[-n:-1]
        y_final = y_final[:-n]

        #Holds out the last n rows of data as the validation set
        X_val = X_final[-n:-1]
        X_final = X_final[:-n]
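
        #Toy illustration of the shift (an added note):
        #    s = pd.Series([10, 20, 30, 40, 50]); s.shift(-1) -> [20, 30, 40, 50, NaN]
        #Row i now carries row i+1's value as its target (one-step-ahead
        #forecasting), and the trailing NaN is excluded by the -1 bounds above.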

        print("\n -----After process------ \n")
        print('Shape of features : ', X_final.shape)
        print('Shape of target : ', y_final.shape)
        print(y_final.tail().to_string())

        y_final=y_final.astype('float64')

        #Splits the remaining data into training and test data at 80% and 20% respectively
        #(uses len(X_final), since the last n rows were already moved to the validation set)
        split_idx = round(0.8 * len(X_final))
        print("split_idx=",split_idx)
        X_train_reg = X_final[:split_idx]
        y_train_reg = y_final[:split_idx]
        X_test_reg = X_final[split_idx:]
        y_test_reg = y_final[split_idx:]   

        #Saves into pkl files
        joblib.dump(X, 'X_Ori.pkl')
        joblib.dump(X_final, 'X_final_reg.pkl')
        joblib.dump(X_train_reg, 'X_train_reg.pkl')
        joblib.dump(X_test_reg, 'X_test_reg.pkl')
        joblib.dump(X_val, 'X_val_reg.pkl')
        joblib.dump(y_final, 'y_final_reg.pkl')
        joblib.dump(y_train_reg, 'y_train_reg.pkl')
        joblib.dump(y_test_reg, 'y_test_reg.pkl') 
        joblib.dump(y_val, 'y_val_reg.pkl')

    def load_regression_files(self):
        X_Ori = joblib.load('X_Ori.pkl')
        X_final_reg = joblib.load('X_final_reg.pkl')
        X_train_reg = joblib.load('X_train_reg.pkl')
        X_test_reg = joblib.load('X_test_reg.pkl')
        X_val_reg = joblib.load('X_val_reg.pkl')
        y_final_reg = joblib.load('y_final_reg.pkl')
        y_train_reg = joblib.load('y_train_reg.pkl')
        y_test_reg = joblib.load('y_test_reg.pkl')  
        y_val_reg = joblib.load('y_val_reg.pkl') 

        return X_Ori, X_final_reg, X_train_reg, X_test_reg, X_val_reg, y_final_reg, y_train_reg, y_test_reg, y_val_reg 

    def perform_regression(self, model, X, y, xtrain, ytrain, xtest, ytest, xval, yval, label):
        model.fit(xtrain, ytrain)
        predictions_test = model.predict(xtest)
        predictions_train = model.predict(xtrain)
        predictions_val = model.predict(xval)

        # Convert ytest and predictions_test to NumPy arrays
        ytest_np = ytest.to_numpy().flatten()
        predictions_test_np = predictions_test.flatten()

        str_label = 'RMSE using ' + label
        print(str_label + f': {np.sqrt(mean_squared_error(ytest_np, predictions_test_np))}')    
        print("mean square error: ", mean_squared_error(ytest_np, predictions_test_np))
        print("variance or r-squared: ", explained_variance_score(ytest_np, predictions_test_np))
        print("mean absolute error (MAE): ", mean_absolute_error(ytest_np, predictions_test_np))
        print("R2 (R-squared): ", r2_score(ytest_np, predictions_test_np))
        print("Adjusted R2: ", 1 - (1-r2_score(ytest_np, predictions_test_np))*(len(ytest_np)-1)/(len(ytest_np)-xtest.shape[1]-1))
    
        mean_percentage_error = np.mean((ytest_np - predictions_test_np) / ytest_np) * 100
        print("Mean Percentage Error (MPE): ", mean_percentage_error)
    
        mean_absolute_percentage_error = np.mean(np.abs((ytest_np - predictions_test_np) / ytest_np)) * 100
        print("Mean Absolute Percentage Error (MAPE): ", mean_absolute_percentage_error)
    
        print('ACTUAL: Avg. ' + f': {ytest_np.mean()}')
        print('ACTUAL: Median ' + f': {np.median(ytest_np)}')    
        print('PREDICTED: Avg. ' + f': {predictions_test_np.mean()}')
        print('PREDICTED: Median ' + f': {np.median(predictions_test_np)}')  

        # Evaluation of regression on all dataset
        all_pred = model.predict(X)
        print("mean square error (whole dataset): ", mean_squared_error(y, all_pred))
        print("variance or r-squared (whole dataset): ", explained_variance_score(y, all_pred))

        return predictions_test, predictions_train, predictions_val, all_pred

    def linear_regression(self, X_train, y_train):
        #Linear Regression    
        #Creates a Linear Regression model
        lin_reg = LinearRegression()

        #Defines the hyperparameter grid to search
        #('normalize' was removed from LinearRegression in scikit-learn 1.2;
        # scale the features beforehand instead -- see the pipeline sketch below)
        param_grid = {
            'fit_intercept': [True, False]   # Try both True and False for fit_intercept
        }

        #Creates GridSearchCV with the Linear Regression model and the hyperparameter grid
        grid_search = GridSearchCV(lin_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        #Fits the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        #Gets the best Linear Regression model from the grid search
        best_lin_reg = grid_search.best_estimator_

        #Prints the best hyperparameters found
        print("Best Hyperparameters for Linear Regression:")
        print(grid_search.best_params_)

        return best_lin_reg
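
    #Since 'normalize' is gone from scikit-learn, the equivalent search scales
    #the features inside a Pipeline; an added sketch (the 'model__' prefix
    #routes grid parameters to the pipeline step named 'model'):
    def linear_regression_pipeline(self, X_train, y_train):
        from sklearn.pipeline import Pipeline

        pipe = Pipeline([('scaler', MinMaxScaler()),
                         ('model', LinearRegression())])
        param_grid = {'model__fit_intercept': [True, False]}
        grid_search = GridSearchCV(pipe, param_grid, cv=5,
                                   scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        print("Best Hyperparameters for Linear Regression (pipeline):")
        print(grid_search.best_params_)
        return grid_search.best_estimator_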

    def rf_regression(self, X_train, y_train):
        #Random Forest Regression    
        # Create a RandomForestRegressor model
        rf_reg = RandomForestRegressor()

        # Define the hyperparameter grid to search
        param_grid = {
            'n_estimators': [50, 100, 150],           # Number of trees in the forest
            'max_depth': [None, 5, 10],                # Maximum depth of the tree
            'min_samples_split': [2, 5, 10],           # Minimum number of samples required to split an internal node
            'min_samples_leaf': [1, 2, 4],             # Minimum number of samples required to be at a leaf node
            'bootstrap': [True, False]                 # Whether bootstrap samples are used when building trees
        }

        # Create GridSearchCV with the RandomForestRegressor model and the hyperparameter grid
        grid_search = GridSearchCV(rf_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best RandomForestRegressor model from the grid search
        best_rf_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for RandomForestRegressor:")
        print(grid_search.best_params_)

        return best_rf_reg
    
    def dt_regression(self, X_train, y_train):
        #Decision Tree (DT) regression
        # Create a DecisionTreeRegressor model
        dt_reg = DecisionTreeRegressor(random_state=100)

        # Define the hyperparameter grid to search
        param_grid = {
            'max_depth': [None, 5, 10, 15],          # Maximum depth of the tree
            'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
            'min_samples_leaf': [1, 2, 4, 6],        # Minimum number of samples required to be at a leaf node
        }

        # Create GridSearchCV with the DecisionTreeRegressor model and the hyperparameter grid
        grid_search = GridSearchCV(dt_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best DecisionTreeRegressor model from the grid search
        best_dt_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for DecisionTreeRegressor:")
        print(grid_search.best_params_)

        return best_dt_reg

    def gb_regression(self, X_train, y_train):
        #Gradient Boosting regression
        # Create the GradientBoostingRegressor model
        gb_reg = GradientBoostingRegressor()

        # Define the hyperparameter grid to search
        param_grid = {
            'n_estimators': [50, 100, 150],                # Number of boosting stages (trees) to build
            'learning_rate': [0.01, 0.1, 0.5],             # Step size at each boosting iteration
            'max_depth': [3, 5, 7],                        # Maximum depth of the individual trees
            'min_samples_split': [2, 5, 10],               # Minimum number of samples required to split an internal node
            'min_samples_leaf': [1, 2, 4],                 # Minimum number of samples required to be at a leaf node
        }

        # Create GridSearchCV with the GradientBoostingRegressor model and the hyperparameter grid
        grid_search = GridSearchCV(gb_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best GradientBoostingRegressor model from the grid search
        best_gb_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for GradientBoostingRegressor:")
        print(grid_search.best_params_)

        return best_gb_reg
    
    def xgb_regression(self, X_train, y_train):
        #Extreme Gradient Boosting (XGB) 
        # Create the XGBRegressor model
        xgb_reg = XGBRegressor()

        # Define the hyperparameter grid to search
        param_grid = {
            'n_estimators': [50, 100, 150],             # Number of boosting stages (trees) to build
            'learning_rate': [0.01, 0.1, 0.5],          # Step size at each boosting iteration
            'max_depth': [3, 5, 7],                     # Maximum depth of the individual trees
            'min_child_weight': [1, 2, 4],              # Minimum sum of instance weight (hessian) needed in a child
            'gamma': [0, 0.1, 0.2],                     # Minimum loss reduction required to make a further partition on a leaf node
            'subsample': [0.8, 1.0],                    # Subsample ratio of the training instances
            'colsample_bytree': [0.8, 1.0]              # Subsample ratio of columns when constructing each tree
        }

        # Create GridSearchCV with the XGBRegressor model and the hyperparameter grid
        grid_search = GridSearchCV(xgb_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best XGBRegressor model from the grid search
        best_xgb_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for XGBRegressor:")
        print(grid_search.best_params_)

        return best_xgb_reg
    
    def mlp_regression(self, X_train, y_train):
        #MLP regression
        # Create the MLPRegressor model
        mlp_reg = MLPRegressor()

        # Define the hyperparameter grid to search
        param_grid = {
            'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],   # Number of neurons in each hidden layer
            'activation': ['relu', 'tanh'],                                # Activation function for the hidden layers
            'solver': ['adam', 'sgd'],                                     # Solver for weight optimization
            'learning_rate': ['constant', 'invscaling', 'adaptive'],      # Learning rate schedule
            'learning_rate_init': [0.01, 0.001],                           # Initial learning rate
            'max_iter': [100, 200, 300],                                   # Maximum number of iterations
        }

        # Create GridSearchCV with the MLPRegressor model and the hyperparameter grid
        grid_search = GridSearchCV(mlp_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best MLPRegressor model from the grid search
        best_mlp_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for MLPRegressor:")
        print(grid_search.best_params_)

        return best_mlp_reg

    def lasso_regression(self, X_train, y_train):
        # Create the LassoCV model
        lasso_reg = LassoCV(n_alphas=1000, max_iter=3000, random_state=0)

        # Define the hyperparameter grid to search
        # ('normalize' was removed from scikit-learn estimators in 1.2;
        #  scale the features beforehand instead)
        param_grid = {
            'fit_intercept': [True, False]     # Whether to calculate the intercept for this model
        }

        # Create GridSearchCV with the LassoCV model and the hyperparameter grid
        grid_search = GridSearchCV(lasso_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best LassoCV model from the grid search
        best_lasso_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for Lasso Regression:")
        print(grid_search.best_params_)

        return best_lasso_reg
    
    def ridge_regression(self, X_train, y_train):
        #Ridge regression
        ridge_reg = RidgeCV(gcv_mode='auto')

        # Define the hyperparameter grid to search
        # ('normalize' was removed from scikit-learn estimators in 1.2)
        param_grid = {
            'fit_intercept': [True, False]     # Whether to calculate the intercept for this model
        }

        # Create GridSearchCV with the RidgeCV model and the hyperparameter grid (optional if you include the param_grid)
        grid_search = GridSearchCV(ridge_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best RidgeCV model from the grid search
        best_ridge_reg = grid_search.best_estimator_

        # Print the best hyperparameters found (optional if you included the param_grid)
        print("Best Hyperparameters for Ridge Regression:")
        print(grid_search.best_params_)

        return best_ridge_reg

    def adaboost_regression(self, X_train, y_train):
        #Adaboost regression
        # Create the AdaBoostRegressor model
        ada_reg = AdaBoostRegressor()

        # Define the hyperparameter grid to search
        param_grid = {
            'n_estimators': [50, 100, 150],          # Number of boosting stages (trees) to build
            'learning_rate': [0.01, 0.1, 0.5],       # Step size at each boosting iteration
            'loss': ['linear', 'square', 'exponential']  # Loss function to use when updating weights
        }

        # Create GridSearchCV with the AdaBoostRegressor model and the hyperparameter grid
        grid_search = GridSearchCV(ada_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best AdaBoostRegressor model from the grid search
        best_ada_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for AdaBoostRegressor:")
        print(grid_search.best_params_)

        return best_ada_reg

    def knn_regression(self, X_train, y_train):
        #KNN regression
        # Create a KNeighborsRegressor model
        knn_reg = KNeighborsRegressor()

        # Define the hyperparameter grid to search
        param_grid = {
            'n_neighbors': [3, 5, 7, 9],             # Number of neighbors to use for regression
            'weights': ['uniform', 'distance'],       # Weight function used in prediction
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']  # Algorithm used to compute the nearest neighbors
        }

        # Create GridSearchCV with the KNeighborsRegressor model and the hyperparameter grid
        grid_search = GridSearchCV(knn_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

        # Fit the GridSearchCV to the training data
        grid_search.fit(X_train, y_train)

        # Get the best KNeighborsRegressor model from the grid search
        best_knn_reg = grid_search.best_estimator_

        # Print the best hyperparameters found
        print("Best Hyperparameters for KNeighborsRegressor:")
        print(grid_search.best_params_)

        return best_knn_reg
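

if __name__ == "__main__":
    #Added smoke-test sketch (not part of the original project): assumes
    #splitting_data_regression() has already written the pkl files to the
    #working directory, then tunes and evaluates one model end to end.
    reg = Regression()
    (X_Ori, X_final_reg, X_train_reg, X_test_reg, X_val_reg,
     y_final_reg, y_train_reg, y_test_reg, y_val_reg) = reg.load_regression_files()
    best_ridge = reg.ridge_regression(X_train_reg, y_train_reg)
    reg.perform_regression(best_ridge, X_final_reg, y_final_reg,
        X_train_reg, y_train_reg, X_test_reg, y_test_reg,
        X_val_reg, y_val_reg, "Ridge Regression")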


#machine_learning.py
import numpy as np 
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
#(plot_confusion_matrix was removed in scikit-learn 1.2 and was unused here)
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import os
import pandas as pd 
from process_data import Process_Data

class Machine_Learning:
    def __init__(self):
        self.obj_data = Process_Data()

    def oversampling_splitting(self, df):
        #Sets target column
        y = df["Churn"]

        #Binarizes the target (positive values -> 1) and ensures integer type
        y = np.array([1 if i > 0 else 0 for i in y]).astype(int)

        #Drops the target column from the features
        X = df.drop(["Churn"], axis=1)

        #Checks for null values in the features
        print(X.isnull().sum().to_string())
        print('Total number of null values: ', X.isnull().sum().sum())

        #Fills each null value in every column with mean value
        cols = list(X.columns)
        for n in cols:
            X[n].fillna(X[n].mean(),inplace = True)

        #Checks again null values
        print(X.isnull().sum().to_string())
        print('Total number of null values: ', X.isnull().sum().sum())

        # Check and convert data types
        X = X.astype(float)
        y = y.astype(int)

        sm = SMOTE(random_state=42)
        X,y = sm.fit_resample(X, y)

        #Splits the data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2021, stratify=y)   

        #Use Standard Scaler
        scaler = StandardScaler()
        X_train_stand = scaler.fit_transform(X_train)
        X_test_stand = scaler.transform(X_test)    
    
        #Saves into pkl files
        joblib.dump(X_train_stand, 'X_train.pkl')
        joblib.dump(X_test_stand, 'X_test.pkl')
        joblib.dump(y_train, 'y_train.pkl')
        joblib.dump(y_test, 'y_test.pkl')  
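
    #SMOTE is applied above before the train/test split, so synthetic samples
    #can leak into the test set and inflate the reported scores. A stricter
    #variant (an added sketch, not used by the GUI) splits first and resamples
    #only the training fold:
    def oversampling_splitting_no_leakage(self, df):
        y = (df["Churn"] > 0).astype(int)
        X = df.drop(["Churn"], axis=1).astype(float)
        X = X.fillna(X.mean())

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=2021, stratify=y)

        #SMOTE is fit on the training fold only, so the test set stays real
        X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, X_test, y_train, y_test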

    def load_files(self):
        X_train = joblib.load('X_train.pkl')
        X_test = joblib.load('X_test.pkl')
        y_train = joblib.load('y_train.pkl')
        y_test = joblib.load('y_test.pkl')
    
        return X_train, X_test, y_train, y_test

    def train_model(self, model, X, y):
        model.fit(X, y)
        return model

    def predict_model(self, model, X, proba=False):
        #'~proba' is Python's bitwise NOT, which is truthy for both True and
        #False, so the original test always took this branch; 'not' is intended
        if not proba:
            y_pred = model.predict(X)
        else:
            y_pred_proba = model.predict_proba(X)
            y_pred = np.argmax(y_pred_proba, axis=1)

        return y_pred

    def run_model(self, name, model, X_train, X_test, y_train, y_test, proba=False):   
        y_pred = self.predict_model(model, X_test, proba)
    
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred, average='weighted')
        precision = precision_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
    
        print(name)
        print('accuracy: ', accuracy)
        print('recall: ', recall)
        print('precision: ', precision)
        print('f1: ', f1)
        print(classification_report(y_test, y_pred)) 

        return y_pred

    def logistic_regression(self, name, X_train, X_test, y_train, y_test):
        #Logistic Regression Classifier
        # Define the parameter grid for the grid search
        param_grid = {
            'C': [0.01, 0.1, 1, 10],
            'penalty': [None, 'l2'],   # None replaces the 'none' string removed in newer scikit-learn; combos a solver rejects are scored NaN and skipped
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
        }

        # Initialize the Logistic Regression model
        logreg = LogisticRegression(max_iter=5000, random_state=2021)
    
        # Create GridSearchCV with the Logistic Regression model and the parameter grid
        grid_search = GridSearchCV(logreg, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    
        # Train and perform grid search
        grid_search.fit(X_train, y_train)
    
        # Get the best Logistic Regression model from the grid search
        best_model = grid_search.best_estimator_

        #Saves model
        joblib.dump(best_model, 'LR_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for LR:")
        print(grid_search.best_params_)        

        return best_model

    def implement_LR(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/LR_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('LR_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.logistic_regression(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_LR.csv")

        print("Training Logistic Regression done...")
        return model, y_pred

    def random_forest(self, name, X_train, X_test, y_train, y_test):
        #Random Forest Classifier    
        # Define the parameter grid for the grid search
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30, 40, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        # Initialize the RandomForestClassifier model
        rf = RandomForestClassifier(random_state=2021)
    
        # Create GridSearchCV with the RandomForestClassifier model and the parameter grid
        grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    
        # Train and perform grid search
        grid_search.fit(X_train, y_train)
    
        # Get the best RandomForestClassifier model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'RF_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for RF:")
        print(grid_search.best_params_)        

        return best_model

    def implement_RF(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/RF_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('RF_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.random_forest(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_RF.csv")

        print("Training Random Forest done...")
        return model, y_pred

    def knearest_neighbors(self, name, X_train, X_test, y_train, y_test):
        #KNN Classifier
        # Define the parameter grid for the grid search
        param_grid = {
            'n_neighbors': list(range(2, 10))
        }

        # Initialize the KNN Classifier
        knn = KNeighborsClassifier()
    
        # Create GridSearchCV with the KNN model and the parameter grid
        grid_search = GridSearchCV(knn, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    
        # Train and perform grid search
        grid_search.fit(X_train, y_train)
    
        # Get the best KNN model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'KNN_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for KNN:")
        print(grid_search.best_params_)        

        return best_model

    def implement_KNN(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/KNN_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('KNN_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.knearest_neighbors(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_KNN.csv")

        print("Training KNN done...")
        return model, y_pred

    def decision_trees(self, name, X_train, X_test, y_train, y_test):
        # Initialize the DecisionTreeClassifier model
        dt_clf = DecisionTreeClassifier(random_state=2021)
    
        # Define the parameter grid for the grid search
        param_grid = {
            'max_depth': np.arange(1, 51, 1),
            'criterion': ['gini', 'entropy'],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        }
    
        # Create GridSearchCV with the DecisionTreeClassifier model and the parameter grid
        grid_search = GridSearchCV(dt_clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    
        # Train and perform grid search
        grid_search.fit(X_train, y_train)
    
        # Get the best DecisionTreeClassifier model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'DT_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for DT:")
        print(grid_search.best_params_)        

        return best_model

    def implement_DT(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/DT_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('DT_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.decision_trees(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_DT.csv")

        print("Training Decision Trees done...")
        return model, y_pred

    def gradient_boosting(self, name, X_train, X_test, y_train, y_test):
        #Gradient Boosting Classifier      
        # Initialize the GradientBoostingClassifier model
        gbt = GradientBoostingClassifier(random_state=2021)

        # Define the parameter grid for the grid search
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30],
            'subsample': [0.6, 0.8, 1.0],
            'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
        }
    
        # Create GridSearchCV with the GradientBoostingClassifier model and the parameter grid
        grid_search = GridSearchCV(gbt, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

        # Train and perform grid search
        grid_search.fit(X_train, y_train)

        # Get the best GradientBoostingClassifier model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'GB_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for GB:")
        print(grid_search.best_params_)        

        return best_model

    def implement_GB(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/GB_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('GB_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.gradient_boosting(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_GB.csv")

        print("Training Gradient Boosting done...")
        return model, y_pred

    def extreme_gradient_boosting(self, name, X_train, X_test, y_train, y_test):
        # Define the parameter grid for the grid search
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
        }

        # Initialize the XGBoost classifier ('use_label_encoder' was removed in
        # XGBoost 2.0; 'logloss' is the binary counterpart of 'mlogloss')
        xgb = XGBClassifier(random_state=2021, eval_metric='logloss')

        # Create GridSearchCV with the XGBoost classifier and the parameter grid
        grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

        # Train and perform grid search
        grid_search.fit(X_train, y_train)

        # Get the best XGBoost classifier model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'XGB_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for XGB:")
        print(grid_search.best_params_)        

        return best_model

    def implement_XGB(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/XGB_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('XGB_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.extreme_gradient_boosting(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_XGB.csv")

        print("Training Extreme Gradient Boosting done...")
        return model, y_pred

    def multi_layer_perceptron(self, name, X_train, X_test, y_train, y_test):
        # Define the parameter grid for the grid search
        param_grid = {
            'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
            'activation': ['logistic', 'relu'],
            'solver': ['adam', 'sgd'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
        }

        # Initialize the MLP Classifier
        mlp = MLPClassifier(random_state=2021)

        # Create GridSearchCV with the MLP Classifier and the parameter grid
        grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

        # Train and perform grid search
        grid_search.fit(X_train, y_train)

        # Get the best MLP Classifier model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'MLP_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for MLP:")
        print(grid_search.best_params_)        

        return best_model

    def implement_MLP(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/MLP_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('MLP_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.multi_layer_perceptron(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_MLP.csv")

        print("Training Multi-Layer Perceptron done...")
        return model, y_pred

    def support_vector(self, name, X_train, X_test, y_train, y_test):
        #Support Vector Classifier
        # Define the parameter grid for the grid search
        param_grid = {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'poly', 'rbf'],
            'gamma': ['scale', 'auto', 0.1, 1],
        }

        # Initialize the SVC model
        model_svc = SVC(random_state=2021, probability=True)

        # Create GridSearchCV with the SVC model and the parameter grid
        grid_search = GridSearchCV(model_svc, param_grid, cv=3, scoring='accuracy', n_jobs=-1, refit=True)
    
        # Train and perform grid search
        grid_search.fit(X_train, y_train)

        # Get the best SVC model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'SVC_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for SVC:")
        print(grid_search.best_params_)        

        return best_model

    def implement_SVC(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/SVC_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('SVC_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.support_vector(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_SVC.csv")

        print("Training Support Vector Classifier done...")
        return model, y_pred

    def adaboost_classifier(self, name, X_train, X_test, y_train, y_test):
        # Define the parameter grid for the grid search
        param_grid = {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.1, 0.2],
        }

        # Initialize the AdaBoost classifier
        adaboost = AdaBoostClassifier(random_state=2021)

        # Create GridSearchCV with the AdaBoost classifier and the parameter grid
        grid_search = GridSearchCV(adaboost, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

    
        # Train and perform grid search
        grid_search.fit(X_train, y_train)

        # Get the best AdaBoost Classifier model from the grid search
        best_model = grid_search.best_estimator_
    
        #Saves model
        joblib.dump(best_model, 'ADA_Model.pkl')    
    
        # Print the best hyperparameters found
        print(f"Best Hyperparameters for AdaBoost:")
        print(grid_search.best_params_)        

        return best_model

    def implement_ADA(self, chosen, X_train, X_test, y_train, y_test):
        file_path = os.getcwd()+"/ADA_Model.pkl"
        if os.path.exists(file_path):
            model = joblib.load('ADA_Model.pkl')
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True) 
        else:
            model = self.adaboost_classifier(chosen, X_train, X_test, y_train, y_test)
            y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)

        #Saves result into csv file
        self.obj_data.save_result(y_test, y_pred, "results_ADA.csv")

        print("Training AdaBoost done...")
        return model, y_pred
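
    #Every implement_* method above repeats the same load-or-train-then-save
    #pattern; a generic helper (an added sketch, not wired into the GUI) would
    #capture it once, e.g.
    #self.implement_cached("AdaBoost", self.adaboost_classifier,
    #    "ADA_Model.pkl", "results_ADA.csv", X_train, X_test, y_train, y_test)
    def implement_cached(self, chosen, trainer, model_file, result_file,
                         X_train, X_test, y_train, y_test):
        path = os.path.join(os.getcwd(), model_file)
        if os.path.exists(path):
            model = joblib.load(path)
        else:
            model = trainer(chosen, X_train, X_test, y_train, y_test)
        y_pred = self.run_model(chosen, model, X_train, X_test, y_train, y_test, proba=True)
        self.obj_data.save_result(y_test, y_pred, result_file)
        print("Training " + chosen + " done...")
        return model, y_pred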