# python_random_forest.py  RANDOM FOREST   April 2024  based on ass2s24q4.py

# A. Colin Cameron, Dept. of Economics, University of California - Davis

# This random forest example is based on Aurelien Geron "Hands-on Machine Learning
# with Scikit-Learn, Keras and TensorFlow" 3rd edition chapter 7
# https://github.com/ageron/handson-ml3/

# This uses scikit-learn - for documentation see https://scikit-learn.org/stable/

# Standard check that python is working
print("hello")

# READ IN A STATA DATA SET (or .CSV FILE) USING pandas
# The import below shortens pandas to pd in commands:
#   import pandas as pd
# In file paths use / and not \
# Using absolute paths is recommended.
# To set the working directory use the os module (see below).

# Key data modules (including os to set working directories)
import numpy as np
import pandas as pd
import os               

# Read in data in a .csv file into a dataframe
# Change working directory to directory with file carsdata.csv
os.getcwd()
os.chdir("c:/Users/ccameron/Dropbox/Desktop/Teaching/240f/assignments/")
os.getcwd()
alldata = pd.read_csv("ass2s24_q4.csv")
alldata.describe()
alldata.head()
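
# A Stata dataset can instead be read directly with pd.read_stata;
# a minimal sketch (the .dta filename below is hypothetical):
# alldata = pd.read_stata("ass2s24_q4.dta")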

# Create training set (called data) and holdout data (called holdout)
from sklearn.model_selection import train_test_split
data, holdout = train_test_split(alldata, test_size=0.2, random_state=42)
len(data)
len(holdout)

# RANDOM FOREST
# Set the global NumPy seed (scikit-learn uses it when random_state is not specified)
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

# Separate features and target variable
# .values extracts the NumPy array underlying the pandas dataframe named data
X = data.drop('ltotexp', axis=1).values
y = data['ltotexp'].values
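# scikit-learn also accepts DataFrames directly, which keeps feature names attached;
# an equivalent sketch:
#   X = data.drop('ltotexp', axis=1)
#   y = data['ltotexp']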
# Run random forest using scikit-learn defaults (100 trees)
rf = RandomForestRegressor(verbose=1)
# Fit the model to the data
rf.fit(X, y)
# In-sample R-squared (optimistic because the forest was fit to these data)
Rsquared = rf.score(X, y)
Rsquared
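
# The out-of-bag (OOB) error gives an honest fit measure without a holdout sample;
# a minimal sketch re-running the forest with oob_score=True:
rf_oob = RandomForestRegressor(oob_score=True, random_state=42)
rf_oob.fit(X, y)
# OOB R-squared, typically well below the in-sample R-squared
rf_oob.oob_score_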
predictions = rf.predict(X) 
predictions
# Convert array to dataframe and write to file
dfpred = pd.DataFrame(predictions,columns=['rfrprediction'])
dfpred.describe()
# Note to_csv and to_stata return None, so do not assign the result to data
dfpred.to_csv("predictions.csv", index=False)
dfpred.to_stata("predictions.dta")

# USE CROSS VALIDATION RATHER THAN OOB FOR TUNING PARAMETER SELECTION
# HERE TRY DIFFERENT DEPTHS AND NUMBER OF FEATURES IN RANDOM FOREST
# Note range(3,7) creates a sequence from 3 to 6 (7 is not included)
from sklearn.model_selection import GridSearchCV
params = {'max_depth': list(range(3,7)), 'max_features':[4,6,8,10]}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), params, verbose=1, cv=5)
grid_search.fit(X, y)
grid_search.best_params_
grid_search.best_estimator_
cvres = grid_search.cv_results_
cvres
# mean_test_score is the cross-validated R-squared (the default score for regressors)
for mean_score, param in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, param)
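
# An equivalent tabular view of the grid-search results (optional);
# param_max_depth, param_max_features and mean_test_score are standard cv_results_ keys:
pd.DataFrame(cvres)[["param_max_depth", "param_max_features", "mean_test_score"]]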
    
final_model = grid_search.best_estimator_
final_model

# SIMPLE IMPORTANCE MEASURES FROM MEAN DECREASE IN IMPURITY
# BETTER IS FEATURE PERMUTATION - see the sketch after the holdout evaluation below
importances = final_model.feature_importances_
importances
# Standard deviation of importances across the individual trees in the forest
std = np.std([tree.feature_importances_ for tree in final_model.estimators_], axis=0)
# Label the importances with the feature names
feature_names = data.drop('ltotexp', axis=1).columns
final_model_importances = pd.Series(importances, index=feature_names)
final_model_importances
std
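
# Optional bar plot of the importances with across-tree standard deviations;
# a minimal sketch, assuming matplotlib is installed:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
final_model_importances.plot.bar(yerr=std, ax=ax)
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
plt.show()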

# NOW USE THE HOLDOUT SAMPLE TO EVALUATE THE FINAL MODEL
X_test = holdout.drop('ltotexp', axis=1).values
y_test = holdout['ltotexp'].values
final_predictions = final_model.predict(X_test)
from sklearn.metrics import mean_squared_error
final_mse = mean_squared_error(y_test, final_predictions)
final_mse
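
# Permutation importance on the holdout sample, as flagged above;
# a minimal sketch (n_repeats and random_state are illustrative choices):
from sklearn.inspection import permutation_importance
perm = permutation_importance(final_model, X_test, y_test, n_repeats=10, random_state=42)
pd.Series(perm.importances_mean, index=feature_names)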

# END

