# First-Machine-Learning-Project/melb_model.py at main · NexarObs/First-Machine-Learning-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# Read the housing CSV and, in the same step, discard every row whose
# "Price" (the training target) is missing.
df = pd.read_csv("melb_data.csv").dropna(subset=["Price"])

# =============================
#   ENCODE CATEGORICAL DATA (LIMITED FEATURES)
# =============================
# Builds the feature matrix X and target y from df:
#   1. collapse rare values of the high-cardinality columns into "Other",
#   2. one-hot encode the selected categorical columns,
#   3. join the encoded columns with the numeric columns (minus the target).

from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns (object dtype). Informational only: the
# columns actually encoded are the hand-picked ones in useful_cats below.
categorical_cols = [col for col in df.columns if df[col].dtype == "object"]

# Keep only the useful categorical ones
useful_cats = ['Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname']

# Define high-cardinality columns to limit
high_card_cols = ['Suburb', 'SellerG']  # columns with many unique categories
top_n = 10  # keep only top 10 frequent categories per high-card column

# Group rare categories as "Other": any value outside the top_n most frequent
# ones for that column is replaced, which bounds the number of one-hot columns.
for col in high_card_cols:
    top_categories = df[col].value_counts().nlargest(top_n).index
    df[col] = df[col].where(df[col].isin(top_categories), 'Other')

# One-hot encode the selected categorical columns
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# fit_transform returns a plain array, so a DataFrame built from it would get
# a fresh 0..n-1 index. df may have gaps in its index (rows without a Price
# were dropped earlier), and pd.concat(axis=1) aligns on index labels, so the
# encoded frame must carry df's own index. Otherwise rows get paired with the
# wrong numeric features and extra all-NaN rows appear.
encoded_cats = pd.DataFrame(
    encoder.fit_transform(df[useful_cats]),
    columns=encoder.get_feature_names_out(useful_cats),
    index=df.index,
)

# Select numeric columns, excluding the target so it cannot leak into X
numeric_df = df.select_dtypes(include=["int64", "float64"]).drop("Price", axis=1)

# Combine numeric and encoded categorical features (row-aligned on df.index)
X = pd.concat([numeric_df, encoded_cats], axis=1)
y = df["Price"]

# Print summary
print(f"Total features after encoding: {X.shape[1]}")

# =============================
#   TRAINING PIPELINE
# =============================

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, random_state=0
)

# Build a 100-tree random forest and fit it on the training portion.
# fit() returns the estimator itself, so the chained call yields the model.
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

# Score the held-out rows.
preds = model.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)

# Express the error relative to the mean validation price, and derive a rough
# "accuracy" figure as its complement.
avg_price = y_valid.mean()
percent_error = (mae / avg_price) * 100
accuracy = 100 - percent_error

# Report everything in one place.
print(f"Mean Absolute Error: {mae:.0f}")
print(f"Average Price: {avg_price:.0f}")
print(f"Mean Absolute Error (% of average): {percent_error:.2f}%")
print(f"Approximate Accuracy: {accuracy:.2f}%")

# =============================
#   ADDITIONAL METRICS
# =============================

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# RMSE is the square root of the mean squared error on the validation rows.
mse = mean_squared_error(y_valid, preds)
rmse = np.sqrt(mse)

# Coefficient of determination on the same validation predictions.
r2 = r2_score(y_valid, preds)

print(f"R² Score: {r2:.3f}")
print(f"RMSE: {rmse:.0f}")

# =============================
#   SAVE MODEL ARTIFACTS
# =============================
import joblib

# Bundle the fitted model, the fitted encoder, and the training column order
# into a single dictionary so they are persisted together.
artifacts = {}
artifacts["model"] = model
artifacts["encoder"] = encoder
artifacts["feature_names"] = X.columns.tolist()  # column order the model was trained on

joblib.dump(artifacts, "melb_model_artifacts.pkl")
print("Saved artifacts to melb_model_artifacts.pkl")

# =============================
#   VISUALIZATIONS
# =============================

# 1. Feature Importance Plot
importances = model.feature_importances_
features = X.columns

# =============================
#   TOP FEATURES ONLY
# =============================
import numpy as np

# Positions of the 20 largest importances, ordered from most to least important.
sorted_idx = np.argsort(importances)[::-1][:20]

# The bars are passed in reverse (least to most important of the top 20).
bar_order = sorted_idx[::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features[bar_order], importances[bar_order])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Top 20 Most Important Features")
fig.tight_layout()
plt.show()


# 2. Actual vs Predicted Scatter Plot
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_valid, preds, alpha=0.5)

# Dashed red y = x reference line spanning the range of actual prices;
# points on it are predictions that match the actual value exactly.
price_span = [y_valid.min(), y_valid.max()]
ax.plot(price_span, price_span, "r--")

ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted House Prices")
fig.tight_layout()
plt.show()

# 3. Error Distribution Histogram
# Signed error per validation row: positive means the model over-predicted.
errors = preds - y_valid

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(errors, bins=40)
ax.set_xlabel("Prediction Error (Predicted - Actual)")
ax.set_ylabel("Count")
ax.set_title("Error Distribution")
fig.tight_layout()
plt.show()

# =============================
#   RESIDUAL ANALYSIS
# =============================
# Plot each residual against its predicted price, with a dashed zero line
# as the reference for "no error".
residuals = preds - y_valid

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(preds, residuals, alpha=0.5)
ax.axhline(0, color='r', linestyle='--')
ax.set_xlabel("Predicted Price")
ax.set_ylabel("Residual (Predicted - Actual)")
ax.set_title("Residuals vs Predicted Values")
fig.tight_layout()
plt.show()


# =============================
#   HYPERPARAMETER TUNING (Optional)
# =============================
# from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': [1.0, 'sqrt', 0.5]  # 'auto' was removed in scikit-learn 1.3; 1.0 (all features) is its regressor equivalent
# }

# search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=3,
#                             scoring='neg_mean_absolute_error', random_state=0)
# search.fit(X_train, y_train)

# print("Best parameters:", search.best_params_)
# print("Best cross-validated MAE:", -search.best_score_)

# # Cross-validation check on final model
# scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
# print("Average Cross-Validated MAE:", scores.mean())