Machine Learning - House Prices
House prices in Douglas County are not driven simply by having grocery stores nearby; they are driven by access to the right quantity, mix, and quality of community features. To interpret the Douglas County real estate data, I first extracted, transformed, and loaded (ETL) the raw transaction-level and property-level records. I then engineered informative features from the raw data, and finally predicted house prices by applying the k-nearest neighbors algorithm (k-NN) to gain insight into the market.
Extract, Transform, Load (ETL)
- Data Loading
from pandas import read_csv

# Load the raw training data. keep_default_na=False means only the
# explicit markers listed in na_values ('NaN' and the empty string)
# are treated as missing.
trn_src = read_csv(
    'TrainingData.csv',
    encoding='ISO-8859-1',
    keep_default_na=False,
    na_values=[
        'NaN',
        '',
    ],
    parse_dates=[
        'transaction_date',
    ],
    dtype={
        # Identifier-like columns stay strings to preserve leading
        # zeros and non-numeric codes.
        'address_number': str,
        'unit_no': str,
        'land_economic_area_code': str,
        'neighborhood_code': str,
    },
)
data = trn_src.copy()
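A quick sanity check that the load behaved as expected (a minimal sketch; it only assumes the columns named above):
print(trn_src.shape)
print(trn_src['transaction_date'].dtype)  # datetime64[ns] after parse_dates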
- Missing Values
from numpy import nan

# These columns use sentinel codes rather than true missing values;
# map the placeholders to NaN so they can be imputed later.
data['remodeled_year'].replace(0, nan, inplace=True)
data['average_story_height'].replace([0, 1, 2, 3], nan, inplace=True)
data['no_of_bedroom'].replace(0, nan, inplace=True)
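To verify the replacements took effect, the per-column NaN counts should now include the former sentinel rows:
data[['remodeled_year', 'average_story_height', 'no_of_bedroom']].isna().sum()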
- Boolean Columns
# Convert the 'Y'/'N' flag columns to proper booleans.
flag = {
    'Y': True,
    'N': False,
}
data['vacant_flag'] = data['vacant_flag'].map(flag)
data['walkout_basement_flag'] = data['walkout_basement_flag'].map(flag)
- Data Saving
from pickle import dump

# Persist the cleaned frame so the next stage can start from here.
with open('etl.pickle', 'wb') as f:
    dump(data, f)
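Equivalently, pandas has a one-line pickle helper that round-trips the same way:
data.to_pickle('etl.pickle')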
Feature Engineering
- Data Loading
from pickle import load

with open('etl.pickle', 'rb') as f:
    data = load(f)
- CPI
from pandas import read_csv

# CPI.csv is indexed by year, with one column per calendar month.
cpi_src = read_csv('CPI.csv', index_col='Year')

def get_cpi(arr):
    # Look up the CPI for the transaction's year and month.
    date = arr['transaction_date']
    mo = cpi_src.columns[date.month - 1]
    return cpi_src.loc[date.year, mo]

data['cpi'] = data.apply(get_cpi, axis=1)
data[['transaction_date', 'cpi']].tail()
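get_cpi assumes CPI.csv has a Year index and twelve month columns in calendar order. A minimal sketch of that contract, with hypothetical values for illustration:
from pandas import DataFrame
demo_cpi = DataFrame(
    {'Jan': [251.1], 'Feb': [252.8]},  # hypothetical CPI values
    index=[2019],
)
demo_cpi.index.name = 'Year'
# month 2 of 2019 maps to the second column, 'Feb'
print(demo_cpi.loc[2019, demo_cpi.columns[2 - 1]])  # 252.8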
- CPI Adjustment
def adj_price(arr):
    # Deflate the nominal sale price by the CPI at the time of sale,
    # making prices from different years comparable.
    return arr['sale_price'] / arr['cpi']

data['cpi_adj_price'] = data.apply(adj_price, axis=1)
data[['transaction_date', 'cpi', 'sale_price', 'cpi_adj_price']].tail()
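Since this is a plain element-wise division, it can also be vectorized, which is much faster than apply on large frames:
data['cpi_adj_price'] = data['sale_price'] / data['cpi']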
- Wear and Tear
def get_abrasion(arr):
    trans_year = arr['transaction_date'].year
    trans_month = arr['transaction_date'].month
    # Sales in the second half of the year count toward the next year.
    if trans_month > 6:
        trans_year += 1
    # Age is measured from the most recent of construction and remodeling.
    # max() tolerates a NaN remodeled_year: NaN comparisons are False,
    # so built_year wins when remodeled_year is missing.
    last_modeled = max(arr['built_year'], arr['remodeled_year'])
    return trans_year - last_modeled

data['abrasion'] = data.apply(get_abrasion, axis=1)
data[['transaction_date', 'built_year', 'remodeled_year', 'abrasion']].head()
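A quick illustration of the half-year rollover on a hypothetical row:
from pandas import Series, Timestamp
row = Series({'transaction_date': Timestamp('2018-09-15'),
              'built_year': 2000, 'remodeled_year': 2010})
print(get_abrasion(row))  # September rolls to 2019, so 2019 - 2010 = 9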
- Room Size
# Average room size. no_of_bedroom was set to NaN where it was 0,
# so this never divides by zero.
data['room_size'] = data['built_as_sf'] / data['no_of_bedroom']
data[['built_as_sf', 'no_of_bedroom', 'room_size']].tail()
- House Location
from functools import lru_cache
from uszipcode import SearchEngine

search = SearchEngine()

# Cache lookups: many transactions share a ZIP code, so each code
# only needs one hit against the uszipcode database.
@lru_cache(maxsize=2048)
def zip_info(zipcode):
    return search.by_zipcode(zipcode)

def get_lat(arr):
    return zip_info(arr['location_zip_code']).lat

def get_lng(arr):
    return zip_info(arr['location_zip_code']).lng

data['lat'] = data.apply(get_lat, axis=1)
data['lng'] = data.apply(get_lng, axis=1)
data[['location_zip_code', 'lat', 'lng']].tail()
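Spot-checking the helper on a single code (80104 is used here purely as an example; uszipcode downloads its SQLite database on first use):
info = zip_info('80104')  # example ZIP code
print(info.lat, info.lng)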
- Local Median Income
# Reuse the cached zip_info helper defined above.
def get_household_income(arr):
    return zip_info(arr['location_zip_code']).median_household_income

data['area_income'] = data.apply(get_household_income, axis=1)
data[['location_zip_code', 'city_name', 'area_income']].head()
- Data Saving
from pickle import dump

with open('fe.pickle', 'wb') as f:
    dump(data, f)
The k-nearest neighbors algorithm (k-NN)
- Data Loading
from pickle import load

with open('fe.pickle', 'rb') as f:
    src = load(f)
- Feature Selection
feats = [
    'abrasion',
    'lat',
    'lng',
    #'area_income',
    'total_net_acres',
    'built_as_sf',
    'total_garage_sf',
    'room_size',
    #'no_of_bedroom',
    'no_of_bathroom',
]

def fill_median(data, col):
    # Impute missing values with the column median, which is robust
    # to the long right tail typical of real estate data.
    data[col] = data[col].fillna(data[col].median())

data = src.copy()
for col in feats:
    fill_median(data, col)

# predict == 0 marks rows with a known sale price (the training set).
data_trn = data[data['predict'] == 0][['cpi_adj_price'] + feats]
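After imputation, no NaNs should remain in the selected features (this check assumes every column has at least one observed value, since an all-NaN column would have a NaN median):
assert not data[feats].isna().any().any()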
- Training & Validation
from sklearn.preprocessing import StandardScaler

# k-NN is distance-based, so standardize every feature to zero mean and
# unit variance. The scaler is fit on all rows (training and prediction)
# so that the prediction rows can be transformed with the same statistics.
scaler = StandardScaler().fit(data[feats])
X = scaler.transform(data_trn[feats])
y = data_trn[['cpi_adj_price']]

from sklearn.model_selection import train_test_split
X_trn, X_vld, y_trn, y_vld = train_test_split(
    X, y, test_size=0.01, random_state=666)
- Hyper-parameter Selection
from sklearn.neighbors import KNeighborsRegressor

def adj_r2(m, X, y):
    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
    # penalizing R^2 for the number of features p.
    return 1 - ((1 - m.score(X, y)) * (X.shape[0] - 1)
                / (X.shape[0] - X.shape[1] - 1))

# Sweep odd values of k and keep the best validation score. Starting
# best_r2 at -inf keeps best_k valid even if every score is negative.
best_k, best_r2 = 1, float('-inf')
for k in range(1, 10, 2):
    m = KNeighborsRegressor(n_neighbors=k).fit(X_trn, y_trn)
    r2 = adj_r2(m, X_vld, y_vld)
    if best_r2 < r2:
        best_k, best_r2 = k, r2
print(f'Adjusted R^2: {best_r2} (k={best_k})')
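With a 1% validation split the score is noisy, so as a cross-check the same sweep could score each k with k-fold cross-validation instead (a sketch using plain R^2 via cross_val_score, not the adjusted variant above):
from numpy import ravel
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

for k in range(1, 10, 2):
    m = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(m, X, ravel(y), cv=5, scoring='r2')
    print(k, scores.mean())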
- Price Prediction
# predict == 1 marks the rows whose sale price is to be estimated.
data_tst = data[data['predict'] == 1].copy()
X_tst = scaler.transform(data_tst[feats])

# Refit on the full training set with the best k, then predict.
m = KNeighborsRegressor(n_neighbors=best_k).fit(X, y)
p = m.predict(X_tst)
data_tst.loc[:, 'cpi_adj_price'] = p.ravel()

# Convert the CPI-adjusted predictions back to nominal sale prices.
prediction = data_tst.assign(sale_price=lambda arr: arr.cpi_adj_price * arr.cpi)
prediction[feats + ['cpi', 'cpi_adj_price', 'sale_price']].head(10)
Result
Based on these results, the model could be improved by adding more informative features. Since some variables carry more signal than others, a weighted k-NN model or a neural network could also be applied to optimize the results.
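One common weighting variant, distance-weighted k-NN, is a one-line change in scikit-learn (a sketch; whether it actually helps here would need validation):
from sklearn.neighbors import KNeighborsRegressor
m = KNeighborsRegressor(n_neighbors=best_k, weights='distance').fit(X, y)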