The data set was donated by an unnamed company handling flight ticket reservations. The data is thin; it contains:
log_PAX
which is related to the number of passengers (the actual numbers were changed for privacy reasons)
The goal is to predict the log_PAX
column. The prediction quality is measured by RMSE.
The challenge for this dataset is to predict flight traffic using a learnt regressor.
# Notebook setup: inline plotting plus the core scientific-Python stack.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Show every column when displaying wide DataFrames (the one-hot encoding
# below creates many columns).
pd.set_option('display.max_columns', None)
# optional
import seaborn as sns; sns.set()  # seaborn's default style for matplotlib figures
# Load the training data and take a first exploratory look at it.
data = pd.read_csv("public_train.csv")

# Span of departure dates (ISO-formatted date strings sort chronologically).
print(min(data['DateOfDeparture']))
print(max(data['DateOfDeparture']))

data.head()
data['Departure'].unique()

# Histograms of the target and the two booking-window features.
for column in ['log_PAX', 'std_wtd', 'WeeksToDeparture']:
    data.hist(column, bins=50)

data.describe()
data.dtypes
data.shape

# Mean and spread of the target, as a reference point for later RMSE values.
print(data['log_PAX'].mean())
print(data['log_PAX'].std())
Getting dates into numerical columns is a common operation when time series are analyzed with non-parametric predictors. The code below makes all possible choices: ordered columns for the year, month, day, weekday, week, and day in the year, and one-hot columns for year, month, day, weekday, and week.
The departure and arrival airports are also converted into one-hot columns.
# Work on a copy: the original code aliased `data`, so every encoded column
# added below was silently written into the raw frame as well.
data_encoded = data.copy()

# One-hot encode the departure and arrival airports.
# NOTE(review): the 'd' prefix is reused for the day-of-month dummies below;
# airport codes are non-numeric so the names don't collide, but a distinct
# prefix would be safer.
data_encoded = data_encoded.join(pd.get_dummies(data_encoded['Departure'], prefix='d'))
data_encoded = data_encoded.join(pd.get_dummies(data_encoded['Arrival'], prefix='a'))
data_encoded = data_encoded.drop('Departure', axis=1)
data_encoded = data_encoded.drop('Arrival', axis=1)

# following http://stackoverflow.com/questions/16453644/regression-with-date-variable-using-scikit-learn
data_encoded['DateOfDeparture'] = pd.to_datetime(data_encoded['DateOfDeparture'])
# Ordered (ordinal) date features.
data_encoded['year'] = data_encoded['DateOfDeparture'].dt.year
data_encoded['month'] = data_encoded['DateOfDeparture'].dt.month
data_encoded['day'] = data_encoded['DateOfDeparture'].dt.day
data_encoded['weekday'] = data_encoded['DateOfDeparture'].dt.weekday
# .dt.week was removed in pandas 2.0; isocalendar().week gives the same
# ISO week number (cast back to a plain int column).
data_encoded['week'] = data_encoded['DateOfDeparture'].dt.isocalendar().week.astype(int)
# Days since the Unix epoch — vectorized instead of a per-row apply().
data_encoded['n_days'] = (data_encoded['DateOfDeparture'] - pd.to_datetime("1970-01-01")).dt.days

# One-hot versions of the same date features.
data_encoded = data_encoded.join(pd.get_dummies(data_encoded['year'], prefix='y'))
data_encoded = data_encoded.join(pd.get_dummies(data_encoded['month'], prefix='m'))
data_encoded = data_encoded.join(pd.get_dummies(data_encoded['day'], prefix='d'))
data_encoded = data_encoded.join(pd.get_dummies(data_encoded['weekday'], prefix='wd'))
data_encoded = data_encoded.join(pd.get_dummies(data_encoded['week'], prefix='w'))
data_encoded.tail(5)
We drop the target column and the original date column.
# Feature matrix: everything except the target and the raw date column
# (the date has already been expanded into numeric/one-hot features above).
features = data_encoded.drop(['log_PAX', 'DateOfDeparture'], axis=1)
X_columns = features.columns
X = features.values
y = data_encoded['log_PAX'].values

# sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection is the current home of train_test_split.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for a final evaluation; fixed seed for
# a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
It gives us a pretty nice improvement over the baseline.
from sklearn.linear_model import LinearRegression
# model_selection replaces the removed sklearn.cross_validation module.
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated RMSE of a plain linear regression.
reg = LinearRegression()
# 'neg_mean_squared_error' is the current name of the sign-flipped MSE
# scorer (the old 'mean_squared_error' string was removed); the scores are
# negative, which is why they are negated before taking the square root.
scores = cross_val_score(reg, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print("log RMSE: {:.4f} +/-{:.4f}".format(
    np.mean(np.sqrt(-scores)), np.std(np.sqrt(-scores))))
Exercise: Visualize the coefficients, try to make sense of them.
%%time
from sklearn.ensemble import RandomForestRegressor
n_estimators = 10
max_depth = 10
max_features = 10
reg = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features)
scores = cross_val_score(reg, X_train, y_train, cv=5, scoring='mean_squared_error',n_jobs=3)
print("log RMSE: {:.4f} +/-{:.4f}".format(
np.mean(np.sqrt(-scores)), np.std(np.sqrt(-scores))))
reg.fit(X_train, y_train)
len(X_columns)
# Bar chart of the 50 largest feature importances of the fitted forest.
all_importances = reg.feature_importances_
top = np.argsort(all_importances)[::-1][:50]
top_names = X_columns[top]
positions = np.arange(top_names.size)

plt.figure(figsize=(15, 5))
plt.bar(positions, all_importances[top])
plt.xticks(positions + 0.5, top_names, rotation=90, fontsize=15);