Store Sales - Time Series Forecasting

Kaggle Competition

  1. Initial Setup and Imports

# Data manipulation and analysis
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Time series specific
from statsmodels.tsa.seasonal import seasonal_decompose
from prophet import Prophet  

# Machine learning
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectKBest, f_regression

# Gradient boosting (XGBoost)
import xgboost as xgb
  2. Data Loading and Exploration

# Load all datasets
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

stores = pd.read_csv('./data/stores.csv')
stores.rename(columns={'type': 'store_type'}, inplace=True)

oil = pd.read_csv('./data/oil.csv')

holidays = pd.read_csv('./data/holidays_events.csv')
holidays.rename(columns={'type': 'holiday_type'}, inplace=True)

transactions = pd.read_csv('./data/transactions.csv')

# add the dataset column to the train and test dataframes
train['dataset'] = 'train'
test['dataset'] = 'test'

# concatenate the train and test dataframes to simplify preprocessing and ensure consistent feature engineering
df = pd.concat([train, test], axis=0).reset_index(drop=True)

# Convert date columns (train/test were concatenated into df above, but keep
# the standalone frames in sync since later cells reuse them directly)
df['date'] = pd.to_datetime(df['date'])
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
oil['date'] = pd.to_datetime(oil['date'])
holidays['date'] = pd.to_datetime(holidays['date'])
transactions['date'] = pd.to_datetime(transactions['date'])
# Basic information and statistics
print(df.info())
print(df.describe())

# Check for missing values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3029400 entries, 0 to 3029399
Data columns (total 7 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
 6   dataset      object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 161.8+ MB
None
                 id                           date     store_nbr  \
count  3.029400e+06                        3029400  3.029400e+06   
mean   1.514700e+06  2015-05-02 08:56:11.294118144  2.750000e+01   
min    0.000000e+00            2013-01-01 00:00:00  1.000000e+00   
25%    7.573498e+05            2014-03-02 18:00:00  1.400000e+01   
50%    1.514700e+06            2015-05-02 12:00:00  2.750000e+01   
75%    2.272049e+06            2016-07-01 06:00:00  4.100000e+01   
max    3.029399e+06            2017-08-31 00:00:00  5.400000e+01   
std    8.745126e+05                            NaN  1.558579e+01   

              sales   onpromotion  
count  3.000888e+06  3.029400e+06  
mean   3.577757e+02  2.643830e+00  
min    0.000000e+00  0.000000e+00  
25%    0.000000e+00  0.000000e+00  
50%    1.100000e+01  0.000000e+00  
75%    1.958473e+02  0.000000e+00  
max    1.247170e+05  7.410000e+02  
std    1.101998e+03  1.233287e+01  
id                 0
date               0
store_nbr          0
family             0
sales          28512
onpromotion        0
dataset            0
dtype: int64
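The 28,512 missing sales values are exactly the test rows, where the target is unknown by construction. A quick sanity check:

df.loc[df['dataset'] == 'test', 'sales'].isna().all()   # True: only test rows lack sales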
test.head()
id date store_nbr family onpromotion dataset
0 3000888 2017-08-16 1 AUTOMOTIVE 0 test
1 3000889 2017-08-16 1 BABY CARE 0 test
2 3000890 2017-08-16 1 BEAUTY 2 test
3 3000891 2017-08-16 1 BEVERAGES 20 test
4 3000892 2017-08-16 1 BOOKS 0 test
test.tail()
id date store_nbr family onpromotion dataset
28507 3029395 2017-08-31 9 POULTRY 1 test
28508 3029396 2017-08-31 9 PREPARED FOODS 0 test
28509 3029397 2017-08-31 9 PRODUCE 1 test
28510 3029398 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES 9 test
28511 3029399 2017-08-31 9 SEAFOOD 0 test
df
id date store_nbr family sales onpromotion dataset
0 0 2013-01-01 1 AUTOMOTIVE 0.0 0 train
1 1 2013-01-01 1 BABY CARE 0.0 0 train
2 2 2013-01-01 1 BEAUTY 0.0 0 train
3 3 2013-01-01 1 BEVERAGES 0.0 0 train
4 4 2013-01-01 1 BOOKS 0.0 0 train
... ... ... ... ... ... ... ...
3029395 3029395 2017-08-31 9 POULTRY NaN 1 test
3029396 3029396 2017-08-31 9 PREPARED FOODS NaN 0 test
3029397 3029397 2017-08-31 9 PRODUCE NaN 1 test
3029398 3029398 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 test
3029399 3029399 2017-08-31 9 SEAFOOD NaN 0 test

3029400 rows × 7 columns

holidays
date holiday_type locale locale_name description transferred
0 2012-03-02 Holiday Local Manta Fundacion de Manta False
1 2012-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
2 2012-04-12 Holiday Local Cuenca Fundacion de Cuenca False
3 2012-04-14 Holiday Local Libertad Cantonizacion de Libertad False
4 2012-04-21 Holiday Local Riobamba Cantonizacion de Riobamba False
... ... ... ... ... ... ...
345 2017-12-22 Additional National Ecuador Navidad-3 False
346 2017-12-23 Additional National Ecuador Navidad-2 False
347 2017-12-24 Additional National Ecuador Navidad-1 False
348 2017-12-25 Holiday National Ecuador Navidad False
349 2017-12-26 Additional National Ecuador Navidad+1 False

350 rows × 6 columns

holidays[holidays['transferred'] == True]
date holiday_type locale locale_name description transferred
19 2012-10-09 Holiday National Ecuador Independencia de Guayaquil True
72 2013-10-09 Holiday National Ecuador Independencia de Guayaquil True
135 2014-10-09 Holiday National Ecuador Independencia de Guayaquil True
255 2016-05-24 Holiday National Ecuador Batalla de Pichincha True
266 2016-07-25 Holiday Local Guayaquil Fundacion de Guayaquil True
268 2016-08-10 Holiday National Ecuador Primer Grito de Independencia True
297 2017-01-01 Holiday National Ecuador Primer dia del ano True
303 2017-04-12 Holiday Local Cuenca Fundacion de Cuenca True
312 2017-05-24 Holiday National Ecuador Batalla de Pichincha True
324 2017-08-10 Holiday National Ecuador Primer Grito de Independencia True
328 2017-09-28 Holiday Local Ibarra Fundacion de Ibarra True
340 2017-12-06 Holiday Local Quito Fundacion de Quito True
holidays[holidays['holiday_type'] == 'Transfer']
date holiday_type locale locale_name description transferred
20 2012-10-12 Transfer National Ecuador Traslado Independencia de Guayaquil False
73 2013-10-11 Transfer National Ecuador Traslado Independencia de Guayaquil False
136 2014-10-10 Transfer National Ecuador Traslado Independencia de Guayaquil False
256 2016-05-27 Transfer National Ecuador Traslado Batalla de Pichincha False
265 2016-07-24 Transfer Local Guayaquil Traslado Fundacion de Guayaquil False
269 2016-08-12 Transfer National Ecuador Traslado Primer Grito de Independencia False
298 2017-01-02 Transfer National Ecuador Traslado Primer dia del ano False
304 2017-04-13 Transfer Local Cuenca Fundacion de Cuenca False
313 2017-05-26 Transfer National Ecuador Traslado Batalla de Pichincha False
325 2017-08-11 Transfer National Ecuador Traslado Primer Grito de Independencia False
329 2017-09-29 Transfer Local Ibarra Fundacion de Ibarra False
342 2017-12-08 Transfer Local Quito Traslado Fundacion de Quito False
stores.head()
store_nbr city state store_type cluster
0 1 Quito Pichincha D 13
1 2 Quito Pichincha D 13
2 3 Quito Pichincha D 8
3 4 Quito Pichincha D 9
4 5 Santo Domingo Santo Domingo de los Tsachilas D 4
oil.head()
date dcoilwtico
0 2013-01-01 NaN
1 2013-01-02 93.14
2 2013-01-03 92.97
3 2013-01-04 93.12
4 2013-01-07 93.20
oil.tail()
date dcoilwtico
1213 2017-08-25 47.65
1214 2017-08-28 46.40
1215 2017-08-29 46.46
1216 2017-08-30 45.96
1217 2017-08-31 47.26
transactions.head()
date store_nbr transactions
0 2013-01-01 25 770
1 2013-01-02 1 2111
2 2013-01-02 2 2358
3 2013-01-02 3 3487
4 2013-01-02 4 1922
  3. Data Preprocessing

# Combine all the data
def combine_data(df):
    df = df.merge(stores, on='store_nbr', how='left')
    df = df.merge(oil, on='date', how='left')
    df = df.merge(holidays, on='date', how='left')
    df = df.merge(transactions, on=['date', 'store_nbr'], how='left')
    return df

df = combine_data(df)
df.sample(5)
id date store_nbr family sales onpromotion dataset city state store_type cluster dcoilwtico holiday_type locale locale_name description transferred transactions
2215381 2181345 2016-05-12 14 GROCERY I 2197.0 50 train Riobamba Chimborazo C 7 46.64 Event National Ecuador Terremoto Manabi+26 False 1223.0
173364 173364 2013-04-08 23 HOME AND KITCHEN I 0.0 0 train Ambato Tungurahua D 9 93.36 NaN NaN NaN NaN NaN 1174.0
3035383 2981923 2017-08-05 27 EGGS 260.0 0 train Daule Guayas D 1 NaN Holiday Local Esmeraldas Fundacion de Esmeraldas False 1862.0
678331 669421 2014-01-12 41 HOME AND KITCHEN II 14.0 0 train Machala El Oro D 4 NaN NaN NaN NaN NaN NaN 1118.0
84434 84434 2013-02-17 28 LAWN AND GARDEN 0.0 0 train Guayaquil Guayas E 10 NaN NaN NaN NaN NaN NaN 1176.0
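Note that holidays_events.csv can contain more than one row for the same date (a national and a local holiday can coincide), so the left merge on date duplicates the affected sales rows: the combined frame has grown past the original 3,029,400 rows (see the indices above 3,029,399 in the samples). A minimal sketch of one way to avoid this, not applied here:

# Illustrative only: keep a single holiday row per date so the merge stays 1:1
holidays_dedup = holidays.sort_values('date').drop_duplicates('date', keep='first')
df = df.merge(holidays_dedup, on='date', how='left')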
# identify missing values in each column
df.isnull().sum()
id                    0
date                  0
store_nbr             0
family                0
sales             28512
onpromotion           0
dataset               0
city                  0
state                 0
store_type            0
cluster               0
dcoilwtico       962280
holiday_type    2578554
locale          2578554
locale_name     2578554
description     2578554
transferred     2578554
transactions     277629
dtype: int64
# handle missing values
def handle_missing_values(df):
    """Handle missing values in the dataframe"""

    # oil prices: forward fill, then backward fill the leading gap
    df['dcoilwtico'] = df['dcoilwtico'].ffill().bfill()

    # non-holiday rows picked up NaNs from the merge: mark them explicitly
    # (transferred is filled too, though the column is dropped later)
    df['holiday_type'] = df['holiday_type'].fillna('Not Holiday')
    df['transferred'] = df['transferred'].fillna(True).astype(bool)

    # A date is flagged as a holiday if any holiday row merged onto it.
    # Transferred holidays are still flagged here -- a simplification, since
    # a transferred holiday is officially observed on its new date.
    df['is_holiday'] = df['holiday_type'] != 'Not Holiday'

    return df

df = handle_missing_values(df)

df.sample(5)
id date store_nbr family sales onpromotion dataset city state store_type cluster dcoilwtico holiday_type locale locale_name description transferred transactions is_holiday
983179 968923 2014-06-29 45 EGGS 686.0 0 train Quito Pichincha A 11 106.46 Event National Ecuador Mundial de futbol Brasil: Octavos de Final False 4238.0 True
302505 300723 2013-06-18 46 PLAYERS AND ELECTRONICS 0.0 0 train Quito Pichincha A 14 98.46 Not Holiday NaN NaN NaN True 2788.0 False
1946808 1921860 2015-12-17 33 CELEBRATION 15.0 0 train Quevedo Los Rios C 3 34.98 Not Holiday NaN NaN NaN True 972.0 False
1157335 1141297 2014-10-04 31 PERSONAL CARE 316.0 0 train Babahoyo Los Rios B 10 89.76 Not Holiday NaN NaN NaN True 1509.0 False
1078821 1062783 2014-08-21 29 HOME CARE 0.0 0 train Guayaquil Guayas E 10 93.97 Not Holiday NaN NaN NaN True NaN False
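The trailing .bfill() matters because the very first oil price is missing (see oil.head() above), so a forward fill alone would leave a leading NaN. A tiny illustration:

# ffill().bfill() on a series with a leading gap
s = pd.Series([np.nan, 93.14, np.nan, 93.12])
s.ffill().bfill()   # -> 93.14, 93.14, 93.14, 93.12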
# identify missing values in each column
df.isnull().sum()
id                    0
date                  0
store_nbr             0
family                0
sales             28512
onpromotion           0
dataset               0
city                  0
state                 0
store_type            0
cluster               0
dcoilwtico            0
holiday_type          0
locale          2578554
locale_name     2578554
description     2578554
transferred           0
transactions     277629
is_holiday            0
dtype: int64
transactions.tail()
date store_nbr transactions
83483 2017-08-15 50 2804
83484 2017-08-15 51 1573
83485 2017-08-15 52 2255
83486 2017-08-15 53 932
83487 2017-08-15 54 802
# Check for missing values in the 'transactions' column
missing_transactions = df[df['transactions'].isnull()]
missing_transactions
id date store_nbr family sales onpromotion dataset city state store_type cluster dcoilwtico holiday_type locale locale_name description transferred transactions is_holiday
0 0 2013-01-01 1 AUTOMOTIVE 0.0 0 train Quito Pichincha D 13 93.14 Holiday National Ecuador Primer dia del ano False NaN True
1 1 2013-01-01 1 BABY CARE 0.0 0 train Quito Pichincha D 13 93.14 Holiday National Ecuador Primer dia del ano False NaN True
2 2 2013-01-01 1 BEAUTY 0.0 0 train Quito Pichincha D 13 93.14 Holiday National Ecuador Primer dia del ano False NaN True
3 3 2013-01-01 1 BEVERAGES 0.0 0 train Quito Pichincha D 13 93.14 Holiday National Ecuador Primer dia del ano False NaN True
4 4 2013-01-01 1 BOOKS 0.0 0 train Quito Pichincha D 13 93.14 Holiday National Ecuador Primer dia del ano False NaN True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3082855 3029395 2017-08-31 9 POULTRY NaN 1 test Quito Pichincha B 6 47.26 Not Holiday NaN NaN NaN True NaN False
3082856 3029396 2017-08-31 9 PREPARED FOODS NaN 0 test Quito Pichincha B 6 47.26 Not Holiday NaN NaN NaN True NaN False
3082857 3029397 2017-08-31 9 PRODUCE NaN 1 test Quito Pichincha B 6 47.26 Not Holiday NaN NaN NaN True NaN False
3082858 3029398 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 test Quito Pichincha B 6 47.26 Not Holiday NaN NaN NaN True NaN False
3082859 3029399 2017-08-31 9 SEAFOOD NaN 0 test Quito Pichincha B 6 47.26 Not Holiday NaN NaN NaN True NaN False

277629 rows × 19 columns

def drop_columns(df):
    """Drop columns that are not needed"""
    df = df.drop(columns=['transferred', 'locale', 'locale_name', 'transactions', 'description'])
    return df

df = drop_columns(df)
df.sample(5)
id date store_nbr family sales onpromotion dataset city state store_type cluster dcoilwtico holiday_type is_holiday
1487597 1467995 2015-04-06 48 MAGAZINES 0.0 0 train Quito Pichincha A 14 52.08 Not Holiday False
2252532 2216892 2016-06-01 11 HOME CARE 621.0 4 train Cayambe Pichincha B 6 49.07 Not Holiday False
140269 140269 2013-03-20 44 LADIESWEAR 0.0 0 train Quito Pichincha A 5 93.21 Not Holiday False
1911868 1886920 2015-11-27 52 GROCERY II 0.0 0 train Manta Manabi A 11 40.57 Event True
2647452 2601120 2017-01-03 41 PLAYERS AND ELECTRONICS 15.0 0 train Machala El Oro D 4 52.36 Not Holiday False
# identify missing values in each column
df.isnull().sum()
id                  0
date                0
store_nbr           0
family              0
sales           28512
onpromotion         0
dataset             0
city                0
state               0
store_type          0
cluster             0
dcoilwtico          0
holiday_type        0
is_holiday          0
dtype: int64
# Basic Time Features
def create_time_features(df):
    """Create basic time-based features"""
    df['date'] = pd.to_datetime(df['date'])  # no-op if already datetime
    df = df.sort_values('date')

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day_of_month'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek

    df['is_weekend'] = df['day_of_week'] >= 5
    df['week_of_year'] = df['date'].dt.isocalendar().week
    df['day_of_year'] = df['date'].dt.dayofyear
    df['quarter'] = df['date'].dt.quarter

    # Paydays: public-sector wages in Ecuador are paid on the 15th and the
    # last day of the month (per the competition description)
    df['is_payday'] = ((df['day_of_month'] == 15) |
                       (df['date'].dt.is_month_end)).astype(int)

    return df

df = create_time_features(df)

df.sample(5)
id date store_nbr family sales onpromotion dataset city state store_type ... holiday_type is_holiday year month day_of_week is_weekend week_of_year day_of_year quarter is_payday
970605 960797 2014-06-25 18 BEAUTY 0.0 0 train Quito Pichincha B ... Holiday True 2014 6 2 False 26 176 2 0
563901 556773 2013-11-09 30 PRODUCE 0.0 0 train Guayaquil Guayas C ... Not Holiday False 2013 11 5 True 45 313 4 0
2895270 2847156 2017-05-21 45 HOME AND KITCHEN I 77.0 2 train Quito Pichincha A ... Not Holiday False 2017 5 6 True 20 141 2 0
976977 962721 2014-06-26 21 GROCERY I 0.0 0 train Santo Domingo Santo Domingo de los Tsachilas B ... Not Holiday False 2014 6 3 False 26 177 2 0
2682101 2635769 2017-01-23 14 PET SUPPLIES 0.0 0 train Riobamba Chimborazo C ... Not Holiday False 2017 1 0 False 4 23 1 0

5 rows × 22 columns

  4. Exploratory Data Analysis (EDA)

# Plot overall sales trend
df_train = df[df['dataset'] == 'train']

plt.figure(figsize=(15,6))
plt.plot(df_train['date'], df_train['sales'])
plt.title('Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.show()

# Monthly sales patterns
monthly_sales = df_train.groupby('month')['sales'].mean()
plt.figure(figsize=(10,5))
monthly_sales.plot(kind='bar')
plt.title('Average Sales by Month')
plt.show()

# Decompose time series (aggregate to one observation per day first;
# decomposing the raw frame would interleave every store/family series)
daily_sales = df_train.groupby('date')['sales'].sum()
decomposition = seasonal_decompose(daily_sales, period=30)  # adjust period as needed
decomposition.plot()
plt.show()
[Plots: sales over time, average sales by month, seasonal decomposition]
# 2. Basic Sales Analysis
df_train = df[df['dataset'] == 'train'].copy()  # copy so new columns can be added without warnings

def analyze_sales_patterns():
    """Analyze basic sales patterns"""
    plt.figure(figsize=(15, 10))
    
    # Overall sales trend
    plt.subplot(2, 2, 1)
    daily_sales = df_train.groupby('date')['sales'].sum()
    daily_sales.plot()
    plt.title('Daily Total Sales')
    plt.xticks(rotation=45)
    
    # Sales by day of week
    plt.subplot(2, 2, 2)
    df_train['day_of_week'] = df_train['date'].dt.dayofweek
    sns.boxplot(data=df_train, x='day_of_week', y='sales')
    plt.title('Sales by Day of Week')
    
    # Sales distribution
    plt.subplot(2, 2, 3)
    sns.histplot(data=df_train, x='sales', bins=50)
    plt.title('Sales Distribution')
    
    # Monthly sales pattern
    plt.subplot(2, 2, 4)
    df_train['month'] = df_train['date'].dt.month
    monthly_sales = df_train.groupby('month')['sales'].mean()
    monthly_sales.plot(kind='bar')
    plt.title('Average Sales by Month')
    
    plt.tight_layout()
    plt.show()

print("Analyzing sales patterns...")
analyze_sales_patterns()
Analyzing sales patterns...
[Plots: daily total sales, sales by day of week, sales distribution, average sales by month]
# 3. Store Analysis
def analyze_store_impact():
    """Analyze how store characteristics affect sales"""
    df_train = df[df['dataset'] == 'train']

    plt.figure(figsize=(15, 10))
    
    # Sales by store type
    plt.subplot(2, 2, 1)
    sns.boxplot(data=df_train, x='store_type', y='sales')
    plt.title('Sales by Store Type')
    plt.xticks(rotation=45)
    
    # Sales by cluster
    plt.subplot(2, 2, 2)
    cluster_sales = df_train.groupby('cluster')['sales'].mean().sort_values()
    cluster_sales.plot(kind='bar')
    plt.title('Average Sales by Cluster')
    plt.xticks(rotation=45)
    
    # Sales by city
    plt.subplot(2, 2, 3)
    city_sales = df_train.groupby('city')['sales'].mean().sort_values(ascending=False)[:10]
    city_sales.plot(kind='bar')
    plt.title('Top 10 Cities by Average Sales')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()
    
    # Calculate store metrics
    store_metrics = df_train.groupby('store_nbr').agg({
        'sales': ['mean', 'std', 'count'],
        'onpromotion': 'mean'
    }).round(2)
    
    return store_metrics

print("Analyzing store impact...")
store_metrics = analyze_store_impact()
print("\nStore metrics summary:")
print(store_metrics.head())
Analyzing store impact...
[Plots: sales by store type, average sales by cluster, top 10 cities by average sales]
Store metrics summary:
            sales                 onpromotion
             mean      std  count        mean
store_nbr                                    
1          254.65   597.11  56562        2.48
2          389.43  1083.07  56562        2.85
3          911.10  2152.14  56562        3.19
4          341.31   803.42  56562        2.74
5          281.18   654.14  56562        2.70
# 4. Product Family Analysis
def analyze_product_families():
    """Analyze sales patterns by product family"""
    plt.figure(figsize=(15, 10))
    
    # Average sales by family
    family_sales = train.groupby('family')['sales'].mean().sort_values(ascending=False)
    
    plt.subplot(2, 1, 1)
    family_sales.plot(kind='bar')
    plt.title('Average Sales by Product Family')
    plt.xticks(rotation=45)
    
    # Promotion effectiveness by family (families that were never promoted
    # divide by zero and show up as inf in the output below)
    family_promo = train.groupby('family').agg({
        'sales': 'mean',
        'onpromotion': 'mean'
    })
    family_promo['promo_effectiveness'] = family_promo['sales'] / family_promo['onpromotion']
    
    plt.subplot(2, 1, 2)
    family_promo['promo_effectiveness'].sort_values(ascending=False).plot(kind='bar')
    plt.title('Promotion Effectiveness by Family')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()
    
    return family_promo

print("Analyzing product families...")
family_promo = analyze_product_families()
print("\nProduct family promotion effectiveness:")
print(family_promo.sort_values('promo_effectiveness', ascending=False).head())
Analyzing product families...
[Plots: average sales by product family, promotion effectiveness by family]
Product family promotion effectiveness:
                    sales  onpromotion  promo_effectiveness
family                                                     
BOOKS            0.070797     0.000000                  inf
MAGAZINES        2.929082     0.003266           896.831650
HOME APPLIANCES  0.457476     0.000638           717.258621
HARDWARE         1.137833     0.001792           634.785276
LADIESWEAR       7.160629     0.018475           387.594643
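The inf value is the never-promoted BOOKS family (mean onpromotion of zero). If a clean ranking is needed, the infinite ratios can be masked first, e.g.:

# Drop infinite ratios from never-promoted families before ranking
family_promo['promo_effectiveness'].replace([np.inf, -np.inf], np.nan).dropna().sort_values(ascending=False).head()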
# 5. Promotion Analysis
def analyze_promotions():
    """Analyze the impact of promotions"""
    # Calculate promotion effectiveness
    promo_impact = train.groupby('onpromotion')['sales'].agg(['mean', 'count', 'std'])
    
    # Time-based promotion analysis
    train['month'] = train['date'].dt.month
    promo_by_month = train.groupby('month')['onpromotion'].mean()
    
    plt.figure(figsize=(15, 5))
    promo_by_month.plot(kind='bar')
    plt.title('Promotion Frequency by Month')
    plt.show()
    
    return promo_impact

print("Analyzing promotions...")
promo_impact = analyze_promotions()
print("\nPromotion impact summary:")
print(promo_impact)
Analyzing promotions...
# 6. Oil Price Impact
def analyze_oil_impact():
    """Analyze relationship between oil prices and sales"""
    # Merge oil data
    df_train = df[df['dataset'] == 'train']

    # Calculate correlation
    correlation = df_train['sales'].corr(df_train['dcoilwtico'])
    
    plt.figure(figsize=(10, 5))
    plt.scatter(df_train['dcoilwtico'], df_train['sales'], alpha=0.1)
    plt.title(f'Sales vs Oil Price (correlation: {correlation:.2f})')
    plt.xlabel('Oil Price')
    plt.ylabel('Sales')
    plt.show()
    
    return correlation

print("Analyzing oil price impact...")
oil_correlation = analyze_oil_impact()
print(f"\nOil price correlation with sales: {oil_correlation:.3f}")
Analyzing oil price impact...
[Scatter plot: sales vs oil price]
Oil price correlation with sales: -0.075
# 7. Holiday Analysis
def analyze_holiday_impact():
    """Analyze sales patterns during holidays"""
    df_train = df[df['dataset'] == 'train']

    # Compare sales on holidays vs non-holidays
    holiday_stats = df_train.groupby('is_holiday')['sales'].agg(['mean', 'std', 'count'])
    
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df_train, x='is_holiday', y='sales')
    plt.title('Sales Distribution: Holiday vs Non-Holiday')
    plt.show()
    
    return holiday_stats

print("Analyzing holiday impact...")
holiday_stats = analyze_holiday_impact()
print("\nHoliday vs Non-holiday sales:")
print(holiday_stats)
Analyzing holiday impact...
[Box plot: sales distribution, holiday vs non-holiday]
Holiday vs Non-holiday sales:
                  mean          std    count
is_holiday                                  
False       352.159181  1076.081977  2551824
True        393.864762  1253.233869   502524
# 8. Earthquake Impact Analysis
def analyze_earthquake_impact():
    """Analyze impact of the 2016 earthquake"""
    earthquake_date = pd.Timestamp('2016-04-16')
    df_train = df[df['dataset'] == 'train']

    # Create time windows around earthquake
    before_earthquake = df_train[
        (df_train['date'] >= earthquake_date - pd.Timedelta(days=30)) & 
        (df_train['date'] < earthquake_date)
    ]
    
    after_earthquake = df_train[
        (df_train['date'] >= earthquake_date) & 
        (df_train['date'] < earthquake_date + pd.Timedelta(days=30))
    ]
    
    # Compare sales
    comparison = pd.DataFrame({
        'before': before_earthquake['sales'].describe(),
        'after': after_earthquake['sales'].describe()
    })
    
    return comparison

print("Analyzing earthquake impact...")
earthquake_comparison = analyze_earthquake_impact()
print("\nSales before vs after earthquake:")
print(earthquake_comparison)
Analyzing earthquake impact...

Sales before vs after earthquake:
             before          after
count  53460.000000   62370.000000
mean     422.028199     502.773620
std     1197.638473    1691.132072
min        0.000000       0.000000
25%        2.000000       2.000000
50%       21.000000      24.000000
75%      244.418750     278.000000
max    22018.000000  124717.000000
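The windows above suggest a clear demand spike after the quake. Quantifying it from the comparison table:

# Relative change in mean sales across the two 30-day windows
lift = earthquake_comparison.loc['mean', 'after'] / earthquake_comparison.loc['mean', 'before'] - 1
print(f"Mean sales rose {lift:.1%} in the 30 days after the earthquake")  # ~19.1%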
# 9. Feature Importance Analysis
def analyze_feature_importance():
    """Analyze correlations between features and sales"""
    # Work on a copy of the training slice (assigning back to df inside the
    # function would shadow the global frame and raise UnboundLocalError)
    df_train = df[df['dataset'] == 'train'].copy()

    # Create dummy variables for categorical features
    df_train = pd.get_dummies(df_train, columns=['store_type', 'family'])

    # Drop non-numeric columns
    df_train = df_train.select_dtypes(include=[np.number])

    # Calculate correlations with sales
    correlations = df_train.corr()['sales'].sort_values(ascending=False)

    plt.figure(figsize=(12, 6))
    correlations[1:20].plot(kind='bar')  # Exclude sales correlation with itself
    plt.title('Top Feature Correlations with Sales')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return correlations

print("Analyzing feature importance...")
feature_correlations = analyze_feature_importance()
print("\nTop correlated features with sales:")
print(feature_correlations.head(10))
Analyzing feature importance...
[Bar plot: top feature correlations with sales]
Top correlated features with sales:
sales          1.000000
onpromotion    0.427923
id             0.085784
year           0.081093
is_weekend     0.051874
store_nbr      0.041196
cluster        0.038525
day_of_week    0.036869
month          0.019790
is_holiday     0.012370
Name: sales, dtype: float64
  5. Feature Engineering

# 1. Sales-related Features
def create_sales_features(df):
    """Create sales-related features (assumes df is sorted by date,
    which create_time_features guarantees)"""
    # Group by store and family so lags never cross series boundaries
    grouped = df.groupby(['store_nbr', 'family'])
    
    # Create lagged features
    lags = [1, 7, 14, 30]
    for lag in lags:
        df[f'sales_lag_{lag}'] = grouped['sales'].shift(lag)
    
    # Rolling means
    windows = [7, 14, 30]
    for window in windows:
        df[f'sales_rolling_mean_{window}'] = (
            grouped['sales'].transform(
                lambda x: x.rolling(window=window, min_periods=1).mean()
            )
        )
        
    # Sales momentum (percent change)
    df['sales_momentum'] = grouped['sales'].pct_change()
    
    return df
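# Quick illustration (not part of the pipeline): groupby().shift() lags
# within each group, so values never leak across (store_nbr, family) series:
#   pd.DataFrame({'g': [1, 1, 2, 2], 's': [10, 20, 30, 40]}).groupby('g')['s'].shift(1)
#   -> NaN, 10.0, NaN, 30.0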

# 2. Promotion Features
def create_promotion_features(df):
    """Create promotion-related features"""
    # Group by store and family
    grouped = df.groupby(['store_nbr', 'family'])
    
    # Rolling promotion metrics
    windows = [7, 14, 30]
    for window in windows:
        # Rolling mean of promotions
        df[f'promo_rolling_mean_{window}'] = (
            grouped['onpromotion'].transform(
                lambda x: x.rolling(window=window, min_periods=1).mean()
            )
        )
    
    # Promotion changes
    df['promo_changed'] = grouped['onpromotion'].transform(
        lambda x: x.diff() != 0).astype(int)
    
    return df

# 3. Store Features
def create_store_features(df, stores_df):
    """Create store-related features"""
    # Merge store information
    df = df.merge(stores_df, on='store_nbr', how='left')
    
    # Create store type dummies (the column was renamed to store_type at load time)
    store_type_dummies = pd.get_dummies(df['store_type'], prefix='store_type')
    df = pd.concat([df, store_type_dummies], axis=1)
    
    # Create city dummies (or use clustering for many cities)
    city_counts = df['city'].value_counts()
    major_cities = city_counts[city_counts > 100].index
    df['city_grouped'] = df['city'].apply(
        lambda x: x if x in major_cities else 'Other'
    )
    city_dummies = pd.get_dummies(df['city_grouped'], prefix='city')
    df = pd.concat([df, city_dummies], axis=1)
    
    return df

# 4. Oil Price Features
def create_oil_features(df, oil_df):
    """Create oil price related features"""
    # Merge oil prices
    df = df.merge(oil_df[['date', 'dcoilwtico']], on='date', how='left')
    
    # Forward fill missing oil prices (markets are closed on weekends/holidays)
    df['dcoilwtico'] = df['dcoilwtico'].ffill()
    
    # Create oil price changes
    df['oil_price_change'] = df['dcoilwtico'].pct_change()
    
    # Rolling oil statistics
    windows = [7, 14, 30]
    for window in windows:
        df[f'oil_rolling_mean_{window}'] = (
            df['dcoilwtico'].rolling(window=window, min_periods=1).mean()
        )
    
    return df

# 5. Holiday Features
def create_holiday_features(df, holidays_df):
    """Create holiday-related features"""
    # Clean up holidays dataframe
    holidays_df = holidays_df.copy()

    # Move transferred holidays to their observed dates. Transfer rows are
    # titled 'Traslado <holiday>', so strip the prefix to match the original.
    transferred_days = holidays_df[holidays_df['holiday_type'] == 'Transfer']
    for _, row in transferred_days.iterrows():
        original_name = row['description'].replace('Traslado ', '')
        original = holidays_df[
            (holidays_df['description'] == original_name) &
            (holidays_df['holiday_type'] != 'Transfer')
        ]
        if not original.empty:
            # Update the date of the original holiday
            holidays_df.loc[original.index, 'date'] = row['date']

    # Create holiday flags
    holiday_dates = set(holidays_df[holidays_df['holiday_type'] != 'Work Day']['date'])
    df['is_holiday'] = df['date'].isin(holiday_dates).astype(int)

    # Days since the most recent holiday and until the next one (row-wise
    # apply; slow on ~3M rows, one reason this step is skipped in the
    # pipeline below)
    df['days_from_holiday'] = df['date'].apply(
        lambda x: min(((x - d).days for d in holiday_dates if (x - d).days > 0), default=99)
    )
    df['days_to_holiday'] = df['date'].apply(
        lambda x: min(((d - x).days for d in holiday_dates if (d - x).days > 0), default=99)
    )

    return df

# 6. Earthquake Feature (Special Event)
def create_earthquake_feature(df):
    """Create earthquake-related features"""
    # A magnitude-7.8 earthquake struck Ecuador on April 16, 2016; relief
    # buying markedly affected supermarket sales in the following weeks
    earthquake_date = pd.Timestamp('2016-04-16')
    df['days_from_earthquake'] = (df['date'] - earthquake_date).dt.days
    df['earthquake_period'] = (
        (df['date'] >= earthquake_date) & 
        (df['date'] <= earthquake_date + pd.Timedelta(days=30))
    ).astype(int)
    
    return df

# 7. Put it all together
def engineer_features(df, stores_df, oil_df, holidays_df):
    """Main function to engineer all features"""
    print("Creating time features...")
    df = create_time_features(df)
    
    print("Creating sales features...")
    df = create_sales_features(df)
    
    print("Creating promotion features...")
    df = create_promotion_features(df)
    
    print("Creating store features...")
    df = create_store_features(df, stores_df)
    
    print("Creating oil features...")
    df = create_oil_features(df, oil_df)
    
    # print("Creating holiday features...")
    # df = create_holiday_features(df, holidays_df)
    
    print("Creating earthquake feature...")
    df = create_earthquake_feature(df)
    
    return df

# Use the feature engineering pipeline
df_engineered = engineer_features(train, stores, oil, holidays)

# Handle missing values (redefines the earlier helper, now for the engineered features)
def handle_missing_values(df):
    """Handle missing values in the engineered features"""
    # Fill missing lagged values with 0
    lag_columns = [col for col in df.columns if 'lag' in col]
    df[lag_columns] = df[lag_columns].fillna(0)
    
    # Fill missing rolling means with the global mean
    rolling_columns = [col for col in df.columns if 'rolling' in col]
    for col in rolling_columns:
        df[col] = df[col].fillna(df[col].mean())
    
    # Fill other missing values
    df = df.fillna(0)
    
    return df

df_engineered = handle_missing_values(df_engineered)

# Save engineered features
df_engineered.to_csv('engineered_features.csv', index=False)

print(df_engineered.head())
Creating time features...
Creating sales features...
Creating promotion features...
Creating store features...
Creating oil features...
Creating earthquake feature...
     id       date  store_nbr        family  sales  onpromotion  month  year  \
0     0 2013-01-01          1    AUTOMOTIVE    0.0            0      1  2013   
1  1194 2013-01-01         42   CELEBRATION    0.0            0      1  2013   
2  1193 2013-01-01         42  BREAD/BAKERY    0.0            0      1  2013   
3  1192 2013-01-01         42         BOOKS    0.0            0      1  2013   
4  1191 2013-01-01         42     BEVERAGES    0.0            0      1  2013   

   day_of_month  day_of_week  ...  city_Riobamba  city_Salinas  \
0             1            1  ...          False         False   
1             1            1  ...          False         False   
2             1            1  ...          False         False   
3             1            1  ...          False         False   
4             1            1  ...          False         False   

   city_Santo Domingo  dcoilwtico  oil_price_change  oil_rolling_mean_7  \
0               False         0.0               0.0           67.909962   
1               False         0.0               0.0           67.909962   
2               False         0.0               0.0           67.909962   
3               False         0.0               0.0           67.909962   
4               False         0.0               0.0           67.909962   

   oil_rolling_mean_14  oil_rolling_mean_30  days_from_earthquake  \
0            67.910016            67.910137                 -1201   
1            67.910016            67.910137                 -1201   
2            67.910016            67.910137                 -1201   
3            67.910016            67.910137                 -1201   
4            67.910016            67.910137                 -1201   

   earthquake_period  
0                  0  
1                  0  
2                  0  
3                  0  
4                  0  

[5 rows x 66 columns]
  6. Model Development

# Split data into train and test (hold out the last 15 days)
cutoff_date = df_engineered['date'].max() - pd.Timedelta(days=15)
test_df = df_engineered[df_engineered['date'] > cutoff_date]
train_df = df_engineered[df_engineered['date'] <= cutoff_date]

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Identify numeric features (excluding target and date)
numeric_features = train_df.select_dtypes(include=[np.number]).columns.tolist()
numeric_features = [col for col in numeric_features if col not in ['sales', 'date']]

# Feature Analysis Function
def analyze_engineered_features(train_df, test_df, features):
    """
    Analyze engineered features and select the most important ones
    """
    print(f"Analyzing {len(features)} features...")
    
    # 1. Correlation with target
    correlations = train_df[features + ['sales']].corr()['sales'].sort_values(ascending=False)
    print("\nTop 10 correlations with sales:")
    print(correlations[:10])
    
    # 2. Feature Selection using SelectKBest
    X_train = train_df[features].copy()  # copy so the cleaning below is safe
    y_train = train_df['sales']

    # Check for NaN values in X_train
    print("Checking for NaN values in X_train:")
    print(X_train.isnull().sum())

    # Check for infinite values (sales_momentum's pct_change divides by zero)
    print("Checking for infinite values in X_train:")
    print(np.isinf(X_train).sum())

    # Replace infinities with NaN, then fill all NaNs with column means
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_train = X_train.fillna(X_train.mean())

    # Cap k at the number of available features
    selector = SelectKBest(score_func=f_regression, k=min(50, len(features)))
    selector.fit(X_train, y_train)

    # Get selected feature scores
    feature_scores = pd.DataFrame({
        'Feature': features,
        'F_Score': selector.scores_,
        'P_Value': selector.pvalues_
    }).sort_values('F_Score', ascending=False)
    
    print("\nTop 10 features by F-score:")
    print(feature_scores.head(10))
    
    # 3. XGBoost Feature Importance
    model = xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_train, y_train)],
        verbose=False
    )
    
    importance_df = pd.DataFrame({
        'Feature': features,
        'XGB_Importance': model.feature_importances_
    }).sort_values('XGB_Importance', ascending=False)
    
    print("\nTop 10 features by XGBoost importance:")
    print(importance_df.head(10))
    
    # 4. Feature Stability Analysis
    stability_metrics = pd.DataFrame(index=features)
    
    for feature in features:
        train_mean = train_df[feature].mean()
        test_mean = test_df[feature].mean()
        train_std = train_df[feature].std()
        test_std = test_df[feature].std()
        
        stability_metrics.loc[feature, 'mean_diff_pct'] = (
            abs(train_mean - test_mean) / (abs(train_mean) + 1e-10) * 100
        )
        stability_metrics.loc[feature, 'std_diff_pct'] = (
            abs(train_std - test_std) / (abs(train_std) + 1e-10) * 100
        )
    
    # 5. Combine all metrics
    final_scores = pd.DataFrame(index=features)
    final_scores['correlation'] = abs(correlations)
    final_scores['f_score'] = feature_scores.set_index('Feature')['F_Score']
    final_scores['xgb_importance'] = importance_df.set_index('Feature')['XGB_Importance']
    final_scores['stability'] = 1 / (1 + stability_metrics['mean_diff_pct'])
    
    # Normalize scores
    for col in final_scores.columns:
        final_scores[col] = (final_scores[col] - final_scores[col].min()) / \
                           (final_scores[col].max() - final_scores[col].min())
    
    # Calculate combined score
    final_scores['combined_score'] = (
        final_scores['correlation'] * 0.25 +
        final_scores['f_score'] * 0.25 +
        final_scores['xgb_importance'] * 0.35 +
        final_scores['stability'] * 0.15
    )
    
    final_scores = final_scores.sort_values('combined_score', ascending=False)
    
    # Plot top features
    plt.figure(figsize=(15, 8))
    sns.barplot(x=final_scores.head(20).index, y=final_scores.head(20)['combined_score'])
    plt.xticks(rotation=45, ha='right')
    plt.title('Top 20 Features by Combined Score')
    plt.tight_layout()
    plt.show()
    
    # Select features above threshold
    threshold = final_scores['combined_score'].mean()
    selected_features = final_scores[final_scores['combined_score'] > threshold].index.tolist()
    
    return selected_features, final_scores

# Run the analysis
selected_features, feature_scores = analyze_engineered_features(train_df, test_df, numeric_features)

print(f"\nSelected {len(selected_features)} features above threshold")
print("\nTop 20 selected features:")
print(selected_features[:20])
Train shape: (2974158, 67)
Test shape: (26730, 67)
Analyzing 33 features...

Top 10 correlations with sales:
sales                    1.000000
sales_rolling_mean_7     0.948087
sales_rolling_mean_14    0.941438
sales_rolling_mean_30    0.936116
sales_lag_7              0.934755
sales_lag_14             0.926094
sales_lag_1              0.918589
sales_lag_30             0.843825
promo_rolling_mean_30    0.550787
promo_rolling_mean_14    0.544124
Name: sales, dtype: float64
Checking for NaN values in X_train:
id                       0
store_nbr                0
onpromotion              0
day_of_week              0
month                    0
is_holiday               0
year                     0
day_of_month             0
is_weekend               0
week_of_year             0
day_of_year              0
quarter                  0
is_payday                0
sales_lag_1              0
sales_lag_7              0
sales_lag_14             0
sales_lag_30             0
sales_rolling_mean_7     0
sales_rolling_mean_14    0
sales_rolling_mean_30    0
sales_momentum           0
promo_rolling_mean_7     0
promo_rolling_mean_14    0
promo_rolling_mean_30    0
promo_changed            0
cluster                  0
dcoilwtico               0
oil_price_change         0
oil_rolling_mean_7       0
oil_rolling_mean_14      0
oil_rolling_mean_30      0
days_from_earthquake     0
earthquake_period        0
dtype: int64
Checking for infinite values in X_train:
id                           0
store_nbr                    0
onpromotion                  0
day_of_week                  0
month                        0
is_holiday                   0
year                         0
day_of_month                 0
is_weekend                   0
week_of_year                 0
day_of_year                  0
quarter                      0
is_payday                    0
sales_lag_1                  0
sales_lag_7                  0
sales_lag_14                 0
sales_lag_30                 0
sales_rolling_mean_7         0
sales_rolling_mean_14        0
sales_rolling_mean_30        0
sales_momentum           95165
promo_rolling_mean_7         0
promo_rolling_mean_14        0
promo_rolling_mean_30        0
promo_changed                0
cluster                      0
dcoilwtico                   0
oil_price_change             0
oil_rolling_mean_7           0
oil_rolling_mean_14          0
oil_rolling_mean_30          0
days_from_earthquake         0
earthquake_period            0
dtype: Int64
Top 10 features by F-score:
                  Feature       F_Score  P_Value
17   sales_rolling_mean_7  2.643486e+07      0.0
18  sales_rolling_mean_14  2.318513e+07      0.0
19  sales_rolling_mean_30  2.107167e+07      0.0
14            sales_lag_7  2.058678e+07      0.0
15           sales_lag_14  1.791918e+07      0.0
13            sales_lag_1  1.606727e+07      0.0
16           sales_lag_30  7.354236e+06      0.0
23  promo_rolling_mean_30  1.295169e+06      0.0
22  promo_rolling_mean_14  1.250925e+06      0.0
21   promo_rolling_mean_7  1.223275e+06      0.0

Top 10 features by XGBoost importance:
                  Feature  XGB_Importance
17   sales_rolling_mean_7        0.490323
14            sales_lag_7        0.142128
18  sales_rolling_mean_14        0.136588
13            sales_lag_1        0.087619
15           sales_lag_14        0.045323
20         sales_momentum        0.018786
6                    year        0.018071
5              is_holiday        0.012333
8              is_weekend        0.007327
3             day_of_week        0.005674
/Users/saraliu/Library/Caches/pypoetry/virtualenvs/titanic-SA5bcgBn-py3.9/lib/python3.9/site-packages/pandas/core/nanops.py:1016: RuntimeWarning: invalid value encountered in subtract
  sqr = _ensure_numeric((avg - values) ** 2)
/var/folders/kf/_5f4g2zd40n8t2rnn343psmc0000gn/T/ipykernel_32656/3325036056.py:93: RuntimeWarning: invalid value encountered in scalar subtract
  abs(train_mean - test_mean) / (abs(train_mean) + 1e-10) * 100
[Bar plot: top 20 features by combined score]
Selected 8 features above threshold

Top 20 selected features:
['sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_lag_7', 'sales_lag_1', 'sales_rolling_mean_30', 'sales_lag_14', 'sales_lag_30', 'year']
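One caveat before modeling: the selected features are dominated by sales lags and rolling means. They are available for this in-sample holdout, but for the real Kaggle horizon (15 unseen days) future sales are unknown, so these features would have to be rolled forward from the model's own predictions. A minimal sketch of that recursive loop, where build_lag_features and append_predictions are hypothetical helpers, not functions defined in this notebook:

# Hedged sketch: roll predictions forward one day at a time so each day's
# lag features come from prior predictions rather than unknown actuals
history = train_df.copy()
for day in pd.date_range('2017-08-16', '2017-08-31'):
    feats = build_lag_features(history, day)            # hypothetical helper
    preds = model.predict(feats[selected_features])
    history = append_predictions(history, day, preds)   # hypothetical helper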
# Save analysis results
feature_scores.to_csv('feature_analysis_results.csv')

# Visualize feature importance distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=feature_scores['combined_score'], bins=50)
plt.title('Distribution of Feature Importance Scores')
plt.axvline(x=feature_scores['combined_score'].mean(), color='r', linestyle='--', 
            label='Selection Threshold')
plt.legend()
plt.show()

# Additional Analysis: Feature Groups
def analyze_feature_groups(selected_features):
    """Analyze which types of engineered features were most useful"""
    feature_groups = {
        'lag': [f for f in selected_features if 'lag' in f],
        'rolling': [f for f in selected_features if 'rolling' in f],
        'time': [f for f in selected_features if any(x in f for x in ['day', 'month', 'year'])],
        'store': [f for f in selected_features if any(x in f for x in ['store', 'cluster'])],
        'promo': [f for f in selected_features if 'promo' in f],
        'oil': [f for f in selected_features if 'oil' in f],
        'holiday': [f for f in selected_features if 'holiday' in f]
    }
    
    group_counts = {k: len(v) for k, v in feature_groups.items()}
    
    plt.figure(figsize=(10, 5))
    sns.barplot(x=list(group_counts.keys()), y=list(group_counts.values()))
    plt.title('Number of Selected Features by Group')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    return feature_groups

feature_groups = analyze_feature_groups(selected_features)

# Print summary of most important features by group
print("\nMost important features by group:")
for group, features in feature_groups.items():
    if features:
        top_features = feature_scores.loc[features].head(3).index.tolist()
        print(f"\n{group.upper()}:")
        for feat in top_features:
            score = feature_scores.loc[feat, 'combined_score']
            print(f"  - {feat}: {score:.3f}")

# Save final selected features
with open('selected_features.txt', 'w') as f:
    for feature in selected_features:
        f.write(f"{feature}\n")

print("\nAnalysis completed. Results saved to:")
print("- feature_analysis_results.csv")
print("- selected_features.txt")
[Plots: distribution of feature importance scores with selection threshold; selected features by group]
Most important features by group:

LAG:
  - sales_lag_7: 0.546
  - sales_lag_1: 0.461
  - sales_lag_14: 0.450

ROLLING:
  - sales_rolling_mean_7: 0.854
  - sales_rolling_mean_14: 0.569
  - sales_rolling_mean_30: 0.451

TIME:
  - year: 0.170

Analysis completed. Results saved to:
- feature_analysis_results.csv
- selected_features.txt
# Split data into train and test (same 15-day holdout as above)
cutoff_date = df_engineered['date'].max() - pd.Timedelta(days=15)
test_df = df_engineered[df_engineered['date'] > cutoff_date]
train_df = df_engineered[df_engineered['date'] <= cutoff_date]

# Prepare final training and test sets based on selected features
X_train = train_df[selected_features]
X_test = test_df[selected_features]
y_train = train_df['sales']
y_test = test_df['sales']

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Model Training
def train_model():
    """Train XGBoost model with time-series cross-validation"""
    # Initialize TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)
    
    # XGBoost parameters
    params = {
        'objective': 'reg:squarederror',
        'n_estimators': 1000,
        'max_depth': 7,
        'learning_rate': 0.01,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'random_state': 42,
        'eval_metric': 'rmse',
        'early_stopping_rounds': 50
    }
    
    # Track cross-validation scores
    cv_scores = []
    
    print("\nPerforming cross-validation...")
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train), 1):
        print(f"\nFold {fold}")
        
        # Split data
        X_tr = X_train.iloc[train_idx]
        y_tr = y_train.iloc[train_idx]
        X_val = X_train.iloc[val_idx]
        y_val = y_train.iloc[val_idx]
        
        # Train model
        model = xgb.XGBRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_tr, y_tr), (X_val, y_val)],
            verbose=100
        )
        
        # Evaluate
        val_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        cv_scores.append(rmse)
        
        print(f"Fold {fold} RMSE: {rmse:.4f}")
    
    print(f"\nCross-validation RMSE: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
    
    # Train final model on full training set
    print("\nTraining final model...")
    final_model = xgb.XGBRegressor(**params)
    final_model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100
    )
    
    return final_model, cv_scores

# Train model
print("Training model...")
model, cv_scores = train_model()
Training set shape: (2974158, 8)
Test set shape: (26730, 8)
Training model...

Performing cross-validation...

Fold 1
[0]	validation_0-rmse:691.67712	validation_1-rmse:882.03937
[100]	validation_0-rmse:288.88501	validation_1-rmse:411.22439
[200]	validation_0-rmse:166.28840	validation_1-rmse:286.69556
[300]	validation_0-rmse:137.98073	validation_1-rmse:261.39180
[400]	validation_0-rmse:131.35113	validation_1-rmse:258.03835
[445]	validation_0-rmse:130.02453	validation_1-rmse:258.38223
Fold 1 RMSE: 257.9833

Fold 2
[0]	validation_0-rmse:791.78319	validation_1-rmse:1012.15412
[100]	validation_0-rmse:342.45569	validation_1-rmse:448.56593
[200]	validation_0-rmse:211.01928	validation_1-rmse:295.31680
[300]	validation_0-rmse:181.16969	validation_1-rmse:266.03842
[400]	validation_0-rmse:173.64871	validation_1-rmse:262.35918
[469]	validation_0-rmse:171.07830	validation_1-rmse:262.69627
Fold 2 RMSE: 262.2781

Fold 3
[0]	validation_0-rmse:870.91512	validation_1-rmse:1188.16696
[100]	validation_0-rmse:378.78196	validation_1-rmse:523.63333
[200]	validation_0-rmse:237.14389	validation_1-rmse:338.70086
[300]	validation_0-rmse:205.96137	validation_1-rmse:299.82173
[400]	validation_0-rmse:198.27652	validation_1-rmse:291.82202
[500]	validation_0-rmse:194.84032	validation_1-rmse:289.54595
[600]	validation_0-rmse:192.32138	validation_1-rmse:288.79562
[700]	validation_0-rmse:190.39611	validation_1-rmse:288.32908
[786]	validation_0-rmse:189.02899	validation_1-rmse:288.29907
Fold 3 RMSE: 288.2754

Fold 4
[0]	validation_0-rmse:959.41370	validation_1-rmse:1250.61199
[100]	validation_0-rmse:416.34484	validation_1-rmse:596.10055
[200]	validation_0-rmse:260.15985	validation_1-rmse:430.76577
[300]	validation_0-rmse:225.61796	validation_1-rmse:398.15729
[400]	validation_0-rmse:216.90011	validation_1-rmse:392.00127
[500]	validation_0-rmse:212.79600	validation_1-rmse:391.20508
[524]	validation_0-rmse:212.04169	validation_1-rmse:391.32241
Fold 4 RMSE: 391.1075

Fold 5
[0]	validation_0-rmse:1023.90359	validation_1-rmse:1377.29923
[100]	validation_0-rmse:449.60322	validation_1-rmse:609.68166
[200]	validation_0-rmse:285.40938	validation_1-rmse:396.01106
[300]	validation_0-rmse:248.54358	validation_1-rmse:353.14832
[400]	validation_0-rmse:238.21387	validation_1-rmse:344.86979
[500]	validation_0-rmse:233.50222	validation_1-rmse:342.69121
[600]	validation_0-rmse:230.08223	validation_1-rmse:341.71961
[700]	validation_0-rmse:227.46782	validation_1-rmse:341.30323
[800]	validation_0-rmse:225.71577	validation_1-rmse:341.08853
[857]	validation_0-rmse:224.77883	validation_1-rmse:341.09266
Fold 5 RMSE: 341.0755

Cross-validation RMSE: 308.1440 (+/- 50.9548)

Training final model...
[0]	validation_0-rmse:1090.43230	validation_1-rmse:1236.55493
[100]	validation_0-rmse:477.06684	validation_1-rmse:474.79989
[200]	validation_0-rmse:302.06964	validation_1-rmse:266.87825
[300]	validation_0-rmse:263.55594	validation_1-rmse:240.15834
[384]	validation_0-rmse:254.77797	validation_1-rmse:240.55897
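A caveat on the final fit above: its early stopping is monitored on the held-out test set, so test information leaks into model selection. A minimal alternative sketch, under two assumptions that are not in the original code: a best_iters list populated inside the CV loop via model.best_iteration (an attribute XGBoost sets when early stopping fires), and params exposed outside train_model():

# Hypothetical: best_iters collected per fold inside the CV loop, e.g.
#     best_iters.append(model.best_iteration)
final_params = {k: v for k, v in params.items() if k != 'early_stopping_rounds'}
final_params['n_estimators'] = int(np.mean(best_iters)) + 1  # average stopping point
final_model = xgb.XGBRegressor(**final_params)
final_model.fit(X_train, y_train)  # no eval_set: the holdout stays untouched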
# Model evaluation function
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Report RMSE/MAE on the train and test splits and plot actual vs. predicted sales."""
    # Make predictions
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    # Calculate metrics
    metrics = {
        'Train RMSE': np.sqrt(mean_squared_error(y_train, train_pred)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test, test_pred)),
        'Train MAE': mean_absolute_error(y_train, train_pred),
        'Test MAE': mean_absolute_error(y_test, test_pred)
    }
    
    print("\nModel Performance Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    
    # Visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    # Train predictions
    ax1.scatter(y_train, train_pred, alpha=0.5)
    ax1.plot([y_train.min(), y_train.max()], 
             [y_train.min(), y_train.max()], 
             'r--', lw=2)
    ax1.set_title('Train: Actual vs Predicted')
    ax1.set_xlabel('Actual Sales')
    ax1.set_ylabel('Predicted Sales')
    
    # Test predictions
    ax2.scatter(y_test, test_pred, alpha=0.5)
    ax2.plot([y_test.min(), y_test.max()], 
             [y_test.min(), y_test.max()], 
             'r--', lw=2)
    ax2.set_title('Test: Actual vs Predicted')
    ax2.set_xlabel('Actual Sales')
    ax2.set_ylabel('Predicted Sales')
    
    plt.tight_layout()
    plt.show()
    
    return metrics, test_pred

  1. Model Evaluation

# Evaluate model
print("\nEvaluating model...")
metrics, test_predictions = evaluate_model(model, X_train, y_train, X_test, y_test)

# Save results
model.save_model('store_sales_model.json')
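
# Assumption: feature_importance.csv is listed in the saved results below, but
# the cell that writes it is not shown in this section; a minimal sketch using
# XGBoost's sklearn-API feature_importances_ would be:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
feature_importance.to_csv('feature_importance.csv', index=False)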

test_predictions_df = pd.DataFrame({
    'date': test_df['date'],
    'store_nbr': test_df['store_nbr'],
    'family': test_df['family'],
    'actual': y_test,
    'predicted': test_predictions
})
test_predictions_df.to_csv('test_predictions.csv', index=False)

print("\nResults saved to:")
print("- store_sales_model.json")
print("- feature_importance.csv")
print("- test_predictions.csv")

# Analysis of predictions by store and family
def analyze_predictions_by_segment():
    predictions_analysis = test_predictions_df.copy()
    predictions_analysis['error'] = predictions_analysis['actual'] - predictions_analysis['predicted']
    predictions_analysis['abs_error'] = abs(predictions_analysis['error'])
    
    # Store level analysis
    store_performance = predictions_analysis.groupby('store_nbr').agg({
        'abs_error': ['mean', 'std'],
        'error': ['mean', 'count']
    }).round(2)
    
    # Family level analysis
    family_performance = predictions_analysis.groupby('family').agg({
        'abs_error': ['mean', 'std'],
        'error': ['mean', 'count']
    }).round(2)
    
    return store_performance, family_performance

store_perf, family_perf = analyze_predictions_by_segment()
print("\nStore Performance Summary:")
print(store_perf.head())
print("\nProduct Family Performance Summary:")
print(family_perf.head())

# Plot actual vs predicted
plt.figure(figsize=(15,6))
plt.plot(test_predictions_df.index, test_predictions_df['actual'], label='Actual')
plt.plot(test_predictions_df.index, test_predictions_df['predicted'], label='Predicted')
plt.title('Actual vs Predicted Sales')
plt.legend()
plt.show()
Evaluating model...

Model Performance Metrics:
Train RMSE: 259.0707
Test RMSE: 239.7587
Train MAE: 62.5357
Test MAE: 74.5602
[Figure: side-by-side scatter plots of actual vs. predicted sales for the train and test splits, each with a y = x reference line]
Results saved to:
- store_sales_model.json
- feature_importance.csv
- test_predictions.csv

Store Performance Summary:
          abs_error          error      
               mean     std   mean count
store_nbr                               
1             58.56  136.73 -24.34   495
2             53.04  107.62 -23.46   495
3             92.90  234.08 -26.68   495
4             58.78  134.97 -19.02   495
5             45.63  103.86 -15.87   495

Product Family Performance Summary:
           abs_error          error      
                mean     std   mean count
family                                   
AUTOMOTIVE     12.00    3.62 -11.97   810
BABY CARE      12.63    0.50 -12.63   810
BEAUTY         11.72    3.05 -11.57   810
BEVERAGES     507.55  571.44   9.78   810
BOOKS          12.73    0.11 -12.73   810
[Figure: line plot of actual vs. predicted sales over the holdout set]
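Since this competition is scored on RMSLE rather than raw RMSE, it is worth checking the holdout on that scale too. A minimal sketch, reusing y_test and test_predictions from the evaluation above (log1p handles zero sales, and predictions are clipped at zero first):

rmsle = np.sqrt(mean_squared_error(
    np.log1p(y_test.clip(lower=0)),
    np.log1p(np.clip(test_predictions, 0, None))
))
print(f"Holdout RMSLE: {rmsle:.4f}")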
# Forecast on the competition test set (test.csv), as opposed to the internal
# holdout evaluated above

print(f"Test data shape: {test.shape}")
print(f"Test date range: {test['date'].min()} to {test['date'].max()}")

# Create the same features as used in training
def prepare_test_features(test_df, train_df):
    """
    Create lag and rolling features for test data using training data
    """
    print("Preparing features...")
    test_features = test_df.copy()
    
    # Get last date of training data
    last_train_date = train_df['date'].max()
    print(f"Last training date: {last_train_date}")
    
    # Get last 30 days of training data (for creating lag features)
    last_30_days = train_df[train_df['date'] > last_train_date - pd.Timedelta(days=30)]
    
    # Create lag features for each store-family combination.
    # Note: every lag below is anchored to the last training date and held
    # constant across the 16-day test horizon (a simplification; a recursive
    # alternative is sketched after this cell's output).
    for (store, family), group in last_30_days.groupby(['store_nbr', 'family']):
        # Get corresponding test rows
        mask = (test_features['store_nbr'] == store) & (test_features['family'] == family)
        
        if mask.any():
            # Last known values for different lags
            last_sales = group['sales'].iloc[-1]  # Most recent sales value
            last_7_mean = group['sales'].tail(7).mean()  # Last 7 days mean
            last_14_mean = group['sales'].tail(14).mean()  # Last 14 days mean
            last_30_mean = group['sales'].tail(30).mean()  # Last 30 days mean
            
            # Create lag features
            test_features.loc[mask, 'sales_lag_1'] = last_sales
            test_features.loc[mask, 'sales_lag_7'] = group['sales'].iloc[-7] if len(group) >= 7 else last_sales
            test_features.loc[mask, 'sales_lag_14'] = group['sales'].iloc[-14] if len(group) >= 14 else last_sales
            test_features.loc[mask, 'sales_lag_30'] = group['sales'].iloc[-30] if len(group) >= 30 else last_sales
            
            # Create rolling mean features
            test_features.loc[mask, 'sales_rolling_mean_7'] = last_7_mean
            test_features.loc[mask, 'sales_rolling_mean_14'] = last_14_mean
            test_features.loc[mask, 'sales_rolling_mean_30'] = last_30_mean
    
    # Add year feature
    test_features['year'] = test_features['date'].dt.year
    
    return test_features

# Create features for test data
test_engineered = prepare_test_features(test, train)

test_engineered = handle_missing_values(test_engineered)


# Selected features (in order of importance as previously determined)
selected_features = [
    'sales_rolling_mean_7',
    'sales_rolling_mean_14',
    'sales_lag_7',
    'sales_lag_1',
    'sales_rolling_mean_30',
    'sales_lag_14',
    'sales_lag_30',
    'year'
]

# Prepare feature matrix (note: this rebinds X_test from the internal holdout
# to the competition test set)
X_test = test_engineered[selected_features]

# Handle any remaining missing values
X_test = X_test.fillna(0)

print("\nFeature matrix shape:", X_test.shape)
print("\nFeatures created:", X_test.columns.tolist())

# 3. Make predictions
print("\nMaking predictions...")
predictions = model.predict(X_test)

# Check the unique values in the predictions
print("Unique predictions:", np.unique(predictions))

# Check the distribution of predictions
plt.hist(predictions, bins=30)
plt.title('Distribution of Test Predictions')
plt.xlabel('Predicted Values')
plt.ylabel('Frequency')
plt.show()
Test data shape: (28512, 6)
Test date range: 2017-08-16 00:00:00 to 2017-08-31 00:00:00
Preparing features...
Last training date: 2017-08-15 00:00:00

Feature matrix shape: (28512, 8)

Features created: ['sales_rolling_mean_7', 'sales_rolling_mean_14', 'sales_lag_7', 'sales_lag_1', 'sales_rolling_mean_30', 'sales_lag_14', 'sales_lag_30', 'year']

Making predictions...
Unique predictions: [1.27371588e+01 1.27453165e+01 1.27522192e+01 ... 1.00117842e+04
 1.06327578e+04 1.27613691e+04]
[Figure: histogram showing the distribution of test-set predictions]
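As noted in prepare_test_features, every lag and rolling mean is computed once from the end of the training data and held fixed, so sales_lag_1 on 2017-08-31 still points at 2017-08-15. A recursive alternative feeds each day's predictions back in as the next day's lag-1. A minimal sketch of that idea (an assumption, not what was done above; only sales_lag_1 is refreshed, the other features keep their static values):

recursive = test_engineered.sort_values(['date', 'store_nbr', 'family']).copy()
last_pred = {}  # (store_nbr, family) -> previous day's prediction

for day in sorted(recursive['date'].unique()):
    day_mask = recursive['date'] == day
    keys = list(zip(recursive.loc[day_mask, 'store_nbr'],
                    recursive.loc[day_mask, 'family']))
    if last_pred:
        # overwrite the static lag-1 with yesterday's prediction per series
        static = recursive.loc[day_mask, 'sales_lag_1'].to_numpy()
        recursive.loc[day_mask, 'sales_lag_1'] = [
            last_pred.get(k, s) for k, s in zip(keys, static)
        ]
    day_pred = model.predict(recursive.loc[day_mask, selected_features].fillna(0))
    day_pred = np.clip(day_pred, 0, None)  # sales cannot be negative
    last_pred = dict(zip(keys, day_pred))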
# Create submission file    
submission = pd.DataFrame({
    'id': test['id'],
    'sales': predictions
})

# Ensure predictions are non-negative
submission['sales'] = submission['sales'].clip(lower=0)

# Save submission file
submission.to_csv('submission.csv', index=False)
print("\nSubmission saved to submission.csv")
Submission saved to submission.csv
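Before uploading, a quick sanity check against the competition's sample_submission.csv (assuming it was downloaded to ./data/ alongside the other files) can catch row-count or ordering mistakes:

sample = pd.read_csv('./data/sample_submission.csv')
assert len(submission) == len(sample), "row count mismatch"
assert (submission['id'].values == sample['id'].values).all(), "id order mismatch"
assert (submission['sales'] >= 0).all(), "negative predictions slipped through"
print(submission.describe())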