BMW Car Sales — Data Preparation & EDA

INFO-523 Final Project | Min Set Khant (Solo)

This notebook performs: data audit, cleaning, feature engineering, EDA, temporal-split checks, and saves cleaned/processed datasets + key visuals for the dashboard.

#### Setup, Imports, and Data Loading

# 1. SETUP AND IMPORTS
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
from scipy import stats

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
sns.set(style="whitegrid")
%matplotlib inline

# Load the Data and Set Paths
DATA_PATH = Path("..") / "data" / "bmw_worldwide_sales.csv"
OUTPUT_DIR = Path("..") / "data" / "cleaned"
VISUALS_DIR = Path("..") / "visuals"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(VISUALS_DIR, exist_ok=True)

# Load Data — fall back to an empty frame when the CSV is absent so the
# later cells can guard on df.empty instead of crashing.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"{DATA_PATH} not found. Ensure the file is in the correct path.")
    df = pd.DataFrame()  # empty DF keeps downstream cells from raising

# Preview only when something was actually loaded.
if not df.empty:
    print(f"Raw shape: {df.shape}")
    display(df.head(10))
    df.info()
Raw shape: (50000, 11)
Model Year Region Color Fuel_Type Transmission Engine_Size_L Mileage_KM Price_USD Sales_Volume Sales_Classification
0 5 Series 2016 Asia Red Petrol Manual 3.5 151748 98740 8300 High
1 i8 2013 North America Red Hybrid Automatic 1.6 121671 79219 3428 Low
2 5 Series 2022 North America Blue Petrol Automatic 4.5 10991 113265 6994 Low
3 X3 2024 Middle East Blue Petrol Automatic 1.7 27255 60971 4047 Low
4 7 Series 2020 South America Black Diesel Manual 2.1 122131 49898 3080 Low
5 5 Series 2017 Middle East Silver Diesel Manual 1.9 171362 42926 1232 Low
6 i8 2022 Europe White Diesel Manual 1.8 196741 55064 7949 High
7 M5 2014 Asia Black Diesel Automatic 1.6 121156 102778 632 Low
8 X3 2016 South America White Diesel Automatic 1.7 48073 116482 8944 High
9 i8 2019 Europe White Electric Manual 3.0 35700 96257 4411 Low
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 50000 non-null  object 
 1   Year                  50000 non-null  int64  
 2   Region                50000 non-null  object 
 3   Color                 50000 non-null  object 
 4   Fuel_Type             50000 non-null  object 
 5   Transmission          50000 non-null  object 
 6   Engine_Size_L         50000 non-null  float64
 7   Mileage_KM            50000 non-null  int64  
 8   Price_USD             50000 non-null  int64  
 9   Sales_Volume          50000 non-null  int64  
 10  Sales_Classification  50000 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 4.2+ MB

Data Cleaning and Preparation

# Fix numeric types & simple conversions: coerce the expected numeric
# columns; unparseable values become NaN.
num_cols = ["Year", "Engine_Size_L", "Mileage_KM", "Price_USD", "Sales_Volume"]
present = [c for c in num_cols if c in df.columns]
df[present] = df[present].apply(pd.to_numeric, errors="coerce")

# Quick check: how many NaNs did coercion introduce?
print(df[num_cols].isna().sum())
Year             0
Engine_Size_L    0
Mileage_KM       0
Price_USD        0
Sales_Volume     0
dtype: int64
# Derived features
CURRENT_YEAR = 2025
df["Car_Age"] = CURRENT_YEAR - df["Year"]        # vehicle age in years
df["log_Price_USD"] = np.log1p(df["Price_USD"])  # standardized name for target
# Price per kilometre: mask zero mileage to NaN first (avoids div-by-zero)…
df["Price_per_KM"] = df["Price_USD"] / df["Mileage_KM"].mask(df["Mileage_KM"] == 0)
# …then backfill those rows with the column median.
df["Price_per_KM"] = df["Price_per_KM"].fillna(df["Price_per_KM"].median())
# Categorical Normalization

# Map shorthand/legacy transmission labels onto the two canonical values;
# rows with no label at all become "Unknown".
_transmission_map = {
    "Auto": "Automatic",
    "Man": "Manual",
    "Automated Manual": "Automatic",
}
df["Transmission"] = (
    df["Transmission"].replace(_transmission_map).fillna("Unknown")
)

# Fuel check (ensure consistency, fill unknown)
df["Fuel_Type"] = df["Fuel_Type"].fillna("Other")
# Missing values & duplicates handling
print("Duplicates before:", df.duplicated().sum())
df = df.drop_duplicates()
print("Duplicates after:", df.duplicated().sum())

# Impute Engine_Size_L with the per-Model median; models whose every row is
# missing fall back to the global median on the next line.
if "Engine_Size_L" in df.columns and "Model" in df.columns:
    per_model_median = df.groupby("Model")["Engine_Size_L"].transform("median")
    df["Engine_Size_L"] = df["Engine_Size_L"].fillna(per_model_median)
df["Engine_Size_L"] = df["Engine_Size_L"].fillna(df["Engine_Size_L"].median())

# Drop rows missing Price_USD (target)
before = df.shape[0]
df = df.dropna(subset=["Price_USD"]).copy()
after = df.shape[0]
print(f"Dropped {before-after} rows with missing Price_USD.")
Duplicates before: 0
Duplicates after: 0
Dropped 0 rows with missing Price_USD.
# Outlier check (as per proposal, no transformation needed for now)
def iqr_bounds(s, k=1.5):
    """Return the (lower, upper) Tukey fences for *s* at k times the IQR."""
    q1, q3 = np.nanpercentile(s, [25, 75])
    spread = q3 - q1
    return q1 - k * spread, q3 + k * spread

# Report the Tukey-fence outlier rate for each key numeric column.
for col in ["Price_USD", "Mileage_KM", "Engine_Size_L", "Sales_Volume"]:
    if col not in df.columns:
        continue  # tolerate schema drift
    low, high = iqr_bounds(df[col].dropna(), k=1.5)
    pct_out = ((df[col] < low) | (df[col] > high)).mean() * 100
    print(f"{col}: IQR bounds [{low:.2f}, {high:.2f}], outliers ~ {pct_out:.3f}% ")
Price_USD: IQR bounds [-15355.50, 165418.50], outliers ~ 0.000% 
Mileage_KM: IQR bounds [-100500.38, 301308.62], outliers ~ 0.000% 
Engine_Size_L: IQR bounds [-0.15, 6.65], outliers ~ 0.000% 
Sales_Volume: IQR bounds [-4835.88, 14961.12], outliers ~ 0.000% 

Additional Feature Engineering

# Additional Feature Engineering (for EDA and modeling)

# Engine size bins
engine_edges = [0, 1.6, 2.0, 3.0, 4.0, 10]
engine_labels = ["<=1.6", "1.7-2.0", "2.1-3.0", "3.1-4.0", ">4.0"]
df["Engine_Bin"] = pd.cut(df["Engine_Size_L"], bins=engine_edges, labels=engine_labels)

# Age bins
age_edges = [-1, 1, 3, 6, 10, 100]
age_labels = ["0-1", "2-3", "4-6", "7-10", "10+"]
df["Age_Bin"] = pd.cut(df["Car_Age"], bins=age_edges, labels=age_labels)

# Model popularity: total units sold per model, joined back onto every row.
model_sales = (
    df.groupby("Model")["Sales_Volume"]
      .sum()
      .rename("Total_Sales_Model")
      .reset_index()
)
df = df.merge(model_sales, on="Model", how="left")
# Persist the cleaned dataset for the dashboard/visual notebooks.
clean_path = OUTPUT_DIR / "bmw_cleaned.csv"
df.to_csv(clean_path, index=False)
print("Saved cleaned CSV to:", clean_path)
Saved cleaned CSV to: ../data/cleaned/bmw_cleaned.csv

Exploratory Data Analysis (EDA) Visualizations

# Setup/Imports for the EDA section (re-imported so this section can be run
# independently of the cleaning cells above).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

sns.set(style="whitegrid")
VISUALS_DIR = Path("..") / "visuals"
# Path.mkdir instead of os.makedirs: the original called os.makedirs without
# importing os in this cell, which breaks a fresh-kernel run starting here.
VISUALS_DIR.mkdir(parents=True, exist_ok=True)

# Load the CLEANED data for visualizations
DATA_PATH = Path("..") / "data" / "cleaned" / "bmw_cleaned.csv"
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print("Error: Cleaned data not found.")
# Price Distribution (USD) — histogram with KDE overlay, saved for the dashboard.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["Price_USD"], kde=True, bins=50, ax=ax)
ax.set_title("Distribution of BMW Car Prices (USD)", fontsize=16)
ax.set_xlabel("Price (USD)")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(VISUALS_DIR / "price_distribution.png")
plt.show()

# Log-Price Distribution — the modeling target, checked for normality.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["log_Price_USD"], kde=True, bins=50, color='darkorange', ax=ax)
ax.set_title("Distribution of Log-Transformed Price (Target)", fontsize=16)
ax.set_xlabel("Log(1 + Price_USD)")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(VISUALS_DIR / "log_price_distribution.png")
plt.show()

# Average Price Trend by Year — mean price per model year as a line chart.
avg_price_year = df.groupby("Year", as_index=False)["Price_USD"].mean()
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x="Year", y="Price_USD", data=avg_price_year,
             marker='o', color='navy', ax=ax)
ax.set_title("Average BMW Car Price Trend Over Time", fontsize=16)
ax.set_xlabel("Year")
ax.set_ylabel("Average Price (USD)")
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
fig.savefig(VISUALS_DIR / "avg_price_by_year.png")
plt.show()

# Price by Fuel Type — boxplots without fliers to compare central tendency.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(x="Fuel_Type", y="Price_USD", data=df,
            palette="Pastel1", showfliers=False, ax=ax)
ax.set_title("Price Distribution by Fuel Type", fontsize=16)
ax.set_xlabel("Fuel Type")
ax.set_ylabel("Price (USD)")
fig.tight_layout()
fig.savefig(VISUALS_DIR / "price_by_fueltype_box.png")
plt.show()

# Top 10 Models Globally (Sales Volume)
# Total sales volume per model. nlargest already returns the rows sorted in
# descending order, so the original's extra sort_values call was redundant.
top_models = (
    df.groupby('Model')['Sales_Volume']
      .sum()
      .nlargest(10)
      .reset_index()
)

fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(x='Sales_Volume', y='Model', data=top_models, palette='rocket', ax=ax)
ax.set_title("Top 10 BMW Models by Global Sales Volume", fontsize=16)
ax.set_xlabel("Total Sales Volume")
ax.set_ylabel("Model")
fig.tight_layout()
fig.savefig(VISUALS_DIR / "top10_models_global.png")
plt.show()

# Correlation Matrix (Heatmap) over the key numeric/engineered features.
corr_cols = ['log_Price_USD', 'Price_USD', 'Mileage_KM', 'Engine_Size_L',
             'Car_Age', 'Sales_Volume', 'Price_per_KM']
corr_matrix = df[corr_cols].dropna().corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm',
            linewidths=.5, linecolor='black', ax=ax)
ax.set_title("Correlation Matrix of Key Numerical Variables", fontsize=16)
fig.tight_layout()
fig.savefig(VISUALS_DIR / "correlation_matrix.png")
plt.show()

# Top Models by Region (Sales Volume)
# Aggregate sales per (Region, Model), then keep each region's 5 best sellers.
def get_top_models(group):
    # Five highest-selling models within one region's sub-frame.
    return group.nlargest(5, 'Total_Sales_Model')

top_models_region = (
    df.groupby(['Region', 'Model'])['Sales_Volume']
      .sum()
      .reset_index()
      .rename(columns={'Sales_Volume': 'Total_Sales_Model'})
)
top_models_per_region = (
    top_models_region.groupby('Region', group_keys=False)
                     .apply(get_top_models)
                     .reset_index(drop=True)
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(x='Total_Sales_Model', y='Model', hue='Region',
            data=top_models_per_region, dodge=False, palette='tab10', ax=ax)
ax.set_title("Top BMW Models by Sales Volume Across Regions", fontsize=16)
ax.set_xlabel("Total Sales Volume")
ax.set_ylabel("Model")
ax.legend(title='Region', loc='lower right')
fig.tight_layout()
fig.savefig(VISUALS_DIR / "top_model_per_region.png")
plt.show()

4. Final Preparation for Modeling: Temporal Split Readiness

The final step prepares the dataset by converting categorical features into numerical ones using One-Hot Encoding (OHE) and dropping redundant columns. The resulting bmw_modeling_ready.csv file is the required input for the 02-modeling.ipynb notebook.

# Prepare modeling-ready dataset: one-hot encode the categoricals, drop
# EDA-only columns, and save the result for 02-modeling.ipynb.
model_df = df.copy()

# One-Hot Encode (OHE). get_dummies(columns=...) consumes the source columns
# itself, so the original's follow-up drop of cat_cols was a dead no-op and
# has been removed.
cat_cols = ["Region", "Model", "Fuel_Type", "Transmission", "Color"]
model_df = pd.get_dummies(
    model_df,
    columns=[c for c in cat_cols if c in model_df.columns],
    drop_first=True,
)

# Drop columns that are redundant/not used in modeling, but were useful for
# EDA/initial features.
# Note: Price_USD is kept here so it can be extracted in 02-modeling for the
# final MAE calculation.
cols_to_drop = ["Sales_Classification", "Engine_Bin", "Age_Bin", "index"]
model_df = model_df.drop(columns=[c for c in cols_to_drop if c in model_df.columns])

# Save modeling-ready CSV
processed_path = OUTPUT_DIR / "bmw_modeling_ready.csv"
model_df.to_csv(processed_path, index=False)
print("Saved modeling-ready CSV:", processed_path)
print("Final Modeling Shape:", model_df.shape)
Saved modeling-ready CSV: ../data/cleaned/bmw_modeling_ready.csv
Final Modeling Shape: (50000, 33)