# 💻 Step 1: Import core packages and print versions
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Use one consistent seaborn theme and default figure size for every plot
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

print("Libraries imported successfully!")

# Report the version of each core library so results can be reproduced later
for _label, _module in (
    ("Pandas version:     ", pd),
    ("NumPy version:      ", np),
    ("Seaborn version:    ", sns),
    ("Matplotlib version: ", mpl),
):
    print(f"{_label}{_module.__version__}")

Libraries imported successfully!
Pandas version:     3.0.0
NumPy version:      2.4.1
Seaborn version:    0.13.2
Matplotlib version: 3.10.8

# Seed the global NumPy RNG so every run produces the same synthetic data
np.random.seed(42)
n_samples = 200

# 1. Create Customer Profiles DataFrame
customers_data = {
    'customer_id': [f"CUST_{i:03d}" for i in range(1, 51)],
    # np.nan is one of the choices, so roughly 10% of ages are missing on purpose
    'age': np.random.choice([22, 28, 35, 42, 50, np.nan], size=50, p=[0.2, 0.2, 0.2, 0.2, 0.1, 0.1]),
    'gender': np.random.choice(['M', 'F', 'Other'], size=50),
    'membership': np.random.choice(['Bronze', 'Silver', 'Gold'], size=50, p=[0.5, 0.3, 0.2]),
    # Uppercase 'D' is the supported daily alias; lowercase 'd' is deprecated
    'signup_date': pd.date_range(start='2025-01-01', periods=50, freq='D')
}
df_customers = pd.DataFrame(customers_data)

# 2. Create Transactions DataFrame
tx_dates = pd.date_range(start='2026-01-01', periods=n_samples, freq='h')

# The pool must be an object array: a plain list mixing strings with np.nan is
# coerced by NumPy to a unicode array, turning the missing marker into the
# literal string 'nan' instead of a real missing value.
category_pool = np.array(['Electronics', 'Clothing', 'Books', np.nan], dtype=object)

transactions_data = {
    'transaction_id': [f"TX_{i:04d}" for i in range(1, n_samples + 1)],
    'customer_id': np.random.choice(df_customers['customer_id'], size=n_samples),
    'timestamp': tx_dates,
    'amount': np.random.normal(loc=120, scale=40, size=n_samples).round(2),
    # About 5% of categories are genuinely missing (NaN)
    'category': np.random.choice(category_pool, size=n_samples, p=[0.4, 0.4, 0.15, 0.05]),
    # Imbalanced binary target: about 5% positives
    'is_fraud': np.random.choice([0, 1], size=n_samples, p=[0.95, 0.05])
}
df_transactions = pd.DataFrame(transactions_data)

# Inject some duplicate rows into transactions (rows 10-14 are appended again)
duplicates = df_transactions.iloc[10:15]
df_transactions = pd.concat([df_transactions, duplicates], ignore_index=True)

print(f"Transactions dataset generated. Shape: {df_transactions.shape}")
print(f"Customers dataset generated. Shape: {df_customers.shape}")

Transactions dataset generated. Shape: (205, 6)
Customers dataset generated. Shape: (50, 5)

C:\Users\aakashkhandelwal\AppData\Local\Temp\ipykernel_18864\1921948098.py:11: Pandas4Warning: 'd' is deprecated and will be removed in a future version, please use 'D' instead.
  'signup_date': pd.date_range(start='2025-01-01', periods=50, freq='d')

# 1. Duplicate handling: count fully repeated rows, then keep the first copy
dup_flags = df_transactions.duplicated()
dup_count = dup_flags.sum()
print(f"Number of duplicate rows in transactions: {dup_count}")

df_transactions = df_transactions.drop_duplicates(keep='first')
print(f"Shape after removing duplicates: {df_transactions.shape}")

# 2. Missing-value audit on the customer table (per-column null counts)
missing_per_column = df_customers.isnull().sum()
print("\nMissing values in customer profiles:")
print(missing_per_column)

# Fill gaps in age with the column median
age_column = df_customers['age']
median_age = age_column.median()
df_customers['age'] = age_column.fillna(median_age)
print(f"\nImputed missing age with median age: {median_age}")
print(f"Missing age counts after imputation: {df_customers['age'].isnull().sum()}")

Number of duplicate rows in transactions: 5
Shape after removing duplicates: (200, 6)

Missing values in customer profiles:
customer_id    0
age            5
gender         0
membership     0
signup_date    0
dtype: int64

Imputed missing age with median age: 28.0
Missing age counts after imputation: 0

# 1. Ordinal encoding: membership tiers are ranked, so map each tier to its
#    position in the ordering (Bronze=0 < Silver=1 < Gold=2)
tier_order = ['Bronze', 'Silver', 'Gold']
membership_map = {tier: rank for rank, tier in enumerate(tier_order)}
df_customers['membership_encoded'] = df_customers['membership'].map(membership_map)

# 2. One-hot encoding: gender is nominal, so expand it into dummy columns.
#    drop_first=True omits the first level to avoid the dummy variable trap
#    (perfectly collinear indicator columns).
df_customers_encoded = pd.get_dummies(
    df_customers,
    columns=['gender'],
    drop_first=True,
)

print("Sample of encoded customer profiles:")
preview_cols = ['customer_id', 'membership', 'membership_encoded', 'gender_M', 'gender_Other']
df_customers_encoded[preview_cols].head()

Sample of encoded customer profiles:

# Columns holding a small set of repeated labels
label_cols = ['membership', 'gender']

# Baseline: deep memory footprint while the labels are stored as strings
print("--- Memory before category type-casting ---")
df_customers[label_cols].info(memory_usage='deep')

# Convert both label columns to the category dtype on a separate frame,
# leaving df_customers itself untouched
df_mem_opt = df_customers.astype({col: 'category' for col in label_cols})

print("\n--- Memory after category type-casting ---")
df_mem_opt[label_cols].info(memory_usage='deep')

--- Memory before category type-casting ---
<class 'pandas.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   membership  50 non-null     str  
 1   gender      50 non-null     str  
dtypes: str(2)
memory usage: 1.3 KB

--- Memory after category type-casting ---
<class 'pandas.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   membership  50 non-null     category
 1   gender      50 non-null     category
dtypes: category(2)
memory usage: 305.0 bytes

# 1. Boolean-mask filtering: combine element-wise conditions with `&`
is_electronics = df_transactions['category'] == 'Electronics'
above_150 = df_transactions['amount'] > 150
high_value_electronics = df_transactions[is_electronics & above_150]


# 2. Row-wise transformation with .apply()
def _premium_fee(row):
    """Return the category-based fee: 5% of amount for Electronics, else 2%."""
    if row['category'] == 'Electronics':
        return row['amount'] * 0.05
    return row['amount'] * 0.02


df_transactions['premium_fee'] = df_transactions.apply(_premium_fee, axis=1)

# 3. Fast vectorized categorization with np.where() and np.select()
# Binary indicator: 1 when the amount exceeds 150, otherwise 0
df_transactions['is_high_value'] = np.where(above_150, 1, 0)

# Multi-class spend bucket; the first matching condition wins
amount = df_transactions['amount']
conditions = [
    amount < 80,
    (amount >= 80) & (amount < 150),
    amount >= 150,
]
choices = ['Low_Spend', 'Medium_Spend', 'High_Spend']
df_transactions['spend_group'] = np.select(conditions, choices, default='Unknown')

preview_cols = ['transaction_id', 'amount', 'category', 'premium_fee', 'is_high_value', 'spend_group']
df_transactions[preview_cols].head()

# Left join (SQL-style): keep every transaction and attach the matching
# customer's encoded profile via the shared customer_id key
df_merged = df_transactions.merge(
    df_customers_encoded,
    how='left',
    on='customer_id',
)

print(f"Merged DataFrame Shape: {df_merged.shape}")
print("Merged columns:")
print(df_merged.columns.tolist())

preview_cols = ['transaction_id', 'customer_id', 'amount', 'age', 'membership_encoded', 'gender_M']
df_merged[preview_cols].head()

Merged DataFrame Shape: (200, 15)
Merged columns:
['transaction_id', 'customer_id', 'timestamp', 'amount', 'category', 'is_fraud', 'premium_fee', 'is_high_value', 'spend_group', 'age', 'membership', 'signup_date', 'membership_encoded', 'gender_M', 'gender_Other']

# 1. Split by membership tier and average the transaction amount
group_means = df_merged.groupby('membership')['amount'].mean()
print("Mean transaction amount by membership level:")
print(group_means)

# 2. Several metrics at once: .agg() takes a {column: function(s)} mapping
metric_spec = {
    'amount': ['mean', 'max', 'std'],
    'age': 'median',
}
agg_results = df_merged.groupby('membership').agg(metric_spec)
print("\nMulti-metric aggregations:")
print(agg_results)

# 3. Feature engineering with .transform(): it returns one value per original
#    row (the row's tier mean), so it can be subtracted from `amount` directly
#    to centre each purchase on its membership-tier average
tier_mean_amount = df_merged.groupby('membership')['amount'].transform('mean')
df_merged['amount_diff_tier_mean'] = df_merged['amount'] - tier_mean_amount
df_merged[['customer_id', 'membership', 'amount', 'amount_diff_tier_mean']].head()

Mean transaction amount by membership level:
membership
Bronze    113.059903
Gold      123.339535
Silver    123.535741
Name: amount, dtype: float64

Multi-metric aggregations:
                amount                       age
                  mean     max        std median
membership                                      
Bronze      113.059903  205.73  42.116029   28.0
Gold        123.339535  204.52  41.723574   28.0
Silver      123.535741  247.48  46.608314   28.0

# 1. Ensure the timestamp column is datetime, then derive calendar features
df_merged['timestamp'] = pd.to_datetime(df_merged['timestamp'])
ts = df_merged['timestamp']
df_merged['hour'] = ts.dt.hour
df_merged['day_of_week'] = ts.dt.dayofweek
# dayofweek is 0=Monday ... 6=Sunday, so values >= 5 are Saturday/Sunday
df_merged['is_weekend'] = np.where(df_merged['day_of_week'] >= 5, 1, 0)

# 2. Shift-based features (lag / lead)
# Index by timestamp and sort so that row order equals time order
df_ts = df_merged.set_index('timestamp').sort_index()
amount_series = df_ts['amount']

# Lag features: the amount observed 1 and 2 rows (hours) earlier
for n_back in (1, 2):
    df_ts[f'lag_amount_{n_back}h'] = amount_series.shift(n_back)

# Lead feature: the amount of the next row (next hour), usable as a
# forecasting target
df_ts['target_lead_1h'] = amount_series.shift(-1)

# 3. Rolling-window smoothing: mean over the current row and the two before it
df_ts['rolling_mean_3h'] = amount_series.rolling(window=3).mean()

# Shifting and rolling leave NaNs at the start/end of the series; drop those rows
shifted_cols = ['lag_amount_1h', 'lag_amount_2h', 'target_lead_1h', 'rolling_mean_3h']
df_ts = df_ts.dropna(subset=shifted_cols)

df_ts[['amount', 'lag_amount_1h', 'lag_amount_2h', 'target_lead_1h', 'rolling_mean_3h']].head()

# Write your code here to calculate 14-day rolling standard deviation:
# Hint: use .rolling() with a window parameter of '14D' or 14 on the datetime-indexed df_ts

# Calculate a 14-day rolling standard deviation.
# A time-offset window needs a sorted DatetimeIndex, which df_ts has.
# The uppercase day alias 'D' is used because lowercase 'd' is deprecated.
# The first row is NaN: a standard deviation needs at least two observations.
df_ts['rolling_std_14d'] = df_ts['amount'].rolling(window='14D').std()
print(df_ts[['amount', 'rolling_std_14d']].head(10))

# 1. Target imbalance: share of each is_fraud class, expressed in percent
fraud_dist = df_ts['is_fraud'].value_counts(normalize=True) * 100
print("Fraud (Target) distribution (%):")
print(fraud_dist.round(2))

# One figure, two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
box_ax, heat_ax = axes[0], axes[1]

# Left panel: amount distribution per membership tier, split by fraud label;
# boxplots make outliers within each group visible
sns.boxplot(
    data=df_ts,
    x='membership',
    y='amount',
    hue='is_fraud',
    palette='muted',
    ax=box_ax,
)
box_ax.set_title("Transaction Amounts by Membership & Fraud Status")
box_ax.set_xlabel("Membership Level")
box_ax.set_ylabel("Amount ($)")

# Right panel: pairwise correlations between numeric columns, used to spot
# collinear features
numeric_cols = df_ts.select_dtypes(include=[np.number]).columns.tolist()
# Leave is_high_value out of the correlation inspection
cols_to_corr = [name for name in numeric_cols if name != 'is_high_value']
corr_matrix = df_ts[cols_to_corr].corr()

sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=heat_ax,
)
heat_ax.set_title("Correlation Matrix for Feature Selection")

plt.tight_layout()
plt.show()

Fraud (Target) distribution (%):
is_fraud
0    95.94
1     4.06
Name: proportion, dtype: float64

# Columns excluded from the feature matrix: identifiers, raw (unencoded)
# string/date columns, and the target labels themselves
features_to_drop = [
    'transaction_id',   # identifier
    'customer_id',      # identifier
    'category',         # raw string categorical with no encoded counterpart here
    'membership',       # raw string categorical; membership_encoded is kept instead
    'signup_date',      # raw datetime column
    'is_fraud',         # classification target, must not appear in X
    'target_lead_1h',   # regression/forecasting target
    'spend_group',      # demonstration-only string column; dropped to keep X numerical
]

# Feature matrix X and binary classification target y
y = df_ts['is_fraud']
X = df_ts.drop(columns=features_to_drop)

print("--- Final Preprocessed Shapes for Scikit-Learn ---")
print(f"Feature Matrix X Shape: {X.shape} (Rows, Features)")
print(f"Target Vector y Shape: {y.shape} (Rows,)")
print("\nFeatures list (X):")
print(X.columns.tolist())
print("\nFirst 3 rows of feature matrix X:")
X.head(3)

--- Final Preprocessed Shapes for Scikit-Learn ---
Feature Matrix X Shape: (197, 14) (Rows, Features)
Target Vector y Shape: (197,) (Rows,)

Features list (X):
['amount', 'premium_fee', 'is_high_value', 'age', 'membership_encoded', 'gender_M', 'gender_Other', 'amount_diff_tier_mean', 'hour', 'day_of_week', 'is_weekend', 'lag_amount_1h', 'lag_amount_2h', 'rolling_mean_3h']

First 3 rows of feature matrix X:

# Cleaning pipeline: remove duplicate rows, fill missing ages with the median
# age of the original frame, then keep only rows whose amount exceeds 10.
# NOTE(review): `df` is not defined in the visible part of this file; this
# looks like an illustrative snippet - confirm which DataFrame is intended.
age_fill_values = {'age': df['age'].median()}
df_clean = df.drop_duplicates().fillna(age_fill_values).query("amount > 10")

	customer_id	membership	membership_encoded	gender_M	gender_Other
0	CUST_001	Bronze	0	False	True
1	CUST_002	Silver	1	False	False
2	CUST_003	Silver	1	False	False
3	CUST_004	Gold	2	False	False
4	CUST_005	Bronze	0	False	False

	transaction_id	customer_id	amount	age	membership_encoded	gender_M
0	TX_0001	CUST_032	47.92	22.0	2	False
1	TX_0002	CUST_032	138.71	22.0	2	False
2	TX_0003	CUST_024	57.94	28.0	0	False
3	TX_0004	CUST_041	99.58	22.0	0	False
4	TX_0005	CUST_049	90.68	35.0	0	False

	customer_id	membership	amount	amount_diff_tier_mean
0	CUST_032	Gold	47.92	-75.419535
1	CUST_032	Gold	138.71	15.370465
2	CUST_024	Bronze	57.94	-55.119903
3	CUST_041	Bronze	99.58	-13.479903
4	CUST_049	Bronze	90.68	-22.379903

	amount	lag_amount_1h	lag_amount_2h	target_lead_1h	rolling_mean_3h
timestamp
2026-01-01 02:00:00	57.94	138.71	47.92	99.58	81.523333
2026-01-01 03:00:00	99.58	57.94	138.71	90.68	98.743333
2026-01-01 04:00:00	90.68	99.58	57.94	127.81	82.733333
2026-01-01 05:00:00	127.81	90.68	99.58	97.76	106.023333
2026-01-01 06:00:00	97.76	127.81	90.68	114.89	105.416667

	amount	premium_fee	is_high_value	age	membership_encoded	gender_M	gender_Other	amount_diff_tier_mean	hour	day_of_week	is_weekend	lag_amount_1h	lag_amount_2h	rolling_mean_3h
timestamp
2026-01-01 02:00:00	57.94	2.8970	0	28.0	0	False	True	-55.119903	2	3	0	138.71	47.92	81.523333
2026-01-01 03:00:00	99.58	4.9790	0	22.0	0	False	True	-13.479903	3	3	0	57.94	138.71	98.743333
2026-01-01 04:00:00	90.68	1.8136	0	35.0	0	False	True	-22.379903	4	3	0	99.58	57.94	82.733333

📓 Interactive Notebook: Advanced Data Manipulation with Pandas & Seaborn¶

🎯 Learning Objectives¶

🧠 Understanding the Ecosystem: Pandas Internals & Modern Alternatives¶

1. Pandas Internals: The Constraints¶

2. Modern Alternatives for Big Data & ML Pipelines¶

🛠️ Step 2: Synthetic Data Generation¶

🧹 Step 3: Data Cleaning & Pre-ML Formatting¶

🏷️ Step 4: Categorical Encoding¶

💡 Advanced Tip: Memory Optimization with Categoricals¶

🔍 Step 5: Advanced Filtering & Custom Feature Engineering¶

🤝 Step 6: Integrating Data Sources¶

📊 Step 6.5: GroupBy and Aggregations (Split-Apply-Combine)¶

📅 Step 7: Dates and Time Series Preprocessing¶

🧠 Challenge Exercise: Volatility Tracking¶

🔍 Click here to see the Solution!¶

📊 Step 8: Exploratory Data Analysis & Seaborn Visualizations¶

🚀 Step 9: Scikit-Learn Handover Split ($X$ and $y$)¶

🏆 Step 10: Advanced Best Practices & Pandas 3.0 Roadmap¶

1. Pandas Performance Best Practices¶

🚀 The Roadmap: What's New in Pandas 3.0?¶

🎉 Congratulations!¶

📚 Appendix: Computer Architecture & Data Engineering 101¶

🧱 Part 1: CPU Internals (Cores, Threads, Concurrency, and Parallelism)¶

🧩 Cores vs. Threads: The Hardware Basics¶

🍽️ The Restaurant Analogy¶

⏱️ Concurrency vs. Parallelism¶

How Software Meets Hardware:¶

🐍 Python's Execution Limit: The GIL¶

⚙️ Execution Models Compared: Pandas, Polars & DuckDB¶

1. Pandas: The Single-Threaded Traditionalist¶

2. Polars: The Multi-Threaded Speedster¶

3. DuckDB: The Vectorized Columnar Engine¶

🔍 Part 2: Data Engineering Paradoxes¶

📊 Paradox 1: Row-Oriented vs. Column-Oriented (CSV vs. Parquet)¶

⏳ Paradox 2: Eager vs. Lazy Evaluation (Pandas vs. Polars)¶

💾 Paradox 3: The 10x RAM Rule (In-Memory vs. Disk-Spilling)¶

🔗 Paradox 4: Views vs. Copies (Chained Indexing)¶

📡 Part 3: Deep Hardware Architecture Concepts¶

⚡ Cache Locality & SIMD (Desk Drawer vs. Library Warehouse)¶

⏱️ The Latency Scale (Visualizing CPU Speeds)¶

	transaction_id	amount	category	premium_fee	spend_group
0	TX_0001	47.92	Electronics	2.3960	Low_Spend
1	TX_0002	138.71	Electronics	6.9355	Medium_Spend
2	TX_0003	57.94	Electronics	2.8970	Low_Spend
3	TX_0004	99.58	Electronics	4.9790	Medium_Spend
4	TX_0005	90.68	Books	1.8136	Medium_Spend

Concept	What It Is	Restaurant Analogy
Core	Physical hardware processor inside the CPU.	A physical chef working in the kitchen.
Thread	A stream of tasks executed by software.	A recipe/order ticket that needs to be prepared.
Single-Threading	One task is executed at a time.	One chef working on exactly one order from start to finish.
Multi-Threading	Multiple tasks are executed concurrently.	One chef juggling multiple orders (e.g. chopping vegetables for Order B while waiting for Order A's water to boil).
Multi-Core Processing	Multiple physical processors execute tasks at the same time.	Multiple chefs working simultaneously on different orders.

Framework	Threading Model	Core Utilization	Underlying Engine
Pandas	Single-threaded by default.	Utilizes 1 core (unless calling pre-compiled NumPy C-libraries).	Python/C, restricted by the Global Interpreter Lock (GIL).
Polars	Multi-threaded by default.	Spreads workloads across all available CPU cores.	Rust-native, using Apache Arrow and the Rayon parallel framework.
DuckDB	Multi-threaded by default.	Divides queries across all available CPU cores.	C++ vectorized, push-based execution engine.

Operation	Time in CPU Scale	Human Analogy
1 CPU Cycle	1 Second	A single blink of an eye.
L1 Cache Access	0.5 Seconds	Grabbing a notebook on your desk.
L3 Cache Access	15 Seconds	Grabbing a book from a nearby shelf.
Main Memory (RAM)	2 Minutes	Walking down the hall to the water cooler.
Solid State Drive (SSD)	2 Days	Taking a weekend trip to another city.
Mechanical Hard Drive (HDD)	1.5 Months	Going on a long summer vacation.
Network Call (Internet)	2 Years	Going to university and earning a degree.