Congratulations on your new role! You've been hired as a Data Scientist at TechRise, a rapidly growing e-commerce startup. Your first assignment is to analyze customer behavior and provide actionable insights to improve sales and customer retention.
TechRise sells electronics and tech accessories online. The company has been experiencing steady growth but wants to optimize its marketing strategy, improve product recommendations, and reduce customer churn.
You'll be working with four datasets: customers, products, transactions, and browsing history.
Note: These datasets are simulated with the following Python code:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
# Set random seeds for reproducibility.
# The generators below draw from BOTH NumPy's global RNG and the stdlib
# ``random`` module, so both must be seeded; seeding only NumPy leaves
# dates, email domains, product names and prices different on every run.
np.random.seed(42)
random.seed(42)
# Generate customer data
def generate_customer_data(num_customers=1000):
    """Build a synthetic customer table.

    Every customer receives a sequential id starting at 1, a registration
    date picked uniformly from the 731 days beginning 2022-01-01, an age
    drawn from a normal distribution (mean 35, sd 12) truncated to an integer
    and clipped to the range 18-80, a gender, one of ten US cities, and an
    email address of the form ``customer_<id>@<domain>``.

    Registration dates and email domains are drawn from the stdlib ``random``
    module; ages, genders and locations come from NumPy's global RNG.

    Args:
        num_customers: Number of customer rows to generate.

    Returns:
        A pandas DataFrame with the columns ``customer_id``,
        ``registration_date`` (``YYYY-MM-DD`` strings), ``age``, ``gender``,
        ``location`` and ``email``.
    """
    first_day = datetime(2022, 1, 1)
    cities = [
        'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia',
        'San Antonio', 'San Diego', 'Dallas', 'San Francisco',
    ]
    domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'icloud.com']
    ids = range(1, num_customers + 1)

    # One stdlib-random draw per customer: day offset from the first day.
    signup_dates = []
    for _ in ids:
        offset = timedelta(days=random.randint(0, 730))
        signup_dates.append((first_day + offset).strftime('%Y-%m-%d'))

    # Truncate to whole years first, then force into a plausible adult range.
    whole_ages = np.random.normal(35, 12, num_customers).astype(int)
    bounded_ages = whole_ages.clip(18, 80)

    gender_col = np.random.choice(
        ['M', 'F', 'Other'], num_customers, p=[0.48, 0.48, 0.04]
    )
    city_col = np.random.choice(cities, num_customers)

    # Addresses are unique because they embed the customer id.
    email_col = []
    for cid in ids:
        email_col.append(f"customer_{cid}@{random.choice(domains)}")

    columns = {
        'customer_id': ids,
        'registration_date': signup_dates,
        'age': bounded_ages,
        'gender': gender_col,
        'location': city_col,
        'email': email_col,
    }
    return pd.DataFrame(columns)
# Generate product data
def generate_product_data(num_products=100):
    """Build a synthetic product catalogue.

    Each product gets a sequential id starting at 1, a random category, a
    name of the form ``<base>[ <variant>] <color>``, a price drawn uniformly
    from a category-specific range, a stock level in 0-99 and a rating drawn
    from a normal distribution (mean 4, sd 0.5) clipped to 1-5 and rounded to
    one decimal.

    Names and prices are drawn from the stdlib ``random`` module; stock
    levels and ratings come from NumPy's global RNG. Product names are not
    guaranteed to be unique.

    Args:
        num_products: Number of product rows to generate.

    Returns:
        A pandas DataFrame with the columns ``product_id``, ``product_name``,
        ``category``, ``price``, ``stock_level`` and ``rating``.
    """
    catalog = {
        'Smartphones': ['iPhone 13', 'Galaxy S22', 'Pixel 6', 'OnePlus 9', 'Xiaomi 12'],
        'Laptops': ['MacBook Pro', 'Dell XPS', 'ThinkPad X1', 'HP Spectre', 'Asus ZenBook'],
        'Tablets': ['iPad Pro', 'Galaxy Tab', 'Surface Pro', 'Kindle Fire', 'Lenovo Tab'],
        'Accessories': ['Wireless Earbuds', 'Phone Case', 'Laptop Stand', 'Wireless Charger', 'USB-C Hub'],
        'Smart Home': ['Echo Dot', 'Nest Thermostat', 'Philips Hue', 'Ring Doorbell', 'Smart Plug'],
    }
    # (low, high) bounds of the uniform price distribution per category.
    price_ranges = {
        'Smartphones': (500, 1200),
        'Laptops': (800, 2000),
        'Tablets': (300, 900),
        'Accessories': (20, 150),
        'Smart Home': (50, 250),
    }
    variants = ['', ' Pro', ' Plus', ' Mini', ' Max']
    colors = ['Black', 'White', 'Silver', 'Gold', 'Blue', 'Red']
    category_names = list(catalog)

    product_names = []
    product_categories = []
    for _ in range(num_products):
        category = random.choice(category_names)
        product_base = random.choice(catalog[category])
        variant = random.choice(variants)
        color = random.choice(colors)
        # Several base names already end in a variant word ("MacBook Pro",
        # "iPad Pro", "Surface Pro"); appending it again would produce
        # nonsense such as "MacBook Pro Pro", so drop the duplicate suffix.
        if variant and product_base.endswith(variant):
            variant = ''
        product_names.append(f"{product_base}{variant} {color}")
        product_categories.append(category)

    prices = [
        round(random.uniform(*price_ranges[category]), 2)
        for category in product_categories
    ]

    stock_levels = np.random.randint(0, 100, num_products)
    ratings = np.random.normal(4, 0.5, num_products)
    ratings = np.clip(ratings, 1, 5).round(1)

    product_data = pd.DataFrame({
        'product_id': range(1, num_products + 1),
        'product_name': product_names,
        'category': product_categories,
        'price': prices,
        'stock_level': stock_levels,
        'rating': ratings,
    })
    return product_data
# Generate transaction data
def _power_law_weights(n, exponent):
    """Return ``n`` probabilities proportional to ``rank ** -exponent`` for rank 1..n."""
    weights = np.arange(1, n + 1, dtype=float) ** -exponent
    return weights / weights.sum()


def generate_transaction_data(customers, products, num_transactions=5000, end_date=None):
    """Build a synthetic transaction table.

    Transaction dates are spread uniformly over the 365 days ending at
    ``end_date``. Customers and products are sampled with power-law weights
    based on their row position (exponent 0.8 for customers, 0.6 for
    products), so rows near the top of each input table appear more often.
    ``amount`` is the product's price times the quantity, rounded to two
    decimals.

    Dates are drawn from the stdlib ``random`` module; everything else comes
    from NumPy's global RNG.

    Args:
        customers: DataFrame with a ``customer_id`` column.
        products: DataFrame with ``product_id`` and ``price`` columns. If a
            product id occurs more than once, the first row's price is used.
        num_transactions: Number of transaction rows to generate.
        end_date: ``datetime`` marking the end of the one-year window.
            Defaults to ``datetime.now()``; pass a fixed value to get
            reproducible dates.

    Returns:
        A pandas DataFrame with the columns ``transaction_id``,
        ``customer_id``, ``product_id``, ``transaction_date`` (``YYYY-MM-DD``
        strings), ``quantity``, ``amount`` and ``payment_method``.
    """
    if end_date is None:
        end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    window = end_date - start_date
    transaction_dates = [
        (start_date + window * random.random()).strftime('%Y-%m-%d')
        for _ in range(num_transactions)
    ]

    # Some customers make more purchases than others (pareto principle).
    customer_ids = np.random.choice(
        customers['customer_id'].to_numpy(),
        size=num_transactions,
        p=_power_law_weights(len(customers), 0.8),
    )
    # Some products are more popular than others.
    product_ids = np.random.choice(
        products['product_id'].to_numpy(),
        size=num_transactions,
        p=_power_law_weights(len(products), 0.6),
    )

    # Repeated entries make 1 the most likely quantity and 3 the least.
    quantities = np.random.choice([1, 1, 1, 2, 2, 3], num_transactions)

    # Look every price up once through an index instead of scanning the
    # product table with a boolean mask for each transaction.
    price_by_id = (
        products.drop_duplicates('product_id').set_index('product_id')['price']
    )
    unit_prices = price_by_id.loc[product_ids].to_numpy()
    amounts = np.round(unit_prices * quantities, 2)

    # Payment methods with a realistic, card-heavy distribution.
    payment_methods = np.random.choice(
        ['Credit Card', 'PayPal', 'Apple Pay', 'Google Pay', 'Bank Transfer'],
        num_transactions,
        p=[0.6, 0.2, 0.1, 0.05, 0.05],
    )

    transaction_data = pd.DataFrame({
        'transaction_id': range(1, num_transactions + 1),
        'customer_id': customer_ids,
        'product_id': product_ids,
        'transaction_date': transaction_dates,
        'quantity': quantities,
        'amount': amounts,
        'payment_method': payment_methods,
    })
    return transaction_data
# Generate browsing history data
def generate_browsing_history(customers, products, num_events=10000, end_date=None):
    """Build a synthetic browsing-event table.

    Event timestamps are spread uniformly over the 60 days ending at
    ``end_date``. Customers and products are sampled uniformly from the
    input tables. Event types are ``view`` (95%), ``add_to_cart`` (4%) and
    ``purchase`` (1%). Each event also gets a duration in minutes drawn from
    an exponential distribution with scale 5, rounded to two decimals, and a
    device type.

    Timestamps are drawn from the stdlib ``random`` module; everything else
    comes from NumPy's global RNG.

    Args:
        customers: DataFrame with a ``customer_id`` column.
        products: DataFrame with a ``product_id`` column.
        num_events: Number of event rows to generate.
        end_date: ``datetime`` marking the end of the 60-day window.
            Defaults to ``datetime.now()``; pass a fixed value to get
            reproducible timestamps.

    Returns:
        A pandas DataFrame with the columns ``event_id``, ``customer_id``,
        ``product_id``, ``timestamp`` (``YYYY-MM-DD HH:MM:SS`` strings),
        ``event_type``, ``session_duration_min`` and ``device``.
    """
    if end_date is None:
        end_date = datetime.now()
    start_date = end_date - timedelta(days=60)
    window = end_date - start_date
    timestamps = [
        (start_date + window * random.random()).strftime('%Y-%m-%d %H:%M:%S')
        for _ in range(num_events)
    ]

    customer_ids = np.random.choice(customers['customer_id'], size=num_events)
    product_ids = np.random.choice(products['product_id'], size=num_events)

    # Most events are views. One probability per label keeps the effective
    # distribution visible instead of spreading it over repeated labels.
    event_types = np.random.choice(
        ['view', 'add_to_cart', 'purchase'],
        num_events,
        p=[0.95, 0.04, 0.01],
    )

    # Exponential durations: many short visits, a few long ones.
    session_durations = np.random.exponential(scale=5, size=num_events).round(2)

    devices = np.random.choice(
        ['Desktop', 'Mobile', 'Tablet'],
        num_events,
        p=[0.4, 0.5, 0.1],
    )

    browsing_data = pd.DataFrame({
        'event_id': range(1, num_events + 1),
        'customer_id': customer_ids,
        'product_id': product_ids,
        'timestamp': timestamps,
        'event_type': event_types,
        'session_duration_min': session_durations,
        'device': devices,
    })
    return browsing_data
# Build the four tables. Transactions and browsing events are generated
# last because they sample ids from the customer and product tables.
customers = generate_customer_data()
products = generate_product_data()
transactions = generate_transaction_data(customers, products)
browsing_history = generate_browsing_history(customers, products)

# Write each table to a CSV file in the current working directory,
# leaving out the DataFrame index column.
for _csv_name, _table in (
    ('customers.csv', customers),
    ('products.csv', products),
    ('transactions.csv', transactions),
    ('browsing_history.csv', browsing_history),
):
    _table.to_csv(_csv_name, index=False)

print("Datasets generated successfully!")