📊 Module 5: Python Data Science Project

🚀 Project Introduction

Congratulations on your new role! You've been hired as a Data Scientist at TechRise, a rapidly growing e-commerce startup. Your first assignment is to analyze customer behavior and provide actionable insights to improve sales and customer retention.

🏢 Company Background

TechRise sells electronics and tech accessories online. The company has been experiencing steady growth but wants to optimize its marketing strategy, improve product recommendations, and reduce customer churn.

📋 Project Goals

- Analyze customer purchase patterns
- Identify high-value customer segments
- Build a model to predict customer churn
- Create a dashboard for business stakeholders
- Generate product recommendations

📁 Project Dataset

You'll be working with the following datasets:

- customers.csv - Customer demographic information
- transactions.csv - Purchase history
- products.csv - Product catalog information
- browsing_history.csv - Website activity logs

Note: We'll generate simulated versions of these datasets with the Python code below, which also writes them to the CSV files listed above:

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set random seeds for reproducibility. The generators below draw from both
# NumPy's global RNG and the standard-library `random` module, so both must
# be seeded for repeated runs to produce the same random draws.
np.random.seed(42)
random.seed(42)

# Generate customer data
def generate_customer_data(num_customers=1000):
    """Build a synthetic customer table.

    Args:
        num_customers: Number of customer rows to generate.

    Returns:
        A DataFrame with one row per customer and the columns
        customer_id, registration_date, age, gender, location and email.
    """
    ids = range(1, num_customers + 1)

    # Registration date: 2022-01-01 plus a random 0-730 day offset,
    # formatted as an ISO date string.
    first_day = datetime(2022, 1, 1)
    day_offsets = [random.randint(0, 730) for _ in range(num_customers)]
    signup_dates = [
        (first_day + timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in day_offsets
    ]

    # Ages: normal(mean=35, sd=12), truncated to int, then clipped to 18-80.
    sampled_ages = np.random.normal(35, 12, num_customers).astype(int)
    ages = np.clip(sampled_ages, 18, 80)

    gender_labels = ['M', 'F', 'Other']
    gender_probs = [0.48, 0.48, 0.04]
    genders = np.random.choice(gender_labels, num_customers, p=gender_probs)

    cities = [
        'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
        'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Francisco',
    ]
    locations = np.random.choice(cities, num_customers)

    # One address per customer id, each with a randomly chosen mail domain.
    domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'icloud.com']
    emails = [f"customer_{cid}@{random.choice(domains)}" for cid in ids]

    columns = {
        'customer_id': ids,
        'registration_date': signup_dates,
        'age': ages,
        'gender': genders,
        'location': locations,
        'email': emails,
    }
    return pd.DataFrame(columns)

# Generate product data
def generate_product_data(num_products=100):
    """Build a synthetic product catalog.

    Args:
        num_products: Number of product rows to generate.

    Returns:
        A DataFrame with one row per product and the columns product_id,
        product_name, category, price, stock_level and rating.
    """
    catalog = {
        'Smartphones': ['iPhone 13', 'Galaxy S22', 'Pixel 6', 'OnePlus 9', 'Xiaomi 12'],
        'Laptops': ['MacBook Pro', 'Dell XPS', 'ThinkPad X1', 'HP Spectre', 'Asus ZenBook'],
        'Tablets': ['iPad Pro', 'Galaxy Tab', 'Surface Pro', 'Kindle Fire', 'Lenovo Tab'],
        'Accessories': ['Wireless Earbuds', 'Phone Case', 'Laptop Stand', 'Wireless Charger', 'USB-C Hub'],
        'Smart Home': ['Echo Dot', 'Nest Thermostat', 'Philips Hue', 'Ring Doorbell', 'Smart Plug']
    }
    # (low, high) price bounds per category; anything not listed here
    # (i.e. Smart Home) uses the fallback range.
    price_bounds = {
        'Smartphones': (500, 1200),
        'Laptops': (800, 2000),
        'Tablets': (300, 900),
        'Accessories': (20, 150),
    }
    fallback_bounds = (50, 250)
    variant_suffixes = ['', ' Pro', ' Plus', ' Mini', ' Max']
    color_names = ['Black', 'White', 'Silver', 'Gold', 'Blue', 'Red']
    category_names = list(catalog.keys())

    # First pass: pick a category, base model, variant and color per product.
    names = []
    assigned_categories = []
    for _ in range(num_products):
        picked_category = random.choice(category_names)
        base_model = random.choice(catalog[picked_category])
        suffix = random.choice(variant_suffixes)
        shade = random.choice(color_names)
        names.append(f"{base_model}{suffix} {shade}")
        assigned_categories.append(picked_category)

    # Second pass: prices are drawn only after all names, so the sequence of
    # draws from the stdlib RNG is names first, then prices.
    prices = [
        round(random.uniform(*price_bounds.get(cat, fallback_bounds)), 2)
        for cat in assigned_categories
    ]

    stock = np.random.randint(0, 100, num_products)

    # Ratings: normal(4, 0.5) clipped to the 1-5 scale, one decimal place.
    raw_ratings = np.random.normal(4, 0.5, num_products)
    ratings = np.clip(raw_ratings, 1, 5).round(1)

    return pd.DataFrame({
        'product_id': range(1, num_products + 1),
        'product_name': names,
        'category': assigned_categories,
        'price': prices,
        'stock_level': stock,
        'rating': ratings,
    })

# Generate transaction data
def generate_transaction_data(customers, products, num_transactions=5000):
    """Build a synthetic transaction table referencing customers and products.

    Args:
        customers: DataFrame with a 'customer_id' column.
        products: DataFrame with 'product_id' and 'price' columns.
        num_transactions: Number of transaction rows to generate.

    Returns:
        A DataFrame with one row per transaction and the columns
        transaction_id, customer_id, product_id, transaction_date, quantity,
        amount and payment_method. `amount` is the product's price times the
        quantity, rounded to 2 decimals.
    """
    transaction_ids = range(1, num_transactions + 1)

    # Dates are spread uniformly over the 365 days ending now, so the
    # generated dates depend on when the function is called.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    date_span = end_date - start_date

    transaction_dates = [
        (start_date + date_span * random.random()).strftime('%Y-%m-%d')
        for _ in range(num_transactions)
    ]

    # Some customers make more purchases than others: the weight of a row is
    # (row position) ** -0.8, so rows earlier in `customers` are drawn more
    # often.
    customer_weights = np.power(np.arange(1, len(customers) + 1), -0.8)
    customer_ids = np.random.choice(
        customers['customer_id'],
        size=num_transactions,
        p=customer_weights / customer_weights.sum()
    )

    # Some products are more popular than others: same scheme with a flatter
    # exponent of -0.6 over the row order of `products`.
    product_weights = np.power(np.arange(1, len(products) + 1), -0.6)
    product_ids = np.random.choice(
        products['product_id'],
        size=num_transactions,
        p=product_weights / product_weights.sum()
    )

    quantities = np.random.choice([1, 1, 1, 2, 2, 3], num_transactions)

    # Calculate transaction amounts with a single indexed lookup instead of
    # filtering the whole product table once per transaction. Duplicated
    # product ids resolve to their first row, as a first-match filter would.
    price_by_id = products.drop_duplicates('product_id').set_index('product_id')['price']
    unit_prices = price_by_id.loc[product_ids].to_numpy()
    amounts = np.round(unit_prices * quantities, 2)

    # Payment methods, weighted heavily toward credit cards.
    payment_methods = np.random.choice(
        ['Credit Card', 'PayPal', 'Apple Pay', 'Google Pay', 'Bank Transfer'],
        num_transactions,
        p=[0.6, 0.2, 0.1, 0.05, 0.05]
    )

    transaction_data = pd.DataFrame({
        'transaction_id': transaction_ids,
        'customer_id': customer_ids,
        'product_id': product_ids,
        'transaction_date': transaction_dates,
        'quantity': quantities,
        'amount': amounts,
        'payment_method': payment_methods
    })

    return transaction_data

# Generate browsing history data
def generate_browsing_history(customers, products, num_events=10000):
    """Build a synthetic table of website browsing events.

    Args:
        customers: DataFrame with a 'customer_id' column.
        products: DataFrame with a 'product_id' column.
        num_events: Number of event rows to generate.

    Returns:
        A DataFrame with one row per event and the columns event_id,
        customer_id, product_id, timestamp, event_type,
        session_duration_min and device.
    """
    # Timestamps are spread uniformly over the 60 days ending now.
    window_end = datetime.now()
    window_start = window_end - timedelta(days=60)
    window = window_end - window_start
    stamps = []
    for _ in range(num_events):
        moment = window_start + window * random.random()
        stamps.append(moment.strftime('%Y-%m-%d %H:%M:%S'))

    # Customers and products are picked uniformly at random for each event.
    visitor_ids = np.random.choice(customers['customer_id'], size=num_events)
    viewed_ids = np.random.choice(products['product_id'], size=num_events)

    # Labels repeat on purpose: the per-slot probabilities add up to
    # 0.95 view, 0.04 add_to_cart and 0.01 purchase.
    event_labels = ['view', 'view', 'view', 'view', 'add_to_cart', 'add_to_cart', 'purchase']
    event_probs = [0.7, 0.1, 0.1, 0.05, 0.03, 0.01, 0.01]
    kinds = np.random.choice(event_labels, num_events, p=event_probs)

    # Durations follow an exponential distribution with scale 5,
    # rounded to 2 decimals.
    durations = np.random.exponential(scale=5, size=num_events).round(2)

    device_labels = ['Desktop', 'Mobile', 'Tablet']
    device_probs = [0.4, 0.5, 0.1]
    device_used = np.random.choice(device_labels, num_events, p=device_probs)

    return pd.DataFrame({
        'event_id': range(1, num_events + 1),
        'customer_id': visitor_ids,
        'product_id': viewed_ids,
        'timestamp': stamps,
        'event_type': kinds,
        'session_duration_min': durations,
        'device': device_used,
    })

# Build the four tables. Transactions and browsing events draw their ids
# from the customer and product tables, so those two are created first.
customers = generate_customer_data()
products = generate_product_data()
transactions = generate_transaction_data(customers, products)
browsing_history = generate_browsing_history(customers, products)

# Write each table to its CSV file in the working directory, without the
# DataFrame index column.
for _csv_name, _table in (
    ('customers.csv', customers),
    ('products.csv', products),
    ('transactions.csv', transactions),
    ('browsing_history.csv', browsing_history),
):
    _table.to_csv(_csv_name, index=False)

print("Datasets generated successfully!")

🚀 Project Introduction

🏢 Company Background

📋 Project Goals

📁 Project Dataset

💻 Project Steps

1️⃣ Step 1: Data Exploration and Understanding