Skip to content

🚀 Real-World KDP Examples

📋 Quick Overview

Ready to see KDP in action? This guide showcases complete, practical examples that demonstrate how to use KDP's advanced features for real-world scenarios. Each example includes detailed explanations, code snippets, and visualization of the resulting preprocessing pipeline.

💼 E-commerce Product Analytics

This example demonstrates how to preprocess product data for an e-commerce recommendation system, combining numerical, categorical, text, and date features with advanced KDP capabilities.

1️⃣ Setting Up Features

import pandas as pd
import tensorflow as tf
from kdp.features import (
    NumericalFeature, CategoricalFeature, TextFeature, DateFeature, FeatureType
)
from kdp.processor import PreprocessingModel, OutputModeOptions

# Feature specifications for the e-commerce dataset, keyed by feature name.
# Each feature type gets its own specialized processing.
features = {
    # --- Numerical features, each with a different processing strategy ---
    # Normalized price, additionally embedded (8 dimensions) for a richer
    # representation.
    "price": NumericalFeature(
        name="price",
        feature_type=FeatureType.FLOAT_NORMALIZED,
        use_embedding=True,
        embedding_dim=8,
    ),
    # Rescaled quantity; it is count data, hence the Poisson preference.
    "quantity": NumericalFeature(
        name="quantity",
        feature_type=FeatureType.FLOAT_RESCALED,
        preferred_distribution="poisson",
    ),

    # --- Categorical features with semantic embeddings ---
    # Larger embedding for the complex category space; vocabulary capped at 1000.
    "category": CategoricalFeature(
        name="category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_dim=32,
        max_vocabulary_size=1000,
    ),
    "brand": CategoricalFeature(
        name="brand",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_dim=16,
    ),

    # --- Text features with different token limits ---
    # Descriptions get the longer limit and are emitted as text embeddings.
    "description": TextFeature(
        name="description",
        feature_type=FeatureType.TEXT,
        max_tokens=100,
        output_mode="embedding",
    ),
    # Titles are shorter, so a smaller token limit is used.
    "title": TextFeature(
        name="title",
        feature_type=FeatureType.TEXT,
        max_tokens=50,
    ),

    # --- Date feature with seasonality ---
    # Adds seasonal indicators and day-of-week information.
    "sale_date": DateFeature(
        name="sale_date",
        feature_type=FeatureType.DATE,
        add_season=True,
        add_day_of_week=True,
    ),
}

2️⃣ Creating the Advanced Preprocessor

# Build one preprocessor that combines several of KDP's advanced options.
preprocessor = PreprocessingModel(
    path_data="ecommerce_data.csv",
    features_specs=features,
    output_mode=OutputModeOptions.CONCAT,

    # Distribution-aware processing, using 1000 bins
    use_distribution_aware=True,
    distribution_aware_bins=1000,

    # Transformer blocks (2 blocks, 4 heads) to model complex interactions
    # across all features
    transfo_nr_blocks=2,
    transfo_nr_heads=4,
    transfo_ff_units=64,
    transfo_dropout_rate=0.1,
    transfo_placement="all_features",

    # Tabular attention, multi-resolution placement, to capture
    # relationships between features
    tabular_attention=True,
    tabular_attention_placement="multi_resolution",
    tabular_attention_heads=3,
    tabular_attention_dim=32,

    # Feature selection over all features, to focus on what matters
    feature_selection_placement="all_features",
    feature_selection_units=32,

    # Caching for better performance
    enable_caching=True,
)

# Build the preprocessor and pull the model out of the returned mapping.
result = preprocessor.build_preprocessor()
model = result["model"]

3️⃣ Using the Preprocessor

# Load the evaluation data into a DataFrame.
# NOTE: "ecommerce_test_data.csv" is a placeholder path -- point it at a CSV
# whose columns match the features defined above.
test_df = pd.read_csv("ecommerce_test_data.csv")

# Generate predictions on test data, fed to the model in batches of 32
test_batch = tf.data.Dataset.from_tensor_slices(dict(test_df)).batch(32)
processed_features = model.predict(test_batch)

# Analyze feature importance: sort by score, highest first, and show the top 3.
# Assumes get_feature_importances() returns a name -> score mapping.
feature_importances = preprocessor.get_feature_importances()
print("Top features:", sorted(
    feature_importances.items(),
    key=lambda x: x[1],
    reverse=True
)[:3])

# Visualize the model architecture (written to ecommerce_model.png)
preprocessor.plot_model("ecommerce_model.png")

E-commerce Model Architecture

📊 Financial Time Series Analysis

This example demonstrates using KDP to preprocess financial time series data for forecasting, focusing on distribution-aware encoding and feature interactions.

Setting Up Features

# Feature specifications for the market dataset, keyed by feature name.
features = {
    # --- Price data with custom distribution handling ---
    # Closing price: rescaled and embedded (16 dimensions); a heavy-tailed
    # distribution is preferred to handle market data.
    "close_price": NumericalFeature(
        name="close_price",
        feature_type=FeatureType.FLOAT_RESCALED,
        use_embedding=True,
        embedding_dim=16,
        preferred_distribution="heavy_tailed",
    ),
    # Volume: log-normal is common for volume data.
    "volume": NumericalFeature(
        name="volume",
        feature_type=FeatureType.FLOAT_RESCALED,
        preferred_distribution="log_normal",
    ),

    # --- Technical indicators ---
    # RSI is already in the 0-100 range.
    "rsi": NumericalFeature(
        name="rsi",
        feature_type=FeatureType.FLOAT_NORMALIZED,
        preferred_distribution="normal",
    ),
    "macd": NumericalFeature(
        name="macd",
        feature_type=FeatureType.FLOAT_NORMALIZED,
        preferred_distribution="normal",
    ),

    # --- Categorical market data ---
    "market_regime": CategoricalFeature(
        name="market_regime",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_dim=8,
    ),

    # --- Date information with market-specific features ---
    # Day of week: markets behave differently on different days.
    # Month: captures seasonal effects.
    "date": DateFeature(
        name="date",
        feature_type=FeatureType.DATE,
        add_day_of_week=True,
        add_month=True,
    ),
}

# Create the preprocessor for the financial features defined above
financial_preprocessor = PreprocessingModel(
    path_data="market_data.csv",
    features_specs=features,

    # Enable advanced numerical embeddings for better pattern detection
    use_numerical_embedding=True,
    numerical_embedding_dim=32,

    # Enable distribution-aware encoding for market data
    use_distribution_aware=True,

    # Enable feature selection to identify important signals.
    # The placement is written in lowercase, the same form the e-commerce
    # example uses for its placement options ("all_features").
    feature_selection_placement="all_features",

    # Financial time series benefits from attention mechanisms
    tabular_attention=True,
    tabular_attention_heads=8
)

# Build the preprocessor
financial_result = financial_preprocessor.build_preprocessor()

👥 User Behavior Analysis

This example shows how to preprocess user behavior data for churn prediction or personalization algorithms.

Setting Up Features

# Feature specifications for the user behavior dataset, keyed by feature name.
features = {
    # --- User demographics ---
    "age": NumericalFeature(
        name="age",
        feature_type=FeatureType.FLOAT_NORMALIZED,
    ),
    "gender": CategoricalFeature(
        name="gender",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_dim=4,
    ),
    # Vocabulary capped at 500 to keep only the top locations.
    "location": CategoricalFeature(
        name="location",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_dim=16,
        max_vocabulary_size=500,
    ),

    # --- Behavioral metrics ---
    # Days since last login often follows an exponential distribution.
    "days_since_last_login": NumericalFeature(
        name="days_since_last_login",
        feature_type=FeatureType.FLOAT_RESCALED,
        preferred_distribution="exponential",
    ),
    # Many users have zero purchases, hence the zero-inflated preference.
    "total_purchases": NumericalFeature(
        name="total_purchases",
        feature_type=FeatureType.FLOAT_RESCALED,
        preferred_distribution="zero_inflated",
    ),
    "average_session_time": NumericalFeature(
        name="average_session_time",
        feature_type=FeatureType.FLOAT_RESCALED,
    ),

    # --- Categorical engagement data ---
    "subscription_tier": CategoricalFeature(
        name="subscription_tier",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_dim=8,
    ),

    # --- Textual data ---
    "user_feedback": TextFeature(
        name="user_feedback",
        feature_type=FeatureType.TEXT,
        max_tokens=150,
    ),

    # --- Time-based features ---
    "account_creation_date": DateFeature(
        name="account_creation_date",
        feature_type=FeatureType.DATE,
        add_season=True,
    ),
}

# Create the preprocessor for the user behavior features defined above
user_preprocessor = PreprocessingModel(
    path_data="user_data.csv",
    features_specs=features,

    # Enable mixture of experts for specialized handling
    use_feature_moe=True,
    feature_moe_num_experts=5,

    # Enable feature selection to identify churn indicators.
    # The placement is written in lowercase, the same form the e-commerce
    # example uses for its placement options ("all_features").
    feature_selection_placement="all_features",

    # Enable distribution-aware encoding
    use_distribution_aware=True,

    # User behavior data benefits from transformer blocks
    transfo_nr_blocks=2,
    transfo_nr_heads=4
)

# Build the preprocessor
user_result = user_preprocessor.build_preprocessor()

💡 Pro Tips

  1. Start Simple, Then Add Complexity
     - Begin with basic feature definitions
     - Add advanced features one at a time
     - Test each addition's impact

  2. Monitor Resource Usage
     - Complex models can be memory-intensive
     - Use batch processing for large datasets
     - Monitor preprocessing time and adjust accordingly

  3. Balance Preprocessing Complexity
     - More complex preprocessing isn't always better
     - Focus on features that provide the most value
     - Use get_feature_importances() to identify key features

  4. Save Your Preprocessing Pipeline

    # Save the complete pipeline for reuse; the same path is passed to
    # load_model below.
    preprocessor.save_model("my_preprocessor.keras")
    
    # Load it when needed.
    # NOTE(review): confirm against the KDP API what load_model returns
    # (a single object or a tuple) before relying on loaded_preprocessor.
    from kdp import PreprocessingModel
    loaded_preprocessor = PreprocessingModel.load_model("my_preprocessor.keras")