Date Features
Date Features in KDP
Extract powerful patterns from temporal data like timestamps, dates, and time series.
Overview
Date features transform timestamps and dates into ML-ready representations that capture important temporal patterns and seasonality. KDP automatically handles date parsing and formatting, enabling your models to learn from time-based signals.
Date Processing Approaches
Component Extraction
Breaking dates into day, month, year, etc.
Cyclical Encoding
Representing cyclic time components (hour, weekday)
Temporal Distances
Computing time since reference points
Seasonality Analysis
Capturing seasonal patterns and trends
Basic Usage
from kdp import PreprocessingModel, FeatureType
# Quick date feature definition: each column name maps to FeatureType.DATE
features = {
    "purchase_date": FeatureType.DATE,  # Transaction dates
    "signup_date": FeatureType.DATE,  # User signup dates
    "last_active": FeatureType.DATE  # Last activity timestamps
}
# Create your preprocessor from the CSV file and the feature specs above
preprocessor = PreprocessingModel(
    path_data="customer_data.csv",
    features_specs=features
)
Advanced Configuration
For more control over date processing, use the `DateFeature` class:
from kdp.features import DateFeature
# Three DateFeature configurations, keyed by column name
features = {
    # Transaction date with component extraction
    "transaction_date": DateFeature(
        name="transaction_date",
        feature_type=FeatureType.DATE,
        add_day_of_week=True,  # Extract day of week
        add_month=True,  # Extract month
        add_quarter=True,  # Extract quarter
        cyclical_encoding=True  # Use sine/cosine encoding for cyclical features
    ),
    # User signup date with time since reference
    "signup_date": DateFeature(
        name="signup_date",
        feature_type=FeatureType.DATE,
        add_time_since_reference=True,
        reference_date="2020-01-01"  # Reference point
    ),
    # Event timestamp with hour component
    "event_timestamp": DateFeature(
        name="event_timestamp",
        feature_type=FeatureType.DATE,
        add_hour=True,  # Extract hour
        add_day_of_week=True,  # Extract day of week
        add_is_weekend=True  # Add weekend indicator
    )
}
Key Configuration Parameters
| Parameter | Description | Default | Options |
|---|---|---|---|
| `add_year` | Extract year component | False | Boolean |
| `add_month` | Extract month component | False | Boolean |
| `add_day` | Extract day component | False | Boolean |
| `add_day_of_week` | Extract day of week | False | Boolean |
| `add_hour` | Extract hour component | False | Boolean |
| `cyclical_encoding` | Use sine/cosine encoding | False | Boolean |
| `add_is_weekend` | Add weekend indicator | False | Boolean |
Powerful Features
Cyclical Encoding
Properly represent cyclical time components (like hour, day of week) using sine/cosine transformations:
# Configure cyclical encoding for time components
date_feature = DateFeature(
    name="event_time",
    feature_type=FeatureType.DATE,
    add_hour=True,
    add_day_of_week=True,
    cyclical_encoding=True  # Enable cyclical encoding
)
# Create preprocessor with cyclical date features;
# the features_specs key matches the feature's name ("event_time")
preprocessor = PreprocessingModel(
    path_data="events.csv",
    features_specs={"event_time": date_feature}
)
Time-Since Features
Calculate time since reference points for meaningful temporal distances:
# Compute days since reference date
date_feature = DateFeature(
    name="signup_date",
    feature_type=FeatureType.DATE,
    add_time_since_reference=True,
    reference_date="2020-01-01",  # Fixed reference
    time_since_unit="days"  # Unit for calculation
)
# Compute time since multiple references:
# one feature uses the fixed date above, the other uses "today"
preprocessor = PreprocessingModel(
    path_data="user_data.csv",
    features_specs={
        "signup_date": date_feature,
        # NOTE(review): unlike signup_date, this feature omits feature_type;
        # presumably DateFeature defaults it to FeatureType.DATE — confirm.
        "last_purchase": DateFeature(
            name="last_purchase",
            add_time_since_reference=True,
            reference_date="today",  # Dynamic reference (current date)
            time_since_unit="days"
        )
    }
)
Real-World Examples
E-commerce Purchase Analysis
# Analyze purchase patterns over time
from kdp.features import DateFeature, NumericalFeature, CategoricalFeature
# Preprocessor combining date, categorical and numerical features with a cross
preprocessor = PreprocessingModel(
    path_data="ecommerce_data.csv",
    features_specs={
        # Purchase date with rich time components
        "purchase_date": DateFeature(
            name="purchase_date",
            add_day_of_week=True,
            add_hour=True,
            add_month=True,
            add_is_weekend=True,
            cyclical_encoding=True
        ),
        # User signup date to determine user tenure
        "user_signup_date": DateFeature(
            name="user_signup_date",
            add_time_since_reference=True,
            reference_date="today",
            time_since_unit="days"
        ),
        # Additional features
        "product_category": CategoricalFeature(
            name="product_category",
            feature_type=FeatureType.STRING_CATEGORICAL
        ),
        "purchase_amount": NumericalFeature(
            name="purchase_amount",
            feature_type=FeatureType.FLOAT_RESCALED
        )
    },
    # Define crosses to capture time-based patterns
    # NOTE(review): "purchase_date_day_of_week" presumably names the component
    # derived via add_day_of_week=True, and 16 a cross size — confirm both.
    feature_crosses=[
        ("purchase_date_day_of_week", "product_category", 16)
    ]
)
Time Series Forecasting
# Time series feature extraction for forecasting
preprocessor = PreprocessingModel(
    path_data="sensor_readings.csv",
    features_specs={
        # Timestamp with multiple components
        "timestamp": DateFeature(
            name="timestamp",
            add_year=True,
            add_month=True,
            add_day=True,
            add_hour=True,
            add_day_of_week=True,
            cyclical_encoding=True
        ),
        # Numerical features to predict
        "value": NumericalFeature(
            name="value",
            feature_type=FeatureType.FLOAT_RESCALED,
            use_distribution_aware=True
        ),
        # Additional context features
        "sensor_id": CategoricalFeature(
            name="sensor_id",
            feature_type=FeatureType.STRING_CATEGORICAL
        )
    },
    # Enable tabular attention for discovering temporal patterns
    tabular_attention=True
)
Pro Tips
Date Format Handling
KDP automatically handles common date formats. For non-standard formats, convert the dates to a standard format before passing the data to KDP:
# Handle custom date formats
from datetime import datetime
import pandas as pd
# Convert dates to standard format before feeding to KDP
def standardize_date(date_str):
    """Convert a ``DD-Mon-YYYY`` date string to ISO ``YYYY-MM-DD`` form.

    Args:
        date_str: Raw date value, e.g. ``"05-Jan-2021"``.

    Returns:
        The date formatted as ``YYYY-MM-DD`` when ``date_str`` matches the
        ``%d-%b-%Y`` pattern; otherwise ``date_str`` itself, unchanged, so
        that values already in another format (or missing values) pass
        through untouched.
    """
    try:
        # Try parsing custom format
        dt = datetime.strptime(date_str, "%d-%b-%Y")
    except (ValueError, TypeError):
        # ValueError: a string in some other format.
        # TypeError: a non-string cell such as None or NaN.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return date_str
    return dt.strftime("%Y-%m-%d")
# Apply standardization to your data
data = pd.read_csv("custom_dates.csv")
# Write the converted values to a new column; "custom_date" is left as-is
data["standard_date"] = data["custom_date"].apply(standardize_date)
# Use standardized dates in KDP
# NOTE(review): path_data receives a DataFrame here, whereas the other
# examples pass a CSV path — confirm PreprocessingModel accepts in-memory data.
preprocessor = PreprocessingModel(
    path_data=data,
    features_specs={"standard_date": FeatureType.DATE}
)
Feature Selection
Use feature selection to identify important temporal patterns:
# Determine which date components matter most
preprocessor = PreprocessingModel(
    path_data="events.csv",
    features_specs={
        "event_date": DateFeature(
            name="event_date",
            # Extract all potentially relevant components
            add_year=True,
            add_quarter=True,
            add_month=True,
            add_day=True,
            add_day_of_week=True,
            add_hour=True,
            add_is_weekend=True
        )
    },
    # Enable feature selection to identify important components
    use_feature_selection=True,
    feature_selection_strategy="gradient_based"
)
# After training, check feature importance
# NOTE(review): assumes build_preprocessor() returns a mapping that contains
# a "feature_importance" entry — confirm against the KDP API.
result = preprocessor.build_preprocessor()
importance = result["feature_importance"]
print("Most important date components:", importance)
Cross Features
Create crosses with date components to capture context-dependent patterns:
# Cross date components with categorical features
preprocessor = PreprocessingModel(
    path_data="transactions.csv",
    features_specs={
        # Date with components
        "transaction_date": DateFeature(
            name="transaction_date",
            add_day_of_week=True,
            add_hour=True,
            add_is_weekend=True
        ),
        # Categorical context
        "store_location": FeatureType.STRING_CATEGORICAL,
        "product_category": FeatureType.STRING_CATEGORICAL
    },
    # Define crosses to capture contextual patterns
    # NOTE(review): the "transaction_date_*" names presumably refer to the
    # components enabled above via the add_* flags — confirm the naming scheme.
    feature_crosses=[
        # Weekend shopping differs by location
        ("transaction_date_is_weekend", "store_location", 16),
        # Day of week impacts product category popularity
        ("transaction_date_day_of_week", "product_category", 32),
        # Hour of day impacts product selections
        ("transaction_date_hour", "product_category", 32)
    ]
)
Handling Timezones
Standardize timezone handling for consistent date processing:
# Standardize timezones before processing
import pandas as pd
from datetime import datetime
import pytz
# Convert timestamps to a standard timezone
def standardize_timezone(timestamp_str, from_tz='UTC', to_tz='America/New_York'):
    """Convert an ISO 8601 timestamp string into the ``to_tz`` timezone.

    Args:
        timestamp_str: ISO 8601 timestamp text; a ``Z`` suffix is accepted.
            Missing values (``None``/NaN) and blank strings are treated as
            absent.
        from_tz: Timezone name assumed when the timestamp carries no UTC
            offset. Ignored when the timestamp already has an offset.
        to_tz: Timezone name the result is converted to.

    Returns:
        The converted timestamp as an ISO 8601 string, or ``None`` when the
        input is missing or blank.

    Raises:
        ValueError: If the text is not a valid ISO 8601 timestamp.
    """
    if pd.isna(timestamp_str):
        return None
    # Blank cells are missing data too; without this guard fromisoformat
    # raises ValueError on "" or whitespace-only text.
    cleaned = timestamp_str.strip()
    if not cleaned:
        return None
    # Parse timestamp and set timezone ('Z' is rewritten as an explicit offset)
    dt = datetime.fromisoformat(cleaned.replace('Z', '+00:00'))
    if dt.tzinfo is None:
        # Naive timestamp: interpret it as wall-clock time in from_tz
        dt = pytz.timezone(from_tz).localize(dt)
    # Convert to target timezone
    dt = dt.astimezone(pytz.timezone(to_tz))
    return dt.isoformat()
# Apply timezone standardization
data = pd.read_csv("global_events.csv")
# Convert every "event_timestamp" value and store it in a new column
data["standardized_time"] = data["event_timestamp"].apply(
    lambda x: standardize_timezone(x, from_tz='UTC', to_tz='America/New_York')
)
# Use standardized timestamps in KDP
# NOTE(review): path_data receives a DataFrame here, whereas the other
# examples pass a CSV path — confirm PreprocessingModel accepts in-memory data.
preprocessor = PreprocessingModel(
    path_data=data,
    features_specs={"standardized_time": FeatureType.DATE}
)
Model Architecture
KDP processes dates by extracting components, applying appropriate transformations, and then combining them into a unified representation that captures temporal patterns.