Text Features
Text Features in KDP
Transform textual data into meaningful features with advanced text processing techniques.
Overview
Text features represent natural language data like product descriptions, user reviews, comments, and other forms of unstructured text. KDP provides powerful tools to convert raw text into compact, meaningful representations that capture semantic meaning and context.
Text Processing Approaches
Tokenization
Breaking text into words, subwords, or characters
Vectorization
Converting tokens into numerical representations
Embeddings
Mapping tokens to dense vector spaces that capture semantics
Sequence Handling
Managing variable-length text with padding and truncation
Basic Usage
from kdp import PreprocessingModel, FeatureType
# Declare each raw text column as a TEXT feature (default text settings)
features = {
    "product_description": FeatureType.TEXT,
    "user_review": FeatureType.TEXT,
    "comment": FeatureType.TEXT,
}

# Build the preprocessor from the CSV file and the feature specs above
preprocessor = PreprocessingModel(
    path_data="text_data.csv",
    features_specs=features,
)
Advanced Configuration
For more control over text processing, use the TextFeature class:
from kdp import PreprocessingModel, FeatureType, TextFeature
# Detailed text feature configuration
features = {
    # Basic text feature: the enum shorthand, no per-feature options
    "short_comment": FeatureType.TEXT,
    # Full configuration with TextFeature
    "product_description": TextFeature(
        name="product_description",
        max_tokens=10000,         # Vocabulary size
        embedding_dim=64,         # Embedding dimensionality
        sequence_length=128,      # Max sequence length
        tokenizer="word",         # Tokenization strategy
        ngrams=2,                 # Include bigrams
        output_mode="embedding",  # Return embeddings
    ),
    # Text feature with pre-trained embeddings
    "user_query": TextFeature(
        name="user_query",
        use_pretrained=True,              # Use pre-trained embeddings
        pretrained_name="glove.6B.100d",  # GloVe embeddings
        trainable=False,                  # Freeze embeddings during training
    ),
    # Multilingual text processing
    "multilingual_text": TextFeature(
        name="multilingual_text",
        use_pretrained=True,
        pretrained_name="multilingual",  # Multilingual embeddings
        # Same keyword as the other examples and the parameter table
        # (the original used `max_sequence_length` here).
        sequence_length=256,
    ),
}

preprocessor = PreprocessingModel(
    path_data="text_data.csv",
    features_specs=features,
)
Key Configuration Parameters
| Parameter | Description | Default | Options |
|---|---|---|---|
| max_tokens | Maximum vocabulary size | 10000 | Typically 5K-50K for most applications |
| sequence_length | Maximum sequence length | 64 | Shorter for queries (32-64), longer for documents (128-512) |
| embedding_dim | Size of embedding vectors | 32 | 16-300 depending on complexity of text |
| tokenizer | Tokenization strategy | "word" | "word", "char", "subword" |
| output_mode | Text representation format | "embedding" | "embedding", "int", "binary", "tfidf" |
| ngrams | Include n-grams in tokenization | 1 | 1 (unigrams only), 2 (uni+bigrams), 3 (uni+bi+trigrams) |
Powerful Features
Pre-trained Embeddings
KDP supports several pre-trained embeddings to jump-start your text processing:
# Option 1: GloVe vectors, kept frozen
text_feature = TextFeature(
    name="article_text",
    use_pretrained=True,
    pretrained_name="glove.6B.100d",
    trainable=False,  # Freeze embeddings
)

# Option 2: Word2Vec vectors, fine-tuned during training
text_feature = TextFeature(
    name="article_text",
    use_pretrained=True,
    pretrained_name="word2vec.google.300d",
    trainable=True,  # Fine-tune embeddings
)

# Option 3: BERT for contextual representations, with attention enabled
text_feature = TextFeature(
    name="article_text",
    use_pretrained=True,
    pretrained_name="bert-base-uncased",
    use_attention=True,  # Enable attention mechanism
)
Attention Mechanisms
Enable attention to better capture the context and important parts of text:
# Text feature with self-attention
text_feature = TextFeature(
    name="long_document",
    sequence_length=512,
    use_attention=True,     # Enable attention
    attention_heads=8,      # Multi-head attention
    attention_dropout=0.1,  # Regularization
)

# Create a preprocessor with text attention.
# The spec key matches the feature's `name`, as in the other examples
# (the original used the key "document" for a feature named "long_document").
preprocessor = PreprocessingModel(
    path_data="documents.csv",
    features_specs={"long_document": text_feature},
    text_attention_mode="self",  # Self-attention mode
)
Real-World Examples
Sentiment Analysis from Product Reviews
# Preprocessor for sentiment analysis on product reviews
preprocessor = PreprocessingModel(
    path_data="reviews.csv",
    features_specs={
        # Free-text review, with attention enabled for key sentiment phrases
        "review_text": TextFeature(
            name="review_text",
            max_tokens=15000,
            embedding_dim=64,
            use_attention=True,
            attention_heads=4,
        ),
        # Non-text metadata columns that accompany each review
        "product_category": FeatureType.STRING_CATEGORICAL,
        "star_rating": FeatureType.FLOAT_NORMALIZED,
        "verified_purchase": FeatureType.BOOLEAN,
    },
    tabular_attention=True,  # Enable attention across all features
)
Document Classification System
# Preprocessor for a document classification system
preprocessor = PreprocessingModel(
    path_data="documents.csv",
    features_specs={
        # Body text: larger vocabulary and longer sequences
        "document_text": TextFeature(
            name="document_text",
            max_tokens=20000,
            sequence_length=256,
            embedding_dim=128,
            tokenizer="subword",  # Better for rare words
            ngrams=3,             # Include n-grams
        ),
        # Title: short text, so a smaller vocabulary and sequence length
        "document_title": TextFeature(
            name="document_title",
            max_tokens=5000,
            sequence_length=32,
        ),
        # Remaining document metadata
        "author": FeatureType.STRING_CATEGORICAL,
        "publication_date": FeatureType.DATE,
    },
)
Pro Tips
Text Cleaning
Clean your text data before feeding it to KDP for better results:
import re
import pandas as pd
# Clean text data before preprocessing
def clean_text(text):
    """Normalize a raw text value for preprocessing.

    Lowercases the text, removes every character that is neither a word
    character nor whitespace, collapses whitespace runs to a single space,
    and trims the ends.

    Args:
        text: The value to clean. Missing values (``None`` or NaN, which is
            how pandas represents empty cells) yield an empty string; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself; checking this way
    # avoids calling .lower() on a float when applied to a DataFrame column.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()                  # Lowercase
    # \w includes the underscore, so "_" survives this step.
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text)     # Collapse whitespace runs
    return text.strip()
# Apply cleaning to your data
data = pd.read_csv("raw_reviews.csv")
data["cleaned_review"] = data["review"].apply(clean_text)

# Every other example passes `path_data` a CSV path, so write the cleaned
# frame to disk and pass that path instead of the DataFrame itself.
cleaned_path = "cleaned_reviews.csv"
data.to_csv(cleaned_path, index=False)

# Use cleaned text in KDP
preprocessor = PreprocessingModel(
    path_data=cleaned_path,
    features_specs={"cleaned_review": FeatureType.TEXT},
)
Choose the Right Sequence Length
Set sequence length based on your text distribution to avoid truncating important information:
import pandas as pd
import numpy as np
# Analyze text length distribution (whitespace-separated tokens per review)
data = pd.read_csv("reviews.csv")
# Empty cells load as NaN (a float), which has no .split(); treat them as
# empty text so they count as length 0 instead of raising.
lengths = data["review"].fillna("").astype(str).str.split().str.len()

# Get statistics (the percentile is computed once and reused below)
p95 = np.percentile(lengths, 95)
print(f"Mean length: {np.mean(lengths)}")
print(f"Median length: {np.median(lengths)}")
print(f"95th percentile: {p95}")

# Choose sequence length based on distribution
# A common approach is to use the 95th percentile
# max(1, ...) keeps the length positive when most reviews are empty.
sequence_length = max(1, int(p95))

# Configure with appropriate length
text_feature = TextFeature(
    name="review",
    sequence_length=sequence_length,
)
Combine Multiple Representations
Use different text representations for the same field to capture different aspects:
# Two representations built from the same "review" text
# NOTE(review): both specs set name="review" while their keys differ;
# confirm which of the two KDP uses to locate the source column.
preprocessor = PreprocessingModel(
    path_data="reviews.csv",
    features_specs={
        # Dense embedding output
        "review_embedding": TextFeature(
            name="review",
            output_mode="embedding",
            embedding_dim=64,
        ),
        # Bag-of-words output (good for keywords)
        "review_bow": TextFeature(
            name="review",
            output_mode="binary",  # Binary bag-of-words
            max_tokens=5000,
        ),
    },
)
Visualize Embeddings
Visualize your text embeddings to understand the semantic space:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Fit the preprocessor, then build it
preprocessor.fit()
result = preprocessor.build_preprocessor()

# Pull the embedding matrix and vocabulary for the "review_text" feature
embeddings = preprocessor.get_text_embeddings("review_text")
words = preprocessor.get_text_vocabulary("review_text")

# Project the embeddings to 2-D with t-SNE
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Plot and label only the first `top_n` entries
top_n = 100
plt.figure(figsize=(12, 10))
shown = embeddings_2d[:top_n]
plt.scatter(shown[:, 0], shown[:, 1])
for i, word in enumerate(words[:top_n]):
    point = (embeddings_2d[i, 0], embeddings_2d[i, 1])
    plt.annotate(word, xy=point)
plt.title("Text Embedding Visualization")
plt.show()
Understanding Text Processing
KDP converts raw text into meaningful vector representations through a series of transformations, from tokenization to final pooling or attention mechanisms.