"""Sklearn-style API for Biterm Topic Model."""
__all__ = ["BTMClassifier"]
from typing import List, Union, Optional, Dict, Any
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.validation import check_is_fitted
from ._btm import BTM
from ._util import get_biterms
[docs]
class BTMClassifier(BaseEstimator, TransformerMixin):
"""Sklearn-compatible Biterm Topic Model for short text analysis.
This class provides a scikit-learn compatible interface for the Biterm Topic Model,
designed specifically for short text analysis such as tweets, reviews, and messages.
Unlike traditional topic models like LDA, BTM extracts biterms (word pairs) from
the entire corpus to overcome data sparsity issues in short texts.
The BTMClassifier automatically handles text preprocessing, vectorization, biterm
generation, model training, and inference, making topic modeling as simple as
calling fit() and transform().
Parameters
----------
n_topics : int, default=8
Number of topics to extract from the corpus.
alpha : float, default=None
Dirichlet prior parameter for topic distribution. Controls topic sparsity
in documents. Higher values create more uniform topic distributions.
If None, uses 50/n_topics as recommended in the original paper.
beta : float, default=0.01
Dirichlet prior parameter for word distribution within topics. Controls
topic-word sparsity. Lower values create more focused topics.
max_iter : int, default=600
Maximum number of Gibbs sampling iterations for model training.
More iterations generally improve convergence but increase training time.
random_state : int, default=None
Random seed for reproducible results. Set to an integer for consistent
results across runs. ``None`` and ``0`` are both passed to the underlying
BTM model as ``seed=0``.
window_size : int, default=15
Window size for biterm generation. Biterms are extracted from word pairs
within this window distance in each document.
has_background : bool, default=False
Whether to use a background topic to model highly frequent words that
appear across many topics (e.g., stop words).
coherence_window : int, default=20
Number of top words used for coherence calculation. This affects the
semantic coherence metric computation.
vectorizer_params : dict, default=None
Additional parameters to pass to the internal CountVectorizer for text
preprocessing. Common options include min_df, max_df, stop_words, etc.
epsilon : float, default=1e-10
Small numerical constant to prevent division by zero and improve
numerical stability in probability calculations.
Attributes
----------
model_ : BTM
The fitted BTM model instance containing learned parameters.
vocabulary_ : numpy.ndarray
Vocabulary learned from training data (words corresponding to features).
feature_names_out_ : numpy.ndarray
Alias for vocabulary_ for sklearn compatibility.
n_features_in_ : int
Number of features (vocabulary size) after preprocessing.
vectorizer_ : CountVectorizer
The fitted vectorizer used for text preprocessing.
Methods
-------
fit(X, y=None)
Fit the BTM model to documents.
transform(X, infer_type='sum_b')
Transform documents to topic probability distributions.
fit_transform(X, y=None, infer_type='sum_b')
Fit model and transform documents in one step.
get_topic_words(topic_id=None, n_words=10)
Get top words for topics.
get_document_topics(X, threshold=0.1)
Get dominant topics for documents.
score(X, y=None)
Return mean coherence score across topics.
Examples
--------
Basic usage:
>>> import bitermplus as btm
>>> texts = [
... "machine learning algorithms are powerful",
... "deep learning neural networks process data",
... "natural language processing understands text"
... ]
>>> model = btm.BTMClassifier(n_topics=2, random_state=42)
>>> model.fit(texts)
BTMClassifier(n_topics=2, random_state=42)
>>> doc_topics = model.transform(texts)
>>> print(f"Shape: {doc_topics.shape}")
Shape: (3, 2)
Getting topic words:
>>> topic_words = model.get_topic_words(n_words=5)
>>> for topic_id, words in topic_words.items():
... print(f"Topic {topic_id}: {', '.join(words)}")
Using with sklearn pipelines:
>>> from sklearn.pipeline import Pipeline
>>> from sklearn.preprocessing import FunctionTransformer
>>> pipeline = Pipeline([
... ('preprocess', FunctionTransformer(lambda x: [s.lower() for s in x])),
... ('btm', btm.BTMClassifier(n_topics=3, random_state=42))
... ])
>>> topics = pipeline.fit_transform(texts)
References
----------
Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013). A biterm topic model for
short texts. In Proceedings of the 22nd international conference on World
Wide Web (pp. 1445-1456).
See Also
--------
BTM : Low-level BTM implementation
get_words_freqs : Extract word frequencies from documents
get_biterms : Generate biterms from vectorized documents
"""
[docs]
def __init__(
    self,
    n_topics: int = 8,
    alpha: Optional[float] = None,
    beta: float = 0.01,
    max_iter: int = 600,
    random_state: Optional[int] = None,
    window_size: int = 15,
    has_background: bool = False,
    coherence_window: int = 20,
    vectorizer_params: Optional[Dict[str, Any]] = None,
    epsilon: float = 1e-10,
):
    """Store the hyper-parameters and reject non-positive values.

    ``alpha`` is resolved here: when ``None`` is given, ``50.0 / n_topics``
    is stored in its place.

    Raises
    ------
    ValueError
        If ``_validate_params`` rejects a value, or if the resolved
        ``alpha`` is not positive.
    """
    # Sampler hyper-parameters.
    self.n_topics = n_topics
    self.beta = beta
    self.max_iter = max_iter
    self.epsilon = epsilon

    # Biterm extraction and coherence settings.
    self.window_size = window_size
    self.coherence_window = coherence_window
    self.has_background = has_background

    # Reproducibility and text preprocessing.
    self.random_state = random_state
    self.vectorizer_params = vectorizer_params

    # Must run before the division below so that a non-positive n_topics
    # is reported as a ValueError rather than a ZeroDivisionError.
    self._validate_params()

    if alpha is None:
        alpha = 50.0 / n_topics
    self.alpha = alpha

    # alpha can only be checked once its default has been resolved.
    if self.alpha <= 0:
        raise ValueError("alpha must be positive")
def _validate_params(self):
    """Validate model parameters.

    Checks that ``n_topics``, ``beta``, ``max_iter``, ``window_size``,
    ``coherence_window`` and ``epsilon`` are strictly positive, in that
    order, and then that ``alpha`` is strictly positive when it has
    already been assigned and is not ``None``.

    Raises
    ------
    ValueError
        With the message ``"<name> must be positive"`` for the first
        offending parameter.
    """
    required_positive = (
        "n_topics",
        "beta",
        "max_iter",
        "window_size",
        "coherence_window",
        "epsilon",
    )
    for name in required_positive:
        if getattr(self, name) <= 0:
            raise ValueError(f"{name} must be positive")

    # __init__ calls this method before ``alpha`` is assigned, so the
    # attribute may be missing; a ``None`` value means "not resolved yet".
    # Checking it here lets fit() catch a bad alpha set via set_params().
    alpha = getattr(self, "alpha", None)
    if alpha is not None and alpha <= 0:
        raise ValueError("alpha must be positive")
def _setup_vectorizer(self):
    """Build an unfitted CountVectorizer for text preprocessing.

    Starts from the built-in defaults below and overlays whatever the
    user supplied in ``vectorizer_params``; user values win on conflict.
    """
    settings = dict(
        lowercase=True,
        token_pattern=r"\b[a-zA-Z][a-zA-Z0-9]*\b",
        min_df=1,
        max_df=0.95,
        stop_words="english",
    )
    user_overrides = self.vectorizer_params
    if user_overrides:
        settings.update(user_overrides)
    return CountVectorizer(**settings)
def _get_vectorized_docs(self, X: List[str]) -> List[np.ndarray]:
    """Encode each document as an array of vocabulary ids.

    Tokens come from the fitted vectorizer's own analyzer, so they are
    produced exactly the way the vocabulary was built. Tokens missing
    from the vocabulary are dropped, and a ``None`` document is treated
    as an empty string.

    Returns
    -------
    list of numpy.ndarray
        One ``int32`` array per document, in input order.
    """
    tokenize = self.vectorizer_.build_analyzer()
    word_to_id = self.vectorizer_.vocabulary_

    def encode(text):
        tokens = tokenize("" if text is None else text)
        ids = [word_to_id[token] for token in tokens if token in word_to_id]
        return np.array(ids, dtype=np.int32)

    return [encode(text) for text in X]
[docs]
def fit(self, X: Union[List[str], pd.Series], y=None, verbose: bool = False):
    """Fit the BTM model to documents.

    Sets ``vectorizer_``, ``vocabulary_``, ``feature_names_out_``,
    ``n_features_in_`` and ``model_`` on the instance.

    Parameters
    ----------
    X : array-like of shape (n_documents,)
        Documents to fit the model on. Each element should be a string.
    y : Ignored
        Not used, present for sklearn compatibility.
    verbose : bool, default=False
        Whether to show a progress bar during training.

    Returns
    -------
    self : BTMClassifier
        Returns the instance itself.

    Raises
    ------
    ValueError
        If a parameter is not positive, if ``X`` is a single string
        rather than a collection of documents, or if ``X`` is empty.
    """
    # Re-validate in case params were changed via set_params() after __init__
    self._validate_params()

    # alpha may have been reset through set_params(); resolve a None to the
    # documented 50 / n_topics default and never hand a non-positive value
    # to the sampler.
    alpha = self.alpha
    if alpha is None:
        alpha = 50.0 / self.n_topics
    if alpha <= 0:
        raise ValueError("alpha must be positive")

    # A bare string is iterable, so list(X) would silently turn it into
    # one "document" per character.
    if isinstance(X, str):
        raise ValueError(
            "Iterable over raw text documents expected, string object received."
        )

    # Convert input to list of strings
    if isinstance(X, pd.Series):
        docs = X.tolist()
    elif isinstance(X, list):
        docs = X
    else:
        docs = list(X)
    if not docs:
        raise ValueError("Input documents cannot be empty")

    # Vectorize documents using the configured vectorizer
    self.vectorizer_ = self._setup_vectorizer()
    doc_term_matrix = self.vectorizer_.fit_transform(docs)
    vocabulary = np.array(self.vectorizer_.get_feature_names_out())

    # Store vocabulary information
    self.vocabulary_ = vocabulary
    self.feature_names_out_ = vocabulary
    self.n_features_in_ = len(vocabulary)

    # Prepare documents and biterms using the vectorizer's own analyzer
    # so tokenization (lowercasing, token pattern, stop words) is consistent
    docs_vec = self._get_vectorized_docs(docs)
    biterms = get_biterms(docs_vec, win=self.window_size)

    # Adjust coherence window to not exceed vocabulary size
    effective_coherence_window = min(self.coherence_window, len(vocabulary))

    # Initialize and fit BTM model
    self.model_ = BTM(
        doc_term_matrix,
        vocabulary,
        T=self.n_topics,
        M=effective_coherence_window,
        alpha=alpha,
        beta=self.beta,
        # Both None and 0 end up as seed=0.
        seed=self.random_state or 0,
        win=self.window_size,
        has_background=self.has_background,
        epsilon=self.epsilon,
    )
    self.model_.fit(biterms, iterations=self.max_iter, verbose=verbose)
    return self
[docs]
def get_topic_words(
    self, topic_id: Optional[int] = None, n_words: int = 10
) -> Union[List[str], Dict[int, List[str]]]:
    """Get top words for topics.

    Words are ranked by their weight in the fitted model's topic-word
    matrix, highest first.

    Parameters
    ----------
    topic_id : int, optional
        If provided, return words for this topic only.
        If None, return words for all topics.
    n_words : int, default=10
        Number of top words to return per topic. ``0`` yields empty
        lists; values larger than the vocabulary return every word.

    Returns
    -------
    topic_words : list or dict
        If topic_id is provided, returns list of top words for that topic.
        Otherwise, returns dict mapping topic_id to list of words.

    Raises
    ------
    ValueError
        If ``n_words`` is negative or ``topic_id`` is outside
        ``[0, n_topics)``.
    """
    check_is_fitted(self, "model_")
    if n_words < 0:
        raise ValueError("n_words must be non-negative")

    topic_word_matrix = self.model_.matrix_topics_words_

    def top_words(weights):
        # Reverse first, then truncate. Slicing with [-n_words:] would
        # return the whole vocabulary for n_words == 0, because -0 == 0.
        ranked = np.argsort(weights)[::-1][:n_words]
        return self.vocabulary_[ranked].tolist()

    if topic_id is None:
        return {t: top_words(topic_word_matrix[t]) for t in range(self.n_topics)}

    if not 0 <= topic_id < self.n_topics:
        raise ValueError(f"topic_id must be between 0 and {self.n_topics - 1}")
    return top_words(topic_word_matrix[topic_id])
[docs]
def get_document_topics(
    self, X: Union[List[str], pd.Series], threshold: float = 0.1
) -> List[List[int]]:
    """List, per document, the topics whose probability reaches a threshold.

    Parameters
    ----------
    X : array-like of shape (n_documents,)
        Documents to analyze; passed unchanged to ``transform``.
    threshold : float, default=0.1
        A topic is kept when its probability is greater than or equal
        to this value.

    Returns
    -------
    doc_topics : list of list of int
        One list of topic indices per document, in ascending index
        order. A list is empty when no topic reaches the threshold.
    """
    probabilities = self.transform(X)
    return [
        [topic for topic, weight in enumerate(row) if weight >= threshold]
        for row in probabilities
    ]
@property
def coherence_(self) -> np.ndarray:
    """Coherence scores reported by the fitted model (``model_.coherence_``).

    ``check_is_fitted`` runs first, so the estimator must have been fitted.
    """
    check_is_fitted(self, "model_")
    fitted_model = self.model_
    return fitted_model.coherence_
@property
def perplexity_(self) -> float:
    """Perplexity reported by the fitted model (``model_.perplexity_``).

    ``check_is_fitted`` runs first, so the estimator must have been fitted.
    """
    check_is_fitted(self, "model_")
    fitted_model = self.model_
    return fitted_model.perplexity_
@property
def topic_word_matrix_(self) -> np.ndarray:
    """Topic-word matrix of the fitted model (``model_.matrix_topics_words_``).

    ``check_is_fitted`` runs first, so the estimator must have been fitted.
    """
    check_is_fitted(self, "model_")
    fitted_model = self.model_
    return fitted_model.matrix_topics_words_
[docs]
def score(self, X: Union[List[str], pd.Series], y=None) -> float:
    """Return the mean of the fitted model's coherence scores.

    Parameters
    ----------
    X : array-like of shape (n_documents,)
        Documents to score. Not read by this method; the value is
        computed from the ``coherence_`` property alone.
    y : Ignored
        Not used, present for sklearn compatibility.

    Returns
    -------
    score : float
        Arithmetic mean of ``coherence_`` as a Python float.
    """
    check_is_fitted(self, "model_")
    per_topic_coherence = self.coherence_
    mean_coherence = np.mean(per_topic_coherence)
    return float(mean_coherence)