Source code for bitermplus._api

"""Sklearn-style API for Biterm Topic Model."""

__all__ = ["BTMClassifier"]

from typing import List, Union, Optional, Dict, Any
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.validation import check_is_fitted

from ._btm import BTM
from ._util import get_biterms


[docs] class BTMClassifier(BaseEstimator, TransformerMixin): """Sklearn-compatible Biterm Topic Model for short text analysis. This class provides a scikit-learn compatible interface for the Biterm Topic Model, designed specifically for short text analysis such as tweets, reviews, and messages. Unlike traditional topic models like LDA, BTM extracts biterms (word pairs) from the entire corpus to overcome data sparsity issues in short texts. The BTMClassifier automatically handles text preprocessing, vectorization, biterm generation, model training, and inference, making topic modeling as simple as calling fit() and transform(). Parameters ---------- n_topics : int, default=8 Number of topics to extract from the corpus. alpha : float, default=None Dirichlet prior parameter for topic distribution. Controls topic sparsity in documents. Higher values create more uniform topic distributions. If None, uses 50/n_topics as recommended in the original paper. beta : float, default=0.01 Dirichlet prior parameter for word distribution within topics. Controls topic-word sparsity. Lower values create more focused topics. max_iter : int, default=600 Maximum number of Gibbs sampling iterations for model training. More iterations generally improve convergence but increase training time. random_state : int, default=None Random seed for reproducible results. Set to an integer for consistent results across runs. window_size : int, default=15 Window size for biterm generation. Biterms are extracted from word pairs within this window distance in each document. has_background : bool, default=False Whether to use a background topic to model highly frequent words that appear across many topics (e.g., stop words). coherence_window : int, default=20 Number of top words used for coherence calculation. This affects the semantic coherence metric computation. vectorizer_params : dict, default=None Additional parameters to pass to the internal CountVectorizer for text preprocessing. Common options include min_df, max_df, stop_words, etc. epsilon : float, default=1e-10 Small numerical constant to prevent division by zero and improve numerical stability in probability calculations. Attributes ---------- model_ : BTM The fitted BTM model instance containing learned parameters. vocabulary_ : numpy.ndarray Vocabulary learned from training data (words corresponding to features). feature_names_out_ : numpy.ndarray Alias for vocabulary_ for sklearn compatibility. n_features_in_ : int Number of features (vocabulary size) after preprocessing. vectorizer_ : CountVectorizer The fitted vectorizer used for text preprocessing. Methods ------- fit(X, y=None) Fit the BTM model to documents. transform(X, infer_type='sum_b') Transform documents to topic probability distributions. fit_transform(X, y=None, infer_type='sum_b') Fit model and transform documents in one step. get_topic_words(topic_id=None, n_words=10) Get top words for topics. get_document_topics(X, threshold=0.1) Get dominant topics for documents. score(X, y=None) Return mean coherence score across topics. Examples -------- Basic usage: >>> import bitermplus as btm >>> texts = [ ... "machine learning algorithms are powerful", ... "deep learning neural networks process data", ... "natural language processing understands text" ... ] >>> model = btm.BTMClassifier(n_topics=2, random_state=42) >>> model.fit(texts) BTMClassifier(n_topics=2, random_state=42) >>> doc_topics = model.transform(texts) >>> print(f"Shape: {doc_topics.shape}") Shape: (3, 2) Getting topic words: >>> topic_words = model.get_topic_words(n_words=5) >>> for topic_id, words in topic_words.items(): ... print(f"Topic {topic_id}: {', '.join(words)}") Using with sklearn pipelines: >>> from sklearn.pipeline import Pipeline >>> from sklearn.preprocessing import FunctionTransformer >>> pipeline = Pipeline([ ... ('preprocess', FunctionTransformer(lambda x: [s.lower() for s in x])), ... ('btm', btm.BTMClassifier(n_topics=3, random_state=42)) ... ]) >>> topics = pipeline.fit_transform(texts) References ---------- Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013). A biterm topic model for short texts. In Proceedings of the 22nd international conference on World Wide Web (pp. 1445-1456). See Also -------- BTM : Low-level BTM implementation get_words_freqs : Extract word frequencies from documents get_biterms : Generate biterms from vectorized documents """
[docs] def __init__( self, n_topics: int = 8, alpha: Optional[float] = None, beta: float = 0.01, max_iter: int = 600, random_state: Optional[int] = None, window_size: int = 15, has_background: bool = False, coherence_window: int = 20, vectorizer_params: Optional[Dict[str, Any]] = None, epsilon: float = 1e-10, ): self.n_topics = n_topics self.beta = beta self.max_iter = max_iter self.random_state = random_state self.window_size = window_size self.has_background = has_background self.coherence_window = coherence_window self.vectorizer_params = vectorizer_params self.epsilon = epsilon # Validate parameters before calculating alpha self._validate_params() self.alpha = alpha if alpha is not None else 50.0 / n_topics # Validate alpha after calculation if self.alpha <= 0: raise ValueError("alpha must be positive")
def _validate_params(self): """Validate model parameters.""" if self.n_topics <= 0: raise ValueError("n_topics must be positive") if self.beta <= 0: raise ValueError("beta must be positive") if self.max_iter <= 0: raise ValueError("max_iter must be positive") if self.window_size <= 0: raise ValueError("window_size must be positive") if self.coherence_window <= 0: raise ValueError("coherence_window must be positive") if self.epsilon <= 0: raise ValueError("epsilon must be positive") def _setup_vectorizer(self): """Initialize the vectorizer with default parameters.""" default_params = { "lowercase": True, "token_pattern": r"\b[a-zA-Z][a-zA-Z0-9]*\b", "min_df": 1, "max_df": 0.95, "stop_words": "english", } default_params.update(self.vectorizer_params or {}) return CountVectorizer(**default_params) def _get_vectorized_docs(self, X: List[str]) -> List[np.ndarray]: """Vectorize docs using the fitted vectorizer's own analyzer. This ensures tokenization (lowercasing, token pattern, stop words) is identical to what CountVectorizer used when building the vocabulary. Raw whitespace splitting would silently drop mixed-case words and words containing punctuation that the vectorizer would have tokenized differently. """ analyzer = self.vectorizer_.build_analyzer() vocab_dict = self.vectorizer_.vocabulary_ result = [] for doc in X: if doc is None: doc = "" word_ids = [vocab_dict[w] for w in analyzer(doc) if w in vocab_dict] result.append(np.array(word_ids, dtype=np.int32)) return result
[docs] def fit(self, X: Union[List[str], pd.Series], y=None, verbose: bool = False): """Fit the BTM model to documents. Parameters ---------- X : array-like of shape (n_documents,) Documents to fit the model on. Each element should be a string. y : Ignored Not used, present for sklearn compatibility. verbose : bool, default=False Whether to show a progress bar during training. Returns ------- self : BTMClassifier Returns the instance itself. """ # Re-validate in case params were changed via set_params() after __init__ self._validate_params() # Convert input to list of strings if isinstance(X, pd.Series): X = X.tolist() elif not isinstance(X, list): X = list(X) if len(X) == 0: raise ValueError("Input documents cannot be empty") # Vectorize documents using the configured vectorizer self.vectorizer_ = self._setup_vectorizer() doc_term_matrix = self.vectorizer_.fit_transform(X) vocabulary = np.array(self.vectorizer_.get_feature_names_out()) # Store vocabulary information self.vocabulary_ = vocabulary self.feature_names_out_ = vocabulary self.n_features_in_ = len(vocabulary) # Prepare documents and biterms using the vectorizer's own analyzer # so tokenization (lowercasing, token pattern, stop words) is consistent docs_vec = self._get_vectorized_docs(X) biterms = get_biterms(docs_vec, win=self.window_size) # Adjust coherence window to not exceed vocabulary size effective_coherence_window = min(self.coherence_window, len(vocabulary)) # Initialize and fit BTM model self.model_ = BTM( doc_term_matrix, vocabulary, T=self.n_topics, M=effective_coherence_window, alpha=self.alpha, beta=self.beta, seed=self.random_state or 0, win=self.window_size, has_background=self.has_background, epsilon=self.epsilon, ) self.model_.fit(biterms, iterations=self.max_iter, verbose=verbose) return self
[docs] def transform( self, X: Union[List[str], pd.Series], infer_type: str = "sum_b" ) -> np.ndarray: """Transform documents to topic distribution. Parameters ---------- X : array-like of shape (n_documents,) Documents to transform. infer_type : str, default='sum_b' Inference method. Options: 'sum_b', 'sum_w', 'mix'. Returns ------- doc_topic_matrix : np.ndarray of shape (n_documents, n_topics) Document-topic probability matrix. """ check_is_fitted(self, "model_") # Convert input to list of strings if isinstance(X, pd.Series): X = X.tolist() elif not isinstance(X, list): X = list(X) # Vectorize documents using the fitted vectorizer's analyzer docs_vec = self._get_vectorized_docs(X) # Transform using BTM model return self.model_.transform(docs_vec, infer_type=infer_type, verbose=False)
[docs] def fit_transform( self, X: Union[List[str], pd.Series], y=None, infer_type: str = "sum_b", verbose: bool = False, ) -> np.ndarray: """Fit model and transform documents in one step. Parameters ---------- X : array-like of shape (n_documents,) Documents to fit and transform. y : Ignored Not used, present for sklearn compatibility. infer_type : str, default='sum_b' Inference method. Options: 'sum_b', 'sum_w', 'mix'. verbose : bool, default=False Whether to show a progress bar during training. Returns ------- doc_topic_matrix : np.ndarray of shape (n_documents, n_topics) Document-topic probability matrix. """ return self.fit(X, verbose=verbose).transform(X, infer_type=infer_type)
[docs] def get_topic_words( self, topic_id: Optional[int] = None, n_words: int = 10 ) -> Union[List[str], Dict[int, List[str]]]: """Get top words for topics. Parameters ---------- topic_id : int, optional If provided, return words for this topic only. If None, return words for all topics. n_words : int, default=10 Number of top words to return per topic. Returns ------- topic_words : list or dict If topic_id is provided, returns list of top words for that topic. Otherwise, returns dict mapping topic_id to list of words. """ check_is_fitted(self, "model_") topic_word_matrix = self.model_.matrix_topics_words_ if topic_id is not None: if not 0 <= topic_id < self.n_topics: raise ValueError(f"topic_id must be between 0 and {self.n_topics - 1}") word_indices = np.argsort(topic_word_matrix[topic_id])[-n_words:][::-1] return self.vocabulary_[word_indices].tolist() else: result = {} for t in range(self.n_topics): word_indices = np.argsort(topic_word_matrix[t])[-n_words:][::-1] result[t] = self.vocabulary_[word_indices].tolist() return result
[docs] def get_document_topics( self, X: Union[List[str], pd.Series], threshold: float = 0.1 ) -> List[List[int]]: """Get dominant topics for documents. Parameters ---------- X : array-like of shape (n_documents,) Documents to analyze. threshold : float, default=0.1 Minimum probability threshold for topic assignment. Returns ------- doc_topics : list of list of int For each document, list of topic IDs above threshold. """ doc_topic_probs = self.transform(X) doc_topics = [] for doc_probs in doc_topic_probs: topics = [i for i, prob in enumerate(doc_probs) if prob >= threshold] doc_topics.append(topics) return doc_topics
@property def coherence_(self) -> np.ndarray: """Topic coherence scores.""" check_is_fitted(self, "model_") return self.model_.coherence_ @property def perplexity_(self) -> float: """Model perplexity.""" check_is_fitted(self, "model_") return self.model_.perplexity_ @property def topic_word_matrix_(self) -> np.ndarray: """Topic-word probability matrix.""" check_is_fitted(self, "model_") return self.model_.matrix_topics_words_
[docs] def score(self, X: Union[List[str], pd.Series], y=None) -> float: """Return the mean coherence score. Parameters ---------- X : array-like of shape (n_documents,) Documents to score. y : Ignored Not used, present for sklearn compatibility. Returns ------- score : float Mean coherence score across topics. """ check_is_fitted(self, "model_") return float(np.mean(self.coherence_))