# Source code for bitermplus._util

# Public API of this module: the names exported by a star-import.
# Every entry corresponds to a helper function defined below.
__all__ = [
    "get_words_freqs",
    "get_vectorized_docs",
    "get_biterms",
    "get_top_topic_words",
    "get_top_topic_docs",
    "get_docs_top_topic",
]

from typing import Any, Dict, List, Sequence, Tuple, Union

import numpy as np
from pandas import DataFrame, Series, concat
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer

from ._btm import BTM



[docs]
def get_words_freqs(
    docs: Union[List[str], np.ndarray, Series], **kwargs: dict
) -> Tuple[csr_matrix, np.ndarray, Dict]:
    """Vectorize raw documents into a document-term count matrix.

    A scikit-learn ``CountVectorizer`` is fitted on ``docs`` and the resulting
    matrix is returned together with the learned vocabulary, both as an array
    of terms and as a term-to-column mapping.

    Parameters
    ----------
    docs : list of str, numpy.ndarray, or pandas.Series
        Text documents to vectorize, one string per document.
    **kwargs : dict
        Keyword arguments forwarded unchanged to the ``CountVectorizer``
        constructor, for example:

        - min_df : int or float, minimum document frequency
        - max_df : int or float, maximum document frequency
        - stop_words : str or list, stop words to remove
        - lowercase : bool, whether to convert to lowercase
        - token_pattern : str, regex pattern for tokenization

    Returns
    -------
    doc_term_matrix : scipy.sparse.csr_matrix, shape (n_documents, n_features)
        Output of ``CountVectorizer.fit_transform``; element (i, j) is the
        count of term j in document i.
    vocabulary : numpy.ndarray, shape (n_features,)
        Feature names (terms) in matrix column order.
    vocab_dict : dict
        The fitted vectorizer's ``vocabulary_`` attribute, mapping each term
        to its column index.

    Examples
    --------
    >>> import bitermplus as btm
    >>> texts = ["machine learning is great", "I love natural language processing"]
    >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
    >>> X, vocabulary, vocab_dict = btm.get_words_freqs(
    ...     texts, min_df=1, stop_words='english', lowercase=True
    ... )

    See Also
    --------
    get_vectorized_docs : Convert documents to word ID representation
    get_biterms : Generate biterms from vectorized documents
    sklearn.feature_extraction.text.CountVectorizer : Underlying vectorization method
    """
    vectorizer = CountVectorizer(**kwargs)
    doc_term_matrix = vectorizer.fit_transform(docs)
    # Feature names come back in column order; wrap them in an ndarray.
    vocabulary = np.array(vectorizer.get_feature_names_out())
    vocab_dict = vectorizer.vocabulary_
    return doc_term_matrix, vocabulary, vocab_dict




[docs]
def get_vectorized_docs(
    docs: Union[List[str], np.ndarray], vocab: Union[List[str], np.ndarray]
) -> List[np.ndarray]:
    """Replace the words of each document with their vocabulary indices.

    Every document is split on whitespace and each token is looked up, by
    exact match, in ``vocab``. Tokens that are not in the vocabulary are
    dropped; the remaining tokens keep their original order.

    Parameters
    ----------
    docs : list of str or numpy.ndarray
        Text documents, one string per document. A ``None`` entry is treated
        as an empty document.
    vocab : list of str or numpy.ndarray
        Vocabulary terms. A term's position in this sequence is its word ID.
        Typically the vocabulary array returned by get_words_freqs().

    Returns
    -------
    vectorized_docs : list of numpy.ndarray
        One ``int32`` array of word IDs per input document, in input order.
        Documents with no known words yield an empty array.

    Examples
    --------
    >>> import bitermplus as btm
    >>> texts = ["machine learning is great", "I love deep learning"]
    >>> X, vocabulary, _ = btm.get_words_freqs(texts)
    >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    >>> biterms = btm.get_biterms(docs_vec)

    Notes
    -----
    - Tokenization here is a plain whitespace split; no lowercasing or
      punctuation handling is applied before the vocabulary lookup.
    - If a term occurs more than once in ``vocab``, its last position is used.

    See Also
    --------
    get_words_freqs : Extract vocabulary and document-term matrix
    get_biterms : Generate biterms from vectorized documents
    """
    word_to_id = {term: position for position, term in enumerate(vocab)}
    lookup = word_to_id.get

    def _encode(text) -> np.ndarray:
        # A missing document behaves like an empty string.
        tokens = ("" if text is None else text).split()
        # split() without arguments never yields empty or padded tokens, so a
        # single dict lookup per token is enough; unknown tokens map to None.
        ids = [word_id for word_id in map(lookup, tokens) if word_id is not None]
        return np.array(ids, dtype=np.int32)

    return [_encode(text) for text in docs]




[docs]
def get_biterms(docs: List[np.ndarray], win: int = 15) -> List[List[List[int]]]:
    """Generate biterms (unordered word-ID pairs) from vectorized documents.

    For every document, each word at position ``i`` is paired with each word
    at a later position ``j`` such that ``j < i + win``. Each pair is stored
    with the smaller word ID first.

    Parameters
    ----------
    docs : list of numpy.ndarray
        Vectorized documents, each a sequence of word IDs. Typically obtained
        from get_vectorized_docs().
    win : int, default=15
        Window size. Two positions are paired when they are fewer than ``win``
        positions apart, so ``win`` must be at least 2 for any pair to exist.

    Returns
    -------
    biterms : list of list of list
        One list of ``[word_id1, word_id2]`` pairs (``word_id1 <= word_id2``)
        per document that has at least two word IDs, in input order.
        Shorter documents are skipped entirely, so the result can contain
        fewer entries than ``docs`` and its positions are then not aligned
        with the positions in ``docs``.

    Raises
    ------
    ValueError
        If ``win`` is smaller than 2, or if no biterms can be generated from
        the input documents (e.g., every document has fewer than two words).

    Examples
    --------
    >>> import bitermplus as btm
    >>> texts = ["machine learning algorithms", "deep learning networks"]
    >>> X, vocabulary, _ = btm.get_words_freqs(texts)
    >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    >>> biterms = btm.get_biterms(docs_vec)
    >>> biterms = btm.get_biterms(docs_vec, win=10)

    See Also
    --------
    get_vectorized_docs : Convert documents to word ID representation
    """
    # A window below 2 leaves the inner range empty for every position, so no
    # pair could ever be produced; report the real cause instead of blaming
    # the documents.
    if win < 2:
        raise ValueError(
            "No biterms could be generated from the documents: "
            "`win` must be at least 2, got {}.".format(win)
        )

    biterms = []
    for doc in docs:
        doc_len = len(doc)
        if doc_len < 2:
            # A single word (or none) cannot form a pair; the document is
            # dropped from the output rather than represented by [].
            continue
        doc_biterms = []
        for i in range(doc_len - 1):
            left = doc[i]
            # The window is clipped at the end of the document.
            window_end = min(i + win, doc_len)
            for j in range(i + 1, window_end):
                right = doc[j]
                doc_biterms.append([min(left, right), max(left, right)])
        biterms.append(doc_biterms)

    # True as soon as one document contributed at least one biterm.
    if not any(biterms):
        raise ValueError(
            "No biterms could be generated from the documents. "
            "Documents may be too short or have insufficient vocabulary overlap."
        )

    return biterms




[docs]
def get_top_topic_words(
    model: BTM, words_num: int = 20, topics_idx: Union[Sequence[Any], None] = None
) -> DataFrame:
    """Select top topic words from a fitted model.

    Parameters
    ----------
    model : bitermplus._btm.BTM
        Fitted BTM model. Its ``matrix_topics_words_``, ``vocabulary_`` and
        ``topics_num_`` attributes are read.
    words_num : int = 20
        The number of words to select per topic.
    topics_idx : Sequence[Any] or None = None
        Topics indices. Meant to be used to select only stable
        topics. When None, all ``model.topics_num_`` topics are used.

    Returns
    -------
    DataFrame
        One column per selected topic, named ``topic<id>``, holding that
        topic's words ordered from highest to lowest probability.

    Example
    -------
    >>> stable_topics = [0, 3, 10, 12, 18, 21]
    >>> top_words = btm.get_top_topic_words(
    ...     model,
    ...     words_num=100,
    ...     topics_idx=stable_topics)
    """
    if topics_idx is None:
        topics_idx = np.arange(model.topics_num_)

    columns = []
    for topic_id in topics_idx:
        probs = model.matrix_topics_words_[topic_id, :]
        # argsort is ascending; the reversed slice walks back from the end and
        # keeps the `words_num` most probable words, best first.
        top_idx = np.argsort(probs)[: -words_num - 1 : -1]
        columns.append(
            Series(model.vocabulary_[top_idx], name="topic{}".format(topic_id))
        )
    return concat(columns, axis=1)




[docs]
def get_top_topic_docs(
    docs: Sequence[Any],
    p_zd: np.ndarray,
    docs_num: int = 20,
    topics_idx: Union[Sequence[Any], None] = None,
) -> DataFrame:
    """Select top topic docs from a fitted model.

    Parameters
    ----------
    docs : Sequence[Any]
        Iterable of documents (e.g. list of strings).
    p_zd : np.ndarray
        Documents vs topics probabilities matrix. Rows are indexed by
        document and columns by topic.
    docs_num : int = 20
        The number of documents to select per topic.
    topics_idx : Sequence[Any] or None = None
        Topics indices. Meant to be used to select only stable
        topics. When None, every column of ``p_zd`` is used.

    Returns
    -------
    DataFrame
        One column per selected topic, named ``topic<id>``, holding the
        documents with the highest probability for that topic, best first.

    Example
    -------
    >>> top_docs = btm.get_top_topic_docs(
    ...     texts,
    ...     p_zd,
    ...     docs_num=100,
    ...     topics_idx=[1,2,3,4])
    """
    if topics_idx is None:
        topics_idx = np.arange(p_zd.shape[1])

    # Convert once so that every topic can use array (fancy) indexing.
    docs_arr = np.asarray(docs)

    columns = []
    for topic_id in topics_idx:
        probs = p_zd[:, topic_id]
        # argsort is ascending; the reversed slice walks back from the end and
        # keeps the `docs_num` most probable documents, best first.
        top_idx = np.argsort(probs)[: -docs_num - 1 : -1]
        columns.append(Series(docs_arr[top_idx], name="topic{}".format(topic_id)))
    return concat(columns, axis=1)




[docs]
def get_docs_top_topic(docs: Sequence[Any], p_zd: np.ndarray) -> DataFrame:
    """Pair every document with its most probable topic.

    Parameters
    ----------
    docs : Sequence[Any]
        Iterable of documents (e.g. list of strings).
    p_zd : np.ndarray
        Documents vs topics probabilities matrix. Rows are indexed by
        document and columns by topic.

    Returns
    -------
    DataFrame
        Two columns: ``documents`` with the input documents and ``label``
        with the column index of the largest value in each row of ``p_zd``.

    Example
    -------
    >>> import bitermplus as btm
    >>> # texts = ...   (documents)
    >>> # model = ...   (fitted model)
    >>> btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
    """
    # Row-wise argmax yields one topic index per document.
    best_topic = p_zd.argmax(axis=1)
    table = DataFrame({"documents": docs, "label": best_topic})
    return table