diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index cfafb58a..9c29aebd 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -29,7 +29,8 @@ from scipy.cluster import hierarchy as sch from importlib.util import find_spec -from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable, TYPE_CHECKING, Literal +from typing import Any, TYPE_CHECKING, Literal +from collections.abc import Mapping, Callable, Iterable # Plotting if find_spec("plotly") is None: @@ -146,13 +147,13 @@ def __init__( self, language: str = "english", top_n_words: int = 10, - n_gram_range: Tuple[int, int] = (1, 1), + n_gram_range: tuple[int, int] = (1, 1), min_topic_size: int = 10, - nr_topics: Union[int, str] | None = None, + nr_topics: int | str | None = None, low_memory: bool = False, calculate_probabilities: bool = False, - seed_topic_list: List[List[str]] | None = None, - zeroshot_topic_list: List[str] | None = None, + seed_topic_list: list[list[str]] | None = None, + zeroshot_topic_list: list[str] | None = None, zeroshot_min_similarity: float = 0.7, embedding_model=None, umap_model=None, @@ -349,10 +350,10 @@ def topic_labels_(self): def fit( self, - documents: List[str], + documents: list[str], embeddings: np.ndarray = None, - images: List[str] | None = None, - y: Union[List[int], np.ndarray] = None, + images: list[str] | None = None, + y: list[int] | np.ndarray = None, ): """Fit the models on a collection of documents and generate topics. @@ -394,11 +395,11 @@ def fit( def fit_transform( self, - documents: List[str], + documents: list[str], embeddings: np.ndarray = None, - images: List[str] | None = None, - y: Union[List[int], np.ndarray] = None, - ) -> Tuple[List[int], Union[np.ndarray, None]]: + images: list[str] | None = None, + y: list[int] | np.ndarray = None, + ) -> tuple[list[int], np.ndarray | None]: """Fit the models on a collection of documents, generate topics, and return the probabilities and topic per document. @@ -544,10 +545,10 @@ def fit_transform( def transform( self, - documents: Union[str, List[str]], + documents: str | list[str], embeddings: np.ndarray = None, - images: List[str] | None = None, - ) -> Tuple[List[int], np.ndarray]: + images: list[str] | None = None, + ) -> tuple[list[int], np.ndarray]: """After having fit a model, use transform to predict new instances. Arguments: @@ -648,9 +649,9 @@ def transform( def partial_fit( self, - documents: List[str], + documents: list[str], embeddings: np.ndarray = None, - y: Union[List[int], np.ndarray] = None, + y: list[int] | np.ndarray = None, ): """Fit BERTopic on a subset of the data and perform online learning with batch-like data. @@ -796,9 +797,9 @@ def partial_fit( def topics_over_time( self, - docs: List[str], - timestamps: Union[List[str], List[int]], - topics: List[int] | None = None, + docs: list[str], + timestamps: list[str] | list[int], + topics: list[int] | None = None, nr_bins: int | None = None, datetime_format: str | None = None, evolution_tuning: bool = True, @@ -955,8 +956,8 @@ def topics_over_time( def topics_per_class( self, - docs: List[str], - classes: Union[List[int], List[str]], + docs: list[str], + classes: list[int] | list[str], global_tuning: bool = True, ) -> pd.DataFrame: """Create topics per class. @@ -1034,7 +1035,7 @@ def topics_per_class( def hierarchical_topics( self, - docs: List[str], + docs: list[str], use_ctfidf: bool = True, linkage_function: Callable[[csr_matrix], np.ndarray] | None = None, distance_function: Callable[[csr_matrix], csr_matrix] | None = None, @@ -1203,7 +1204,7 @@ def hierarchical_topics( def approximate_distribution( self, - documents: Union[str, List[str]], + documents: str | list[str], window: int = 4, stride: int = 1, min_similarity: float = 0.1, @@ -1212,7 +1213,7 @@ def approximate_distribution( use_embedding_model: bool = False, calculate_tokens: bool = False, separator: str = " ", - ) -> Tuple[np.ndarray, Union[List[np.ndarray], None]]: + ) -> tuple[np.ndarray, list[np.ndarray] | None]: """A post-hoc approximation of topic distributions across documents. In order to perform this approximation, each document is split into tokens @@ -1430,7 +1431,7 @@ def approximate_distribution( def find_topics( self, search_term: str | None = None, image: str | None = None, top_n: int = 5 - ) -> Tuple[List[int], List[float]]: + ) -> tuple[list[int], list[float]]: """Find topics most similar to a search_term. Creates an embedding for a search query and compares that with @@ -1487,11 +1488,11 @@ def find_topics( def update_topics( self, - docs: List[str], - images: List[str] | None = None, - topics: List[int] | None = None, + docs: list[str], + images: list[str] | None = None, + topics: list[int] | None = None, top_n_words: int = 10, - n_gram_range: Tuple[int, int] | None = None, + n_gram_range: tuple[int, int] | None = None, vectorizer_model: CountVectorizer = None, ctfidf_model: ClassTfidfTransformer = None, representation_model: BaseRepresentation = None, @@ -1595,7 +1596,7 @@ def update_topics( else: self._create_topic_vectors() - def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]: + def get_topics(self, full: bool = False) -> Mapping[str, tuple[str, float]]: """Return topics with top n words and their c-TF-IDF score. Arguments: @@ -1619,7 +1620,7 @@ def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]: else: return self.topic_representations_ - def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]: + def get_topic(self, topic: int, full: bool = False) -> Mapping[str, tuple[str, float]] | bool: """Return top n words for a specific topic and their c-TF-IDF scores. Arguments: @@ -1700,7 +1701,7 @@ def get_topic_info(self, topic: int | None = None) -> pd.DataFrame: return info.reset_index(drop=True) - def get_topic_freq(self, topic: int | None = None) -> Union[pd.DataFrame, int]: + def get_topic_freq(self, topic: int | None = None) -> pd.DataFrame | int: """Return the size of topics (descending order). Arguments: @@ -1733,7 +1734,7 @@ def get_topic_freq(self, topic: int | None = None) -> Union[pd.DataFrame, int]: def get_document_info( self, - docs: List[str], + docs: list[str], df: pd.DataFrame = None, metadata: Mapping[str, Any] | None = None, ) -> pd.DataFrame: @@ -1823,7 +1824,7 @@ def get_document_info( document_info[column] = values return document_info - def get_representative_docs(self, topic: int | None = None) -> List[str]: + def get_representative_docs(self, topic: int | None = None) -> list[str]: """Extract the best representing documents per topic. Note: @@ -1976,7 +1977,7 @@ def _tree(to_print, start, parent, tree, grandpa=None, indent=""): start = str(hier_topics.Parent_ID.astype(int).max()) return get_tree(start, tree) - def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None: + def set_topic_labels(self, topic_labels: list[str] | Mapping[int, str]) -> None: """Set custom topic labels in your fitted BERTopic model. Arguments: @@ -2046,7 +2047,7 @@ def generate_topic_labels( word_length: int | None = None, separator: str = "_", aspect: str | None = None, - ) -> List[str]: + ) -> list[str]: """Get labels for each topic in a user-defined format. Arguments: @@ -2100,9 +2101,9 @@ def generate_topic_labels( def merge_topics( self, - docs: List[str], - topics_to_merge: List[Union[Iterable[int], int]], - images: List[str] | None = None, + docs: list[str], + topics_to_merge: list[Iterable[int] | int], + images: list[str] | None = None, ) -> None: """Arguments: docs: The documents you used when calling either `fit` or `fit_transform` @@ -2176,7 +2177,7 @@ def merge_topics( def delete_topics( self, - topics_to_delete: List[int], + topics_to_delete: list[int], ) -> None: """Delete topics from the topic model. @@ -2312,9 +2313,9 @@ def delete_topics( def reduce_topics( self, - docs: List[str], - nr_topics: Union[int, str] = 20, - images: List[str] | None = None, + docs: list[str], + nr_topics: int | str = 20, + images: list[str] | None = None, use_ctfidf: bool = False, ) -> None: """Reduce the number of topics to a fixed number of topics @@ -2379,15 +2380,15 @@ def reduce_topics( def reduce_outliers( self, - documents: List[str], - topics: List[int], - images: List[str] | None = None, + documents: list[str], + topics: list[int], + images: list[str] | None = None, strategy: str = "distributions", probabilities: np.ndarray = None, threshold: float = 0, embeddings: np.ndarray = None, distributions_params: Mapping[str, Any] = {}, - ) -> List[int]: + ) -> list[int]: """Reduce outliers by merging them with their nearest topic according to one of several strategies. @@ -2540,7 +2541,7 @@ def reduce_outliers( def visualize_topics( self, - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int | None = None, use_ctfidf: bool = False, custom_labels: bool = False, @@ -2594,8 +2595,8 @@ def visualize_topics( def visualize_documents( self, - docs: List[str], - topics: List[int] | None = None, + docs: list[str], + topics: list[int] | None = None, embeddings: np.ndarray = None, reduced_embeddings: np.ndarray = None, sample: float | None = None, @@ -2693,13 +2694,13 @@ def visualize_documents( def visualize_document_datamap( self, - docs: List[str] | None = None, - topics: List[int] | None = None, + docs: list[str] | None = None, + topics: list[int] | None = None, embeddings: np.ndarray = None, reduced_embeddings: np.ndarray = None, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Documents and Topics", - sub_title: Union[str, None] = None, + sub_title: str | None = None, width: int = 1200, height: int = 750, interactive: bool = False, @@ -2804,12 +2805,12 @@ def visualize_document_datamap( def visualize_hierarchical_documents( self, - docs: List[str], + docs: list[str], hierarchical_topics: pd.DataFrame, - topics: List[int] | None = None, + topics: list[int] | None = None, embeddings: np.ndarray = None, reduced_embeddings: np.ndarray = None, - sample: Union[float, int] | None = None, + sample: float | int | None = None, hide_annotations: bool = False, hide_document_hover: bool = True, nr_levels: int = 10, @@ -2924,7 +2925,7 @@ def visualize_hierarchical_documents( def visualize_term_rank( self, - topics: List[int] | None = None, + topics: list[int] | None = None, log_scale: bool = False, custom_labels: bool = False, title: str = "Term score decline per Topic", @@ -2989,7 +2990,7 @@ def visualize_topics_over_time( self, topics_over_time: pd.DataFrame, top_n_topics: int | None = None, - topics: List[int] | None = None, + topics: list[int] | None = None, normalize_frequency: bool = False, custom_labels: bool = False, title: str = "Topics over Time", @@ -3045,7 +3046,7 @@ def visualize_topics_per_class( self, topics_per_class: pd.DataFrame, top_n_topics: int = 10, - topics: List[int] | None = None, + topics: list[int] | None = None, normalize_frequency: bool = False, custom_labels: bool = False, title: str = "Topics per Class", @@ -3201,7 +3202,7 @@ def visualize_approximate_distribution( def visualize_hierarchy( self, orientation: str = "left", - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int | None = None, use_ctfidf: bool = True, custom_labels: bool = False, @@ -3300,7 +3301,7 @@ def visualize_hierarchy( def visualize_heatmap( self, - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int | None = None, n_clusters: int | None = None, use_ctfidf: bool = False, @@ -3360,7 +3361,7 @@ def visualize_heatmap( def visualize_barchart( self, - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int = 8, n_words: int = 5, custom_labels: bool = False, @@ -3417,7 +3418,7 @@ def save( self, path, serialization: Literal["safetensors", "pickle", "pytorch"] = "pickle", - save_embedding_model: Union[bool, str] = True, + save_embedding_model: bool | str = True, save_ctfidf: bool = False, ): """Saves the model to the specified path or folder. @@ -3758,7 +3759,7 @@ def push_to_hf_hub( create_pr: bool = False, model_card: bool = True, serialization: str = "safetensors", - save_embedding_model: Union[str, bool] = True, + save_embedding_model: str | bool = True, save_ctfidf: bool = False, ): """Push your BERTopic model to a HuggingFace Hub. @@ -3843,8 +3844,8 @@ def get_params(self, deep: bool = False) -> Mapping[str, Any]: def _extract_embeddings( self, - documents: Union[List[str], str], - images: List[str] | None = None, + documents: list[str] | str, + images: list[str] | None = None, method: str = "document", verbose: bool | None = None, ) -> np.ndarray: @@ -3898,7 +3899,7 @@ def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd logger.info("Images - Completed \u2713") return documents - def _map_predictions(self, predictions: List[int]) -> List[int]: + def _map_predictions(self, predictions: list[int]) -> list[int]: """Map predictions to the correct topics if topics were reduced.""" mappings = self.topic_mapper_.get_mappings(original_topics=True) mapped_predictions = [mappings[prediction] if prediction in mappings else -1 for prediction in predictions] @@ -3906,8 +3907,8 @@ def _map_predictions(self, predictions: List[int]) -> List[int]: def _reduce_dimensionality( self, - embeddings: Union[np.ndarray, csr_matrix], - y: Union[List[int], np.ndarray] = None, + embeddings: np.ndarray | csr_matrix, + y: list[int] | np.ndarray = None, partial_fit: bool = False, ) -> np.ndarray: """Reduce dimensionality of embeddings using UMAP and train a UMAP model. @@ -3961,7 +3962,7 @@ def _cluster_embeddings( documents: pd.DataFrame, partial_fit: bool = False, y: np.ndarray = None, - ) -> Tuple[pd.DataFrame, np.ndarray]: + ) -> tuple[pd.DataFrame, np.ndarray]: """Cluster UMAP reduced embeddings with HDBSCAN. Arguments: @@ -4009,7 +4010,7 @@ def _cluster_embeddings( def _zeroshot_topic_modeling( self, documents: pd.DataFrame, embeddings: np.ndarray - ) -> Tuple[pd.DataFrame, np.array, pd.DataFrame, np.array]: + ) -> tuple[pd.DataFrame, np.array, pd.DataFrame, np.array]: """Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list. We transform the topics in `self.zeroshot_topic_list` to embeddings and @@ -4081,7 +4082,7 @@ def _combine_zeroshot_topics( embeddings: np.ndarray, assigned_documents: pd.DataFrame, assigned_embeddings: np.ndarray, - ) -> Tuple[pd.DataFrame, np.ndarray]: + ) -> tuple[pd.DataFrame, np.ndarray]: """Combine the zero-shot topics with the clustered topics. The zero-shot topics will be inserted between the outlier topic (that may or may not exist) and the rest of the @@ -4134,7 +4135,7 @@ def _combine_zeroshot_topics( logger.info("Zeroshot Step 2 - Completed \u2713") return documents, embeddings - def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]: + def _guided_topic_modeling(self, embeddings: np.ndarray) -> tuple[list[int], np.array]: """Apply Guided Topic Modeling. We transform the seeded topics to embeddings using the @@ -4236,11 +4237,11 @@ def _extract_representative_docs( self, c_tf_idf: csr_matrix, documents: pd.DataFrame, - topics: Mapping[str, List[Tuple[str, float]]], + topics: Mapping[str, list[tuple[str, float]]], nr_samples: int = 500, nr_repr_docs: int = 5, diversity: float | None = None, - ) -> Union[List[str], List[List[int]]]: + ) -> list[str] | list[list[int]]: """Approximate most representative documents per topic by sampling a subset of the documents in each topic and calculating which are most representative to their topic based on the cosine similarity between @@ -4400,7 +4401,7 @@ def _c_tf_idf( documents_per_topic: pd.DataFrame, fit: bool = True, partial_fit: bool = False, - ) -> Tuple[csr_matrix, List[str]]: + ) -> tuple[csr_matrix, list[str]]: """Calculate a class-based TF-IDF where m is the number of total documents. Arguments: @@ -4463,13 +4464,13 @@ def _update_topic_size(self, documents: pd.DataFrame): def _extract_words_per_topic( self, - words: List[str], + words: list[str], documents: pd.DataFrame, c_tf_idf: csr_matrix = None, fine_tune_representation: bool = True, calculate_aspects: bool = True, embeddings: np.ndarray = None, - ) -> Mapping[str, List[Tuple[str, float]]]: + ) -> Mapping[str, list[tuple[str, float]]]: """Based on tf_idf scores per topic, extract the top n words per topic. If the top words per topic need to be extracted, then only the `words` parameter @@ -4764,9 +4765,7 @@ def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame: self._update_topic_size(documents) return documents - def _map_probabilities( - self, probabilities: Union[np.ndarray, None], original_topics: bool = False - ) -> Union[np.ndarray, None]: + def _map_probabilities(self, probabilities: np.ndarray | None, original_topics: bool = False) -> np.ndarray | None: """Map the probabilities to the reduced topics. This is achieved by adding together the probabilities of all topics that are mapped to the same topic. Then, @@ -4801,7 +4800,7 @@ def _map_probabilities( return probabilities - def _preprocess_text(self, documents: np.ndarray) -> List[str]: + def _preprocess_text(self, documents: np.ndarray) -> list[str]: r"""Basic preprocessing of text. Steps: @@ -4910,7 +4909,7 @@ class TopicMapper: of topics. """ - def __init__(self, topics: List[int]): + def __init__(self, topics: list[int]): """Initialization of Topic Mapper. Arguments: diff --git a/bertopic/_save_utils.py b/bertopic/_save_utils.py index 2dca6e56..46ce758f 100644 --- a/bertopic/_save_utils.py +++ b/bertopic/_save_utils.py @@ -21,9 +21,6 @@ except ImportError: _has_hf_hub = False -# Typing -from typing import Union - # Pytorch check try: import torch @@ -113,7 +110,7 @@ def push_to_hf_hub( create_pr: bool = False, model_card: bool = True, serialization: str = "safetensors", - save_embedding_model: Union[str, bool] = True, + save_embedding_model: str | bool = True, save_ctfidf: bool = False, ): """Push your BERTopic model to a HuggingFace Hub. @@ -450,9 +447,9 @@ def save_topics(model, path: str): json.dump(topics, f, indent=2, cls=NumpyEncoder) -def load_cfg_from_json(json_file: Union[str, os.PathLike]): +def load_cfg_from_json(json_file: str | os.PathLike): """Load configuration from json.""" - with open(json_file, "r", encoding="utf-8") as reader: + with open(json_file, encoding="utf-8") as reader: text = reader.read() return json.loads(text) @@ -463,7 +460,7 @@ def default(self, obj): return int(obj) if isinstance(obj, np.floating): return float(obj) - return super(NumpyEncoder, self).default(obj) + return super().default(obj) def get_package_versions(): diff --git a/bertopic/_utils.py b/bertopic/_utils.py index 035c6acb..136707c7 100644 --- a/bertopic/_utils.py +++ b/bertopic/_utils.py @@ -4,7 +4,7 @@ from collections.abc import Iterable from scipy.sparse import csr_matrix from scipy.spatial.distance import squareform -from typing import Optional, Union, Tuple, Any +from typing import Any class MyLogger: @@ -142,7 +142,7 @@ def validate_distance_matrix(X, n_samples): "distance matrix of shape (n*(n-1)/2,) or a " "2-D square distance matrix of shape (n, n)." "where n is the number of documents." - "Got a distance matrix of shape %s" % str(s) + f"Got a distance matrix of shape {s}" ) # Make sure its entries are non-negative @@ -177,11 +177,11 @@ def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array: def select_topic_representation( - ctfidf_embeddings: Optional[Union[np.ndarray, csr_matrix]] = None, - embeddings: Optional[Union[np.ndarray, csr_matrix]] = None, + ctfidf_embeddings: np.ndarray | csr_matrix | None = None, + embeddings: np.ndarray | csr_matrix | None = None, use_ctfidf: bool = True, output_ndarray: bool = False, -) -> Tuple[np.ndarray, bool]: +) -> tuple[np.ndarray, bool]: """Select the topic representation. Arguments: @@ -199,7 +199,7 @@ def select_topic_representation( The selected topic representation and a boolean indicating whether it is c-TF-IDF. """ - def to_ndarray(array: Union[np.ndarray, csr_matrix]) -> np.ndarray: + def to_ndarray(array: np.ndarray | csr_matrix) -> np.ndarray: if isinstance(array, csr_matrix): return array.toarray() return array diff --git a/bertopic/backend/_base.py b/bertopic/backend/_base.py index 97809b15..8f11541b 100644 --- a/bertopic/backend/_base.py +++ b/bertopic/backend/_base.py @@ -1,5 +1,4 @@ import numpy as np -from typing import List class BaseEmbedder: @@ -18,7 +17,7 @@ def __init__(self, embedding_model=None, word_embedding_model=None): self.embedding_model = embedding_model self.word_embedding_model = word_embedding_model - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. @@ -32,7 +31,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: """ pass - def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray: + def embed_words(self, words: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n words into an n-dimensional matrix of embeddings. @@ -47,7 +46,7 @@ def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray: """ return self.embed(words, verbose) - def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray: + def embed_documents(self, document: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_cohere.py b/bertopic/backend/_cohere.py index 77e1ec8f..c1a47b32 100644 --- a/bertopic/backend/_cohere.py +++ b/bertopic/backend/_cohere.py @@ -1,7 +1,8 @@ import time import numpy as np from tqdm import tqdm -from typing import Any, List, Mapping +from typing import Any +from collections.abc import Mapping from bertopic.backend import BaseEmbedder @@ -60,7 +61,7 @@ def __init__( else: self.embed_kwargs["model"] = self.embedding_model - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_fastembed.py b/bertopic/backend/_fastembed.py index 2aa3a1d4..83cb9a0e 100644 --- a/bertopic/backend/_fastembed.py +++ b/bertopic/backend/_fastembed.py @@ -1,5 +1,4 @@ import numpy as np -from typing import List from fastembed import TextEmbedding from bertopic.backend import BaseEmbedder @@ -38,7 +37,7 @@ def __init__(self, embedding_model: str = "BAAI/bge-small-en-v1.5"): "The supported TextEmbedding model list is here: https://qdrant.github.io/fastembed/examples/Supported_Models/" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_flair.py b/bertopic/backend/_flair.py index f6e27fea..90f66cdc 100644 --- a/bertopic/backend/_flair.py +++ b/bertopic/backend/_flair.py @@ -1,6 +1,5 @@ import numpy as np from tqdm import tqdm -from typing import Union, List from flair.data import Sentence from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings @@ -30,7 +29,7 @@ class FlairBackend(BaseEmbedder): ``` """ - def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]): + def __init__(self, embedding_model: TokenEmbeddings | DocumentEmbeddings): super().__init__() # Flair word embeddings @@ -52,7 +51,7 @@ def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]): "`roberta = TransformerDocumentEmbeddings('roberta-base')`" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_gensim.py b/bertopic/backend/_gensim.py index e5e72fb4..04d52296 100644 --- a/bertopic/backend/_gensim.py +++ b/bertopic/backend/_gensim.py @@ -1,6 +1,5 @@ import numpy as np from tqdm import tqdm -from typing import List from bertopic.backend import BaseEmbedder from gensim.models.keyedvectors import Word2VecKeyedVectors @@ -36,7 +35,7 @@ def __init__(self, embedding_model: Word2VecKeyedVectors): "`ft = api.load('fasttext-wiki-news-subwords-300')`" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_hftransformers.py b/bertopic/backend/_hftransformers.py index 344412e9..033d7c9b 100644 --- a/bertopic/backend/_hftransformers.py +++ b/bertopic/backend/_hftransformers.py @@ -1,7 +1,6 @@ import numpy as np from tqdm import tqdm -from typing import List from torch.utils.data import Dataset from sklearn.preprocessing import normalize from transformers.pipelines import Pipeline @@ -42,7 +41,7 @@ def __init__(self, embedding_model: Pipeline): "pipeline('feature-extraction', model='distilbert-base-cased', device=0)" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_langchain.py b/bertopic/backend/_langchain.py index 2e27fb14..114e7292 100644 --- a/bertopic/backend/_langchain.py +++ b/bertopic/backend/_langchain.py @@ -1,5 +1,3 @@ -from typing import List - import numpy as np from bertopic.backend import BaseEmbedder from langchain_core.embeddings import Embeddings @@ -25,7 +23,7 @@ class LangChainBackend(BaseEmbedder): def __init__(self, embedding_model: Embeddings): self.embedding_model = embedding_model - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_model2vec.py b/bertopic/backend/_model2vec.py index 42567580..82caae38 100644 --- a/bertopic/backend/_model2vec.py +++ b/bertopic/backend/_model2vec.py @@ -1,5 +1,4 @@ import numpy as np -from typing import List, Union from model2vec import StaticModel from sklearn.feature_extraction.text import CountVectorizer @@ -53,7 +52,7 @@ class Model2VecBackend(BaseEmbedder): def __init__( self, - embedding_model: Union[str, StaticModel], + embedding_model: str | StaticModel, distill: bool = False, distill_kwargs: dict = {}, distill_vectorizer: str | None = None, @@ -87,7 +86,7 @@ def __init__( "`model = StaticModel.from_pretrained('minishlab/potion-base-8M')`" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_multimodal.py b/bertopic/backend/_multimodal.py index 5ed31747..01a744ab 100644 --- a/bertopic/backend/_multimodal.py +++ b/bertopic/backend/_multimodal.py @@ -1,7 +1,6 @@ import numpy as np from PIL import Image from tqdm import tqdm -from typing import List, Union from sentence_transformers import SentenceTransformer from bertopic.backend import BaseEmbedder @@ -44,8 +43,8 @@ class MultiModalBackend(BaseEmbedder): def __init__( self, - embedding_model: Union[str, SentenceTransformer], - image_model: Union[str, SentenceTransformer] = None, + embedding_model: str | SentenceTransformer, + image_model: str | SentenceTransformer = None, batch_size: int = 32, ): super().__init__() @@ -84,7 +83,7 @@ def __init__( except: # noqa: E722 self.tokenizer = None - def embed(self, documents: List[str], images: List[str] | None = None, verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], images: list[str] | None = None, verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words or images into an n-dimensional matrix of embeddings. @@ -122,7 +121,7 @@ def embed(self, documents: List[str], images: List[str] | None = None, verbose: elif image_embeddings is not None: return image_embeddings - def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed_documents(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. @@ -138,7 +137,7 @@ def embed_documents(self, documents: List[str], verbose: bool = False) -> np.nda embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose) return embeddings - def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray: + def embed_words(self, words: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_openai.py b/bertopic/backend/_openai.py index 67057fcf..c77f70a3 100644 --- a/bertopic/backend/_openai.py +++ b/bertopic/backend/_openai.py @@ -2,7 +2,8 @@ import openai import numpy as np from tqdm import tqdm -from typing import List, Mapping, Any +from typing import Any +from collections.abc import Mapping from bertopic.backend import BaseEmbedder @@ -51,7 +52,7 @@ def __init__( elif not self.generator_kwargs.get("engine"): self.generator_kwargs["model"] = self.embedding_model - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_sentencetransformers.py b/bertopic/backend/_sentencetransformers.py index e82751ce..bbc053f0 100644 --- a/bertopic/backend/_sentencetransformers.py +++ b/bertopic/backend/_sentencetransformers.py @@ -1,5 +1,4 @@ import numpy as np -from typing import List, Union from sentence_transformers import SentenceTransformer from sentence_transformers.models import StaticEmbedding @@ -50,7 +49,7 @@ class SentenceTransformerBackend(BaseEmbedder): ``` """ - def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec: bool = False): + def __init__(self, embedding_model: str | SentenceTransformer, model2vec: bool = False): super().__init__() self._hf_model = None @@ -69,7 +68,7 @@ def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec: "`model = SentenceTransformer('all-MiniLM-L6-v2')`" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_spacy.py b/bertopic/backend/_spacy.py index f55fd080..9f19d1eb 100644 --- a/bertopic/backend/_spacy.py +++ b/bertopic/backend/_spacy.py @@ -1,6 +1,5 @@ import numpy as np from tqdm import tqdm -from typing import List from bertopic.backend import BaseEmbedder @@ -61,7 +60,7 @@ def __init__(self, embedding_model): "or create a nlp model using: `nlp = spacy.load('en_core_web_md')" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_use.py b/bertopic/backend/_use.py index a17a87d1..bf72e1fa 100644 --- a/bertopic/backend/_use.py +++ b/bertopic/backend/_use.py @@ -1,6 +1,5 @@ import numpy as np from tqdm import tqdm -from typing import List from bertopic.backend import BaseEmbedder @@ -37,7 +36,7 @@ def __init__(self, embedding_model): "`embedding_model = tensorflow_hub.load(path_to_model)`" ) - def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n documents/words into an n-dimensional matrix of embeddings. diff --git a/bertopic/backend/_word_doc.py b/bertopic/backend/_word_doc.py index 4cb7a201..9edfe0dd 100644 --- a/bertopic/backend/_word_doc.py +++ b/bertopic/backend/_word_doc.py @@ -1,5 +1,4 @@ import numpy as np -from typing import List from bertopic.backend._base import BaseEmbedder from bertopic.backend._utils import select_backend @@ -13,7 +12,7 @@ def __init__(self, embedding_model, word_embedding_model): self.embedding_model = select_backend(embedding_model) self.word_embedding_model = select_backend(word_embedding_model) - def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray: + def embed_words(self, words: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n words into an n-dimensional matrix of embeddings. @@ -28,7 +27,7 @@ def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray: """ return self.word_embedding_model.embed(words, verbose) - def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray: + def embed_documents(self, document: list[str], verbose: bool = False) -> np.ndarray: """Embed a list of n words into an n-dimensional matrix of embeddings. diff --git a/bertopic/plotting/_approximate_distribution.py b/bertopic/plotting/_approximate_distribution.py index 72e37047..1c98a46c 100644 --- a/bertopic/plotting/_approximate_distribution.py +++ b/bertopic/plotting/_approximate_distribution.py @@ -82,10 +82,10 @@ def visualize_approximate_distribution( # Style the resulting dataframe def text_color(val): color = "white" if val == 0 else "black" - return "color: %s" % color + return f"color: {color}" def highligh_color(data, color="white"): - attr = "background-color: {}".format(color) + attr = f"background-color: {color}" return pd.DataFrame(np.where(data == 0, attr, ""), index=data.index, columns=data.columns) if len(df) == 0: diff --git a/bertopic/plotting/_barchart.py b/bertopic/plotting/_barchart.py index c6ceac52..186bf89d 100644 --- a/bertopic/plotting/_barchart.py +++ b/bertopic/plotting/_barchart.py @@ -1,6 +1,5 @@ import itertools import numpy as np -from typing import List, Union import plotly.graph_objects as go from plotly.subplots import make_subplots @@ -8,10 +7,10 @@ def visualize_barchart( topic_model, - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int = 8, n_words: int = 5, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Topic Word Scores", width: int = 250, height: int = 250, diff --git a/bertopic/plotting/_datamap.py b/bertopic/plotting/_datamap.py index 58522fdc..6b22e777 100644 --- a/bertopic/plotting/_datamap.py +++ b/bertopic/plotting/_datamap.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -from typing import List, Union from warnings import warn try: @@ -10,19 +9,19 @@ warn("Data map plotting is unavailable unless datamapplot is installed.") # Create a dummy figure type for typing - class Figure(object): + class Figure: pass def visualize_document_datamap( topic_model, - docs: List[str] | None = None, - topics: List[int] | None = None, + docs: list[str] | None = None, + topics: list[int] | None = None, embeddings: np.ndarray = None, reduced_embeddings: np.ndarray = None, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Documents and Topics", - sub_title: Union[str, None] = None, + sub_title: str | None = None, width: int = 1200, height: int = 750, interactive: bool = False, diff --git a/bertopic/plotting/_distribution.py b/bertopic/plotting/_distribution.py index c04a851b..0a445bfb 100644 --- a/bertopic/plotting/_distribution.py +++ b/bertopic/plotting/_distribution.py @@ -1,5 +1,4 @@ import numpy as np -from typing import Union import plotly.graph_objects as go @@ -7,7 +6,7 @@ def visualize_distribution( topic_model, probabilities: np.ndarray, min_probability: float = 0.015, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Topic Probability Distribution", width: int = 800, height: int = 600, diff --git a/bertopic/plotting/_documents.py b/bertopic/plotting/_documents.py index 73ff8997..a526b164 100644 --- a/bertopic/plotting/_documents.py +++ b/bertopic/plotting/_documents.py @@ -2,19 +2,17 @@ import pandas as pd import plotly.graph_objects as go -from typing import List, Union - def visualize_documents( topic_model, - docs: List[str], - topics: List[int] | None = None, + docs: list[str], + topics: list[int] | None = None, embeddings: np.ndarray = None, reduced_embeddings: np.ndarray = None, sample: float | None = None, hide_annotations: bool = False, hide_document_hover: bool = False, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Documents and Topics", width: int = 1200, height: int = 750, diff --git a/bertopic/plotting/_heatmap.py b/bertopic/plotting/_heatmap.py index fe7c9bc1..535c9b9b 100644 --- a/bertopic/plotting/_heatmap.py +++ b/bertopic/plotting/_heatmap.py @@ -1,5 +1,4 @@ import numpy as np -from typing import List, Union from scipy.cluster.hierarchy import fcluster, linkage from sklearn.metrics.pairwise import cosine_similarity from bertopic._utils import select_topic_representation @@ -10,11 +9,11 @@ def visualize_heatmap( topic_model, - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int | None = None, n_clusters: int | None = None, use_ctfidf: bool = False, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Similarity Matrix", width: int = 800, height: int = 800, diff --git a/bertopic/plotting/_hierarchical_documents.py b/bertopic/plotting/_hierarchical_documents.py index 6e974e6e..8c9d00bd 100644 --- a/bertopic/plotting/_hierarchical_documents.py +++ b/bertopic/plotting/_hierarchical_documents.py @@ -3,22 +3,20 @@ import plotly.graph_objects as go import math -from typing import List, Union - def visualize_hierarchical_documents( topic_model, - docs: List[str], + docs: list[str], hierarchical_topics: pd.DataFrame, - topics: List[int] | None = None, + topics: list[int] | None = None, embeddings: np.ndarray = None, reduced_embeddings: np.ndarray = None, - sample: Union[float, int] | None = None, + sample: float | int | None = None, hide_annotations: bool = False, hide_document_hover: bool = True, nr_levels: int = 10, level_scale: str = "linear", - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Hierarchical Documents and Topics", width: int = 1200, height: int = 750, diff --git a/bertopic/plotting/_hierarchy.py b/bertopic/plotting/_hierarchy.py index 177ae874..28b75636 100644 --- a/bertopic/plotting/_hierarchy.py +++ b/bertopic/plotting/_hierarchy.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from typing import Callable, List, Union +from collections.abc import Callable from scipy.sparse import csr_matrix from scipy.cluster import hierarchy as sch from sklearn.metrics.pairwise import cosine_similarity @@ -16,10 +16,10 @@ def visualize_hierarchy( topic_model, orientation: str = "left", - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int | None = None, use_ctfidf: bool = True, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Hierarchical Clustering", width: int = 1000, height: int = 600, @@ -235,7 +235,7 @@ def _get_annotations( distance_function: Callable[[csr_matrix], csr_matrix], orientation: str, custom_labels: bool = False, -) -> List[List[str]]: +) -> list[list[str]]: """Get annotations by replicating linkage function calculation in scipy. Arguments: diff --git a/bertopic/plotting/_term_rank.py b/bertopic/plotting/_term_rank.py index 1f7e7a1f..023b921d 100644 --- a/bertopic/plotting/_term_rank.py +++ b/bertopic/plotting/_term_rank.py @@ -1,13 +1,12 @@ import numpy as np -from typing import List, Union import plotly.graph_objects as go def visualize_term_rank( topic_model, - topics: List[int] | None = None, + topics: list[int] | None = None, log_scale: bool = False, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Term score decline per Topic", width: int = 800, height: int = 500, diff --git a/bertopic/plotting/_topics.py b/bertopic/plotting/_topics.py index 7dba9a4a..da27aad2 100644 --- a/bertopic/plotting/_topics.py +++ b/bertopic/plotting/_topics.py @@ -8,7 +8,6 @@ except (ImportError, ModuleNotFoundError): HAS_UMAP = False -from typing import List, Union from sklearn.preprocessing import MinMaxScaler from bertopic._utils import select_topic_representation import plotly.express as px @@ -17,10 +16,10 @@ def visualize_topics( topic_model, - topics: List[int] | None = None, + topics: list[int] | None = None, top_n_topics: int | None = None, use_ctfidf: bool = False, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Intertopic Distance Map", width: int = 650, height: int = 650, @@ -120,7 +119,7 @@ def visualize_topics( return _plotly_topic_visualization(df, topic_list, title, width, height) -def _plotly_topic_visualization(df: pd.DataFrame, topic_list: List[str], title: str, width: int, height: int): +def _plotly_topic_visualization(df: pd.DataFrame, topic_list: list[str], title: str, width: int, height: int): """Create plotly-based visualization of topics with a slider for topic selection.""" def get_color(topic_selected): diff --git a/bertopic/plotting/_topics_over_time.py b/bertopic/plotting/_topics_over_time.py index 9966e1c5..3614996b 100644 --- a/bertopic/plotting/_topics_over_time.py +++ b/bertopic/plotting/_topics_over_time.py @@ -1,5 +1,4 @@ import pandas as pd -from typing import List, Union import plotly.graph_objects as go from sklearn.preprocessing import normalize @@ -8,9 +7,9 @@ def visualize_topics_over_time( topic_model, topics_over_time: pd.DataFrame, top_n_topics: int | None = None, - topics: List[int] | None = None, + topics: list[int] | None = None, normalize_frequency: bool = False, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Topics over Time", width: int = 1250, height: int = 450, diff --git a/bertopic/plotting/_topics_per_class.py b/bertopic/plotting/_topics_per_class.py index c7fafc3c..946b6d93 100644 --- a/bertopic/plotting/_topics_per_class.py +++ b/bertopic/plotting/_topics_per_class.py @@ -1,5 +1,4 @@ import pandas as pd -from typing import List, Union import plotly.graph_objects as go from sklearn.preprocessing import normalize @@ -8,9 +7,9 @@ def visualize_topics_per_class( topic_model, topics_per_class: pd.DataFrame, top_n_topics: int = 10, - topics: List[int] | None = None, + topics: list[int] | None = None, normalize_frequency: bool = False, - custom_labels: Union[bool, str] = False, + custom_labels: bool | str = False, title: str = "Topics per Class", width: int = 1250, height: int = 900, diff --git a/bertopic/representation/_base.py b/bertopic/representation/_base.py index 63feeda9..6af9d1f4 100644 --- a/bertopic/representation/_base.py +++ b/bertopic/representation/_base.py @@ -1,7 +1,7 @@ import pandas as pd from scipy.sparse import csr_matrix from sklearn.base import BaseEstimator -from typing import Mapping, List, Tuple +from collections.abc import Mapping class BaseRepresentation(BaseEstimator): @@ -12,8 +12,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Each representation model that inherits this class will have diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index 9ccd5ae5..a0085db9 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -2,7 +2,7 @@ import pandas as pd from tqdm import tqdm from scipy.sparse import csr_matrix -from typing import Mapping, List, Tuple, Union, Callable +from collections.abc import Mapping, Callable from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters @@ -118,7 +118,7 @@ def __init__( nr_docs: int = 4, diversity: float | None = None, doc_length: int | None = None, - tokenizer: Union[str, Callable] | None = None, + tokenizer: str | Callable | None = None, ): self.client = client self.model = model @@ -140,8 +140,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Arguments: diff --git a/bertopic/representation/_keybert.py b/bertopic/representation/_keybert.py index b68bb1ed..3887ebf5 100644 --- a/bertopic/representation/_keybert.py +++ b/bertopic/representation/_keybert.py @@ -3,7 +3,7 @@ from packaging import version from scipy.sparse import csr_matrix -from typing import Mapping, List, Tuple, Union +from collections.abc import Mapping from sklearn.metrics.pairwise import cosine_similarity from bertopic.representation._base import BaseRepresentation from sklearn import __version__ as sklearn_version @@ -70,9 +70,9 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], + topics: Mapping[str, list[tuple[str, float]]], embeddings: np.ndarray = None, - ) -> Mapping[str, List[Tuple[str, float]]]: + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Arguments: @@ -113,8 +113,8 @@ def _extract_candidate_words( self, topic_model, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """For each topic, extract candidate words based on the c-TF-IDF representation. @@ -156,11 +156,11 @@ def _extract_candidate_words( def _extract_embeddings( self, topic_model, - topics: Mapping[str, List[Tuple[str, float]]], - representative_docs: List[str], - repr_doc_indices: List[List[int]], + topics: Mapping[str, list[tuple[str, float]]], + representative_docs: list[str], + repr_doc_indices: list[list[int]], repr_embeddings: np.ndarray = None, - ) -> Union[np.ndarray, List[str]]: + ) -> np.ndarray | list[str]: """Extract the representative document embeddings and create topic embeddings. Then extract word embeddings and calculate the cosine similarity between topic embeddings and the word embeddings. Topic embeddings are the average of @@ -193,10 +193,10 @@ def _extract_embeddings( def _extract_top_words( self, - vocab: List[str], - topics: Mapping[str, List[Tuple[str, float]]], + vocab: list[str], + topics: Mapping[str, list[tuple[str, float]]], sim: np.ndarray, - ) -> Mapping[str, List[Tuple[str, float]]]: + ) -> Mapping[str, list[tuple[str, float]]]: """Extract the top n words per topic based on the similarity matrix between topics and words. diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index b80d9b77..7399047b 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -1,7 +1,7 @@ import pandas as pd from langchain.docstore.document import Document from scipy.sparse import csr_matrix -from typing import Callable, Mapping, List, Tuple, Union +from collections.abc import Callable, Mapping from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters @@ -137,7 +137,7 @@ def __init__( nr_docs: int = 4, diversity: float | None = None, doc_length: int | None = None, - tokenizer: Union[str, Callable] | None = None, + tokenizer: str | Callable | None = None, chain_config=None, ): self.chain = chain @@ -155,8 +155,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, int]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, int]]]: """Extract topics. Arguments: @@ -179,7 +179,7 @@ def extract_topics( ) # Generate label using langchain's batch functionality - chain_docs: List[List[Document]] = [ + chain_docs: list[list[Document]] = [ [ Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc)) for doc in docs diff --git a/bertopic/representation/_litellm.py b/bertopic/representation/_litellm.py index c9f69ae8..97002594 100644 --- a/bertopic/representation/_litellm.py +++ b/bertopic/representation/_litellm.py @@ -3,7 +3,8 @@ import pandas as pd from tqdm import tqdm from scipy.sparse import csr_matrix -from typing import Mapping, List, Tuple, Any +from typing import Any +from collections.abc import Mapping from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import retry_with_exponential_backoff @@ -101,8 +102,8 @@ def __init__( del self.generator_kwargs["prompt"] def extract_topics( - self, topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: + self, topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, topics: Mapping[str, list[tuple[str, float]]] + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Arguments: diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py index 730a5f55..17496e54 100644 --- a/bertopic/representation/_llamacpp.py +++ b/bertopic/representation/_llamacpp.py @@ -2,7 +2,8 @@ from tqdm import tqdm from scipy.sparse import csr_matrix from llama_cpp import Llama -from typing import Mapping, List, Tuple, Any, Union, Callable +from typing import Any +from collections.abc import Mapping, Callable from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters @@ -114,14 +115,14 @@ class LlamaCPP(BaseRepresentation): def __init__( self, - model: Union[str, Llama], + model: str | Llama, prompt: str | None = None, system_prompt: str | None = None, pipeline_kwargs: Mapping[str, Any] = {}, nr_docs: int = 4, diversity: float | None = None, doc_length: int | None = None, - tokenizer: Union[str, Callable] | None = None, + tokenizer: str | Callable | None = None, ): if isinstance(model, str): self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="ChatML") @@ -151,8 +152,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topic representations and return a single label. Arguments: diff --git a/bertopic/representation/_mmr.py b/bertopic/representation/_mmr.py index b3b1b232..3eb43dbd 100644 --- a/bertopic/representation/_mmr.py +++ b/bertopic/representation/_mmr.py @@ -1,7 +1,7 @@ import warnings import numpy as np import pandas as pd -from typing import List, Mapping, Tuple +from collections.abc import Mapping from scipy.sparse import csr_matrix from sklearn.metrics.pairwise import cosine_similarity from bertopic.representation._base import BaseRepresentation @@ -45,8 +45,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topic representations. Arguments: @@ -86,10 +86,10 @@ def extract_topics( def mmr( doc_embedding: np.ndarray, word_embeddings: np.ndarray, - words: List[str], + words: list[str], diversity: float = 0.1, top_n: int = 10, -) -> List[str]: +) -> list[str]: """Maximal Marginal Relevance. Arguments: diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 71eb8c9a..d4473f4b 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -3,7 +3,8 @@ import pandas as pd from tqdm import tqdm from scipy.sparse import csr_matrix -from typing import Mapping, List, Tuple, Any, Union, Callable +from typing import Any +from collections.abc import Mapping, Callable from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import ( retry_with_exponential_backoff, @@ -142,7 +143,7 @@ def __init__( nr_docs: int = 4, diversity: float | None = None, doc_length: int | None = None, - tokenizer: Union[str, Callable] | None = None, + tokenizer: str | Callable | None = None, **kwargs, ): self.client = client @@ -184,8 +185,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Arguments: diff --git a/bertopic/representation/_pos.py b/bertopic/representation/_pos.py index a0f32d60..46d74106 100644 --- a/bertopic/representation/_pos.py +++ b/bertopic/representation/_pos.py @@ -7,7 +7,7 @@ from packaging import version from scipy.sparse import csr_matrix -from typing import List, Mapping, Tuple, Union +from collections.abc import Mapping from sklearn import __version__ as sklearn_version from bertopic.representation._base import BaseRepresentation @@ -65,9 +65,9 @@ class PartOfSpeech(BaseRepresentation): def __init__( self, - model: Union[str, Language] = "en_core_web_sm", + model: str | Language = "en_core_web_sm", top_n_words: int = 10, - pos_patterns: List[str] | None = None, + pos_patterns: list[str] | None = None, ): if isinstance(model, str): self.model = spacy.load(model) @@ -96,8 +96,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Arguments: diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py index 8c8e36f6..963bb67c 100644 --- a/bertopic/representation/_textgeneration.py +++ b/bertopic/representation/_textgeneration.py @@ -3,7 +3,8 @@ from scipy.sparse import csr_matrix from transformers import pipeline, set_seed from transformers.pipelines.base import Pipeline -from typing import Mapping, List, Tuple, Any, Union, Callable +from typing import Any +from collections.abc import Mapping, Callable from bertopic.representation._base import BaseRepresentation from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters @@ -84,14 +85,14 @@ class TextGeneration(BaseRepresentation): def __init__( self, - model: Union[str, pipeline], + model: str | Pipeline, prompt: str | None = None, pipeline_kwargs: Mapping[str, Any] = {}, random_state: int = 42, nr_docs: int = 4, diversity: float | None = None, doc_length: int | None = None, - tokenizer: Union[str, Callable] | None = None, + tokenizer: str | Callable | None = None, ): self.random_state = random_state set_seed(random_state) @@ -121,8 +122,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topic representations and return a single label. Arguments: diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index 6a0305db..1300d97a 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -1,9 +1,9 @@ import random import time -from typing import Union +from collections.abc import Callable -def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str: +def truncate_document(topic_model, doc_length: int | None, tokenizer: str | Callable, document: str) -> str: """Truncate a document to a certain length. If you want to add a custom tokenizer, then it will need to have a `decode` and @@ -59,7 +59,7 @@ def decode(self, doc_chunks): return document -def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, ValueError]: +def validate_truncate_document_parameters(tokenizer, doc_length) -> None | ValueError: """Validates parameters that are used in the function `truncate_document`.""" if tokenizer is None and doc_length is not None: raise ValueError( diff --git a/bertopic/representation/_visual.py b/bertopic/representation/_visual.py index 8c98d5a6..aceb7c79 100644 --- a/bertopic/representation/_visual.py +++ b/bertopic/representation/_visual.py @@ -4,7 +4,7 @@ from PIL import Image from tqdm import tqdm from scipy.sparse import csr_matrix -from typing import Mapping, List, Tuple, Union +from collections.abc import Mapping from transformers.pipelines import Pipeline, pipeline from bertopic.representation._mmr import mmr @@ -49,9 +49,9 @@ def __init__( self, nr_repr_images: int = 9, nr_samples: int = 500, - image_height: Tuple[int, int] = 600, + image_height: tuple[int, int] = 600, image_squares: bool = False, - image_to_text_model: Union[str, Pipeline] = None, + image_to_text_model: str | Pipeline = None, batch_size: int = 32, ): self.nr_repr_images = nr_repr_images @@ -78,8 +78,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Arguments: @@ -128,7 +128,7 @@ def extract_topics( return representative_images - def _convert_image_to_text(self, images: List[str], verbose: bool = False) -> List[str]: + def _convert_image_to_text(self, images: list[str], verbose: bool = False) -> list[str]: """Convert a list of images to captions. Arguments: @@ -268,7 +268,7 @@ def get_concat_tile_resize(im_list_2d, image_height=600, image_squares=False): images = [get_concat_h_multi_resize(im_list_h) for im_list_h in images] img = get_concat_v_multi_resize(images) height_percentage = image_height / float(img.size[1]) - adjusted_width = int((float(img.size[0]) * float(height_percentage))) + adjusted_width = int(float(img.size[0]) * float(height_percentage)) img = img.resize((adjusted_width, image_height), Image.Resampling.LANCZOS) return img diff --git a/bertopic/representation/_zeroshot.py b/bertopic/representation/_zeroshot.py index ff9d54da..a2098d61 100644 --- a/bertopic/representation/_zeroshot.py +++ b/bertopic/representation/_zeroshot.py @@ -2,7 +2,8 @@ from transformers import pipeline from transformers.pipelines.base import Pipeline from scipy.sparse import csr_matrix -from typing import Mapping, List, Tuple, Any +from typing import Any +from collections.abc import Mapping from bertopic.representation._base import BaseRepresentation @@ -37,7 +38,7 @@ class ZeroShotClassification(BaseRepresentation): def __init__( self, - candidate_topics: List[str], + candidate_topics: list[str], model: str = "facebook/bart-large-mnli", pipeline_kwargs: Mapping[str, Any] = {}, min_prob: float = 0.8, @@ -61,8 +62,8 @@ def extract_topics( topic_model, documents: pd.DataFrame, c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]], - ) -> Mapping[str, List[Tuple[str, float]]]: + topics: Mapping[str, list[tuple[str, float]]], + ) -> Mapping[str, list[tuple[str, float]]]: """Extract topics. Arguments: diff --git a/bertopic/vectorizers/_ctfidf.py b/bertopic/vectorizers/_ctfidf.py index 5ce58ed8..8bfb74fb 100644 --- a/bertopic/vectorizers/_ctfidf.py +++ b/bertopic/vectorizers/_ctfidf.py @@ -1,4 +1,3 @@ -from typing import List from sklearn.feature_extraction.text import TfidfTransformer from sklearn.preprocessing import normalize from sklearn.utils import check_array @@ -42,14 +41,14 @@ def __init__( self, bm25_weighting: bool = False, reduce_frequent_words: bool = False, - seed_words: List[str] | None = None, + seed_words: list[str] | None = None, seed_multiplier: float = 2, ): self.bm25_weighting = bm25_weighting self.reduce_frequent_words = reduce_frequent_words self.seed_words = seed_words self.seed_multiplier = seed_multiplier - super(ClassTfidfTransformer, self).__init__() + super().__init__() def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None): """Learn the idf vector (global term weights). diff --git a/bertopic/vectorizers/_online_cv.py b/bertopic/vectorizers/_online_cv.py index 4f40ddbf..3fa64d7d 100644 --- a/bertopic/vectorizers/_online_cv.py +++ b/bertopic/vectorizers/_online_cv.py @@ -1,6 +1,5 @@ import numpy as np from itertools import chain -from typing import List from scipy import sparse from scipy.sparse import csr_matrix @@ -71,9 +70,9 @@ class OnlineCountVectorizer(CountVectorizer): def __init__(self, decay: float | None = None, delete_min_df: float | None = None, **kwargs): self.decay = decay self.delete_min_df = delete_min_df - super(OnlineCountVectorizer, self).__init__(**kwargs) + super().__init__(**kwargs) - def partial_fit(self, raw_documents: List[str]) -> None: + def partial_fit(self, raw_documents: list[str]) -> None: """Perform a partial fit and update vocabulary with OOV tokens. Arguments: @@ -99,7 +98,7 @@ def partial_fit(self, raw_documents: List[str]) -> None: return self - def update_bow(self, raw_documents: List[str]) -> csr_matrix: + def update_bow(self, raw_documents: list[str]) -> csr_matrix: """Create or update the bag-of-words matrix. Update the bag-of-words matrix by adding the newly transformed diff --git a/pyproject.toml b/pyproject.toml index d3019893..aa155511 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,6 +112,7 @@ select = [ "D", # pydocstyle "PD", # pandas-vet "RUF", # ruff + "UP", # pyupgrade ] ignore = [ diff --git a/tests/test_utils.py b/tests/test_utils.py index 90876e76..5ce13594 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,6 @@ import pytest import logging import numpy as np -from typing import List from bertopic._utils import ( check_documents_type, check_embeddings_shape, @@ -40,7 +39,7 @@ def test_check_embeddings_shape(): def test_make_unique_distances(): - def check_dists(dists: List[float], noise_max: float): + def check_dists(dists: list[float], noise_max: float): unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max) assert len(unique_dists) == len(dists), "The number of elements must be the same" assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique"