diff --git a/docs/cvp.md b/docs/cvp.md new file mode 100644 index 00000000..289b3e6d --- /dev/null +++ b/docs/cvp.md @@ -0,0 +1,79 @@ +# Concept Vector Projection + +Concept Vector Projection is an embedding-based method for extracting continuous sentiment (or other) scores from free-text documents. + +
+ +
Figure 1: Schematic Overview of Concept Vector Projection.
Figure from Lyngbæk et al. (2025)
+
+ +The method rests on the idea that one can construct a _concept vector_ by encoding positive and negative _seed phrases_ with a transformer, then taking the difference of these mean vectors. +We can then project other documents' embeddings onto these concept vectors by taking the dot product with the concept vector, thereby giving continuous scores on how related documents are to a given concept. + +## Usage + +### Single Concept + +When projecting onto a single concept, you should specify the seeds as a tuple of positive and negative phrases. + +```python +from turftopic import ConceptVectorProjection + +positive = [ + "I love this product", + "This is absolutely lovely", + "My daughter is going to adore this" +] +negative = [ + "This product is not at all as advertised, I'm very displeased", + "I hate this", + "What a horrible way to deal with people" +] +cvp = ConceptVectorProjection(seeds=(positive, negative)) + +test_documents = ["My cute little doggy", "Few this is digusting"] +doc_concept_matrix = cvp.transform(test_documents) +print(doc_concept_matrix) +``` + +```python +[[0.24265897] + [0.01709663]] +``` + +### Multiple Concepts + +When projecting documents to multiple concepts at once, you will need to specify seeds for each concept, as well as its name. 
+Internally this is handled with an `OrderedDict`, which you can either specify yourself, or Turftopic can do it for you: + +```python +import pandas as pd +from collections import OrderedDict + +cuteness_seeds = (["Absolutely adorable", "I love how he dances with his little feet"], ["What a big slob of an abomination", "A suspicious old man sat next to me on the bus today"]) +bullish_seeds = (["We are going to the moon", "This stock will prove an incredible investment"], ["I will short the hell out of them", "Uber stocks drop 7% in value after down-time."]) + +# Either specify it like this: +seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)] +# or as an OrderedDict: +seeds = OrderedDict([("cuteness", cuteness_seeds), ("bullish", bullish_seeds)]) +cvp = ConceptVectorProjection(seeds=seeds) + +test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"] +doc_concept_matrix = cvp.transform(test_documents) +concept_df = pd.DataFrame(doc_concept_matrix, columns=cvp.get_feature_names_out()) +print(concept_df) +``` + +```python + cuteness bullish +0 0.085957 0.288779 +1 0.269454 0.009495 +``` + +## API Reference + + +::: turftopic.models.cvp.ConceptVectorProjection + + diff --git a/docs/images/cvp.png b/docs/images/cvp.png new file mode 100644 index 00000000..112a5e42 Binary files /dev/null and b/docs/images/cvp.png differ diff --git a/mkdocs.yml b/mkdocs.yml index 69108c4a..1d3e5f68 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -33,6 +33,8 @@ nav: - Clustering Models (BERTopic & Top2Vec): clustering.md - Autoencoding Models (ZeroShotTM & CombinedTM): ctm.md - FASTopic: FASTopic.md + - Other Models (e.g. 
Sentiment Analysis): + - Concept Vector Projection (Continuous Sentiment Scoring): cvp.md - Embedding Models: encoders.md - Vectorizers (Term extraction): vectorizers.md - Topic Analysis and Naming with LLMs: analyzers.md diff --git a/tests/test_cvp.py b/tests/test_cvp.py new file mode 100644 index 00000000..8548dad0 --- /dev/null +++ b/tests/test_cvp.py @@ -0,0 +1,25 @@ +def test_cvp(): + from turftopic import ConceptVectorProjection + + cuteness_seeds = ( + ["Absolutely adorable", "I love how he dances with his little feet"], + [ + "What a big slob of an abomination", + "A suspicious old man sat next to me on the bus today", + ], + ) + bullish_seeds = ( + [ + "We are going to the moon", + "This stock will prove an incredible investment", + ], + [ + "I will short the hell out of them", + "Uber stocks drop 7% in value after down-time.", + ], + ) + seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)] + cvp = ConceptVectorProjection(seeds=seeds) + test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"] + doc_concept_matrix = cvp.transform(test_documents) + assert doc_concept_matrix.shape == (2, 2) diff --git a/turftopic/__init__.py b/turftopic/__init__.py index 8372841d..a763b262 100644 --- a/turftopic/__init__.py +++ b/turftopic/__init__.py @@ -3,6 +3,7 @@ from turftopic.base import ContextualModel from turftopic.error import NotInstalled from turftopic.models.cluster import BERTopic, ClusteringTopicModel, Top2Vec +from turftopic.models.cvp import ConceptVectorProjection from turftopic.models.decomp import S3, SemanticSignalSeparation from turftopic.models.fastopic import FASTopic from turftopic.models.gmm import GMM @@ -34,4 +35,5 @@ "create_concept_browser", "S3", "SensTopic", + "ConceptVectorProjection", ] diff --git a/turftopic/encoders/utils.py b/turftopic/encoders/utils.py index ba3a749e..b709617c 100644 --- a/turftopic/encoders/utils.py +++ b/turftopic/encoders/utils.py @@ -18,43 +18,63 @@ def batched(iterable, n: int) -> 
Iterable[List[str]]: def encode_chunks( encoder, - sentences, + texts, batch_size=64, window_size=50, step_size=40, - return_chunks=False, - show_progress_bar=False, ): - chunks = [] + """ + Returns + ------- + chunk_embeddings: list[np.ndarray] + Embedding matrix of chunks for each document. + chunk_positions: list[list[tuple[int, int]]] + List of start and end character index of chunks for each document. + """ + chunk_positions = [] chunk_embeddings = [] for start_index in trange( 0, - len(sentences), + len(texts), batch_size, desc="Encoding batches...", - disable=not show_progress_bar, ): - batch = sentences[start_index : start_index + batch_size] + batch = texts[start_index : start_index + batch_size] features = encoder.tokenize(batch) with torch.no_grad(): output_features = encoder.forward(features) n_tokens = output_features["attention_mask"].sum(axis=1) + # Find first nonzero elements in each document + # The document could be padded from the left, so we have to watch out for this. 
+ start_token = torch.argmax( + (output_features["attention_mask"] > 0).to(torch.long), axis=1 + ) + end_token = start_token + n_tokens for i_doc in range(len(batch)): - for chunk_start in range(0, n_tokens[i_doc], step_size): - chunk_end = min(chunk_start + window_size, n_tokens[i_doc]) + _chunk_embeddings = [] + _chunk_positions = [] + for chunk_start in range( + start_token[i_doc], end_token[i_doc], step_size + ): + chunk_end = min(chunk_start + window_size, end_token[i_doc]) _emb = output_features["token_embeddings"][ i_doc, chunk_start:chunk_end, : ].mean(axis=0) - chunk_embeddings.append(_emb) - if return_chunks: - chunks.append( - encoder.tokenizer.decode( - features["input_ids"][i_doc, chunk_start:chunk_end] - ) - .replace("[CLS]", "") - .replace("[SEP]", "") + _chunk_embeddings.append(_emb) + chunk_text = ( + encoder.tokenizer.decode( + features["input_ids"][i_doc, chunk_start:chunk_end], + skip_special_tokens=True, ) - if not return_chunks: - chunks = None - chunk_embeddings = np.stack(chunk_embeddings) - return chunk_embeddings, chunks + .replace("[CLS]", "") + .replace("[SEP]", "") + .strip() + ) + doc_text = texts[start_index + i_doc] + start_char = doc_text.find(chunk_text) + end_char = start_char + len(chunk_text) + _chunk_positions.append((start_char, end_char)) + _chunk_embeddings = np.stack(_chunk_embeddings) + chunk_embeddings.append(_chunk_embeddings) + chunk_positions.append(_chunk_positions) + return chunk_embeddings, chunk_positions diff --git a/turftopic/models/cvp.py b/turftopic/models/cvp.py new file mode 100644 index 00000000..6a8a7b8f --- /dev/null +++ b/turftopic/models/cvp.py @@ -0,0 +1,149 @@ +import json +import tempfile +from collections import OrderedDict +from pathlib import Path +from typing import Union + +import joblib +import numpy as np +from huggingface_hub import HfApi +from sentence_transformers import SentenceTransformer +from sklearn.base import BaseEstimator, TransformerMixin + +from turftopic.base import Encoder +from 
turftopic.encoders.multimodal import MultimodalEncoder +from turftopic.serialization import create_readme, get_package_versions + +Seeds = tuple[list[str], list[str]] + + +class ConceptVectorProjection(BaseEstimator, TransformerMixin): + """Concept Vector Projection model from [Lyngbæk et al. (2025)](https://doi.org/10.63744/nVu1Zq5gRkuD) + Can be used to project document embeddings onto a difference projection vector between positive and negative seed phrases. + The primary use case is sentiment analysis, and continuous sentiment scores, + especially for languages where dedicated models are not available. + + Parameters + ---------- + seeds: (list[str], list[str]) or list of (str, (list[str], list[str])) + If you want to project to a single concept, then + a tuple of (list of positive terms, list of negative terms).
+ If there are multiple concepts, they should be specified as (name, Seeds) tuples in a list. + Alternatively, seeds can be an OrderedDict with the names of the concepts being the keys, + and the tuples of negative and positive seeds as the values. + encoder: str or SentenceTransformer + Model to produce document representations, paraphrase-multilingual-mpnet-base-v2 is the default + per Lyngbæk et al. (2025). + """ + + def __init__( + self, + seeds: Union[Seeds, list[tuple[str, Seeds]], OrderedDict[str, Seeds]], + encoder: Union[ + Encoder, str, MultimodalEncoder + ] = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", + ): + self.seeds = seeds + if isinstance(seeds, OrderedDict): + self._seeds = seeds + elif ( + (len(seeds) == 2) + and (isinstance(seeds, tuple)) + and (isinstance(seeds[0][0], str)) + ): + self._seeds = OrderedDict([("default", seeds)]) + else: + self._seeds = OrderedDict(seeds) + self.encoder = encoder + if isinstance(encoder, str): + self.encoder_ = SentenceTransformer(encoder) + else: + self.encoder_ = encoder + self.classes_ = np.array([name for name in self._seeds]) + self.concept_matrix_ = [] + for _, (positive, negative) in self._seeds.items(): + positive_emb = self.encoder_.encode(positive) + negative_emb = self.encoder_.encode(negative) + cv = np.mean(positive_emb, axis=0) - np.mean(negative_emb, axis=0) + self.concept_matrix_.append(cv / np.linalg.norm(cv)) + self.concept_matrix_ = np.stack(self.concept_matrix_) + + def get_feature_names_out(self): + """Returns concept names in an array.""" + return self.classes_ + + def fit_transform(self, raw_documents=None, y=None, embeddings=None): + """Project documents onto the concept vectors. + + Parameters + ---------- + raw_documents: list[str] or None + List of documents to project to the concept vectors. + embeddings: ndarray of shape (n_documents, n_dimensions) + Document embeddings (has to be created with the same encoder as the concept vectors.) 
+ + Returns + ------- + document_concept_matrix: ndarray of shape (n_documents, n_concepts) + Prevalence of each concept in each document. + """ + if (raw_documents is None) and (embeddings is None): + raise ValueError( + "Either embeddings or raw_documents has to be passed, both are None." + ) + if embeddings is None: + embeddings = self.encoder_.encode(raw_documents) + return embeddings @ self.concept_matrix_.T + + def transform(self, raw_documents=None, embeddings=None): + """Project documents onto the concept vectors. + + Parameters + ---------- + raw_documents: list[str] or None + List of documents to project to the concept vectors. + embeddings: ndarray of shape (n_documents, n_dimensions) + Document embeddings (has to be created with the same encoder as the concept vectors.) + + Returns + ------- + document_concept_matrix: ndarray of shape (n_documents, n_concepts) + Prevalence of each concept in each document. + """ + return self.fit_transform(raw_documents, embeddings=embeddings) + + def to_disk(self, out_dir: Union[Path, str]): + """Persists model to directory on your machine. + + Parameters + ---------- + out_dir: Path | str + Directory to save the model to. + """ + out_dir = Path(out_dir) + out_dir.mkdir(exist_ok=True) + package_versions = get_package_versions() + with out_dir.joinpath("package_versions.json").open("w") as ver_file: + ver_file.write(json.dumps(package_versions)) + joblib.dump(self, out_dir.joinpath("model.joblib")) + + def push_to_hub(self, repo_id: str): + """Uploads model to HuggingFace Hub + + Parameters + ---------- + repo_id: str + Repository to upload the model to. 
+ """ + api = HfApi() + api.create_repo(repo_id, exist_ok=True) + with tempfile.TemporaryDirectory() as tmp_dir: + readme_path = Path(tmp_dir).joinpath("README.md") + with readme_path.open("w") as readme_file: + readme_file.write(create_readme(self, repo_id)) + self.to_disk(tmp_dir) + api.upload_folder( + folder_path=tmp_dir, + repo_id=repo_id, + repo_type="model", + ) diff --git a/turftopic/vectorizers/phrases.py b/turftopic/vectorizers/phrases.py new file mode 100644 index 00000000..b4a5a8f0 --- /dev/null +++ b/turftopic/vectorizers/phrases.py @@ -0,0 +1,82 @@ +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_extraction.text import CountVectorizer + + +class PhraseVectorizer(BaseEstimator, TransformerMixin): + """NPMI-score-based phrase extraction.""" + + def __init__( + self, + max_ngram=3, + min_df=10, + max_df=1.0, + threshold=0.5, + stop_words="english", + smoothing=5, + ): + self.stop_words = stop_words + self.threshold = threshold + self.max_ngram = max_ngram + self.min_df = min_df + self.max_df = max_df + self.smoothing = smoothing + self.ngram_range = (1, max_ngram) + + def fit_transform(self, raw_documents, y=None): + self.vectorizer_ = CountVectorizer( + stop_words=self.stop_words, + min_df=self.min_df, + max_df=self.max_df, + ngram_range=self.ngram_range, + ) + dtm = self.vectorizer_.fit_transform(raw_documents) + all_vocab = self.vectorizer_.get_feature_names_out() + token_count = dict( + zip( + self.vectorizer_.get_feature_names_out(), + np.ravel(dtm.sum(axis=1)), + ) + ) + counts = np.ravel(dtm.sum(axis=1)) + word_indices = [ + i + for word, i in self.vectorizer_.vocabulary_.items() + if len(word.split()) == 1 + ] + n_ws = dtm[:, word_indices].sum() + len(word_indices) * self.smoothing + ngram_indices = [] + for i, (token, n_w1w2) in enumerate(zip(all_vocab, counts)): + _words = token.split() + if len(_words) == 1: + continue + w1, w2 = _words[0], _words[-1] + n_w1 = token_count.get(w1, None) + n_w2 
= token_count.get(w2, None) + if (n_w1 is None) or (n_w2 is None): + continue + p_w1w2 = (n_w1w2 + self.smoothing) / n_ws + p_w1 = (n_w1 + self.smoothing) / n_ws + p_w2 = (n_w2 + self.smoothing) / n_ws + pmi = np.log2(p_w1w2 / (p_w1 * p_w2)) + npmi = pmi / (-np.log2(p_w1w2)) + if npmi > self.threshold: + ngram_indices.append(i) + self.indices_ = np.array(word_indices + ngram_indices) + self.feature_names_out_ = all_vocab[self.indices_] + self.vocabulary_ = dict( + zip(self.feature_names_out_, range(len(self.feature_names_out_))) + ) + dtm = dtm[:, self.indices_] + return dtm + + def transform(self, raw_documents): + dtm = self.vectorizer_.transform(raw_documents) + return dtm[:, self.indices_] + + def fit(self, raw_documents, y=None): + self.fit_transform(raw_documents, y) + return self + + def get_feature_names_out(self): + return self.feature_names_out_