diff --git a/docs/cvp.md b/docs/cvp.md
new file mode 100644
index 00000000..289b3e6d
--- /dev/null
+++ b/docs/cvp.md
@@ -0,0 +1,79 @@
+# Concept Vector Projection
+
+Concept Vector Projection is an embedding-based method for extracting continuous sentiment (or other) scores from free-text documents.
+
+
+
+ Figure 1: Schematic Overview of Concept Vector Projection. Figure from Lyngbæk et al. (2025)
+
+
+The method rests on the idea that one can construct a _concept vector_ by encoding positive and negative _seed phrases_ with a transformer, then taking the difference of these mean vectors.
+We can then project other documents' embeddings onto these concept vectors by taking the dot product with the concept vector, thereby giving continuous scores on how related documents are to a given concept.
+
+## Usage
+
+### Single Concept
+
+When projecting onto a single concept, you should specify the seeds as a tuple of positive and negative phrases.
+
+```python
+from turftopic import ConceptVectorProjection
+
+positive = [
+ "I love this product",
+ "This is absolutely lovely",
+ "My daughter is going to adore this"
+]
+negative = [
+ "This product is not at all as advertised, I'm very displeased",
+ "I hate this",
+ "What a horrible way to deal with people"
+]
+cvp = ConceptVectorProjection(seeds=(positive, negative))
+
+test_documents = ["My cute little doggy", "Eew, this is disgusting"]
+doc_concept_matrix = cvp.transform(test_documents)
+print(doc_concept_matrix)
+```
+
+```python
+[[0.24265897]
+ [0.01709663]]
+```
+
+### Multiple Concepts
+
+When projecting documents to multiple concepts at once, you will need to specify seeds for each concept, as well as its name.
+Internally this is handled with an `OrderedDict`, which you can either specify yourself, or Turftopic can do it for you:
+
+```python
+import pandas as pd
+from collections import OrderedDict
+
+cuteness_seeds = (["Absolutely adorable", "I love how he dances with his little feet"], ["What a big slob of an abomination", "A suspicious old man sat next to me on the bus today"])
+bullish_seeds = (["We are going to the moon", "This stock will prove an incredible investment"], ["I will short the hell out of them", "Uber stocks drop 7% in value after down-time."])
+
+# Either specify it like this:
+seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)]
+# or as an OrderedDict:
+seeds = OrderedDict([("cuteness", cuteness_seeds), ("bullish", bullish_seeds)])
+cvp = ConceptVectorProjection(seeds=seeds)
+
+test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"]
+doc_concept_matrix = cvp.transform(test_documents)
+concept_df = pd.DataFrame(doc_concept_matrix, columns=cvp.get_feature_names_out())
+print(concept_df)
+```
+
+```python
+ cuteness bullish
+0 0.085957 0.288779
+1 0.269454 0.009495
+```
+
+## API Reference
+
+
+::: turftopic.models.cvp.ConceptVectorProjection
+
+
diff --git a/docs/images/cvp.png b/docs/images/cvp.png
new file mode 100644
index 00000000..112a5e42
Binary files /dev/null and b/docs/images/cvp.png differ
diff --git a/mkdocs.yml b/mkdocs.yml
index 69108c4a..1d3e5f68 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -33,6 +33,8 @@ nav:
- Clustering Models (BERTopic & Top2Vec): clustering.md
- Autoencoding Models (ZeroShotTM & CombinedTM): ctm.md
- FASTopic: FASTopic.md
+ - Other Models (e.g. Sentiment Analysis):
+ - Concept Vector Projection (Continuous Sentiment Scoring): cvp.md
- Embedding Models: encoders.md
- Vectorizers (Term extraction): vectorizers.md
- Topic Analysis and Naming with LLMs: analyzers.md
diff --git a/tests/test_cvp.py b/tests/test_cvp.py
new file mode 100644
index 00000000..8548dad0
--- /dev/null
+++ b/tests/test_cvp.py
@@ -0,0 +1,25 @@
+def test_cvp():
+ from turftopic import ConceptVectorProjection
+
+ cuteness_seeds = (
+ ["Absolutely adorable", "I love how he dances with his little feet"],
+ [
+ "What a big slob of an abomination",
+ "A suspicious old man sat next to me on the bus today",
+ ],
+ )
+ bullish_seeds = (
+ [
+ "We are going to the moon",
+ "This stock will prove an incredible investment",
+ ],
+ [
+ "I will short the hell out of them",
+ "Uber stocks drop 7% in value after down-time.",
+ ],
+ )
+ seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)]
+ cvp = ConceptVectorProjection(seeds=seeds)
+ test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"]
+ doc_concept_matrix = cvp.transform(test_documents)
+ assert doc_concept_matrix.shape == (2, 2)
diff --git a/turftopic/__init__.py b/turftopic/__init__.py
index 8372841d..a763b262 100644
--- a/turftopic/__init__.py
+++ b/turftopic/__init__.py
@@ -3,6 +3,7 @@
from turftopic.base import ContextualModel
from turftopic.error import NotInstalled
from turftopic.models.cluster import BERTopic, ClusteringTopicModel, Top2Vec
+from turftopic.models.cvp import ConceptVectorProjection
from turftopic.models.decomp import S3, SemanticSignalSeparation
from turftopic.models.fastopic import FASTopic
from turftopic.models.gmm import GMM
@@ -34,4 +35,5 @@
"create_concept_browser",
"S3",
"SensTopic",
+ "ConceptVectorProjection",
]
diff --git a/turftopic/encoders/utils.py b/turftopic/encoders/utils.py
index ba3a749e..b709617c 100644
--- a/turftopic/encoders/utils.py
+++ b/turftopic/encoders/utils.py
@@ -18,43 +18,63 @@ def batched(iterable, n: int) -> Iterable[List[str]]:
def encode_chunks(
encoder,
- sentences,
+ texts,
batch_size=64,
window_size=50,
step_size=40,
- return_chunks=False,
- show_progress_bar=False,
):
- chunks = []
+ """
+ Returns
+ -------
+ chunk_embeddings: list[np.ndarray]
+ Embedding matrix of chunks for each document.
+ chunk_positions: list[list[tuple[int, int]]]
+ List of start and end character index of chunks for each document.
+ """
+ chunk_positions = []
chunk_embeddings = []
for start_index in trange(
0,
- len(sentences),
+ len(texts),
batch_size,
desc="Encoding batches...",
- disable=not show_progress_bar,
):
- batch = sentences[start_index : start_index + batch_size]
+ batch = texts[start_index : start_index + batch_size]
features = encoder.tokenize(batch)
with torch.no_grad():
output_features = encoder.forward(features)
n_tokens = output_features["attention_mask"].sum(axis=1)
+ # Find first nonzero elements in each document
+ # The document could be padded from the left, so we have to watch out for this.
+ start_token = torch.argmax(
+ (output_features["attention_mask"] > 0).to(torch.long), axis=1
+ )
+ end_token = start_token + n_tokens
for i_doc in range(len(batch)):
- for chunk_start in range(0, n_tokens[i_doc], step_size):
- chunk_end = min(chunk_start + window_size, n_tokens[i_doc])
+ _chunk_embeddings = []
+ _chunk_positions = []
+ for chunk_start in range(
+ start_token[i_doc], end_token[i_doc], step_size
+ ):
+ chunk_end = min(chunk_start + window_size, end_token[i_doc])
_emb = output_features["token_embeddings"][
i_doc, chunk_start:chunk_end, :
].mean(axis=0)
- chunk_embeddings.append(_emb)
- if return_chunks:
- chunks.append(
- encoder.tokenizer.decode(
- features["input_ids"][i_doc, chunk_start:chunk_end]
- )
- .replace("[CLS]", "")
- .replace("[SEP]", "")
+ _chunk_embeddings.append(_emb)
+ chunk_text = (
+ encoder.tokenizer.decode(
+ features["input_ids"][i_doc, chunk_start:chunk_end],
+ skip_special_tokens=True,
)
- if not return_chunks:
- chunks = None
- chunk_embeddings = np.stack(chunk_embeddings)
- return chunk_embeddings, chunks
+ .replace("[CLS]", "")
+ .replace("[SEP]", "")
+ .strip()
+ )
+ doc_text = texts[start_index + i_doc]
+ start_char = doc_text.find(chunk_text)
+ end_char = start_char + len(chunk_text)
+ _chunk_positions.append((start_char, end_char))
+ _chunk_embeddings = np.stack(_chunk_embeddings)
+ chunk_embeddings.append(_chunk_embeddings)
+ chunk_positions.append(_chunk_positions)
+ return chunk_embeddings, chunk_positions
diff --git a/turftopic/models/cvp.py b/turftopic/models/cvp.py
new file mode 100644
index 00000000..6a8a7b8f
--- /dev/null
+++ b/turftopic/models/cvp.py
@@ -0,0 +1,149 @@
+import json
+import tempfile
+from collections import OrderedDict
+from pathlib import Path
+from typing import Union
+
+import joblib
+import numpy as np
+from huggingface_hub import HfApi
+from sentence_transformers import SentenceTransformer
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from turftopic.base import Encoder
+from turftopic.encoders.multimodal import MultimodalEncoder
+from turftopic.serialization import create_readme, get_package_versions
+
+Seeds = tuple[list[str], list[str]]
+
+
+class ConceptVectorProjection(BaseEstimator, TransformerMixin):
+ """Concept Vector Projection model from [Lyngbæk et al. (2025)](https://doi.org/10.63744/nVu1Zq5gRkuD)
+ Can be used to project document embeddings onto a difference projection vector between positive and negative seed phrases.
+ The primary use case is sentiment analysis, and continuous sentiment scores,
+ especially for languages where dedicated models are not available.
+
+ Parameters
+ ----------
+ seeds: (list[str], list[str]) or list of (str, (list[str], list[str]))
+ If you want to project to a single concept, then
+        a tuple of (list of positive terms, list of negative terms).
+ If there are multiple concepts, they should be specified as (name, Seeds) tuples in a list.
+ Alternatively, seeds can be an OrderedDict with the names of the concepts being the keys,
+ and the tuples of negative and positive seeds as the values.
+ encoder: str or SentenceTransformer
+ Model to produce document representations, paraphrase-multilingual-mpnet-base-v2 is the default
+ per Lyngbæk et al. (2025).
+ """
+
+ def __init__(
+ self,
+ seeds: Union[Seeds, list[tuple[str, Seeds]], OrderedDict[str, Seeds]],
+ encoder: Union[
+ Encoder, str, MultimodalEncoder
+ ] = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+ ):
+ self.seeds = seeds
+ if isinstance(seeds, OrderedDict):
+ self._seeds = seeds
+ elif (
+ (len(seeds) == 2)
+ and (isinstance(seeds, tuple))
+ and (isinstance(seeds[0][0], str))
+ ):
+ self._seeds = OrderedDict([("default", seeds)])
+ else:
+ self._seeds = OrderedDict(seeds)
+ self.encoder = encoder
+ if isinstance(encoder, str):
+ self.encoder_ = SentenceTransformer(encoder)
+ else:
+ self.encoder_ = encoder
+ self.classes_ = np.array([name for name in self._seeds])
+ self.concept_matrix_ = []
+ for _, (positive, negative) in self._seeds.items():
+ positive_emb = self.encoder_.encode(positive)
+ negative_emb = self.encoder_.encode(negative)
+ cv = np.mean(positive_emb, axis=0) - np.mean(negative_emb, axis=0)
+ self.concept_matrix_.append(cv / np.linalg.norm(cv))
+ self.concept_matrix_ = np.stack(self.concept_matrix_)
+
+ def get_feature_names_out(self):
+ """Returns concept names in an array."""
+ return self.classes_
+
+ def fit_transform(self, raw_documents=None, y=None, embeddings=None):
+ """Project documents onto the concept vectors.
+
+ Parameters
+ ----------
+ raw_documents: list[str] or None
+ List of documents to project to the concept vectors.
+ embeddings: ndarray of shape (n_documents, n_dimensions)
+ Document embeddings (has to be created with the same encoder as the concept vectors.)
+
+ Returns
+ -------
+        document_concept_matrix: ndarray of shape (n_documents, n_concepts)
+            Prevalence of each concept in each document.
+ """
+ if (raw_documents is None) and (embeddings is None):
+ raise ValueError(
+ "Either embeddings or raw_documents has to be passed, both are None."
+ )
+ if embeddings is None:
+ embeddings = self.encoder_.encode(raw_documents)
+ return embeddings @ self.concept_matrix_.T
+
+ def transform(self, raw_documents=None, embeddings=None):
+ """Project documents onto the concept vectors.
+
+ Parameters
+ ----------
+ raw_documents: list[str] or None
+ List of documents to project to the concept vectors.
+ embeddings: ndarray of shape (n_documents, n_dimensions)
+ Document embeddings (has to be created with the same encoder as the concept vectors.)
+
+ Returns
+ -------
+        document_concept_matrix: ndarray of shape (n_documents, n_concepts)
+            Prevalence of each concept in each document.
+ """
+ return self.fit_transform(raw_documents, embeddings=embeddings)
+
+ def to_disk(self, out_dir: Union[Path, str]):
+ """Persists model to directory on your machine.
+
+ Parameters
+ ----------
+ out_dir: Path | str
+ Directory to save the model to.
+ """
+ out_dir = Path(out_dir)
+ out_dir.mkdir(exist_ok=True)
+ package_versions = get_package_versions()
+ with out_dir.joinpath("package_versions.json").open("w") as ver_file:
+ ver_file.write(json.dumps(package_versions))
+ joblib.dump(self, out_dir.joinpath("model.joblib"))
+
+ def push_to_hub(self, repo_id: str):
+ """Uploads model to HuggingFace Hub
+
+ Parameters
+ ----------
+ repo_id: str
+ Repository to upload the model to.
+ """
+ api = HfApi()
+ api.create_repo(repo_id, exist_ok=True)
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ readme_path = Path(tmp_dir).joinpath("README.md")
+ with readme_path.open("w") as readme_file:
+ readme_file.write(create_readme(self, repo_id))
+ self.to_disk(tmp_dir)
+ api.upload_folder(
+ folder_path=tmp_dir,
+ repo_id=repo_id,
+ repo_type="model",
+ )
diff --git a/turftopic/vectorizers/phrases.py b/turftopic/vectorizers/phrases.py
new file mode 100644
index 00000000..b4a5a8f0
--- /dev/null
+++ b/turftopic/vectorizers/phrases.py
@@ -0,0 +1,82 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import CountVectorizer
+
+
+class PhraseVectorizer(BaseEstimator, TransformerMixin):
+ """NPMI-score-based phrase extraction."""
+
+ def __init__(
+ self,
+ max_ngram=3,
+ min_df=10,
+ max_df=1.0,
+ threshold=0.5,
+ stop_words="english",
+ smoothing=5,
+ ):
+ self.stop_words = stop_words
+ self.threshold = threshold
+ self.max_ngram = max_ngram
+ self.min_df = min_df
+ self.max_df = max_df
+ self.smoothing = smoothing
+ self.ngram_range = (1, max_ngram)
+
+    def fit_transform(self, raw_documents, y=None):
+        self.vectorizer_ = CountVectorizer(
+            stop_words=self.stop_words,
+            min_df=self.min_df,
+            max_df=self.max_df,
+            ngram_range=self.ngram_range,
+        )
+        dtm = self.vectorizer_.fit_transform(raw_documents)
+        all_vocab = self.vectorizer_.get_feature_names_out()
+        token_count = dict(
+            zip(
+                self.vectorizer_.get_feature_names_out(),
+                np.ravel(dtm.sum(axis=0)),  # per-term counts: sum over documents
+            )
+        )
+        counts = np.ravel(dtm.sum(axis=0))  # per-term counts, aligned with all_vocab
+        word_indices = [
+            i
+            for word, i in self.vectorizer_.vocabulary_.items()
+            if len(word.split()) == 1
+        ]
+        n_ws = dtm[:, word_indices].sum() + len(word_indices) * self.smoothing
+        ngram_indices = []
+        for i, (token, n_w1w2) in enumerate(zip(all_vocab, counts)):
+            _words = token.split()
+            if len(_words) == 1:
+                continue
+            w1, w2 = _words[0], _words[-1]
+            n_w1 = token_count.get(w1, None)
+            n_w2 = token_count.get(w2, None)
+            if (n_w1 is None) or (n_w2 is None):
+                continue
+            p_w1w2 = (n_w1w2 + self.smoothing) / n_ws
+            p_w1 = (n_w1 + self.smoothing) / n_ws
+            p_w2 = (n_w2 + self.smoothing) / n_ws
+            pmi = np.log2(p_w1w2 / (p_w1 * p_w2))
+            npmi = pmi / (-np.log2(p_w1w2))
+            if npmi > self.threshold:
+                ngram_indices.append(i)
+        self.indices_ = np.array(word_indices + ngram_indices)
+        self.feature_names_out_ = all_vocab[self.indices_]
+        self.vocabulary_ = dict(
+            zip(self.feature_names_out_, range(len(self.feature_names_out_)))
+        )
+        dtm = dtm[:, self.indices_]
+        return dtm
+
+ def transform(self, raw_documents):
+ dtm = self.vectorizer_.transform(raw_documents)
+ return dtm[:, self.indices_]
+
+ def fit(self, raw_documents, y=None):
+ self.fit_transform(raw_documents, y)
+ return self
+
+ def get_feature_names_out(self):
+ return self.feature_names_out_