diff --git a/requirements.txt b/requirements.txt index 421f9a9..93a8a83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ anchor-topic==0.1.2 -bigartm==0.9.2 -dill==0.3.1.1 -lapsolver==1.0.2 -matplotlib -numpy==1.22.0 -pandas==1.0.1 -pytest==5.3.5 -scikit-learn==1.5.0 -scipy==1.10.0 -topicnet>=0.8.0 -tqdm==4.66.3 +bigartm>=0.9.2 +dill==0.3.8 +lapsolver==1.1.0 +matplotlib==3.7.5 +numpy==1.24.4 +pandas==2.0.3 +pytest==8.1.1 +pytest-rerunfailures==14.0 +scikit-learn==1.3.2 +scipy==1.10.1 +topicnet>=0.9.0 +tqdm==4.66.2 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b88034e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..289a12f --- /dev/null +++ b/setup.py @@ -0,0 +1,45 @@ +from distutils.core import setup + + +setup( + name='topnum', + packages=[ + 'topnum', + 'topnum.data', + 'topnum.scores', + 'topnum.search_methods', + 'topnum.search_methods.topic_bank', + 'topnum.search_methods.topic_bank.phi_initialization', + 'topnum.tests' + ], + version='0.3.0', + license='MIT', + description='A set of methods for finding an appropriate number of topics in a text collection', + author='Machine Intelligence Laboratory', + author_email='vasiliy.alekseyev@phystech.edu', + url='https://github.com/machine-intelligence-laboratory/OptimalNumberOfTopics', + keywords=[ + 'topic modeling', + 'document clustering', + 'number of clusters', + 'ARTM', + 'regularization', + ], + install_requires=[ + 'anchor-topic==0.1.2', + 'bigartm>=0.9.2', + 'dill==0.3.8', + 'lapsolver==1.1.0', + 'matplotlib==3.7.5', + 'numpy==1.24.4', + 'pandas==2.0.3', + 'pytest==8.1.1', + 'scikit-learn==1.3.2', + 'scipy==1.10.1', + 'topicnet>=0.9.0', + 'tqdm==4.66.2', + ], + classifiers=[ + 'Programming Language :: Python :: 3.8', + ], +) diff --git a/topnum/model_constructor.py b/topnum/model_constructor.py index d8ea6da..1e01a59 100644 --- a/topnum/model_constructor.py +++ b/topnum/model_constructor.py @@ -102,6 +102,9 @@ def init_model_from_family( model = init_decorrelated_plsa( dataset, modalities_to_use, main_modality, num_topics, model_params ) + # model = init_decorrelated_artm( + # dataset, modalities_to_use, main_modality, num_topics, 1, model_params + # ) elif family == "ARTM": model = init_baseline_artm( dataset, modalities_to_use, main_modality, num_topics, 1, model_params @@ -213,6 +216,82 @@ def init_decorrelated_plsa( return model +# TODO: is it the same as init_baseline_artm? +def init_decorrelated_artm( + dataset, + modalities_to_use, + main_modality, + num_topics, + bcg_topics, + model_params: dict = None +): + """ + Creates simple artm model with standard scores. 
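+    Starts from a plain PLSA initialization (init_plsa), then adds
+    a Phi decorrelation regularizer over all topics and smoothing
+    regularizers for the last `bcg_topics` background topics.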
+ + Parameters + ---------- + dataset : Dataset + modalities_to_use : list of str + main_modality : str + num_topics : int + model_params : dict + + Returns + ------- + model: artm.ARTM() instance + """ + if model_params is None: + model_params = dict() + + model = init_plsa( + dataset, modalities_to_use, main_modality, num_topics + ) + tau = model_params.get('decorrelation_tau', 0.01) + + specific_topic_names = model.topic_names # let's decorrelate everything + model.regularizers.add( + artm.DecorrelatorPhiRegularizer( + gamma=0, + tau=tau, + name='decorrelation', + topic_names=specific_topic_names, + class_ids=modalities_to_use, + ) + ) + + dictionary = dataset.get_dictionary() + baseline_class_ids = {class_id: 1 for class_id in modalities_to_use} + data_stats = count_vocab_size(dictionary, baseline_class_ids) + + background_topic_names = model.topic_names[-bcg_topics:] + specific_topic_names = model.topic_names[:-bcg_topics] + + # all coefficients are relative + regularizers = [ + artm.SmoothSparsePhiRegularizer( + name='smooth_phi_bcg', + topic_names=background_topic_names, + tau=model_params.get("smooth_bcg_tau", 0.1), + class_ids=[main_modality], + ), + artm.SmoothSparseThetaRegularizer( + name='smooth_theta_bcg', + topic_names=background_topic_names, + tau=model_params.get("smooth_bcg_tau", 0.1), + ), + ] + + for reg in regularizers: + model.regularizers.add(transform_regularizer( + data_stats, + reg, + model.class_ids, + n_topics=len(reg.topic_names) + )) + + return model + + def _init_dirichlet_prior(name, num_topics, num_terms): """ Adapted from github.com/RaRe-Technologies/gensim/blob/master/gensim/models/ldamodel.py#L521 diff --git a/topnum/regularizers/__init__.py b/topnum/regularizers/__init__.py new file mode 100644 index 0000000..1a66e47 --- /dev/null +++ b/topnum/regularizers/__init__.py @@ -0,0 +1,5 @@ +from .fix_phi import FastFixPhiRegularizer +from .decorrelate_with_other_phi import ( + DecorrelateWithOtherPhiRegularizer, + DecorrelateWithOtherPhiRegularizer2, +) diff --git a/topnum/regularizers/decorrelate_with_other_phi.py b/topnum/regularizers/decorrelate_with_other_phi.py new file mode 100644 index 0000000..ead5f8d --- /dev/null +++ b/topnum/regularizers/decorrelate_with_other_phi.py @@ -0,0 +1,122 @@ +from typing import List, Optional + +import numpy as np +from numpy import ndarray +from pandas import DataFrame +from scipy.spatial.distance import cdist + +from artm import ARTM +from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer + + +# TODO: find (and make possible to use) relative taus for these regularizers + +class DecorrelateWithOtherPhiRegularizer(BaseRegularizer): + def __init__( + self, + name: str, + tau: float, + topic_names: List[str], + other_phi: DataFrame, + ): + """ + + Parameters + ---------- + name + tau + To select a value, try a few test runs to find the tau + that affects the perplexity (worsens, but not very much). + Recommendation based on experimentation: try 1e5 or 1e6. 
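+            (For such a test run, note that this is a TopicNet BaseRegularizer:
+            it is passed to TopicModel._fit through the `custom_regularizers`
+            argument, the way FastFixPhiRegularizer is used in
+            topic_bank_method.py, rather than via model.regularizers.add.)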
+ topic_names + other_phi + + """ + super().__init__(name, tau=tau) + + self._topic_names = topic_names + self._other_phi = other_phi + self._other_topic_sum = self._other_phi.values.sum( + axis=1, keepdims=True + ) + + self._topic_indices = None + + def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray: + rwt = np.zeros_like(pwt) + rwt[:, self._topic_indices] += ( + pwt.values[:, self._topic_indices] * self._other_topic_sum + ) + + return -1 * self.tau * rwt + + def attach(self, model: ARTM) -> None: + super().attach(model) + + phi = model.get_phi() + self._topic_indices = [ + phi.columns.get_loc(topic_name) + for topic_name in self._topic_names + ] + + +class DecorrelateWithOtherPhiRegularizer2(BaseRegularizer): + def __init__( + self, + name: str, + tau: float, + topic_names: List[str], + other_phi: DataFrame, + num_iters: Optional[int] = None, + ): + """ + + Parameters + ---------- + name + tau + To select a value, try a few test runs to find the tau + that affects the perplexity (worsens, but not very much). + Recommendation based on experimentation: try 1e8, 1e9, or 1e10. + topic_names + other_phi + num_iters + + """ + super().__init__(name, tau=tau) + + self._topic_names = topic_names + self._other_phi = other_phi + self._num_iters = num_iters + self._cur_iter = 0 + + self._topic_indices = None + + def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray: + rwt = np.zeros_like(pwt) + + if self._num_iters is not None and self._cur_iter >= self._num_iters: + return rwt + + correlations = cdist( + self._other_phi.values.T, + pwt.values[:, self._topic_indices].T, + lambda u, v: (u * v).sum() + ) + weighted_other_topics = self._other_phi.values.dot(correlations) + + rwt[:, self._topic_indices] += ( + pwt.values[:, self._topic_indices] * weighted_other_topics + ) + self._cur_iter += 1 + + return -1 * self.tau * rwt + + def attach(self, model: ARTM) -> None: + super().attach(model) + + phi = model.get_phi() + self._topic_indices = [ + phi.columns.get_loc(topic_name) + for topic_name in self._topic_names + ] diff --git a/topnum/regularizers/fix_phi.py b/topnum/regularizers/fix_phi.py new file mode 100644 index 0000000..4d83724 --- /dev/null +++ b/topnum/regularizers/fix_phi.py @@ -0,0 +1,57 @@ +from typing import List, Optional + +import numpy as np +from numpy import ndarray +from pandas import DataFrame + +from artm import ARTM +from topicnet.cooking_machine.models.topic_model import TopicModel +from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer + + +class FastFixPhiRegularizer(BaseRegularizer): + _VERY_BIG_TAU = 10 ** 9 + + def __init__( + self, + name: str, + topic_names: List[str], + parent_model: Optional[TopicModel] = None, # TODO: TopicModel or ARTM? 
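+        # At least one of `parent_model` / `parent_phi` must be given;
+        # if both are given, `parent_phi` takes precedence (see `grad` below).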
+ parent_phi: DataFrame = None, + tau: float = _VERY_BIG_TAU, + ): + super().__init__(name, tau=tau) + + if parent_phi is None and parent_model is None: + raise ValueError('Both parent Phi and parent model not specified.') + + self._topic_names = topic_names + self._topic_indices = None + self._parent_model = parent_model + self._parent_phi = parent_phi + + def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray: + rwt = np.zeros_like(pwt) + + if self._parent_phi is not None: + parent_phi = self._parent_phi + vals = parent_phi.values + else: + parent_phi = self._parent_model.get_phi() + vals = parent_phi.values[:, self._topic_indices] + + assert vals.shape[0] == rwt.shape[0] + assert vals.shape[1] == len(self._topic_indices), (vals.shape[1], len(self._topic_indices)) + + rwt[:, self._topic_indices] += vals + + return self.tau * rwt + + def attach(self, model: ARTM) -> None: + super().attach(model) + + phi = self._model.get_phi() + self._topic_indices = [ + phi.columns.get_loc(topic_name) + for topic_name in self._topic_names + ] diff --git a/topnum/scores/_base_coherence_score.py b/topnum/scores/_base_coherence_score.py index 8aa5388..735f8a6 100644 --- a/topnum/scores/_base_coherence_score.py +++ b/topnum/scores/_base_coherence_score.py @@ -11,6 +11,16 @@ Enum, IntEnum ) +from functools import lru_cache +from typing import ( + Callable, + Dict, + List, + Optional, + Tuple, + Union +) + from topicnet.cooking_machine.dataset import ( Dataset, VW_TEXT_COL, @@ -20,12 +30,6 @@ ) from topicnet.cooking_machine.models.base_model import BaseModel from topicnet.cooking_machine.models.base_score import BaseScore as TopicNetBaseScore -from typing import ( - Dict, - List, - Tuple, - Union -) from .base_custom_score import BaseCustomScore from ..data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection @@ -68,13 +72,13 @@ class SpecificityEstimationMethod(IntEnum): Way to estimate how particular word is specific for particular topic. Unlike probability, eg. p(w | t), specificity_estimation takes into account values for all topics, eg. 
p(w | t_1), p(w | t_2), ..., p(w | t_n): - the higher the value p(w | t) comparing other p(w | t_i), + the higher the value p(w | t) comparing to other p(w | t_i), the higher the specificity_estimation of word "w" for the topic "t" Attributes ---------- NONE - Don't try to estimate specificity_estimation, return the probability as is + Don't try to estimate specificity, return the probability as is MAXIMUM From probability, corresponding to word and topic, extract *maximum* among probabilities for the word and other topics @@ -88,6 +92,8 @@ class SpecificityEstimationMethod(IntEnum): class _BaseCoherenceScore(TopicNetBaseScore): + _EPS = np.finfo(float).tiny + def __init__( self, dataset: Dataset, @@ -96,8 +102,10 @@ def __init__( word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, verbose: bool = False, - ): - super().__init__() + should_compute: Optional[ + Union[Callable[[int], bool], bool]] = None, + ): + super().__init__(should_compute=should_compute) if not isinstance(dataset, Dataset): raise TypeError( @@ -171,6 +179,20 @@ def compute( word_topic_relatednesses = self._get_word_topic_relatednesses(model) + self._word_topic_relatednesses_fast = word_topic_relatednesses.to_dict() + self._neutral_word_topic_relatedness = float(np.mean(word_topic_relatednesses.values)) + self._word2index = { + word: index # word_topic_relatednesses.index.get_loc(word) + for index, word in enumerate(word_topic_relatednesses.index) + } + self._topic2index = { + topic: index # word_topic_relatednesses.columns.get_loc(topic) + for index, topic in enumerate(word_topic_relatednesses.columns) + } + self._word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + + # TODO: topic coherence may be evaluated on any peace of text + # (paragraph, sentence, phrase), that is, not only on whole documents topic_document_coherences = np.zeros((len(topics), len(documents))) document_indices_with_topic_coherence = defaultdict(list) @@ -255,7 +277,6 @@ def _get_word_topic_probs(self, phi: pd.DataFrame) -> pd.DataFrame: elif self._word_topic_relatedness == WordTopicRelatednessType.PTW: # Treat all topics as equally probable - eps = np.finfo(float).tiny pwt = phi pwt_values = pwt.values @@ -263,7 +284,7 @@ def _get_word_topic_probs(self, phi: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame( index=pwt.index, columns=pwt.columns, - data=pwt_values / (pwt_values.sum(axis=1).reshape(-1, 1) + eps) + data=pwt_values / (pwt_values.sum(axis=1).reshape(-1, 1) + self._EPS) ) assert False @@ -324,21 +345,32 @@ def _get_source_document(self, document_id: str) -> str: def _get_vw_document(self, document_id: str) -> str: return self._dataset.get_vw_document(document_id).loc[document_id, VW_TEXT_COL] - @staticmethod + @lru_cache(maxsize=None) def _get_relatedness( + self, word: Tuple[str, str], topic: str, word_topic_relatednesses: pd.DataFrame) -> float: - if word in word_topic_relatednesses.index: - return word_topic_relatednesses.loc[word, topic] - - _logger.warning( - f'The word "{word}" not found in Word-Topic relatedness matrix!' - f' Returning mean value over all word relatednesses for topic "{topic}"' - ) + # try: + # return word_topic_relatednesses.loc[word, topic] + # except KeyError as error: + # _logger.warning( + # f'Some word not found in Word-Topic relatedness matrix: "{error}"!' + # f' Returning mean value over all word relatednesses for topic "{topic}".' 
+ # ) + # + # return float(np.mean(word_topic_relatednesses.values)) + + try: + return self._word_topic_relatednesses_fast[topic][word] + except KeyError as error: + _logger.warning( + f'Some word not found in Word-Topic relatedness matrix: "{error}"!' + f' Returning mean value over all word relatednesses for topic "{topic}".' + ) - return float(np.mean(word_topic_relatednesses.values)) + return self._neutral_word_topic_relatedness # TODO: DRY def save(self, path: str) -> None: diff --git a/topnum/scores/arun.py b/topnum/scores/arun.py index af37642..3196116 100644 --- a/topnum/scores/arun.py +++ b/topnum/scores/arun.py @@ -63,8 +63,15 @@ def call(self, model: TopicModel): phi = model.get_phi(class_ids=self.modalities) c_m1 = np.linalg.svd(phi, compute_uv=False) + c_m2 = self.document_lengths.dot(theta.T) - c_m2 += 0.0001 # we need this to prevent components equal to zero + c_m2 = c_m2.to_numpy() + + # Otherwise, _symmetric_kl will result in error (np.float32 vs np.float arrays...) + c_m2 = c_m2.astype(c_m1.dtype, copy=False) + + # We need this to prevent components equal to zero + c_m2 += 0.0001 if len(c_m1) != phi.shape[1]: warnings.warn( @@ -76,10 +83,10 @@ def call(self, model: TopicModel): return 1.0 - # we do not need to normalize these vectors + # We do not need to normalize these vectors return _symmetric_kl(c_m1, c_m2) - # TODO: this piece is copy-pastd among three different scores + # TODO: this piece is copy-pasted among three different scores def save(self, path: str) -> None: dataset = self._dataset self._dataset = None diff --git a/topnum/scores/diversity_score.py b/topnum/scores/diversity_score.py index 311c9d7..0b0435b 100644 --- a/topnum/scores/diversity_score.py +++ b/topnum/scores/diversity_score.py @@ -1,15 +1,20 @@ -from scipy.spatial.distance import pdist +import warnings + +from typing import ( + List, + Union +) + import numpy as np -from scipy.spatial.distance import squareform import pandas as pd + +from scipy.spatial.distance import pdist +from scipy.spatial.distance import squareform + from topicnet.cooking_machine.models import ( BaseScore as BaseTopicNetScore, TopicModel ) -from typing import ( - List, - Union -) from .base_custom_score import BaseCustomScore @@ -80,8 +85,9 @@ def __init__( name: str, metric: str = L2, class_ids: Union[List[str], str] = None, + topic_names: List[str] = None, closest: bool = False): - ''' + """ Parameters ---------- metric @@ -91,27 +97,41 @@ def __init__( (Actually, supports anything implemented in scipy.spatial.distance, but not everything is sanity-checked) class_ids + topic_names closest if False, the score will calculate average pairwise distance (default) if True, will calculate the average distance to the closest topic - ''' + """ super().__init__(name) metric = metric.lower() self._metric = metric self._class_ids = class_ids - + self._topic_names = topic_names self._closest = closest + self._score = self._initialize() + if self._topic_names is None: + warnings.warn( + 'Make sure you do not compute diversity with background topics!' + ' Specify the `topic_names` parameter if needed.' 
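+                ' (When background topics are left in, they are treated like'
+                ' any other topic in the pairwise distance computation.)'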
+ ) + def _initialize(self) -> BaseTopicNetScore: - return _DiversityScore(self._metric, self._class_ids, self._closest) + return _DiversityScore(self._metric, self._class_ids, self._topic_names, self._closest) class _DiversityScore(BaseTopicNetScore): - def __init__(self, metric: str, class_ids: Union[List[str], str] = None, closest: bool = False): + def __init__( + self, + metric: str, + class_ids: Union[List[str], str] = None, + topic_names: List[str] = None, + closest: bool = False + ): super().__init__() metric = metric.lower() @@ -128,10 +148,17 @@ def __init__(self, metric: str, class_ids: Union[List[str], str] = None, closest self._metric = metric self._class_ids = class_ids + self._topic_names = topic_names self.closest = closest def call(self, model: TopicModel): phi = model.get_phi(class_ids=self._class_ids) + all_topic_names = list(phi.columns) + + if self._topic_names is not None: + phi = phi.loc[:, self._topic_names] + else: + phi = phi.loc[:, all_topic_names] if self._metric == "hellinger": matrix = np.sqrt(phi.T) @@ -139,6 +166,14 @@ def call(self, model: TopicModel): else: condensed_distances = pdist(phi.T, metric=self._metric) + orig_num_dists = len(condensed_distances) + condensed_distances = condensed_distances[np.isfinite(condensed_distances)] + filtered_num_dists = len(condensed_distances) + + if filtered_num_dists < 0.9 * orig_num_dists: + print(f'Skipping computation of dists: {(filtered_num_dists, orig_num_dists)}.') + return -1 + if self.closest: df = pd.DataFrame( index=phi.columns, columns=phi.columns, diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 23d327a..c2abb08 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -1,19 +1,23 @@ import numpy as np import pandas as pd +import warnings from enum import ( auto, IntEnum ) -from topicnet.cooking_machine import Dataset -from topicnet.cooking_machine.models.base_model import BaseModel from typing import ( + Callable, Dict, List, + Optional, Tuple, Union ) +from topicnet.cooking_machine import Dataset +from topicnet.cooking_machine.models.base_model import BaseModel + from ..data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection from ._base_coherence_score import ( _BaseCoherenceScore, @@ -33,19 +37,35 @@ class ComputationMethod(IntEnum): Attributes ---------- SEGMENT_LENGTH - Estimate the length of topic segments + Estimate the length of topic segments (TopLen) SEGMENT_WEIGHT Estimate the weight of topic segment - (weight - sum of specificities for the topic over words in segment) + (weight as sum of specificities for the topic over words in segment) SUM_OVER_WINDOW Sum of specificities for the topic over words in given window. 
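+        (Referred to as TopDen in the tests: see test_compute_topden.)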
The process is as follows: word of the topic is found in text, it is the center of the first window; - next word of the topic is found (outside of the previous window), window; etc + next word of the topic is found (outside of the previous window), + it is the center of the new window; etc + VARIANCE_IN_WINDOW + Estimate the variance between segment word vector components + corresponding to the topic (SemantiC_Var) + FOCUS_CONSISTENCY + Estimate how much text adjacent words differ, + summing the pairs of differences between max components + of corresponding word vectors (FoCon) """ SEGMENT_LENGTH = auto() SEGMENT_WEIGHT = auto() SUM_OVER_WINDOW = auto() + VARIANCE_IN_WINDOW = auto() + FOCUS_CONSISTENCY = auto() + + +_RESEARCH_COMPUTATION_METHODS = [ + ComputationMethod.VARIANCE_IN_WINDOW, + ComputationMethod.FOCUS_CONSISTENCY, +] class IntratextCoherenceScore(BaseTopicScore): @@ -73,6 +93,8 @@ def __init__( max_num_out_of_topic_words=10, window=10, verbose: bool = False, + should_compute: Optional[ + Union[Callable[[int], bool], bool]] = True, # TODO: very slow on full collection ): """ Parameters @@ -120,6 +142,7 @@ def __init__( self._window = window self._verbose = verbose + self._should_compute = should_compute self._score = self._initialize() @@ -139,6 +162,7 @@ def _initialize(self) -> _BaseCoherenceScore: max_num_out_of_topic_words=self._max_num_out_of_topic_words, window=self._window, verbose=self._verbose, + should_compute=self._should_compute, ) def compute( @@ -164,7 +188,10 @@ def __init__( specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 10, - verbose: bool = False): + verbose: bool = False, + should_compute: Optional[ + Union[Callable[[int], bool], bool]] = None, + ): # TODO: word_topic_relatedness seems to be connected with TopTokensViewer stuff super().__init__( @@ -174,6 +201,7 @@ def __init__( word_topic_relatedness=word_topic_relatedness, specificity_estimation=specificity_estimation, verbose=verbose, + should_compute=should_compute, ) if not isinstance(computation_method, ComputationMethod): @@ -181,6 +209,16 @@ def __init__( f'Wrong "computation_method": \"{computation_method}\". ' f'Expect to be \"{ComputationMethod}\"') + if computation_method in _RESEARCH_COMPUTATION_METHODS: + warnings.warn( + f"Coherences {_RESEARCH_COMPUTATION_METHODS} were also presented in the original paper" + f" but preference should be given to other (TopLen-based) methods." + f" Still, coherences {_RESEARCH_COMPUTATION_METHODS} are also implemented," + f" partly as a tribute, partly for research purposes." + f" Once again, coherence {computation_method} is not intended for \"production\" use." + f" But you do you, it's not like there's a coherence police or something." + ) + if not isinstance(max_num_out_of_topic_words, int): raise TypeError( f'Wrong "max_num_out_of_topic_words": \"{max_num_out_of_topic_words}\". ' @@ -191,11 +229,12 @@ def __init__( f'Wrong "window": \"{window}\". ' f'Expect to be \"int\"') - if window < 0 or (window == 0 and computation_method == ComputationMethod.SUM_OVER_WINDOW): + if window < 0 or (window == 0 and computation_method in [ComputationMethod.SUM_OVER_WINDOW, + ComputationMethod.VARIANCE_IN_WINDOW]): raise ValueError( f'Wrong value for "window": \"{window}\". ' f'Expect to be non-negative. 
And greater than zero in case ' - f'computation_method == ComputationMethod.SUM_OVER_WINDOW') + f'computation_method is SUM_OVER_WINDOW or VARIANCE_IN_WINDOW.') self._computation_method = computation_method self._max_num_out_of_topic_words = max_num_out_of_topic_words @@ -218,6 +257,20 @@ def _compute_coherence( return average_sum_over_window + elif self._computation_method == ComputationMethod.VARIANCE_IN_WINDOW: + average_variance_in_window = self._compute_variance_in_window( + topic, words, word_topic_relatednesses + ) + + return average_variance_in_window + + elif self._computation_method == ComputationMethod.FOCUS_CONSISTENCY: + average_focus_consistency = self._compute_focus_consistency( + topic, words, word_topic_relatednesses + ) + + return average_focus_consistency + topic_segment_length, topic_segment_weight = self._compute_segment_characteristics( topic, words, word_topic_relatednesses ) @@ -228,6 +281,30 @@ def _compute_coherence( elif self._computation_method == ComputationMethod.SEGMENT_WEIGHT: return topic_segment_weight + # @lru_cache(maxsize=None) # did't provide speed up + def _get_word_topic_index( + self, + word: WordType, + word_topic_relatednesses: pd.DataFrame, + word_topic_indices: np.array, + ) -> int: + # if word not in word_topic_relatednesses.index: + # return -1 + # else: + # return word_topic_indices[ + # word_topic_relatednesses.index.get_loc(word) + # ] + + # if word not in self._word2index: + # return -1 + # else: + # return word_topic_indices[self._word2index[word]] + + try: + return self._word_topic_indices[self._word2index[word]] + except KeyError: + return -1 + def _compute_segment_characteristics( self, topic: str, @@ -238,16 +315,15 @@ def _compute_segment_characteristics( topic_segment_lengths = [] topic_segment_weights = [] - topic_index = word_topic_relatednesses.columns.get_loc(topic) - word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + topic_index = self._topic2index[topic] # word_topic_relatednesses.columns.get_loc(topic) + # word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) - def get_word_topic_index(word): - if word not in word_topic_relatednesses.index: - return -1 - else: - return word_topic_indices[ - word_topic_relatednesses.index.get_loc(word) - ] + def get_word_topic_index(word: WordType) -> int: + return self._get_word_topic_index( + word=word, + word_topic_relatednesses=None, + word_topic_indices=None, + ) index = 0 @@ -260,8 +336,9 @@ def get_word_topic_index(word): continue segment_length = 1 - segment_weight = _IntratextCoherenceScore._get_relatedness( - words[index], topic, word_topic_relatednesses + segment_weight = self._get_relatedness( + words[index], topic, None # word_topic_relatednesses is not used here + # (besides, lru_cache is applied and who knows how it would react to pd.DataFrame as param) ) num_out_of_topic_words = 0 @@ -273,8 +350,8 @@ def get_word_topic_index(word): num_out_of_topic_words += 1 else: segment_length += 1 - segment_weight += _IntratextCoherenceScore._get_relatedness( - words[index], topic, word_topic_relatednesses + segment_weight += self._get_relatedness( + words[index], topic, None ) num_out_of_topic_words = 0 @@ -300,22 +377,21 @@ def _sum_relatednesses_over_window( words: List[WordType], word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: - topic_index = word_topic_relatednesses.columns.get_loc(topic) - word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + topic_index = self._topic2index[topic] # 
word_topic_relatednesses.columns.get_loc(topic) + # word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) def get_word_topic_index(word: WordType) -> int: - if word not in word_topic_relatednesses.index: - return -1 - else: - return word_topic_indices[ - word_topic_relatednesses.index.get_loc(word) - ] + return self._get_word_topic_index( + word=word, + word_topic_relatednesses=None, + word_topic_indices=None, + ) def find_next_topic_word(starting_index: int) -> int: index = starting_index - while index < len(words) and\ - get_word_topic_index(words[index]) != topic_index: + while (index < len(words) + and get_word_topic_index(words[index]) != topic_index): index += 1 if index == len(words): @@ -329,26 +405,123 @@ def find_next_topic_word(starting_index: int) -> int: return None sums = list() + border_left_index = 0 + border_right_index = len(words) - 1 while word_index < len(words) and word_index != -1: original_word_index = word_index - window_lower_bound = word_index - int(np.floor(self._window // 2)) - window_upper_bound = word_index + int(np.ceil(self._window // 2)) - - sum_in_window = np.sum( - [ - _IntratextCoherenceScore._get_relatedness( - w, topic, word_topic_relatednesses - ) - for w in words[window_lower_bound:window_upper_bound] - ] + window_lower_bound = max( + border_left_index, word_index - self._window // 2 ) + window_upper_bound = min( + border_right_index, word_index + self._window // 2 + ) + 1 + + assert window_lower_bound <= word_index + + # sum_in_window = sum( # np.sum + # [ + # self._get_relatedness(w, topic, word_topic_relatednesses) + # for w in words[window_lower_bound:window_upper_bound] + # ] + # ) + + sum_in_window = 0.0 + + for j in range(window_lower_bound, window_upper_bound): + sum_in_window = sum_in_window + self._get_relatedness( + words[j], topic, None + ) sums.append(sum_in_window) word_index = find_next_topic_word(window_upper_bound) + border_left_index = window_upper_bound assert word_index > original_word_index or word_index == -1 return float(np.mean(sums)) + + def _compute_variance_in_window( + self, + topic: str, + words: List[WordType], + word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: + + topic_relatednesses = [ + self._get_relatedness(word, topic, None) + for word in words + ] + + variances = list() + index = 0 + + while index == 0 or index + self._window - 1 < len(words): + relatedness_window = topic_relatednesses[index:index + self._window] + # TODO: better differentiate good and bad topics?.. 
+ # (low variance is not necessarily a good "goodness" sign: + # for example, sequences [100, 100, 100] + # and [-17.5, -17.5, -17.5] both have zero variance) + variances.append(np.var(relatedness_window)) + + index += 1 + + if len(variances) == 0: + return None + else: + return -1 * float(np.mean(variances)) # the higher the better + + def _compute_focus_consistency( + self, + topic: str, + words: List[WordType], + word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: + + if len(words) == 0: + return None + + # word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + + def get_word_topic_index(word: WordType) -> int: + return self._get_word_topic_index( + word=word, + word_topic_relatednesses=None, + word_topic_indices=None, + ) + + word_topics = [ + word_topic_relatednesses.columns[get_word_topic_index(word)] + for word in words + ] + + differences = list() + index = 0 + + while index + 1 < len(words): # like window = 2 + cur_word, next_word = words[index], words[index + 1] + cur_topic, next_topic = word_topics[index], word_topics[index + 1] + + r_cw_ct = self._get_relatedness( + cur_word, cur_topic, None + ) + r_cw_nt = self._get_relatedness( + cur_word, next_topic, None + ) + r_nw_ct = self._get_relatedness( + next_word, cur_topic, None + ) + r_nw_nt = self._get_relatedness( + next_word, next_topic, None + ) + + diff1 = abs(r_cw_ct - r_nw_ct) + diff2 = abs(r_cw_nt - r_nw_nt) + differences.append(diff1 + diff2) + + index += 1 + + if len(differences) == 0: + return None + else: + return -1 * float(np.mean(differences)) # the higher the better diff --git a/topnum/scores/plavin.py b/topnum/scores/plavin.py index 183639b..a2abe2c 100644 --- a/topnum/scores/plavin.py +++ b/topnum/scores/plavin.py @@ -27,7 +27,7 @@ def _compute_kl(T, theta, doc_lengths): theta_distrib = theta.dot(doc_lengths) # TODO: dtype was 'object'? how could it be? 
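+    # np.float was removed in NumPy 1.24 (pinned in requirements.txt above),
+    # so reuse the dtype of `uniform_distrib` instead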
- theta_distrib = np.array(theta_distrib.values, dtype=np.float) + theta_distrib = np.array(theta_distrib.values, dtype=uniform_distrib.dtype) return stats.entropy(uniform_distrib, theta_distrib) diff --git a/topnum/search_methods/topic_bank/one_model_train_funcs.py b/topnum/search_methods/topic_bank/one_model_train_funcs.py index 6e8e27d..385e60e 100644 --- a/topnum/search_methods/topic_bank/one_model_train_funcs.py +++ b/topnum/search_methods/topic_bank/one_model_train_funcs.py @@ -1,11 +1,13 @@ import artm +import numpy as np import pandas as pd from topicnet.cooking_machine.dataset import Dataset from topicnet.cooking_machine.models import TopicModel from typing import ( Callable, - List + List, + Optional, ) from topnum.scores.base_score import BaseScore @@ -15,6 +17,7 @@ def default_train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -30,6 +33,7 @@ def default_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, **kwargs, @@ -53,6 +57,7 @@ def default_train_func( def specific_initial_phi_train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -62,6 +67,7 @@ def specific_initial_phi_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, **kwargs, @@ -71,7 +77,21 @@ def specific_initial_phi_train_func( initialize_phi_func = initialize_phi_funcs.initialize_randomly initial_phi = initialize_phi_func(dataset, model_number, num_topics) - init_phi_utils._copy_phi(topic_model._model, initial_phi) + + if main_modality is not None: + initial_phi = init_phi_utils.get_modality_phi( + initial_phi, modality=main_modality + ) + + # TODO: However strange it may seem, + # it is really crucial to initialize `phi_ref` variable here. + # Otherwise, all this init-copy manipulation won't work. + # (Yes, at first glance `phi_ref` is not used anywhere, + # but apparently it is used somewhere...) + # The owls are not what they seem. 
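+    # (Most likely `phi_ref` keeps a live reference to the model's internal
+    # Phi buffer; the assert below checks that the copied values
+    # actually reached the model.)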
+ phi_ref = init_phi_utils._copy_phi(topic_model._model, initial_phi) + + assert np.allclose(phi_ref, topic_model.get_phi().to_numpy()) num_fit_iterations_with_scores = 1 @@ -91,6 +111,7 @@ def specific_initial_phi_train_func( def regularization_train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -102,11 +123,11 @@ def regularization_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, **kwargs, ) - topic_model._model.regularizers.add( artm.regularizers.DecorrelatorPhiRegularizer(tau=decorrelating_tau) ) @@ -141,7 +162,7 @@ def regularization_train_func( topic_model._fit( dataset.get_batch_vectorizer(), - num_iterations=max(0, second_num_fit_iterations - num_fit_iterations_with_scores) + num_iterations=max(0, second_num_fit_iterations) ) _fit_model_with_scores( topic_model, @@ -155,6 +176,7 @@ def regularization_train_func( def background_topics_train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -165,6 +187,7 @@ def background_topics_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics + num_background_topics, seed=model_number, **kwargs, @@ -189,8 +212,10 @@ def background_topics_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, + **kwargs, ) num_fit_iterations_with_scores = 1 @@ -222,7 +247,7 @@ def background_topics_train_func( ) # TODO: not very safe here? (if cache_theta us True, Theta not updated here) - init_phi_utils._copy_phi( + phi_ref = init_phi_utils._copy_phi( topic_model._model, specific_topics_phi, phi_ref=phi_ref @@ -233,16 +258,31 @@ def background_topics_train_func( def _get_topic_model( dataset: Dataset, + main_modality: Optional[str], phi: pd.DataFrame = None, num_topics: int = None, seed: int = None, scores: List[BaseScore] = None, - num_safe_fit_iterations: int = 3, + num_safe_fit_iterations: int = 3, # TODO: remove param (only FastFixPhiRegularizer to be used for safe copy) num_processors: int = 3, cache_theta: bool = False) -> TopicModel: + if phi is not None: + raise ValueError( + "Do not use `phi` parameter, use `num_topics` instead!" + " Currently, this method is not responsible for copying Phi matrix." + " We have temporarily turned off this functionality," + " because the realization appeared not perfectly reliable." + " In the future, Phi copying will be improved and returned" + " (it will be based on FastFixPhiRegularizer)." 
+ ) + dictionary = dataset.get_dictionary() + # for modality in dataset.get_possible_modalities(): + # if modality not in modalities_to_use: + # dictionary.filter(class_id=modality, max_df=0, inplace=True) + if num_topics is not None and phi is not None: assert num_topics >= phi.shape[1] elif num_topics is None and phi is not None: @@ -252,21 +292,43 @@ def _get_topic_model( topic_names = [f'topic_{i}' for i in range(num_topics)] - if seed is None: - artm_model = artm.ARTM(topic_names=topic_names) + # if seed is None: + # artm_model = artm.ARTM(topic_names=topic_names) + # else: + # artm_model = artm.ARTM(topic_names=topic_names, seed=seed) + + if main_modality is not None: + class_ids = {main_modality: 1} else: - artm_model = artm.ARTM(topic_names=topic_names, seed=seed) + class_ids = None + + if seed is None: + seed = -1 # for ARTM, it means "no seed" + + artm_model = artm.ARTM(topic_names=topic_names, seed=seed, class_ids=class_ids) # TODO: not list, but dict!!! + + # artm_model = init_model(topic_names, class_ids=[MAIN_MODALITY]) + + # artm_model = init_plsa(DATASET, [MAIN_MODALITY], MAIN_MODALITY, 5) artm_model.num_processors = num_processors artm_model.initialize(dictionary) + """ if phi is None: pass elif num_safe_fit_iterations is not None and num_safe_fit_iterations > 0: init_phi_utils._safe_copy_phi(artm_model, phi, dataset, num_safe_fit_iterations) else: init_phi_utils._copy_phi(artm_model, phi) - + """ + # this breaks smth in ARTM + # test_ppl@word [1827.4515380859375, 2707.63623046875, 2707.67919921875, 2707.679443359375, 2707.679443359375] + # test_ppl@word_with_d [4073.36328125, 6035.2822265625, 6035.3779296875, 6035.37841796875, 6035.37841796875] + # test_ppl@all [1827.4515380859375, 2707.63623046875, 2707.67919921875, 2707.679443359375, 2707.679443359375] + # test_ppl@all_2 [1827.4515380859375, 2707.63623046875, 2707.67919921875, 2707.679443359375, 2707.679443359375] + # test_ppl@all_2_with_d [4073.36328125, 6035.2822265625, 6035.3779296875, 6035.37841796875, 6035.37841796875] + topic_model = TopicModel( artm_model=artm_model, model_id='0', diff --git a/topnum/search_methods/topic_bank/phi_initialization/arora.py b/topnum/search_methods/topic_bank/phi_initialization/arora.py index 50bfa54..a3ffdab 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/arora.py +++ b/topnum/search_methods/topic_bank/phi_initialization/arora.py @@ -17,6 +17,11 @@ ) +np.int = np.int32 # Arora uses old NumPy (current version has not "int" attribute) + # https://stackoverflow.com/q/74946845/8094251 + # https://github.com/scikit-learn-contrib/boruta_py/issues/122#issuecomment-1914122968 + + def compute_phi( dataset: Dataset, main_modality: str, @@ -46,7 +51,10 @@ def compute_phi( } word_document_frequencies = _count_word_document_frequencies( - dataset, text_column, word2index + dataset=dataset, + vocabulary_size=len(phi_index), + text_column=text_column, + word2index=word2index, ) word_document_frequencies = scipy.sparse.csc_matrix(word_document_frequencies) @@ -68,22 +76,24 @@ def compute_phi( def _count_word_document_frequencies( - dataset: Dataset, text_column: str, word2index: Dict[str, int]) -> np.ndarray: + dataset: Dataset, + vocabulary_size: int, + text_column: str, + word2index: Dict[str, int], + ) -> np.ndarray: num_documents = len(dataset._data) # TODO: for big data may be slow here - words_dimension_size = max(list(word2index.values())) + 1 frequencies = np.zeros( - shape=(words_dimension_size, num_documents) + shape=(vocabulary_size, num_documents) ) for doc_index, 
doc_text in enumerate(dataset._data[text_column]): words = doc_text.split() preprocessed_words = list(utils._trim_vw(words)) # TODO: maybe require much memory - if preprocessed_words[:100] != words[:100]: warnings.warn(WARNING_VW_TEXT_WRONG_FORMAT) - words_counter = Counter(words) + words_counter = Counter(preprocessed_words) for w, c in words_counter.items(): if w not in word2index: diff --git a/topnum/search_methods/topic_bank/phi_initialization/cdc.py b/topnum/search_methods/topic_bank/phi_initialization/cdc.py index 39156d4..7ec5add 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/cdc.py +++ b/topnum/search_methods/topic_bank/phi_initialization/cdc.py @@ -69,8 +69,9 @@ def compute_phi( word_in_word_frequencies, document_frequencies = _count_word_in_word_frequencies( dataset=dataset, + vocabulary_size=len(phi_index), text_column=text_column, - word2index=word2index + word2index=word2index, ) word_in_word_probabilities = _count_word_in_word_probabilities( word_in_word_frequencies @@ -122,6 +123,7 @@ def _check_clusterization_distance_func( def _count_word_in_word_frequencies( dataset: Dataset, + vocabulary_size: int, text_column: str, word2index: Dict[str, int], split_on_paragraphs: bool = True, @@ -130,13 +132,11 @@ def _count_word_in_word_frequencies( smoothing_value: float = 0.01, num_docs_to_log: int = 500) -> Tuple[np.ndarray, np.ndarray]: # 2D, 1D - words_dimension_size = max(list(word2index.values())) + 1 - frequencies = np.zeros( - shape=(words_dimension_size, words_dimension_size) + shape=(vocabulary_size, vocabulary_size) ) document_frequencies = np.zeros( - shape=(words_dimension_size,) + shape=(vocabulary_size,) ) def process_words(words: List[str]) -> None: diff --git a/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py b/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py index e780633..b3341f1 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py +++ b/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py @@ -17,12 +17,20 @@ def initialize_randomly( phi_template = _get_phi_template(dataset, num_topics) random = np.random.RandomState(seed=model_number) - phi_values = random.random(phi_template.shape) + modality_phi_datas = [] + + for modality in phi_template.index.unique(level=0): + modality_phi_template = phi_template.xs(modality) + modality_phi_data = random.random(modality_phi_template.shape) + modality_phi_data = modality_phi_data / modality_phi_data.sum(axis=0) + modality_phi_datas.append(modality_phi_data) + + phi_data = np.vstack(modality_phi_datas) return pd.DataFrame( index=phi_template.index, columns=phi_template.columns, - data=phi_values + data=phi_data, ) diff --git a/topnum/search_methods/topic_bank/phi_initialization/utils.py b/topnum/search_methods/topic_bank/phi_initialization/utils.py index 156984b..aa947af 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/utils.py +++ b/topnum/search_methods/topic_bank/phi_initialization/utils.py @@ -32,6 +32,10 @@ def get_phi_index(dataset: Dataset) -> Index: return phi_index +def get_modality_phi(phi: pd.DataFrame, modality: str) -> pd.DataFrame: + return phi.iloc[phi.index.get_level_values(0).isin([modality])] + + def _copy_phi(model: artm.ARTM, phi: pd.DataFrame, phi_ref: np.ndarray = None) -> np.ndarray: model_wrapper = TopicModel(artm_model=model) base_phi_index = model_wrapper.get_phi().index diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py 
b/topnum/search_methods/topic_bank/topic_bank_method.py index 54268dc..42961fa 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -18,11 +18,14 @@ Callable, Dict, List, + Optional, Tuple, Union ) from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection +from topnum.model_constructor import init_model_from_family +from topnum.regularizers import FastFixPhiRegularizer from topnum.scores._base_coherence_score import ( SpecificityEstimationMethod, TextType, @@ -57,7 +60,10 @@ default_train_func, _get_topic_model ) -from topnum.search_methods.topic_bank.phi_initialization.utils import _safe_copy_phi +from topnum.search_methods.topic_bank.phi_initialization.utils import ( + _safe_copy_phi, + get_modality_phi, +) _KEY_BANK_SCORES = 'bank_scores' @@ -75,6 +81,12 @@ _logger = logging.getLogger() +TRAIN_FUNC_TYPE = Callable[ + [Dataset, str, int, int, int, List[BaseScore]], + TopicModel +] + + class TopicBankMethod(BaseSearchMethod): _MINIMUM_TOPIC_DISTANCE = 0.0 _MAXIMUM_TOPIC_DISTANCE = 1.0 @@ -96,10 +108,9 @@ def __init__( max_num_models: int = 100, one_model_num_topics: Union[int, List[int]] = 100, num_fit_iterations: int = DEFAULT_NUM_FIT_ITERATIONS, - train_funcs: Union[ - Callable[[Dataset, int, int, int], TopicModel], - List[Callable[[Dataset, int, int, int], TopicModel]], - None] = None, + train_funcs: Optional[Union[ + TRAIN_FUNC_TYPE, + List[TRAIN_FUNC_TYPE]]] = None, topic_score_threshold_percentile: int = 95, distance_threshold: float = 0.5, bank_update: BankUpdateMethod = BankUpdateMethod.PROVIDE_NON_LINEARITY, @@ -201,9 +212,9 @@ def __init__( ] self._one_model_num_topics: List[int] = one_model_num_topics - self._train_func: List[Callable[[Dataset, int, int, int], TopicModel]] = train_funcs + self._train_func: List[TRAIN_FUNC_TYPE] = train_funcs - if topic_score_threshold_percentile < 1: + if topic_score_threshold_percentile % 1 != 0: warnings.warn( f'topic_score_threshold_percentile {topic_score_threshold_percentile}' f' is less than one! It is expected to be in [0, 100].' 
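+                # Non-integer values are not an error: they are treated as an
+                # absolute score threshold instead of a percentile
+                # (see search_for_optimum below).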
@@ -313,13 +324,13 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) # TODO: stop when perplexity stabilizes _logger.info(f'Building topic model number {model_number}...') - topic_model = self._train_func[model_number]( dataset=self._dataset, + main_modality=self._main_modality, model_number=model_number, num_topics=self._one_model_num_topics[model_number], num_fit_iterations=self._num_fit_iterations, - scores=self._all_model_scores + scores=self._all_model_scores, ) scores = dict() @@ -341,21 +352,26 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._result[_KEY_MODEL_SCORES].append(scores) self._result[_KEY_NUM_MODEL_TOPICS].append(topic_model.get_phi().shape[1]) - self.save() + # Better one time at the end of the iteration + # (otherwise, incomplete information will be saved) + # self.save() - threshold = self._aggregate_scores_for_models( - raw_topic_scores[self._main_topic_score.name], - self._topic_score_threshold_percentile - ) + if self._topic_score_threshold_percentile % 1 != 0: + print(f'Using absolute threshold: {self._topic_score_threshold_percentile}.') + + threshold = self._topic_score_threshold_percentile + else: + threshold = self._aggregate_scores_for_models( + raw_topic_scores[self._main_topic_score.name], + self._topic_score_threshold_percentile + ) _logger.info('Finding new topics...') phi = topic_model.get_phi() - if self._main_modality is None: - phi = phi - else: - phi = phi.iloc[phi.index.get_level_values(0).isin([self._main_modality])] + if self._main_modality is not None: + phi = get_modality_phi(phi, modality=self._main_modality) if word2index is None: word2index = { @@ -368,8 +384,24 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) topics_for_append = list(range(len(phi.columns))) topics_for_update = dict() elif self._bank_update == BankUpdateMethod.PROVIDE_NON_LINEARITY: + self._last_bank_phi = self._get_phi(self._topic_bank.topics, word2index) + self._last_model_phi = phi + + # TODO: TopicNet's model should be able to tell + # what topics are subject topics, + # and what topics are background ones + if hasattr(topic_model, 'num_bcg') and topic_model.num_bcg > 0: + print( + f'Eliminating {topic_model.num_bcg} bcg topic before Hierarchy.' + f' Current |T| is {phi.shape[1]}, topics are: {phi.columns}.' 
+ ) + + phi = phi.iloc[:, :-topic_model.num_bcg] + + print(f'Now |T| is {phi.shape[1]}, topics are: {phi.columns}.') + topics_for_append, topics_for_update = self._extract_hierarchical_relationship( - bank_phi=self._get_phi(self._topic_bank.topics, word2index), + bank_phi=self._last_bank_phi, new_model_phi=phi, psi_threshold=self._child_parent_relationship_threshold ) @@ -380,7 +412,8 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) good_new_topics = [ topic_index for topic_index, topic_name in enumerate(phi.columns) - if raw_topic_scores[self._main_topic_score.name][topic_name] is not None and + if topic_name in raw_topic_scores[self._main_topic_score.name] and + raw_topic_scores[self._main_topic_score.name][topic_name] is not None and raw_topic_scores[self._main_topic_score.name][topic_name] >= threshold ] topics_for_append, topics_for_update, topics_for_update_reverse = ( @@ -390,10 +423,19 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) ) model_topic_current_scores = list() + num_model_topics = len(topic_model.get_phi().columns) _logger.info('Calculating model topic scores...') for topic_index, topic_name in enumerate(topic_model.get_phi().columns): + if hasattr(topic_model, 'num_bcg') and topic_index >= num_model_topics - topic_model.num_bcg: + print( + f'Skipping saving scores for bcg topic number {topic_index}' + f' of {num_model_topics} model topics.' + ) + + continue + topic_scores = dict() topic_word_prob_values = topic_model.get_phi()[topic_name].values @@ -402,6 +444,14 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) topic_word_prob_values[topic_word_prob_values > 1.0 / num_words] ) + if topic_scores[_KEY_TOPIC_SCORE_KERNEL_SIZE] == 0: + warnings.warn( + f'Not going to add topic "{topic_name}" to the bank' + f' because it has zero kernel!' 
+ ) + + continue + for score_name in raw_topic_scores: topic_scores[score_name] = raw_topic_scores[score_name][topic_name] @@ -443,9 +493,13 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._topic_bank.delete_topic(topics_for_update_reverse[topic_index]) self._result[_KEY_MODEL_TOPIC_SCORES].append(model_topic_current_scores) - self._result[_KEY_BANK_TOPIC_SCORES] = self._topic_bank.topic_scores # TODO: append + self._result[_KEY_BANK_TOPIC_SCORES].append( + self._topic_bank.topic_scores # TODO: append + ) - self.save() + # Better one time at the end of the iteration + # (otherwise, incomplete information will be saved) + # self.save() if self._save_model_topics: self._topic_bank.save_model_topics( @@ -464,18 +518,107 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) _logger.info('No topics in bank — returning empty default scores for bank model') else: bank_phi = self._get_phi(self._topic_bank.topics, word2index) + regularizer = FastFixPhiRegularizer( + name='fix', + parent_phi=bank_phi, + topic_names=bank_phi.columns, + ) bank_model = _get_topic_model( self._dataset, - phi=bank_phi, + main_modality=self._main_modality, + num_topics=bank_phi.shape[1], scores=self._all_model_scores, - num_safe_fit_iterations=1 + num_safe_fit_iterations=1, + ) + # Safe fit to make topics so-so adequate (just in case) + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=1, + ) + + bank_model._model.scores.add( + artm.scores.PerplexityScore( + name=f'ppl_fair', + ) + ) + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=5, + custom_regularizers={ + regularizer.name: regularizer, + } ) - bank_model._fit(self._dataset.get_batch_vectorizer(), 1) + + if not np.allclose( + bank_phi.to_numpy(), + bank_model.get_phi().to_numpy(), + atol=1e-3): + warnings.warn( + 'Seems that bank topics are not perfectly fixed in the bank topic model!' + ' Check your bank topics!' + ) + + print(f'Bank Phi:\n{bank_phi.to_numpy()}') + print(f'Total topic probs: {bank_phi.to_numpy().sum(axis=0)}.') + print(f'Bank model Phi:\n{bank_model.get_phi().to_numpy()}') + print(f'Total topic probs: {bank_model.get_phi().to_numpy().sum(axis=0)}.') _logger.info('Computing default scores for bank model...') scores.update(self._get_default_scores(bank_model)) + scores['ppl_fair'] = bank_model.scores['ppl_fair'][-1] + + # TODO: Second bank model is needed for experiments with regularizers + + # Model with one bcg topic + bank_model = init_model_from_family( + family='sparse', + dataset=self._dataset, + main_modality=self._main_modality, + num_topics=len(bank_phi.columns), + seed=0, + ) + # Safe fit to make topics so-so adequate (just in case) + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=1, + ) + + bank_model._model.scores.add( + artm.scores.PerplexityScore( + name=f'ppl_cheatty', + ) + ) + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=5, + custom_regularizers={ + regularizer.name: regularizer, + } + ) + + # One background topic + assert bank_model.get_phi().shape[1] == bank_phi.shape[1] + 1 + + if not np.allclose( + bank_phi.to_numpy(), + bank_model.get_phi().to_numpy()[:, :-1], + atol=1e-3): + warnings.warn( + 'Seems that bank topics are not perfectly fixed in the bank topic model!' + ' (The last model topic — background — is not considered.)' + ' Check your bank topics!' 
+ ) + + print(f'Bank Phi:\n{bank_phi.to_numpy()}') + print(f'Total topic probs: {bank_phi.to_numpy().sum(axis=0)}.') + print(f'Bank model Phi (including bcg topic):\n{bank_model.get_phi().to_numpy()}') + print(f'Total topic probs (including bcg topic): {bank_model.get_phi().to_numpy().sum(axis=0)}.') + + scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] + + print(f'Bank scores: {scores}.') # Topic scores already calculated @@ -544,7 +687,7 @@ def _extract_hierarchical_relationship( hierarchy = artm.hARTM(num_processors=1) - _logger.debug(f'Creating first level with {bank_phi.shape[1]} topics') + _logger.debug(f'Creating first level with {bank_phi.shape[1]} topics. Dictionary: {self._dictionary}.') level0 = hierarchy.add_level( num_topics=bank_phi.shape[1] @@ -557,6 +700,9 @@ def _extract_hierarchical_relationship( f' First words: {bank_phi.index[:10]}' ) + # TODO: use FastFixPhiRegularizer + # (seems not critical here, but nevertheless) + # TODO: until then -- do not remove `phi_ref0` variable! phi_ref0 = _safe_copy_phi( level0, bank_phi, self._dataset, small_num_fit_iterations=1 @@ -671,6 +817,11 @@ def _jaccard_distance( q: Dict[str, float], kernel_only: bool = True) -> float: + # TODO: Can topics appear close if + # top words are the same, but in different order? + # (with different probabilities) + # In other words, "same top words (no matter the order)" == "similar topics"? + # (seems like it should be so) numerator = 0 denominator = 0 diff --git a/topnum/tests/test_coherence_scores.py b/topnum/tests/test_coherence_scores.py index 256378c..fa1e61c 100644 --- a/topnum/tests/test_coherence_scores.py +++ b/topnum/tests/test_coherence_scores.py @@ -41,9 +41,43 @@ SMALL_SEGMENT_LENGTH_PROBABILITIES = [0.3, 0.45, 0.25] DOCUMENT_LENGTH = 100 TOP_WORD_PROBABILITY_TIMES_BIGGER = 4 + PHI_FILE_NAME = 'phi.csv' DATASET_FILE_NAME = 'dataset.csv' +TEXT_TYPES = [ + TextType.VW_TEXT, + TextType.RAW_TEXT, +] +COMPUTATION_METHODS = [ + ComputationMethod.SEGMENT_LENGTH, + ComputationMethod.SEGMENT_WEIGHT, + ComputationMethod.SUM_OVER_WINDOW, + ComputationMethod.VARIANCE_IN_WINDOW, + ComputationMethod.FOCUS_CONSISTENCY, +] +RESEARCH_COMPUTATION_METHODS = [ + ComputationMethod.VARIANCE_IN_WINDOW, + ComputationMethod.FOCUS_CONSISTENCY, +] +WORD_TOPIC_RELATEDNESS_TYPES = [ + WordTopicRelatednessType.PWT, + WordTopicRelatednessType.PTW, +] +SPECIFICITY_ESTIMATION_METHODS = [ + SpecificityEstimationMethod.NONE, + SpecificityEstimationMethod.MAXIMUM, + SpecificityEstimationMethod.AVERAGE, +] + + +RESEARCH_INTRATEXT_MESSAGE = ( + f"Coherences {RESEARCH_COMPUTATION_METHODS} were presented in the original paper" + f" and are implemented partly as a tribute," + f" partly for research purposes." + f" For real use, preference should be given to {COMPUTATION_METHODS} methods." 
+) + class _MockModel(BaseModel): def __init__(self, phi: pd.DataFrame): @@ -211,12 +245,10 @@ def get_vw_text(cls, doc: str, document_words: Dict[str, List[str]]) -> str: @pytest.mark.parametrize( 'text_type, computation_method, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, - ComputationMethod.SUM_OVER_WINDOW], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + COMPUTATION_METHODS, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_compute_intratext( @@ -226,6 +258,9 @@ def test_compute_intratext( word_topic_relatedness: WordTopicRelatednessType, specificity_estimation: SpecificityEstimationMethod) -> None: + if computation_method in RESEARCH_COMPUTATION_METHODS: + pytest.xfail(RESEARCH_INTRATEXT_MESSAGE) + score = _IntratextCoherenceScore( self.dataset, text_type=text_type, @@ -236,6 +271,22 @@ def test_compute_intratext( self._check_compute(score) + @pytest.mark.parametrize( + 'window', + [2, 4, 10] # TODO: window = 1 -> fail (sometimes?) + ) + def test_compute_topden(self, window) -> None: + score = _IntratextCoherenceScore( + self.dataset, + text_type=TextType.VW_TEXT, + computation_method=ComputationMethod.SUM_OVER_WINDOW, + word_topic_relatedness=WordTopicRelatednessType.PTW, + specificity_estimation=SpecificityEstimationMethod.NONE, + window=window, + ) + + self._check_compute(score) + @pytest.mark.parametrize('keep_in_memory', [True, False]) def test_compute_intratext_small_big_data(self, keep_in_memory) -> None: dataset = Dataset(self.dataset_file_path, keep_in_memory=keep_in_memory) @@ -246,12 +297,10 @@ def test_compute_intratext_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, computation_method, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, - ComputationMethod.SUM_OVER_WINDOW], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + COMPUTATION_METHODS, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_call_intratext( @@ -261,6 +310,9 @@ def test_call_intratext( word_topic_relatedness: WordTopicRelatednessType, specificity_estimation: SpecificityEstimationMethod) -> None: + if computation_method in RESEARCH_COMPUTATION_METHODS: + pytest.xfail(RESEARCH_INTRATEXT_MESSAGE) + score = _IntratextCoherenceScore( self.dataset, text_type=text_type, @@ -281,12 +333,10 @@ def test_call_intratext_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, computation_method, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, - ComputationMethod.SUM_OVER_WINDOW], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + COMPUTATION_METHODS, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) @pytest.mark.parametrize( @@ -301,6 +351,9 @@ def 
test_call_intratext_with_specified_documents( specificity_estimation: SpecificityEstimationMethod, what_documents: str) -> None: + if computation_method in RESEARCH_COMPUTATION_METHODS: + pytest.xfail(RESEARCH_INTRATEXT_MESSAGE) + if what_documents == 'first': documents = [self.documents[0]] elif what_documents == 'all': @@ -324,10 +377,9 @@ def test_call_intratext_with_specified_documents( @pytest.mark.parametrize( 'text_type, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_compute_toptokens( @@ -355,10 +407,9 @@ def test_compute_toptokens_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_call_toptokens( @@ -386,10 +437,9 @@ def test_call_toptokens_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) @pytest.mark.parametrize( diff --git a/topnum/tests/test_regularizers.py b/topnum/tests/test_regularizers.py new file mode 100644 index 0000000..faf6521 --- /dev/null +++ b/topnum/tests/test_regularizers.py @@ -0,0 +1,353 @@ +import logging +import os +import shutil +import tempfile + +from copy import deepcopy +from typing import List + +import numpy as np +import pytest + +from pandas import DataFrame + +from topicnet.cooking_machine.dataset import ( + Dataset, + W_DIFF_BATCHES_1, +) +from topicnet.cooking_machine.models import TopicModel +from topicnet.cooking_machine.model_constructor import init_simple_default_model + + +from topnum.regularizers import ( + FastFixPhiRegularizer, + DecorrelateWithOtherPhiRegularizer, + DecorrelateWithOtherPhiRegularizer2, +) +from topnum.scores import PerplexityScore +from topnum.tests.data_generator import TestDataGenerator + + +_Logger = logging.getLogger() + + +@pytest.mark.filterwarnings(f'ignore:{W_DIFF_BATCHES_1}') +class TestOptimizeScores: + PPL_SCORE_NAME = 'ppl' + ONE_FIT_NUM_ITERS = 10 + + NUM_TOPICS = 10 + + # Ideally, these topics should be found by looking at the scores + # (Here we are assigning the labels just out of thin air) + GOOD_TOPIC_INDICES = [0, 1, 2] + BAD_TOPIC_INDICES = [-1, -2, -3] + + data_generator = None + + main_modality = None + other_modality = None + text_collection = None + + working_folder_path = None + + @classmethod + def setup_class(cls): + cls.data_generator = TestDataGenerator() + + cls.data_generator.generate() + + cls.data_generator.text_collection._dataset = None + + cls.text_collection = cls.data_generator.text_collection + cls.main_modality = cls.data_generator.main_modality + cls.other_modality = 
cls.data_generator.other_modality + + cls.working_folder_path = tempfile.mktemp(prefix='test_optimize_scores__') + + def setup_method(self): + assert self.text_collection._dataset is None + + os.mkdir(self.working_folder_path) + + def teardown_method(self): + self.text_collection._set_dataset_kwargs() + self.text_collection._dataset = None + + if os.path.isdir(self.working_folder_path): + shutil.rmtree(self.working_folder_path) + + @classmethod + def teardown_class(cls): + if cls.data_generator is not None: + cls.data_generator.clear() + + if os.path.isdir(cls.working_folder_path): + shutil.rmtree(cls.working_folder_path) + + def _get_dataset(self, keep_in_memory: bool = True) -> Dataset: + self.text_collection._set_dataset_kwargs( + keep_in_memory=keep_in_memory + ) + dataset = self.text_collection._to_dataset() + + return dataset + + def _get_topic_model_and_topics( + self, + dataset: Dataset, + num_specific_topics=5, + num_background_topics=1, + num_processors=2, + ): + artm_model = init_simple_default_model( + dataset=dataset, + modalities_to_use=[self.main_modality, self.other_modality], + main_modality=self.main_modality, + specific_topics=num_specific_topics, + background_topics=num_background_topics, + ) + artm_model.num_processors = num_processors + + topic_model = TopicModel(artm_model) + score = PerplexityScore(self.PPL_SCORE_NAME) + score._attach(topic_model) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + ) + + phi = topic_model.get_phi() + good_topic_names = [phi.columns[t] for t in self.GOOD_TOPIC_INDICES] + bad_topic_names = [phi.columns[t] for t in self.BAD_TOPIC_INDICES] + not_good_topic_names = [ + phi.columns[t] + for t in range(len(phi.columns)) + if t not in self.GOOD_TOPIC_INDICES + ] + + return ( + topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names, + ) + + def _get_fix_regularizer( + self, + name: str, + target_topic_names: List[str], + parent_topic_model: TopicModel = None, + parent_phi: DataFrame = None, + ): + fix_regularizer = FastFixPhiRegularizer( + name=name, + topic_names=target_topic_names, + parent_model=parent_topic_model, + parent_phi=parent_phi, + ) + + return fix_regularizer + + def _get_decorr_regularizer_base( + self, + name: str, + tau: float, + target_topic_names: List[str], + other_topic_model: TopicModel, + other_topic_names: List[str], + decorrelate_regularizer_class, + ): + other_phi = other_topic_model._model.get_phi()[other_topic_names] + other_phi = deepcopy(other_phi) + decorr_regularizer = decorrelate_regularizer_class( + name=name, + tau=tau, + topic_names=target_topic_names, + other_phi=other_phi, + ) + + return decorr_regularizer, other_phi + + def _get_decorr_regularizer( + self, + name: str, + tau: float, + target_topic_names: List[str], + other_topic_model: TopicModel, + other_topic_names: List[str], + ): + return self._get_decorr_regularizer_base( + name=name, tau=tau, + target_topic_names=target_topic_names, + other_topic_model=other_topic_model, + other_topic_names=other_topic_names, + decorrelate_regularizer_class=DecorrelateWithOtherPhiRegularizer, + ) + + def _get_decorr_regularizer2( + self, + name: str, + tau: float, + target_topic_names: List[str], + other_topic_model: TopicModel, + other_topic_names: List[str], + ): + return self._get_decorr_regularizer_base( + name=name, tau=tau, + target_topic_names=target_topic_names, + other_topic_model=other_topic_model, + other_topic_names=other_topic_names, + 
decorrelate_regularizer_class=DecorrelateWithOtherPhiRegularizer2, + ) + + @pytest.mark.parametrize('keep_in_memory', [True, False]) + def test_fix_good(self, keep_in_memory): + dataset = self._get_dataset(keep_in_memory=keep_in_memory) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._get_topic_model_and_topics(dataset=dataset) + + good_phi = deepcopy( + topic_model._model.get_phi()[good_topic_names] + ) + + fix_regularizer = self._get_fix_regularizer( + name='fix', + target_topic_names=good_topic_names, + parent_phi=good_phi, + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + fix_regularizer.name: fix_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + assert np.allclose( + new_phi[good_topic_names], good_phi + ) + + @pytest.mark.parametrize('decorr_v2', [False, True]) + @pytest.mark.parametrize('keep_in_memory', [True, False]) + def test_decorr_bad(self, decorr_v2, keep_in_memory): + dataset = self._get_dataset(keep_in_memory=keep_in_memory) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._get_topic_model_and_topics(dataset=dataset) + + good_phi = deepcopy( + topic_model._model.get_phi()[good_topic_names] + ) + base_topic_decorr_kwargs = dict( + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + other_topic_names=bad_topic_names, + ) + + if not decorr_v2: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( + name='ext_decorr_bad', + tau=1e5, + **base_topic_decorr_kwargs, + ) + else: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( + name='ext_decorr_bad2', + tau=1e8, + **base_topic_decorr_kwargs, + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + decorr_bad_regularizer.name: decorr_bad_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + # TODO: good topics also change (as they are not fixed) + # so, the meaningfulness of this test is questionable + # (other than the fact that it simply tests runnability) + # assert np.allclose( + # new_phi[good_topic_names], good_phi, rtol=0.05 + # ) + assert not np.allclose( + new_phi[not_good_topic_names], bad_phi, rtol=0.5 + ) + + @pytest.mark.parametrize('decorr_v2', [False, True]) + def test_fix_good_and_decorr_good_bad(self, decorr_v2): + dataset = self._get_dataset(keep_in_memory=True) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._get_topic_model_and_topics(dataset=dataset) + + fix_regularizer = self._get_fix_regularizer( + name='fix', + target_topic_names=good_topic_names, + parent_topic_model=topic_model._model, + ) + # TODO: test breaks if pass just `topic_model` for `parent_topic_model` + # aah, I guess, there are some score saving issues (_score_caches=None) + + base_topic_decorr_kwargs = dict( + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + ) + + if not decorr_v2: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( + name='ext_decorr_bad', tau=1e5, + other_topic_names=bad_topic_names, + **base_topic_decorr_kwargs, + ) + decorr_good_regularizer, good_phi = self._get_decorr_regularizer( + name='ext_decorr_good', tau=1e5, + other_topic_names=good_topic_names, + **base_topic_decorr_kwargs + ) + else: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( + name='ext_decorr_bad2', tau=1e8, + other_topic_names=bad_topic_names, + 
**base_topic_decorr_kwargs + ) + decorr_good_regularizer, good_phi = self._get_decorr_regularizer2( + name='ext_decorr_good2', tau=1e8, + other_topic_names=good_topic_names, + **base_topic_decorr_kwargs + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + fix_regularizer.name: fix_regularizer, + decorr_bad_regularizer.name: decorr_bad_regularizer, + decorr_good_regularizer.name: decorr_good_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + assert np.allclose( + new_phi[good_topic_names], good_phi + ) + + assert not np.allclose( + new_phi[not_good_topic_names], good_phi, rtol=0.5 + ) + assert not np.allclose( + new_phi[not_good_topic_names], bad_phi, rtol=0.5 + ) diff --git a/topnum/tests/test_topic_bank.py b/topnum/tests/test_topic_bank.py index 298c3c4..f5994ae 100644 --- a/topnum/tests/test_topic_bank.py +++ b/topnum/tests/test_topic_bank.py @@ -18,6 +18,7 @@ Callable, Dict, List, + Optional, ) from topnum.scores.base_score import BaseScore @@ -124,14 +125,28 @@ def test_topic_bank_smoke(self, keep_in_memory): ] ) @pytest.mark.parametrize( - 'train_funcs', - [None, background_topics_train_func, default_train_func, regularization_train_func] + 'train_funcs, params', + [ + (None, {}), + (background_topics_train_func, {}), + (default_train_func, {}), + (regularization_train_func, dict( + decorrelating_tau=1, + smoothing_tau=1e-5, + sparsing_tau=-1 * 1e-5, + )) + ] ) - def test_topic_bank(self, keep_in_memory, bank_update, train_funcs): + def test_topic_bank(self, keep_in_memory, bank_update, train_funcs, params): + if params == {}: + train_func = train_funcs + else: + train_func = lambda *args, **kwargs: train_funcs(*args, **kwargs, **params) + self._test_topic_bank( self.dataset(keep_in_memory=keep_in_memory), bank_update, - train_func=train_funcs, + train_func=train_func, ) @pytest.mark.parametrize('keep_in_memory', [True, False]) @@ -149,15 +164,22 @@ def initialize_phi_func( def train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, - scores: List[BaseScore] = None) -> TopicModel: + scores: List[BaseScore] = None, + **kwargs) -> TopicModel: return specific_initial_phi_train_func( - dataset, model_number, num_topics, - num_fit_iterations, scores, - initialize_phi_func=initialize_phi_func + dataset, + main_modality=main_modality, + model_number=model_number, + num_topics=num_topics, + num_fit_iterations=num_fit_iterations, + scores=scores, + initialize_phi_func=initialize_phi_func, + **kwargs ) self._test_topic_bank( @@ -183,6 +205,10 @@ def test_topic_bank_specific_phi_cdc(self, keep_in_memory, bank_update): min_samples=1 ) + print(f'CDC Phi: {phi}') + + assert not phi.isnull().any(axis=None) + def initialize_phi_func( dataset: Dataset, model_number: int, @@ -195,15 +221,22 @@ def initialize_phi_func( def train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, - scores: List[BaseScore] = None) -> TopicModel: + scores: List[BaseScore] = None, + **kwargs) -> TopicModel: return specific_initial_phi_train_func( - dataset, model_number, num_topics, - num_fit_iterations, scores, - initialize_phi_func=initialize_phi_func + dataset, + main_modality=main_modality, + model_number=model_number, + num_topics=num_topics, + num_fit_iterations=num_fit_iterations, + scores=scores, + initialize_phi_func=initialize_phi_func, + **kwargs ) self._test_topic_bank( @@ -229,6 +262,10 @@ 
def test_topic_bank_specific_phi_arora(self, keep_in_memory, bank_update): document_occurrences_threshold_percentage=0.001 ) + print(f'Arora Phi: {phi}.') + + assert not phi.isnull().any(axis=None) + def initialize_phi_func( dataset: Dataset, model_number: int, @@ -241,15 +278,22 @@ def initialize_phi_func( def train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, - scores: List[BaseScore] = None) -> TopicModel: + scores: List[BaseScore] = None, + **kwargs) -> TopicModel: return specific_initial_phi_train_func( - dataset, model_number, num_topics, - num_fit_iterations, scores, - initialize_phi_func=initialize_phi_func + dataset, + main_modality=main_modality, + model_number=model_number, + num_topics=num_topics, + num_fit_iterations=num_fit_iterations, + scores=scores, + initialize_phi_func=initialize_phi_func, + **kwargs ) self._test_topic_bank( @@ -266,6 +310,7 @@ def _test_topic_bank( one_model_num_topics: int = 2, train_func: Callable = None): + small_probability = 0.001 self.optimizer = TopicBankMethod( data=dataset, main_modality=self.main_modality, @@ -289,3 +334,16 @@ def _test_topic_bank( for result_key in ['optimum', 'optimum_std']: assert result_key in self.optimizer._result assert isinstance(self.optimizer._result[result_key], Number) + + topic_bank = self.optimizer._topic_bank + bank_topics = topic_bank.topics + bank_topic_scores = topic_bank.topic_scores + + print(f'Bank topics: {bank_topics}.') + + assert len(bank_topics) == len(bank_topic_scores) + assert len(bank_topics) > 0 + + for bank_topic in bank_topics: + assert len(bank_topic) > 0 + assert any(v >= small_probability for v in bank_topic.values())
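
A few editorial sketches follow; none of them belong to the patch above. The first concerns the TODO added before _jaccard_distance, which asks whether two topics whose top words coincide but are ranked differently end up looking similar. For a Jaccard-style distance computed over topic kernels the answer is yes: only which words make it into the kernel matters, not their order or exact probabilities. The code below is an illustration of that behaviour, not the implementation from this patch, and the uniform-probability kernel threshold it uses is an assumption.

from typing import Dict


def jaccard_distance_sketch(
        p: Dict[str, float],
        q: Dict[str, float],
        kernel_only: bool = True) -> float:
    # p and q map words to their probabilities within a topic
    if kernel_only:
        # Assumed kernel rule: keep words more probable than under
        # a uniform distribution over the topic's words
        p_words = {w for w, prob in p.items() if prob > 1.0 / max(len(p), 1)}
        q_words = {w for w, prob in q.items() if prob > 1.0 / max(len(q), 1)}
    else:
        p_words, q_words = set(p), set(q)

    union = len(p_words | q_words)

    if union == 0:
        # Two empty kernels: treat as maximally distant in this sketch
        return 1.0

    # Only set membership matters: two topics whose kernels contain
    # the same words, in any order and with any ranking, get distance 0
    return 1.0 - len(p_words & q_words) / union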
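
The new module topnum/tests/test_regularizers.py also fixes, implicitly, the intended usage of the regularizers exported from topnum.regularizers: they are not attached to the underlying ARTM model but passed to TopicModel._fit() through custom_regularizers. The sketch below is condensed from test_fix_good and test_decorr_bad; topic_model, dataset and the *_topic_names lists are assumed to come from a previously fitted model, as in _get_topic_model_and_topics, and the tau value is simply the one the tests use.

from copy import deepcopy

from topnum.regularizers import (
    FastFixPhiRegularizer,
    DecorrelateWithOtherPhiRegularizer,
)

# topic_model, dataset, good_topic_names, bad_topic_names and
# not_good_topic_names are assumed to exist, as in the test setup above
phi = topic_model.get_phi()
good_phi = deepcopy(phi[good_topic_names])
bad_phi = deepcopy(phi[bad_topic_names])

# Keep the good topics exactly as they are during further fitting
fix_good = FastFixPhiRegularizer(
    name='fix',
    topic_names=good_topic_names,
    parent_phi=good_phi,
)

# Push the remaining topics away from the bad ones; tau is absolute
# (see the TODO about relative taus in the regularizer module)
decorr_bad = DecorrelateWithOtherPhiRegularizer(
    name='ext_decorr_bad',
    tau=1e5,
    topic_names=not_good_topic_names,
    other_phi=bad_phi,
)

topic_model._fit(
    dataset.get_batch_vectorizer(),
    num_iterations=10,
    custom_regularizers={
        fix_good.name: fix_good,
        decorr_bad.name: decorr_bad,
    },
)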
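
Finally, the test_topic_bank changes settle on a single calling convention for train functions: they now receive main_modality explicitly and must tolerate extra keyword arguments, which is what lets the parametrized regularization settings be forwarded through a thin wrapper. A hypothetical minimal train function satisfying this convention, together with the wrapping used in test_topic_bank, might look as follows (the function body is only a placeholder).

from typing import List, Optional

from topicnet.cooking_machine.dataset import Dataset
from topicnet.cooking_machine.models import TopicModel

from topnum.scores.base_score import BaseScore


def my_train_func(
        dataset: Dataset,
        main_modality: Optional[str],
        model_number: int,
        num_topics: int,
        num_fit_iterations: int,
        scores: List[BaseScore] = None,
        **kwargs) -> TopicModel:
    # Build and fit a TopicModel here, using kwargs
    # (e.g. decorrelating_tau, smoothing_tau, sparsing_tau) if given
    raise NotImplementedError


params = dict(
    decorrelating_tau=1,
    smoothing_tau=1e-5,
    sparsing_tau=-1e-5,
)

# Same trick as in test_topic_bank: forward the parametrized settings
# without changing the signature that TopicBankMethod expects
train_func = lambda *args, **kwargs: my_train_func(*args, **kwargs, **params)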