From 7c94ebe31bf65eaf42bc515438d0076b51f6eab4 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 23 Mar 2024 10:48:50 +0300 Subject: [PATCH 01/49] fix fix topics --- topnum/model_constructor.py | 80 +++++++++++- topnum/scores/diversity_score.py | 25 +++- topnum/search_methods/stability_method.py | 2 +- .../topic_bank/one_model_train_funcs.py | 30 ++++- .../topic_bank/topic_bank_method.py | 120 ++++++++++++++++-- 5 files changed, 237 insertions(+), 20 deletions(-) diff --git a/topnum/model_constructor.py b/topnum/model_constructor.py index d8ea6da..f422107 100644 --- a/topnum/model_constructor.py +++ b/topnum/model_constructor.py @@ -99,8 +99,8 @@ def init_model_from_family( dataset, modalities_to_use, main_modality, num_topics, 1, model_params ) elif family == "decorrelation": - model = init_decorrelated_plsa( - dataset, modalities_to_use, main_modality, num_topics, model_params + model = init_decorrelated_artm( + dataset, modalities_to_use, main_modality, num_topics, 1, model_params ) elif family == "ARTM": model = init_baseline_artm( @@ -213,6 +213,82 @@ def init_decorrelated_plsa( return model +def init_decorrelated_artm( + dataset, + modalities_to_use, + main_modality, + num_topics, + bcg_topics, + model_params: dict = None +): + """ + Creates simple artm model with standard scores. + + Parameters + ---------- + dataset : Dataset + modalities_to_use : list of str + main_modality : str + num_topics : int + model_params : dict + + Returns + ------- + model: artm.ARTM() instance + """ + if model_params is None: + model_params = dict() + + model = init_plsa( + dataset, modalities_to_use, main_modality, num_topics + ) + tau = model_params.get('decorrelation_tau', 0.01) + + specific_topic_names = model.topic_names # let's decorrelate everything + model.regularizers.add( + artm.DecorrelatorPhiRegularizer( + gamma=0, + tau=tau, + name='decorrelation', + topic_names=specific_topic_names, + class_ids=modalities_to_use, + ) + ) + + + dictionary = dataset.get_dictionary() + baseline_class_ids = {class_id: 1 for class_id in modalities_to_use} + data_stats = count_vocab_size(dictionary, baseline_class_ids) + + background_topic_names = model.topic_names[-bcg_topics:] + specific_topic_names = model.topic_names[:-bcg_topics] + + # all coefficients are relative + regularizers = [ + artm.SmoothSparsePhiRegularizer( + name='smooth_phi_bcg', + topic_names=background_topic_names, + tau=model_params.get("smooth_bcg_tau", 0.1), + class_ids=[main_modality], + ), + artm.SmoothSparseThetaRegularizer( + name='smooth_theta_bcg', + topic_names=background_topic_names, + tau=model_params.get("smooth_bcg_tau", 0.1), + ), + ] + + for reg in regularizers: + model.regularizers.add(transform_regularizer( + data_stats, + reg, + model.class_ids, + n_topics=len(reg.topic_names) + )) + + return model + + def _init_dirichlet_prior(name, num_topics, num_terms): """ Adapted from github.com/RaRe-Technologies/gensim/blob/master/gensim/models/ldamodel.py#L521 diff --git a/topnum/scores/diversity_score.py b/topnum/scores/diversity_score.py index 311c9d7..6293f5a 100644 --- a/topnum/scores/diversity_score.py +++ b/topnum/scores/diversity_score.py @@ -80,6 +80,7 @@ def __init__( name: str, metric: str = L2, class_ids: Union[List[str], str] = None, + topic_names = None, closest: bool = False): ''' Parameters @@ -102,16 +103,17 @@ def __init__( self._metric = metric self._class_ids = class_ids - + self._topic_names = topic_names self._closest = closest + self._score = self._initialize() def _initialize(self) -> BaseTopicNetScore: - return _DiversityScore(self._metric, self._class_ids, self._closest) + return _DiversityScore(self._metric, self._class_ids, self._topic_names, self._closest) class _DiversityScore(BaseTopicNetScore): - def __init__(self, metric: str, class_ids: Union[List[str], str] = None, closest: bool = False): + def __init__(self, metric: str, class_ids: Union[List[str], str] = None, topic_names = None, closest: bool = False): super().__init__() metric = metric.lower() @@ -128,10 +130,22 @@ def __init__(self, metric: str, class_ids: Union[List[str], str] = None, closest self._metric = metric self._class_ids = class_ids + self._topic_names = topic_names self.closest = closest def call(self, model: TopicModel): phi = model.get_phi(class_ids=self._class_ids) + all_topic_names = list(phi.columns) + + if hasattr(model, 'has_bcg'): + print(f'Detected bcg topics! Skipping for diversity computation (and now {len(all_topic_names) - 1} topics).') + + all_topic_names = all_topic_names[:-1] + + if self._topic_names is not None: + phi = phi.loc[:, self._topic_names] + else: + phi = phi.loc[:, all_topic_names] if self._metric == "hellinger": matrix = np.sqrt(phi.T) @@ -139,6 +153,11 @@ def call(self, model: TopicModel): else: condensed_distances = pdist(phi.T, metric=self._metric) + orig_num_dists = len(condensed_distances) + condensed_distances = condensed_distances[np.isfinite(condensed_distances)] + filtered_num_dists = len(condensed_distances) + assert filtered_num_dists >= 0.9 * orig_num_dists, (filtered_num_dists, orig_num_dists) + if self.closest: df = pd.DataFrame( index=phi.columns, columns=phi.columns, diff --git a/topnum/search_methods/stability_method.py b/topnum/search_methods/stability_method.py index 9bb6847..31896d6 100644 --- a/topnum/search_methods/stability_method.py +++ b/topnum/search_methods/stability_method.py @@ -12,7 +12,7 @@ import sys import tempfile -from lapsolver import solve_dense +#from lapsolver import solve_dense from tqdm import tqdm from typing import ( Any, diff --git a/topnum/search_methods/topic_bank/one_model_train_funcs.py b/topnum/search_methods/topic_bank/one_model_train_funcs.py index 6e8e27d..477389f 100644 --- a/topnum/search_methods/topic_bank/one_model_train_funcs.py +++ b/topnum/search_methods/topic_bank/one_model_train_funcs.py @@ -15,6 +15,7 @@ def default_train_func( dataset: Dataset, + main_modality: str, model_number: int, num_topics: int, num_fit_iterations: int, @@ -30,6 +31,7 @@ def default_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, **kwargs, @@ -233,6 +235,7 @@ def background_topics_train_func( def _get_topic_model( dataset: Dataset, + main_modality, phi: pd.DataFrame = None, num_topics: int = None, seed: int = None, @@ -243,6 +246,10 @@ def _get_topic_model( dictionary = dataset.get_dictionary() + # for modality in dataset.get_possible_modalities(): + # if modality not in modalities_to_use: + # dictionary.filter(class_id=modality, max_df=0, inplace=True) + if num_topics is not None and phi is not None: assert num_topics >= phi.shape[1] elif num_topics is None and phi is not None: @@ -252,21 +259,38 @@ def _get_topic_model( topic_names = [f'topic_{i}' for i in range(num_topics)] + # if seed is None: + # artm_model = artm.ARTM(topic_names=topic_names) + # else: + # artm_model = artm.ARTM(topic_names=topic_names, seed=seed) + if seed is None: - artm_model = artm.ARTM(topic_names=topic_names) + artm_model = artm.ARTM(topic_names=topic_names, class_ids={main_modality: 1}) # TODO: not list, but dict!!! else: - artm_model = artm.ARTM(topic_names=topic_names, seed=seed) + artm_model = artm.ARTM(topic_names=topic_names, seed=seed, class_ids={main_modality: 1}) + + # artm_model = init_model(topic_names, class_ids=[MAIN_MODALITY]) + + # artm_model = init_plsa(DATASET, [MAIN_MODALITY], MAIN_MODALITY, 5) artm_model.num_processors = num_processors artm_model.initialize(dictionary) + """ if phi is None: pass elif num_safe_fit_iterations is not None and num_safe_fit_iterations > 0: init_phi_utils._safe_copy_phi(artm_model, phi, dataset, num_safe_fit_iterations) else: init_phi_utils._copy_phi(artm_model, phi) - + """ + # this breaks smth in ARTM + # test_ppl@word [1827.4515380859375, 2707.63623046875, 2707.67919921875, 2707.679443359375, 2707.679443359375] + # test_ppl@word_with_d [4073.36328125, 6035.2822265625, 6035.3779296875, 6035.37841796875, 6035.37841796875] + # test_ppl@all [1827.4515380859375, 2707.63623046875, 2707.67919921875, 2707.679443359375, 2707.679443359375] + # test_ppl@all_2 [1827.4515380859375, 2707.63623046875, 2707.67919921875, 2707.679443359375, 2707.679443359375] + # test_ppl@all_2_with_d [4073.36328125, 6035.2822265625, 6035.3779296875, 6035.37841796875, 6035.37841796875] + topic_model = TopicModel( artm_model=artm_model, model_id='0', diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 54268dc..1a9979a 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -23,6 +23,7 @@ ) from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection +from topnum.model_constructor import init_model_from_family from topnum.scores._base_coherence_score import ( SpecificityEstimationMethod, TextType, @@ -210,12 +211,12 @@ def __init__( f' Are you sure you want to proceed (yes/no)?' ) - answer = input() + #answer = input() - if strtobool(answer) is False: - warnings.warn('Exiting') + #if strtobool(answer) is False: + # warnings.warn('Exiting') - exit(0) + # exit(0) self._topic_score_threshold_percentile = topic_score_threshold_percentile @@ -316,6 +317,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) topic_model = self._train_func[model_number]( dataset=self._dataset, + main_modality=self._main_modality, model_number=model_number, num_topics=self._one_model_num_topics[model_number], num_fit_iterations=self._num_fit_iterations, @@ -343,10 +345,13 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self.save() - threshold = self._aggregate_scores_for_models( - raw_topic_scores[self._main_topic_score.name], - self._topic_score_threshold_percentile - ) + if self._topic_score_threshold_percentile < 1: + threshold = self._topic_score_threshold_percentile + else: + threshold = self._aggregate_scores_for_models( + raw_topic_scores[self._main_topic_score.name], + self._topic_score_threshold_percentile + ) _logger.info('Finding new topics...') @@ -380,7 +385,8 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) good_new_topics = [ topic_index for topic_index, topic_name in enumerate(phi.columns) - if raw_topic_scores[self._main_topic_score.name][topic_name] is not None and + if topic_name in raw_topic_scores[self._main_topic_score.name] and + raw_topic_scores[self._main_topic_score.name][topic_name] is not None and raw_topic_scores[self._main_topic_score.name][topic_name] >= threshold ] topics_for_append, topics_for_update, topics_for_update_reverse = ( @@ -390,10 +396,15 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) ) model_topic_current_scores = list() + num_model_topics = len(topic_model.get_phi().columns) _logger.info('Calculating model topic scores...') for topic_index, topic_name in enumerate(topic_model.get_phi().columns): + if hasattr(topic_model, 'has_bcg') and topic_index == num_model_topics - 1: + print('Skipping saving scores for bcg topic') + continue + topic_scores = dict() topic_word_prob_values = topic_model.get_phi()[topic_name].values @@ -443,7 +454,9 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._topic_bank.delete_topic(topics_for_update_reverse[topic_index]) self._result[_KEY_MODEL_TOPIC_SCORES].append(model_topic_current_scores) - self._result[_KEY_BANK_TOPIC_SCORES] = self._topic_bank.topic_scores # TODO: append + self._result[_KEY_BANK_TOPIC_SCORES].append( + self._topic_bank.topic_scores # TODO: append + ) self.save() @@ -465,17 +478,102 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) else: bank_phi = self._get_phi(self._topic_bank.topics, word2index) + # TODO: you know + from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer + + class FastFixPhiRegularizer(BaseRegularizer): + _VERY_BIG_TAU = 10 ** 9 + + def __init__(self, name: str, phi, topic_names: List[str]): + super().__init__(name, tau=self._VERY_BIG_TAU) + + self._topic_names = topic_names + self._topic_indices = None + self._phi = phi + + def grad(self, pwt, nwt): + # print('Fixing') + + rwt = np.zeros_like(pwt) + parent_phi = self._phi + + rwt[:, self._topic_indices] += parent_phi.values[:, self._topic_indices] + + return self.tau * rwt + + def attach(self, model): + super().attach(model) + + phi = self._model.get_phi() + self._topic_indices = [ + phi.columns.get_loc(topic_name) + for topic_name in self._topic_names + ] + + regularizer = FastFixPhiRegularizer( + name='fix', + phi=bank_phi, + topic_names=bank_phi.columns, + ) + + + bank_model = _get_topic_model( self._dataset, + main_modality=self._main_modality, phi=bank_phi, scores=self._all_model_scores, num_safe_fit_iterations=1 ) - bank_model._fit(self._dataset.get_batch_vectorizer(), 1) + # Safe fit to make topics so-so + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=1, + ) + bank_model._model.scores.add( + artm.scores.PerplexityScore( + name=f'ppl_fair', + ) + ) + # bank_model._fit(self._dataset.get_batch_vectorizer(), 1) + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=5, + custom_regularizers={ + regularizer.name: regularizer, + } + ) _logger.info('Computing default scores for bank model...') scores.update(self._get_default_scores(bank_model)) + scores['ppl_fair'] = bank_model.scores['ppl_fair'][-1] + + + bank_model = init_model_from_family('sparse', self._dataset, self._main_modality, len(bank_phi.columns), 0) + # Safe fit to make topics so-so + # bank_model.has_bcg = True + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=1, + ) + bank_model._model.scores.add( + artm.scores.PerplexityScore( + name=f'ppl_cheatty', + ) + ) + bank_model._fit( + self._dataset.get_batch_vectorizer(), + num_iterations=5, + custom_regularizers={ + regularizer.name: regularizer, + } + ) + + scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] + + print(f'Bank scores: {scores}') + # Topic scores already calculated From 5733971edbd9e175e68717ec70a299cee245ca8e Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 12:09:15 +0300 Subject: [PATCH 02/49] fix diversity, debug fix in topic bank --- topnum/scores/diversity_score.py | 5 ++++- .../search_methods/topic_bank/topic_bank_method.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/topnum/scores/diversity_score.py b/topnum/scores/diversity_score.py index 6293f5a..683730c 100644 --- a/topnum/scores/diversity_score.py +++ b/topnum/scores/diversity_score.py @@ -156,7 +156,10 @@ def call(self, model: TopicModel): orig_num_dists = len(condensed_distances) condensed_distances = condensed_distances[np.isfinite(condensed_distances)] filtered_num_dists = len(condensed_distances) - assert filtered_num_dists >= 0.9 * orig_num_dists, (filtered_num_dists, orig_num_dists) + + if filtered_num_dists < 0.9 * orig_num_dists: + print(f'Skipping computation of dists: {(filtered_num_dists, orig_num_dists)}.') + return -1 if self.closest: df = pd.DataFrame( diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 1a9979a..16114fc 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -204,7 +204,7 @@ def __init__( self._one_model_num_topics: List[int] = one_model_num_topics self._train_func: List[Callable[[Dataset, int, int, int], TopicModel]] = train_funcs - if topic_score_threshold_percentile < 1: + if topic_score_threshold_percentile % 1 != 0: warnings.warn( f'topic_score_threshold_percentile {topic_score_threshold_percentile}' f' is less than one! It is expected to be in [0, 100].' @@ -345,7 +345,9 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self.save() - if self._topic_score_threshold_percentile < 1: + if self._topic_score_threshold_percentile % 1 != 0: + print(f'Using absoulte threshold: {self._topic_score_threshold_percentile}.') + threshold = self._topic_score_threshold_percentile else: threshold = self._aggregate_scores_for_models( @@ -642,14 +644,14 @@ def _extract_hierarchical_relationship( hierarchy = artm.hARTM(num_processors=1) - _logger.debug(f'Creating first level with {bank_phi.shape[1]} topics') + print(f'Creating first level with {bank_phi.shape[1]} topics. Dictionary: {self._dictionary}.') level0 = hierarchy.add_level( num_topics=bank_phi.shape[1] ) level0.initialize(dictionary=self._dictionary) - _logger.debug( + print( f'Copying phi for the first level.' f' Phi shape: {bank_phi.shape}.' f' First words: {bank_phi.index[:10]}' @@ -660,7 +662,7 @@ def _extract_hierarchical_relationship( small_num_fit_iterations=1 ) - _logger.debug(f'Creating second level with {new_model_phi.shape[1]} topics') + print(f'Creating second level with {new_model_phi.shape[1]} topics') level1 = hierarchy.add_level( num_topics=new_model_phi.shape[1], @@ -683,7 +685,7 @@ def _extract_hierarchical_relationship( ) ) - _logger.debug( + print( f'Copying phi for the second level.' f' Phi shape: {new_model_phi.shape}.' f' First words: {new_model_phi.index[:10]}' From 37fa99ebeee63e3c0c6ef0bee61146cdaa4f60ee Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 12:09:35 +0300 Subject: [PATCH 03/49] add semantic var and focon intratext --- topnum/scores/_base_coherence_score.py | 12 +- topnum/scores/intratext_coherence_score.py | 152 ++++++++++++++++++--- 2 files changed, 142 insertions(+), 22 deletions(-) diff --git a/topnum/scores/_base_coherence_score.py b/topnum/scores/_base_coherence_score.py index 8aa5388..95caef2 100644 --- a/topnum/scores/_base_coherence_score.py +++ b/topnum/scores/_base_coherence_score.py @@ -68,13 +68,13 @@ class SpecificityEstimationMethod(IntEnum): Way to estimate how particular word is specific for particular topic. Unlike probability, eg. p(w | t), specificity_estimation takes into account values for all topics, eg. p(w | t_1), p(w | t_2), ..., p(w | t_n): - the higher the value p(w | t) comparing other p(w | t_i), + the higher the value p(w | t) comparing to other p(w | t_i), the higher the specificity_estimation of word "w" for the topic "t" Attributes ---------- NONE - Don't try to estimate specificity_estimation, return the probability as is + Don't try to estimate specificity, return the probability as is MAXIMUM From probability, corresponding to word and topic, extract *maximum* among probabilities for the word and other topics @@ -171,6 +171,8 @@ def compute( word_topic_relatednesses = self._get_word_topic_relatednesses(model) + # TODO: topic coherence may be evaluated on any peace of text + # (paragraph, sentence, phrase), that is, not only on whole documents topic_document_coherences = np.zeros((len(topics), len(documents))) document_indices_with_topic_coherence = defaultdict(list) @@ -330,8 +332,10 @@ def _get_relatedness( topic: str, word_topic_relatednesses: pd.DataFrame) -> float: - if word in word_topic_relatednesses.index: - return word_topic_relatednesses.loc[word, topic] + # if word in word_topic_relatednesses.index: + # return word_topic_relatednesses.loc[word, topic] + + return word_topic_relatednesses.loc[word, topic] _logger.warning( f'The word "{word}" not found in Word-Topic relatedness matrix!' diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 23d327a..84a04f4 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -33,19 +33,29 @@ class ComputationMethod(IntEnum): Attributes ---------- SEGMENT_LENGTH - Estimate the length of topic segments + Estimate the length of topic segments (TopLen) SEGMENT_WEIGHT Estimate the weight of topic segment - (weight - sum of specificities for the topic over words in segment) + (weight as sum of specificities for the topic over words in segment) SUM_OVER_WINDOW Sum of specificities for the topic over words in given window. The process is as follows: word of the topic is found in text, it is the center of the first window; - next word of the topic is found (outside of the previous window), window; etc + next word of the topic is found (outside of the previous window), + it is the center of the new window; etc + VARIANCE_IN_WINDOW + Estimate the variance between segment word vector components + corresponding to the topic (SemantiC_Var) + FOCUS_CONSISTENCY + Estimate how much text adjacent words differ, + summing the pairs of differences between max components + of corresponding word vectors (FoCon) """ SEGMENT_LENGTH = auto() SEGMENT_WEIGHT = auto() SUM_OVER_WINDOW = auto() + VARIANCE_IN_WINDOW = auto() + FOCUS_CONSISTENCY = auto() class IntratextCoherenceScore(BaseTopicScore): @@ -191,11 +201,12 @@ def __init__( f'Wrong "window": \"{window}\". ' f'Expect to be \"int\"') - if window < 0 or (window == 0 and computation_method == ComputationMethod.SUM_OVER_WINDOW): + if window < 0 or (window == 0 and computation_method in [ComputationMethod.SUM_OVER_WINDOW, + ComputationMethod.VARIANCE_IN_WINDOW]): raise ValueError( f'Wrong value for "window": \"{window}\". ' f'Expect to be non-negative. And greater than zero in case ' - f'computation_method == ComputationMethod.SUM_OVER_WINDOW') + f'computation_method is SUM_OVER_WINDOW or VARIANCE_IN_WINDOW.') self._computation_method = computation_method self._max_num_out_of_topic_words = max_num_out_of_topic_words @@ -218,6 +229,20 @@ def _compute_coherence( return average_sum_over_window + elif self._computation_method == ComputationMethod.VARIANCE_IN_WINDOW: + average_variance_in_window = self._compute_variance_in_window( + topic, words, word_topic_relatednesses + ) + + return average_variance_in_window + + elif self._computation_method == ComputationMethod.FOCUS_CONSISTENCY: + average_focus_consistency = self._compute_focus_consistency( + topic, words, word_topic_relatednesses + ) + + return average_focus_consistency + topic_segment_length, topic_segment_weight = self._compute_segment_characteristics( topic, words, word_topic_relatednesses ) @@ -228,6 +253,19 @@ def _compute_coherence( elif self._computation_method == ComputationMethod.SEGMENT_WEIGHT: return topic_segment_weight + @staticmethod + def _get_word_topic_index( + word: WordType, + word_topic_relatednesses: pd.DataFrame, + word_topic_indices: np.array, + ) -> int: + if word not in word_topic_relatednesses.index: + return -1 + else: + return word_topic_indices[ + word_topic_relatednesses.index.get_loc(word) + ] + def _compute_segment_characteristics( self, topic: str, @@ -241,13 +279,12 @@ def _compute_segment_characteristics( topic_index = word_topic_relatednesses.columns.get_loc(topic) word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) - def get_word_topic_index(word): - if word not in word_topic_relatednesses.index: - return -1 - else: - return word_topic_indices[ - word_topic_relatednesses.index.get_loc(word) - ] + def get_word_topic_index(word: WordType) -> int: + return self._get_word_topic_index( + word=word, + word_topic_relatednesses=word_topic_relatednesses, + word_topic_indices=word_topic_indices, + ) index = 0 @@ -304,12 +341,11 @@ def _sum_relatednesses_over_window( word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) def get_word_topic_index(word: WordType) -> int: - if word not in word_topic_relatednesses.index: - return -1 - else: - return word_topic_indices[ - word_topic_relatednesses.index.get_loc(word) - ] + return self._get_word_topic_index( + word=word, + word_topic_relatednesses=word_topic_relatednesses, + word_topic_indices=word_topic_indices, + ) def find_next_topic_word(starting_index: int) -> int: index = starting_index @@ -352,3 +388,83 @@ def find_next_topic_word(starting_index: int) -> int: assert word_index > original_word_index or word_index == -1 return float(np.mean(sums)) + + def _compute_variance_in_window( + self, + topic: str, + words: List[WordType], + word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: + + topic_relatednesses = [ + _IntratextCoherenceScore._get_relatedness( + word, topic, word_topic_relatednesses + ) + for word in words + ] + + variances = list() + index = 0 + + while index == 0 or index + self._window - 1 < len(words): + relatedness_window = topic_relatednesses[index:index + self._window] + variances.append(np.var(relatedness_window)) + index += 1 + + if len(variances) == 0: + return None + else: + return -1 * float(np.mean(variances)) # the higher the better + + def _compute_focus_consistency( + self, + topic: str, + words: List[WordType], + word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: + + if len(words) == 0: + return None + + word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + + def get_word_topic_index(word: WordType) -> int: + return self._get_word_topic_index( + word=word, + word_topic_relatednesses=word_topic_relatednesses, + word_topic_indices=word_topic_indices, + ) + + word_topics = [ + word_topic_relatednesses.columns[get_word_topic_index(word)] + for word in words + ] + + differences = list() + index = 0 + + while index + 1 < len(words): # like window = 2 + cur_word, next_word = words[index], words[index + 1] + cur_topic, next_topic = word_topics[index], word_topics[index + 1] + + r_cw_ct = _IntratextCoherenceScore._get_relatedness( + cur_word, cur_topic, word_topic_relatednesses + ) + r_cw_nt = _IntratextCoherenceScore._get_relatedness( + cur_word, next_topic, word_topic_relatednesses + ) + r_nw_ct = _IntratextCoherenceScore._get_relatedness( + next_word, cur_topic, word_topic_relatednesses + ) + r_nw_nt = _IntratextCoherenceScore._get_relatedness( + next_word, next_topic, word_topic_relatednesses + ) + + diff1 = abs(r_cw_ct - r_nw_ct) + diff2 = abs(r_cw_nt - r_nw_nt) + differences.append(diff1 + diff2) + + index += 1 + + if len(differences) == 0: + return None + else: + return -1 * float(np.mean(differences)) # the higher the better From d058374be057be070dbf8f54ac2a675c71742533 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 12:13:43 +0300 Subject: [PATCH 04/49] lick code --- topnum/scores/intratext_coherence_score.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 84a04f4..2a8fd4b 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -297,7 +297,7 @@ def get_word_topic_index(word: WordType) -> int: continue segment_length = 1 - segment_weight = _IntratextCoherenceScore._get_relatedness( + segment_weight = self._get_relatedness( words[index], topic, word_topic_relatednesses ) @@ -310,7 +310,7 @@ def get_word_topic_index(word: WordType) -> int: num_out_of_topic_words += 1 else: segment_length += 1 - segment_weight += _IntratextCoherenceScore._get_relatedness( + segment_weight += self._get_relatedness( words[index], topic, word_topic_relatednesses ) @@ -374,9 +374,7 @@ def find_next_topic_word(starting_index: int) -> int: sum_in_window = np.sum( [ - _IntratextCoherenceScore._get_relatedness( - w, topic, word_topic_relatednesses - ) + self._get_relatedness(w, topic, word_topic_relatednesses) for w in words[window_lower_bound:window_upper_bound] ] ) @@ -396,9 +394,7 @@ def _compute_variance_in_window( word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: topic_relatednesses = [ - _IntratextCoherenceScore._get_relatedness( - word, topic, word_topic_relatednesses - ) + self._get_relatedness(word, topic, word_topic_relatednesses) for word in words ] @@ -445,16 +441,16 @@ def get_word_topic_index(word: WordType) -> int: cur_word, next_word = words[index], words[index + 1] cur_topic, next_topic = word_topics[index], word_topics[index + 1] - r_cw_ct = _IntratextCoherenceScore._get_relatedness( + r_cw_ct = self._get_relatedness( cur_word, cur_topic, word_topic_relatednesses ) - r_cw_nt = _IntratextCoherenceScore._get_relatedness( + r_cw_nt = self._get_relatedness( cur_word, next_topic, word_topic_relatednesses ) - r_nw_ct = _IntratextCoherenceScore._get_relatedness( + r_nw_ct = self._get_relatedness( next_word, cur_topic, word_topic_relatednesses ) - r_nw_nt = _IntratextCoherenceScore._get_relatedness( + r_nw_nt = self._get_relatedness( next_word, next_topic, word_topic_relatednesses ) From baa026bd0a01e6478a7636535a6f800631d00550 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 17:15:14 +0300 Subject: [PATCH 05/49] add tests for new old coherences --- topnum/tests/test_coherence_scores.py | 73 ++++++++++++++++----------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/topnum/tests/test_coherence_scores.py b/topnum/tests/test_coherence_scores.py index 256378c..17274e4 100644 --- a/topnum/tests/test_coherence_scores.py +++ b/topnum/tests/test_coherence_scores.py @@ -41,9 +41,31 @@ SMALL_SEGMENT_LENGTH_PROBABILITIES = [0.3, 0.45, 0.25] DOCUMENT_LENGTH = 100 TOP_WORD_PROBABILITY_TIMES_BIGGER = 4 + PHI_FILE_NAME = 'phi.csv' DATASET_FILE_NAME = 'dataset.csv' +TEXT_TYPES = [ + TextType.VW_TEXT, + TextType.RAW_TEXT, +] +COMPUTATION_METHODS = [ + ComputationMethod.SEGMENT_LENGTH, + ComputationMethod.SEGMENT_WEIGHT, + ComputationMethod.SUM_OVER_WINDOW, + ComputationMethod.VARIANCE_IN_WINDOW, + ComputationMethod.FOCUS_CONSISTENCY, +] +WORD_TOPIC_RELATEDNESS_TYPES = [ + WordTopicRelatednessType.PWT, + WordTopicRelatednessType.PTW, +] +SPECIFICITY_ESTIMATION_METHODS = [ + SpecificityEstimationMethod.NONE, + SpecificityEstimationMethod.MAXIMUM, + SpecificityEstimationMethod.AVERAGE, +] + class _MockModel(BaseModel): def __init__(self, phi: pd.DataFrame): @@ -211,12 +233,10 @@ def get_vw_text(cls, doc: str, document_words: Dict[str, List[str]]) -> str: @pytest.mark.parametrize( 'text_type, computation_method, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, - ComputationMethod.SUM_OVER_WINDOW], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + COMPUTATION_METHODS, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_compute_intratext( @@ -246,12 +266,10 @@ def test_compute_intratext_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, computation_method, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, - ComputationMethod.SUM_OVER_WINDOW], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + COMPUTATION_METHODS, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_call_intratext( @@ -281,12 +299,10 @@ def test_call_intratext_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, computation_method, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, - ComputationMethod.SUM_OVER_WINDOW], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + COMPUTATION_METHODS, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) @pytest.mark.parametrize( @@ -324,10 +340,9 @@ def test_call_intratext_with_specified_documents( @pytest.mark.parametrize( 'text_type, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_compute_toptokens( @@ -355,10 +370,9 @@ def test_compute_toptokens_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) def test_call_toptokens( @@ -386,10 +400,9 @@ def test_call_toptokens_small_big_data(self, keep_in_memory) -> None: @pytest.mark.parametrize( 'text_type, word_topic_relatedness, specificity_estimation', list(product( - [TextType.VW_TEXT, TextType.RAW_TEXT], - [WordTopicRelatednessType.PWT, WordTopicRelatednessType.PTW], - [SpecificityEstimationMethod.NONE, SpecificityEstimationMethod.MAXIMUM, - SpecificityEstimationMethod.AVERAGE] + TEXT_TYPES, + WORD_TOPIC_RELATEDNESS_TYPES, + SPECIFICITY_ESTIMATION_METHODS )) ) @pytest.mark.parametrize( From 1df50f77e8a7903ce360b52db853f4d64307901c Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 17:21:18 +0300 Subject: [PATCH 06/49] fix tests --- .../topic_bank/one_model_train_funcs.py | 3 ++- .../topic_bank/phi_initialization/arora.py | 5 +++++ topnum/tests/test_topic_bank.py | 18 ++++++++++++------ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/topnum/search_methods/topic_bank/one_model_train_funcs.py b/topnum/search_methods/topic_bank/one_model_train_funcs.py index 477389f..af85ba1 100644 --- a/topnum/search_methods/topic_bank/one_model_train_funcs.py +++ b/topnum/search_methods/topic_bank/one_model_train_funcs.py @@ -193,6 +193,7 @@ def background_topics_train_func( dataset, num_topics=num_topics, seed=model_number, + **kwargs, ) num_fit_iterations_with_scores = 1 @@ -235,7 +236,7 @@ def background_topics_train_func( def _get_topic_model( dataset: Dataset, - main_modality, + main_modality: str, phi: pd.DataFrame = None, num_topics: int = None, seed: int = None, diff --git a/topnum/search_methods/topic_bank/phi_initialization/arora.py b/topnum/search_methods/topic_bank/phi_initialization/arora.py index 50bfa54..603c288 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/arora.py +++ b/topnum/search_methods/topic_bank/phi_initialization/arora.py @@ -17,6 +17,11 @@ ) +np.int = np.int32 # Arora uses old NumPy (current version has not "int" attribute) + # https://stackoverflow.com/q/74946845/8094251 + # https://github.com/scikit-learn-contrib/boruta_py/issues/122#issuecomment-1914122968 + + def compute_phi( dataset: Dataset, main_modality: str, diff --git a/topnum/tests/test_topic_bank.py b/topnum/tests/test_topic_bank.py index 298c3c4..a0f609f 100644 --- a/topnum/tests/test_topic_bank.py +++ b/topnum/tests/test_topic_bank.py @@ -152,12 +152,14 @@ def train_func( model_number: int, num_topics: int, num_fit_iterations: int, - scores: List[BaseScore] = None) -> TopicModel: + scores: List[BaseScore] = None, + **kwargs) -> TopicModel: return specific_initial_phi_train_func( dataset, model_number, num_topics, num_fit_iterations, scores, - initialize_phi_func=initialize_phi_func + initialize_phi_func=initialize_phi_func, + **kwargs ) self._test_topic_bank( @@ -198,12 +200,14 @@ def train_func( model_number: int, num_topics: int, num_fit_iterations: int, - scores: List[BaseScore] = None) -> TopicModel: + scores: List[BaseScore] = None, + **kwargs) -> TopicModel: return specific_initial_phi_train_func( dataset, model_number, num_topics, num_fit_iterations, scores, - initialize_phi_func=initialize_phi_func + initialize_phi_func=initialize_phi_func, + **kwargs ) self._test_topic_bank( @@ -244,12 +248,14 @@ def train_func( model_number: int, num_topics: int, num_fit_iterations: int, - scores: List[BaseScore] = None) -> TopicModel: + scores: List[BaseScore] = None, + **kwargs) -> TopicModel: return specific_initial_phi_train_func( dataset, model_number, num_topics, num_fit_iterations, scores, - initialize_phi_func=initialize_phi_func + initialize_phi_func=initialize_phi_func, + **kwargs ) self._test_topic_bank( From d2dd2832103d24c23081ef446bf9273993693776 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 17:28:24 +0300 Subject: [PATCH 07/49] return cautious get relatedness (allow unknown words) --- topnum/scores/_base_coherence_score.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/topnum/scores/_base_coherence_score.py b/topnum/scores/_base_coherence_score.py index 95caef2..17a26fb 100644 --- a/topnum/scores/_base_coherence_score.py +++ b/topnum/scores/_base_coherence_score.py @@ -332,17 +332,15 @@ def _get_relatedness( topic: str, word_topic_relatednesses: pd.DataFrame) -> float: - # if word in word_topic_relatednesses.index: - # return word_topic_relatednesses.loc[word, topic] - - return word_topic_relatednesses.loc[word, topic] - - _logger.warning( - f'The word "{word}" not found in Word-Topic relatedness matrix!' - f' Returning mean value over all word relatednesses for topic "{topic}"' - ) + try: + return word_topic_relatednesses.loc[word, topic] + except KeyError as error: + _logger.warning( + f'Some word not found in Word-Topic relatedness matrix: "{error}"!' + f' Returning mean value over all word relatednesses for topic "{topic}".' + ) - return float(np.mean(word_topic_relatednesses.values)) + return float(np.mean(word_topic_relatednesses.values)) # TODO: DRY def save(self, path: str) -> None: From eebb429420913e04bd4885efb46cae834323b75a Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 18:15:15 +0300 Subject: [PATCH 08/49] tributize newly added coherences --- topnum/scores/_base_coherence_score.py | 5 +++-- topnum/scores/intratext_coherence_score.py | 22 ++++++++++++++++++++++ topnum/tests/test_coherence_scores.py | 19 +++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/topnum/scores/_base_coherence_score.py b/topnum/scores/_base_coherence_score.py index 17a26fb..273f71c 100644 --- a/topnum/scores/_base_coherence_score.py +++ b/topnum/scores/_base_coherence_score.py @@ -88,6 +88,8 @@ class SpecificityEstimationMethod(IntEnum): class _BaseCoherenceScore(TopicNetBaseScore): + _EPS = np.finfo(float).tiny + def __init__( self, dataset: Dataset, @@ -257,7 +259,6 @@ def _get_word_topic_probs(self, phi: pd.DataFrame) -> pd.DataFrame: elif self._word_topic_relatedness == WordTopicRelatednessType.PTW: # Treat all topics as equally probable - eps = np.finfo(float).tiny pwt = phi pwt_values = pwt.values @@ -265,7 +266,7 @@ def _get_word_topic_probs(self, phi: pd.DataFrame) -> pd.DataFrame: return pd.DataFrame( index=pwt.index, columns=pwt.columns, - data=pwt_values / (pwt_values.sum(axis=1).reshape(-1, 1) + eps) + data=pwt_values / (pwt_values.sum(axis=1).reshape(-1, 1) + self._EPS) ) assert False diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 2a8fd4b..b7650a1 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import warnings from enum import ( auto, @@ -58,6 +59,12 @@ class ComputationMethod(IntEnum): FOCUS_CONSISTENCY = auto() +_RESEARCH_COMPUTATION_METHODS = [ + ComputationMethod.VARIANCE_IN_WINDOW, + ComputationMethod.FOCUS_CONSISTENCY, +] + + class IntratextCoherenceScore(BaseTopicScore): """ Computes intratext coherence @@ -191,6 +198,16 @@ def __init__( f'Wrong "computation_method": \"{computation_method}\". ' f'Expect to be \"{ComputationMethod}\"') + if computation_method in _RESEARCH_COMPUTATION_METHODS: + warnings.warn( + f"Coherences {_RESEARCH_COMPUTATION_METHODS} were also presented in the original paper" + f" but preference should be given to other (TopLen-based) methods." + f" Still, coherences {_RESEARCH_COMPUTATION_METHODS} are also implemented," + f" partly as a tribute, partly for research purposes." + f" Once again, coherence {computation_method} is not intended for \"production\" use." + f" But you do you, it's not like there's a coherence police or something." + ) + if not isinstance(max_num_out_of_topic_words, int): raise TypeError( f'Wrong "max_num_out_of_topic_words": \"{max_num_out_of_topic_words}\". ' @@ -403,7 +420,12 @@ def _compute_variance_in_window( while index == 0 or index + self._window - 1 < len(words): relatedness_window = topic_relatednesses[index:index + self._window] + # TODO: better differentiate good and bad topics?.. + # (low variance is not necessarily a good "goodness" sign: + # for example, sequences [100, 100, 100] + # and [-17.5, -17.5, -17.5] both have zero variance) variances.append(np.var(relatedness_window)) + index += 1 if len(variances) == 0: diff --git a/topnum/tests/test_coherence_scores.py b/topnum/tests/test_coherence_scores.py index 17274e4..d6d3f78 100644 --- a/topnum/tests/test_coherence_scores.py +++ b/topnum/tests/test_coherence_scores.py @@ -53,6 +53,8 @@ ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, ComputationMethod.SUM_OVER_WINDOW, +] +RESEARCH_COMPUTATION_METHODS = [ ComputationMethod.VARIANCE_IN_WINDOW, ComputationMethod.FOCUS_CONSISTENCY, ] @@ -67,6 +69,14 @@ ] +RESEARCH_INTRATEXT_MESSAGE = ( + f"Coherences {RESEARCH_COMPUTATION_METHODS} were presented in the original paper" + f" and are implemented partly as a tribute," + f" partly for research purposes." + f" For real use, preference should be given to {COMPUTATION_METHODS} methods." +) + + class _MockModel(BaseModel): def __init__(self, phi: pd.DataFrame): self._phi = phi @@ -246,6 +256,9 @@ def test_compute_intratext( word_topic_relatedness: WordTopicRelatednessType, specificity_estimation: SpecificityEstimationMethod) -> None: + if computation_method in RESEARCH_COMPUTATION_METHODS: + pytest.xfail(RESEARCH_INTRATEXT_MESSAGE) + score = _IntratextCoherenceScore( self.dataset, text_type=text_type, @@ -279,6 +292,9 @@ def test_call_intratext( word_topic_relatedness: WordTopicRelatednessType, specificity_estimation: SpecificityEstimationMethod) -> None: + if computation_method in RESEARCH_COMPUTATION_METHODS: + pytest.xfail(RESEARCH_INTRATEXT_MESSAGE) + score = _IntratextCoherenceScore( self.dataset, text_type=text_type, @@ -317,6 +333,9 @@ def test_call_intratext_with_specified_documents( specificity_estimation: SpecificityEstimationMethod, what_documents: str) -> None: + if computation_method in RESEARCH_COMPUTATION_METHODS: + pytest.xfail(RESEARCH_INTRATEXT_MESSAGE) + if what_documents == 'first': documents = [self.documents[0]] elif what_documents == 'all': From 831a20a784972bd1970c3b12ef43a4ad0d8d2197 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 18:16:50 +0300 Subject: [PATCH 09/49] xfail semantic var and focon in tests --- topnum/tests/test_coherence_scores.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/topnum/tests/test_coherence_scores.py b/topnum/tests/test_coherence_scores.py index d6d3f78..243708e 100644 --- a/topnum/tests/test_coherence_scores.py +++ b/topnum/tests/test_coherence_scores.py @@ -53,6 +53,8 @@ ComputationMethod.SEGMENT_LENGTH, ComputationMethod.SEGMENT_WEIGHT, ComputationMethod.SUM_OVER_WINDOW, + ComputationMethod.VARIANCE_IN_WINDOW, + ComputationMethod.FOCUS_CONSISTENCY, ] RESEARCH_COMPUTATION_METHODS = [ ComputationMethod.VARIANCE_IN_WINDOW, From 6796a0b37115f4103392ca9c9514c7dd645434ce Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 19:39:19 +0300 Subject: [PATCH 10/49] fix scores tests --- topnum/scores/arun.py | 13 ++++++++++--- topnum/scores/plavin.py | 2 +- topnum/search_methods/stability_method.py | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/topnum/scores/arun.py b/topnum/scores/arun.py index af37642..3196116 100644 --- a/topnum/scores/arun.py +++ b/topnum/scores/arun.py @@ -63,8 +63,15 @@ def call(self, model: TopicModel): phi = model.get_phi(class_ids=self.modalities) c_m1 = np.linalg.svd(phi, compute_uv=False) + c_m2 = self.document_lengths.dot(theta.T) - c_m2 += 0.0001 # we need this to prevent components equal to zero + c_m2 = c_m2.to_numpy() + + # Otherwise, _symmetric_kl will result in error (np.float32 vs np.float arrays...) + c_m2 = c_m2.astype(c_m1.dtype, copy=False) + + # We need this to prevent components equal to zero + c_m2 += 0.0001 if len(c_m1) != phi.shape[1]: warnings.warn( @@ -76,10 +83,10 @@ def call(self, model: TopicModel): return 1.0 - # we do not need to normalize these vectors + # We do not need to normalize these vectors return _symmetric_kl(c_m1, c_m2) - # TODO: this piece is copy-pastd among three different scores + # TODO: this piece is copy-pasted among three different scores def save(self, path: str) -> None: dataset = self._dataset self._dataset = None diff --git a/topnum/scores/plavin.py b/topnum/scores/plavin.py index 183639b..a2abe2c 100644 --- a/topnum/scores/plavin.py +++ b/topnum/scores/plavin.py @@ -27,7 +27,7 @@ def _compute_kl(T, theta, doc_lengths): theta_distrib = theta.dot(doc_lengths) # TODO: dtype was 'object'? how could it be? - theta_distrib = np.array(theta_distrib.values, dtype=np.float) + theta_distrib = np.array(theta_distrib.values, dtype=uniform_distrib.dtype) return stats.entropy(uniform_distrib, theta_distrib) diff --git a/topnum/search_methods/stability_method.py b/topnum/search_methods/stability_method.py index 31896d6..9bb6847 100644 --- a/topnum/search_methods/stability_method.py +++ b/topnum/search_methods/stability_method.py @@ -12,7 +12,7 @@ import sys import tempfile -#from lapsolver import solve_dense +from lapsolver import solve_dense from tqdm import tqdm from typing import ( Any, From 1db643a887f49d5a846562750f7c6c2f9dbe7869 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 22:17:35 +0300 Subject: [PATCH 11/49] fix main modality usage in topic bank train and init funcs --- .../topic_bank/one_model_train_funcs.py | 43 ++++++++++++++++--- .../topic_bank/phi_initialization/arora.py | 14 ++++-- .../topic_bank/phi_initialization/cdc.py | 10 ++--- .../topic_bank/phi_initialization/utils.py | 4 ++ .../topic_bank/topic_bank_method.py | 16 ++++--- 5 files changed, 64 insertions(+), 23 deletions(-) diff --git a/topnum/search_methods/topic_bank/one_model_train_funcs.py b/topnum/search_methods/topic_bank/one_model_train_funcs.py index af85ba1..8545c95 100644 --- a/topnum/search_methods/topic_bank/one_model_train_funcs.py +++ b/topnum/search_methods/topic_bank/one_model_train_funcs.py @@ -5,7 +5,8 @@ from topicnet.cooking_machine.models import TopicModel from typing import ( Callable, - List + List, + Optional, ) from topnum.scores.base_score import BaseScore @@ -15,7 +16,7 @@ def default_train_func( dataset: Dataset, - main_modality: str, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -55,6 +56,7 @@ def default_train_func( def specific_initial_phi_train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -64,6 +66,7 @@ def specific_initial_phi_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, **kwargs, @@ -73,6 +76,12 @@ def specific_initial_phi_train_func( initialize_phi_func = initialize_phi_funcs.initialize_randomly initial_phi = initialize_phi_func(dataset, model_number, num_topics) + + if main_modality is not None: + initial_phi = init_phi_utils.get_modality_phi( + initial_phi, modality=main_modality + ) + init_phi_utils._copy_phi(topic_model._model, initial_phi) num_fit_iterations_with_scores = 1 @@ -93,6 +102,7 @@ def specific_initial_phi_train_func( def regularization_train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -104,6 +114,7 @@ def regularization_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, **kwargs, @@ -157,6 +168,7 @@ def regularization_train_func( def background_topics_train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -167,6 +179,7 @@ def background_topics_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics + num_background_topics, seed=model_number, **kwargs, @@ -191,6 +204,7 @@ def background_topics_train_func( topic_model = _get_topic_model( dataset, + main_modality=main_modality, num_topics=num_topics, seed=model_number, **kwargs, @@ -236,15 +250,25 @@ def background_topics_train_func( def _get_topic_model( dataset: Dataset, - main_modality: str, + main_modality: Optional[str], phi: pd.DataFrame = None, num_topics: int = None, seed: int = None, scores: List[BaseScore] = None, - num_safe_fit_iterations: int = 3, + num_safe_fit_iterations: int = 3, # TODO: remove param (only FastFixPhiRegularizer to be used for safe copy) num_processors: int = 3, cache_theta: bool = False) -> TopicModel: + if phi is not None: + raise ValueError( + "Do not use `phi` parameter, use `num_topics` instead!" + " Currently, this method is not responsible for copying Phi matrix." + " We have temporarily turned off this functionality," + " because the realization appeared not perfectly reliable." + " In the future, Phi copying will be improved and returned" + " (it will be based on FastFixPhiRegularizer)." + ) + dictionary = dataset.get_dictionary() # for modality in dataset.get_possible_modalities(): @@ -265,10 +289,15 @@ def _get_topic_model( # else: # artm_model = artm.ARTM(topic_names=topic_names, seed=seed) - if seed is None: - artm_model = artm.ARTM(topic_names=topic_names, class_ids={main_modality: 1}) # TODO: not list, but dict!!! + if main_modality is not None: + class_ids = {main_modality: 1} else: - artm_model = artm.ARTM(topic_names=topic_names, seed=seed, class_ids={main_modality: 1}) + class_ids = None + + if seed is None: + seed = -1 # for ARTM, it means "no seed" + + artm_model = artm.ARTM(topic_names=topic_names, seed=seed, class_ids=class_ids) # TODO: not list, but dict!!! # artm_model = init_model(topic_names, class_ids=[MAIN_MODALITY]) diff --git a/topnum/search_methods/topic_bank/phi_initialization/arora.py b/topnum/search_methods/topic_bank/phi_initialization/arora.py index 603c288..53642ca 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/arora.py +++ b/topnum/search_methods/topic_bank/phi_initialization/arora.py @@ -51,7 +51,10 @@ def compute_phi( } word_document_frequencies = _count_word_document_frequencies( - dataset, text_column, word2index + dataset=dataset, + vocabulary_size=len(phi_index), + text_column=text_column, + word2index=word2index, ) word_document_frequencies = scipy.sparse.csc_matrix(word_document_frequencies) @@ -73,12 +76,15 @@ def compute_phi( def _count_word_document_frequencies( - dataset: Dataset, text_column: str, word2index: Dict[str, int]) -> np.ndarray: + dataset: Dataset, + vocabulary_size: int, + text_column: str, + word2index: Dict[str, int], + ) -> np.ndarray: num_documents = len(dataset._data) # TODO: for big data may be slow here - words_dimension_size = max(list(word2index.values())) + 1 frequencies = np.zeros( - shape=(words_dimension_size, num_documents) + shape=(vocabulary_size, num_documents) ) for doc_index, doc_text in enumerate(dataset._data[text_column]): diff --git a/topnum/search_methods/topic_bank/phi_initialization/cdc.py b/topnum/search_methods/topic_bank/phi_initialization/cdc.py index 39156d4..7ec5add 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/cdc.py +++ b/topnum/search_methods/topic_bank/phi_initialization/cdc.py @@ -69,8 +69,9 @@ def compute_phi( word_in_word_frequencies, document_frequencies = _count_word_in_word_frequencies( dataset=dataset, + vocabulary_size=len(phi_index), text_column=text_column, - word2index=word2index + word2index=word2index, ) word_in_word_probabilities = _count_word_in_word_probabilities( word_in_word_frequencies @@ -122,6 +123,7 @@ def _check_clusterization_distance_func( def _count_word_in_word_frequencies( dataset: Dataset, + vocabulary_size: int, text_column: str, word2index: Dict[str, int], split_on_paragraphs: bool = True, @@ -130,13 +132,11 @@ def _count_word_in_word_frequencies( smoothing_value: float = 0.01, num_docs_to_log: int = 500) -> Tuple[np.ndarray, np.ndarray]: # 2D, 1D - words_dimension_size = max(list(word2index.values())) + 1 - frequencies = np.zeros( - shape=(words_dimension_size, words_dimension_size) + shape=(vocabulary_size, vocabulary_size) ) document_frequencies = np.zeros( - shape=(words_dimension_size,) + shape=(vocabulary_size,) ) def process_words(words: List[str]) -> None: diff --git a/topnum/search_methods/topic_bank/phi_initialization/utils.py b/topnum/search_methods/topic_bank/phi_initialization/utils.py index 156984b..aa947af 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/utils.py +++ b/topnum/search_methods/topic_bank/phi_initialization/utils.py @@ -32,6 +32,10 @@ def get_phi_index(dataset: Dataset) -> Index: return phi_index +def get_modality_phi(phi: pd.DataFrame, modality: str) -> pd.DataFrame: + return phi.iloc[phi.index.get_level_values(0).isin([modality])] + + def _copy_phi(model: artm.ARTM, phi: pd.DataFrame, phi_ref: np.ndarray = None) -> np.ndarray: model_wrapper = TopicModel(artm_model=model) base_phi_index = model_wrapper.get_phi().index diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 16114fc..e1af47d 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -58,7 +58,10 @@ default_train_func, _get_topic_model ) -from topnum.search_methods.topic_bank.phi_initialization.utils import _safe_copy_phi +from topnum.search_methods.topic_bank.phi_initialization.utils import ( + _safe_copy_phi, + get_modality_phi, +) _KEY_BANK_SCORES = 'bank_scores' @@ -314,7 +317,6 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) # TODO: stop when perplexity stabilizes _logger.info(f'Building topic model number {model_number}...') - topic_model = self._train_func[model_number]( dataset=self._dataset, main_modality=self._main_modality, @@ -359,10 +361,8 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) phi = topic_model.get_phi() - if self._main_modality is None: - phi = phi - else: - phi = phi.iloc[phi.index.get_level_values(0).isin([self._main_modality])] + if self._main_modality is not None: + phi = get_modality_phi(phi, modality=self._main_modality) if word2index is None: word2index = { @@ -523,7 +523,7 @@ def attach(self, model): bank_model = _get_topic_model( self._dataset, main_modality=self._main_modality, - phi=bank_phi, + num_topics=bank_phi.shape[1], scores=self._all_model_scores, num_safe_fit_iterations=1 ) @@ -657,6 +657,8 @@ def _extract_hierarchical_relationship( f' First words: {bank_phi.index[:10]}' ) + # TODO: use FastFixPhiRegularizer + # (seems not critical here, but nevertheless) phi_ref0 = _safe_copy_phi( level0, bank_phi, self._dataset, small_num_fit_iterations=1 From f8e90fa0c9ef6bd4bae0e450a9a43645c5d50487 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 22:24:25 +0300 Subject: [PATCH 12/49] fix topic bank modality in tests --- topnum/tests/test_topic_bank.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/topnum/tests/test_topic_bank.py b/topnum/tests/test_topic_bank.py index a0f609f..ba47b16 100644 --- a/topnum/tests/test_topic_bank.py +++ b/topnum/tests/test_topic_bank.py @@ -18,6 +18,7 @@ Callable, Dict, List, + Optional, ) from topnum.scores.base_score import BaseScore @@ -197,6 +198,7 @@ def initialize_phi_func( def train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -204,8 +206,12 @@ def train_func( **kwargs) -> TopicModel: return specific_initial_phi_train_func( - dataset, model_number, num_topics, - num_fit_iterations, scores, + dataset, + main_modality=main_modality, + model_number=model_number, + num_topics=num_topics, + num_fit_iterations=num_fit_iterations, + scores=scores, initialize_phi_func=initialize_phi_func, **kwargs ) @@ -245,6 +251,7 @@ def initialize_phi_func( def train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -252,8 +259,12 @@ def train_func( **kwargs) -> TopicModel: return specific_initial_phi_train_func( - dataset, model_number, num_topics, - num_fit_iterations, scores, + dataset, + main_modality=main_modality, + model_number=model_number, + num_topics=num_topics, + num_fit_iterations=num_fit_iterations, + scores=scores, initialize_phi_func=initialize_phi_func, **kwargs ) From 57c775f1517efc207eb9e7ac4d1b314316a752f3 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sat, 20 Jul 2024 22:27:58 +0300 Subject: [PATCH 13/49] fix topic bank modality in tests try 2 --- topnum/tests/test_topic_bank.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/topnum/tests/test_topic_bank.py b/topnum/tests/test_topic_bank.py index ba47b16..0aabae2 100644 --- a/topnum/tests/test_topic_bank.py +++ b/topnum/tests/test_topic_bank.py @@ -150,6 +150,7 @@ def initialize_phi_func( def train_func( dataset: Dataset, + main_modality: Optional[str], model_number: int, num_topics: int, num_fit_iterations: int, @@ -157,8 +158,12 @@ def train_func( **kwargs) -> TopicModel: return specific_initial_phi_train_func( - dataset, model_number, num_topics, - num_fit_iterations, scores, + dataset, + main_modality=main_modality, + model_number=model_number, + num_topics=num_topics, + num_fit_iterations=num_fit_iterations, + scores=scores, initialize_phi_func=initialize_phi_func, **kwargs ) From 462d80376850ac786a534b350dd0b740f6db92d8 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 01:12:21 +0300 Subject: [PATCH 14/49] fix arora, fix copy phi in init func, enhance topic bank tests --- .../topic_bank/one_model_train_funcs.py | 13 ++++++++++-- .../topic_bank/phi_initialization/arora.py | 3 +-- .../initialize_phi_funcs.py | 12 +++++++++-- .../topic_bank/topic_bank_method.py | 1 + topnum/tests/test_topic_bank.py | 20 +++++++++++++++++++ 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/topnum/search_methods/topic_bank/one_model_train_funcs.py b/topnum/search_methods/topic_bank/one_model_train_funcs.py index 8545c95..5e13d32 100644 --- a/topnum/search_methods/topic_bank/one_model_train_funcs.py +++ b/topnum/search_methods/topic_bank/one_model_train_funcs.py @@ -1,4 +1,5 @@ import artm +import numpy as np import pandas as pd from topicnet.cooking_machine.dataset import Dataset @@ -82,7 +83,15 @@ def specific_initial_phi_train_func( initial_phi, modality=main_modality ) - init_phi_utils._copy_phi(topic_model._model, initial_phi) + # TODO: However strange it may seem, + # it is really crucial to initialize `phi_ref` variable here. + # Otherwise, all this init-copy manipulation won't work. + # (Yes, at first glance `phi_ref` is not used anywhere, + # but apparently it is used somewhere...) + # The owls are not what they seem. + phi_ref = init_phi_utils._copy_phi(topic_model._model, initial_phi) + + assert np.allclose(phi_ref, topic_model.get_phi().to_numpy()) num_fit_iterations_with_scores = 1 @@ -239,7 +248,7 @@ def background_topics_train_func( ) # TODO: not very safe here? (if cache_theta us True, Theta not updated here) - init_phi_utils._copy_phi( + phi_ref = init_phi_utils._copy_phi( topic_model._model, specific_topics_phi, phi_ref=phi_ref diff --git a/topnum/search_methods/topic_bank/phi_initialization/arora.py b/topnum/search_methods/topic_bank/phi_initialization/arora.py index 53642ca..a3ffdab 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/arora.py +++ b/topnum/search_methods/topic_bank/phi_initialization/arora.py @@ -90,11 +90,10 @@ def _count_word_document_frequencies( for doc_index, doc_text in enumerate(dataset._data[text_column]): words = doc_text.split() preprocessed_words = list(utils._trim_vw(words)) # TODO: maybe require much memory - if preprocessed_words[:100] != words[:100]: warnings.warn(WARNING_VW_TEXT_WRONG_FORMAT) - words_counter = Counter(words) + words_counter = Counter(preprocessed_words) for w, c in words_counter.items(): if w not in word2index: diff --git a/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py b/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py index e780633..b3341f1 100644 --- a/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py +++ b/topnum/search_methods/topic_bank/phi_initialization/initialize_phi_funcs.py @@ -17,12 +17,20 @@ def initialize_randomly( phi_template = _get_phi_template(dataset, num_topics) random = np.random.RandomState(seed=model_number) - phi_values = random.random(phi_template.shape) + modality_phi_datas = [] + + for modality in phi_template.index.unique(level=0): + modality_phi_template = phi_template.xs(modality) + modality_phi_data = random.random(modality_phi_template.shape) + modality_phi_data = modality_phi_data / modality_phi_data.sum(axis=0) + modality_phi_datas.append(modality_phi_data) + + phi_data = np.vstack(modality_phi_datas) return pd.DataFrame( index=phi_template.index, columns=phi_template.columns, - data=phi_values + data=phi_data, ) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index e1af47d..2b336d6 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -659,6 +659,7 @@ def _extract_hierarchical_relationship( # TODO: use FastFixPhiRegularizer # (seems not critical here, but nevertheless) + # TODO: until then -- do not remove `phi_ref0` variable! phi_ref0 = _safe_copy_phi( level0, bank_phi, self._dataset, small_num_fit_iterations=1 diff --git a/topnum/tests/test_topic_bank.py b/topnum/tests/test_topic_bank.py index 0aabae2..44e6156 100644 --- a/topnum/tests/test_topic_bank.py +++ b/topnum/tests/test_topic_bank.py @@ -191,6 +191,10 @@ def test_topic_bank_specific_phi_cdc(self, keep_in_memory, bank_update): min_samples=1 ) + print(f'CDC Phi: {phi}') + + assert not phi.isnull().any(axis=None) + def initialize_phi_func( dataset: Dataset, model_number: int, @@ -244,6 +248,10 @@ def test_topic_bank_specific_phi_arora(self, keep_in_memory, bank_update): document_occurrences_threshold_percentage=0.001 ) + print(f'Arora Phi: {phi}') + + assert not phi.isnull().any(axis=None) + def initialize_phi_func( dataset: Dataset, model_number: int, @@ -288,6 +296,7 @@ def _test_topic_bank( one_model_num_topics: int = 2, train_func: Callable = None): + small_probability = 0.001 self.optimizer = TopicBankMethod( data=dataset, main_modality=self.main_modality, @@ -311,3 +320,14 @@ def _test_topic_bank( for result_key in ['optimum', 'optimum_std']: assert result_key in self.optimizer._result assert isinstance(self.optimizer._result[result_key], Number) + + topic_bank = self.optimizer._topic_bank + bank_topics = topic_bank.topics + bank_topic_scores = topic_bank.topic_scores + + assert len(bank_topics) == len(bank_topic_scores) + assert len(bank_topics) > 0 + + for bank_topic in bank_topics: + assert len(bank_topic) > 0 + assert any(v >= small_probability for v in bank_topic.values()) From 5db31b2b16c5ef69dda4479893488ab84784901b Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 01:43:12 +0300 Subject: [PATCH 15/49] fix topic bank tests with regularization func --- .../topic_bank/one_model_train_funcs.py | 3 +-- .../topic_bank/topic_bank_method.py | 18 +++++++++----- topnum/tests/test_topic_bank.py | 24 +++++++++++++++---- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/topnum/search_methods/topic_bank/one_model_train_funcs.py b/topnum/search_methods/topic_bank/one_model_train_funcs.py index 5e13d32..385e60e 100644 --- a/topnum/search_methods/topic_bank/one_model_train_funcs.py +++ b/topnum/search_methods/topic_bank/one_model_train_funcs.py @@ -128,7 +128,6 @@ def regularization_train_func( seed=model_number, **kwargs, ) - topic_model._model.regularizers.add( artm.regularizers.DecorrelatorPhiRegularizer(tau=decorrelating_tau) ) @@ -163,7 +162,7 @@ def regularization_train_func( topic_model._fit( dataset.get_batch_vectorizer(), - num_iterations=max(0, second_num_fit_iterations - num_fit_iterations_with_scores) + num_iterations=max(0, second_num_fit_iterations) ) _fit_model_with_scores( topic_model, diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 2b336d6..62734d4 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -18,6 +18,7 @@ Callable, Dict, List, + Optional, Tuple, Union ) @@ -79,6 +80,12 @@ _logger = logging.getLogger() +TRAIN_FUNC_TYPE = Callable[ + [Dataset, str, int, int, int, List[BaseScore]], + TopicModel +] + + class TopicBankMethod(BaseSearchMethod): _MINIMUM_TOPIC_DISTANCE = 0.0 _MAXIMUM_TOPIC_DISTANCE = 1.0 @@ -100,10 +107,9 @@ def __init__( max_num_models: int = 100, one_model_num_topics: Union[int, List[int]] = 100, num_fit_iterations: int = DEFAULT_NUM_FIT_ITERATIONS, - train_funcs: Union[ - Callable[[Dataset, int, int, int], TopicModel], - List[Callable[[Dataset, int, int, int], TopicModel]], - None] = None, + train_funcs: Optional[Union[ + TRAIN_FUNC_TYPE, + List[TRAIN_FUNC_TYPE]]] = None, topic_score_threshold_percentile: int = 95, distance_threshold: float = 0.5, bank_update: BankUpdateMethod = BankUpdateMethod.PROVIDE_NON_LINEARITY, @@ -205,7 +211,7 @@ def __init__( ] self._one_model_num_topics: List[int] = one_model_num_topics - self._train_func: List[Callable[[Dataset, int, int, int], TopicModel]] = train_funcs + self._train_func: List[TRAIN_FUNC_TYPE] = train_funcs if topic_score_threshold_percentile % 1 != 0: warnings.warn( @@ -323,7 +329,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) model_number=model_number, num_topics=self._one_model_num_topics[model_number], num_fit_iterations=self._num_fit_iterations, - scores=self._all_model_scores + scores=self._all_model_scores, ) scores = dict() diff --git a/topnum/tests/test_topic_bank.py b/topnum/tests/test_topic_bank.py index 44e6156..b3440af 100644 --- a/topnum/tests/test_topic_bank.py +++ b/topnum/tests/test_topic_bank.py @@ -125,14 +125,28 @@ def test_topic_bank_smoke(self, keep_in_memory): ] ) @pytest.mark.parametrize( - 'train_funcs', - [None, background_topics_train_func, default_train_func, regularization_train_func] + 'train_funcs, params', + [ + (None, {}), + (background_topics_train_func, {}), + (default_train_func, {}), + (regularization_train_func, dict( + decorrelating_tau=1, + smoothing_tau=1e-5, + sparsing_tau=-1 * 1e-5, + )) + ] ) - def test_topic_bank(self, keep_in_memory, bank_update, train_funcs): + def test_topic_bank(self, keep_in_memory, bank_update, train_funcs, params): + if params == {}: + train_func = train_funcs + else: + train_func = lambda *args, **kwargs: train_funcs(*args, **kwargs, **params) + self._test_topic_bank( self.dataset(keep_in_memory=keep_in_memory), bank_update, - train_func=train_funcs, + train_func=train_func, ) @pytest.mark.parametrize('keep_in_memory', [True, False]) @@ -325,6 +339,8 @@ def _test_topic_bank( bank_topics = topic_bank.topics bank_topic_scores = topic_bank.topic_scores + print(f'Bank topics: {bank_topics}.') + assert len(bank_topics) == len(bank_topic_scores) assert len(bank_topics) > 0 From e0b48e760df155ae8c982ea327296c1dfb030e2e Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 02:30:29 +0300 Subject: [PATCH 16/49] update reqs as in tested code, add setup file --- requirements.txt | 21 +++++++++++---------- setup.cfg | 2 ++ setup.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 10 deletions(-) create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/requirements.txt b/requirements.txt index 421f9a9..0a2986b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ anchor-topic==0.1.2 bigartm==0.9.2 -dill==0.3.1.1 -lapsolver==1.0.2 -matplotlib -numpy==1.22.0 -pandas==1.0.1 -pytest==5.3.5 -scikit-learn==1.5.0 -scipy==1.10.0 -topicnet>=0.8.0 -tqdm==4.66.3 +dill==0.3.8 +lapsolver==1.1.0 +matplotlib==3.7.5 +numpy==1.24.4 +pandas==2.0.3 +protobuf==3.20.3 +pytest==8.1.1 +scikit-learn==1.3.2 +scipy==1.10.1 +topicnet>=0.9.0 +tqdm==4.66.2 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b88034e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..14449ab --- /dev/null +++ b/setup.py @@ -0,0 +1,46 @@ +from distutils.core import setup + + +setup( + name='topnum', + packages=[ + 'topnum', + 'topnum.data', + 'topnum.scores', + 'topnum.search_methods', + 'topnum.search_methods.topic_bank', + 'topnum.search_methods.topic_bank.phi_initialization', + 'topnum.tests' + ], + version='0.3.0', + license='MIT', + description='A set of methods for finding an appropriate number of topics in a text collection', + author='Machine Intelligence Laboratory', + author_email='vasiliy.alekseyev@phystech.edu', + url='https://github.com/machine-intelligence-laboratory/OptimalNumberOfTopics', + keywords=[ + 'topic modeling', + 'document clustering', + 'number of clusters', + 'ARTM', + 'regularization', + ], + install_requires=[ + 'anchor-topic==0.1.2', + 'bigartm>=0.9.2', + 'dill==0.3.8', + 'lapsolver==1.1.0', + 'matplotlib==3.7.5', + 'numpy==1.24.4', + 'pandas==2.0.3', + 'protobuf==3.20.3', # TODO: BigARTM dependency + 'pytest==8.1.1', + 'scikit-learn==1.3.2', + 'scipy==1.10.1', + 'topicnet>=0.9.0', + 'tqdm==4.66.2', + ], + classifiers=[ + 'Programming Language :: Python :: 3.8', + ], +) From 793d788500844de95c903bad663c0e6ee0e0f44f Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 02:32:07 +0300 Subject: [PATCH 17/49] allow bigartm 10 in reqs --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0a2986b..63d58cf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ anchor-topic==0.1.2 -bigartm==0.9.2 +bigartm>=0.9.2 dill==0.3.8 lapsolver==1.1.0 matplotlib==3.7.5 From e6c756804fee854ab93633124bab35e666bff1b3 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 10:57:44 +0300 Subject: [PATCH 18/49] remove protobuf from reqs (it will go with topicnet) --- requirements.txt | 1 - setup.py | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 63d58cf..2a0ab7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ lapsolver==1.1.0 matplotlib==3.7.5 numpy==1.24.4 pandas==2.0.3 -protobuf==3.20.3 pytest==8.1.1 scikit-learn==1.3.2 scipy==1.10.1 diff --git a/setup.py b/setup.py index 14449ab..289a12f 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,6 @@ 'matplotlib==3.7.5', 'numpy==1.24.4', 'pandas==2.0.3', - 'protobuf==3.20.3', # TODO: BigARTM dependency 'pytest==8.1.1', 'scikit-learn==1.3.2', 'scipy==1.10.1', From 0418fc885fddcddfb6e066c7b25e45a4bd9c90b8 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 11:38:44 +0300 Subject: [PATCH 19/49] move regularizers from notebooks to files --- topnum/regularizers/__init__.py | 5 + .../decorrelate_with_other_phi.py | 93 +++++++++++++++++++ topnum/regularizers/fix_phi.py | 57 ++++++++++++ 3 files changed, 155 insertions(+) create mode 100644 topnum/regularizers/__init__.py create mode 100644 topnum/regularizers/decorrelate_with_other_phi.py create mode 100644 topnum/regularizers/fix_phi.py diff --git a/topnum/regularizers/__init__.py b/topnum/regularizers/__init__.py new file mode 100644 index 0000000..1a66e47 --- /dev/null +++ b/topnum/regularizers/__init__.py @@ -0,0 +1,5 @@ +from .fix_phi import FastFixPhiRegularizer +from .decorrelate_with_other_phi import ( + DecorrelateWithOtherPhiRegularizer, + DecorrelateWithOtherPhiRegularizer2, +) diff --git a/topnum/regularizers/decorrelate_with_other_phi.py b/topnum/regularizers/decorrelate_with_other_phi.py new file mode 100644 index 0000000..c083006 --- /dev/null +++ b/topnum/regularizers/decorrelate_with_other_phi.py @@ -0,0 +1,93 @@ +from typing import List, Optional + +import numpy as np +from numpy import ndarray +from pandas import DataFrame +from scipy.spatial.distance import cdist + +from artm import ARTM +from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer + + +class DecorrelateWithOtherPhiRegularizer(BaseRegularizer): + def __init__( + self, + name: str, + tau: float, + topic_names: List[str], + other_phi: DataFrame, + ): + super().__init__(name, tau=tau) + + self._topic_names = topic_names + self._other_phi = other_phi + self._other_topic_sum = self._other_phi.values.sum( + axis=1, keepdims=True + ) + + self._topic_indices = None + + def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray: + rwt = np.zeros_like(pwt) + rwt[:, self._topic_indices] += ( + pwt.values[:, self._topic_indices] * self._other_topic_sum + ) + + return -1 * self.tau * rwt + + def attach(self, model: ARTM) -> None: + super().attach(model) + + phi = model.get_phi() + self._topic_indices = [ + phi.columns.get_loc(topic_name) + for topic_name in self._topic_names + ] + + +class DecorrelateWithOtherPhiRegularizer2(BaseRegularizer): + def __init__( + self, + name: str, + tau: float, + topic_names: List[str], + other_phi: DataFrame, + num_iters: Optional[int] = None, + ): + super().__init__(name, tau=tau) + + self._topic_names = topic_names + self._other_phi = other_phi + self._num_iters = num_iters + self._cur_iter = 0 + + self._topic_indices = None + + def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray: + rwt = np.zeros_like(pwt) + + if self._num_iters is not None and self._cur_iter >= self._num_iters: + return rwt + + correlations = cdist( + self._other_phi.values.T, + pwt.values[:, self._topic_indices].T, + lambda u, v: (u * v).sum() + ) + weighted_other_topics = self._other_phi.values.dot(correlations) + + rwt[:, self._topic_indices] += ( + pwt.values[:, self._topic_indices] * weighted_other_topics + ) + self._cur_iter += 1 + + return -1 * self.tau * rwt + + def attach(self, model: ARTM) -> None: + super().attach(model) + + phi = model.get_phi() + self._topic_indices = [ + phi.columns.get_loc(topic_name) + for topic_name in self._topic_names + ] diff --git a/topnum/regularizers/fix_phi.py b/topnum/regularizers/fix_phi.py new file mode 100644 index 0000000..fbdf68b --- /dev/null +++ b/topnum/regularizers/fix_phi.py @@ -0,0 +1,57 @@ +from typing import List, Optional + +import numpy as np +from numpy import ndarray +from pandas import DataFrame + +from artm import ARTM +from topicnet.cooking_machine.models.topic_model import TopicModel +from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer + + +class FastFixPhiRegularizer(BaseRegularizer): + _VERY_BIG_TAU = 10 ** 9 + + def __init__( + self, + name: str, + topic_names: List[str], + parent_model: Optional[TopicModel] = None, + parent_phi: DataFrame = None, + tau: float = _VERY_BIG_TAU, + ): + super().__init__(name, tau=tau) + + if parent_phi is None and parent_model is None: + raise ValueError('Both parent Phi and parent model not specified.') + + self._topic_names = topic_names + self._topic_indices = None + self._parent_model = parent_model + self._parent_phi = parent_phi + + def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray: + rwt = np.zeros_like(pwt) + + if self._parent_phi is not None: + parent_phi = self._parent_phi + vals = parent_phi.values + else: + parent_phi = self._parent_model.get_phi() + vals = parent_phi.values[:, self._topic_indices] + + assert vals.shape[0] == rwt.shape[0] + assert vals.shape[1] == len(self._topic_indices), (vals.shape[1], len(self._topic_indices)) + + rwt[:, self._topic_indices] += vals + + return self.tau * rwt + + def attach(self, model: ARTM) -> None: + super().attach(model) + + phi = self._model.get_phi() + self._topic_indices = [ + phi.columns.get_loc(topic_name) + for topic_name in self._topic_names + ] From ac348c88a9d1c9a071fcea29531001245e77005f Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 12:03:59 +0300 Subject: [PATCH 20/49] refine regularizers usage in topic bank --- .../topic_bank/topic_bank_method.py | 62 +++++++------------ 1 file changed, 22 insertions(+), 40 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 62734d4..f9952b4 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -25,6 +25,7 @@ from topnum.data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection from topnum.model_constructor import init_model_from_family +from topnum.regularizers import FastFixPhiRegularizer from topnum.scores._base_coherence_score import ( SpecificityEstimationMethod, TextType, @@ -485,47 +486,13 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) _logger.info('No topics in bank — returning empty default scores for bank model') else: bank_phi = self._get_phi(self._topic_bank.topics, word2index) - - # TODO: you know - from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer - - class FastFixPhiRegularizer(BaseRegularizer): - _VERY_BIG_TAU = 10 ** 9 - - def __init__(self, name: str, phi, topic_names: List[str]): - super().__init__(name, tau=self._VERY_BIG_TAU) - - self._topic_names = topic_names - self._topic_indices = None - self._phi = phi - - def grad(self, pwt, nwt): - # print('Fixing') - - rwt = np.zeros_like(pwt) - parent_phi = self._phi - - rwt[:, self._topic_indices] += parent_phi.values[:, self._topic_indices] - - return self.tau * rwt - - def attach(self, model): - super().attach(model) - - phi = self._model.get_phi() - self._topic_indices = [ - phi.columns.get_loc(topic_name) - for topic_name in self._topic_names - ] - regularizer = FastFixPhiRegularizer( name='fix', - phi=bank_phi, + parent_phi=bank_phi, topic_names=bank_phi.columns, ) - bank_model = _get_topic_model( self._dataset, main_modality=self._main_modality, @@ -533,17 +500,18 @@ def attach(self, model): scores=self._all_model_scores, num_safe_fit_iterations=1 ) + # Safe fit to make topics so-so bank_model._fit( self._dataset.get_batch_vectorizer(), num_iterations=1, ) + bank_model._model.scores.add( artm.scores.PerplexityScore( name=f'ppl_fair', - ) + ) ) - # bank_model._fit(self._dataset.get_batch_vectorizer(), 1) bank_model._fit( self._dataset.get_batch_vectorizer(), num_iterations=5, @@ -552,15 +520,26 @@ def attach(self, model): } ) + assert np.allclose(bank_phi.to_numpy(), bank_model.get_phi().to_numpy()) + _logger.info('Computing default scores for bank model...') scores.update(self._get_default_scores(bank_model)) scores['ppl_fair'] = bank_model.scores['ppl_fair'][-1] - bank_model = init_model_from_family('sparse', self._dataset, self._main_modality, len(bank_phi.columns), 0) + # TODO: Second bank model is needed for experiments with regularizers + bank_model = init_model_from_family( + family='sparse', + dataset=self._dataset, main_modality=self._main_modality, + num_topics=len(bank_phi.columns), seed=0, + ) + + # Bcg sparse model + assert hasattr(bank_model, 'has_bcg') + assert bank_model.has_bcg + # Safe fit to make topics so-so - # bank_model.has_bcg = True bank_model._fit( self._dataset.get_batch_vectorizer(), num_iterations=1, @@ -568,7 +547,7 @@ def attach(self, model): bank_model._model.scores.add( artm.scores.PerplexityScore( name=f'ppl_cheatty', - ) + ) ) bank_model._fit( self._dataset.get_batch_vectorizer(), @@ -578,6 +557,9 @@ def attach(self, model): } ) + assert bank_model.get_phi().shape[1] == bank_phi.shape[1] + 1 + assert np.allclose(bank_phi.to_numpy(), bank_model.get_phi().to_numpy()[:, :-1]) + scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] print(f'Bank scores: {scores}') From 26c506eeda88b3fabc16792b14a04f81e98254ed Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 12:24:29 +0300 Subject: [PATCH 21/49] fix topic bank (experiment vs code conflict) --- topnum/search_methods/topic_bank/topic_bank_method.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index f9952b4..7568cf4 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -536,8 +536,8 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) ) # Bcg sparse model - assert hasattr(bank_model, 'has_bcg') - assert bank_model.has_bcg + # assert hasattr(bank_model, 'has_bcg') + # assert bank_model.has_bcg # Safe fit to make topics so-so bank_model._fit( From 9572fa2f9c23e81d08369c1840faf9d25b549b63 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 12:32:18 +0300 Subject: [PATCH 22/49] fix bank phi equality assert (atol) --- .../search_methods/topic_bank/topic_bank_method.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 7568cf4..ee6744d 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -520,7 +520,11 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) } ) - assert np.allclose(bank_phi.to_numpy(), bank_model.get_phi().to_numpy()) + assert np.allclose( + bank_phi.to_numpy(), + bank_model.get_phi().to_numpy(), + atol=1e-6, + ) _logger.info('Computing default scores for bank model...') @@ -558,7 +562,11 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) ) assert bank_model.get_phi().shape[1] == bank_phi.shape[1] + 1 - assert np.allclose(bank_phi.to_numpy(), bank_model.get_phi().to_numpy()[:, :-1]) + assert np.allclose( + bank_phi.to_numpy(), + bank_model.get_phi().to_numpy()[:, :-1], + atol=1e-6, + ) scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] From fb93ab638a3e76ece89bcd9f13d6f1e9621b47f5 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 16:12:12 +0300 Subject: [PATCH 23/49] accelerate intratext --- topnum/scores/_base_coherence_score.py | 28 +++++++++++++++++--- topnum/scores/intratext_coherence_score.py | 30 ++++++++++++---------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/topnum/scores/_base_coherence_score.py b/topnum/scores/_base_coherence_score.py index 273f71c..bb1eb2c 100644 --- a/topnum/scores/_base_coherence_score.py +++ b/topnum/scores/_base_coherence_score.py @@ -173,6 +173,18 @@ def compute( word_topic_relatednesses = self._get_word_topic_relatednesses(model) + self._word_topic_relatednesses_fast = word_topic_relatednesses.to_dict() + self._neutral_word_topic_relatedness = float(np.mean(word_topic_relatednesses.values)) + self._word2index = { + word: index # word_topic_relatednesses.index.get_loc(word) + for index, word in enumerate(word_topic_relatednesses.index) + } + self._topic2index = { + topic: index # word_topic_relatednesses.columns.get_loc(topic) + for index, topic in enumerate(word_topic_relatednesses.columns) + } + self._word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + # TODO: topic coherence may be evaluated on any peace of text # (paragraph, sentence, phrase), that is, not only on whole documents topic_document_coherences = np.zeros((len(topics), len(documents))) @@ -327,21 +339,31 @@ def _get_source_document(self, document_id: str) -> str: def _get_vw_document(self, document_id: str) -> str: return self._dataset.get_vw_document(document_id).loc[document_id, VW_TEXT_COL] - @staticmethod def _get_relatedness( + self, word: Tuple[str, str], topic: str, word_topic_relatednesses: pd.DataFrame) -> float: + # try: + # return word_topic_relatednesses.loc[word, topic] + # except KeyError as error: + # _logger.warning( + # f'Some word not found in Word-Topic relatedness matrix: "{error}"!' + # f' Returning mean value over all word relatednesses for topic "{topic}".' + # ) + # + # return float(np.mean(word_topic_relatednesses.values)) + try: - return word_topic_relatednesses.loc[word, topic] + return self._word_topic_relatednesses_fast[topic][word] except KeyError as error: _logger.warning( f'Some word not found in Word-Topic relatedness matrix: "{error}"!' f' Returning mean value over all word relatednesses for topic "{topic}".' ) - return float(np.mean(word_topic_relatednesses.values)) + return self._neutral_word_topic_relatedness # TODO: DRY def save(self, path: str) -> None: diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index b7650a1..f2fe4d2 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -270,18 +270,22 @@ def _compute_coherence( elif self._computation_method == ComputationMethod.SEGMENT_WEIGHT: return topic_segment_weight - @staticmethod def _get_word_topic_index( + self, word: WordType, word_topic_relatednesses: pd.DataFrame, word_topic_indices: np.array, ) -> int: - if word not in word_topic_relatednesses.index: + # if word not in word_topic_relatednesses.index: + # return -1 + # else: + # return word_topic_indices[ + # word_topic_relatednesses.index.get_loc(word) + # ] + if word not in self._word2index: return -1 else: - return word_topic_indices[ - word_topic_relatednesses.index.get_loc(word) - ] + return word_topic_indices[self._word2index[word]] def _compute_segment_characteristics( self, @@ -293,14 +297,14 @@ def _compute_segment_characteristics( topic_segment_lengths = [] topic_segment_weights = [] - topic_index = word_topic_relatednesses.columns.get_loc(topic) - word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + topic_index = self._topic2index[topic] # word_topic_relatednesses.columns.get_loc(topic) + # word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) def get_word_topic_index(word: WordType) -> int: return self._get_word_topic_index( word=word, word_topic_relatednesses=word_topic_relatednesses, - word_topic_indices=word_topic_indices, + word_topic_indices=self._word_topic_indices, ) index = 0 @@ -354,14 +358,14 @@ def _sum_relatednesses_over_window( words: List[WordType], word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: - topic_index = word_topic_relatednesses.columns.get_loc(topic) - word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + topic_index = self._topic2index[topic] # word_topic_relatednesses.columns.get_loc(topic) + # word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) def get_word_topic_index(word: WordType) -> int: return self._get_word_topic_index( word=word, word_topic_relatednesses=word_topic_relatednesses, - word_topic_indices=word_topic_indices, + word_topic_indices=self._word_topic_indices, ) def find_next_topic_word(starting_index: int) -> int: @@ -442,13 +446,13 @@ def _compute_focus_consistency( if len(words) == 0: return None - word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) + # word_topic_indices = np.argmax(word_topic_relatednesses.values, axis=1) def get_word_topic_index(word: WordType) -> int: return self._get_word_topic_index( word=word, word_topic_relatednesses=word_topic_relatednesses, - word_topic_indices=word_topic_indices, + word_topic_indices=self._word_topic_indices, ) word_topics = [ From caa4056a872b5852d0d1ea441f92f58be9094a9d Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 16:31:02 +0300 Subject: [PATCH 24/49] turn off should compute for intratext (compute only on last iter) --- topnum/scores/_base_coherence_score.py | 21 +++++++++++++-------- topnum/scores/intratext_coherence_score.py | 12 +++++++++++- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/topnum/scores/_base_coherence_score.py b/topnum/scores/_base_coherence_score.py index bb1eb2c..3dee8c6 100644 --- a/topnum/scores/_base_coherence_score.py +++ b/topnum/scores/_base_coherence_score.py @@ -11,6 +11,15 @@ Enum, IntEnum ) +from typing import ( + Callable, + Dict, + List, + Optional, + Tuple, + Union +) + from topicnet.cooking_machine.dataset import ( Dataset, VW_TEXT_COL, @@ -20,12 +29,6 @@ ) from topicnet.cooking_machine.models.base_model import BaseModel from topicnet.cooking_machine.models.base_score import BaseScore as TopicNetBaseScore -from typing import ( - Dict, - List, - Tuple, - Union -) from .base_custom_score import BaseCustomScore from ..data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection @@ -98,8 +101,10 @@ def __init__( word_topic_relatedness: WordTopicRelatednessType = WordTopicRelatednessType.PWT, specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, verbose: bool = False, - ): - super().__init__() + should_compute: Optional[ + Union[Callable[[int], bool], bool]] = None, + ): + super().__init__(should_compute=should_compute) if not isinstance(dataset, Dataset): raise TypeError( diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index f2fe4d2..662ee76 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -9,8 +9,10 @@ from topicnet.cooking_machine import Dataset from topicnet.cooking_machine.models.base_model import BaseModel from typing import ( + Callable, Dict, List, + Optional, Tuple, Union ) @@ -90,6 +92,8 @@ def __init__( max_num_out_of_topic_words=10, window=10, verbose: bool = False, + should_compute: Optional[ + Union[Callable[[int], bool], bool]] = False, # TODO: very slow on full collection ): """ Parameters @@ -137,6 +141,7 @@ def __init__( self._window = window self._verbose = verbose + self._should_compute = should_compute self._score = self._initialize() @@ -156,6 +161,7 @@ def _initialize(self) -> _BaseCoherenceScore: max_num_out_of_topic_words=self._max_num_out_of_topic_words, window=self._window, verbose=self._verbose, + should_compute=self._should_compute, ) def compute( @@ -181,7 +187,10 @@ def __init__( specificity_estimation: SpecificityEstimationMethod = SpecificityEstimationMethod.NONE, max_num_out_of_topic_words: int = 10, window: int = 10, - verbose: bool = False): + verbose: bool = False, + should_compute: Optional[ + Union[Callable[[int], bool], bool]] = None, + ): # TODO: word_topic_relatedness seems to be connected with TopTokensViewer stuff super().__init__( @@ -191,6 +200,7 @@ def __init__( word_topic_relatedness=word_topic_relatedness, specificity_estimation=specificity_estimation, verbose=verbose, + should_compute=should_compute, ) if not isinstance(computation_method, ComputationMethod): From b47f17c0f47693152964a60393d74fa0e115ae98 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 23:13:23 +0300 Subject: [PATCH 25/49] return should compute for intratext to sane default (should) --- topnum/scores/intratext_coherence_score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 662ee76..c04f878 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -93,7 +93,7 @@ def __init__( window=10, verbose: bool = False, should_compute: Optional[ - Union[Callable[[int], bool], bool]] = False, # TODO: very slow on full collection + Union[Callable[[int], bool], bool]] = True, # TODO: very slow on full collection ): """ Parameters From f8a316b6d8877a28bacd96a287f73fc004bca40e Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 23:21:09 +0300 Subject: [PATCH 26/49] make equal semi windows for sum over window coherence --- topnum/scores/intratext_coherence_score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index c04f878..f734b5c 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -401,7 +401,7 @@ def find_next_topic_word(starting_index: int) -> int: original_word_index = word_index window_lower_bound = word_index - int(np.floor(self._window // 2)) - window_upper_bound = word_index + int(np.ceil(self._window // 2)) + window_upper_bound = word_index + int(np.floor(self._window // 2)) + 1 sum_in_window = np.sum( [ From 31ef6ec7e508862f7a14f740f26a3882adae0b82 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 23:25:16 +0300 Subject: [PATCH 27/49] soften assert equal check in topic bank (increase stability) --- topnum/search_methods/topic_bank/topic_bank_method.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index ee6744d..01e80b1 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -523,7 +523,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) assert np.allclose( bank_phi.to_numpy(), bank_model.get_phi().to_numpy(), - atol=1e-6, + atol=1e-5, ) _logger.info('Computing default scores for bank model...') @@ -565,7 +565,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) assert np.allclose( bank_phi.to_numpy(), bank_model.get_phi().to_numpy()[:, :-1], - atol=1e-6, + atol=1e-5, ) scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] From 88322ad270b290ca0b646811195dd1beaa32e6a7 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 23:41:51 +0300 Subject: [PATCH 28/49] add debug message for tb equality assert --- topnum/scores/intratext_coherence_score.py | 4 ++-- topnum/search_methods/topic_bank/topic_bank_method.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index f734b5c..12bb2d6 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -381,8 +381,8 @@ def get_word_topic_index(word: WordType) -> int: def find_next_topic_word(starting_index: int) -> int: index = starting_index - while index < len(words) and\ - get_word_topic_index(words[index]) != topic_index: + while (index < len(words) + and get_word_topic_index(words[index]) != topic_index): index += 1 if index == len(words): diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 01e80b1..2939c74 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -520,6 +520,9 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) } ) + print(f'!!! Bank Phi: {bank_phi.to_numpy()}.') + print(f'!!! Bank model Phi: {bank_model.get_phi().to_numpy()}.') + assert np.allclose( bank_phi.to_numpy(), bank_model.get_phi().to_numpy(), @@ -561,6 +564,9 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) } ) + print(f'!!! Bank Phi: {bank_phi.to_numpy()}.') + print(f'!!! Bank model Phi: {bank_model.get_phi().to_numpy()}.') + assert bank_model.get_phi().shape[1] == bank_phi.shape[1] + 1 assert np.allclose( bank_phi.to_numpy(), From 7a597c828bb5c042c98041ccc60a4ebd7f30391f Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Sun, 21 Jul 2024 23:45:17 +0300 Subject: [PATCH 29/49] soften atol in tb check as low as possible to remain decent --- topnum/search_methods/topic_bank/topic_bank_method.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 2939c74..b0742b5 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -526,7 +526,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) assert np.allclose( bank_phi.to_numpy(), bank_model.get_phi().to_numpy(), - atol=1e-5, + atol=1e-3, ) _logger.info('Computing default scores for bank model...') @@ -571,7 +571,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) assert np.allclose( bank_phi.to_numpy(), bank_model.get_phi().to_numpy()[:, :-1], - atol=1e-5, + atol=1e-3, ) scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] From 20e0cb8949ecc144a7e44b1a94c7e3de44fb30f6 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 01:39:12 +0300 Subject: [PATCH 30/49] add test for sum over different windows --- topnum/tests/test_coherence_scores.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/topnum/tests/test_coherence_scores.py b/topnum/tests/test_coherence_scores.py index 243708e..fa1e61c 100644 --- a/topnum/tests/test_coherence_scores.py +++ b/topnum/tests/test_coherence_scores.py @@ -271,6 +271,22 @@ def test_compute_intratext( self._check_compute(score) + @pytest.mark.parametrize( + 'window', + [2, 4, 10] # TODO: window = 1 -> fail (sometimes?) + ) + def test_compute_topden(self, window) -> None: + score = _IntratextCoherenceScore( + self.dataset, + text_type=TextType.VW_TEXT, + computation_method=ComputationMethod.SUM_OVER_WINDOW, + word_topic_relatedness=WordTopicRelatednessType.PTW, + specificity_estimation=SpecificityEstimationMethod.NONE, + window=window, + ) + + self._check_compute(score) + @pytest.mark.parametrize('keep_in_memory', [True, False]) def test_compute_intratext_small_big_data(self, keep_in_memory) -> None: dataset = Dataset(self.dataset_file_path, keep_in_memory=keep_in_memory) From 2af66ea7c02ffe720e57f4148e7acb33c61f35a6 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 01:46:10 +0300 Subject: [PATCH 31/49] trying to speed up topden (try instead if) --- topnum/scores/intratext_coherence_score.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 12bb2d6..795104b 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -292,10 +292,16 @@ def _get_word_topic_index( # return word_topic_indices[ # word_topic_relatednesses.index.get_loc(word) # ] - if word not in self._word2index: - return -1 - else: + + # if word not in self._word2index: + # return -1 + # else: + # return word_topic_indices[self._word2index[word]] + + try: return word_topic_indices[self._word2index[word]] + except KeyError: + return -1 def _compute_segment_characteristics( self, From 3a9f10252ee33bedee0c2c73c8a3f7bd4afb34c1 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 01:58:53 +0300 Subject: [PATCH 32/49] trying to speed up topden try 2: remove np floor from window --- topnum/scores/intratext_coherence_score.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 795104b..7313b99 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -406,8 +406,8 @@ def find_next_topic_word(starting_index: int) -> int: while word_index < len(words) and word_index != -1: original_word_index = word_index - window_lower_bound = word_index - int(np.floor(self._window // 2)) - window_upper_bound = word_index + int(np.floor(self._window // 2)) + 1 + window_lower_bound = word_index - self._window // 2 + window_upper_bound = word_index + self._window // 2 + 1 sum_in_window = np.sum( [ From b2567c6cc217b3f302eec85ddc1dc9d0630c1f21 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 02:23:46 +0300 Subject: [PATCH 33/49] speeding up topdep try 3: remove density intersections --- topnum/scores/intratext_coherence_score.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 7313b99..5a3ea10 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -402,13 +402,18 @@ def find_next_topic_word(starting_index: int) -> int: return None sums = list() + border_left_index = 0 while word_index < len(words) and word_index != -1: original_word_index = word_index - window_lower_bound = word_index - self._window // 2 + window_lower_bound = max( + border_left_index, word_index - self._window // 2 + ) window_upper_bound = word_index + self._window // 2 + 1 + assert window_lower_bound <= word_index + sum_in_window = np.sum( [ self._get_relatedness(w, topic, word_topic_relatednesses) From 97579e58a04be6e91da44c9ffde4b96df6b5355c Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 02:30:05 +0300 Subject: [PATCH 34/49] speeding up topdep try 3: remove density intersections (fix) --- topnum/scores/intratext_coherence_score.py | 1 + 1 file changed, 1 insertion(+) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 5a3ea10..333547f 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -424,6 +424,7 @@ def find_next_topic_word(starting_index: int) -> int: sums.append(sum_in_window) word_index = find_next_topic_word(window_upper_bound) + border_left_index = window_upper_bound assert word_index > original_word_index or word_index == -1 From 2dbfcab805a0594cd919656c6ffa6862e17f6073 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 09:41:50 +0300 Subject: [PATCH 35/49] speeding up topdep: np.sum -> sum --- topnum/scores/intratext_coherence_score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 333547f..9a01d2b 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -414,7 +414,7 @@ def find_next_topic_word(starting_index: int) -> int: assert window_lower_bound <= word_index - sum_in_window = np.sum( + sum_in_window = sum( [ self._get_relatedness(w, topic, word_topic_relatednesses) for w in words[window_lower_bound:window_upper_bound] From 255c72dc17147a4267753aac4b020b1f71e40728 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 09:46:16 +0300 Subject: [PATCH 36/49] speeding up topdep: sum(list) -> v += dv --- topnum/scores/intratext_coherence_score.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 9a01d2b..4d6355a 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -414,12 +414,19 @@ def find_next_topic_word(starting_index: int) -> int: assert window_lower_bound <= word_index - sum_in_window = sum( - [ - self._get_relatedness(w, topic, word_topic_relatednesses) - for w in words[window_lower_bound:window_upper_bound] - ] - ) + # sum_in_window = sum( # np.sum + # [ + # self._get_relatedness(w, topic, word_topic_relatednesses) + # for w in words[window_lower_bound:window_upper_bound] + # ] + # ) + + sum_in_window = 0.0 + + for j in range(window_lower_bound, window_upper_bound): + sum_in_window = sum_in_window + self._get_relatedness( + words[j], topic, word_topic_relatednesses + ) sums.append(sum_in_window) From b00a4f8704e1ac858e7f10530bf7084230dfbf3b Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 09:50:41 +0300 Subject: [PATCH 37/49] fix right border in +dv --- topnum/scores/intratext_coherence_score.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 4d6355a..a4cfc0f 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -403,6 +403,7 @@ def find_next_topic_word(starting_index: int) -> int: sums = list() border_left_index = 0 + border_right_index = len(words) - 1 while word_index < len(words) and word_index != -1: original_word_index = word_index @@ -410,7 +411,9 @@ def find_next_topic_word(starting_index: int) -> int: window_lower_bound = max( border_left_index, word_index - self._window // 2 ) - window_upper_bound = word_index + self._window // 2 + 1 + window_upper_bound = min( + border_right_index, word_index + self._window // 2 + ) + 1 assert window_lower_bound <= word_index From d605f50633e69d727146ab8c847bd26f562c34c9 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 10:17:09 +0300 Subject: [PATCH 38/49] use lru cache (unlimited) for get_relatedness --- topnum/scores/_base_coherence_score.py | 2 ++ topnum/scores/intratext_coherence_score.py | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/topnum/scores/_base_coherence_score.py b/topnum/scores/_base_coherence_score.py index 3dee8c6..735f8a6 100644 --- a/topnum/scores/_base_coherence_score.py +++ b/topnum/scores/_base_coherence_score.py @@ -11,6 +11,7 @@ Enum, IntEnum ) +from functools import lru_cache from typing import ( Callable, Dict, @@ -344,6 +345,7 @@ def _get_source_document(self, document_id: str) -> str: def _get_vw_document(self, document_id: str) -> str: return self._dataset.get_vw_document(document_id).loc[document_id, VW_TEXT_COL] + @lru_cache(maxsize=None) def _get_relatedness( self, word: Tuple[str, str], diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index a4cfc0f..6c53665 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -6,8 +6,6 @@ auto, IntEnum ) -from topicnet.cooking_machine import Dataset -from topicnet.cooking_machine.models.base_model import BaseModel from typing import ( Callable, Dict, @@ -17,6 +15,9 @@ Union ) +from topicnet.cooking_machine import Dataset +from topicnet.cooking_machine.models.base_model import BaseModel + from ..data.vowpal_wabbit_text_collection import VowpalWabbitTextCollection from ._base_coherence_score import ( _BaseCoherenceScore, @@ -335,7 +336,7 @@ def get_word_topic_index(word: WordType) -> int: segment_length = 1 segment_weight = self._get_relatedness( - words[index], topic, word_topic_relatednesses + words[index], topic, None ) num_out_of_topic_words = 0 @@ -348,7 +349,7 @@ def get_word_topic_index(word: WordType) -> int: else: segment_length += 1 segment_weight += self._get_relatedness( - words[index], topic, word_topic_relatednesses + words[index], topic, None ) num_out_of_topic_words = 0 @@ -428,7 +429,7 @@ def find_next_topic_word(starting_index: int) -> int: for j in range(window_lower_bound, window_upper_bound): sum_in_window = sum_in_window + self._get_relatedness( - words[j], topic, word_topic_relatednesses + words[j], topic, None ) sums.append(sum_in_window) @@ -447,7 +448,7 @@ def _compute_variance_in_window( word_topic_relatednesses: pd.DataFrame) -> Union[float, None]: topic_relatednesses = [ - self._get_relatedness(word, topic, word_topic_relatednesses) + self._get_relatedness(word, topic, None) for word in words ] @@ -500,16 +501,16 @@ def get_word_topic_index(word: WordType) -> int: cur_topic, next_topic = word_topics[index], word_topics[index + 1] r_cw_ct = self._get_relatedness( - cur_word, cur_topic, word_topic_relatednesses + cur_word, cur_topic, None ) r_cw_nt = self._get_relatedness( - cur_word, next_topic, word_topic_relatednesses + cur_word, next_topic, None ) r_nw_ct = self._get_relatedness( - next_word, cur_topic, word_topic_relatednesses + next_word, cur_topic, None ) r_nw_nt = self._get_relatedness( - next_word, next_topic, word_topic_relatednesses + next_word, next_topic, None ) diff1 = abs(r_cw_ct - r_nw_ct) From a05a8bc629a43e53efa10c0ff6a306fed0c80389 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 10:34:05 +0300 Subject: [PATCH 39/49] use lru cache for get word topic index --- topnum/scores/intratext_coherence_score.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index 6c53665..add7cae 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -6,6 +6,7 @@ auto, IntEnum ) +from functools import lru_cache from typing import ( Callable, Dict, @@ -281,6 +282,7 @@ def _compute_coherence( elif self._computation_method == ComputationMethod.SEGMENT_WEIGHT: return topic_segment_weight + @lru_cache(maxsize=None) def _get_word_topic_index( self, word: WordType, @@ -300,7 +302,7 @@ def _get_word_topic_index( # return word_topic_indices[self._word2index[word]] try: - return word_topic_indices[self._word2index[word]] + return self._word_topic_indices[self._word2index[word]] except KeyError: return -1 @@ -320,8 +322,8 @@ def _compute_segment_characteristics( def get_word_topic_index(word: WordType) -> int: return self._get_word_topic_index( word=word, - word_topic_relatednesses=word_topic_relatednesses, - word_topic_indices=self._word_topic_indices, + word_topic_relatednesses=None, + word_topic_indices=None, ) index = 0 @@ -336,7 +338,8 @@ def get_word_topic_index(word: WordType) -> int: segment_length = 1 segment_weight = self._get_relatedness( - words[index], topic, None + words[index], topic, None # word_topic_relatednesses is not used here + # (besides, lru_cache is applied and who knows how it would react to pd.DataFrame as param) ) num_out_of_topic_words = 0 @@ -381,8 +384,8 @@ def _sum_relatednesses_over_window( def get_word_topic_index(word: WordType) -> int: return self._get_word_topic_index( word=word, - word_topic_relatednesses=word_topic_relatednesses, - word_topic_indices=self._word_topic_indices, + word_topic_relatednesses=None, + word_topic_indices=None, ) def find_next_topic_word(starting_index: int) -> int: @@ -484,8 +487,8 @@ def _compute_focus_consistency( def get_word_topic_index(word: WordType) -> int: return self._get_word_topic_index( word=word, - word_topic_relatednesses=word_topic_relatednesses, - word_topic_indices=self._word_topic_indices, + word_topic_relatednesses=None, + word_topic_indices=None, ) word_topics = [ From 7a9b92891860a90091ea5bfa4fc225943cf0b514 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Mon, 22 Jul 2024 10:41:10 +0300 Subject: [PATCH 40/49] remove lru cache for get topic index (no speed up) --- topnum/scores/intratext_coherence_score.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/topnum/scores/intratext_coherence_score.py b/topnum/scores/intratext_coherence_score.py index add7cae..c2abb08 100644 --- a/topnum/scores/intratext_coherence_score.py +++ b/topnum/scores/intratext_coherence_score.py @@ -6,7 +6,6 @@ auto, IntEnum ) -from functools import lru_cache from typing import ( Callable, Dict, @@ -282,7 +281,7 @@ def _compute_coherence( elif self._computation_method == ComputationMethod.SEGMENT_WEIGHT: return topic_segment_weight - @lru_cache(maxsize=None) + # @lru_cache(maxsize=None) # did't provide speed up def _get_word_topic_index( self, word: WordType, From dda0e5b745a76ef4bbb205ffd338f5a644a64574 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 7 Aug 2024 16:43:34 +0300 Subject: [PATCH 41/49] remove pre-save in topicbank (may lead to inconsistent results) --- .../topic_bank/topic_bank_method.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index b0742b5..eab73aa 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -352,7 +352,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._result[_KEY_MODEL_SCORES].append(scores) self._result[_KEY_NUM_MODEL_TOPICS].append(topic_model.get_phi().shape[1]) - self.save() + # self.save() if self._topic_score_threshold_percentile % 1 != 0: print(f'Using absoulte threshold: {self._topic_score_threshold_percentile}.') @@ -382,8 +382,18 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) topics_for_append = list(range(len(phi.columns))) topics_for_update = dict() elif self._bank_update == BankUpdateMethod.PROVIDE_NON_LINEARITY: + self._last_bank_phi = self._get_phi(self._topic_bank.topics, word2index) + self._last_model_phi = phi + + if hasattr(topic_model, 'has_bcg'): + print(f'Eliminating bcg topic before Hier. Cur |T| is {phi.shape[1]}, topics are: {phi.columns}.') + + phi = phi.iloc[:, :-1] + + print(f'Now |T| is {phi.shape[1]}, topics are: {phi.columns}.') + topics_for_append, topics_for_update = self._extract_hierarchical_relationship( - bank_phi=self._get_phi(self._topic_bank.topics, word2index), + bank_phi=self._last_bank_phi, new_model_phi=phi, psi_threshold=self._child_parent_relationship_threshold ) @@ -467,7 +477,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._topic_bank.topic_scores # TODO: append ) - self.save() + # self.save() if self._save_model_topics: self._topic_bank.save_model_topics( From 98437d3edf30e5a9c3fb78e8d9060331fc9dda1c Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 19 Mar 2025 10:04:19 +0300 Subject: [PATCH 42/49] comment something in topic bank for somebody --- topnum/search_methods/topic_bank/topic_bank_method.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index eab73aa..52f749a 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -786,6 +786,11 @@ def _jaccard_distance( q: Dict[str, float], kernel_only: bool = True) -> float: + # TODO: Can topics appear close if + # top words are the same, but in different order? + # (with different probabilities) + # In other words, "same top words (no matter the order)" == "similar topics"? + # (seems like it should be so) numerator = 0 denominator = 0 From 7050805f69898d23796defc7de25124a7678ab2d Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 19 Mar 2025 12:18:31 +0300 Subject: [PATCH 43/49] add tests for regs --- .../decorrelate_with_other_phi.py | 29 ++ topnum/regularizers/fix_phi.py | 2 +- topnum/tests/test_regularizers.py | 425 ++++++++++++++++++ 3 files changed, 455 insertions(+), 1 deletion(-) create mode 100644 topnum/tests/test_regularizers.py diff --git a/topnum/regularizers/decorrelate_with_other_phi.py b/topnum/regularizers/decorrelate_with_other_phi.py index c083006..ead5f8d 100644 --- a/topnum/regularizers/decorrelate_with_other_phi.py +++ b/topnum/regularizers/decorrelate_with_other_phi.py @@ -9,6 +9,8 @@ from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer +# TODO: find (and make possible to use) relative taus for these regularizers + class DecorrelateWithOtherPhiRegularizer(BaseRegularizer): def __init__( self, @@ -17,6 +19,19 @@ def __init__( topic_names: List[str], other_phi: DataFrame, ): + """ + + Parameters + ---------- + name + tau + To select a value, try a few test runs to find the tau + that affects the perplexity (worsens, but not very much). + Recommendation based on experimentation: try 1e5 or 1e6. + topic_names + other_phi + + """ super().__init__(name, tau=tau) self._topic_names = topic_names @@ -54,6 +69,20 @@ def __init__( other_phi: DataFrame, num_iters: Optional[int] = None, ): + """ + + Parameters + ---------- + name + tau + To select a value, try a few test runs to find the tau + that affects the perplexity (worsens, but not very much). + Recommendation based on experimentation: try 1e8, 1e9, or 1e10. + topic_names + other_phi + num_iters + + """ super().__init__(name, tau=tau) self._topic_names = topic_names diff --git a/topnum/regularizers/fix_phi.py b/topnum/regularizers/fix_phi.py index fbdf68b..4d83724 100644 --- a/topnum/regularizers/fix_phi.py +++ b/topnum/regularizers/fix_phi.py @@ -16,7 +16,7 @@ def __init__( self, name: str, topic_names: List[str], - parent_model: Optional[TopicModel] = None, + parent_model: Optional[TopicModel] = None, # TODO: TopicModel or ARTM? parent_phi: DataFrame = None, tau: float = _VERY_BIG_TAU, ): diff --git a/topnum/tests/test_regularizers.py b/topnum/tests/test_regularizers.py new file mode 100644 index 0000000..ef123d0 --- /dev/null +++ b/topnum/tests/test_regularizers.py @@ -0,0 +1,425 @@ +import logging +import numpy as np +import os + +import pandas as pd +import pytest +import shutil +import tempfile +import warnings + +from copy import deepcopy +from itertools import combinations +from numbers import Number +from time import sleep +from typing import ( + Dict, + List, +) + +from pandas import DataFrame + +import artm + +from topicnet.cooking_machine import Experiment +from topicnet.cooking_machine.cubes import ( + CubeCreator, + RegularizersModifierCube, +) +from topicnet.cooking_machine.dataset import ( + Dataset, + W_DIFF_BATCHES_1, +) +from topicnet.cooking_machine.models import ( + BaseModel, + TopicModel, +) +from topicnet.cooking_machine.model_constructor import init_simple_default_model + + +from topnum.regularizers import ( + FastFixPhiRegularizer, + DecorrelateWithOtherPhiRegularizer, + DecorrelateWithOtherPhiRegularizer2, +) +from topnum.scores import PerplexityScore +from topnum.tests.data_generator import TestDataGenerator + + +_Logger = logging.getLogger() + + +@pytest.mark.filterwarnings(f'ignore:{W_DIFF_BATCHES_1}') +class TestOptimizeScores: + PPL_SCORE_NAME = 'ppl' + ONE_FIT_NUM_ITERS = 10 + + NUM_TOPICS = 10 + + # Ideally, these topics should be found by looking at the scores + # (Here we are assigning the labels just out of thin air) + GOOD_TOPIC_INDICES = [0, 1, 2] + BAD_TOPIC_INDICES = [-1, -2, -3] + + data_generator = None + + main_modality = None + other_modality = None + text_collection = None + + optimizer = None + + working_folder_path = None + + @classmethod + def setup_class(cls): + cls.data_generator = TestDataGenerator() + + cls.data_generator.generate() + + cls.data_generator.text_collection._dataset = None + + cls.text_collection = cls.data_generator.text_collection + cls.main_modality = cls.data_generator.main_modality + cls.other_modality = cls.data_generator.other_modality + + cls.working_folder_path = tempfile.mktemp(prefix='test_optimize_scores__') + + def setup_method(self): + assert self.text_collection._dataset is None + + os.mkdir(self.working_folder_path) + + def teardown_method(self): + self.text_collection._set_dataset_kwargs() + self.text_collection._dataset = None + + if self.optimizer is not None: + self.optimizer.clear() + + if os.path.isdir(self.working_folder_path): + shutil.rmtree(self.working_folder_path) + + @classmethod + def teardown_class(cls): + if cls.data_generator is not None: + cls.data_generator.clear() + + if os.path.isdir(cls.working_folder_path): + shutil.rmtree(cls.working_folder_path) + + def _dataset(self, keep_in_memory: bool = True) -> Dataset: + self.text_collection._set_dataset_kwargs( + keep_in_memory=keep_in_memory + ) + dataset = self.text_collection._to_dataset() + + return dataset + + def _topic_model_and_topics( + self, + dataset: Dataset, + num_specific_topics=5, + num_background_topics=1, + num_processors=2, + ): + artm_model = init_simple_default_model( + dataset=dataset, + modalities_to_use=[self.main_modality, self.other_modality], + main_modality=self.main_modality, + specific_topics=num_specific_topics, + background_topics=num_background_topics, + ) + artm_model.num_processors = num_processors + + topic_model = TopicModel(artm_model) + score = PerplexityScore(self.PPL_SCORE_NAME) + score._attach(topic_model) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + ) + + phi = topic_model.get_phi() + good_topic_names = [phi.columns[t] for t in self.GOOD_TOPIC_INDICES] + bad_topic_names = [phi.columns[t] for t in self.BAD_TOPIC_INDICES] + not_good_topic_names = [ + phi.columns[t] + for t in range(len(phi.columns)) + if t not in self.GOOD_TOPIC_INDICES + ] + + return ( + topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names, + ) + + def _get_fix_regularizer( + self, + name: str, + target_topic_names: List[str], + parent_topic_model: TopicModel = None, + parent_phi: DataFrame = None, + ): + fix_regularizer = FastFixPhiRegularizer( + name=name, + topic_names=target_topic_names, + parent_model=parent_topic_model, + parent_phi=parent_phi, + ) + + return fix_regularizer + + def _get_decorr_regularizer( + self, + name: str, + tau: float, + target_topic_names: List[str], + other_topic_model: TopicModel, + other_topic_names: List[str], + ): + other_phi = other_topic_model._model.get_phi()[other_topic_names] + other_phi = deepcopy(other_phi) + decorr_regularizer = DecorrelateWithOtherPhiRegularizer( + name=name, + tau=tau, + topic_names=target_topic_names, + other_phi=other_phi, + ) + + return decorr_regularizer, other_phi + + def _get_decorr_regularizer2( + self, + name: str, + tau: float, + target_topic_names: List[str], + other_topic_model: TopicModel, + other_topic_names: List[str], + ): + other_phi = other_topic_model._model.get_phi()[other_topic_names] + other_phi = deepcopy(other_phi) + decorr_regularizer = DecorrelateWithOtherPhiRegularizer2( + name=name, + tau=tau, + topic_names=target_topic_names, + other_phi=other_phi, + ) + + return decorr_regularizer, other_phi + + @pytest.mark.parametrize('keep_in_memory', [True, False]) + def test_fix_good(self, keep_in_memory): + dataset = self._dataset(keep_in_memory=keep_in_memory) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + + good_phi = deepcopy( + topic_model._model.get_phi()[good_topic_names] + ) + + fix_regularizer = self._get_fix_regularizer( + name='fix', + target_topic_names=good_topic_names, + parent_phi=good_phi, + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + fix_regularizer.name: fix_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + assert np.allclose( + new_phi[good_topic_names], good_phi + ) + + @pytest.mark.parametrize('keep_in_memory', [True, False]) + def test_decorr_bad(self, keep_in_memory): + dataset = self._dataset(keep_in_memory=keep_in_memory) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + + good_phi = deepcopy( + topic_model._model.get_phi()[good_topic_names] + ) + + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( + name='ext_decorr_bad', + tau=1e5, + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + other_topic_names=bad_topic_names, + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + decorr_bad_regularizer.name: decorr_bad_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + # TODO: good topics also change (as they are not fixed) + # so, the meaningfulness of this test is questionable + # (other than the fact that it simply tests runnability) + # assert np.allclose( + # new_phi[good_topic_names], good_phi, rtol=0.05 + # ) + assert not np.allclose( + new_phi[not_good_topic_names], bad_phi, rtol=0.5 + ) + + @pytest.mark.parametrize('keep_in_memory', [True, False]) + def test_decorr_bad2(self, keep_in_memory): + dataset = self._dataset(keep_in_memory=keep_in_memory) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + + good_phi = deepcopy( + topic_model._model.get_phi()[good_topic_names] + ) + + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( + name='ext_decorr_bad2', + tau=1e8, + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + other_topic_names=bad_topic_names, + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + decorr_bad_regularizer.name: decorr_bad_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + # TODO: good topics also change (as they are not fixed) + # so, the meaningfulness of this test is questionable + # (other than the fact that it simply tests runnability) + # assert np.allclose( + # new_phi[good_topic_names], good_phi, rtol=0.05 + # ) + assert not np.allclose( + new_phi[not_good_topic_names], bad_phi, rtol=0.5 + ) + + def test_fix_good_and_decorr_good_bad(self): + dataset = self._dataset(keep_in_memory=True) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + + fix_regularizer = self._get_fix_regularizer( + name='fix', + target_topic_names=good_topic_names, + parent_topic_model=topic_model._model, # TODO: test breaks if pass just `topic_model` + # aah, I guess, there are some score saving issues + ) + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( + name='ext_decorr_bad', + tau=1e5, + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + other_topic_names=bad_topic_names, + ) + decorr_good_regularizer, good_phi = self._get_decorr_regularizer( + name='ext_decorr_good', + tau=1e5, + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + other_topic_names=good_topic_names, + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + fix_regularizer.name: fix_regularizer, + decorr_bad_regularizer.name: decorr_bad_regularizer, + decorr_good_regularizer.name: decorr_good_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + assert np.allclose( + new_phi[good_topic_names], good_phi + ) + + assert not np.allclose( + new_phi[not_good_topic_names], good_phi, rtol=0.5 + ) + assert not np.allclose( + new_phi[not_good_topic_names], bad_phi, rtol=0.5 + ) + + def test_fix_good_and_decorr_good_bad2(self): + dataset = self._dataset(keep_in_memory=True) + (topic_model, + good_topic_names, + bad_topic_names, + not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + + fix_regularizer = self._get_fix_regularizer( + name='fix', + target_topic_names=good_topic_names, + parent_topic_model=topic_model._model, + ) + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( + name='ext_decorr_bad2', + tau=1e8, + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + other_topic_names=bad_topic_names, + ) + decorr_good_regularizer, good_phi = self._get_decorr_regularizer2( + name='ext_decorr_good2', + tau=1e8, + target_topic_names=not_good_topic_names, + other_topic_model=topic_model, + other_topic_names=good_topic_names, + ) + + topic_model._fit( + dataset.get_batch_vectorizer(), + num_iterations=self.ONE_FIT_NUM_ITERS, + custom_regularizers={ + fix_regularizer.name: fix_regularizer, + decorr_bad_regularizer.name: decorr_bad_regularizer, + decorr_good_regularizer.name: decorr_good_regularizer, + } + ) + + new_phi = topic_model._model.get_phi() + + assert np.allclose( + new_phi[good_topic_names], good_phi + ) + + assert not np.allclose( + new_phi[not_good_topic_names], good_phi, rtol=0.5 + ) + assert not np.allclose( + new_phi[not_good_topic_names], bad_phi, rtol=0.5 + ) From 87732f377abc7240b068159e6d92d50d1e89feee Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 19 Mar 2025 16:48:18 +0300 Subject: [PATCH 44/49] refactor regs tests --- topnum/tests/test_regularizers.py | 236 +++++++++++------------------- 1 file changed, 82 insertions(+), 154 deletions(-) diff --git a/topnum/tests/test_regularizers.py b/topnum/tests/test_regularizers.py index ef123d0..faf6521 100644 --- a/topnum/tests/test_regularizers.py +++ b/topnum/tests/test_regularizers.py @@ -1,39 +1,21 @@ import logging -import numpy as np import os - -import pandas as pd -import pytest import shutil import tempfile -import warnings from copy import deepcopy -from itertools import combinations -from numbers import Number -from time import sleep -from typing import ( - Dict, - List, -) +from typing import List -from pandas import DataFrame +import numpy as np +import pytest -import artm +from pandas import DataFrame -from topicnet.cooking_machine import Experiment -from topicnet.cooking_machine.cubes import ( - CubeCreator, - RegularizersModifierCube, -) from topicnet.cooking_machine.dataset import ( Dataset, W_DIFF_BATCHES_1, ) -from topicnet.cooking_machine.models import ( - BaseModel, - TopicModel, -) +from topicnet.cooking_machine.models import TopicModel from topicnet.cooking_machine.model_constructor import init_simple_default_model @@ -67,8 +49,6 @@ class TestOptimizeScores: other_modality = None text_collection = None - optimizer = None - working_folder_path = None @classmethod @@ -94,9 +74,6 @@ def teardown_method(self): self.text_collection._set_dataset_kwargs() self.text_collection._dataset = None - if self.optimizer is not None: - self.optimizer.clear() - if os.path.isdir(self.working_folder_path): shutil.rmtree(self.working_folder_path) @@ -108,7 +85,7 @@ def teardown_class(cls): if os.path.isdir(cls.working_folder_path): shutil.rmtree(cls.working_folder_path) - def _dataset(self, keep_in_memory: bool = True) -> Dataset: + def _get_dataset(self, keep_in_memory: bool = True) -> Dataset: self.text_collection._set_dataset_kwargs( keep_in_memory=keep_in_memory ) @@ -116,7 +93,7 @@ def _dataset(self, keep_in_memory: bool = True) -> Dataset: return dataset - def _topic_model_and_topics( + def _get_topic_model_and_topics( self, dataset: Dataset, num_specific_topics=5, @@ -173,17 +150,18 @@ def _get_fix_regularizer( return fix_regularizer - def _get_decorr_regularizer( + def _get_decorr_regularizer_base( self, name: str, tau: float, target_topic_names: List[str], other_topic_model: TopicModel, other_topic_names: List[str], + decorrelate_regularizer_class, ): other_phi = other_topic_model._model.get_phi()[other_topic_names] other_phi = deepcopy(other_phi) - decorr_regularizer = DecorrelateWithOtherPhiRegularizer( + decorr_regularizer = decorrelate_regularizer_class( name=name, tau=tau, topic_names=target_topic_names, @@ -192,7 +170,7 @@ def _get_decorr_regularizer( return decorr_regularizer, other_phi - def _get_decorr_regularizer2( + def _get_decorr_regularizer( self, name: str, tau: float, @@ -200,24 +178,37 @@ def _get_decorr_regularizer2( other_topic_model: TopicModel, other_topic_names: List[str], ): - other_phi = other_topic_model._model.get_phi()[other_topic_names] - other_phi = deepcopy(other_phi) - decorr_regularizer = DecorrelateWithOtherPhiRegularizer2( - name=name, - tau=tau, - topic_names=target_topic_names, - other_phi=other_phi, + return self._get_decorr_regularizer_base( + name=name, tau=tau, + target_topic_names=target_topic_names, + other_topic_model=other_topic_model, + other_topic_names=other_topic_names, + decorrelate_regularizer_class=DecorrelateWithOtherPhiRegularizer, ) - return decorr_regularizer, other_phi + def _get_decorr_regularizer2( + self, + name: str, + tau: float, + target_topic_names: List[str], + other_topic_model: TopicModel, + other_topic_names: List[str], + ): + return self._get_decorr_regularizer_base( + name=name, tau=tau, + target_topic_names=target_topic_names, + other_topic_model=other_topic_model, + other_topic_names=other_topic_names, + decorrelate_regularizer_class=DecorrelateWithOtherPhiRegularizer2, + ) @pytest.mark.parametrize('keep_in_memory', [True, False]) def test_fix_good(self, keep_in_memory): - dataset = self._dataset(keep_in_memory=keep_in_memory) + dataset = self._get_dataset(keep_in_memory=keep_in_memory) (topic_model, good_topic_names, bad_topic_names, - not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + not_good_topic_names) = self._get_topic_model_and_topics(dataset=dataset) good_phi = deepcopy( topic_model._model.get_phi()[good_topic_names] @@ -243,65 +234,36 @@ def test_fix_good(self, keep_in_memory): new_phi[good_topic_names], good_phi ) + @pytest.mark.parametrize('decorr_v2', [False, True]) @pytest.mark.parametrize('keep_in_memory', [True, False]) - def test_decorr_bad(self, keep_in_memory): - dataset = self._dataset(keep_in_memory=keep_in_memory) + def test_decorr_bad(self, decorr_v2, keep_in_memory): + dataset = self._get_dataset(keep_in_memory=keep_in_memory) (topic_model, good_topic_names, bad_topic_names, - not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + not_good_topic_names) = self._get_topic_model_and_topics(dataset=dataset) good_phi = deepcopy( topic_model._model.get_phi()[good_topic_names] ) - - decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( - name='ext_decorr_bad', - tau=1e5, + base_topic_decorr_kwargs = dict( target_topic_names=not_good_topic_names, other_topic_model=topic_model, other_topic_names=bad_topic_names, ) - topic_model._fit( - dataset.get_batch_vectorizer(), - num_iterations=self.ONE_FIT_NUM_ITERS, - custom_regularizers={ - decorr_bad_regularizer.name: decorr_bad_regularizer, - } - ) - - new_phi = topic_model._model.get_phi() - - # TODO: good topics also change (as they are not fixed) - # so, the meaningfulness of this test is questionable - # (other than the fact that it simply tests runnability) - # assert np.allclose( - # new_phi[good_topic_names], good_phi, rtol=0.05 - # ) - assert not np.allclose( - new_phi[not_good_topic_names], bad_phi, rtol=0.5 - ) - - @pytest.mark.parametrize('keep_in_memory', [True, False]) - def test_decorr_bad2(self, keep_in_memory): - dataset = self._dataset(keep_in_memory=keep_in_memory) - (topic_model, - good_topic_names, - bad_topic_names, - not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) - - good_phi = deepcopy( - topic_model._model.get_phi()[good_topic_names] - ) - - decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( - name='ext_decorr_bad2', - tau=1e8, - target_topic_names=not_good_topic_names, - other_topic_model=topic_model, - other_topic_names=bad_topic_names, - ) + if not decorr_v2: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( + name='ext_decorr_bad', + tau=1e5, + **base_topic_decorr_kwargs, + ) + else: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( + name='ext_decorr_bad2', + tau=1e8, + **base_topic_decorr_kwargs, + ) topic_model._fit( dataset.get_batch_vectorizer(), @@ -323,84 +285,50 @@ def test_decorr_bad2(self, keep_in_memory): new_phi[not_good_topic_names], bad_phi, rtol=0.5 ) - def test_fix_good_and_decorr_good_bad(self): - dataset = self._dataset(keep_in_memory=True) + @pytest.mark.parametrize('decorr_v2', [False, True]) + def test_fix_good_and_decorr_good_bad(self, decorr_v2): + dataset = self._get_dataset(keep_in_memory=True) (topic_model, good_topic_names, bad_topic_names, - not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) - - fix_regularizer = self._get_fix_regularizer( - name='fix', - target_topic_names=good_topic_names, - parent_topic_model=topic_model._model, # TODO: test breaks if pass just `topic_model` - # aah, I guess, there are some score saving issues - ) - decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( - name='ext_decorr_bad', - tau=1e5, - target_topic_names=not_good_topic_names, - other_topic_model=topic_model, - other_topic_names=bad_topic_names, - ) - decorr_good_regularizer, good_phi = self._get_decorr_regularizer( - name='ext_decorr_good', - tau=1e5, - target_topic_names=not_good_topic_names, - other_topic_model=topic_model, - other_topic_names=good_topic_names, - ) - - topic_model._fit( - dataset.get_batch_vectorizer(), - num_iterations=self.ONE_FIT_NUM_ITERS, - custom_regularizers={ - fix_regularizer.name: fix_regularizer, - decorr_bad_regularizer.name: decorr_bad_regularizer, - decorr_good_regularizer.name: decorr_good_regularizer, - } - ) - - new_phi = topic_model._model.get_phi() - - assert np.allclose( - new_phi[good_topic_names], good_phi - ) - - assert not np.allclose( - new_phi[not_good_topic_names], good_phi, rtol=0.5 - ) - assert not np.allclose( - new_phi[not_good_topic_names], bad_phi, rtol=0.5 - ) - - def test_fix_good_and_decorr_good_bad2(self): - dataset = self._dataset(keep_in_memory=True) - (topic_model, - good_topic_names, - bad_topic_names, - not_good_topic_names) = self._topic_model_and_topics(dataset=dataset) + not_good_topic_names) = self._get_topic_model_and_topics(dataset=dataset) fix_regularizer = self._get_fix_regularizer( name='fix', target_topic_names=good_topic_names, parent_topic_model=topic_model._model, ) - decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( - name='ext_decorr_bad2', - tau=1e8, - target_topic_names=not_good_topic_names, - other_topic_model=topic_model, - other_topic_names=bad_topic_names, - ) - decorr_good_regularizer, good_phi = self._get_decorr_regularizer2( - name='ext_decorr_good2', - tau=1e8, + # TODO: test breaks if pass just `topic_model` for `parent_topic_model` + # aah, I guess, there are some score saving issues (_score_caches=None) + + base_topic_decorr_kwargs = dict( target_topic_names=not_good_topic_names, other_topic_model=topic_model, - other_topic_names=good_topic_names, ) + if not decorr_v2: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer( + name='ext_decorr_bad', tau=1e5, + other_topic_names=bad_topic_names, + **base_topic_decorr_kwargs, + ) + decorr_good_regularizer, good_phi = self._get_decorr_regularizer( + name='ext_decorr_good', tau=1e5, + other_topic_names=good_topic_names, + **base_topic_decorr_kwargs + ) + else: + decorr_bad_regularizer, bad_phi = self._get_decorr_regularizer2( + name='ext_decorr_bad2', tau=1e8, + other_topic_names=bad_topic_names, + **base_topic_decorr_kwargs + ) + decorr_good_regularizer, good_phi = self._get_decorr_regularizer2( + name='ext_decorr_good2', tau=1e8, + other_topic_names=good_topic_names, + **base_topic_decorr_kwargs + ) + topic_model._fit( dataset.get_batch_vectorizer(), num_iterations=self.ONE_FIT_NUM_ITERS, From 2afc4aeb1575145688a24ee79542c0aca09ef17c Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 19 Mar 2025 22:41:20 +0300 Subject: [PATCH 45/49] refine has_bcg usage (as much as possible) --- topnum/scores/diversity_score.py | 43 +++++++++------ .../topic_bank/topic_bank_method.py | 52 +++++++++---------- 2 files changed, 54 insertions(+), 41 deletions(-) diff --git a/topnum/scores/diversity_score.py b/topnum/scores/diversity_score.py index 683730c..2e05ee7 100644 --- a/topnum/scores/diversity_score.py +++ b/topnum/scores/diversity_score.py @@ -1,15 +1,20 @@ -from scipy.spatial.distance import pdist +import warnings + +from typing import ( + List, + Union +) + import numpy as np -from scipy.spatial.distance import squareform import pandas as pd + +from scipy.spatial.distance import pdist +from scipy.spatial.distance import squareform + from topicnet.cooking_machine.models import ( BaseScore as BaseTopicNetScore, TopicModel ) -from typing import ( - List, - Union -) from .base_custom_score import BaseCustomScore @@ -80,9 +85,9 @@ def __init__( name: str, metric: str = L2, class_ids: Union[List[str], str] = None, - topic_names = None, + topic_names: List[str] = None, closest: bool = False): - ''' + """ Parameters ---------- metric @@ -92,11 +97,12 @@ def __init__( (Actually, supports anything implemented in scipy.spatial.distance, but not everything is sanity-checked) class_ids + topic_names closest if False, the score will calculate average pairwise distance (default) if True, will calculate the average distance to the closest topic - ''' + """ super().__init__(name) metric = metric.lower() @@ -108,12 +114,24 @@ def __init__( self._score = self._initialize() + if self._topic_names is None: + warnings.warn( + 'Make sure you do not compute diversity with background topics!' + 'Specify the `topic_names` parameter if needed.' + ) + def _initialize(self) -> BaseTopicNetScore: return _DiversityScore(self._metric, self._class_ids, self._topic_names, self._closest) class _DiversityScore(BaseTopicNetScore): - def __init__(self, metric: str, class_ids: Union[List[str], str] = None, topic_names = None, closest: bool = False): + def __init__( + self, + metric: str, + class_ids: Union[List[str], str] = None, + topic_names: List[str] = None, + closest: bool = False + ): super().__init__() metric = metric.lower() @@ -137,11 +155,6 @@ def call(self, model: TopicModel): phi = model.get_phi(class_ids=self._class_ids) all_topic_names = list(phi.columns) - if hasattr(model, 'has_bcg'): - print(f'Detected bcg topics! Skipping for diversity computation (and now {len(all_topic_names) - 1} topics).') - - all_topic_names = all_topic_names[:-1] - if self._topic_names is not None: phi = phi.loc[:, self._topic_names] else: diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 52f749a..15baff6 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -385,10 +385,16 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._last_bank_phi = self._get_phi(self._topic_bank.topics, word2index) self._last_model_phi = phi - if hasattr(topic_model, 'has_bcg'): - print(f'Eliminating bcg topic before Hier. Cur |T| is {phi.shape[1]}, topics are: {phi.columns}.') + # TODO: TopicNet's model should be able to tell + # what topics are subject topics, + # and what topics are background ones + if hasattr(topic_model, 'num_bcg') and topic_model.num_bcg > 0: + print( + f'Eliminating {topic_model.num_bcg} bcg topic before Hierarchy.' + f' Current |T| is {phi.shape[1]}, topics are: {phi.columns}.' + ) - phi = phi.iloc[:, :-1] + phi = phi.iloc[:, :-topic_model.num_bcg] print(f'Now |T| is {phi.shape[1]}, topics are: {phi.columns}.') @@ -420,8 +426,12 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) _logger.info('Calculating model topic scores...') for topic_index, topic_name in enumerate(topic_model.get_phi().columns): - if hasattr(topic_model, 'has_bcg') and topic_index == num_model_topics - 1: - print('Skipping saving scores for bcg topic') + if hasattr(topic_model, 'num_bcg') and topic_index >= num_model_topics - topic_model.num_bcg: + print( + f'Skipping saving scores for bcg topic number {topic_index}' + f' of {num_model_topics} model topics.' + ) + continue topic_scores = dict() @@ -502,16 +512,14 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) topic_names=bank_phi.columns, ) - bank_model = _get_topic_model( self._dataset, main_modality=self._main_modality, num_topics=bank_phi.shape[1], scores=self._all_model_scores, - num_safe_fit_iterations=1 + num_safe_fit_iterations=1, ) - - # Safe fit to make topics so-so + # Safe fit to make topics so-so adequate (just in case) bank_model._fit( self._dataset.get_batch_vectorizer(), num_iterations=1, @@ -530,9 +538,6 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) } ) - print(f'!!! Bank Phi: {bank_phi.to_numpy()}.') - print(f'!!! Bank model Phi: {bank_model.get_phi().to_numpy()}.') - assert np.allclose( bank_phi.to_numpy(), bank_model.get_phi().to_numpy(), @@ -544,23 +549,22 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) scores.update(self._get_default_scores(bank_model)) scores['ppl_fair'] = bank_model.scores['ppl_fair'][-1] - # TODO: Second bank model is needed for experiments with regularizers + + # Model with one bcg topic bank_model = init_model_from_family( family='sparse', - dataset=self._dataset, main_modality=self._main_modality, - num_topics=len(bank_phi.columns), seed=0, + dataset=self._dataset, + main_modality=self._main_modality, + num_topics=len(bank_phi.columns), + seed=0, ) - - # Bcg sparse model - # assert hasattr(bank_model, 'has_bcg') - # assert bank_model.has_bcg - - # Safe fit to make topics so-so + # Safe fit to make topics so-so adequate (just in case) bank_model._fit( self._dataset.get_batch_vectorizer(), num_iterations=1, ) + bank_model._model.scores.add( artm.scores.PerplexityScore( name=f'ppl_cheatty', @@ -574,9 +578,6 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) } ) - print(f'!!! Bank Phi: {bank_phi.to_numpy()}.') - print(f'!!! Bank model Phi: {bank_model.get_phi().to_numpy()}.') - assert bank_model.get_phi().shape[1] == bank_phi.shape[1] + 1 assert np.allclose( bank_phi.to_numpy(), @@ -586,8 +587,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] - print(f'Bank scores: {scores}') - + print(f'Bank scores: {scores}.') # Topic scores already calculated From fb556538c32df9d4380f90088a51baa3cca75749 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 19 Mar 2025 23:34:02 +0300 Subject: [PATCH 46/49] fix topic bank --- .../topic_bank/topic_bank_method.py | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 15baff6..f2c3a79 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -355,7 +355,7 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) # self.save() if self._topic_score_threshold_percentile % 1 != 0: - print(f'Using absoulte threshold: {self._topic_score_threshold_percentile}.') + print(f'Using absolute threshold: {self._topic_score_threshold_percentile}.') threshold = self._topic_score_threshold_percentile else: @@ -442,6 +442,14 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) topic_word_prob_values[topic_word_prob_values > 1.0 / num_words] ) + if topic_scores[_KEY_TOPIC_SCORE_KERNEL_SIZE] == 0: + warnings.warn( + f'Not going to add topic "{topic_name}" to the bank' + f' because it has zero kernel!' + ) + + continue + for score_name in raw_topic_scores: topic_scores[score_name] = raw_topic_scores[score_name][topic_name] @@ -538,11 +546,19 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) } ) - assert np.allclose( - bank_phi.to_numpy(), - bank_model.get_phi().to_numpy(), - atol=1e-3, - ) + if not np.allclose( + bank_phi.to_numpy(), + bank_model.get_phi().to_numpy(), + atol=1e-3): + warnings.warn( + 'Seems that bank topics are not perfectly fixed in the bank topic model!' + ' Check your bank topics!' + ) + + print(f'Bank Phi:\n{bank_phi.to_numpy()}') + print(f'Total topic probs: {bank_phi.to_numpy().sum(axis=0)}.') + print(f'Bank model Phi:\n{bank_model.get_phi().to_numpy()}') + print(f'Total topic probs: {bank_model.get_phi().to_numpy().sum(axis=0)}.') _logger.info('Computing default scores for bank model...') @@ -578,12 +594,23 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) } ) + # One background topic assert bank_model.get_phi().shape[1] == bank_phi.shape[1] + 1 - assert np.allclose( - bank_phi.to_numpy(), - bank_model.get_phi().to_numpy()[:, :-1], - atol=1e-3, - ) + + if not np.allclose( + bank_phi.to_numpy(), + bank_model.get_phi().to_numpy()[:, :-1], + atol=1e-3): + warnings.warn( + 'Seems that bank topics are not perfectly fixed in the bank topic model!' + ' (The last model topic — background — is not considered.)' + ' Check your bank topics!' + ) + + print(f'Bank Phi:\n{bank_phi.to_numpy()}') + print(f'Total topic probs: {bank_phi.to_numpy().sum(axis=0)}.') + print(f'Bank model Phi (including bcg topic):\n{bank_model.get_phi().to_numpy()}') + print(f'Total topic probs (including bcg topic): {bank_model.get_phi().to_numpy().sum(axis=0)}.') scores['ppl_cheatty'] = bank_model.scores['ppl_cheatty'][-1] From 1bc1813b0a84f9840837227d9c31d48b61241dcf Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 19 Mar 2025 23:41:49 +0300 Subject: [PATCH 47/49] add some comments for older comments in tb --- topnum/search_methods/topic_bank/topic_bank_method.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index f2c3a79..b2cf5e6 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -352,6 +352,8 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._result[_KEY_MODEL_SCORES].append(scores) self._result[_KEY_NUM_MODEL_TOPICS].append(topic_model.get_phi().shape[1]) + # Better one time at the end of the iteration + # (otherwise, incomplete information will be saved) # self.save() if self._topic_score_threshold_percentile % 1 != 0: @@ -495,6 +497,8 @@ def search_for_optimum(self, text_collection: VowpalWabbitTextCollection = None) self._topic_bank.topic_scores # TODO: append ) + # Better one time at the end of the iteration + # (otherwise, incomplete information will be saved) # self.save() if self._save_model_topics: From cab445194a59307efc25eb805e08b96d1787f6ac Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Wed, 19 Mar 2025 23:44:15 +0300 Subject: [PATCH 48/49] return input mode for tb --- topnum/search_methods/topic_bank/topic_bank_method.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index b2cf5e6..923884e 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -221,12 +221,12 @@ def __init__( f' Are you sure you want to proceed (yes/no)?' ) - #answer = input() + answer = input() - #if strtobool(answer) is False: - # warnings.warn('Exiting') + if strtobool(answer) is False: + warnings.warn('Exiting') - # exit(0) + exit(0) self._topic_score_threshold_percentile = topic_score_threshold_percentile From d27e44bb4dea4a09021848afca3665f9655e6052 Mon Sep 17 00:00:00 2001 From: Vasily Alexeev Date: Thu, 20 Mar 2025 00:16:26 +0300 Subject: [PATCH 49/49] refine code, add pytest rerun (for a couple of intratext coherence tests) --- requirements.txt | 1 + topnum/model_constructor.py | 9 ++++++--- topnum/scores/diversity_score.py | 2 +- topnum/search_methods/topic_bank/topic_bank_method.py | 8 ++++---- topnum/tests/test_topic_bank.py | 2 +- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2a0ab7f..93a8a83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ matplotlib==3.7.5 numpy==1.24.4 pandas==2.0.3 pytest==8.1.1 +pytest-rerunfailures==14.0 scikit-learn==1.3.2 scipy==1.10.1 topicnet>=0.9.0 diff --git a/topnum/model_constructor.py b/topnum/model_constructor.py index f422107..1e01a59 100644 --- a/topnum/model_constructor.py +++ b/topnum/model_constructor.py @@ -99,9 +99,12 @@ def init_model_from_family( dataset, modalities_to_use, main_modality, num_topics, 1, model_params ) elif family == "decorrelation": - model = init_decorrelated_artm( - dataset, modalities_to_use, main_modality, num_topics, 1, model_params + model = init_decorrelated_plsa( + dataset, modalities_to_use, main_modality, num_topics, model_params ) + # model = init_decorrelated_artm( + # dataset, modalities_to_use, main_modality, num_topics, 1, model_params + # ) elif family == "ARTM": model = init_baseline_artm( dataset, modalities_to_use, main_modality, num_topics, 1, model_params @@ -213,6 +216,7 @@ def init_decorrelated_plsa( return model +# TODO: is it the same as init_baseline_artm? def init_decorrelated_artm( dataset, modalities_to_use, @@ -255,7 +259,6 @@ def init_decorrelated_artm( ) ) - dictionary = dataset.get_dictionary() baseline_class_ids = {class_id: 1 for class_id in modalities_to_use} data_stats = count_vocab_size(dictionary, baseline_class_ids) diff --git a/topnum/scores/diversity_score.py b/topnum/scores/diversity_score.py index 2e05ee7..0b0435b 100644 --- a/topnum/scores/diversity_score.py +++ b/topnum/scores/diversity_score.py @@ -117,7 +117,7 @@ def __init__( if self._topic_names is None: warnings.warn( 'Make sure you do not compute diversity with background topics!' - 'Specify the `topic_names` parameter if needed.' + ' Specify the `topic_names` parameter if needed.' ) def _initialize(self) -> BaseTopicNetScore: diff --git a/topnum/search_methods/topic_bank/topic_bank_method.py b/topnum/search_methods/topic_bank/topic_bank_method.py index 923884e..42961fa 100644 --- a/topnum/search_methods/topic_bank/topic_bank_method.py +++ b/topnum/search_methods/topic_bank/topic_bank_method.py @@ -687,14 +687,14 @@ def _extract_hierarchical_relationship( hierarchy = artm.hARTM(num_processors=1) - print(f'Creating first level with {bank_phi.shape[1]} topics. Dictionary: {self._dictionary}.') + _logger.debug(f'Creating first level with {bank_phi.shape[1]} topics. Dictionary: {self._dictionary}.') level0 = hierarchy.add_level( num_topics=bank_phi.shape[1] ) level0.initialize(dictionary=self._dictionary) - print( + _logger.debug( f'Copying phi for the first level.' f' Phi shape: {bank_phi.shape}.' f' First words: {bank_phi.index[:10]}' @@ -708,7 +708,7 @@ def _extract_hierarchical_relationship( small_num_fit_iterations=1 ) - print(f'Creating second level with {new_model_phi.shape[1]} topics') + _logger.debug(f'Creating second level with {new_model_phi.shape[1]} topics') level1 = hierarchy.add_level( num_topics=new_model_phi.shape[1], @@ -731,7 +731,7 @@ def _extract_hierarchical_relationship( ) ) - print( + _logger.debug( f'Copying phi for the second level.' f' Phi shape: {new_model_phi.shape}.' f' First words: {new_model_phi.index[:10]}' diff --git a/topnum/tests/test_topic_bank.py b/topnum/tests/test_topic_bank.py index b3440af..f5994ae 100644 --- a/topnum/tests/test_topic_bank.py +++ b/topnum/tests/test_topic_bank.py @@ -262,7 +262,7 @@ def test_topic_bank_specific_phi_arora(self, keep_in_memory, bank_update): document_occurrences_threshold_percentage=0.001 ) - print(f'Arora Phi: {phi}') + print(f'Arora Phi: {phi}.') assert not phi.isnull().any(axis=None)