[WIP] Experiments/better fix topics and other mods #104


Status: Open. Wants to merge 50 commits into base branch master.

Commits (50):
7c94ebe  fix fix topics (Alvant, Mar 23, 2024)
5733971  fix diversity, debug fix in topic bank (Alvant, Jul 20, 2024)
37fa99e  add semantic var and focon intratext (Alvant, Jul 20, 2024)
d058374  lick code (Alvant, Jul 20, 2024)
baa026b  add tests for new old coherences (Alvant, Jul 20, 2024)
1df50f7  fix tests (Alvant, Jul 20, 2024)
d2dd283  return cautious get relatedness (allow unknown words) (Alvant, Jul 20, 2024)
eebb429  tributize newly added coherences (Alvant, Jul 20, 2024)
831a20a  xfail semantic var and focon in tests (Alvant, Jul 20, 2024)
6796a0b  fix scores tests (Alvant, Jul 20, 2024)
1db643a  fix main modality usage in topic bank train and init funcs (Alvant, Jul 20, 2024)
f8e90fa  fix topic bank modality in tests (Alvant, Jul 20, 2024)
57c775f  fix topic bank modality in tests try 2 (Alvant, Jul 20, 2024)
462d803  fix arora, fix copy phi in init func, enhance topic bank tests (Alvant, Jul 20, 2024)
18ab9c5  Merge branch 'master' into experiments/better-fix-topics-and-other-mods (Alvant, Jul 20, 2024)
5db31b2  fix topic bank tests with regularization func (Alvant, Jul 20, 2024)
e0b48e7  update reqs as in tested code, add setup file (Alvant, Jul 20, 2024)
793d788  allow bigartm 10 in reqs (Alvant, Jul 20, 2024)
e6c7568  remove protobuf from reqs (it will go with topicnet) (Alvant, Jul 21, 2024)
0418fc8  move regularizers from notebooks to files (Alvant, Jul 21, 2024)
ac348c8  refine regularizers usage in topic bank (Alvant, Jul 21, 2024)
26c506e  fix topic bank (experiment vs code conflict) (Alvant, Jul 21, 2024)
9572fa2  fix bank phi equality assert (atol) (Alvant, Jul 21, 2024)
fb93ab6  accelerate intratext (Alvant, Jul 21, 2024)
caa4056  turn off should compute for intratext (compute only on last iter) (Alvant, Jul 21, 2024)
b47f17c  return should compute for intratext to sane default (should) (Alvant, Jul 21, 2024)
f8a316b  make equal semi windows for sum over window coherence (Alvant, Jul 21, 2024)
31ef6ec  soften assert equal check in topic bank (increase stability) (Alvant, Jul 21, 2024)
88322ad  add debug message for tb equality assert (Alvant, Jul 21, 2024)
7a597c8  soften atol in tb check as low as possible to remain decent (Alvant, Jul 21, 2024)
20e0cb8  add test for sum over different windows (Alvant, Jul 21, 2024)
2af66ea  trying to speed up topden (try instead if) (Alvant, Jul 21, 2024)
3a9f102  trying to speed up topden try 2: remove np floor from window (Alvant, Jul 21, 2024)
b2567c6  speeding up topdep try 3: remove density intersections (Alvant, Jul 21, 2024)
97579e5  speeding up topdep try 3: remove density intersections (fix) (Alvant, Jul 21, 2024)
2dbfcab  speeding up topdep: np.sum -> sum (Alvant, Jul 22, 2024)
255c72d  speeding up topdep: sum(list) -> v += dv (Alvant, Jul 22, 2024)
b00a4f8  fix right border in +dv (Alvant, Jul 22, 2024)
d605f50  use lru cache (unlimited) for get_relatedness (Alvant, Jul 22, 2024)
a05a8bc  use lru cache for get word topic index (Alvant, Jul 22, 2024)
7a9b928  remove lru cache for get topic index (no speed up) (Alvant, Jul 22, 2024)
dda0e5b  remove pre-save in topicbank (may lead to inconsistent results) (Alvant, Aug 7, 2024)
98437d3  comment something in topic bank for somebody (Alvant, Mar 19, 2025)
7050805  add tests for regs (Alvant, Mar 19, 2025)
87732f3  refactor regs tests (Alvant, Mar 19, 2025)
2afc4ae  refine has_bcg usage (as much as possible) (Alvant, Mar 19, 2025)
fb55653  fix topic bank (Alvant, Mar 19, 2025)
1bc1813  add some comments for older comments in tb (Alvant, Mar 19, 2025)
cab4451  return input mode for tb (Alvant, Mar 19, 2025)
d27e44b  refine code, add pytest rerun (for a couple of intratext coherence te… (Alvant, Mar 19, 2025)
23 changes: 12 additions & 11 deletions requirements.txt
@@ -1,12 +1,13 @@
 anchor-topic==0.1.2
-bigartm==0.9.2
-dill==0.3.1.1
-lapsolver==1.0.2
-matplotlib
-numpy==1.22.0
-pandas==1.0.1
-pytest==5.3.5
-scikit-learn==1.5.0
-scipy==1.10.0
-topicnet>=0.8.0
-tqdm==4.66.3
+bigartm>=0.9.2
+dill==0.3.8
+lapsolver==1.1.0
+matplotlib==3.7.5
+numpy==1.24.4
+pandas==2.0.3
+pytest==8.1.1
+pytest-rerunfailures==14.0
+scikit-learn==1.3.2
+scipy==1.10.1
+topicnet>=0.9.0
+tqdm==4.66.2
2 changes: 2 additions & 0 deletions setup.cfg
@@ -0,0 +1,2 @@
[metadata]
description-file = README.md
45 changes: 45 additions & 0 deletions setup.py
@@ -0,0 +1,45 @@
from distutils.core import setup


setup(
    name='topnum',
    packages=[
        'topnum',
        'topnum.data',
        'topnum.scores',
        'topnum.search_methods',
        'topnum.search_methods.topic_bank',
        'topnum.search_methods.topic_bank.phi_initialization',
        'topnum.tests'
    ],
    version='0.3.0',
    license='MIT',
    description='A set of methods for finding an appropriate number of topics in a text collection',
    author='Machine Intelligence Laboratory',
    author_email='vasiliy.alekseyev@phystech.edu',
    url='https://github.com/machine-intelligence-laboratory/OptimalNumberOfTopics',
    keywords=[
        'topic modeling',
        'document clustering',
        'number of clusters',
        'ARTM',
        'regularization',
    ],
    install_requires=[
        'anchor-topic==0.1.2',
        'bigartm>=0.9.2',
        'dill==0.3.8',
        'lapsolver==1.1.0',
        'matplotlib==3.7.5',
        'numpy==1.24.4',
        'pandas==2.0.3',
        'pytest==8.1.1',
        'scikit-learn==1.3.2',
        'scipy==1.10.1',
        'topicnet>=0.9.0',
        'tqdm==4.66.2',
    ],
    classifiers=[
        'Programming Language :: Python :: 3.8',
    ],
)
79 changes: 79 additions & 0 deletions topnum/model_constructor.py
@@ -102,6 +102,9 @@ def init_model_from_family(
        model = init_decorrelated_plsa(
            dataset, modalities_to_use, main_modality, num_topics, model_params
        )
        # model = init_decorrelated_artm(
        #     dataset, modalities_to_use, main_modality, num_topics, 1, model_params
        # )
    elif family == "ARTM":
        model = init_baseline_artm(
            dataset, modalities_to_use, main_modality, num_topics, 1, model_params
@@ -213,6 +216,82 @@
    return model


# TODO: is it the same as init_baseline_artm?
def init_decorrelated_artm(
        dataset,
        modalities_to_use,
        main_modality,
        num_topics,
        bcg_topics,
        model_params: dict = None
):
    """
    Creates a simple ARTM model with standard scores.

    Parameters
    ----------
    dataset : Dataset
    modalities_to_use : list of str
    main_modality : str
    num_topics : int
    bcg_topics : int
        Number of background topics
    model_params : dict

    Returns
    -------
    model : artm.ARTM instance
    """
    if model_params is None:
        model_params = dict()

    model = init_plsa(
        dataset, modalities_to_use, main_modality, num_topics
    )
    tau = model_params.get('decorrelation_tau', 0.01)

    specific_topic_names = model.topic_names  # let's decorrelate everything
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(
            gamma=0,
            tau=tau,
            name='decorrelation',
            topic_names=specific_topic_names,
            class_ids=modalities_to_use,
        )
    )

    dictionary = dataset.get_dictionary()
    baseline_class_ids = {class_id: 1 for class_id in modalities_to_use}
    data_stats = count_vocab_size(dictionary, baseline_class_ids)

    background_topic_names = model.topic_names[-bcg_topics:]
    specific_topic_names = model.topic_names[:-bcg_topics]

    # all coefficients are relative
    regularizers = [
        artm.SmoothSparsePhiRegularizer(
            name='smooth_phi_bcg',
            topic_names=background_topic_names,
            tau=model_params.get("smooth_bcg_tau", 0.1),
            class_ids=[main_modality],
        ),
        artm.SmoothSparseThetaRegularizer(
            name='smooth_theta_bcg',
            topic_names=background_topic_names,
            tau=model_params.get("smooth_bcg_tau", 0.1),
        ),
    ]

    for reg in regularizers:
        model.regularizers.add(transform_regularizer(
            data_stats,
            reg,
            model.class_ids,
            n_topics=len(reg.topic_names)
        ))

    return model


def _init_dirichlet_prior(name, num_topics, num_terms):
    """
    Adapted from github.com/RaRe-Technologies/gensim/blob/master/gensim/models/ldamodel.py#L521
5 changes: 5 additions & 0 deletions topnum/regularizers/__init__.py
@@ -0,0 +1,5 @@
from .fix_phi import FastFixPhiRegularizer
from .decorrelate_with_other_phi import (
DecorrelateWithOtherPhiRegularizer,
DecorrelateWithOtherPhiRegularizer2,
)
122 changes: 122 additions & 0 deletions topnum/regularizers/decorrelate_with_other_phi.py
@@ -0,0 +1,122 @@
from typing import List, Optional

import numpy as np
from numpy import ndarray
from pandas import DataFrame
from scipy.spatial.distance import cdist

from artm import ARTM
from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer


# TODO: find (and make possible to use) relative taus for these regularizers

class DecorrelateWithOtherPhiRegularizer(BaseRegularizer):
    def __init__(
            self,
            name: str,
            tau: float,
            topic_names: List[str],
            other_phi: DataFrame,
    ):
        """
        Parameters
        ----------
        name
        tau
            To select a value, try a few test runs to find the tau
            that affects the perplexity (worsens it, but not very much).
            Recommendation based on experimentation: try 1e5 or 1e6.
        topic_names
        other_phi
        """
        super().__init__(name, tau=tau)

        self._topic_names = topic_names
        self._other_phi = other_phi
        self._other_topic_sum = self._other_phi.values.sum(
            axis=1, keepdims=True
        )

        self._topic_indices = None

    def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray:
        rwt = np.zeros_like(pwt)
        rwt[:, self._topic_indices] += (
            pwt.values[:, self._topic_indices] * self._other_topic_sum
        )

        return -1 * self.tau * rwt

    def attach(self, model: ARTM) -> None:
        super().attach(model)

        phi = model.get_phi()
        self._topic_indices = [
            phi.columns.get_loc(topic_name)
            for topic_name in self._topic_names
        ]


class DecorrelateWithOtherPhiRegularizer2(BaseRegularizer):
    def __init__(
            self,
            name: str,
            tau: float,
            topic_names: List[str],
            other_phi: DataFrame,
            num_iters: Optional[int] = None,
    ):
        """
        Parameters
        ----------
        name
        tau
            To select a value, try a few test runs to find the tau
            that affects the perplexity (worsens it, but not very much).
            Recommendation based on experimentation: try 1e8, 1e9, or 1e10.
        topic_names
        other_phi
        num_iters
        """
        super().__init__(name, tau=tau)

        self._topic_names = topic_names
        self._other_phi = other_phi
        self._num_iters = num_iters
        self._cur_iter = 0

        self._topic_indices = None

    def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray:
        rwt = np.zeros_like(pwt)

        if self._num_iters is not None and self._cur_iter >= self._num_iters:
            return rwt

        correlations = cdist(
            self._other_phi.values.T,
            pwt.values[:, self._topic_indices].T,
            lambda u, v: (u * v).sum()
        )
        weighted_other_topics = self._other_phi.values.dot(correlations)

        rwt[:, self._topic_indices] += (
            pwt.values[:, self._topic_indices] * weighted_other_topics
        )
        self._cur_iter += 1

        return -1 * self.tau * rwt

    def attach(self, model: ARTM) -> None:
        super().attach(model)

        phi = model.get_phi()
        self._topic_indices = [
            phi.columns.get_loc(topic_name)
            for topic_name in self._topic_names
        ]
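The custom `cdist` "metric" in `DecorrelateWithOtherPhiRegularizer2.grad` is just a dot product of two topic columns, so the correlation matrix it builds equals a plain matrix product. A minimal NumPy sketch with toy shapes (not the library's API; names here are illustrative) showing the equivalence:

```python
import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
n_words, n_other, n_sel = 100, 5, 3
other_phi = rng.random((n_words, n_other))  # stand-in for the fixed "other" Phi
pwt_sel = rng.random((n_words, n_sel))      # stand-in for pwt[:, topic_indices]

# The regularizer computes topic-by-topic correlations via cdist
# with a dot-product callback, as in the grad method above
correlations = cdist(other_phi.T, pwt_sel.T, lambda u, v: (u * v).sum())

# The same matrix as one matrix product (no Python-level callback)
correlations_mm = other_phi.T @ pwt_sel

assert correlations.shape == (n_other, n_sel)
assert np.allclose(correlations, correlations_mm)

# The gradient then weights the other topics by these correlations
weighted_other_topics = other_phi @ correlations  # shape (n_words, n_sel)
```

Given the PR's several speed-up commits, replacing the callback-based `cdist` with the matrix product could be a further optimization, since a Python callback is evaluated once per pair of columns.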
57 changes: 57 additions & 0 deletions topnum/regularizers/fix_phi.py
@@ -0,0 +1,57 @@
from typing import List, Optional

import numpy as np
from numpy import ndarray
from pandas import DataFrame

from artm import ARTM
from topicnet.cooking_machine.models.topic_model import TopicModel
from topicnet.cooking_machine.models.base_regularizer import BaseRegularizer


class FastFixPhiRegularizer(BaseRegularizer):
    _VERY_BIG_TAU = 10 ** 9

    def __init__(
            self,
            name: str,
            topic_names: List[str],
            parent_model: Optional[TopicModel] = None,  # TODO: TopicModel or ARTM?
            parent_phi: DataFrame = None,
            tau: float = _VERY_BIG_TAU,
    ):
        super().__init__(name, tau=tau)

        if parent_phi is None and parent_model is None:
            raise ValueError('Neither parent_phi nor parent_model is specified.')

        self._topic_names = topic_names
        self._topic_indices = None
        self._parent_model = parent_model
        self._parent_phi = parent_phi

    def grad(self, pwt: DataFrame, nwt: DataFrame) -> ndarray:
        rwt = np.zeros_like(pwt)

        if self._parent_phi is not None:
            # parent_phi is expected to contain exactly the columns being fixed
            parent_phi = self._parent_phi
            vals = parent_phi.values
        else:
            parent_phi = self._parent_model.get_phi()
            vals = parent_phi.values[:, self._topic_indices]

        assert vals.shape[0] == rwt.shape[0]
        assert vals.shape[1] == len(self._topic_indices), (vals.shape[1], len(self._topic_indices))

        rwt[:, self._topic_indices] += vals

        return self.tau * rwt

    def attach(self, model: ARTM) -> None:
        super().attach(model)

        phi = self._model.get_phi()
        self._topic_indices = [
            phi.columns.get_loc(topic_name)
            for topic_name in self._topic_names
        ]
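`FastFixPhiRegularizer` pins selected Phi columns by adding a huge additive term (`tau` around 1e9) proportional to the parent's columns, so after the M-step normalization those columns are dominated by the parent values. A small NumPy sketch of the `grad` logic under toy shapes (illustrative only, not the library's API):

```python
import numpy as np

rng = np.random.default_rng(1)
n_words, n_topics = 50, 4
tau = 10 ** 9  # _VERY_BIG_TAU

pwt = rng.random((n_words, n_topics))   # current Phi estimate (toy)
parent_phi = rng.random((n_words, 2))   # columns to be kept fixed (toy)
topic_indices = [0, 2]                  # positions of the fixed topics in pwt

# r_wt as computed by FastFixPhiRegularizer.grad:
# zero everywhere except the fixed columns, which get tau * parent_phi
rwt = np.zeros_like(pwt)
rwt[:, topic_indices] += parent_phi
rwt = tau * rwt

# Columns outside topic_indices receive zero gradient
assert rwt[:, 1].sum() == 0 and rwt[:, 3].sum() == 0
# For the fixed columns, n_wt + r_wt is dominated by tau * parent_phi,
# so those columns of the re-estimated Phi stay close to parent_phi
# (up to normalization)
```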