3 changes: 2 additions & 1 deletion .gitignore
@@ -2,6 +2,7 @@
__pycache__/
*.py[codz]
*$py.class
*log

# C extensions
*.so
@@ -215,4 +216,4 @@ __marimo__/
# Streamlit
.streamlit/secrets.toml

**/*.jsonl
**/*.jsonl
6 changes: 6 additions & 0 deletions README.md
@@ -84,6 +84,12 @@ The annotation process includes two filtering stages. Heuristic-based filters en
uv run psilo dataset filter
```

### Uncertainty
Use the following command to run uncertainty quantification methods:
```bash
uv run psilo methods uncertainty
```

## Citation
```
@misc{rykov2025modelslielearnmultilingual,
3 changes: 2 additions & 1 deletion psilo/cli.py
@@ -1,9 +1,10 @@
import typer
from dataset.cli import app as dataset_app
from methods.cli import app as methods_app

app = typer.Typer(no_args_is_help=True, add_completion=False)
app.add_typer(dataset_app, name="dataset")

app.add_typer(methods_app, name="methods")

def main():
app()
Empty file added psilo/methods/__init__.py
Empty file.
39 changes: 39 additions & 0 deletions psilo/methods/cli.py
@@ -0,0 +1,39 @@
import typer
from loguru import logger
from .uncertainty.recompute_logits_with_uq import run_uncertainty_evaluation
from .uncertainty.evaluation import evaluate_uncertainty
from datasets import load_dataset

app = typer.Typer(help="PsiloQA Methods Pipeline")

@app.command("uncertainty")
def uncertainty():
logger.info("uncertainty evaluation cli")

dataset_val = load_dataset("s-nlp/PsiloQA", split="validation").to_pandas()
dataset_test = load_dataset("s-nlp/PsiloQA", split="test").to_pandas()

logger.info("loaded datasets")

logger.info("starting validation evaluation")
result_val = run_uncertainty_evaluation(dataset=dataset_val)
logger.info("finished validation evaluation")

logger.info("starting test evaluation")
result_test = run_uncertainty_evaluation(dataset=dataset_test)
logger.info("finished test evaluation")

logger.info("starting results aggregation")
evaluate_uncertainty(result_val, result_test)
logger.info("finished evaluation")

@app.command("encoder_train")
def encoder_train():
logger.info("encoder train cli")

@app.command("encoder_eval")
def encoder_eval():
logger.info("encoder evaluation cli")

if __name__ == "__main__":
app()
Empty file.
6 changes: 6 additions & 0 deletions psilo/methods/uncertainty/README.md
@@ -0,0 +1,6 @@
# Token-Level Uncertainty Quantification with LM-Polygraph

This code is adapted from the [LM-Polygraph](https://github.com/IINemo/lm-polygraph) project,
developed by Fadeeva et al. (2023) and released under the MIT License.
The original project provides a comprehensive framework for uncertainty quantification of large language models.
The code has been modified here to support token-level uncertainty quantification.
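
For intuition about what a token-level uncertainty score looks like, the sketch below computes per-token negative log-probabilities with a Hugging Face causal LM. This is a minimal illustrative baseline, not the CCP estimator implemented in this package; the model name and the `torch`/`transformers` dependencies are assumptions for the example.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # example model, not part of this repository
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

text = "The Eiffel Tower is located in Paris."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (1, seq_len, vocab_size)

# Log-probability of each token given its prefix (shift logits by one).
logprobs = torch.log_softmax(logits[0, :-1], dim=-1)
token_ids = inputs["input_ids"][0, 1:]
token_logprobs = logprobs[torch.arange(token_ids.size(0)), token_ids]

# Negative log-probability per token: higher means more uncertain.
for token, score in zip(tokenizer.convert_ids_to_tokens(token_ids.tolist()), -token_logprobs):
    print(f"{token:>12}  {score.item():.3f}")
```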
Empty file.
@@ -0,0 +1,65 @@
import numpy as np


class ClaimConditionedProbability:
    """Claim-Conditioned Probability (CCP) estimator, adapted from LM-Polygraph."""

def __str__(self):
return "CCP"

    def _reduce(self, logprobs: list[float]) -> list[float]:
        # Identity reduction: keep one CCP score per token instead of
        # aggregating to a single sequence-level score.
        return logprobs

def _combine_nli(self, forward: str, backward: str):
"""
Combines two NLI predictions NLI(x, y) and NLI(y, x) into a single prediction.

Prioritizes "entail" or "contra" if present, otherwise returns "neutral".
"""
if forward == backward:
return forward
if all(x in [forward, backward] for x in ["entail", "contra"]):
return "neutral"
for x in ["entail", "contra"]:
if x in [forward, backward]:
return x
return "neutral"

    def __call__(self, stats: dict[str, np.ndarray]) -> np.ndarray:
        # Per sample, `stats` provides the greedy tokens, each token's
        # alternatives as (token, logprob) pairs, and a matrix of pairwise
        # NLI labels between those alternatives ("entail"/"contra"/"neutral").
        words = stats["greedy_tokens"]
        alternatives = stats["greedy_tokens_alternatives"]
        alternatives_nli = stats["greedy_tokens_alternatives_nli"]
prob_nli = []
for sample_words, sample_alternatives, sample_alternatives_nli in zip(
words,
alternatives,
alternatives_nli,
):
sample_mnlis = []
for word, word_alternatives, word_alternatives_nli in zip(
sample_words,
sample_alternatives,
sample_alternatives_nli,
):
                entail_logprobs, entail_words = [], []
                contra_logprobs, contra_words = [], []
                for i in range(len(word_alternatives)):
                    word_alt, logprob = word_alternatives[i]
                    # Symmetrize the two NLI directions between the greedy
                    # token (index 0) and alternative i.
                    nli_outcome = self._combine_nli(
                        word_alternatives_nli[0][i],
                        word_alternatives_nli[i][0],
                    )
                    # The greedy token itself always counts as entailing.
                    if i == 0 or nli_outcome == "entail":
                        entail_logprobs.append(logprob)
                        entail_words.append(word_alt)
                    elif nli_outcome == "contra":
                        contra_logprobs.append(logprob)
                        contra_words.append(word_alt)
                # CCP = log P(entailing alternatives) - log P(entailing or
                # contradicting alternatives); neutral alternatives are ignored.
                entail_logprob = np.logaddexp.reduce(entail_logprobs)
                total_logprob = np.logaddexp.reduce(entail_logprobs + contra_logprobs)
                sample_mnlis.append(entail_logprob - total_logprob)
            prob_nli.append(self._reduce(sample_mnlis))
        # Negate so that higher scores correspond to higher uncertainty.
        return -np.array(prob_nli)
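
For orientation, here is a minimal sketch of calling this estimator on toy inputs. The values below are invented for illustration, and how `ClaimConditionedProbability` is imported depends on this file's path, which is not shown in the diff:

```python
import numpy as np

# Assumes ClaimConditionedProbability is in scope (imported from this module).
# One sample, one generated token ("Paris") with two alternatives; the NLI
# matrix gives pairwise labels between the three candidates.
stats = {
    "greedy_tokens": [["Paris"]],
    "greedy_tokens_alternatives": [[[
        ("Paris", np.log(0.7)),
        ("paris", np.log(0.2)),   # entails the greedy token
        ("London", np.log(0.1)),  # contradicts it
    ]]],
    "greedy_tokens_alternatives_nli": [[[
        ["entail", "entail", "contra"],
        ["entail", "entail", "contra"],
        ["contra", "contra", "entail"],
    ]]],
}

ccp = ClaimConditionedProbability()
print(ccp(stats))  # [[0.105...]] -- CCP = -log(0.9 / 1.0); higher = more uncertain
```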