3 changes: 2 additions & 1 deletion .gitignore
@@ -2,6 +2,7 @@
__pycache__/
*.py[codz]
*$py.class
*log

# C extensions
*.so
@@ -215,4 +216,4 @@ __marimo__/
# Streamlit
.streamlit/secrets.toml

**/*.jsonl
**/*.jsonl
6 changes: 6 additions & 0 deletions README.md
@@ -84,6 +84,12 @@ The annotation process includes two filtering stages. Heuristic-based filters en
uv run psilo dataset filter
```

### Uncertainty
Use the following command to run uncertainty quantification methods:
```bash
uv run psilo methods uncertainty
```

## Citation
```
@misc{rykov2025modelslielearnmultilingual,
3 changes: 2 additions & 1 deletion psilo/cli.py
@@ -1,9 +1,10 @@
import typer
from dataset.cli import app as dataset_app
from methods.cli import app as methods_app

app = typer.Typer(no_args_is_help=True, add_completion=False)
app.add_typer(dataset_app, name="dataset")

app.add_typer(methods_app, name="methods")

def main():
app()
Empty file added psilo/methods/__init__.py
Empty file.
39 changes: 39 additions & 0 deletions psilo/methods/cli.py
@@ -0,0 +1,39 @@
import typer
from loguru import logger
from .uncertainty.recompute_logits_with_uq import run_uncertainty_evaluation
from .uncertainty.evaluation import evaluate_uncertainty
from datasets import load_dataset

app = typer.Typer(help="PsiloQA Methods Pipeline")

@app.command("uncertainty")
def uncertainty():
logger.info("uncertainty evaluation cli")

dataset_val = load_dataset("s-nlp/PsiloQA", split="validation").to_pandas()
dataset_test = load_dataset("s-nlp/PsiloQA", split="test").to_pandas()

logger.info("loaded datasets")

logger.info("starting validation evaluation")
result_val = run_uncertainty_evaluation(dataset=dataset_val)
logger.info("finished validation evaluation")

logger.info("starting test evaluation")
result_test = run_uncertainty_evaluation(dataset=dataset_test)
logger.info("finished test evaluation")

logger.info("starting results aggregation")
evaluate_uncertainty(result_val, result_test)
logger.info("finished evaluation")

@app.command("encoder_train")
def encoder_train():
logger.info("encoder train cli")

@app.command("encoder_eval")
def encoder_eval():
logger.info("encoder evaluation cli")

if __name__ == "__main__":
app()
Empty file.
6 changes: 6 additions & 0 deletions psilo/methods/uncertainty/README.md
@@ -0,0 +1,6 @@
# Token-Level Uncertainty Quantification with LM-Polygraph

This code is adapted from the [LM-Polygraph](https://github.com/IINemo/lm-polygraph) project,
developed by Fadeeva et al. (2023) and released under the MIT License.
The original project provides a comprehensive framework for uncertainty quantification of large language models.
The code has been modified here to support token-level uncertainty quantification.
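
For intuition about what a token-level uncertainty score looks like, the sketch below computes per-token negative log-probabilities with a Hugging Face causal LM. This is a minimal illustrative baseline, not the CCP estimator implemented in this package; the model name and the `torch`/`transformers` dependencies are assumptions for the example.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # example model, not part of this repository
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

text = "The Eiffel Tower is located in Paris."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # (1, seq_len, vocab_size)

# Log-probability of each token given its prefix (shift logits by one).
logprobs = torch.log_softmax(logits[0, :-1], dim=-1)
token_ids = inputs["input_ids"][0, 1:]
token_logprobs = logprobs[torch.arange(token_ids.size(0)), token_ids]

# Negative log-probability per token: higher means more uncertain.
for token, score in zip(tokenizer.convert_ids_to_tokens(token_ids.tolist()), -token_logprobs):
    print(f"{token:>12}  {score.item():.3f}")
```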
Empty file.
@@ -0,0 +1,65 @@
import numpy as np


class ClaimConditionedProbability:
    """Claim-Conditioned Probability (CCP) estimator, adapted from LM-Polygraph."""

def __str__(self):
return "CCP"

    def _reduce(self, logprobs: list[float]) -> list[float]:
        # Identity reduction: keep one CCP score per token instead of
        # aggregating to a single sequence-level score.
        return logprobs

def _combine_nli(self, forward: str, backward: str):
"""
Combines two NLI predictions NLI(x, y) and NLI(y, x) into a single prediction.

Prioritizes "entail" or "contra" if present, otherwise returns "neutral".
"""
if forward == backward:
return forward
if all(x in [forward, backward] for x in ["entail", "contra"]):
return "neutral"
for x in ["entail", "contra"]:
if x in [forward, backward]:
return x
return "neutral"

    def __call__(self, stats: dict[str, np.ndarray]) -> np.ndarray:
        # Per sample, `stats` provides the greedy tokens, each token's
        # alternatives as (token, logprob) pairs, and a matrix of pairwise
        # NLI labels between those alternatives ("entail"/"contra"/"neutral").
        words = stats["greedy_tokens"]
        alternatives = stats["greedy_tokens_alternatives"]
        alternatives_nli = stats["greedy_tokens_alternatives_nli"]
prob_nli = []
for sample_words, sample_alternatives, sample_alternatives_nli in zip(
words,
alternatives,
alternatives_nli,
):
sample_mnlis = []
for word, word_alternatives, word_alternatives_nli in zip(
sample_words,
sample_alternatives,
sample_alternatives_nli,
):
                entail_logprobs, entail_words = [], []
                contra_logprobs, contra_words = [], []
                for i in range(len(word_alternatives)):
                    word_alt, logprob = word_alternatives[i]
                    # Symmetrize the two NLI directions between the greedy
                    # token (index 0) and alternative i.
                    nli_outcome = self._combine_nli(
                        word_alternatives_nli[0][i],
                        word_alternatives_nli[i][0],
                    )
                    # The greedy token itself always counts as entailing.
                    if i == 0 or nli_outcome == "entail":
                        entail_logprobs.append(logprob)
                        entail_words.append(word_alt)
                    elif nli_outcome == "contra":
                        contra_logprobs.append(logprob)
                        contra_words.append(word_alt)
                # CCP = log P(entailing alternatives) - log P(entailing or
                # contradicting alternatives); neutral alternatives are ignored.
                entail_logprob = np.logaddexp.reduce(entail_logprobs)
                total_logprob = np.logaddexp.reduce(entail_logprobs + contra_logprobs)
                sample_mnlis.append(entail_logprob - total_logprob)
            prob_nli.append(self._reduce(sample_mnlis))
        # Negate so that higher scores correspond to higher uncertainty.
        return -np.array(prob_nli)
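
For orientation, here is a minimal sketch of calling this estimator on toy inputs. The values below are invented for illustration, and how `ClaimConditionedProbability` is imported depends on this file's path, which is not shown in the diff:

```python
import numpy as np

# Assumes ClaimConditionedProbability is in scope (imported from this module).
# One sample, one generated token ("Paris") with two alternatives; the NLI
# matrix gives pairwise labels between the three candidates.
stats = {
    "greedy_tokens": [["Paris"]],
    "greedy_tokens_alternatives": [[[
        ("Paris", np.log(0.7)),
        ("paris", np.log(0.2)),   # entails the greedy token
        ("London", np.log(0.1)),  # contradicts it
    ]]],
    "greedy_tokens_alternatives_nli": [[[
        ["entail", "entail", "contra"],
        ["entail", "entail", "contra"],
        ["contra", "contra", "entail"],
    ]]],
}

ccp = ClaimConditionedProbability()
print(ccp(stats))  # [[0.105...]] -- CCP = -log(0.9 / 1.0); higher = more uncertain
```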