feat(transcript-cleaner): add transcript cleaning feature with custom vocabulary #6

Open · wants to merge 7 commits into master
1 change: 1 addition & 0 deletions .env.example
@@ -11,6 +11,7 @@ OLLAMA_DEFAULT_MODEL=tinyllama

# Path configuration
VOICE_MEMOS_DIR=VoiceMemos
VOCABULARY_FILE=VOCABULARY.txt

# Whisper model configuration
WHISPER_MODEL=base.en
20 changes: 20 additions & 0 deletions README.md
@@ -11,6 +11,7 @@ The main idea is to let your computer do computer work, while you're out and about
## Features

- 🎙️ Automatic voice memo transcription
- 🧹 Transcript cleaning with customizable vocabulary
- 🔌 Flexible plugin system for content extraction
- 🤖 AI-powered content generation using Ollama
- 📝 Built-in plugins for:
@@ -47,11 +48,30 @@ This will:

## How to use

### Basic Usage

- Use whatever you want to record voice notes (I use [Fossify](https://github.com/FossifyOrg/Voice-Recorder))
- Use whatever you want to sync your files (I use [Syncthing](https://syncthing.net/))
- Use whatever you want to look at the markdown/output files (I use [Zettel Notes](https://www.zettelnotes.com/))
- Run the `./watch.sh` script on an idle machine to get the most out of it

### Transcript Cleaning

VibeLine includes a transcript cleaning feature that corrects common transcription errors based on a customizable vocabulary file. This is especially useful for technical terms, names, or domain-specific jargon that speech recognition models often misinterpret.

To use this feature:

1. Edit the `VOCABULARY.txt` file in the root directory to add your custom corrections:
```
# Format: incorrect_word -> correct_word
Noster -> Nostr
```

2. The transcript cleaner will automatically run as part of the extraction process.

3. You can adjust the behavior with the following option:
- `--no-clean`: Skip the transcript cleaning step entirely
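
The cleaner can also be used programmatically. A minimal sketch, assuming you run from the repository root so that `src` is importable:

```python
from pathlib import Path
from src.transcript_cleaner import TranscriptCleaner

# Load corrections from the vocabulary file in the repository root
cleaner = TranscriptCleaner(vocabulary_file=Path("VOCABULARY.txt"))

cleaned, corrections = cleaner.clean_transcript("I posted the note on Noster.")
print(cleaned)  # -> "I posted the note on Nostr."
print(f"{len(corrections)} correction(s) made")
```

To disable cleaning for a single run, pass `--no-clean` to `src/extract.py`.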

## Contributors

[![Contributors](https://contrib.rocks/image?repo=dergigi/vibeline)](https://github.com/dergigi/vibeline/graphs/contributors)
21 changes: 21 additions & 0 deletions VOCABULARY.txt
@@ -0,0 +1,21 @@
# Vocabulary file for transcript corrections
# Format: incorrect_word -> correct_word

# Nostr-related terms
Noster -> Nostr
Nostril -> Nostr
Nostrum -> Nostr
Nostre -> Nostr
Nostra -> Nostr

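# VibeLine-related terms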
Pipeline -> vibeline
vive line -> vibeline

# Technical terms
API's -> APIs
SDK's -> SDKs
Java script -> JavaScript
Type script -> TypeScript
Pie thon -> Python
Rust Lang -> Rust
Go Lang -> Go
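
One entry covers all casings: lookups are case-insensitive, and single-word replacements keep the original capitalization pattern (see `_apply_direct_corrections` in `src/transcript_cleaner.py` below). A quick illustration with the file above, using hypothetical inputs:

```python
from pathlib import Path
from src.transcript_cleaner import TranscriptCleaner

cleaner = TranscriptCleaner(vocabulary_file=Path("VOCABULARY.txt"))

# The same "Noster -> Nostr" entry covers all three casings:
print(cleaner.clean_transcript("Noster")[0])  # Nostr
print(cleaner.clean_transcript("NOSTER")[0])  # NOSTR  (all-caps input stays all-caps)
print(cleaner.clean_transcript("noster")[0])  # Nostr  (replacement keeps its vocabulary casing)
```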
42 changes: 40 additions & 2 deletions src/extract.py
@@ -11,6 +11,7 @@
from typing import List, Dict
from dotenv import load_dotenv
from plugin_manager import PluginManager, Plugin
from transcript_cleaner import TranscriptCleaner

# Load environment variables
load_dotenv()
@@ -21,6 +22,7 @@
# Configuration from environment variables
OLLAMA_MODEL = os.getenv("OLLAMA_EXTRACT_MODEL", "llama2")
VOICE_MEMOS_DIR = os.getenv("VOICE_MEMOS_DIR", "VoiceMemos")
VOCABULARY_FILE = os.getenv("VOCABULARY_FILE", "VOCABULARY.txt")

# Set a different host (default is http://localhost:11434)
ollama.host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
@@ -84,11 +86,12 @@ def main():
    parser = argparse.ArgumentParser(description='Extract content from transcripts using plugins.')
    parser.add_argument('transcript_file', help='The transcript file to process')
    parser.add_argument('-f', '--force', action='store_true', help='Force overwrite existing output files')
    parser.add_argument('--no-clean', action='store_true', help='Skip transcript cleaning step')
    args = parser.parse_args()

    # Ensure the default model exists
    ensure_model_exists(OLLAMA_MODEL)

    input_file = Path(args.transcript_file)
    if not input_file.exists():
        print(f"Error: File {input_file} does not exist")
@@ -118,7 +121,42 @@ def main():

    # Read transcript
    with open(input_file, 'r', encoding='utf-8') as f:
        original_transcript_text = f.read()

    # Clean transcript if not disabled
    if not args.no_clean:
        print("Cleaning transcript...")

        # Check if vocabulary file exists
        vocabulary_path = Path(VOCABULARY_FILE)
        if not vocabulary_path.exists():
            print(f"Warning: Vocabulary file {VOCABULARY_FILE} not found. Skipping transcript cleaning.")
            transcript_text = original_transcript_text
        else:
            # Initialize transcript cleaner and clean the transcript
            cleaner = TranscriptCleaner(vocabulary_file=vocabulary_path)
            transcript_text, corrections = cleaner.clean_transcript(original_transcript_text)

            # Log corrections
            if corrections:
                print(f"Made {len(corrections)} corrections to the transcript.")

                # List the corrections made
                for i, correction in enumerate(corrections, 1):
                    print(f"{i}. Line {correction['line']}:")
                    print(f"   Original:  {correction['original']}")
                    print(f"   Corrected: {correction['corrected']}")

                # Save cleaned transcript
                cleaned_file = input_file.parent / f"{input_file.stem}_cleaned.txt"
                with open(cleaned_file, 'w', encoding='utf-8') as f:
                    f.write(transcript_text)
                print(f"Cleaned transcript saved to: {cleaned_file}")
            else:
                print("No corrections needed for this transcript.")
                transcript_text = original_transcript_text
    else:
        transcript_text = original_transcript_text

    # Read summary if it exists
    summary_file = input_file.parent / f"{input_file.stem}_summary.txt"
129 changes: 129 additions & 0 deletions src/transcript_cleaner.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python3

import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# ollama import removed as we no longer use LLM for cleaning

class TranscriptCleaner:
    def __init__(self, vocabulary_file: Optional[Path] = None):
        """
        Initialize the transcript cleaner with a vocabulary file.

        Args:
            vocabulary_file: Path to the vocabulary file containing corrections
        """
        self.vocabulary_file = vocabulary_file
        self.corrections = {}

        # Load vocabulary file if provided
        if vocabulary_file and vocabulary_file.exists():
            self._load_vocabulary()

    def _load_vocabulary(self) -> None:
        """Load word corrections from the vocabulary file."""
        with open(self.vocabulary_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue

                # Format: incorrect_word -> correct_word (split on the first arrow only)
                if '->' in line:
                    incorrect, correct = [part.strip() for part in line.split('->', 1)]
                    self.corrections[incorrect.lower()] = correct

    def _apply_direct_corrections(self, text: str) -> str:
        """Apply direct word corrections from the vocabulary file."""
        if not self.corrections:
            return text

        # Split the text into words while preserving whitespace and punctuation
        # This regex captures words, whitespace, and punctuation separately
        tokens = re.findall(r'(\b\w+\b|\s+|[^\w\s])', text)
        corrected_tokens = []

        for token in tokens:
            # Only process word tokens (skip whitespace and punctuation)
            if re.match(r'\b\w+\b', token):
                token_lower = token.lower()

                # Check if this token needs correction
                if token_lower in self.corrections:
                    replacement = self.corrections[token_lower]

                    # Preserve original capitalization
                    if token.isupper():
                        replacement = replacement.upper()
                    elif token[0].isupper():
                        replacement = replacement[0].upper() + replacement[1:]

                    # Use the replacement
                    corrected_tokens.append(replacement)
                else:
                    # No correction needed
                    corrected_tokens.append(token)
            else:
                # Preserve whitespace and punctuation
                corrected_tokens.append(token)

        # Join the tokens back into text
        corrected_text = ''.join(corrected_tokens)

        # Also handle multi-token entries: phrases with spaces, and entries such
        # as "API's" that contain punctuation and can never match a single token
        for incorrect, correct in self.corrections.items():
            if not re.fullmatch(r'\w+', incorrect):
                # Case-insensitive pattern, anchored at word boundaries
                pattern = re.compile(r'\b' + re.escape(incorrect) + r'\b', re.IGNORECASE)

                def replace_phrase(match):
                    original = match.group(0)
                    # Preserve case pattern if possible
                    if original.isupper():
                        return correct.upper()
                    elif original[0].isupper():
                        return correct[0].upper() + correct[1:]
                    return correct

                # Apply the replacement
                corrected_text = pattern.sub(replace_phrase, corrected_text)

        return corrected_text
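
    # Illustration of the two passes above (hypothetical input, for clarity):
    #   "I posted about Noster using Java script"
    #   token pass:   "Noster"      -> "Nostr"
    #   phrase pass:  "Java script" -> "JavaScript"
    #   result:       "I posted about Nostr using JavaScript"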

# No LLM-based methods needed anymore

    def clean_transcript(self, text: str) -> Tuple[str, List[Dict]]:
        """
        Clean the transcript by applying vocabulary corrections.

        Args:
            text: The transcript text to clean

        Returns:
            Tuple containing:
            - The cleaned transcript text
            - A list of corrections made (for logging/debugging)
        """
        original_text = text

        # Apply direct word-for-word corrections
        cleaned_text = self._apply_direct_corrections(text)

        # Log corrections for debugging
        corrections_made = []
        if cleaned_text != original_text:
            # Find differences between original and corrected text
            for i, (orig_line, new_line) in enumerate(zip(original_text.splitlines(), cleaned_text.splitlines())):
                if orig_line != new_line:
                    corrections_made.append({
                        'line': i + 1,
                        'original': orig_line,
                        'corrected': new_line
                    })

        return cleaned_text, corrections_made
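
For reference, a minimal sketch of what `clean_transcript` returns, assuming the `VOCABULARY.txt` above is loaded (the input text is hypothetical):

```python
from pathlib import Path
from src.transcript_cleaner import TranscriptCleaner

cleaner = TranscriptCleaner(vocabulary_file=Path("VOCABULARY.txt"))
text, corrections = cleaner.clean_transcript("Noster is neat.\n")
# text        == "Nostr is neat.\n"
# corrections == [{'line': 1, 'original': 'Noster is neat.', 'corrected': 'Nostr is neat.'}]
```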
78 changes: 78 additions & 0 deletions tests/transcript_cleaner.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

import sys
import os
from pathlib import Path
from dotenv import load_dotenv

# Add the repository root to the Python path so the src package is importable
sys.path.append(str(Path(__file__).parent.parent))
from src.transcript_cleaner import TranscriptCleaner

# Load environment variables
load_dotenv()

def main():
    vocabulary_file = Path(os.getenv("VOCABULARY_FILE", "VOCABULARY.txt"))

    # Check if vocabulary file exists
    if not vocabulary_file.exists():
        print(f"Error: Vocabulary file {vocabulary_file} not found.")
        print("Please create it first or specify a different file with the VOCABULARY_FILE environment variable.")
        sys.exit(1)

    # Check command line arguments
    if len(sys.argv) < 2:
        print("Usage: python tests/transcript_cleaner.py <text_to_clean>")
        print("   or: python tests/transcript_cleaner.py --file <transcript_file>")
        sys.exit(1)

    # Get the text to clean
    if sys.argv[1] == "--file":
        if len(sys.argv) < 3:
            print("Error: No file specified after --file")
            sys.exit(1)

        file_path = Path(sys.argv[2])
        if not file_path.exists():
            print(f"Error: File {file_path} not found")
            sys.exit(1)

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        print(f"Cleaning transcript from file: {file_path}")
    else:
        # Use the command line arguments as the text
        text = " ".join(sys.argv[1:])
        print(f"Cleaning text: {text}")

    # Initialize the transcript cleaner
    cleaner = TranscriptCleaner(vocabulary_file=vocabulary_file)

    # Clean the transcript
    cleaned_text, corrections = cleaner.clean_transcript(text)

    # Print the results
    print("\nOriginal text:")
    print("-" * 80)
    print(text)
    print("-" * 80)

    print("\nCleaned text:")
    print("-" * 80)
    print(cleaned_text)
    print("-" * 80)

    # Print corrections
    if corrections:
        print(f"\nMade {len(corrections)} corrections:")
        for i, correction in enumerate(corrections, 1):
            print(f"{i}. Line {correction['line']}:")
            print(f"   Original:  {correction['original']}")
            print(f"   Corrected: {correction['corrected']}")
    else:
        print("\nNo corrections were made.")

if __name__ == "__main__":
    main()