feat(transcript-cleaner): add transcript cleaning feature with custom vocabulary #6

Open · wants to merge 7 commits into master
1 change: 1 addition & 0 deletions .env.example
@@ -11,6 +11,7 @@ OLLAMA_DEFAULT_MODEL=tinyllama

# Path configuration
VOICE_MEMOS_DIR=VoiceMemos
VOCABULARY_FILE=VOCABULARY.txt

# Whisper model configuration
WHISPER_MODEL=base.en
20 changes: 20 additions & 0 deletions README.md
@@ -11,6 +11,7 @@ The main idea is to let your computer do computer work, while you're out and about
## Features

- 🎙️ Automatic voice memo transcription
- 🧹 Transcript cleaning with customizable vocabulary
- 🔌 Flexible plugin system for content extraction
- 🤖 AI-powered content generation using Ollama
- 📝 Built-in plugins for:
@@ -47,11 +48,30 @@ This will:

## How to use

### Basic Usage

- Use whatever you want to record voice notes (I use [Fossify](https://github.com/FossifyOrg/Voice-Recorder))
- Use whatever you want to sync your files (I use [Syncthing](https://syncthing.net/))
- Use whatever you want to look at the markdown/output files (I use [Zettel Notes](https://www.zettelnotes.com/))
- Run the `./watch.sh` script on an idle machine to get the most out of it

### Transcript Cleaning

VibeLine includes a transcript cleaning feature that corrects common transcription errors based on a customizable vocabulary file. This is especially useful for technical terms, names, or domain-specific jargon that speech recognition models often misinterpret.

To use this feature:

1. Edit the `VOCABULARY.txt` file in the root directory to add your custom corrections:
```
# Format: incorrect_word -> correct_word
Noster -> Nostr
```

2. The transcript cleaner will automatically run as part of the extraction process.

3. You can adjust the behavior with the following option:
- `--no-clean`: Skip the transcript cleaning step entirely
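
The cleaner can also be used programmatically. A minimal sketch, assuming you run from the repository root so that `src` is importable:

```python
from pathlib import Path
from src.transcript_cleaner import TranscriptCleaner

# Load corrections from the vocabulary file in the repository root
cleaner = TranscriptCleaner(vocabulary_file=Path("VOCABULARY.txt"))

cleaned, corrections = cleaner.clean_transcript("I posted the note on Noster.")
print(cleaned)  # -> "I posted the note on Nostr."
print(f"{len(corrections)} correction(s) made")
```

To disable cleaning for a single run, pass `--no-clean` to `src/extract.py`.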

## Contributors

[![Contributors](https://contrib.rocks/image?repo=dergigi/vibeline)](https://github.com/dergigi/vibeline/graphs/contributors)
21 changes: 21 additions & 0 deletions VOCABULARY.txt
@@ -0,0 +1,21 @@
# Vocabulary file for transcript corrections
# Format: incorrect_word -> correct_word

# Nostr-related terms
Noster -> Nostr
Nostril -> Nostr
Nostrum -> Nostr
Nostre -> Nostr
Nostra -> Nostr

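# VibeLine-related terms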
Pipeline -> vibeline
vive line -> vibeline

# Technical terms
API's -> APIs
SDK's -> SDKs
Java script -> JavaScript
Type script -> TypeScript
Pie thon -> Python
Rust Lang -> Rust
Go Lang -> Go
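
One entry covers all casings: lookups are case-insensitive, and single-word replacements keep the original capitalization pattern (see `_apply_direct_corrections` in `src/transcript_cleaner.py` below). A quick illustration with the file above, using hypothetical inputs:

```python
from pathlib import Path
from src.transcript_cleaner import TranscriptCleaner

cleaner = TranscriptCleaner(vocabulary_file=Path("VOCABULARY.txt"))

# The same "Noster -> Nostr" entry covers all three casings:
print(cleaner.clean_transcript("Noster")[0])  # Nostr
print(cleaner.clean_transcript("NOSTER")[0])  # NOSTR  (all-caps input stays all-caps)
print(cleaner.clean_transcript("noster")[0])  # Nostr  (replacement keeps its vocabulary casing)
```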
42 changes: 40 additions & 2 deletions src/extract.py
@@ -11,6 +11,7 @@
from typing import List, Dict
from dotenv import load_dotenv
from plugin_manager import PluginManager, Plugin
from transcript_cleaner import TranscriptCleaner

# Load environment variables
load_dotenv()
@@ -21,6 +22,7 @@
# Configuration from environment variables
OLLAMA_MODEL = os.getenv("OLLAMA_EXTRACT_MODEL", "llama2")
VOICE_MEMOS_DIR = os.getenv("VOICE_MEMOS_DIR", "VoiceMemos")
VOCABULARY_FILE = os.getenv("VOCABULARY_FILE", "VOCABULARY.txt")

# Set a different host (default is http://localhost:11434)
ollama.host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
@@ -84,11 +86,12 @@ def main():
    parser = argparse.ArgumentParser(description='Extract content from transcripts using plugins.')
    parser.add_argument('transcript_file', help='The transcript file to process')
    parser.add_argument('-f', '--force', action='store_true', help='Force overwrite existing output files')
    parser.add_argument('--no-clean', action='store_true', help='Skip transcript cleaning step')
    args = parser.parse_args()

    # Ensure the default model exists
    ensure_model_exists(OLLAMA_MODEL)

    input_file = Path(args.transcript_file)
    if not input_file.exists():
        print(f"Error: File {input_file} does not exist")
@@ -118,7 +121,42 @@ def main():

    # Read transcript
    with open(input_file, 'r', encoding='utf-8') as f:
        original_transcript_text = f.read()

    # Clean transcript if not disabled
    if not args.no_clean:
        print("Cleaning transcript...")

        # Check if vocabulary file exists
        vocabulary_path = Path(VOCABULARY_FILE)
        if not vocabulary_path.exists():
            print(f"Warning: Vocabulary file {VOCABULARY_FILE} not found. Skipping transcript cleaning.")
            transcript_text = original_transcript_text
        else:
            # Initialize transcript cleaner and clean the transcript
            cleaner = TranscriptCleaner(vocabulary_file=vocabulary_path)
            transcript_text, corrections = cleaner.clean_transcript(original_transcript_text)

            # Log corrections
            if corrections:
                print(f"Made {len(corrections)} corrections to the transcript.")

                # List the corrections made
                for i, correction in enumerate(corrections, 1):
                    print(f"{i}. Line {correction['line']}:")
                    print(f"   Original:  {correction['original']}")
                    print(f"   Corrected: {correction['corrected']}")

                # Save cleaned transcript
                cleaned_file = input_file.parent / f"{input_file.stem}_cleaned.txt"
                with open(cleaned_file, 'w', encoding='utf-8') as f:
                    f.write(transcript_text)
                print(f"Cleaned transcript saved to: {cleaned_file}")
            else:
                print("No corrections needed for this transcript.")
                transcript_text = original_transcript_text
    else:
        transcript_text = original_transcript_text

    # Read summary if it exists
    summary_file = input_file.parent / f"{input_file.stem}_summary.txt"
129 changes: 129 additions & 0 deletions src/transcript_cleaner.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python3

import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# ollama import removed as we no longer use LLM for cleaning

class TranscriptCleaner:
    def __init__(self, vocabulary_file: Optional[Path] = None):
        """
        Initialize the transcript cleaner with a vocabulary file.

        Args:
            vocabulary_file: Path to the vocabulary file containing corrections
        """
        self.vocabulary_file = vocabulary_file
        self.corrections = {}

        # Load vocabulary file if provided
        if vocabulary_file and vocabulary_file.exists():
            self._load_vocabulary()

    def _load_vocabulary(self) -> None:
        """Load word corrections from the vocabulary file."""
        with open(self.vocabulary_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue

                # Format: incorrect_word -> correct_word (split on the first arrow only)
                if '->' in line:
                    incorrect, correct = [part.strip() for part in line.split('->', 1)]
                    self.corrections[incorrect.lower()] = correct

    def _apply_direct_corrections(self, text: str) -> str:
        """Apply direct word corrections from the vocabulary file."""
        if not self.corrections:
            return text

        # Split the text into words while preserving whitespace and punctuation
        # This regex captures words, whitespace, and punctuation separately
        tokens = re.findall(r'(\b\w+\b|\s+|[^\w\s])', text)
        corrected_tokens = []

        for token in tokens:
            # Only process word tokens (skip whitespace and punctuation)
            if re.match(r'\b\w+\b', token):
                token_lower = token.lower()

                # Check if this token needs correction
                if token_lower in self.corrections:
                    replacement = self.corrections[token_lower]

                    # Preserve original capitalization
                    if token.isupper():
                        replacement = replacement.upper()
                    elif token[0].isupper():
                        replacement = replacement[0].upper() + replacement[1:]

                    # Use the replacement
                    corrected_tokens.append(replacement)
                else:
                    # No correction needed
                    corrected_tokens.append(token)
            else:
                # Preserve whitespace and punctuation
                corrected_tokens.append(token)

        # Join the tokens back into text
        corrected_text = ''.join(corrected_tokens)

        # Also handle multi-token entries: phrases with spaces, and entries such
        # as "API's" that contain punctuation and can never match a single token
        for incorrect, correct in self.corrections.items():
            if not re.fullmatch(r'\w+', incorrect):
                # Case-insensitive pattern, anchored at word boundaries
                pattern = re.compile(r'\b' + re.escape(incorrect) + r'\b', re.IGNORECASE)

                def replace_phrase(match):
                    original = match.group(0)
                    # Preserve case pattern if possible
                    if original.isupper():
                        return correct.upper()
                    elif original[0].isupper():
                        return correct[0].upper() + correct[1:]
                    return correct

                # Apply the replacement
                corrected_text = pattern.sub(replace_phrase, corrected_text)

        return corrected_text
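
    # Illustration of the two passes above (hypothetical input, for clarity):
    #   "I posted about Noster using Java script"
    #   token pass:   "Noster"      -> "Nostr"
    #   phrase pass:  "Java script" -> "JavaScript"
    #   result:       "I posted about Nostr using JavaScript"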

# No LLM-based methods needed anymore

    def clean_transcript(self, text: str) -> Tuple[str, List[Dict]]:
        """
        Clean the transcript by applying vocabulary corrections.

        Args:
            text: The transcript text to clean

        Returns:
            Tuple containing:
            - The cleaned transcript text
            - A list of corrections made (for logging/debugging)
        """
        original_text = text

        # Apply direct word-for-word corrections
        cleaned_text = self._apply_direct_corrections(text)

        # Log corrections for debugging
        corrections_made = []
        if cleaned_text != original_text:
            # Find differences between original and corrected text
            for i, (orig_line, new_line) in enumerate(zip(original_text.splitlines(), cleaned_text.splitlines())):
                if orig_line != new_line:
                    corrections_made.append({
                        'line': i + 1,
                        'original': orig_line,
                        'corrected': new_line
                    })

        return cleaned_text, corrections_made
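
For reference, a minimal sketch of what `clean_transcript` returns, assuming the `VOCABULARY.txt` above is loaded (the input text is hypothetical):

```python
from pathlib import Path
from src.transcript_cleaner import TranscriptCleaner

cleaner = TranscriptCleaner(vocabulary_file=Path("VOCABULARY.txt"))
text, corrections = cleaner.clean_transcript("Noster is neat.\n")
# text        == "Nostr is neat.\n"
# corrections == [{'line': 1, 'original': 'Noster is neat.', 'corrected': 'Nostr is neat.'}]
```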
78 changes: 78 additions & 0 deletions tests/transcript_cleaner.py
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

import sys
import os
from pathlib import Path
from dotenv import load_dotenv

# Add the repository root to the Python path so the src package is importable
sys.path.append(str(Path(__file__).parent.parent))
from src.transcript_cleaner import TranscriptCleaner

# Load environment variables
load_dotenv()

def main():
    vocabulary_file = Path(os.getenv("VOCABULARY_FILE", "VOCABULARY.txt"))

    # Check if vocabulary file exists
    if not vocabulary_file.exists():
        print(f"Error: Vocabulary file {vocabulary_file} not found.")
        print("Please create it first or specify a different file with the VOCABULARY_FILE environment variable.")
        sys.exit(1)

    # Check command line arguments
    if len(sys.argv) < 2:
        print("Usage: python tests/transcript_cleaner.py <text_to_clean>")
        print("   or: python tests/transcript_cleaner.py --file <transcript_file>")
        sys.exit(1)

    # Get the text to clean
    if sys.argv[1] == "--file":
        if len(sys.argv) < 3:
            print("Error: No file specified after --file")
            sys.exit(1)

        file_path = Path(sys.argv[2])
        if not file_path.exists():
            print(f"Error: File {file_path} not found")
            sys.exit(1)

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        print(f"Cleaning transcript from file: {file_path}")
    else:
        # Use the command line arguments as the text
        text = " ".join(sys.argv[1:])
        print(f"Cleaning text: {text}")

    # Initialize the transcript cleaner
    cleaner = TranscriptCleaner(vocabulary_file=vocabulary_file)

    # Clean the transcript
    cleaned_text, corrections = cleaner.clean_transcript(text)

    # Print the results
    print("\nOriginal text:")
    print("-" * 80)
    print(text)
    print("-" * 80)

    print("\nCleaned text:")
    print("-" * 80)
    print(cleaned_text)
    print("-" * 80)

    # Print corrections
    if corrections:
        print(f"\nMade {len(corrections)} corrections:")
        for i, correction in enumerate(corrections, 1):
            print(f"{i}. Line {correction['line']}:")
            print(f"   Original:  {correction['original']}")
            print(f"   Corrected: {correction['corrected']}")
    else:
        print("\nNo corrections were made.")

if __name__ == "__main__":
    main()