feat(libs): integrate Ollama and Whisper clients with config models

- Add `AppConfig` and `PresetConfig` models using Pydantic for config validation
- Refactor `read_configurations` to return an `AppConfig` instance
- Implement `OllamaClient` for chat-based server interaction
- Implement `WhisperClient` for transcription via Whisper CLI
- Migrate notification utilities to `libs` directory
- Update tray application to use new clients and config structure
- Simplify Whisper and Ollama integration logic in `WhisperWorker`

Signed-off-by: Max P. <Mail@MPassarello.de>
2025-05-05 12:00:33 +02:00
parent 5688769437
commit 58b9cb586c
6 changed files with 182 additions and 88 deletions
--- a/src/pyvtt/configuration.py
+++ b/src/pyvtt/configuration.py
@@ -2,9 +2,11 @@ import json
 import os
 from pathlib import Path
 from pyvtt.models.config import AppConfig
 DEFAULT_CONFIG_PATH = Path.home() / ".pyvtt.json"
-def read_configurations():
+def read_configurations() -> AppConfig:
    """
    Reads the configuration settings from a JSON file named 'pyvtt.settings.json' 
    located in the same directory as the script.
@@ -18,7 +20,8 @@ def read_configurations():
    """
    try:
        with open(DEFAULT_CONFIG_PATH) as f:
-            return json.load(f)
+            raw_config =  json.load(f)
            return AppConfig(**raw_config)
    except Exception as e:
        print(f"Error reading configurations: {e}")
        raise Exception(f"Error reading configurations: {e}")
--- a/src/pyvtt/libs/notify.py
+++ b/src/pyvtt/libs/notify.py
--- a/src/pyvtt/libs/ollama.py
+++ b/src/pyvtt/libs/ollama.py
@@ -0,0 +1,67 @@
 import requests
 from typing import Union, List, Optional
 from pyvtt.libs.notify import notify
 from pyvtt.models.config import AppConfig, PresetConfig
 class OllamaClient:
    """Small client for the chat endpoint (``/api/chat``) of an Ollama server."""

    # (connect, read) timeout handed to requests: give up quickly when the
    # server cannot be reached, but never cut off a long-running generation.
    _TIMEOUT = (10, None)

    def __init__(self, config: AppConfig):
        """
        Store the server address taken from the global application configuration.

        :param config: AppConfig instance providing ``ollama_url`` and ``ollama_port``.
        """
        self.base_url = config.ollama_url
        self.port = config.ollama_port

    def send_chat(
        self,
        user_message: str,
        config: PresetConfig,
    ) -> str:
        """
        Send a chat request to the Ollama server using the given preset.

        :param user_message: Text produced by the user (e.g. the transcript).
        :param config: PresetConfig instance with model, prompt and context settings.
        :return: The answer returned by Ollama with every line stripped;
                 ``user_message`` unchanged when the preset disables Ollama;
                 an empty string when the input is blank or the request fails.
        """
        if config.ollama and config.ollama.lower() == "disable":
            print("[OllamaClient] Ollama ist im Preset deaktiviert.")
            print("[OllamaClient] Gebe die Eingabe unverändert zurück.")
            return user_message

        # A blank transcript (e.g. after a failed transcription) must not be
        # sent to the model: it would answer with made-up content.
        if not user_message or not user_message.strip():
            print("[OllamaClient] Leere Eingabe – Anfrage an Ollama wird übersprungen.")
            return ""

        # Normalise the prompt to a single string; a list becomes one line per item.
        if isinstance(config.ollama_prompt, list):
            prompt_str = "\n".join(config.ollama_prompt)
        else:
            prompt_str = config.ollama_prompt

        # Only include a system message when a prompt is actually configured,
        # otherwise a JSON null would be sent as its content.
        messages = []
        if prompt_str:
            messages.append({"role": "system", "content": prompt_str})
        messages.append({"role": "user", "content": user_message})

        payload = {
            "model": config.ollama_model,
            "messages": messages,
            "options": {
                "num_ctx": config.ollama_context,
            } if config.ollama_context else {},
            "stream": False,
        }

        # Tolerate a trailing slash in the configured URL.
        endpoint = f"{self.base_url.rstrip('/')}:{self.port}/api/chat"

        try:
            response = requests.post(endpoint, json=payload, timeout=self._TIMEOUT)
            response.raise_for_status()
            json_response = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on requests versions
            # whose JSONDecodeError is not a RequestException.
            print(f"[OllamaClient] HTTP-Fehler: {e}")
            notify("Fehler", "Ein Fehler bei der Kommunikation mit 'Ollama' ist aufgetreten!")
            return ""

        # "message" or "content" may be missing or null in the response.
        content = ((json_response.get("message") or {}).get("content") or "").strip()
        return "\n".join(line.strip() for line in content.splitlines())
--- a/src/pyvtt/libs/whisper.py
+++ b/src/pyvtt/libs/whisper.py
@@ -0,0 +1,50 @@
 import subprocess
 from typing import Optional
 from pathlib import Path
 from pyvtt.libs.notify import notify
 from pyvtt.models.config import AppConfig, PresetConfig
 class WhisperClient:
    """Thin wrapper that runs the Whisper command-line binary and reads its text output."""

    def __init__(self, config: AppConfig):
        """
        Store the paths taken from the global application configuration.

        :param config: AppConfig instance with the paths to the Whisper binary,
                       the audio file and the output file.
        """
        self.whisper_path = config.whisper_path
        self.audio_file = config.audio_file
        self.output_file = config.output_file

    @staticmethod
    def _strip_txt_suffix(path: str) -> str:
        """Return *path* without a trailing ``.txt``; other occurrences stay untouched."""
        suffix = ".txt"
        return path[:-len(suffix)] if path.endswith(suffix) else path

    def transcribe(self, config: PresetConfig) -> str:
        """
        Run Whisper (CLI) on the configured audio file and return the transcript.

        :param config: PresetConfig instance with the Whisper model and language.
        :return: The raw transcript with every line stripped, or an empty string
                 when Whisper could not be run or its output could not be read.
        """
        # The base name is passed without extension via "-of"; with "-otxt"
        # the transcript is expected at "<base>.txt", so read exactly that file.
        output_base = self._strip_txt_suffix(self.output_file)
        transcript_path = f"{output_base}.txt"

        whisper_cmd = [
            self.whisper_path,
            "-m", config.whisper_model,
            "-f", self.audio_file,
            "-l", config.language,
            "-otxt",
            "-of", output_base,
        ]

        try:
            subprocess.run(whisper_cmd, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable Whisper binary.
            print(f"[WhisperClient] Whisper-Ausführungsfehler: {e}")
            notify("Fehler", "Ein Fehler mit 'Whisper' ist aufgetreten!")
            return ""

        try:
            with open(transcript_path, "r", encoding="utf-8") as f:
                return "\n".join(line.strip() for line in f)
        except (OSError, UnicodeError) as e:
            print(f"[WhisperClient] Fehler beim Einlesen der Ausgabedatei: {e}")
            notify("Fehler", "Ein Fehler beim Lesen der Whisper-Ausgabe ist aufgetreten!")
            return ""
--- a/src/pyvtt/models/config.py
+++ b/src/pyvtt/models/config.py
@@ -0,0 +1,25 @@
 from typing import List, Optional, Union
 from pydantic import BaseModel, HttpUrl, Field
 class PresetConfig(BaseModel):
    """Settings of one preset: which Whisper model/language to use and how the transcript is post-processed."""
    # Preset label: shown as the tray menu entry and matched against the
    # preset name received over the control socket.
    name: str
    # Language passed to the Whisper CLI via "-l".
    language: str
    # Model passed to the Whisper CLI via "-m".
    whisper_model: str
    # When set to "disable" (case-insensitive), OllamaClient.send_chat returns
    # the transcript unchanged instead of contacting the server.
    ollama: Optional[str] = None
    # Value of the "model" field in the Ollama chat request.
    ollama_model: Optional[str] = None
    # Sent as options.num_ctx in the chat request when set to a truthy value.
    ollama_context: Optional[int] = None
    # System prompt for the chat request; a list is joined with newlines.
    ollama_prompt: Optional[Union[str, List[str]]] = None
    # The tray worker takes its journal branch when this equals "journal".
    mode: Optional[str] = None
    # Used in the journal file name and in the heading written to a new journal file.
    journal_name: Optional[str] = None
 class AppConfig(BaseModel):
    """Global application settings as built from the parsed JSON configuration file."""
    # Recording target of ffmpeg and input file passed to Whisper via "-f".
    audio_file: str
    # Text file from which WhisperClient reads the transcript.
    output_file: str
    # Executable of the Whisper command-line binary.
    whisper_path: str
    # Path of the Unix domain socket the tray application binds and listens on.
    socket_path: str
    # Combined as "{ollama_url}:{ollama_port}/api/chat" to form the chat endpoint.
    ollama_url: str
    ollama_port: int
    # Directory in which the dated journal Markdown files are created.
    journal_path: str
    # Available presets; the first entry is used as the default preset.
    presets: List[PresetConfig]
--- a/src/pyvtt/voice_to_text_tray.py
+++ b/src/pyvtt/voice_to_text_tray.py
@@ -10,10 +10,12 @@ from PyQt5.QtWidgets import QApplication, QSystemTrayIcon, QMenu, QAction
 from PyQt5.QtGui import QIcon
 from PyQt5.QtCore import QThread, pyqtSignal
 from pyvtt.configuration import read_configurations
-from pyvtt.notify import notify, play_sound
+from pyvtt.libs.notify import notify, play_sound
 from pyvtt.libs.ollama import OllamaClient
 from pyvtt.libs.whisper import WhisperClient
-CONFIGURATION = read_configurations()
+CONFIG = read_configurations()
-CURRENT_PRESET = CONFIGURATION["presets"][0]  # Default to first preset
+CURRENT_PRESET = CONFIG.presets[0]  # Default to first preset
 class WhisperWorker(QThread):
    """
@@ -31,82 +33,28 @@ class WhisperWorker(QThread):
    """
    finished = pyqtSignal(str)
    def __init__(self):
        super().__init__()
        self.whisper = WhisperClient(CONFIG)
        self.ollama = OllamaClient(CONFIG)
    def run(self):
-        CURENT_CONFIGURATION_LOCALE = CONFIGURATION
+        CURENT_CONFIG_LOCALE = CONFIG
        CURRENT_PRESET_LOCALE = CURRENT_PRESET
        try:
-            # Whisper ausführen
+            raw_result = self.whisper.transcribe(CURRENT_PRESET_LOCALE)
-            whisper_cmd = [
+            formatted_result = self.ollama.send_chat(raw_result, CURRENT_PRESET_LOCALE)
                CURENT_CONFIGURATION_LOCALE["whisper_path"],
                "-m", CURRENT_PRESET_LOCALE["whisper_model"],
                "-f", CURENT_CONFIGURATION_LOCALE["audio_file"],
                "-l", CURRENT_PRESET_LOCALE["language"],
                "-otxt",
                "-of", CURENT_CONFIGURATION_LOCALE["output_file"].replace(".txt", "")
            ]
            try:
                subprocess.run(whisper_cmd, check=True)
            except subprocess.CalledProcessError as e:
                print(f"Whisper Fehler: {e}")
                notify("Fehler", "Ein Fehler mit 'Whisper' ist aufgetreten!")
                return
            try:
                with open(CURENT_CONFIGURATION_LOCALE["output_file"], "r") as f:
                    raw_result = "\n".join(line.strip() for line in f.readlines())
            except Exception as e:
                print(f"Datei Fehler: {e}")
                notify("Fehler", "Ein Fehler beim Lesen der Whisper-Ausgabe ist aufgetreten!")
                return
            print("Whisper Transkript erhalten.")
            # --- An Ollama schicken ---
            if CURRENT_PRESET_LOCALE["ollama"] != "disable":
                if isinstance(CURRENT_PRESET_LOCALE["ollama_prompt"], list):
                    prompt = "\n".join(CURRENT_PRESET_LOCALE["ollama_prompt"])
                else:
                    prompt = CURRENT_PRESET_LOCALE["ollama_prompt"]
                payload = {
                    "model": CURRENT_PRESET_LOCALE["ollama_model"],
                    "messages": [
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": raw_result}
                    ],
                    "options": {
                        "num_ctx": CURRENT_PRESET_LOCALE["ollama_context"]
                    },
                    "stream": False
                }
                ollama_endpoint = f"{CURENT_CONFIGURATION_LOCALE['ollama_url']}:{CURENT_CONFIGURATION_LOCALE['ollama_port']}/api/chat"
                response = requests.post(ollama_endpoint, json=payload)
                try:
                    response.raise_for_status()
                except requests.exceptions.HTTPError as e:
                    print(f"HTTP Fehler: {e}")
                    notify("Fehler", "Ein Fehler bei der Kommunikation mit 'Ollama' ist aufgetreten!")
                    return
                json_response = response.json()
                formatted_result = json_response.get("message", {}).get("content", "").strip()
                formatted_result = "\n".join(line.strip() for line in formatted_result.splitlines())
                print("Ollama Antwort erhalten.")
            else:
                formatted_result = raw_result
                print("Kein Ollama Prompt angegeben, nur Whisper Ergebnis verwendet.")
            # Ergebnis ins Clipboard kopieren
-            if CURRENT_PRESET_LOCALE.get("mode") == "journal":
+            if CURRENT_PRESET_LOCALE.mode == "journal":
                today = datetime.date.today().strftime("%Y.%m.%d")
-                journal_path = os.path.join(CURENT_CONFIGURATION_LOCALE["journal_path"], f"{today} - {CURRENT_PRESET_LOCALE['journal_name']}.md")
+                journal_path = os.path.join(CURENT_CONFIG_LOCALE.journal_path, f"{today} - {CURRENT_PRESET_LOCALE.journal_name}.md")
                now = datetime.datetime.now().strftime("%H:%M:%S")
                if not os.path.exists(journal_path):
                    try:
                        with open(journal_path, "w") as f:
-                            f.write(f"# {CURRENT_PRESET_LOCALE['journal_name']} - {today}\n\n")
+                            f.write(f"# {CURRENT_PRESET_LOCALE.journal_name} - {today}\n\n")
                    except Exception as e:
                        print(f"Journal Erstellungsfehler: {e}")
                        notify("Fehler", "Ein Fehler beim Erstellen des Journals ist aufgetreten!")
@@ -157,11 +105,11 @@ class SocketListener(threading.Thread):
    def __init__(self, tray_app):
        super().__init__(daemon=True)
        self.tray_app = tray_app
-        if os.path.exists(CONFIGURATION["socket_path"]):
+        if os.path.exists(CONFIG.socket_path):
-            os.remove(CONFIGURATION["socket_path"])
+            os.remove(CONFIG.socket_path)
        self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
-        self.sock.bind(CONFIGURATION["socket_path"])
+        self.sock.bind(CONFIG.socket_path)
-        os.chmod(CONFIGURATION["socket_path"], 0o666)
+        os.chmod(CONFIG.socket_path, 0o666)
        self.sock.listen(1)
    def run(self):
@@ -174,8 +122,8 @@ class SocketListener(threading.Thread):
                    if len(cmd) > 1:
                        data = cmd[0]
                        preset = cmd[1]
-                        if preset in [p["name"] for p in CONFIGURATION["presets"]]:
+                        if preset in [p.name for p in CONFIG.presets]:
-                            self.tray_app.set_preset([p["name"] for p in CONFIGURATION["presets"]].index(preset))
+                            self.tray_app.set_preset([p.name for p in CONFIG.presets].index(preset))
                    else:
                        data = cmd[0]
                if data == "toggle":
@@ -224,8 +172,8 @@ class TrayApp:
        # Preset Menü
        self.preset_actions = []
        self.preset_group = QMenu("Presets")
-        for i, preset in enumerate(CONFIGURATION["presets"]):
+        for i, preset in enumerate(CONFIG.presets):
-            action = QAction(preset["name"], self.menu)
+            action = QAction(preset.name, self.menu)
            action.setCheckable(True)
            if i == 0:
                action.setChecked(True)
@@ -255,8 +203,9 @@ class TrayApp:
    def set_preset(self, index):
        global CURRENT_PRESET
-        print(f"Preset gewechselt: {CONFIGURATION['presets'][index]['name']}")
+        selected_preset = CONFIG.presets[index]
-        CURRENT_PRESET = CONFIGURATION["presets"][index]
+        print(f"Preset gewechselt: {selected_preset.name}")
        CURRENT_PRESET = selected_preset
        # Nur einer darf gecheckt sein
        for i, action in enumerate(self.preset_actions):
            action.setChecked(i == index)
@@ -266,7 +215,7 @@ class TrayApp:
            print("Starte Aufnahme...")
            self.recording_process = subprocess.Popen([
                "ffmpeg", "-f", "pulse", "-i", "default", "-ar", "16000",
-                "-ac", "1", CONFIGURATION["audio_file"], "-y", "-loglevel", "quiet"
+                "-ac", "1", CONFIG.audio_file, "-y", "-loglevel", "quiet"
            ])
            notify("Aufnahme", "Aufnahme gestartet!")
@@ -294,15 +243,15 @@ class TrayApp:
        print(f"Fertig:\n{text}")
    def reload_configurations(self):
-        global CONFIGURATION, CURRENT_PRESET
+        global CONFIG, CURRENT_PRESET
        print("Lade Einstellungen neu...")
-        CONFIGURATION = read_configurations()
+        CONFIG = read_configurations()
-        CURRENT_PRESET = CONFIGURATION["presets"][0]  # Default to first preset
+        CURRENT_PRESET = CONFIG.presets[0]  # Default to first preset
        # Update preset menu
        self.preset_group.clear()
        self.preset_actions = []
-        for i, preset in enumerate(CONFIGURATION["presets"]):
+        for i, preset in enumerate(CONFIG.presets):
-            action = QAction(preset["name"], self.menu)
+            action = QAction(preset.name, self.menu)
            action.setCheckable(True)
            if i == 0:
                action.setChecked(True)
@@ -312,8 +261,8 @@ class TrayApp:
        print("Einstellungen erfolgreich neu geladen.")
    def cleanup(self):
-        if os.path.exists(CONFIGURATION["socket_path"]):
+        if os.path.exists(CONFIG.socket_path):
-            os.remove(CONFIGURATION["socket_path"])
+            os.remove(CONFIG.socket_path)
        print("Socket sauber entfernt.")
    def run(self):