From 58b9cb586ce2e66a50eb28045654775dda67445e Mon Sep 17 00:00:00 2001
From: "Max P." <Mail@MPassarello.de>
Date: Mon, 5 May 2025 12:00:33 +0200
Subject: [PATCH] feat(libs): integrate Ollama and Whisper clients with config
 models

- Add `AppConfig` and `PresetConfig` models using Pydantic for config validation
- Refactor `read_configurations` to return an `AppConfig` instance
- Implement `OllamaClient` for chat-based server interaction
- Implement `WhisperClient` for transcription via Whisper CLI
- Migrate notification utilities to `libs` directory
- Update tray application to use new clients and config structure
- Simplify Whisper and Ollama integration logic in `WhisperWorker`

Signed-off-by: Max P. <Mail@MPassarello.de>
---
 src/pyvtt/configuration.py      |   7 +-
 src/pyvtt/{ => libs}/notify.py  |   0
 src/pyvtt/libs/ollama.py        |  67 ++++++++++++++++++
 src/pyvtt/libs/whisper.py       |  50 +++++++++++++
 src/pyvtt/models/config.py      |  25 +++++++
 src/pyvtt/voice_to_text_tray.py | 121 +++++++++-----------------------
 6 files changed, 182 insertions(+), 88 deletions(-)
 rename src/pyvtt/{ => libs}/notify.py (100%)
 create mode 100644 src/pyvtt/libs/ollama.py
 create mode 100644 src/pyvtt/libs/whisper.py
 create mode 100644 src/pyvtt/models/config.py

diff --git a/src/pyvtt/configuration.py b/src/pyvtt/configuration.py
index cdcac36..803defb 100644
--- a/src/pyvtt/configuration.py
+++ b/src/pyvtt/configuration.py
@@ -2,9 +2,11 @@ import json
 import os
 from pathlib import Path
 
+from pyvtt.models.config import AppConfig
+
 DEFAULT_CONFIG_PATH = Path.home() / ".pyvtt.json"
 
-def read_configurations():
+def read_configurations() -> AppConfig:
     """
     Reads the configuration settings from a JSON file named 'pyvtt.settings.json' 
     located in the same directory as the script.
@@ -18,7 +20,8 @@ def read_configurations():
     """
     try:
         with open(DEFAULT_CONFIG_PATH) as f:
-            return json.load(f)
+            raw_config =  json.load(f)
+            return AppConfig(**raw_config)
     except Exception as e:
         print(f"Error reading configurations: {e}")
         raise Exception(f"Error reading configurations: {e}")
\ No newline at end of file
diff --git a/src/pyvtt/notify.py b/src/pyvtt/libs/notify.py
similarity index 100%
rename from src/pyvtt/notify.py
rename to src/pyvtt/libs/notify.py
diff --git a/src/pyvtt/libs/ollama.py b/src/pyvtt/libs/ollama.py
new file mode 100644
index 0000000..e175c7d
--- /dev/null
+++ b/src/pyvtt/libs/ollama.py
@@ -0,0 +1,67 @@
+import requests
+from typing import Union, List, Optional
+
+from pyvtt.libs.notify import notify
+from pyvtt.models.config import AppConfig, PresetConfig
+
+
+class OllamaClient:
+    """Minimal client for the chat endpoint of an Ollama server."""
+
+    # (connect, read) timeout in seconds so a dead server cannot block forever.
+    REQUEST_TIMEOUT = (10, 600)
+
+    def __init__(self, config: AppConfig):
+        """
+        Store the server address taken from the application configuration.
+
+        :param config: AppConfig providing ``ollama_url`` and ``ollama_port``.
+        """
+        # Drop a trailing slash so the port is not appended after it.
+        self.base_url = config.ollama_url.rstrip("/")
+        self.port = config.ollama_port
+
+    def send_chat(
+        self,
+        user_message: str,
+        config: PresetConfig,
+    ) -> str:
+        """
+        Send ``user_message`` to the Ollama chat endpoint using the preset.
+
+        :param user_message: Text to process (e.g. a transcript).
+        :param config: PresetConfig with model, prompt and context settings.
+        :return: The reply with every line stripped; ``user_message`` unchanged
+                 when the preset disables Ollama; ``""`` on request errors.
+        """
+        if config.ollama and config.ollama.lower() == "disable":
+            print("[OllamaClient] Ollama ist im Preset deaktiviert.")
+            print("[OllamaClient] Gebe die Eingabe unverändert zurück.")
+            return user_message
+
+        # A prompt given as a list is joined into one newline-separated string.
+        prompt = config.ollama_prompt
+        if isinstance(prompt, list):
+            prompt = "\n".join(prompt)
+
+        # Only send a system message when the preset defines a prompt.
+        messages = [{"role": "system", "content": prompt}] if prompt else []
+        messages.append({"role": "user", "content": user_message})
+        payload = {
+            "model": config.ollama_model,
+            "messages": messages,
+            "options": {"num_ctx": config.ollama_context} if config.ollama_context else {},
+            "stream": False,
+        }
+        endpoint = f"{self.base_url}:{self.port}/api/chat"
+
+        try:
+            response = requests.post(endpoint, json=payload, timeout=self.REQUEST_TIMEOUT)
+            response.raise_for_status()
+            content = (response.json().get("message") or {}).get("content") or ""
+            return "\n".join(line.strip() for line in content.strip().splitlines())
+        except (requests.exceptions.RequestException, ValueError) as e:
+            # ValueError covers a non-JSON body on older ``requests`` releases.
+            print(f"[OllamaClient] HTTP-Fehler: {e}")
+            notify("Fehler", "Ein Fehler bei der Kommunikation mit 'Ollama' ist aufgetreten!")
+            return ""
diff --git a/src/pyvtt/libs/whisper.py b/src/pyvtt/libs/whisper.py
new file mode 100644
index 0000000..c4bab4c
--- /dev/null
+++ b/src/pyvtt/libs/whisper.py
@@ -0,0 +1,50 @@
+import subprocess
+from typing import Optional
+from pathlib import Path
+
+from pyvtt.libs.notify import notify
+from pyvtt.models.config import AppConfig, PresetConfig
+
+
+class WhisperClient:
+    def __init__(self, config: AppConfig):
+        """Store the Whisper binary, audio input and text output paths from ``config``."""
+        self.whisper_path = config.whisper_path
+        self.audio_file = config.audio_file
+        self.output_file = config.output_file
+
+    def transcribe(self, config: PresetConfig) -> str:
+        """
+        Run the Whisper CLI on the audio file and return the transcript.
+
+        :param config: PresetConfig providing ``whisper_model`` and ``language``.
+        :return: Transcript with every line stripped, or ``""`` on any error.
+        """
+        # "-of" gets the path without extension; strip only a trailing ".txt"
+        # (str.replace would also remove ".txt" from directory names).
+        base = self.output_file
+        output_base = base[:-4] if base.endswith(".txt") else base
+        whisper_cmd = [
+            self.whisper_path,
+            "-m", config.whisper_model,
+            "-f", self.audio_file,
+            "-l", config.language,
+            "-otxt", "-of", output_base,
+        ]
+
+        try:
+            subprocess.run(whisper_cmd, check=True)
+        except (subprocess.CalledProcessError, OSError) as e:
+            # OSError: the binary is missing or not executable.
+            print(f"[WhisperClient] Whisper-Ausführungsfehler: {e}")
+            notify("Fehler", "Ein Fehler mit 'Whisper' ist aufgetreten!")
+            return ""
+
+        try:
+            # Read the file Whisper presumably wrote: the "-of" base plus ".txt".
+            with open(output_base + ".txt", "r", encoding="utf-8") as f:
+                return "\n".join(line.strip() for line in f)
+        except Exception as e:
+            print(f"[WhisperClient] Fehler beim Einlesen der Ausgabedatei: {e}")
+            notify("Fehler", "Ein Fehler beim Lesen der Whisper-Ausgabe ist aufgetreten!")
+            return ""
diff --git a/src/pyvtt/models/config.py b/src/pyvtt/models/config.py
new file mode 100644
index 0000000..e10c829
--- /dev/null
+++ b/src/pyvtt/models/config.py
@@ -0,0 +1,25 @@
+from typing import List, Optional, Union
+from pydantic import BaseModel, HttpUrl, Field
+
+
+class PresetConfig(BaseModel):  # One selectable preset: Whisper settings plus optional Ollama post-processing.
+    name: str  # Label shown in the tray preset menu; also matched against the preset named in a socket command.
+    language: str  # Passed to the Whisper CLI as the "-l" argument.
+    whisper_model: str  # Passed to the Whisper CLI as the "-m" argument.
+    ollama: Optional[str] = None  # "disable" (any letter case) makes OllamaClient return its input unchanged.
+    ollama_model: Optional[str] = None  # Sent as "model" in the Ollama chat payload.
+    ollama_context: Optional[int] = None  # Sent as options.num_ctx when truthy; otherwise empty options are sent.
+    ollama_prompt: Optional[Union[str, List[str]]] = None  # System prompt; a list is joined with newlines.
+    mode: Optional[str] = None  # "journal" selects the journal-file branch in WhisperWorker.run.
+    journal_name: Optional[str] = None  # Used in the journal file name and in its heading line.
+
+
+class AppConfig(BaseModel):  # Application-wide settings built from the parsed JSON configuration.
+    audio_file: str  # ffmpeg recording target and Whisper "-f" input.
+    output_file: str  # Transcript file that is read back after Whisper has run.
+    whisper_path: str  # Whisper CLI executable (first element of the command line).
+    socket_path: str  # Path of the Unix domain socket the tray app binds for commands.
+    ollama_url: str  # Ollama base URL; the endpoint is built as "<url>:<port>/api/chat".
+    ollama_port: int  # Port appended to ollama_url.
+    journal_path: str  # Directory in which journal files are created.
+    presets: List[PresetConfig]  # Available presets; the first one is the startup default.
diff --git a/src/pyvtt/voice_to_text_tray.py b/src/pyvtt/voice_to_text_tray.py
index 75cca43..ccc6bb1 100755
--- a/src/pyvtt/voice_to_text_tray.py
+++ b/src/pyvtt/voice_to_text_tray.py
@@ -10,10 +10,12 @@ from PyQt5.QtWidgets import QApplication, QSystemTrayIcon, QMenu, QAction
 from PyQt5.QtGui import QIcon
 from PyQt5.QtCore import QThread, pyqtSignal
 from pyvtt.configuration import read_configurations
-from pyvtt.notify import notify, play_sound
+from pyvtt.libs.notify import notify, play_sound
+from pyvtt.libs.ollama import OllamaClient
+from pyvtt.libs.whisper import WhisperClient
 
-CONFIGURATION = read_configurations()
-CURRENT_PRESET = CONFIGURATION["presets"][0]  # Default to first preset
+CONFIG = read_configurations()
+CURRENT_PRESET = CONFIG.presets[0]  # Default to first preset
 
 class WhisperWorker(QThread):
     """
@@ -31,82 +33,28 @@ class WhisperWorker(QThread):
     """
     finished = pyqtSignal(str)
 
+    def __init__(self):  # Build the Whisper and Ollama clients used by run().
+        super().__init__()
+        self.whisper = WhisperClient(CONFIG)  # Clients copy their settings from the CONFIG current at construction.
+        self.ollama = OllamaClient(CONFIG)  # NOTE(review): confirm workers are re-created after reload_configurations().
+
     def run(self):
-        CURENT_CONFIGURATION_LOCALE = CONFIGURATION
+        CURENT_CONFIG_LOCALE = CONFIG
         CURRENT_PRESET_LOCALE = CURRENT_PRESET
 
         try:
-            # Whisper ausführen
-            whisper_cmd = [
-                CURENT_CONFIGURATION_LOCALE["whisper_path"],
-                "-m", CURRENT_PRESET_LOCALE["whisper_model"],
-                "-f", CURENT_CONFIGURATION_LOCALE["audio_file"],
-                "-l", CURRENT_PRESET_LOCALE["language"],
-                "-otxt",
-                "-of", CURENT_CONFIGURATION_LOCALE["output_file"].replace(".txt", "")
-            ]
-            try:
-                subprocess.run(whisper_cmd, check=True)
-            except subprocess.CalledProcessError as e:
-                print(f"Whisper Fehler: {e}")
-                notify("Fehler", "Ein Fehler mit 'Whisper' ist aufgetreten!")
-                return
-            
-            try:
-                with open(CURENT_CONFIGURATION_LOCALE["output_file"], "r") as f:
-                    raw_result = "\n".join(line.strip() for line in f.readlines())
-            except Exception as e:
-                print(f"Datei Fehler: {e}")
-                notify("Fehler", "Ein Fehler beim Lesen der Whisper-Ausgabe ist aufgetreten!")
-                return
-            
-            print("Whisper Transkript erhalten.")
-
-            # --- An Ollama schicken ---
-            if CURRENT_PRESET_LOCALE["ollama"] != "disable":
-                if isinstance(CURRENT_PRESET_LOCALE["ollama_prompt"], list):
-                    prompt = "\n".join(CURRENT_PRESET_LOCALE["ollama_prompt"])
-                else:
-                    prompt = CURRENT_PRESET_LOCALE["ollama_prompt"]
-                
-                payload = {
-                    "model": CURRENT_PRESET_LOCALE["ollama_model"],
-                    "messages": [
-                        {"role": "system", "content": prompt},
-                        {"role": "user", "content": raw_result}
-                    ],
-                    "options": {
-                        "num_ctx": CURRENT_PRESET_LOCALE["ollama_context"]
-                    },
-                    "stream": False
-                }
-                ollama_endpoint = f"{CURENT_CONFIGURATION_LOCALE['ollama_url']}:{CURENT_CONFIGURATION_LOCALE['ollama_port']}/api/chat"
-                response = requests.post(ollama_endpoint, json=payload)
-
-                try:
-                    response.raise_for_status()
-                except requests.exceptions.HTTPError as e:
-                    print(f"HTTP Fehler: {e}")
-                    notify("Fehler", "Ein Fehler bei der Kommunikation mit 'Ollama' ist aufgetreten!")
-                    return
-
-                json_response = response.json()
-                formatted_result = json_response.get("message", {}).get("content", "").strip()
-                formatted_result = "\n".join(line.strip() for line in formatted_result.splitlines())
-                print("Ollama Antwort erhalten.")
-            else:
-                formatted_result = raw_result
-                print("Kein Ollama Prompt angegeben, nur Whisper Ergebnis verwendet.")
+            raw_result = self.whisper.transcribe(CURRENT_PRESET_LOCALE)
+            formatted_result = self.ollama.send_chat(raw_result, CURRENT_PRESET_LOCALE)
 
             # Ergebnis ins Clipboard kopieren
-            if CURRENT_PRESET_LOCALE.get("mode") == "journal":
+            if CURRENT_PRESET_LOCALE.mode == "journal":
                 today = datetime.date.today().strftime("%Y.%m.%d")
-                journal_path = os.path.join(CURENT_CONFIGURATION_LOCALE["journal_path"], f"{today} - {CURRENT_PRESET_LOCALE['journal_name']}.md")
+                journal_path = os.path.join(CURENT_CONFIG_LOCALE.journal_path, f"{today} - {CURRENT_PRESET_LOCALE.journal_name}.md")
                 now = datetime.datetime.now().strftime("%H:%M:%S")
                 if not os.path.exists(journal_path):
                     try:
                         with open(journal_path, "w") as f:
-                            f.write(f"# {CURRENT_PRESET_LOCALE['journal_name']} - {today}\n\n")
+                            f.write(f"# {CURRENT_PRESET_LOCALE.journal_name} - {today}\n\n")
                     except Exception as e:
                         print(f"Journal Erstellungsfehler: {e}")
                         notify("Fehler", "Ein Fehler beim Erstellen des Journals ist aufgetreten!")
@@ -157,11 +105,11 @@ class SocketListener(threading.Thread):
     def __init__(self, tray_app):
         super().__init__(daemon=True)
         self.tray_app = tray_app
-        if os.path.exists(CONFIGURATION["socket_path"]):
-            os.remove(CONFIGURATION["socket_path"])
+        if os.path.exists(CONFIG.socket_path):
+            os.remove(CONFIG.socket_path)
         self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
-        self.sock.bind(CONFIGURATION["socket_path"])
-        os.chmod(CONFIGURATION["socket_path"], 0o666)
+        self.sock.bind(CONFIG.socket_path)
+        os.chmod(CONFIG.socket_path, 0o666)
         self.sock.listen(1)
 
     def run(self):
@@ -174,8 +122,8 @@ class SocketListener(threading.Thread):
                     if len(cmd) > 1:
                         data = cmd[0]
                         preset = cmd[1]
-                        if preset in [p["name"] for p in CONFIGURATION["presets"]]:
-                            self.tray_app.set_preset([p["name"] for p in CONFIGURATION["presets"]].index(preset))
+                        if preset in [p.name for p in CONFIG.presets]:
+                            self.tray_app.set_preset([p.name for p in CONFIG.presets].index(preset))
                     else:
                         data = cmd[0]
                 if data == "toggle":
@@ -224,8 +172,8 @@ class TrayApp:
         # Preset Menü
         self.preset_actions = []
         self.preset_group = QMenu("Presets")
-        for i, preset in enumerate(CONFIGURATION["presets"]):
-            action = QAction(preset["name"], self.menu)
+        for i, preset in enumerate(CONFIG.presets):
+            action = QAction(preset.name, self.menu)
             action.setCheckable(True)
             if i == 0:
                 action.setChecked(True)
@@ -255,8 +203,9 @@ class TrayApp:
 
     def set_preset(self, index):
         global CURRENT_PRESET
-        print(f"Preset gewechselt: {CONFIGURATION['presets'][index]['name']}")
-        CURRENT_PRESET = CONFIGURATION["presets"][index]
+        selected_preset = CONFIG.presets[index]
+        print(f"Preset gewechselt: {selected_preset.name}")
+        CURRENT_PRESET = selected_preset
         # Nur einer darf gecheckt sein
         for i, action in enumerate(self.preset_actions):
             action.setChecked(i == index)
@@ -266,7 +215,7 @@ class TrayApp:
             print("Starte Aufnahme...")
             self.recording_process = subprocess.Popen([
                 "ffmpeg", "-f", "pulse", "-i", "default", "-ar", "16000",
-                "-ac", "1", CONFIGURATION["audio_file"], "-y", "-loglevel", "quiet"
+                "-ac", "1", CONFIG.audio_file, "-y", "-loglevel", "quiet"
             ])
             notify("Aufnahme", "Aufnahme gestartet!")
 
@@ -294,15 +243,15 @@ class TrayApp:
         print(f"Fertig:\n{text}")
 
     def reload_configurations(self):
-        global CONFIGURATION, CURRENT_PRESET
+        global CONFIG, CURRENT_PRESET
         print("Lade Einstellungen neu...")
-        CONFIGURATION = read_configurations()
-        CURRENT_PRESET = CONFIGURATION["presets"][0]  # Default to first preset
+        CONFIG = read_configurations()
+        CURRENT_PRESET = CONFIG.presets[0]  # Default to first preset
         # Update preset menu
         self.preset_group.clear()
         self.preset_actions = []
-        for i, preset in enumerate(CONFIGURATION["presets"]):
-            action = QAction(preset["name"], self.menu)
+        for i, preset in enumerate(CONFIG.presets):
+            action = QAction(preset.name, self.menu)
             action.setCheckable(True)
             if i == 0:
                 action.setChecked(True)
@@ -312,8 +261,8 @@ class TrayApp:
         print("Einstellungen erfolgreich neu geladen.")
 
     def cleanup(self):
-        if os.path.exists(CONFIGURATION["socket_path"]):
-            os.remove(CONFIGURATION["socket_path"])
+        if os.path.exists(CONFIG.socket_path):
+            os.remove(CONFIG.socket_path)
         print("Socket sauber entfernt.")
 
     def run(self):