From 58b9cb586ce2e66a50eb28045654775dda67445e Mon Sep 17 00:00:00 2001 From: "Max P." Date: Mon, 5 May 2025 12:00:33 +0200 Subject: [PATCH] feat(libs): integrate Ollama and Whisper clients with config models - Add `AppConfig` and `PresetConfig` models using Pydantic for config validation - Refactor `read_configurations` to return an `AppConfig` instance - Implement `OllamaClient` for chat-based server interaction - Implement `WhisperClient` for transcription via Whisper CLI - Migrate notification utilities to `libs` directory - Update tray application to use new clients and config structure - Simplify Whisper and Ollama integration logic in `WhisperWorker` Signed-off-by: Max P. --- src/pyvtt/configuration.py | 7 +- src/pyvtt/{ => libs}/notify.py | 0 src/pyvtt/libs/ollama.py | 67 ++++++++++++++++++ src/pyvtt/libs/whisper.py | 50 +++++++++++++ src/pyvtt/models/config.py | 25 +++++++ src/pyvtt/voice_to_text_tray.py | 121 +++++++++----------------------- 6 files changed, 182 insertions(+), 88 deletions(-) rename src/pyvtt/{ => libs}/notify.py (100%) create mode 100644 src/pyvtt/libs/ollama.py create mode 100644 src/pyvtt/libs/whisper.py create mode 100644 src/pyvtt/models/config.py diff --git a/src/pyvtt/configuration.py b/src/pyvtt/configuration.py index cdcac36..803defb 100644 --- a/src/pyvtt/configuration.py +++ b/src/pyvtt/configuration.py @@ -2,9 +2,11 @@ import json import os from pathlib import Path +from pyvtt.models.config import AppConfig + DEFAULT_CONFIG_PATH = Path.home() / ".pyvtt.json" -def read_configurations(): +def read_configurations() -> AppConfig: """ Reads the configuration settings from a JSON file named 'pyvtt.settings.json' located in the same directory as the script. @@ -18,7 +20,8 @@ def read_configurations(): """ try: with open(DEFAULT_CONFIG_PATH) as f: - return json.load(f) + raw_config = json.load(f) + return AppConfig(**raw_config) except Exception as e: print(f"Error reading configurations: {e}") raise Exception(f"Error reading configurations: {e}") \ No newline at end of file diff --git a/src/pyvtt/notify.py b/src/pyvtt/libs/notify.py similarity index 100% rename from src/pyvtt/notify.py rename to src/pyvtt/libs/notify.py diff --git a/src/pyvtt/libs/ollama.py b/src/pyvtt/libs/ollama.py new file mode 100644 index 0000000..e175c7d --- /dev/null +++ b/src/pyvtt/libs/ollama.py @@ -0,0 +1,67 @@ +import requests +from typing import Union, List, Optional + +from pyvtt.libs.notify import notify +from pyvtt.models.config import AppConfig, PresetConfig + + +class OllamaClient: + def __init__(self, config: AppConfig): + """ + Initialisiert den Ollama-Client mit der Basis-Konfiguration aus der globalen App-Konfiguration. + + :param config: AppConfig-Instanz mit Host und Port für den Ollama-Server. + """ + self.base_url = config.ollama_url + self.port = config.ollama_port + + def send_chat( + self, + user_message: str, + config: PresetConfig, + ) -> str: + """ + Sendet eine Chat-Anfrage an den Ollama-Server basierend auf der spezifischen Preset-Konfiguration. + + :param user_message: Der vom Nutzer erzeugte Eingabetext (z. B. Transkript). + :param config: PresetConfig-Instanz mit modell-, prompt- und kontextbezogenen Parametern. + :return: Der von Ollama zurückgegebene, formatierte Antworttext, die user_message + unverändert zurückgibt, wenn Ollama deaktiviert ist oder none bei einem Fehler. + """ + if config.ollama and config.ollama.lower() == "disable": + print("[OllamaClient] Ollama ist im Preset deaktiviert.") + print("[OllamaClient] Gebe die Eingabe unverändert zurück.") + return user_message + + # Prompt als String aufbereiten – Liste wird zu Zeilen verbunden + if isinstance(config.ollama_prompt, list): + prompt_str = "\n".join(config.ollama_prompt) + else: + prompt_str = config.ollama_prompt + + # Payload für die API-Anfrage vorbereiten + payload = { + "model": config.ollama_model, + "messages": [ + {"role": "system", "content": prompt_str}, + {"role": "user", "content": user_message} + ], + "options": { + "num_ctx": config.ollama_context, + } if config.ollama_context else {}, + "stream": False + } + + endpoint = f"{self.base_url}:{self.port}/api/chat" + + # Anfrage an Ollama senden und Antwort extrahieren + try: + response = requests.post(endpoint, json=payload) + response.raise_for_status() + json_response = response.json() + content = json_response.get("message", {}).get("content", "").strip() + return "\n".join(line.strip() for line in content.splitlines()) + except requests.exceptions.RequestException as e: + print(f"[OllamaClient] HTTP-Fehler: {e}") + notify("Fehler", "Ein Fehler bei der Kommunikation mit 'Ollama' ist aufgetreten!") + return "" diff --git a/src/pyvtt/libs/whisper.py b/src/pyvtt/libs/whisper.py new file mode 100644 index 0000000..c4bab4c --- /dev/null +++ b/src/pyvtt/libs/whisper.py @@ -0,0 +1,50 @@ +import subprocess +from typing import Optional +from pathlib import Path + +from pyvtt.libs.notify import notify +from pyvtt.models.config import AppConfig, PresetConfig + + +class WhisperClient: + def __init__(self, config: AppConfig): + """ + Initialisiert den Whisper-Client mit der globalen Anwendungskonfiguration. + + :param config: AppConfig-Instanz mit Pfaden zur Whisper-Binary, Audio- und Ausgabedatei. + """ + self.whisper_path = config.whisper_path + self.audio_file = config.audio_file + self.output_file = config.output_file + + def transcribe(self, config: PresetConfig) -> str: + """ + Führt Whisper (CLI) zur Transkription der Audiodatei aus und gibt das Transkript zurück. + + :param config: PresetConfig-Instanz mit Whisper-Modell und Spracheinstellungen. + :return: Das rohe Transkript als String – oder None bei Fehlern. + """ + output_base = self.output_file.replace(".txt", "") + whisper_cmd = [ + self.whisper_path, + "-m", config.whisper_model, + "-f", self.audio_file, + "-l", config.language, + "-otxt", + "-of", output_base + ] + + try: + subprocess.run(whisper_cmd, check=True) + except subprocess.CalledProcessError as e: + print(f"[WhisperClient] Whisper-Ausführungsfehler: {e}") + notify("Fehler", "Ein Fehler mit 'Whisper' ist aufgetreten!") + return "" + + try: + with open(self.output_file, "r", encoding="utf-8") as f: + return "\n".join(line.strip() for line in f.readlines()) + except Exception as e: + print(f"[WhisperClient] Fehler beim Einlesen der Ausgabedatei: {e}") + notify("Fehler", "Ein Fehler beim Lesen der Whisper-Ausgabe ist aufgetreten!") + return "" diff --git a/src/pyvtt/models/config.py b/src/pyvtt/models/config.py new file mode 100644 index 0000000..e10c829 --- /dev/null +++ b/src/pyvtt/models/config.py @@ -0,0 +1,25 @@ +from typing import List, Optional, Union +from pydantic import BaseModel, HttpUrl, Field + + +class PresetConfig(BaseModel): + name: str + language: str + whisper_model: str + ollama: Optional[str] = None + ollama_model: Optional[str] = None + ollama_context: Optional[int] = None + ollama_prompt: Optional[Union[str, List[str]]] = None + mode: Optional[str] = None + journal_name: Optional[str] = None + + +class AppConfig(BaseModel): + audio_file: str + output_file: str + whisper_path: str + socket_path: str + ollama_url: str + ollama_port: int + journal_path: str + presets: List[PresetConfig] diff --git a/src/pyvtt/voice_to_text_tray.py b/src/pyvtt/voice_to_text_tray.py index 75cca43..ccc6bb1 100755 --- a/src/pyvtt/voice_to_text_tray.py +++ b/src/pyvtt/voice_to_text_tray.py @@ -10,10 +10,12 @@ from PyQt5.QtWidgets import QApplication, QSystemTrayIcon, QMenu, QAction from PyQt5.QtGui import QIcon from PyQt5.QtCore import QThread, pyqtSignal from pyvtt.configuration import read_configurations -from pyvtt.notify import notify, play_sound +from pyvtt.libs.notify import notify, play_sound +from pyvtt.libs.ollama import OllamaClient +from pyvtt.libs.whisper import WhisperClient -CONFIGURATION = read_configurations() -CURRENT_PRESET = CONFIGURATION["presets"][0] # Default to first preset +CONFIG = read_configurations() +CURRENT_PRESET = CONFIG.presets[0] # Default to first preset class WhisperWorker(QThread): """ @@ -31,82 +33,28 @@ class WhisperWorker(QThread): """ finished = pyqtSignal(str) + def __init__(self): + super().__init__() + self.whisper = WhisperClient(CONFIG) + self.ollama = OllamaClient(CONFIG) + def run(self): - CURENT_CONFIGURATION_LOCALE = CONFIGURATION + CURENT_CONFIG_LOCALE = CONFIG CURRENT_PRESET_LOCALE = CURRENT_PRESET try: - # Whisper ausführen - whisper_cmd = [ - CURENT_CONFIGURATION_LOCALE["whisper_path"], - "-m", CURRENT_PRESET_LOCALE["whisper_model"], - "-f", CURENT_CONFIGURATION_LOCALE["audio_file"], - "-l", CURRENT_PRESET_LOCALE["language"], - "-otxt", - "-of", CURENT_CONFIGURATION_LOCALE["output_file"].replace(".txt", "") - ] - try: - subprocess.run(whisper_cmd, check=True) - except subprocess.CalledProcessError as e: - print(f"Whisper Fehler: {e}") - notify("Fehler", "Ein Fehler mit 'Whisper' ist aufgetreten!") - return - - try: - with open(CURENT_CONFIGURATION_LOCALE["output_file"], "r") as f: - raw_result = "\n".join(line.strip() for line in f.readlines()) - except Exception as e: - print(f"Datei Fehler: {e}") - notify("Fehler", "Ein Fehler beim Lesen der Whisper-Ausgabe ist aufgetreten!") - return - - print("Whisper Transkript erhalten.") - - # --- An Ollama schicken --- - if CURRENT_PRESET_LOCALE["ollama"] != "disable": - if isinstance(CURRENT_PRESET_LOCALE["ollama_prompt"], list): - prompt = "\n".join(CURRENT_PRESET_LOCALE["ollama_prompt"]) - else: - prompt = CURRENT_PRESET_LOCALE["ollama_prompt"] - - payload = { - "model": CURRENT_PRESET_LOCALE["ollama_model"], - "messages": [ - {"role": "system", "content": prompt}, - {"role": "user", "content": raw_result} - ], - "options": { - "num_ctx": CURRENT_PRESET_LOCALE["ollama_context"] - }, - "stream": False - } - ollama_endpoint = f"{CURENT_CONFIGURATION_LOCALE['ollama_url']}:{CURENT_CONFIGURATION_LOCALE['ollama_port']}/api/chat" - response = requests.post(ollama_endpoint, json=payload) - - try: - response.raise_for_status() - except requests.exceptions.HTTPError as e: - print(f"HTTP Fehler: {e}") - notify("Fehler", "Ein Fehler bei der Kommunikation mit 'Ollama' ist aufgetreten!") - return - - json_response = response.json() - formatted_result = json_response.get("message", {}).get("content", "").strip() - formatted_result = "\n".join(line.strip() for line in formatted_result.splitlines()) - print("Ollama Antwort erhalten.") - else: - formatted_result = raw_result - print("Kein Ollama Prompt angegeben, nur Whisper Ergebnis verwendet.") + raw_result = self.whisper.transcribe(CURRENT_PRESET_LOCALE) + formatted_result = self.ollama.send_chat(raw_result, CURRENT_PRESET_LOCALE) # Ergebnis ins Clipboard kopieren - if CURRENT_PRESET_LOCALE.get("mode") == "journal": + if CURRENT_PRESET_LOCALE.mode == "journal": today = datetime.date.today().strftime("%Y.%m.%d") - journal_path = os.path.join(CURENT_CONFIGURATION_LOCALE["journal_path"], f"{today} - {CURRENT_PRESET_LOCALE['journal_name']}.md") + journal_path = os.path.join(CURENT_CONFIG_LOCALE.journal_path, f"{today} - {CURRENT_PRESET_LOCALE.journal_name}.md") now = datetime.datetime.now().strftime("%H:%M:%S") if not os.path.exists(journal_path): try: with open(journal_path, "w") as f: - f.write(f"# {CURRENT_PRESET_LOCALE['journal_name']} - {today}\n\n") + f.write(f"# {CURRENT_PRESET_LOCALE.journal_name} - {today}\n\n") except Exception as e: print(f"Journal Erstellungsfehler: {e}") notify("Fehler", "Ein Fehler beim Erstellen des Journals ist aufgetreten!") @@ -157,11 +105,11 @@ class SocketListener(threading.Thread): def __init__(self, tray_app): super().__init__(daemon=True) self.tray_app = tray_app - if os.path.exists(CONFIGURATION["socket_path"]): - os.remove(CONFIGURATION["socket_path"]) + if os.path.exists(CONFIG.socket_path): + os.remove(CONFIG.socket_path) self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - self.sock.bind(CONFIGURATION["socket_path"]) - os.chmod(CONFIGURATION["socket_path"], 0o666) + self.sock.bind(CONFIG.socket_path) + os.chmod(CONFIG.socket_path, 0o666) self.sock.listen(1) def run(self): @@ -174,8 +122,8 @@ class SocketListener(threading.Thread): if len(cmd) > 1: data = cmd[0] preset = cmd[1] - if preset in [p["name"] for p in CONFIGURATION["presets"]]: - self.tray_app.set_preset([p["name"] for p in CONFIGURATION["presets"]].index(preset)) + if preset in [p.name for p in CONFIG.presets]: + self.tray_app.set_preset([p.name for p in CONFIG.presets].index(preset)) else: data = cmd[0] if data == "toggle": @@ -224,8 +172,8 @@ class TrayApp: # Preset Menü self.preset_actions = [] self.preset_group = QMenu("Presets") - for i, preset in enumerate(CONFIGURATION["presets"]): - action = QAction(preset["name"], self.menu) + for i, preset in enumerate(CONFIG.presets): + action = QAction(preset.name, self.menu) action.setCheckable(True) if i == 0: action.setChecked(True) @@ -255,8 +203,9 @@ class TrayApp: def set_preset(self, index): global CURRENT_PRESET - print(f"Preset gewechselt: {CONFIGURATION['presets'][index]['name']}") - CURRENT_PRESET = CONFIGURATION["presets"][index] + selected_preset = CONFIG.presets[index] + print(f"Preset gewechselt: {selected_preset.name}") + CURRENT_PRESET = selected_preset # Nur einer darf gecheckt sein for i, action in enumerate(self.preset_actions): action.setChecked(i == index) @@ -266,7 +215,7 @@ class TrayApp: print("Starte Aufnahme...") self.recording_process = subprocess.Popen([ "ffmpeg", "-f", "pulse", "-i", "default", "-ar", "16000", - "-ac", "1", CONFIGURATION["audio_file"], "-y", "-loglevel", "quiet" + "-ac", "1", CONFIG.audio_file, "-y", "-loglevel", "quiet" ]) notify("Aufnahme", "Aufnahme gestartet!") @@ -294,15 +243,15 @@ class TrayApp: print(f"Fertig:\n{text}") def reload_configurations(self): - global CONFIGURATION, CURRENT_PRESET + global CONFIG, CURRENT_PRESET print("Lade Einstellungen neu...") - CONFIGURATION = read_configurations() - CURRENT_PRESET = CONFIGURATION["presets"][0] # Default to first preset + CONFIG = read_configurations() + CURRENT_PRESET = CONFIG.presets[0] # Default to first preset # Update preset menu self.preset_group.clear() self.preset_actions = [] - for i, preset in enumerate(CONFIGURATION["presets"]): - action = QAction(preset["name"], self.menu) + for i, preset in enumerate(CONFIG.presets): + action = QAction(preset.name, self.menu) action.setCheckable(True) if i == 0: action.setChecked(True) @@ -312,8 +261,8 @@ class TrayApp: print("Einstellungen erfolgreich neu geladen.") def cleanup(self): - if os.path.exists(CONFIGURATION["socket_path"]): - os.remove(CONFIGURATION["socket_path"]) + if os.path.exists(CONFIG.socket_path): + os.remove(CONFIG.socket_path) print("Socket sauber entfernt.") def run(self):