feat(pyvtt): add CI pipeline and restructure project
All checks were successful
Build and Publish / build-and-publish (push) Successful in 21s

- Add GitHub Actions workflow for building and publishing packages.
- Introduce `pyproject.toml` for project metadata and dependency management.
- Remove `requirements.txt` in favor of Poetry for dependency handling.
- Restructure source files under `src/pyvtt` for better organization.
- Enhance `notify.py` with sound playback and improve error handling.
- Update `voice_to_text_tray.py` to support dynamic configuration reload.
- Add `.vscode/settings.json` for improved IDE configuration.
- Update `.gitignore` to exclude build artifacts.

Signed-off-by: Max P. <Mail@MPassarello.de>
This commit is contained in:
2025-04-30 15:01:58 +02:00
parent 58c8bf5c8f
commit 5b343b68cf
12 changed files with 963 additions and 35 deletions

Binary file not shown.

View File

@@ -0,0 +1,23 @@
import json
import os
def read_configurations():
    """
    Load the application settings from 'pyvtt.settings.json'.

    The file is looked up in the directory that contains this module.

    Returns:
        The object decoded from the JSON file (a dict when the document's
        top level is a JSON object).

    Raises:
        Exception: If the file cannot be opened or does not contain valid
            JSON. The underlying error is attached as ``__cause__``.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    settings_path = os.path.join(script_dir, "pyvtt.settings.json")
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # locale's default encoding.
        with open(settings_path, encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"Error reading configurations: {e}")
        # Chain explicitly so the traceback shows the original failure as
        # the direct cause of this one.
        raise Exception(f"Error reading configurations: {e}") from e

44
src/pyvtt/notify.py Normal file
View File

@@ -0,0 +1,44 @@
import subprocess
from importlib.resources import files
def notify(title: str, message: str) -> None:
    """
    Send a desktop notification using the `notify-send` command.

    Notifications are best-effort: if the command fails or cannot be
    started, the error is printed and the function returns normally.

    Args:
        title (str): The title of the notification.
        message (str): The message content of the notification.

    Note:
        This function requires the `notify-send` command to be available on the system.
        It is typically available on Linux systems with a notification daemon running.
    """
    command = [
        "notify-send",
        "-a", "Voice to Text",
        "-i", "audio-input-microphone",
        title,
        message,
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable `notify-send` binary,
        # which subprocess reports before the command ever runs.
        print("Fehler beim Benachrichtigen mit 'notify-send'.")
        print(e)
def play_sound() -> None:
    """
    Play the bundled notification sound using the `paplay` command.

    The sound is the `notification.wav` resource of the `pyvtt.assets`
    package. Playback is best-effort: if `paplay` fails or cannot be
    started, the error is printed and the function returns normally.

    Note:
        This function requires the `paplay` command to be available on the system.
        It is typically available on Linux systems with PulseAudio installed.
    """
    # Imported here so the function brings its own dependency into scope.
    from importlib.resources import as_file

    sound_resource = files("pyvtt.assets").joinpath("notification.wav")
    try:
        # as_file() provides a real filesystem path even when the package is
        # not stored as plain files (for example inside a zip archive).
        with as_file(sound_resource) as sound_path:
            subprocess.run(["paplay", str(sound_path)], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing `paplay` binary or a missing sound file.
        print("Fehler beim Abspielen des Sounds mit 'paplay'.")
        print(e)

52
src/pyvtt/send_cmd.py Executable file
View File

@@ -0,0 +1,52 @@
import socket
import sys
import argparse
from src.pyvtt.configuration import read_configurations
# Settings are read once, when this module is imported.
CONFIGURATION = read_configurations()
def send_cmd(cmd: str, socket_path: str) -> bool:
    """
    Send a command to a Unix domain socket server.

    Creates a Unix domain stream socket, connects to the server at
    socket_path with a 3 second timeout, and sends the command as a
    UTF-8 encoded string.

    Socket errors are not raised to the caller: they are reported on
    stderr and signalled through the return value.

    Args:
        cmd (str): The command to send to the server.
        socket_path (str): The path to the Unix domain socket.

    Returns:
        bool: True if the command was sent, False if any socket error occurred.
    """
    try:
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as client:
            client.settimeout(3)
            client.connect(socket_path)
            client.sendall(cmd.encode())
    except FileNotFoundError:
        error = f"Error: The socket file '{socket_path}' does not exist."
    except ConnectionRefusedError:
        error = f"Error: Connection to the server at '{socket_path}' was refused."
    except socket.timeout:
        error = "Error: Socket operation timed out."
    except OSError as e:
        error = f"Socket error: {e}"
    else:
        return True
    print(error, file=sys.stderr, flush=True)
    return False


def main() -> int:
    """
    Parse the command line and send the chosen command to the server.

    The command is one of "start", "stop" or "toggle" (default: "toggle")
    and is sent to the socket named by CONFIGURATION["socket_path"].

    Returns:
        int: 0 if the command was sent, 1 otherwise, so that a wrapper
        passing the result to sys.exit() reports failures to the shell.
    """
    parser = argparse.ArgumentParser(
        description="Send a command to a Unix domain socket server."
    )
    parser.add_argument(
        "command",
        choices=["start", "stop", "toggle"],
        nargs="?",
        default="toggle",
        help="The command to send to the server (default: toggle).",
    )
    args = parser.parse_args()
    if send_cmd(args.command, CONFIGURATION["socket_path"]):
        return 0
    return 1

289
src/pyvtt/voice_to_text_tray.py Executable file
View File

@@ -0,0 +1,289 @@
import sys
import subprocess
import os
import threading
import socket
import json
import requests
from PyQt5.QtWidgets import QApplication, QSystemTrayIcon, QMenu, QAction
from PyQt5.QtGui import QIcon
from PyQt5.QtCore import QThread, pyqtSignal
from src.pyvtt.configuration import read_configurations
from pyvtt.notify import notify, play_sound
# Both globals are set once at import time. TrayApp.reload_configurations()
# rebinds both of them, and TrayApp.set_preset() rebinds CURRENT_PRESET.
CONFIGURATION = read_configurations()
CURRENT_PRESET = CONFIGURATION["presets"][0] # Default to first preset
class WhisperWorker(QThread):
    """
    Background thread that transcribes the recorded audio file with Whisper,
    optionally post-processes the transcript with Ollama, and copies the
    final text to the clipboard with `wl-copy`.

    Signals:
        finished (pyqtSignal(str)): Emitted with the final text once it has
            been copied to the clipboard. It is not emitted when a step fails.

    Configuration:
        Reads the module-level CONFIGURATION and CURRENT_PRESET. The optional
        CONFIGURATION key "ollama_timeout" (seconds) overrides
        OLLAMA_READ_TIMEOUT.

    Methods:
        run():
            Executes Whisper, sends the transcript to Ollama unless the preset
            disables it, and copies the result to the clipboard. Every failure
            is printed and reported through a desktop notification.
    """

    # NOTE(review): this name shadows QThread's built-in `finished` signal —
    # confirm that is intended.
    finished = pyqtSignal(str)

    # Seconds allowed for establishing the connection to Ollama.
    OLLAMA_CONNECT_TIMEOUT = 10
    # Default number of seconds to wait for Ollama's reply.
    OLLAMA_READ_TIMEOUT = 600

    def run(self):
        # Work on one snapshot of the globals so that a preset switch or a
        # configuration reload during the run cannot mix settings.
        config = CONFIGURATION
        preset = CURRENT_PRESET
        try:
            output_file = config["output_file"]
            # The ".txt" suffix is stripped for "-of"; presumably the Whisper
            # CLI appends it again because of "-otxt". Only the suffix is
            # removed, not every ".txt" occurring inside the path.
            if output_file.endswith(".txt"):
                output_base = output_file[:-len(".txt")]
            else:
                output_base = output_file

            # Run Whisper
            whisper_cmd = [
                config["whisper_path"],
                "-m", preset["whisper_model"],
                "-f", config["audio_file"],
                "-l", preset["language"],
                "-otxt",
                "-of", output_base,
            ]
            try:
                subprocess.run(whisper_cmd, check=True)
            except subprocess.CalledProcessError as e:
                print(f"Whisper Fehler: {e}")
                notify("Fehler", "Ein Fehler mit 'Whisper' ist aufgetreten!")
                return

            try:
                # Assumes Whisper writes UTF-8; reading it explicitly keeps
                # the result independent of the locale encoding.
                with open(output_file, "r", encoding="utf-8") as f:
                    raw_result = "\n".join(line.strip() for line in f)
            except Exception as e:
                print(f"Datei Fehler: {e}")
                notify("Fehler", "Ein Fehler beim Lesen der Whisper-Ausgabe ist aufgetreten!")
                return
            print("Whisper Transkript erhalten.")

            # --- Send to Ollama ---
            if preset["ollama"] != "disable":
                prompt = preset["ollama_prompt"]
                if isinstance(prompt, list):
                    prompt = "\n".join(prompt)
                payload = {
                    "model": preset["ollama_model"],
                    "messages": [
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": raw_result},
                    ],
                    "options": {
                        "num_ctx": preset["ollama_context"],
                    },
                    "stream": False,
                }
                ollama_endpoint = f"{config['ollama_url']}:{config['ollama_port']}/api/chat"
                # Without a timeout an unresponsive server would block this
                # thread forever.
                timeout = (
                    self.OLLAMA_CONNECT_TIMEOUT,
                    config.get("ollama_timeout", self.OLLAMA_READ_TIMEOUT),
                )
                try:
                    response = requests.post(ollama_endpoint, json=payload, timeout=timeout)
                    response.raise_for_status()
                except requests.exceptions.RequestException as e:
                    # Covers HTTP error statuses as well as connection
                    # failures and timeouts.
                    print(f"HTTP Fehler: {e}")
                    notify("Fehler", "Ein Fehler bei der Kommunikation mit 'Ollama' ist aufgetreten!")
                    return
                json_response = response.json()
                formatted_result = json_response.get("message", {}).get("content", "").strip()
                formatted_result = "\n".join(line.strip() for line in formatted_result.splitlines())
                print("Ollama Antwort erhalten.")
            else:
                formatted_result = raw_result
                print("Kein Ollama Prompt angegeben, nur Whisper Ergebnis verwendet.")

            # Copy the result to the clipboard
            try:
                subprocess.run(["wl-copy"], input=formatted_result.encode(), check=True)
            except subprocess.CalledProcessError as e:
                print(f"Clipboard Fehler: {e}")
                notify("Fehler", "Ein Fehler beim Kopieren des Ergebnisses ist aufgetreten!")
                return

            notify("Spracherkennung", "Transkription abgeschlossen!")
            play_sound()
            self.finished.emit(formatted_result)
        except Exception as e:
            # Last line of defence: anything not handled above is reported
            # here instead of escaping from the thread.
            print(f"Fehler: {e}")
            notify("Fehler", "Ein Fehler ist aufgetreten!")
            return
class SocketListener(threading.Thread):
    """
    A daemon thread that listens on a UNIX domain socket and forwards the
    commands "toggle", "start" and "stop" to the tray application.

    The socket is created at CONFIGURATION["socket_path"] when the listener
    is constructed. Unknown commands are ignored. An error while reading or
    executing one command is printed and does not stop the listener.

    Attributes:
        tray_app (object): The tray application instance that provides
            toggle_recording(), start_recording() and
            stop_recording_if_possible().
        sock (socket.socket): The listening UNIX domain socket.

    Methods:
        run():
            Accepts connections one at a time, reads a single command from
            each and invokes the matching method on the tray application.
    """

    # Maps each accepted command to the tray application method it triggers.
    _COMMANDS = {
        "toggle": "toggle_recording",
        "start": "start_recording",
        "stop": "stop_recording_if_possible",
    }
    # Seconds a connected client may take to send its command, so that a
    # silent client cannot block the listener indefinitely.
    _CLIENT_TIMEOUT = 3

    def __init__(self, tray_app):
        super().__init__(daemon=True)
        self.tray_app = tray_app
        socket_path = CONFIGURATION["socket_path"]
        # Remove a socket file left over from an earlier run; bind() would
        # fail on an existing path.
        try:
            os.remove(socket_path)
        except FileNotFoundError:
            pass
        self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            self.sock.bind(socket_path)
            # NOTE(review): 0o666 lets every local user send commands —
            # confirm this is intended.
            os.chmod(socket_path, 0o666)
            self.sock.listen(1)
        except OSError:
            # Do not leak the descriptor when the socket cannot be set up.
            self.sock.close()
            raise

    def run(self):
        while True:
            try:
                conn, _ = self.sock.accept()
            except OSError as e:
                # The listening socket itself is unusable; retrying would
                # only spin in a tight loop.
                print(f"Socket Fehler: {e}")
                return
            try:
                with conn:
                    conn.settimeout(self._CLIENT_TIMEOUT)
                    data = conn.recv(1024).decode(errors="replace").strip()
                method_name = self._COMMANDS.get(data)
                if method_name is not None:
                    getattr(self.tray_app, method_name)()
            except Exception as e:
                # Keep serving: one bad client or one failing handler must
                # not end the listener thread.
                print(f"Fehler bei der Befehlsverarbeitung: {e}")
class TrayApp:
    """
    System tray application that provides voice-to-text functionality. It lets
    the user pick a preset, records audio with ffmpeg and processes each
    finished recording with a WhisperWorker.

    start_recording(), stop_recording_if_possible() and toggle_recording() are
    invoked by the SocketListener from its own thread.

    Attributes:
        app (QApplication): The main application instance.
        tray (QSystemTrayIcon): The system tray icon for the application.
        menu (QMenu): The context menu for the system tray icon.
        preset_actions (list): The QAction objects representing the presets.
        preset_group (QMenu): The submenu that lists the presets.
        quit_action (QAction): An action to quit the application.
        reload_action (QAction): An action to reload configurations.
        recording_process (subprocess.Popen or None): The running ffmpeg process.
        socket_path (str): The socket path in effect when the listener was created.
        socket_listener (SocketListener): A listener for socket communication.
        worker (WhisperWorker or None): The most recently started worker.

    Methods:
        set_preset(index): Makes the preset at the given index the active one.
        start_recording(): Starts audio recording using ffmpeg.
        stop_recording_if_possible(): Stops a running recording and processes it.
        toggle_recording(): Starts or stops the recording, whichever applies.
        start_whisper_worker(): Starts a WhisperWorker for the recorded audio.
        show_result(text): Prints the text produced by a WhisperWorker.
        reload_configurations(): Re-reads the settings file and rebuilds the preset menu.
        cleanup(): Removes the socket file before the application exits.
        run(): Starts the application's event loop.
    """

    def __init__(self):
        self.app = QApplication(sys.argv)
        self.tray = QSystemTrayIcon(QIcon.fromTheme("audio-input-microphone"))
        self.menu = QMenu()
        self.app.aboutToQuit.connect(self.cleanup)

        # Preset menu
        self.preset_actions = []
        self.preset_group = QMenu("Presets")
        self._build_preset_menu()
        self.menu.addMenu(self.preset_group)

        # Reload configurations
        self.reload_action = QAction("Einstellungen neu laden")
        self.reload_action.triggered.connect(self.reload_configurations)
        self.menu.addAction(self.reload_action)

        # Quit
        self.quit_action = QAction("Beenden")
        self.quit_action.triggered.connect(self.app.quit)
        self.menu.addAction(self.quit_action)

        self.tray.setContextMenu(self.menu)
        self.tray.setToolTip("Voice to Text")
        self.tray.show()

        self.recording_process = None
        self.worker = None
        # Every worker that may still be running; holding a reference keeps
        # the thread object alive until its run has ended.
        self._workers = []

        # The listener binds to the path configured right now. Remember it so
        # cleanup() removes that file even after a configuration reload.
        self.socket_path = CONFIGURATION["socket_path"]
        self.socket_listener = SocketListener(self)
        self.socket_listener.start()

    def _build_preset_menu(self):
        """Rebuild the preset submenu from CONFIGURATION with the first preset checked."""
        self.preset_group.clear()
        self.preset_actions = []
        for i, preset in enumerate(CONFIGURATION["presets"]):
            action = QAction(preset["name"], self.menu)
            action.setCheckable(True)
            action.setChecked(i == 0)
            # `index=i` binds the current value; a plain closure would see
            # only the last index.
            action.triggered.connect(lambda checked, index=i: self.set_preset(index))
            self.preset_group.addAction(action)
            self.preset_actions.append(action)

    def set_preset(self, index):
        global CURRENT_PRESET
        print(f"Preset gewechselt: {CONFIGURATION['presets'][index]['name']}")
        CURRENT_PRESET = CONFIGURATION["presets"][index]
        # Only one entry may be checked at a time
        for i, action in enumerate(self.preset_actions):
            action.setChecked(i == index)

    def start_recording(self):
        if self.recording_process is not None:
            return
        print("Starte Aufnahme...")
        try:
            self.recording_process = subprocess.Popen([
                "ffmpeg", "-f", "pulse", "-i", "default", "-ar", "16000",
                "-ac", "1", CONFIGURATION["audio_file"], "-y", "-loglevel", "quiet"
            ])
        except OSError as e:
            # ffmpeg is missing or cannot be executed
            print(f"Aufnahme Fehler: {e}")
            notify("Fehler", "Die Aufnahme konnte nicht gestartet werden!")
            return
        notify("Aufnahme", "Aufnahme gestartet!")

    def stop_recording_if_possible(self):
        process = self.recording_process
        if process is None:
            return
        print("Stoppe Aufnahme...")
        process.terminate()
        process.wait()
        self.recording_process = None
        notify("Aufnahme", "Aufnahme beendet, verarbeite...")
        self.start_whisper_worker()

    def toggle_recording(self):
        if self.recording_process:
            self.stop_recording_if_possible()
        else:
            self.start_recording()

    def start_whisper_worker(self):
        # Forget workers that have ended, but keep running ones referenced:
        # simply overwriting self.worker could release a thread object whose
        # run is still in progress.
        self._workers = [worker for worker in self._workers if worker.isRunning()]
        self.worker = WhisperWorker()
        self.worker.finished.connect(self.show_result)
        self._workers.append(self.worker)
        self.worker.start()

    def show_result(self, text):
        print(f"Fertig:\n{text}")

    def reload_configurations(self):
        """
        Re-read the settings file, activate its first preset and rebuild the
        preset menu.

        If the file cannot be read or has no usable presets, the error is
        reported and the current configuration stays in effect. The socket
        listener keeps the path it was created with.
        """
        global CONFIGURATION, CURRENT_PRESET
        print("Lade Einstellungen neu...")
        try:
            new_configuration = read_configurations()
            presets = new_configuration["presets"]
            default_preset = presets[0]  # Default to first preset
            # Validate before committing so a broken file cannot leave the
            # globals and the menu half-updated.
            if not all("name" in preset for preset in presets):
                raise KeyError("name")
        except Exception as e:
            print(f"Fehler beim Neuladen der Einstellungen: {e}")
            notify("Fehler", "Die Einstellungen konnten nicht neu geladen werden!")
            return
        CONFIGURATION = new_configuration
        CURRENT_PRESET = default_preset
        self._build_preset_menu()
        print("Einstellungen erfolgreich neu geladen.")

    def cleanup(self):
        try:
            os.remove(self.socket_path)
        except FileNotFoundError:
            return
        print("Socket sauber entfernt.")

    def run(self):
        sys.exit(self.app.exec_())
def main():
    """Create the tray application and run it; TrayApp.run() exits the process when the event loop ends."""
    tray_app = TrayApp()
    tray_app.run()