OllamaとVOICEVOXで音声出力

前回の音声出力プログラムのVOICEVOX版です。Docker Desktopを使った記事しか見つからなかったので、Docker Desktopを使わないバージョンを作りました。PCパワーが結構いりますね。あと、グラフィックボードによっては入力中にエラーが出ます。何が悪いのかわかる人がいたら教えてください。モデルと音声は自分の環境に合わせて書き換えてください。やっぱりGUI版が一番安定しますね。

import json
import os
import shlex
import subprocess
import time
import uuid
from datetime import datetime

import requests
import speech_recognition as sr
from playsound import playsound

# ===== Settings =====
# Base URL of the locally running VOICEVOX engine and the voice to use.
VOICEVOX_API = "http://127.0.0.1:50021"
SPEAKER_ID = 1

# Directory holding the Vosk speech-recognition model.
VOSK_MODEL_PATH = r"C:\models\vosk-model-small-ja-0.22"

# Command template for the LLM; "{prompt}" is replaced with the user's text.
OLLAMA_COMMAND = 'ollama run hf.co/lightblue/suzume-llama-3-8B-japanese-gguf:Q4_K_M "{prompt}"'

# ===== Create the folder that receives synthesized audio =====
# Generated WAV files are kept under ./audio_logs relative to the working directory.
AUDIO_SAVE_DIR = os.path.join(os.getcwd(), "audio_logs")
os.makedirs(AUDIO_SAVE_DIR, exist_ok=True)

# ===== Read text aloud with VOICEVOX =====
def speak_voicevox(text: str, *, timeout=(5.0, 120.0)):
    """Synthesize *text* with the VOICEVOX engine, save the WAV, and play it.

    The text is first sent to ``/audio_query`` and the returned query is then
    posted to ``/synthesis``. The resulting audio is written to a uniquely
    named file under ``AUDIO_SAVE_DIR`` and played with ``playsound``.

    Args:
        text: Text to read aloud. Empty or whitespace-only text is skipped.
        timeout: ``(connect, read)`` timeout in seconds passed to each
            ``requests.post`` call, so a stalled engine cannot block forever.

    Returns:
        None. Failures are printed with a ``[VOICEVOX error]`` prefix instead
        of being raised, so the caller's chat loop keeps running.
    """
    if not text or not text.strip():
        # Nothing to read aloud (e.g. the LLM produced an empty reply).
        return

    try:
        query_res = requests.post(
            f"{VOICEVOX_API}/audio_query",
            params={"text": text, "speaker": SPEAKER_ID},
            timeout=timeout,
        )
        query_res.raise_for_status()
        query = query_res.json()

        synth_res = requests.post(
            f"{VOICEVOX_API}/synthesis",
            params={"speaker": SPEAKER_ID},
            json=query,
            timeout=timeout,
        )
        synth_res.raise_for_status()

        # Timestamp plus a short random suffix keeps file names unique even
        # when two replies are synthesized within the same second.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        unique_id = uuid.uuid4().hex[:8]
        filename = f"voicevox_{timestamp}_{unique_id}.wav"
        audio_path = os.path.join(AUDIO_SAVE_DIR, filename)

        with open(audio_path, "wb") as f:
            f.write(synth_res.content)
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary for the chat loop.
        print(f"[VOICEVOX error] {e}")
        return

    # Report the saved file before playback so the message is not lost when
    # only the playback step fails.
    print(f"✅ 音声ファイル保存: {audio_path}")

    try:
        playsound(audio_path)
    except Exception as e:
        print(f"[VOICEVOX error] {e}")

# ===== Generate the LLM reply =====
def _build_ollama_argv(prompt: str) -> list:
    """Expand ``OLLAMA_COMMAND`` into an argument list carrying *prompt* verbatim."""
    posix = os.name != "nt"
    tokens = shlex.split(OLLAMA_COMMAND, posix=posix)
    if not posix:
        # Non-POSIX splitting keeps backslashes intact (Windows paths) but
        # leaves the surrounding quotes on each token, so strip them here.
        tokens = [
            tok[1:-1]
            if len(tok) >= 2 and tok[0] == tok[-1] and tok[0] in "\"'"
            else tok
            for tok in tokens
        ]
    # Substitute after splitting: the prompt always stays a single argument,
    # whatever quotes, spaces or shell metacharacters it contains.
    return [tok.format(prompt=prompt) for tok in tokens]


def ask_llm(prompt: str) -> str:
    """Run the configured Ollama command for *prompt* and return its output.

    The command is executed without a shell and the prompt is passed as one
    argument, so user text (typed or transcribed) cannot inject shell syntax.

    Args:
        prompt: The user's message. Empty or whitespace-only prompts are not
            sent to the model.

    Returns:
        The model's standard output with surrounding whitespace removed, an
        empty string for an empty prompt, or the placeholder
        "（エラーが発生しました）" when the command cannot be run or exits with
        a non-zero status.
    """
    if not prompt or not prompt.strip():
        return ""

    try:
        result = subprocess.run(
            _build_ollama_argv(prompt),
            shell=False,
            encoding="utf-8",
            errors="replace",  # never fail the whole reply on one bad byte
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary for the chat loop
        # (missing executable, malformed command template, ...).
        print(f"[LLM error] {e}")
        return "（エラーが発生しました）"

    if result.returncode != 0:
        detail = (result.stderr or "").strip() or f"exit code {result.returncode}"
        print(f"[LLM error] {detail}")
        return "（エラーが発生しました）"

    return (result.stdout or "").strip()

# ===== Speech recognition (Vosk) =====
def _extract_vosk_text(raw) -> str:
    """Return the recognized text from a Vosk result (JSON string, dict, or plain text)."""
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:
            # Not JSON: treat the value as the recognized text itself.
            return raw.strip()
    if isinstance(raw, dict):
        return str(raw.get("text", "")).strip()
    return "" if raw is None else str(raw).strip()


def transcribe() -> str:
    """Record one utterance from the default microphone and transcribe it.

    Ambient noise is calibrated first, then up to 10 seconds of speech are
    captured and passed to ``Recognizer.recognize_vosk``.

    Returns:
        The recognized text, or an empty string when recording or recognition
        fails (the error is printed with a ``[音声認識エラー]`` prefix).
    """
    recognizer = sr.Recognizer()
    try:
        with sr.Microphone() as source:
            print("🎤 話しかけてください...")
            recognizer.adjust_for_ambient_noise(source)
            audio = recognizer.listen(source, phrase_time_limit=10)
        # The microphone is released before the (potentially slow) recognition.
        # NOTE(review): some SpeechRecognition releases appear to accept no
        # `model` keyword here and load the model from a fixed location
        # instead - confirm against the installed version.
        raw = recognizer.recognize_vosk(audio, model=VOSK_MODEL_PATH)
    except Exception as e:
        # Deliberately broad: a missing microphone or model must not crash
        # the chat loop.
        print(f"[音声認識エラー] {e}")
        return ""
    # Vosk reports results as a JSON document such as {"text": "..."};
    # hand only the text to the caller.
    return _extract_vosk_text(raw)

# ===== Main loop =====
def _respond(user_text: str) -> None:
    """Send *user_text* to the LLM, print the reply, and read it aloud."""
    reply = ask_llm(user_text)
    print(f"🤖 すずめ: {reply}")
    speak_voicevox(reply)


def main():
    """Run the interactive chat in voice-input or keyboard-input mode.

    The user picks a mode once. Mode 1 records an utterance each time Enter
    is pressed; mode 2 reads typed lines. Entering ``q`` leaves either loop.
    Any other mode selection prints a warning and returns.
    """
    print("🟢 VOICEVOX対話開始")
    print("モードを選択してください：")
    print("1. 🎙 音声入力")
    print("2. ⌨️ キーボード入力")
    mode = input("番号を入力（1または2）: ").strip()

    if mode == "1":
        while True:
            key = input("\n[Enter]で録音 / [q]で終了: ").strip().lower()
            if key == "q":
                break
            user_text = transcribe()
            if not user_text:
                print("（聞き取れませんでした）")
                continue
            print(f"🧑 あなた: {user_text}")
            _respond(user_text)

    elif mode == "2":
        while True:
            user_text = input("\n🧑 あなた（qで終了）: ").strip()
            if user_text.lower() == "q":
                break
            if not user_text:
                # A bare Enter carries no message; do not query the LLM with it.
                continue
            _respond(user_text)

    else:
        print("⚠️ 無効な選択です。1または2を入力してください。")


if __name__ == "__main__":
    main()

お好きに改良してください。不具合が起きても一切の責任を負いかねます。