# リブログ記事 “AIサポートでのマニュアル作成がいいらしい”
faster-whisperが早いらしいので修正モデルをlarge-v3-turboにして精度が上がりました時間はかかるんですが、自動化できましたノートパソコンでがGPUが付いていないので遅いですね外付けGPUが欲しくなりました#pip install faster-whisper#pip install ffmpeg-python pydub opencv-python openai-whisper#pip install python-docx pillowimport osimport subprocessimport cv2#import whisperimport refrom docx import Documentfrom docx.shared import Inchesfrom PIL import Imageimport globfrom faster_whisper import WhisperModelmodel_size = "large-v3" #完璧model_size = "large-v3-turbo" #完璧#model_size = "small" #ちょっと精度が悪い# Run on GPU with FP16model = WhisperModel(model_size, device="cpu", compute_type="float32")# 設定BASE_DIR = "output" # 画像と voice.txt が入っているフォルダーVOICE_FILE = os.path.join(BASE_DIR, "voice.txt")IMAGE_EXT = ".jpg"OUTPUT_FILE = "output.docx"# 画像サイズ(Word 上の幅)IMAGE_WIDTH_INCH = 2.5 # 約6.3cmVIDEO_PATH=""OUTPUT_DIR = "output"os.makedirs(OUTPUT_DIR, exist_ok=True)def parse_voice_file(path): data = [] with open(path, "r", encoding="utf-8") as f: for line in f: line = line.strip() if line.startswith("[") and "]" in line: num = line[1:line.index("]")] text = line[line.index("]") + 1:].strip() data.append((num, text)) return datadef add_text_and_image(table, row, text, image_path): cell_left = table.cell(row, 0) cell_right = table.cell(row, 1) # 左側にテキスト cell_left.text = text # 右側に画像 if os.path.exists(image_path): paragraph = cell_right.paragraphs[0] run = paragraph.add_run() run.add_picture(image_path, width=Inches(IMAGE_WIDTH_INCH)) else: cell_right.text = "(画像なし)"def docx(): if not os.path.exists(VOICE_FILE): print("output/voice.txt が見つかりません") return voice_data = parse_voice_file(VOICE_FILE) document = Document() # 1ページに4つのセット → 4行 × 2列のテーブル ROWS_PER_PAGE = 4 row_index = 0 table = None for num, text in voice_data: if row_index == 0: # 新しいページにテーブルを作成 table = document.add_table(rows=ROWS_PER_PAGE, cols=2) table.autofit = False image_path = os.path.join(BASE_DIR, f"{num}{IMAGE_EXT}") add_text_and_image(table, row_index, text, image_path) row_index += 1 if row_index >= 
ROWS_PER_PAGE: document.add_page_break() row_index = 0 document.save(OUTPUT_FILE) print(f"{OUTPUT_FILE} を作成しました")for name in glob.glob('*.mp4'): VIDEO_PATH=name# ===== Whisper =====#model = whisper.load_model("medium")# ===== ffmpeg で音声抽出 =====audio_path = os.path.join(OUTPUT_DIR, "temp.wav")subprocess.run([ "ffmpeg", "-y", "-i", VIDEO_PATH, "-ac", "1", "-ar", "16000", audio_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)# ===== ffmpeg の silencedetect で無音区間を検出 =====cmd = [ "ffmpeg", "-i", audio_path, "-af", "silencedetect=noise=-40dB:d=1.5", "-f", "null", "-"]result = subprocess.run(cmd, stderr=subprocess.PIPE, text=True)log = result.stderr# ===== 無音区間から音声区間を抽出 =====pattern_start = r"silence_end: ([0-9\.]+)"pattern_end = r"silence_start: ([0-9\.]+)"starts = [float(x) for x in re.findall(pattern_start, log)]ends = [float(x) for x in re.findall(pattern_end, log)]# 最初の音声開始が無音で始まらない場合の補正if not starts or (starts and starts[0] != 0.0): starts.insert(0, 0.0)# 最後の音声終了補正duration_cmd = subprocess.run( ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", VIDEO_PATH], stdout=subprocess.PIPE, text=True)duration = float(duration_cmd.stdout.strip())if len(ends) < len(starts): ends.append(duration)segments = list(zip(starts, ends))print("検出された音声区間:", segments)# ===== OpenCV で動画読み込み =====cap = cv2.VideoCapture(VIDEO_PATH)fps = cap.get(cv2.CAP_PROP_FPS)# ===== まとめ用テキストファイル =====voice_txt_path = os.path.join(OUTPUT_DIR, "voice.txt")with open(voice_txt_path, "w", encoding="utf-8") as f: f.write("") # 初期化counter = 1for start_sec, end_sec in segments: # ===== 無効な区間をスキップ ===== if end_sec - start_sec < 0.1: # 0.1秒未満は無視 print(f"区間が短すぎるためスキップ: {start_sec} - {end_sec}") continue num = f"{counter:03d}" # ===== 音声切り出し ===== wav_path = os.path.join(OUTPUT_DIR, f"{num}.wav") subprocess.run([ "ffmpeg", "-y", "-i", audio_path, "-ss", str(start_sec), "-to", str(end_sec), wav_path ], stdout=subprocess.DEVNULL, 
stderr=subprocess.DEVNULL) # ===== WAV が作られたか確認 ===== if not os.path.exists(wav_path) or os.path.getsize(wav_path) == 0: print(f"{num}.wav が作成されなかったためスキップ") continue # ===== フレーム抽出 ===== frame_index = int(start_sec * fps) cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index) ret, frame = cap.read() if ret: img_path = os.path.join(OUTPUT_DIR, f"{num}.jpg") cv2.imwrite(img_path, frame) # ===== Whisper 文字起こし ===== segments ,info= model.transcribe(wav_path, beam_size=5) for segment in segments: text= segment.text# result = model.transcribe(# wav_path,# fp16=False,# language="ja",# temperature=0.0# )# text = result["text"].strip() # ===== voice.txt に追記 ===== with open(voice_txt_path, "a", encoding="utf-8") as f: f.write(f"[{num}] {text}\n") print(f"{num}: 保存完了") counter += 1cap.release()docx()print("完了しました!")