Author : zbzhen, Modified : Sun Apr 28 21:11:14 2024
Open-source project: https://github.com/rany2/edge-tts
Installation
pip install edge-tts
Function call
import subprocess

def tts_ms(input):
    txt, k = input
    # https://github.com/rany2/edge-tts
    # edge-tts --list-voices          # list the available voice roles
    # zh-CN-YunxiNeural  zh-CN-YunxiaNeural
    # mds = ['zh-CN-XiaoxiaoNeural', 'zh-CN-YunxiNeural']
    # edge-tts --voice zh-CN-XiaoxiaoNeural --text 你好召唤师 --write-media 1.mp3 --rate=-0% --volume=+10%
    model = 'zh-CN-XiaoxiaoNeural'
    # the ./mp3/ directory must already exist; on Linux/macOS pass shell=True to subprocess.run
    cmd = f"""edge-tts --voice {model} --text "{txt}" --write-media ./mp3/{k}.wav --write-subtitles ./mp3/{k}.vvt"""
    subprocess.run(cmd)
    return
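A minimal usage sketch, assuming the ./mp3/ directory exists and edge-tts is on the PATH; the text and index here are just placeholders (k is only used to name the output files):

tts_ms(("亲爱的召唤师", 0))   # writes ./mp3/0.wav and ./mp3/0.vvt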
Ultimately we want to render an xx.srt file as an audio file. For example, xx.srt has the following format:
1
00:00:00,866 --> 00:00:03,799
亲爱的召唤师
2
00:00:05,466 --> 00:00:06,999
你好呀
We then want the final output audio file to have a total duration of 00:00:06,999 and, as far as possible,
to play the line 亲爱的召唤师 during 00:00:00,866 --> 00:00:03,799
and the line 你好呀 during 00:00:05,466 --> 00:00:06,999.
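For reference, an srt timestamp hh:mm:ss,mmm converts to milliseconds as h*3600000 + m*60000 + s*1000 + ms, which is exactly the arithmetic the script below uses. A small sketch (the helper name srt_to_ms is mine, not part of the script):

def srt_to_ms(h, m, s, ms):
    # hh:mm:ss,mmm -> total milliseconds
    return h * 3600000 + m * 60000 + s * 1000 + ms

start = srt_to_ms(0, 0, 0, 866)   # 866
end   = srt_to_ms(0, 0, 3, 799)   # 3799
print(end - start)                # 2933 ms available for 亲爱的召唤师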
The problem is that the time slots and the generated speech can never match exactly, but we want them to match as closely as possible. The strategy I came up with is:
Read the srt content into two lists: one holding the text of each entry, the other holding the duration of each segment.
Generate the speech for the texts in parallel.
Check each result: the actual duration of the generated audio file must be shorter than its srt slot; if it is longer, regenerate it at a faster speech rate (a sketch of that calculation follows these steps).
Merge the clips, making sure each clip stays inside its srt time window.
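The speed-up in the check step works like this: if the generated clip runs past its slot, rerun edge-tts with a --rate increase roughly equal to the overshoot percentage. A rough sketch with hypothetical numbers (the names lent, t2 and pp match the full script below):

lent, t2 = 2933, 3500                      # slot length vs. end of the generated clip, in ms (hypothetical)
pp = int((1.0 * t2 / lent - 1) * 100) + 2  # ~19% overshoot, plus a 2% safety margin -> 21
# then rerun: edge-tts --rate=+{pp}% ...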
The complete code is below:
This code may be used for learning purposes only; no other use is permitted.
import re
import os
import subprocess
import concurrent.futures
import numpy as np
from pydub import AudioSegment

## For learning purposes only; no other use is permitted.
## pip install pydub edge-tts -i https://pypi.tuna.tsinghua.edu.cn/simple
## edge-tts --list-voices
all_model = [
    'zh-CN-XiaoxiaoNeural',          # 0  Female
    'zh-CN-XiaoyiNeural',            # 1  Female
    'zh-CN-YunjianNeural',           # 2
    'zh-CN-YunxiNeural',             # 3
    'zh-CN-YunxiaNeural',            # 4
    'zh-CN-YunyangNeural',           # 5
    'zh-CN-liaoning-XiaobeiNeural',  # 6  Female
    'zh-CN-shaanxi-XiaoniNeural',    # 7  Female
    'zh-HK-HiuGaaiNeural',           # 8  Female
    'zh-HK-HiuMaanNeural',           # 9  Female
    'zh-HK-WanLungNeural',           # 10
    'zh-TW-HsiaoChenNeural',         # 11 Female
    'zh-TW-HsiaoYuNeural',           # 12 Female
    'zh-TW-YunJheNeural',            # 13
]


def read_srt(file_name):
    with open(file_name, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    # match the start and end timestamps of each subtitle block
    time_pattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})')
    allnn = len(lines)
    nn = int((allnn + 1) / 4)          # each srt block is 4 lines: index, time, text, blank
    times = [''] * nn
    texts = [''] * nn
    for i in range(nn):
        times[i] = time_pattern.findall(lines[i * 4 + 1])[0]
        texts[i] = lines[i * 4 + 2].strip()
    # convert the timestamps to milliseconds: tt = [start0, end0, start1, end1, ...]
    tt = []
    for i in range(len(times)):
        t2 = int(times[i][4]) * 3600000 + int(times[i][5]) * 60000 + int(times[i][6]) * 1000 + int(times[i][7])
        t1 = int(times[i][0]) * 3600000 + int(times[i][1]) * 60000 + int(times[i][2]) * 1000 + int(times[i][3])
        tt.append(t1)
        tt.append(t2)
    # tbb = [leading gap, duration0, gap, duration1, ...]
    tbb = np.array(tt) - np.array([0] + tt[:-1])
    return texts, tbb, nn, tt[-1]


def tts_ms(input):
    txt, k, dir, lent, mdn = input
    # https://github.com/rany2/edge-tts
    # edge-tts --voice zh-CN-XiaoxiaoNeural --text '你好召唤师' --write-media 1.mp3 --rate=-0% --volume=+10% --write-subtitles 1.vvt
    if txt == '':
        return
    model = all_model[mdn]
    cmd = f"""edge-tts --voice {model} --text "{txt}" --write-media ./{dir}/{k}.wav --write-subtitles ./{dir}/{k}.vvt"""
    subprocess.run(cmd)   # on Linux/macOS, pass shell=True or split the command into a list
    ################################# get the length of the generated clip
    time_pattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}).(\d{3}) --> (\d{2}):(\d{2}):(\d{2}).(\d{3})')
    with open(f'./{dir}/{k}.vvt', 'r', encoding='utf-8') as file:
        lines = file.readlines()
    vvt = time_pattern.findall(lines[4])[0]
    t1 = int(vvt[0]) * 3600000 + int(vvt[1]) * 60000 + int(vvt[2]) * 1000 + int(vvt[3])
    t2 = int(vvt[4]) * 3600000 + int(vvt[5]) * 60000 + int(vvt[6]) * 1000 + int(vvt[7])
    t2 += 1
    tt12 = t2 - t1
    # edge-tts writes mp3 data even though the file is named .wav, hence from_mp3
    audio = AudioSegment.from_mp3(f'./{dir}/{k}.wav')
    if tt12 <= lent:
        audio[t1:t2].export(f'./{dir}/{k}.wav', format='wav')
        return
    ################################# clip is too long: remake it at a faster rate
    pp = int((1.0 * t2 / lent - 1) * 100) + 2
    cmd = f"""edge-tts --voice {model} --text "{txt}" --write-media ./{dir}/{k}.wav --rate=+{pp}% --write-subtitles ./{dir}/{k}.vvt"""
    subprocess.run(cmd)
    with open(f'./{dir}/{k}.vvt', 'r', encoding='utf-8') as file:
        lines = file.readlines()
    vvt = time_pattern.findall(lines[4])[0]
    t1 = int(vvt[0]) * 3600000 + int(vvt[1]) * 60000 + int(vvt[2]) * 1000 + int(vvt[3])
    t2 = int(vvt[4]) * 3600000 + int(vvt[5]) * 60000 + int(vvt[6]) * 1000 + int(vvt[7])
    t2 += 1
    tt12 = t2 - t1
    audio = AudioSegment.from_mp3(f'./{dir}/{k}.wav')
    audio[t1:t2].export(f'./{dir}/{k}.wav', format='wav')
    return


def dir_check(dir):
    current_dir = os.getcwd()
    mp3_dir = os.path.join(current_dir, dir)
    if not os.path.exists(mp3_dir):
        os.makedirs(mp3_dir)
    return


def make(tt, nn, sumtt, dir):
    dt = 0
    combined = AudioSegment.silent(duration=0)
    for i in range(nn):
        # the clips exported above are real wav files, so let pydub detect the format
        audio = AudioSegment.from_file(f'./{dir}/{i}.wav')
        silence = AudioSegment.silent(duration=dt + tt[i * 2])  # leading gap plus time left over from the previous slot
        dt = tt[i * 2 + 1] - len(audio)                         # unused time in this slot
        combined += silence + audio
    if sumtt > len(combined):
        combined += AudioSegment.silent(duration=sumtt - len(combined))
    print('srt audio time :', sumtt)
    print('wav audio time :', len(combined))
    combined.export(f'./{dir}/00{dir}.wav', format='wav')
    print(f'please find ./{dir}/00{dir}.wav')


#######################################################################
srtfile = 'xx.srt'
texts, tt, nn, sumtbb = read_srt(srtfile)
spk = 3                                          # index into all_model
dir = srtfile[:-4] + '_' + all_model[spk][6:13]  # output directory, e.g. xx_YunxiNe
tts = tts_ms
inputs = [[texts[i], i, dir, tt[i * 2 + 1], spk] for i in range(nn)]
dir_check(dir)
with concurrent.futures.ThreadPoolExecutor(max_workers=max(10, nn)) as executor:
    executor.map(tts, inputs)
    # tts(inputs[-1])
make(tt, nn, sumtbb, dir)
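To see how make() keeps each clip inside its slot, apply its padding arithmetic to the sample xx.srt above, where tbb = [866, 2933, 1667, 1533]. A sketch with a hypothetical clip length:

len0 = 2500          # hypothetical length of the first generated clip, in ms
# clip 0: leading silence = 0 + tbb[0] = 866 ms, so it starts exactly at 00:00:00,866
dt = 2933 - len0     # 433 ms of slot 0 left unused
# clip 1: leading silence = dt + tbb[2] = 433 + 1667 = 2100 ms,
#         so it starts at 866 + 2500 + 2100 = 5466 ms, i.e. exactly at 00:00:05,466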
Approach:
Suppose the source video is input.mp4 and the desired output is output.mp4.
Split input.mp4 into separate video and audio streams: 1.mp3 contains only the audio, 1.mp4 contains only the picture.
ffmpeg -i input.mp4 -q:a 0 -map a 1.mp3
ffmpeg -i input.mp4 -c:v copy -an 1.mp4
Transcribe 1.mp3 into an srt subtitle file. Two tools make this easy: 剪映 (CapCut) or WhisperDesktop. If you have a GPU, WhisperDesktop is recommended; if not, use 剪映. WhisperDesktop runs entirely locally.
Convert 1.srt into the audio file 2.wav by running the Python script provided on this page.
Merge 1.mp4 with 2.wav:
ffmpeg -i 1.mp4 -i 2.wav -c:v copy -c:a aac -strict experimental output.mp4
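If you prefer to drive the whole video workflow from Python, a minimal sketch using the same ffmpeg commands (assuming ffmpeg is on the PATH, and that the transcription step and the script above have already produced 2.wav):

import subprocess

# split input.mp4: audio-only 1.mp3 and picture-only 1.mp4
subprocess.run('ffmpeg -i input.mp4 -q:a 0 -map a 1.mp3', shell=True, check=True)
subprocess.run('ffmpeg -i input.mp4 -c:v copy -an 1.mp4', shell=True, check=True)

# transcribe 1.mp3 to 1.srt (剪映 or WhisperDesktop), run the script above to get 2.wav, then:
subprocess.run('ffmpeg -i 1.mp4 -i 2.wav -c:v copy -c:a aac -strict experimental output.mp4',
               shell=True, check=True)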