-
Notifications
You must be signed in to change notification settings - Fork 0
/
srt-resegment.py
110 lines (87 loc) · 3.54 KB
/
srt-resegment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# srt-resegment.py
# https://github.com/VimWei/WhisperTranscriber
# Synchronize SRT with Whisper's Word-Level Timestamps JSON
# 实现断句的完全自由
# 提升文本匹配的兼容性
# 无视标点符号
# 适应简单的文本增删改情形
# lua edition: https://github.com/VimWei/mpv-config
import json
import re
def generate_srt(json_data, text):
    """Build SRT subtitle text by aligning lines of ``text`` to word timestamps.

    Every non-empty line of ``text`` becomes one subtitle entry. The words of
    the line (punctuation ignored, case-insensitive) are searched for, in
    order, in the flattened word list taken from ``json_data``; the entry
    starts at the ``start`` of the first matched word and ends at the ``end``
    of the last matched word. Words that cannot be found are reported with a
    printed warning and skipped, which lets the text differ slightly from the
    transcription.

    Args:
        json_data: Mapping with a ``'segments'`` list; each segment has a
            ``'words'`` list whose items carry ``'word'``, ``'start'`` and
            ``'end'`` keys.
        text: The re-segmented transcript, one subtitle per line.

    Returns:
        The complete SRT document as a single string (empty if ``text``
        contains no words).

    Raises:
        KeyError: If ``json_data`` lacks one of the keys listed above.
    """
    word_pattern = re.compile(r'\b\w+\b')
    punctuation_pattern = re.compile(r'[^\w\s]')

    # Flatten the words of all segments into a single timeline.
    json_all_words = [
        word_info
        for segment in json_data['segments']
        for word_info in segment['words']
    ]
    # Normalise each JSON word once up front (strip punctuation, surrounding
    # whitespace and case) instead of on every comparison.
    normalized_json_words = [
        punctuation_pattern.sub('', word_info['word']).strip().lower()
        for word_info in json_all_words
    ]
    total_words = len(json_all_words)

    entries = []
    line_id = 1
    json_word_index = 0        # current scan position in the JSON word list
    matched_words_index = 0    # position just past the most recent match
    previous_end_time = 0      # end time of the previous subtitle entry

    for line in text.strip().split('\n'):
        # Keep only the words of the line; punctuation is ignored.
        txt_words = word_pattern.findall(line.strip())
        if not txt_words:
            continue

        start_time = None
        end_time = None

        # Look up each word of the line, scanning forward from the cursor.
        for txt_word in txt_words:
            target = txt_word.lower()
            matched = False
            while json_word_index < total_words:
                if normalized_json_words[json_word_index] == target:
                    word_info = json_all_words[json_word_index]
                    if start_time is None:
                        start_time = word_info['start']
                    end_time = word_info['end']
                    matched_words_index = json_word_index + 1
                    matched = True
                    break
                json_word_index += 1

            if matched:
                json_word_index = matched_words_index
            else:
                # Rewind to the last matched word so later words can still be
                # found. Clamp at 0: before the first match the rewind would
                # otherwise yield -1, which Python treats as "last element"
                # (false match) or raises IndexError on an empty word list.
                json_word_index = max(matched_words_index - 1, 0)
                print(f"Warning: Could not match word '{txt_word}' in line {line_id}")

        # A line with no matched word collapses onto the previous end time.
        if start_time is None:
            start_time = previous_end_time
        if end_time is None:
            end_time = previous_end_time

        entries.append(
            f"{line_id}\n{format_time(start_time)} --> {format_time(end_time)}\n{line}\n\n"
        )
        previous_end_time = end_time
        line_id += 1

    return "".join(entries)
def format_time(time_in_seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        time_in_seconds: Non-negative offset in seconds (int or float).

    Returns:
        The timestamp string, with the offset rounded to the nearest
        millisecond.
    """
    # Convert to whole milliseconds first. Taking the fractional part of a
    # float and truncating it loses a millisecond for common values
    # (e.g. 4.6 -> 599 ms) because of binary floating-point representation.
    total_milliseconds = int(round(time_in_seconds * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def main():
    """Load the Whisper JSON and the transcript text, then write the SRT file."""
    # Placeholder locations; edit these before running the script.
    whisper_json = 'path/to/whisper_output.json'
    transcript_txt = 'path/to/text_file.txt'
    srt_target = 'path/to/output.srt'

    # Load the word-level timestamp data.
    with open(whisper_json, 'r', encoding='utf-8') as json_handle:
        whisper_data = json.load(json_handle)

    # Load the re-segmented transcript.
    with open(transcript_txt, 'r', encoding='utf-8') as text_handle:
        transcript = text_handle.read()

    # Build the subtitles before touching the output file, so a failure here
    # never leaves a truncated SRT behind.
    subtitles = generate_srt(whisper_data, transcript)

    # Write the result.
    with open(srt_target, 'w', encoding='utf-8') as srt_handle:
        srt_handle.write(subtitles)

    print(f"SRT file has been generated: {srt_target}")
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()