建議檔名:批次空白列合併.py
緣起:
本意是針對【成為小說家】提供的直列式 PDF:先以全文複製的方式將 PDF 全文貼到 TXT 檔中,再對轉出的 TXT 檔執行格式清理作業,形成清爽的版面。
主作用:
利用數字存在與否判別章節標題所在,並插入想要的分隔標記。
消除原 PDF 文件各頁夾帶的頁碼(會形成單獨一行)。
副作用:
將 TXT 檔內段落間重複的空白列合併,使各段落間只間隔一個空白列
程式碼:
(複製以下文字,貼入純文字檔中,存檔後將副檔名設定為 .py)
import os
import re
def to_fullwidth_number(number):
    """Return *number* as a zero-padded, four-character full-width string.

    The value is converted with ``str()``, left-padded with zeros to at
    least four characters, and each digit is mapped to the matching
    full-width digit (U+FF10..U+FF19).

    Args:
        number: A non-negative integer (or a string of ASCII digits).

    Returns:
        The full-width digit string; values longer than four digits are
        not truncated.

    Raises:
        ValueError: If the textual form contains a non-digit character
            (for example the sign of a negative number).
    """
    # U+FF10 is FULLWIDTH DIGIT ZERO; the ten full-width digits are
    # contiguous.  A code point is used instead of a literal so the constant
    # survives tools that fold full-width characters back to ASCII.
    fullwidth_zero = 0xFF10
    return ''.join(
        chr(fullwidth_zero + int(digit)) for digit in str(number).zfill(4)
    )
# Full-width forms sit exactly 0xFEE0 code points above their ASCII originals
# (e.g. "A" U+0041 -> U+FF21).  Code points are used instead of literals so the
# table survives tools that fold full-width characters back to ASCII.
_FULLWIDTH_OFFSET = 0xFEE0
_FULLWIDTH_TABLE = {
    code: code + _FULLWIDTH_OFFSET
    for first, last in (("0", "9"), ("A", "Z"), ("a", "z"))
    for code in range(ord(first), ord(last) + 1)
}

# A line consisting only of digits is treated as a page number.
_PAGE_NUMBER_RE = re.compile(r"^\d+$")

# Title shapes recognised on the line that follows a page number.  The "EX"
# alternative accepts both the ASCII and the full-width spelling because the
# candidate line is normalised to full-width before matching.
_CHAPTER_TITLE_RE = re.compile(
    r"^(第[一二三四五六七八九十百千]+章\s+.*?)$"
    r"|^(プロローグ)$"
    r"|^(第[一二三四五六七八九十百千]+話\s+.*?)$"
    r"|^((?:EX|\uff25\uff38)[:\uff1a]?.*?)$"
    r"|^【.*?】$"
)


def normalize_to_fullwidth(text):
    """Convert ASCII digits and ASCII letters in *text* to full-width forms.

    All other characters (spaces, punctuation, CJK text, characters that are
    already full-width) are returned unchanged.

    Args:
        text: The string to normalise.

    Returns:
        A new string of the same length with ``0-9``, ``A-Z`` and ``a-z``
        replaced by their full-width counterparts.
    """
    return text.translate(_FULLWIDTH_TABLE)


def detect_chapter_titles(lines, toc_titles=None):
    """Find chapter titles that directly follow a page-number line.

    A line is a page number when, after stripping, it contains only digits.
    The line immediately after it is reported as a chapter title when its
    full-width-normalised form either matches one of the built-in title
    patterns or occurs as a substring of a (normalised) table-of-contents
    title.

    Args:
        lines: The lines of the document, as returned by ``readlines()``.
        toc_titles: Optional iterable of table-of-contents titles.

    Returns:
        A list of ``(index, title)`` tuples where ``index`` is the position
        of the title line in *lines* and ``title`` is that line stripped.
    """
    # Normalise the table of contents once instead of once per candidate.
    normalized_toc = (
        [normalize_to_fullwidth(title) for title in toc_titles]
        if toc_titles
        else []
    )

    chapter_positions = []
    for i, line in enumerate(lines):
        if not _PAGE_NUMBER_RE.match(line.strip()):
            continue
        next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
        # An empty string is a substring of every TOC title, so a blank line
        # (or end of file) after a page number must never count as a title.
        if not next_line:
            continue
        normalized_next_line = normalize_to_fullwidth(next_line)
        is_title = _CHAPTER_TITLE_RE.match(normalized_next_line) or any(
            normalized_next_line in title for title in normalized_toc
        )
        if is_title:
            chapter_positions.append((i + 1, next_line))
    return chapter_positions
def insert_dividers(lines, chapter_positions):
    """Insert numbered divider markers at chapter titles and drop page numbers.

    For every line index listed in *chapter_positions* three lines are
    emitted before the line itself: a divider, the title, and the same
    divider again.  The divider embeds a running chapter counter formatted
    by ``to_fullwidth_number``.  Lines that consist only of digits (page
    numbers) are omitted; all other lines are emitted with trailing
    whitespace removed.

    Args:
        lines: The lines of the document.
        chapter_positions: Iterable of ``(index, title)`` tuples, as produced
            by ``detect_chapter_titles``.

    Returns:
        The list of output lines, without line terminators.
    """
    page_number = re.compile(r"^\d+$")

    # Index the titles once so each line costs one dict lookup instead of a
    # scan over every chapter.  The first entry for an index wins, so each
    # index still gets at most one divider group.
    titles_by_index = {}
    for pos, title in chapter_positions:
        titles_by_index.setdefault(pos, title)

    output_lines = []
    chapter_counter = 1
    for i, line in enumerate(lines):
        if i in titles_by_index:
            divider = f"******{to_fullwidth_number(chapter_counter)}******"
            output_lines.extend((divider, titles_by_index[i], divider))
            chapter_counter += 1
        # NOTE(review): the title line itself is emitted again here, so the
        # title appears both between the dividers and right after them.
        # Kept as-is because downstream tooling may rely on it; confirm.
        if not page_number.match(line.strip()):  # skip page-number lines
            output_lines.append(line.rstrip())
    return output_lines
def process_file(input_file, output_file, toc_titles=None):
    """Clean one text file: add chapter dividers and remove page-number lines.

    The input is read completely before the output is opened, so
    *input_file* and *output_file* may be the same path.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write.
        toc_titles: Optional list of table-of-contents titles passed on to
            ``detect_chapter_titles``.
    """
    # "utf-8-sig" decodes plain UTF-8 as well but strips a leading BOM, which
    # would otherwise stick to the first line and stop a page number there
    # from being recognised as digits-only.
    with open(input_file, "r", encoding="utf-8-sig") as file:
        lines = file.readlines()

    # Locate the chapter titles.
    chapter_positions = detect_chapter_titles(lines, toc_titles)

    # Insert the divider markers and drop the page-number lines.
    processed_lines = insert_dividers(lines, chapter_positions)

    # Write the processed text, terminated by a single newline.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write("\n".join(processed_lines) + "\n")
def main():
    """Interactively process every ``.txt`` file of a folder.

    Prompts for an input folder, an output folder and an optional
    table-of-contents file, then runs ``process_file`` on each text file of
    the input folder and writes the result under the same file name in the
    output folder.
    """
    print("章節分隔標記插入工具(基於頁碼行後內容檢測,並消除頁碼行)")
    input_folder = input("請輸入包含 TXT 文件的資料夾路徑:").strip()
    output_folder = input("請輸入輸出結果的資料夾路徑:").strip()
    toc_file = input("若有目錄文件,請輸入其路徑(可按 Enter 跳過):").strip()

    # Fail with a readable message instead of a traceback on bad paths.
    if not os.path.isdir(input_folder):
        print(f"找不到輸入資料夾:{input_folder}")
        return
    if not output_folder:
        print("未指定輸出資料夾。")
        return
    os.makedirs(output_folder, exist_ok=True)

    # Load the table-of-contents titles, one per non-empty line.
    toc_titles = []
    if toc_file:
        if os.path.isfile(toc_file):
            # "utf-8-sig" strips a BOM that would corrupt the first title.
            with open(toc_file, "r", encoding="utf-8-sig") as file:
                toc_titles = [line.strip() for line in file if line.strip()]
            print(f"已載入目錄標題:{len(toc_titles)} 項")
        else:
            print(f"找不到目錄文件,將略過:{toc_file}")

    # Process every TXT file; sorted for a reproducible order.
    for file_name in sorted(os.listdir(input_folder)):
        # Case-insensitive so that "BOOK.TXT" is picked up as well.
        if not file_name.lower().endswith(".txt"):
            continue
        input_file = os.path.join(input_folder, file_name)
        if not os.path.isfile(input_file):  # e.g. a directory named "x.txt"
            continue
        output_file = os.path.join(output_folder, file_name)
        process_file(input_file, output_file, toc_titles)
    print(f"處理完成!結果已存放於:{output_folder}")
# Run the interactive tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
# Page Analysis Divider Tool
# Version: 2025/01/27 22:45 (UTC+8)