def extract_epub_text(epub_path, convert_to_txt=False, text_to_find=None):
    """Stream-extract the text content of an EPUB file and count its words.

    Walks the EPUB package (container.xml -> OPF manifest -> xhtml chapters),
    extracts the plain text of each chapter with BeautifulSoup, and keeps
    running word/character totals via count_words_and_chars().

    Args:
        epub_path: Path to the .epub file.
        convert_to_txt: When True, stream every chapter's text into
            ``<epub_path>.txt`` as it is parsed.
        text_to_find: Optional search string; chapters containing it are
            collected and dumped to ``text_finding_result.json``.

    Returns:
        ``[total_word_count, total_char_count, total_zh_word_count,
        total_en_word_count]`` — all zeros if the EPUB cannot be processed.
    """
    output = None  # kept at function scope so `finally` can close it on any error path
    try:
        text_find_result_dt = {}  # chapter href -> chapter text, for text_to_find hits
        output_file_path = epub_path + ".txt"
        if convert_to_txt:
            # Open once up front; chapters are streamed into it as they are parsed.
            output = open(output_file_path, 'w', encoding="utf-8")
        with zipfile.ZipFile(epub_path, 'r') as epub:
            # Locate the package root (.opf) via the fixed-name container manifest.
            with epub.open("META-INF/container.xml") as container:
                root = ET.parse(container).getroot()
                rootfile_path = root.find(
                    ".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile"
                ).attrib['full-path']
            print(f"rootfile:\t{rootfile_path}")
            # Read the OPF manifest and keep only the xhtml content documents.
            with epub.open(rootfile_path) as rootfile:
                opf_root = ET.parse(rootfile).getroot()
                items = opf_root.findall(".//{http://www.idpf.org/2007/opf}item")
                text_files = [item.attrib['href'] for item in items
                              if item.attrib['media-type'] == 'application/xhtml+xml']
            total_word_count = total_char_count = total_zh_word_count = total_en_word_count = 0
            # Manifest hrefs are relative to the OPF's directory; loop-invariant, so hoisted.
            dirname = os.path.dirname(rootfile_path)
            for text_file in text_files:
                print(f"File:\t{text_file}")
                try:
                    if dirname == "":
                        text_fpath = text_file
                    elif "\\" in text_file:
                        # Some malformed archives use backslashes in entry names; mirror them.
                        text_fpath = dirname + "\\" + text_file
                    else:
                        text_fpath = dirname + "/" + text_file
                    # BeautifulSoup is more tolerant of real-world xhtml than ET.
                    with epub.open(text_fpath) as tf:
                        bs4obj = BeautifulSoup(tf.read(), 'lxml')
                        text = bs4obj.get_text().strip()
                    if convert_to_txt:
                        output.write(text)
                        output.write("\n")
                    if text_to_find is not None and text_to_find in text:
                        text_find_result_dt[text_file] = text
                    word_count, char_count, chinese_word_count, english_word_count = count_words_and_chars(text)
                    print(f"\tword_count:\t{word_count}")
                    print(f"\tchar_count:\t{char_count}")
                    print(f"\tchinese_word_count:\t{chinese_word_count}")
                    print(f"\tenglish_word_count:\t{english_word_count}")
                    total_word_count += word_count
                    total_char_count += char_count
                    total_zh_word_count += chinese_word_count
                    total_en_word_count += english_word_count
                except Exception as e:
                    # Best-effort per chapter: a broken file is reported, not fatal.
                    print(f"Warning: Failed to parse `{text_file}` - message:{e}", file=sys.stderr)
        if convert_to_txt:
            output.close()
            output = None  # already closed; stop `finally` from closing twice
            print(f"Text file is saved as `{output_file_path}`.")
        if text_to_find is not None:
            print("\n\n===> Text finding result: <===")
            print(text_find_result_dt.keys())
            json_text = json.dumps(text_find_result_dt, ensure_ascii=False, indent="\t")
            with open("text_finding_result.json", 'w', encoding="utf-8") as jsonfile:
                jsonfile.write(json_text)
            print("See more details in `text_finding_result.json`")
        return [total_word_count, total_char_count, total_zh_word_count, total_en_word_count]
    except Exception as e:
        print(f"Error: Unable to process EPUB file - {e}", file=sys.stderr)
        return [0, 0, 0, 0]
    finally:
        # Fixes a leak in the original: on the error path the output handle
        # was never closed.
        if output is not None:
            output.close()
def main():
    """Command-line entry point: validate arguments, then run extract_epub_text.

    Usage: python epub_convert_tool.py <epub_file> [convert_to_txt] [text_to_find]
    """
    if len(sys.argv) < 2:
        print("Usage: python epub_convert_tool.py <epub_file> [convert_to_txt] [text_to_find]")
        print("\t<epub_file> Input file path")
        print("\t[convert_to_txt] 0: not convert, 1: convert to a txt file in the same name")
        print("\t[text_to_find] Text you want to find")
        sys.exit(1)

    # NOTE(review): the argv-parsing assignments were lost in extraction; this
    # is the obvious reconstruction implied by the usage text and the call
    # below — confirm against the original script.
    epub_file = sys.argv[1]
    convert_to_txt = len(sys.argv) > 2 and sys.argv[2] == "1"
    text_to_find = sys.argv[3] if len(sys.argv) > 3 else None

    if not os.path.isfile(epub_file) or not epub_file.lower().endswith('.epub'):
        print("Error: Please provide a valid EPUB file.")
        sys.exit(1)

    wc, cc, zh_wc, en_wc = extract_epub_text(epub_file, convert_to_txt, text_to_find)
    print(f"\n\n>>> total count of `{epub_file}` <<<\n")
    print(f"word_count:\t{wc}")
    print(f"char_count:\t{cc}")
    print(f"chinese_word_count:\t{zh_wc}")
    print(f"english_word_count:\t{en_wc}")


if __name__ == "__main__":
    main()
三、运行效果
这是一个命令行应用,因此直接运行时,会得到下面这样的帮助文档。
Usage: python epub_convert_tool.py <epub_file> [convert_to_txt] [text_to_find]
	<epub_file> Input file path
	[convert_to_txt] 0: not convert, 1: convert to a txt file in the same name
	[text_to_find] Text you want to find
File: Text/part0142.xhtml word_count: 2976 char_count: 3458 chinese_word_count: 2976 english_word_count: 0 File: Text/part0143.xhtml word_count: 1912 char_count: 2234 chinese_word_count: 1912 english_word_count: 0 Text file is saved as `李娟阿勒泰系列典藏合集.epub.txt`.
===> Text finding result: <=== dict_keys(['Text/part0018.xhtml', 'Text/part0038.xhtml', 'Text/part0059.xhtml', 'Text/part0064.xhtml', 'Text/part0079.xhtml', 'Text/part0096.xhtml', 'Text/part0097.xhtml', 'Text/part0119.xhtml']) See more details in `text_finding_result.json`