Python办公库4——PyPDF2 – PDF文件处理专家
PyPDF2 – PDF文件处理专家
📖 简介
PyPDF2是Python中处理PDF文件的经典库,支持PDF文件的读取、合并、拆分、旋转、加密等操作。虽然功能相对基础,但对于日常的PDF处理任务来说已经足够强大且稳定。

🚀 安装
pip install PyPDF2
# 推荐同时安装用于文本提取的库
pip install PyPDF2 pdfplumber
📋 基础操作
1. 读取PDF文件
import PyPDF2
import os
def read_pdf_info(file_path):
    """Print basic information about a PDF file and return its reader.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        The ``PyPDF2.PdfReader`` created for ``file_path``.
    """
    # Hand the path to PdfReader instead of a file object opened in a
    # ``with`` block: the reader returned to the caller must not depend on a
    # file handle that this function has already closed.
    pdf_reader = PyPDF2.PdfReader(file_path)

    num_pages = len(pdf_reader.pages)
    # ``metadata`` is None when the document has no information dictionary;
    # fall back to an empty mapping so every field prints its default.
    metadata = pdf_reader.metadata or {}

    print(f"文件路径: {file_path}")
    print(f"页数: {num_pages}")
    print(f"标题: {metadata.get('/Title', '未知')}")
    print(f"作者: {metadata.get('/Author', '未知')}")
    print(f"创建者: {metadata.get('/Creator', '未知')}")
    print(f"制作者: {metadata.get('/Producer', '未知')}")
    return pdf_reader
# 使用示例
# pdf_reader = read_pdf_info('example.pdf')
2. 提取文本内容
def extract_text_from_pdf(file_path, page_num=None):
    """Extract text from one page, or from every page, of a PDF file.

    Args:
        file_path: Path of the PDF file to read.
        page_num: Zero-based index of the page to extract. When ``None``
            (the default) the text of all pages is returned.

    Returns:
        For a single page, that page's extracted text. For the whole
        document, the text of every page, each preceded by a
        ``--- 第 N 页 ---`` header line and followed by a blank line.
        ``None`` (after printing a message) when ``page_num`` is outside
        the document's page range.
    """
    with open(file_path, 'rb') as file:
        pages = PyPDF2.PdfReader(file).pages

        if page_num is None:
            # Join the per-page chunks once instead of growing a string with
            # ``+=``; the loop index gets its own name so it no longer
            # shadows the ``page_num`` parameter.
            return "".join(
                f"--- 第 {index} 页 ---\n" + page.extract_text() + "\n\n"
                for index, page in enumerate(pages, start=1)
            )

        if 0 <= page_num < len(pages):
            return pages[page_num].extract_text()

        print(f"页面 {page_num} 不存在")
        return None
# 使用示例
# text = extract_text_from_pdf('example.pdf', 0) # 提取第一页
# all_text = extract_text_from_pdf('example.pdf') # 提取所有页面
📄 PDF文件操作
1. 合并PDF文件
def merge_pdfs(pdf_list, output_path):
    """Merge several PDF files into a single output file.

    Paths in ``pdf_list`` that do not exist are reported and skipped. Any
    exception raised while appending or writing is caught and printed
    rather than propagated, and the merger is closed in every case.

    Args:
        pdf_list: Iterable of paths of the PDF files to merge, in order.
        output_path: Path of the merged PDF file to write.
    """
    pdf_merger = PyPDF2.PdfMerger()
    try:
        for pdf_file in pdf_list:
            if not os.path.exists(pdf_file):
                print(f"文件不存在: {pdf_file}")
                continue
            pdf_merger.append(pdf_file)
            print(f"已添加: {pdf_file}")

        # Write the merged document.
        with open(output_path, 'wb') as output_file:
            pdf_merger.write(output_file)
        print(f"合并完成,保存为: {output_path}")
    except Exception as e:
        print(f"合并过程中出错: {e}")
    finally:
        pdf_merger.close()
# Usage example
# NOTE: unlike the other usage examples in this file, these two statements
# are not commented out, so they run as soon as this code is executed.
pdf_files = ['file1.pdf', 'file2.pdf', 'file3.pdf']
merge_pdfs(pdf_files, 'merged_document.pdf')
2. 拆分PDF文件
def split_pdf(input_path, output_dir):
    """Split a PDF file into one single-page PDF per page.

    Each page is written to ``<output_dir>/page_<N>.pdf`` where ``N`` starts
    at 1. ``output_dir`` is created when it does not exist yet.

    Args:
        input_path: Path of the PDF file to split.
        output_dir: Directory that receives the single-page files.
    """
    # ``exist_ok`` replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(output_dir, exist_ok=True)

    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for number, page in enumerate(pdf_reader.pages, start=1):
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)

            output_filename = f"{output_dir}/page_{number}.pdf"
            with open(output_filename, 'wb') as output_file:
                pdf_writer.write(output_file)
            print(f"已保存: {output_filename}")
# 使用示例
# split_pdf('document.pdf', 'split_pages')
3. 提取指定页面范围
def extract_pages(input_path, output_path, start_page, end_page):
    """Copy an inclusive range of pages into a new PDF file.

    Args:
        input_path: Path of the source PDF file.
        output_path: Path of the PDF file to write.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).

    Returns:
        None. When the range is invalid (negative start, end beyond the last
        page, or start after end) a message is printed and nothing is
        written.
    """
    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        total_pages = len(pdf_reader.pages)

        # Reject the request before any output file is created.
        range_is_valid = 0 <= start_page <= end_page < total_pages
        if not range_is_valid:
            print(f"页面范围错误。总页数: {total_pages}")
            return

        pdf_writer = PyPDF2.PdfWriter()
        for page_num in range(start_page, end_page + 1):
            pdf_writer.add_page(pdf_reader.pages[page_num])

        # Write while the input file is still open: page content is read
        # from it when the writer serialises the pages.
        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
        print(f"已提取第 {start_page + 1} 到第 {end_page + 1} 页,保存为: {output_path}")
# 使用示例
# extract_pages('document.pdf', 'extracted_pages.pdf', 0, 4) # 提取前5页
🔄 页面操作
1. 旋转页面
def rotate_pages(input_path, output_path, rotation_angle):
    """Rotate every page of a PDF file and save the result.

    Args:
        input_path: Path of the source PDF file.
        output_path: Path of the rotated PDF file to write.
        rotation_angle: Angle passed to ``page.rotate``; the original
            example uses 90, 180 or 270 degrees.
    """
    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        pdf_writer = PyPDF2.PdfWriter()

        for page in pdf_reader.pages:
            # The value returned by rotate() is what gets added to the writer.
            pdf_writer.add_page(page.rotate(rotation_angle))

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
        print(f"已旋转 {rotation_angle} 度,保存为: {output_path}")
# 使用示例
# rotate_pages('document.pdf', 'rotated_document.pdf', 90)
2. 缩放页面
def scale_pages(input_path, output_path, scale_factor):
    """Scale every page of a PDF file by one factor and save the result.

    Args:
        input_path: Path of the source PDF file.
        output_path: Path of the scaled PDF file to write.
        scale_factor: Factor applied to both the horizontal and the vertical
            axis (for example ``0.8`` shrinks pages to 80%).
    """
    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        pdf_writer = PyPDF2.PdfWriter()

        for page in pdf_reader.pages:
            # scale() modifies the page in place and returns None, so add the
            # page object itself rather than the call's return value.
            page.scale(scale_factor, scale_factor)
            pdf_writer.add_page(page)

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
        print(f"已缩放 {scale_factor} 倍,保存为: {output_path}")
# 使用示例
# scale_pages('document.pdf', 'scaled_document.pdf', 0.8) # 缩小到80%
🔐 PDF加密与解密
1. 加密PDF文件
def encrypt_pdf(input_path, output_path, password):
    """Write a password-protected copy of a PDF file.

    Args:
        input_path: Path of the source PDF file.
        output_path: Path of the encrypted PDF file to write.
        password: Password passed to ``PdfWriter.encrypt``.
    """
    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        pdf_writer = PyPDF2.PdfWriter()

        # Copy every page, then enable encryption before writing.
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)
        pdf_writer.encrypt(password)

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
        print(f"已加密保存为: {output_path}")
# 使用示例
# encrypt_pdf('document.pdf', 'encrypted_document.pdf', 'mypassword123')
2. 解密PDF文件
def decrypt_pdf(input_path, output_path, password):
    """Write an unprotected copy of a password-protected PDF file.

    Nothing is written when the input is not encrypted or when ``password``
    does not unlock it; a message is printed in both cases.

    Args:
        input_path: Path of the encrypted PDF file.
        output_path: Path of the decrypted PDF file to write.
        password: Password used to unlock the document.
    """
    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        if not pdf_reader.is_encrypted:
            print("PDF文件未加密")
            return

        # A falsy result from decrypt() is treated as a wrong password.
        if not pdf_reader.decrypt(password):
            print("密码错误,解密失败")
            return

        pdf_writer = PyPDF2.PdfWriter()
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
        print(f"解密成功,保存为: {output_path}")
# 使用示例
# decrypt_pdf('encrypted_document.pdf', 'decrypted_document.pdf', 'mypassword123')
🎯 实战案例:批量处理PDF报告
import PyPDF2
import os
from datetime import datetime
import glob
class PDFProcessor:
    """Batch operations over every ``*.pdf`` file in one directory.

    Results (text dumps, merged file, split pages, summary report) are
    written below ``output_dir``.
    """

    def __init__(self, input_dir, output_dir):
        """Remember both directories and create ``output_dir`` if needed.

        Args:
            input_dir: Directory that is searched for ``*.pdf`` files.
            output_dir: Directory that receives all generated files.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        # exist_ok avoids the race between an exists() check and makedirs().
        os.makedirs(output_dir, exist_ok=True)

    def get_pdf_files(self):
        """Return the paths of all ``*.pdf`` files directly in ``input_dir``.

        The list is sorted so that merge order and report numbering do not
        depend on the order in which the file system lists the directory.
        """
        pattern = os.path.join(self.input_dir, "*.pdf")
        return sorted(glob.glob(pattern))

    def extract_all_text(self):
        """Write the text of each PDF to ``<name>_text.txt`` in ``output_dir``.

        A failure on one file is printed and does not stop the remaining
        files from being processed.
        """
        for pdf_file in self.get_pdf_files():
            try:
                stem = os.path.splitext(os.path.basename(pdf_file))[0]
                text_file = os.path.join(self.output_dir, f"{stem}_text.txt")

                with open(pdf_file, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    all_text = "".join(
                        f"=== 第 {number} 页 ===\n" + page.extract_text() + "\n\n"
                        for number, page in enumerate(pdf_reader.pages, start=1)
                    )

                with open(text_file, 'w', encoding='utf-8') as f:
                    f.write(all_text)
                print(f"已提取文本: {text_file}")
            except Exception as e:
                print(f"处理 {pdf_file} 时出错: {e}")

    def merge_all_pdfs(self, output_filename="merged_all.pdf"):
        """Merge every PDF in ``input_dir`` into one file in ``output_dir``.

        Args:
            output_filename: Name of the merged file inside ``output_dir``.

        Returns:
            None. When no PDF file is found a message is printed and no
            output file is created. Errors during merging are printed, not
            raised.
        """
        pdf_files = self.get_pdf_files()
        if not pdf_files:
            print("没有找到PDF文件")
            return

        pdf_merger = PyPDF2.PdfMerger()
        try:
            for pdf_file in pdf_files:
                pdf_merger.append(pdf_file)
                print(f"已添加: {os.path.basename(pdf_file)}")

            output_path = os.path.join(self.output_dir, output_filename)
            with open(output_path, 'wb') as output_file:
                pdf_merger.write(output_file)
            print(f"合并完成: {output_path}")
        except Exception as e:
            print(f"合并过程中出错: {e}")
        finally:
            pdf_merger.close()

    def split_all_pdfs(self):
        """Split each PDF into single-page files under ``<name>_split``.

        A failure on one file is printed and does not stop the remaining
        files from being processed.
        """
        for pdf_file in self.get_pdf_files():
            try:
                stem = os.path.splitext(os.path.basename(pdf_file))[0]
                split_dir = os.path.join(self.output_dir, f"{stem}_split")
                os.makedirs(split_dir, exist_ok=True)

                with open(pdf_file, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for number, page in enumerate(pdf_reader.pages, start=1):
                        pdf_writer = PyPDF2.PdfWriter()
                        pdf_writer.add_page(page)
                        output_filename = os.path.join(split_dir, f"page_{number}.pdf")
                        with open(output_filename, 'wb') as output_file:
                            pdf_writer.write(output_file)
                print(f"已拆分: {pdf_file} -> {split_dir}")
            except Exception as e:
                print(f"拆分 {pdf_file} 时出错: {e}")

    def create_summary_report(self):
        """Write ``pdf_summary_report.txt`` and return its content.

        The report lists, for every PDF, its page count, size in KB, title
        and author, followed by the total page count. A file that cannot be
        read gets a single failure line and does not contribute to the total.

        Returns:
            The full report text that was written to disk.
        """
        pdf_files = self.get_pdf_files()

        parts = [
            "PDF文件处理报告\n",
            f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
            f"处理目录: {self.input_dir}\n",
            f"文件总数: {len(pdf_files)}\n\n",
        ]

        total_pages = 0
        for i, pdf_file in enumerate(pdf_files, 1):
            name = os.path.basename(pdf_file)
            try:
                with open(pdf_file, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    num_pages = len(pdf_reader.pages)
                    # metadata is None when the PDF has no info dictionary.
                    metadata = pdf_reader.metadata or {}
                    title = metadata.get('/Title', '未知')
                    author = metadata.get('/Author', '未知')
                file_size = os.path.getsize(pdf_file) / 1024  # KB
            except Exception as e:
                parts.append(f"{i}. {name} - 读取失败: {e}\n\n")
                continue

            # Commit the entry and the page count together, only after every
            # value was obtained, so a failing file never leaves a partial
            # entry or inflates the total.
            total_pages += num_pages
            parts.append(
                f"{i}. {name}\n"
                f" 页数: {num_pages}\n"
                f" 大小: {file_size:.2f} KB\n"
                f" 标题: {title}\n"
                f" 作者: {author}\n\n"
            )

        parts.append(f"总页数: {total_pages}\n")
        report_content = "".join(parts)

        report_file = os.path.join(self.output_dir, "pdf_summary_report.txt")
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report_content)
        print(f"汇总报告已生成: {report_file}")
        return report_content
def main():
    """Show the batch-processing menu and run the chosen operation(s).

    Reads one line from standard input. Choices 1-4 run a single
    ``PDFProcessor`` operation, 5 runs all four in menu order, and anything
    else prints an error message.
    """
    processor = PDFProcessor("input_pdfs", "output_results")

    print("=== PDF批量处理工具 ===")
    print("1. 提取所有文本")
    print("2. 合并所有PDF")
    print("3. 拆分所有PDF")
    print("4. 生成汇总报告")
    print("5. 执行所有操作")
    choice = input("请选择操作 (1-5): ")

    # Insertion order matters: option 5 runs these in exactly this order.
    single_actions = {
        "1": processor.extract_all_text,
        "2": processor.merge_all_pdfs,
        "3": processor.split_all_pdfs,
        "4": processor.create_summary_report,
    }

    if choice in single_actions:
        single_actions[choice]()
    elif choice == "5":
        print("执行所有操作...")
        for action in single_actions.values():
            action()
        print("所有操作完成!")
    else:
        print("无效选择")
# Run the interactive menu only when executed as a script, not on import.
if __name__ == "__main__":
    main()
🔧 高级功能
1. 添加水印
def add_watermark(input_path, watermark_path, output_path):
    """Merge the first page of a watermark PDF onto every page of a PDF.

    Args:
        input_path: Path of the PDF file to watermark.
        watermark_path: Path of the PDF whose first page is the watermark.
        output_path: Path of the watermarked PDF file to write.
    """
    with open(input_path, 'rb') as input_file, open(watermark_path, 'rb') as watermark_file:
        input_pdf = PyPDF2.PdfReader(input_file)
        # Only the first page of the watermark document is used.
        watermark_page = PyPDF2.PdfReader(watermark_file).pages[0]

        pdf_writer = PyPDF2.PdfWriter()
        for page in input_pdf.pages:
            page.merge_page(watermark_page)
            pdf_writer.add_page(page)

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
        print(f"已添加水印,保存为: {output_path}")
2. 书签操作
def add_bookmarks(input_path, output_path, bookmarks):
    """Copy a PDF file and add outline entries (bookmarks) to the copy.

    Args:
        input_path: Path of the source PDF file.
        output_path: Path of the bookmarked PDF file to write.
        bookmarks: Iterable of ``(title, page_num)`` pairs; each pair is
            passed to ``PdfWriter.add_outline_item``.
    """
    with open(input_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        pdf_writer = PyPDF2.PdfWriter()

        # Pages are copied first, then the outline entries are added.
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)

        for title, page_num in bookmarks:
            pdf_writer.add_outline_item(title, page_num)

        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)
        print(f"已添加书签,保存为: {output_path}")
# Usage example
# Each entry is a (title, page_num) pair, the shape add_bookmarks unpacks.
bookmarks = [
("第一章", 0),
("第二章", 5),
("第三章", 10)
]
# add_bookmarks('document.pdf', 'bookmarked_document.pdf', bookmarks)
⚡ 性能优化建议
- 内存管理: 处理大文件时使用流式处理
- 批量操作: 一次性处理多个文件,减少IO操作
- 异常处理: 添加适当的错误处理机制
- 文件检查: 处理前检查文件是否存在和可读
def safe_pdf_operation(func):
    """Decorator that reports common PDF errors instead of raising them.

    The wrapped function's return value is passed through unchanged. When
    the call raises ``FileNotFoundError``, ``PyPDF2.errors.PdfReadError`` or
    any other ``Exception``, a message is printed and the wrapper returns
    ``None``.

    Args:
        func: The callable to protect.

    Returns:
        A wrapper that accepts the same arguments as ``func``.
    """
    import functools

    # wraps() keeps the original __name__/__doc__ visible on the wrapper.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except FileNotFoundError:
            print("文件未找到")
        except PyPDF2.errors.PdfReadError:
            print("PDF文件损坏或格式不正确")
        except Exception as e:
            print(f"操作失败: {e}")

    return wrapper
⚠️ 常见问题
- 文本提取不完整: PyPDF2的文本提取功能有限,复杂布局可能提取不准确
- 加密文件: 某些加密方式可能不支持
- 文件损坏: 损坏的PDF文件可能无法正常处理
- 中文支持: 某些中文PDF可能存在编码问题
🎉 总结
PyPDF2是处理PDF文件的实用工具,具有以下特点:
- ✅ 功能稳定:基础PDF操作功能完善
- ✅ 易于使用:API简单直观
- ✅ 轻量级:依赖少,安装简单
- ✅ 开源免费:完全免费使用
虽然在文本提取和复杂操作方面有一定局限性,但对于日常的PDF处理任务来说,PyPDF2仍然是一个可靠的选择!
下一章预告: 我们将学习schedule库,探索Python中的任务调度和定时执行功能。
夜雨聆风
