方法一:使用PyPDF2(简单快速)
import PyPDF2
def extract_text_from_pages(pdf_path, page_numbers, *, zero_based=False):
    """Extract text from selected pages of a PDF using PyPDF2.

    Page numbers are interpreted with a single, consistent base. Previously
    positive numbers were treated as 1-based while 0 was treated as 0-based,
    so both 0 and 1 silently resolved to the first page.

    Args:
        pdf_path: Path of the PDF file to read.
        page_numbers: Iterable of page numbers to extract.
        zero_based: When False (default) page numbers start at 1; when True
            they start at 0.

    Returns:
        A dict mapping each requested page number to whatever
        ``extract_text()`` returned for that page, or to ``None`` when the
        number is out of range. Returns ``None`` (not a dict) when the file
        cannot be opened or parsed.
    """
    result = {}
    # Amount to subtract from a user-facing page number to get a list index.
    offset = 0 if zero_based else 1
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)
            print(f"PDF总页数: {total_pages}")

            # Valid range expressed in the caller's numbering, for warnings.
            first_valid = offset
            last_valid = total_pages - 1 + offset

            for page_num in page_numbers:
                index = page_num - offset
                if 0 <= index < total_pages:
                    result[page_num] = pdf_reader.pages[index].extract_text()
                else:
                    print(f"警告: 页码 {page_num} 超出范围 ({first_valid}-{last_valid})")
                    result[page_num] = None
        return result
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"读取PDF文件时出错: {e}")
        return None
# Usage example
if __name__ == "__main__":
    # Page numbers are 1-based here: extract pages 1, 3 and 5.
    pages_to_extract = [1, 3, 5]
    extracted_text = extract_text_from_pages("sample.pdf", pages_to_extract)

    # Print a preview for every page that produced non-empty text.
    if extracted_text:
        for page_num, text in extracted_text.items():
            if not text:
                continue
            print(f"=== 第 {page_num} 页内容 ===")
            # Show at most the first 500 characters, marking truncation.
            preview = text if len(text) <= 500 else text[:500] + "..."
            print(preview)
            print("-" * 50)
方法二:使用pdfplumber(更好的文本提取)
import pdfplumber
def extract_text_with_pdfplumber(pdf_path, page_numbers, output_txt=False,
                                 *, output_path="extracted_pages.txt"):
    """Extract text and tables from selected pages of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.
        page_numbers: Iterable of 1-based page numbers to extract.
        output_txt: When True, also write the extracted text to a UTF-8
            text file.
        output_path: Destination of that text file. Defaults to
            ``"extracted_pages.txt"``, the name that was previously
            hard-coded.

    Returns:
        A dict mapping each requested page number to a dict with keys
        ``'text'``, ``'tables'`` and ``'has_tables'``, or to ``None`` when
        the page number is out of range. Returns ``None`` (not a dict) when
        any step raises.
    """
    results = {}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            print(f"PDF总页数: {total_pages}")

            for page_num in page_numbers:
                if not 1 <= page_num <= total_pages:
                    print(f"警告: 页码 {page_num} 超出范围 (1-{total_pages})")
                    results[page_num] = None
                    continue

                page = pdf.pages[page_num - 1]
                text = page.extract_text()
                tables = page.extract_tables()
                results[page_num] = {
                    'text': text,
                    'tables': tables,
                    'has_tables': len(tables) > 0,
                }
                print(f"第 {page_num} 页提取完成 - 表格数量: {len(tables)}")

        # The PDF handle is closed by now; only the collected data is needed.
        if output_txt and results:
            with open(output_path, "w", encoding="utf-8") as f:
                for page_num, content in results.items():
                    # Skip out-of-range pages (None) and pages without text.
                    if content and content['text']:
                        f.write(f"=== 第 {page_num} 页 ===\n")
                        f.write(content['text'])
                        f.write("\n\n")
        return results
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print(f"处理PDF时出错: {e}")
        return None
# Usage example
if __name__ == "__main__":
    # Pages to extract (1-based).
    pages = [1, 2, 3]
    result = extract_text_with_pdfplumber("sample.pdf", pages, output_txt=True)

    # Summarize what was extracted for each valid page.
    if result:
        for page_num, content in result.items():
            if content:
                print(f"\n第 {page_num} 页:")
                # The text may be None for a page without extractable text
                # (the extractor guards against the same case when writing
                # the output file), so fall back to "" before taking len().
                print(f"文本长度: {len(content['text'] or '')} 字符")
                if content['has_tables']:
                    print(f"发现 {len(content['tables'])} 个表格")
方法三:批量提取并保存的高级版本
import os
import json
from PyPDF2 import PdfReader
import pdfplumber
from typing import List, Dict, Union
class PDFTextExtractor:
    """Extract text (and optionally tables) from selected pages of one PDF.

    The page count is read once at construction time and stored in
    ``total_pages``; each extraction call reopens the file and validates
    page numbers against the document it actually opened.
    """

    def __init__(self, pdf_path: str):
        """Remember *pdf_path* and try to read the page count."""
        self.pdf_path = pdf_path
        self.total_pages = 0
        self._initialize()

    def _initialize(self) -> None:
        """Read the page count with PyPDF2.

        Failures are reported and leave ``total_pages`` at 0 instead of
        raising, so construction itself never fails.
        """
        try:
            with open(self.pdf_path, 'rb') as f:
                reader = PdfReader(f)
                self.total_pages = len(reader.pages)
            print(f"已加载PDF: {os.path.basename(self.pdf_path)}")
            print(f"总页数: {self.total_pages}")
        except Exception as e:
            print(f"初始化失败: {e}")

    def extract_pages(self, page_numbers: List[int],
                      method: str = "pdfplumber",
                      include_tables: bool = False) -> Dict:
        """Extract text from the given pages.

        Args:
            page_numbers: 1-based page numbers to extract.
            method: Backend to use, ``"pdfplumber"`` or ``"pypdf2"``.
            include_tables: Also extract tables (pdfplumber backend only).

        Returns:
            A dict mapping each valid page number to a dict holding at
            least a ``"text"`` entry. Empty when *page_numbers* is empty,
            the method is unknown, or extraction failed before any page
            was read.
        """
        if not page_numbers:
            return {}
        if method == "pdfplumber":
            return self._extract_with_pdfplumber(page_numbers, include_tables)
        if method == "pypdf2":
            return self._extract_with_pypdf2(page_numbers)
        print(f"未知方法: {method}")
        return {}

    def _extract_with_pdfplumber(self, page_numbers: List[int],
                                 include_tables: bool) -> Dict:
        """Extract with pdfplumber; adds table data when requested."""
        results = {}
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                # Validate against the document just opened: the cached
                # self.total_pages stays 0 if _initialize() failed, which
                # would reject every page.
                total_pages = len(pdf.pages)
                for page_num in page_numbers:
                    if not 1 <= page_num <= total_pages:
                        print(f"跳过无效页码: {page_num}")
                        continue
                    page = pdf.pages[page_num - 1]
                    entry = {"text": page.extract_text()}
                    if include_tables:
                        tables = page.extract_tables()
                        entry["tables"] = tables
                        entry["table_count"] = len(tables)
                    results[page_num] = entry
        except Exception as e:
            # Pages collected before the failure are still returned.
            print(f"pdfplumber提取失败: {e}")
        return results

    def _extract_with_pypdf2(self, page_numbers: List[int]) -> Dict:
        """Extract with PyPDF2 (text only)."""
        results = {}
        try:
            with open(self.pdf_path, 'rb') as f:
                reader = PdfReader(f)
                # Same reasoning as the pdfplumber path: trust the page
                # count of the document opened for this call.
                total_pages = len(reader.pages)
                for page_num in page_numbers:
                    if not 1 <= page_num <= total_pages:
                        print(f"跳过无效页码: {page_num}")
                        continue
                    results[page_num] = {
                        "text": reader.pages[page_num - 1].extract_text()
                    }
        except Exception as e:
            # Pages collected before the failure are still returned.
            print(f"PyPDF2提取失败: {e}")
        return results

    @staticmethod
    def _text_and_table_count(content):
        """Return ``(text, table_count)`` for one result entry.

        ``dict.get("text", "")`` only covers a missing key; an entry whose
        text is explicitly ``None`` would still yield ``None``. This helper
        normalizes both cases (and a ``None`` entry) to ``""``.
        """
        content = content or {}
        return content.get("text") or "", content.get("table_count", 0)

    def save_results(self, results: Dict, output_format: str = "txt") -> None:
        """Write extraction results next to the current working directory.

        The file is named ``<pdf base name>_extracted.<ext>``.

        Args:
            results: Mapping as returned by :meth:`extract_pages`.
            output_format: ``"txt"``, ``"json"`` or ``"csv"``. Any other
                value is reported and nothing is written.
        """
        if not results:
            print("没有结果可保存")
            return

        base_name = os.path.splitext(os.path.basename(self.pdf_path))[0]

        if output_format == "txt":
            filename = f"{base_name}_extracted.txt"
            with open(filename, "w", encoding="utf-8") as f:
                for page_num, content in results.items():
                    text, _ = self._text_and_table_count(content)
                    f.write(f"=== Page {page_num} ===\n")
                    f.write(text)
                    f.write("\n\n")
        elif output_format == "json":
            filename = f"{base_name}_extracted.json"
            # Keep only JSON-friendly fields; keys become strings.
            serializable_results = {}
            for page_num, content in results.items():
                text, table_count = self._text_and_table_count(content)
                serializable_results[str(page_num)] = {
                    "text": text,
                    "table_count": table_count,
                }
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(serializable_results, f, ensure_ascii=False, indent=2)
        elif output_format == "csv":
            import csv
            filename = f"{base_name}_extracted.csv"
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["Page", "Text", "Table_Count"])
                for page_num, content in results.items():
                    text, table_count = self._text_and_table_count(content)
                    # Flatten line breaks so each page stays on one CSV row.
                    text_clean = text.replace("\n", " ").strip()
                    writer.writerow([page_num, text_clean, table_count])
        else:
            # Previously an unknown format fell through without any feedback.
            print(f"不支持的输出格式: {output_format}")
            return

        print(f"结果已保存到: {filename}")
# Usage example
if __name__ == "__main__":
    # 1. Create the extractor.
    extractor = PDFTextExtractor("your_document.pdf")

    # 2. Pages to extract (1-based).
    pages_to_extract = [1, 3, 5, 7]

    # 3. Extract text, using pdfplumber and including tables.
    print("正在提取文本...")
    results = extractor.extract_pages(
        page_numbers=pages_to_extract,
        method="pdfplumber",
        include_tables=True,
    )

    # 4. Print per-page statistics.
    print(f"\n提取完成!共处理 {len(results)} 页")
    for page_num, content in results.items():
        # .get("text", "") returns None when the key exists with a None
        # value, so fall back to "" explicitly before taking len().
        text_len = len(content.get("text") or "")
        tables = content.get("table_count", 0)
        print(f"第 {page_num} 页: {text_len} 字符, {tables} 个表格")

    # 5. Save the results.
    extractor.save_results(results, output_format="json")
安装依赖
# 安装PyPDF2
pip install PyPDF2
# 安装pdfplumber(推荐,提取效果更好)
pip install pdfplumber pillow
# 可选:安装pandas用于表格处理
pip install pandas
选择建议
PyPDF2: 简单快速,适合基本文本提取(注意:PyPDF2 已停止维护,其后续版本为 pypdf,新项目建议改用 pypdf)
pdfplumber: 更精确的文本提取,支持表格识别,推荐使用
批量处理: 使用方法三中的 PDFTextExtractor 类,适合生产环境
这个解决方案提供了灵活的选项,您可以根据具体需求选择合适的方法。