海门区上往网

使用Python快速从PDF的指定页面中提取文本

2026-04-05 13:41:02 浏览次数:1
详细信息

方法一:使用PyPDF2(简单快速)

import PyPDF2

def extract_text_from_pages(pdf_path, page_numbers, *, one_based=True):
    """Extract text from selected pages of a PDF using PyPDF2.

    Args:
        pdf_path: Path of the PDF file to read.
        page_numbers: Iterable of integer page numbers to extract.
        one_based: When True (the default) page numbers start at 1. Pass
            False to supply 0-based page indexes instead.

    Returns:
        A dict keyed by the page numbers exactly as they were passed in. Each
        value is the text returned by the page's ``extract_text()``, or None
        when that page number lies outside the document. Returns None (after
        printing the error) if anything fails while opening or reading the
        file.
    """
    result = {}

    # Use one explicit offset for the whole call. Guessing per value
    # ("positive means 1-based") mapped both 0 and 1 to the first page, so
    # 0-based input silently produced the wrong pages.
    offset = 1 if one_based else 0

    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)

            print(f"PDF总页数: {total_pages}")

            # Valid range in the caller's own numbering, for the warning text.
            first_valid = offset
            last_valid = total_pages - 1 + offset

            for page_num in page_numbers:
                index = page_num - offset
                if 0 <= index < total_pages:
                    result[page_num] = pdf_reader.pages[index].extract_text()
                else:
                    print(f"警告: 页码 {page_num} 超出范围 ({first_valid}-{last_valid})")
                    result[page_num] = None

        return result

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # None instead of propagating the exception to the caller.
        print(f"读取PDF文件时出错: {e}")
        return None

# 使用示例
if __name__ == "__main__":
    # Page numbers are 1-based here: the first, third and fifth page.
    pages_to_extract = [1, 3, 5]
    extracted_text = extract_text_from_pages("sample.pdf", pages_to_extract)

    # Print a preview (at most 500 characters) of every page that has text.
    # A failed extraction (None) is treated like an empty result.
    for page_num, text in (extracted_text or {}).items():
        if not text:
            continue
        print(f"=== 第 {page_num} 页内容 ===")
        print(text if len(text) <= 500 else text[:500] + "...")
        print("-" * 50)

方法二:使用pdfplumber(更好的文本提取)

import pdfplumber

def extract_text_with_pdfplumber(pdf_path, page_numbers, output_txt=False):
    """Extract text and tables from selected pages of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.
        page_numbers: Iterable of 1-based page numbers to extract.
        output_txt: When true, also write the extracted text to
            ``extracted_pages.txt`` in the current working directory.

    Returns:
        A dict keyed by page number. Each value is a dict with the keys
        ``'text'``, ``'tables'`` and ``'has_tables'``, or None when the page
        number is out of range. Returns None (after printing the error) if
        anything fails while reading the PDF or writing the text file.
    """
    extracted = {}

    try:
        with pdfplumber.open(pdf_path) as document:
            page_count = len(document.pages)
            print(f"PDF总页数: {page_count}")

            for number in page_numbers:
                # Guard clause: record out-of-range pages as None and move on.
                if not 1 <= number <= page_count:
                    print(f"警告: 页码 {number} 超出范围 (1-{page_count})")
                    extracted[number] = None
                    continue

                current = document.pages[number - 1]
                page_text = current.extract_text()
                page_tables = current.extract_tables()
                table_total = len(page_tables)

                extracted[number] = {
                    'text': page_text,
                    'tables': page_tables,
                    'has_tables': table_total > 0,
                }

                print(f"第 {number} 页提取完成 - 表格数量: {table_total}")

        # Optional text dump; pages without any text are left out of the file.
        if output_txt and extracted:
            with open("extracted_pages.txt", "w", encoding="utf-8") as sink:
                for number, entry in extracted.items():
                    if not entry or not entry['text']:
                        continue
                    sink.write(f"=== 第 {number} 页 ===\n")
                    sink.write(entry['text'])
                    sink.write("\n\n")

        return extracted

    except Exception as err:
        print(f"处理PDF时出错: {err}")
        return None

# 使用示例
if __name__ == "__main__":
    # Extract the first three pages and also dump their text to a file.
    pages = [1, 2, 3]
    result = extract_text_with_pdfplumber("sample.pdf", pages, output_txt=True)

    # Summarise every page that was actually extracted.
    if result:
        for page_num, content in result.items():
            if content:
                # The stored text is whatever extract_text() returned; if that
                # is None (possible with some pdfplumber versions - confirm
                # for the version in use), count it as empty so len() cannot
                # raise TypeError.
                text = content['text'] or ""
                print(f"\n第 {page_num} 页:")
                print(f"文本长度: {len(text)} 字符")
                if content['has_tables']:
                    print(f"发现 {len(content['tables'])} 个表格")

方法三:批量提取并保存的高级版本

import os
import json
from PyPDF2 import PdfReader
import pdfplumber
from typing import List, Dict, Union

class PDFTextExtractor:
    """Extract text (and optionally tables) from selected pages of one PDF.

    The extractor is bound to a single file path. Pages can be read with
    either pdfplumber or PyPDF2, and the results can be written out as
    text, JSON or CSV next to the current working directory.
    """

    def __init__(self, pdf_path: str):
        """Remember *pdf_path* and try to read the document's page count."""
        self.pdf_path = pdf_path
        # Stays 0 when the file cannot be read during initialisation.
        self.total_pages = 0
        self._initialize()

    def _initialize(self) -> None:
        """Read the page count with PyPDF2 and print a short summary.

        Failures are printed instead of raised, leaving ``total_pages`` at 0.
        """
        try:
            with open(self.pdf_path, 'rb') as f:
                reader = PdfReader(f)
                self.total_pages = len(reader.pages)
                print(f"已加载PDF: {os.path.basename(self.pdf_path)}")
                print(f"总页数: {self.total_pages}")
        except Exception as e:
            print(f"初始化失败: {e}")

    def extract_pages(self, page_numbers: List[int],
                     method: str = "pdfplumber",
                     include_tables: bool = False) -> Dict:
        """Extract the given pages with the chosen backend.

        Args:
            page_numbers: 1-based page numbers to extract.
            method: Backend to use, either "pdfplumber" or "pypdf2".
            include_tables: Also extract tables (pdfplumber backend only).

        Returns:
            A dict keyed by page number whose values are dicts containing at
            least a "text" entry. Invalid page numbers are skipped. The dict
            is empty when *page_numbers* is empty, *method* is unknown, or
            the backend failed.
        """
        if not page_numbers:
            return {}

        results = {}

        if method == "pdfplumber":
            results = self._extract_with_pdfplumber(page_numbers, include_tables)
        elif method == "pypdf2":
            results = self._extract_with_pypdf2(page_numbers)
        else:
            print(f"未知方法: {method}")

        return results

    def _extract_with_pdfplumber(self, page_numbers: List[int],
                                include_tables: bool) -> Dict:
        """Extract pages with pdfplumber, optionally including tables.

        Errors are printed and whatever was collected so far is returned.
        """
        results = {}
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                # Validate against the document that is actually open. The
                # cached self.total_pages is 0 if initialisation failed, which
                # would otherwise cause every page to be skipped.
                page_count = len(pdf.pages)
                for page_num in page_numbers:
                    if 1 <= page_num <= page_count:
                        page_info = pdf.pages[page_num - 1]
                        text = page_info.extract_text()

                        result = {"text": text}

                        if include_tables:
                            tables = page_info.extract_tables()
                            result["tables"] = tables
                            result["table_count"] = len(tables)

                        results[page_num] = result
                    else:
                        print(f"跳过无效页码: {page_num}")
        except Exception as e:
            print(f"pdfplumber提取失败: {e}")

        return results

    def _extract_with_pypdf2(self, page_numbers: List[int]) -> Dict:
        """Extract pages with PyPDF2 (text only).

        Errors are printed and whatever was collected so far is returned.
        """
        results = {}
        try:
            with open(self.pdf_path, 'rb') as f:
                reader = PdfReader(f)
                # Same reasoning as the pdfplumber backend: trust the reader
                # in hand rather than the count cached at construction time.
                page_count = len(reader.pages)
                for page_num in page_numbers:
                    if 1 <= page_num <= page_count:
                        page = reader.pages[page_num - 1]
                        text = page.extract_text()
                        results[page_num] = {"text": text}
                    else:
                        print(f"跳过无效页码: {page_num}")
        except Exception as e:
            print(f"PyPDF2提取失败: {e}")

        return results

    @staticmethod
    def _page_text(content: Dict) -> str:
        """Return the page's text, treating a missing or None value as ""."""
        # dict.get's default only covers a missing key; a stored None must be
        # normalised too, or file.write()/str.replace() below would raise.
        return content.get("text") or ""

    def save_results(self, results: Dict, output_format: str = "txt") -> None:
        """Write extraction results to a file in the current directory.

        The file is named ``<pdf base name>_extracted.<ext>``.

        Args:
            results: Mapping of page number to page dict, as returned by
                ``extract_pages``.
            output_format: One of "txt", "json" or "csv". Any other value is
                reported and nothing is written.
        """
        if not results:
            print("没有结果可保存")
            return

        base_name = os.path.splitext(os.path.basename(self.pdf_path))[0]

        if output_format == "txt":
            filename = f"{base_name}_extracted.txt"
            with open(filename, "w", encoding="utf-8") as f:
                for page_num, content in results.items():
                    f.write(f"=== Page {page_num} ===\n")
                    f.write(self._page_text(content))
                    f.write("\n\n")
            print(f"结果已保存到: {filename}")

        elif output_format == "json":
            filename = f"{base_name}_extracted.json"
            # Keep only text and table count; JSON object keys must be strings.
            serializable_results = {
                str(page_num): {
                    "text": self._page_text(content),
                    "table_count": content.get("table_count", 0),
                }
                for page_num, content in results.items()
            }

            with open(filename, "w", encoding="utf-8") as f:
                json.dump(serializable_results, f, ensure_ascii=False, indent=2)
            print(f"结果已保存到: {filename}")

        elif output_format == "csv":
            import csv
            filename = f"{base_name}_extracted.csv"
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["Page", "Text", "Table_Count"])
                for page_num, content in results.items():
                    # Flatten newlines so each page stays on one CSV row.
                    text_clean = self._page_text(content).replace("\n", " ").strip()
                    table_count = content.get("table_count", 0)
                    writer.writerow([page_num, text_clean, table_count])
            print(f"结果已保存到: {filename}")

        else:
            # Previously an unsupported format was ignored without any hint.
            print(f"未知输出格式: {output_format}")

# 使用示例
if __name__ == "__main__":
    # 1. Create the extractor for one document.
    extractor = PDFTextExtractor("your_document.pdf")

    # 2. Choose the pages to extract (1-based).
    pages_to_extract = [1, 3, 5, 7]

    # 3. Extract text with pdfplumber, including tables.
    print("正在提取文本...")
    results = extractor.extract_pages(
        page_numbers=pages_to_extract,
        method="pdfplumber",
        include_tables=True,
    )

    # 4. Print per-page statistics.
    print(f"\n提取完成!共处理 {len(results)} 页")
    for page_num, content in results.items():
        # A "text" entry that is present but None would bypass dict.get's
        # default and make len() raise TypeError, so normalise it to "".
        text_len = len(content.get("text") or "")
        tables = content.get("table_count", 0)
        print(f"第 {page_num} 页: {text_len} 字符, {tables} 个表格")

    # 5. Save the results as JSON.
    extractor.save_results(results, output_format="json")

安装依赖

# 安装PyPDF2
pip install PyPDF2

# 安装pdfplumber(推荐,提取效果更好)
pip install pdfplumber pillow

# 可选:安装pandas用于表格处理
pip install pandas

选择建议

- PyPDF2: 简单快速,适合基本文本提取。
- pdfplumber: 文本提取更精确,支持表格识别,推荐使用。
- 批量处理: 使用方法三中的类结构,适合生产环境。

这个解决方案提供了灵活的选项,您可以根据具体需求选择合适的方法。

相关推荐