从基础到进阶详解Python处理Word文档的完全指南

Python处理Word文档完全指南：从基础到进阶

1. 基础篇：Word文档处理基础概念

Word文档结构

# Word文档的层次结构
文档(Document)
├── 段落(Paragraph)
│   ├── 文本块(Run) - 格式相同的一段连续文本
│   └── 格式(Formatting)
├── 表格(Table)
│   ├── 行(Row)
│   └── 单元格(Cell)
├── 图片(Image)
├── 页眉页脚(Header/Footer)
└── 样式(Styles)

2. 入门篇：使用python-docx库基础操作

安装与导入

pip install python-docx

from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH

创建和保存文档

# Create a new document
doc = Document()

# Add the document title (heading level 0 is requested)
doc.add_heading('文档标题', level=0)

# Add a body paragraph and keep a reference to it
paragraph = doc.add_paragraph('这是一个段落。')

# Write the document to disk
doc.save('示例文档.docx')

文本操作

from docx.oxml.ns import qn

# Build one paragraph from several runs so that each run can carry
# its own character formatting.
p = doc.add_paragraph()
run = p.add_run('加粗文本')
run.bold = True

run = p.add_run(' 红色文本')
run.font.color.rgb = RGBColor(255, 0, 0)

# Set the font of the second run.
run.font.name = '微软雅黑'
# font.name does not fill the East Asian font slot, which is the one used
# for CJK characters; set it explicitly on the underlying w:rFonts element
# so the Chinese text is really rendered in the requested typeface.
run._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')
run.font.size = Pt(12)

# Center the whole paragraph.
p.alignment = WD_ALIGN_PARAGRAPH.CENTER

列表处理

# Bulleted list: one paragraph per item, each using the 'List Bullet' style
for bullet_text in ('第一项', '第二项'):
    doc.add_paragraph(bullet_text, style='List Bullet')

# Numbered list: one paragraph per step, each using the 'List Number' style
for step_text in ('第一步', '第二步'):
    doc.add_paragraph(step_text, style='List Number')

表格操作

# Create a 3x3 table and apply the 'Table Grid' style
table = doc.add_table(rows=3, cols=3)
table.style = 'Table Grid'

# Label every cell with its row and column index
for i, row in enumerate(table.rows):
    for j, cell in enumerate(row.cells):
        cell.text = f'行{i}列{j}'

# Merge the first row from its first cell through its third cell
first_cell = table.cell(0, 0)
last_cell = table.cell(0, 2)
first_cell.merge(last_cell)

图片插入

from docx.shared import Cm

image_path = 'image.jpg'

# Insert the picture with a width of 10 cm
doc.add_picture(image_path, width=Cm(10))

# Insert a caption paragraph, then the same picture at 4 inches wide
caption_text = '图1: 示例图片'
doc.add_paragraph(caption_text)
doc.add_picture(image_path, width=Inches(4))

3. 进阶篇：高级文档处理技巧

样式管理

from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn

# The document's style collection
styles = doc.styles

# Create a custom paragraph style. The named enum member replaces the
# bare integer 1 so the intended style type is explicit.
style = styles.add_style('CustomStyle', WD_STYLE_TYPE.PARAGRAPH)
style.font.name = '宋体'
# font.name does not fill the East Asian font slot used for CJK text;
# set it on w:rFonts so Chinese characters actually use the typeface.
style.element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
style.font.size = Pt(14)
style.font.bold = True
style.paragraph_format.space_after = Pt(12)

# Apply the style by name to a new paragraph
p = doc.add_paragraph('自定义样式文本', style='CustomStyle')

文档读取与解析

def read_document_info(doc_path):
    """Open the document at ``doc_path`` and return a small summary dict.

    The dict has three keys: '段落数' (length of ``doc.paragraphs``),
    '表格数' (length of ``doc.tables``) and '核心属性' (the document's
    ``core_properties`` object).
    """
    document = Document(doc_path)
    paragraph_count = len(document.paragraphs)
    table_count = len(document.tables)

    return {
        '段落数': paragraph_count,
        '表格数': table_count,
        '核心属性': document.core_properties,
    }

# Walk every paragraph of an existing document and print its text
doc = Document('existing.docx')
for index, para in enumerate(doc.paragraphs):
    print(f'段落 {index}: {para.text}')

    # Print the style name when the paragraph has a style
    para_style = para.style
    if para_style:
        print(f'样式: {para_style.name}')

批量替换文本

def replace_text_in_doc(doc_path, replacements):
    """Replace text in every body paragraph of a document.

    All keys of ``replacements`` are matched in a single pass over each
    paragraph's text, so a replacement value is never itself re-scanned
    for other keys. When one key is a prefix of another, the longer key
    wins. Each inserted replacement is emitted as its own bold run.

    The paragraph is rebuilt from plain text: the runs of a paragraph that
    contains a match are discarded by ``paragraph.clear()`` and replaced by
    new runs. Paragraphs without a match are left untouched.

    Args:
        doc_path: Path of the document to open.
        replacements: Mapping ``{old_text: new_text}``. Empty keys are
            ignored because an empty search string cannot be split on.

    Returns:
        The modified ``Document`` object (not saved to disk).
    """
    import re

    doc = Document(doc_path)

    search_keys = [old_text for old_text in replacements if old_text]
    if not search_keys:
        return doc

    # Longest keys first so that overlapping keys resolve to the longest match.
    search_keys.sort(key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in search_keys))

    for paragraph in doc.paragraphs:
        # Read the text BEFORE clearing; clear() empties the paragraph.
        original_text = paragraph.text
        if not pattern.search(original_text):
            continue

        paragraph.clear()

        position = 0
        for match in pattern.finditer(original_text):
            if match.start() > position:
                paragraph.add_run(original_text[position:match.start()])
            run = paragraph.add_run(replacements[match.group(0)])
            run.bold = True  # formatting of the inserted text; adjust as needed
            position = match.end()

        if position < len(original_text):
            paragraph.add_run(original_text[position:])

    return doc

表格高级操作

class TableProcessor:
    """Read table contents from, and append tables to, a document."""

    def __init__(self, document):
        """Store the document object whose tables will be processed."""
        self.doc = document

    def extract_table_data(self, table_index=0):
        """Return the text of one table as a list of rows.

        Args:
            table_index: Index into ``self.doc.tables``.

        Returns:
            A list with one list per table row, each holding the stripped
            text of that row's cells. An empty list is returned when
            ``table_index`` is past the last table.
        """
        tables = self.doc.tables
        if table_index >= len(tables):
            return []

        return [
            [cell.text.strip() for cell in row.cells]
            for row in tables[table_index].rows
        ]

    def create_table_from_data(self, data, style='Table Grid'):
        """Append a table filled from a sequence of rows.

        Rows may have different lengths: the table is as wide as the
        longest row and the missing trailing cells of shorter rows are
        left empty.

        Args:
            data: Sequence of rows, each a sequence of cell values. Values
                are converted with ``str()``.
            style: Table style assigned to the new table.

        Returns:
            The new table, or ``None`` when ``data`` is empty or contains
            no cells at all.
        """
        if not data:
            return None

        column_count = max(len(row_values) for row_values in data)
        if column_count == 0:
            return None

        table = self.doc.add_table(rows=len(data), cols=column_count)
        table.style = style

        # Walk rows and cells directly instead of calling table.cell(i, j)
        # for every coordinate pair.
        for table_row, row_values in zip(table.rows, data):
            for cell, value in zip(table_row.cells, row_values):
                cell.text = str(value)

        return table

页眉页脚处理

def _append_page_number_field(run):
    """Append a PAGE complex field (begin / instruction / end) to ``run``."""
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    field_begin = OxmlElement('w:fldChar')
    field_begin.set(qn('w:fldCharType'), 'begin')

    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = 'PAGE'

    field_end = OxmlElement('w:fldChar')
    field_end.set(qn('w:fldCharType'), 'end')

    for element in (field_begin, instruction, field_end):
        run._r.append(element)


def add_header_footer(doc, header_text, footer_text):
    """Set centered header and footer text on the first section.

    The footer text is followed by a PAGE field holding the page number.

    Args:
        doc: Document to modify in place.
        header_text: Text written into the first header paragraph.
        footer_text: Text written into the first footer paragraph.
    """
    section = doc.sections[0]

    # Header
    header_para = section.header.paragraphs[0]
    header_para.text = header_text
    header_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Footer
    footer_para = section.footer.paragraphs[0]
    footer_para.text = footer_text
    footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Page number: Run has no add_field() method, so the field is built
    # from raw WordprocessingML elements instead.
    _append_page_number_field(footer_para.add_run())

4. 实战篇：综合应用案例

案例1：自动生成报告

class ReportGenerator:
    """Build a report document section by section."""

    def __init__(self, template_path=None):
        """Open ``template_path`` if given, otherwise start a new document."""
        self.doc = Document(template_path) if template_path else Document()
        # Kept for backward compatibility; nothing in this class fills it.
        self.sections = []

    def add_section(self, title, content, level=1):
        """Add a heading followed by its content and an empty paragraph.

        Args:
            title: Heading text.
            content: A list or tuple adds one 'List Bullet' paragraph per
                item (items are converted with ``str()``); any other value
                is passed to ``add_paragraph`` as a single paragraph.
            level: Heading level.
        """
        self.doc.add_heading(title, level=level)

        if isinstance(content, (list, tuple)):
            for item in content:
                self.doc.add_paragraph(str(item), style='List Bullet')
        else:
            self.doc.add_paragraph(content)

        self.doc.add_paragraph()  # spacer paragraph after the section

    def add_data_table(self, data, title=None):
        """Add a 'Light Shading' table whose first row is the header.

        Rows may have different lengths: the table is as wide as the
        longest row and missing trailing cells are left empty.

        Args:
            data: Sequence of rows; ``data[0]`` is the header row. Values
                are converted with ``str()``.
            title: Optional level-2 heading placed above the table.

        Returns:
            The new table, or ``None`` (nothing is added, not even the
            heading) when ``data`` is empty or contains no cells.
        """
        if not data:
            return None

        column_count = max(len(row_data) for row_data in data)
        if column_count == 0:
            return None

        if title:
            self.doc.add_heading(title, level=2)

        table = self.doc.add_table(rows=1, cols=column_count)
        table.style = 'Light Shading'

        # Header row
        for cell, header in zip(table.rows[0].cells, data[0]):
            cell.text = str(header)

        # Data rows
        for row_data in data[1:]:
            row_cells = table.add_row().cells
            for cell, cell_data in zip(row_cells, row_data):
                cell.text = str(cell_data)

        return table

    def save(self, filename):
        """Save the report to ``filename`` and print a confirmation."""
        self.doc.save(filename)
        print(f"报告已保存至: {filename}")

# Usage example: one text section, one data table, then save
sales_rows = [
    ['指标', 'Q1', 'Q2', 'Q3', 'Q4'],
    ['销售额', '100万', '120万', '150万', '180万'],
    ['增长率', '10%', '20%', '25%', '20%']
]

generator = ReportGenerator()
generator.add_section("执行摘要", "本报告总结了项目的主要发现...")
generator.add_data_table(sales_rows, title="年度销售数据")
generator.save("年度报告.docx")

案例2：文档合并工具

def merge_documents(doc_paths, output_path):
    """Concatenate the body content of several documents into a new one.

    A page break is inserted before each document after the first. Only
    the elements of each source body are copied; each source's section
    properties (``w:sectPr``) are skipped so that the merged body keeps a
    single trailing ``w:sectPr``.

    NOTE(review): content that depends on parts stored outside the body
    (for example embedded images) is not copied along with it here —
    verify the output if the source documents contain such content.

    Args:
        doc_paths: Paths of the documents to merge, in order.
        output_path: Path the merged document is saved to.

    Returns:
        ``output_path``.
    """
    from copy import deepcopy

    from docx.oxml.ns import qn

    sect_pr_tag = qn('w:sectPr')

    master_doc = Document()
    master_body = master_doc.element.body

    for i, doc_path in enumerate(doc_paths):
        sub_doc = Document(doc_path)

        # Page break between documents, but not before the first one.
        if i > 0:
            master_doc.add_page_break()

        master_sect_pr = master_body.find(sect_pr_tag)

        # Iterate over a snapshot and insert copies: moving a child out of
        # the body while iterating over that same body skips elements.
        for element in list(sub_doc.element.body):
            if element.tag == sect_pr_tag:
                continue
            clone = deepcopy(element)
            if master_sect_pr is None:
                master_body.append(clone)
            else:
                # Keep the master's section properties as the last child.
                master_sect_pr.addprevious(clone)

    master_doc.save(output_path)
    return output_path

案例3：模板填充系统

class TemplateFiller:
    """Fill ``{{placeholder}}`` tokens in a Word template."""

    def __init__(self, template_path):
        """Open the template and collect the placeholder names it contains."""
        self.template = Document(template_path)
        self.placeholders = self._find_placeholders()

    def _iter_paragraphs(self, container=None):
        """Yield the paragraphs of ``container`` and of its (nested) table cells.

        ``container`` defaults to the template document.
        """
        if container is None:
            container = self.template
        yield from container.paragraphs
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from self._iter_paragraphs(cell)

    def _find_placeholders(self):
        """Return the set of names used in ``{{name}}`` tokens.

        Both body paragraphs and paragraphs inside table cells are scanned.
        """
        import re

        # Compiled once here instead of importing and matching per paragraph.
        token_pattern = re.compile(r'\{\{(\w+)\}\}')

        found = set()
        for paragraph in self._iter_paragraphs():
            found.update(token_pattern.findall(paragraph.text))
        return found

    def fill_template(self, data_dict, output_path):
        """Replace known placeholders and save the result.

        Placeholders without an entry in ``data_dict`` are left as they
        are. A paragraph that contains a replaced token is rebuilt as a
        single run holding the new text; other paragraphs are untouched.

        Args:
            data_dict: Mapping of placeholder name to value; values are
                converted with ``str()``.
            output_path: Path the filled document is saved to.

        Returns:
            ``output_path``.
        """
        for paragraph in self._iter_paragraphs():
            original_text = paragraph.text
            new_text = original_text

            for placeholder in self.placeholders:
                if placeholder in data_dict:
                    token = '{{' + placeholder + '}}'
                    new_text = new_text.replace(token, str(data_dict[placeholder]))

            if new_text != original_text:
                paragraph.clear()
                paragraph.add_run(new_text)

        self.template.save(output_path)
        return output_path

# Usage example: fill a contract template from a dict of values
data = dict(
    client_name='张三',
    project_name='网站开发项目',
    amount='50000',
    date='2024-01-15',
)
filler = TemplateFiller('合同模板.docx')
filler.fill_template(data, 'filled_contract.docx')

5. 性能优化与最佳实践

性能优化技巧

# 1. 批量操作减少IO
def batch_process_documents(doc_paths, processor=None):
    """Open each document once and collect one result per document.

    Args:
        doc_paths: Iterable of document paths.
        processor: Optional callable that receives the opened ``Document``
            and returns the value to collect. When omitted, the list of
            paragraph texts of each document is collected.

    Returns:
        A list with one entry per path, in input order.
    """
    if processor is None:
        def processor(doc):
            return [paragraph.text for paragraph in doc.paragraphs]

    # Each file is opened exactly once and handed straight to the processor.
    return [processor(Document(path)) for path in doc_paths]

# 2. 使用生成器处理大文档
def process_large_document(doc_path, chunk_size=100):
    """Yield the document's paragraphs in lists of up to ``chunk_size``.

    Args:
        doc_path: Path of the document to open.
        chunk_size: Maximum number of paragraphs per yielded chunk.

    Yields:
        Consecutive slices of the document's paragraph list.
    """
    doc = Document(doc_path)

    # Read the paragraph list once: every access to doc.paragraphs builds
    # a new list, so re-reading it per chunk repeats that work.
    paragraphs = doc.paragraphs

    for start in range(0, len(paragraphs), chunk_size):
        yield paragraphs[start:start + chunk_size]

错误处理

# PackageNotFoundError is defined in docx.opc.exceptions, not docx.exceptions.
from docx.opc.exceptions import PackageNotFoundError


def safe_document_processing(doc_path, processor=None):
    """Open and process a document, reporting failures instead of raising.

    Args:
        doc_path: Path of the document to open.
        processor: Optional callable that receives the opened ``Document``
            and returns the result. When omitted, the list of paragraph
            texts is returned.

    Returns:
        The processing result, or ``None`` when the document has no
        paragraphs, cannot be opened as a Word package, or processing
        raises any other exception (a message is printed in each case).
    """
    try:
        doc = Document(doc_path)

        paragraphs = doc.paragraphs
        if not paragraphs:
            print("警告：文档为空")
            return None

        if processor is None:
            return [paragraph.text for paragraph in paragraphs]
        return processor(doc)

    except PackageNotFoundError:
        print(f"错误：文件 {doc_path} 不是有效的Word文档")
        return None
    except Exception as e:
        # Deliberate catch-all boundary: report the error and return None.
        print(f"处理文档时发生错误: {str(e)}")
        return None

扩展建议

使用其他库增强功能

# 1. 处理doc格式（旧版Word）
# 需先安装系统命令行工具 antiword（它不是 pip 包），例如: apt install antiword 或 brew install antiword
import subprocess

def read_doc_file(doc_path):
    """Run the ``antiword`` command on ``doc_path`` and return its stdout.

    The output is captured as text; the command's exit status is not checked.
    """
    command = ['antiword', doc_path]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout

# 2. 转换为PDF
# pip install docx2pdf
from docx2pdf import convert

def convert_to_pdf(docx_path, pdf_path):
    """Convert a Word document to PDF.

    Thin wrapper that forwards both arguments unchanged to
    ``docx2pdf.convert``.

    Args:
        docx_path: Passed as the first argument to ``convert`` (the source document).
        pdf_path: Passed as the second argument to ``convert`` (the PDF destination).
    """
    convert(docx_path, pdf_path)

# 3. 提取图片
import zipfile
import os

def extract_images_from_docx(docx_path, output_dir):
    """Extract the files stored under ``word/media/`` in a .docx archive.

    The archive's internal layout is kept, so files land in
    ``<output_dir>/word/media/``.

    Args:
        docx_path: Path of the .docx file (a zip archive).
        output_dir: Directory the media files are extracted into.

    Returns:
        A list of the paths of the extracted files, in archive order;
        empty when the archive has no media files.
    """
    extracted_paths = []

    with zipfile.ZipFile(docx_path, 'r') as docx_zip:
        for member in docx_zip.namelist():
            # Skip directory entries; only real files are extracted.
            if member.startswith('word/media/') and not member.endswith('/'):
                extracted_paths.append(docx_zip.extract(member, output_dir))

    return extracted_paths