Python中fuzzywuzzy进行字符串模糊匹配的全过程
一、简介与安装
1.1 fuzzywuzzy简介
fuzzywuzzy是一个Python库,使用Levenshtein距离算法来计算两个序列的差异,专门用于字符串模糊匹配。
1.2 安装方法
# 基础安装
pip install fuzzywuzzy
# 为了提高速度,可以安装python-Levenshtein
pip install python-Levenshtein
# 也可以安装fuzzywuzzy的扩展库
pip install fuzzywuzzy[speedup]
二、核心方法与使用流程
2.1 导入库
from fuzzywuzzy import fuzz, process
2.2 基本相似度计算方法
(1) 简单比例匹配(Simple Ratio)
from fuzzywuzzy import fuzz
str1 = "Apple Inc."
str2 = "apple inc"
# 简单比例匹配
similarity = fuzz.ratio(str1, str2)
print(f"简单比例匹配相似度: {similarity}%")
# 输出: 74% (fuzz.ratio不做大小写归一化,大小写差异和末尾的"."都会降低得分)
(2) 部分字符串匹配(Partial Ratio)
str1 = "apple pie"
str2 = "I love apple pie with ice cream"
similarity = fuzz.partial_ratio(str1, str2)
print(f"部分字符串匹配相似度: {similarity}%")
# 输出: 100% (因为"apple pie"完全包含在长字符串中)
(3) 分词排序匹配(Token Sort Ratio)
str1 = "apple orange banana"
str2 = "banana apple orange"
similarity = fuzz.token_sort_ratio(str1, str2)
print(f"分词排序匹配相似度: {similarity}%")
# 输出: 100% (因为单词相同,只是顺序不同)
(4) 分词集合匹配(Token Set Ratio)
str1 = "apple banana"
str2 = "apple banana cherry"
similarity = fuzz.token_set_ratio(str1, str2)
print(f"分词集合匹配相似度: {similarity}%")
# 输出: 100% (因为str1的所有单词都包含在str2中)
2.3 高级匹配方法
(1) 提取最佳匹配(extractOne)
from fuzzywuzzy import process
choices = ["Apple Inc.", "Google LLC", "Microsoft Corporation", "Amazon.com"]
query = "apple company"
# 提取单个最佳匹配
best_match = process.extractOne(query, choices)
print(f"最佳匹配: {best_match}")
# 输出: ('Apple Inc.', 90)
# 指定使用的匹配算法
best_match = process.extractOne(query, choices, scorer=fuzz.token_sort_ratio)
print(f"使用token_sort_ratio的最佳匹配: {best_match}")
(2) 提取多个匹配(extract)
query = "apple"
choices = ["Apple Inc.", "Pineapple Co.", "Snapple", "Appliance Co."]
# 提取前3个匹配
matches = process.extract(query, choices, limit=3)
print("前3个匹配:")
for match in matches:
print(f" {match[0]}: {match[1]}%")
# 指定使用partial_ratio匹配算法
matches = process.extract(query, choices, scorer=fuzz.partial_ratio)
print("\n使用partial_ratio的匹配:")
for match in matches:
print(f" {match[0]}: {match[1]}%")
(3) 提取多个最佳匹配(extractBests)
query = "apple"
choices = ["Apple Inc.", "Pineapple Co.", "Snapple", "Appliance Co."]
# 提取所有超过特定阈值的匹配
matches = process.extractBests(query, choices, score_cutoff=70)
print(f"相似度超过70%的匹配: {matches}")
2.4 去除重复项(dedupe)
from fuzzywuzzy import process

duplicate_list = ["Apple Inc.", "apple inc", "Apple Inc", "Google", "google llc"]

# Remove near-duplicate entries. NOTE: the helper is process.dedupe --
# fuzzywuzzy has no process.deduplicate, so the original call raised
# AttributeError. Entries whose mutual similarity exceeds `threshold`
# collapse into a single representative string.
deduped = process.dedupe(duplicate_list, threshold=85)
print(f"去重后列表: {deduped}")
# Output: ['Apple Inc.', 'Google', 'google llc']
三、实际应用示例
3.1 公司名称匹配
import pandas as pd
# NOTE: `fuzz` must be imported as well -- the loop below passes
# scorer=fuzz.token_set_ratio, which raised NameError with the original
# `from fuzzywuzzy import process` import.
from fuzzywuzzy import fuzz, process

# Canonical company names to match against.
company_list = [
    "Microsoft Corporation",
    "Apple Inc.",
    "Amazon.com Inc.",
    "Alphabet Inc. (Google)",
    "Facebook, Inc.",
    "Tesla, Inc."
]

# Free-form names to resolve to the canonical list.
queries = [
    "microsoft corp",
    "apple company",
    "amazon",
    "google llc",
    "meta platforms",  # Facebook's new name
    "tesla motors"
]

print("公司名称匹配结果:")
print("-" * 50)
for query in queries:
    # token_set_ratio tolerates extra/missing words ("corp", "llc", ...).
    result = process.extractOne(query, company_list, scorer=fuzz.token_set_ratio)
    print(f"查询: '{query}' -> 匹配: '{result[0]}' (相似度: {result[1]}%)")
3.2 地址模糊匹配
def match_addresses(address_list, query_address, threshold=80):
    """Fuzzy-match *query_address* against a list of known addresses.

    Uses token-sort similarity so differing word order does not hurt the
    score, and keeps only candidates scoring at least *threshold*.
    Returns the (address, score) tuples produced by fuzzywuzzy.
    """
    return process.extractBests(
        query_address,
        address_list,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=threshold,
    )
# 地址数据库
address_database = [
"123 Main St, New York, NY 10001",
"456 Oak Ave, Los Angeles, CA 90001",
"789 Pine Rd, Chicago, IL 60601",
"321 Elm St, Boston, MA 02101"
]
# 查询地址
query = "123 Main Street, New York"
matches = match_addresses(address_database, query, threshold=75)
print("地址匹配结果:")
for match in matches:
print(f" 匹配地址: {match[0]}, 相似度: {match[1]}%")
3.3 产品名称规范化
def normalize_product_names(product_names, standard_names):
    """Map raw product names onto a canonical list of standard names.

    Each raw name is replaced by its best-matching standard name when
    the token-sort similarity reaches 85; below that cutoff the raw
    name is kept unchanged.
    """
    normalized = []
    for raw_name in product_names:
        candidate = process.extractOne(raw_name, standard_names, scorer=fuzz.token_sort_ratio)
        # Only adopt the standard spelling for confident matches.
        normalized.append(candidate[0] if candidate[1] >= 85 else raw_name)
    return normalized
# 标准产品名称
standard_products = [
"iPhone 13 Pro",
"Samsung Galaxy S21",
"MacBook Pro",
"iPad Air",
"Dell XPS 13"
]
# 需要规范化的产品名称
raw_products = [
"iphone 13 pro max",
"Samsung S21",
"macbook pro 2021",
"ipad",
"XPS 13 laptop"
]
normalized = normalize_product_names(raw_products, standard_products)
print("规范化结果:")
for original, normalized_name in zip(raw_products, normalized):
print(f" '{original}' -> '{normalized_name}'")
四、性能优化技巧
4.1 使用python-Levenshtein加速
# 安装python-Levenshtein可以显著提高速度
# pip install python-Levenshtein
# 不需要特殊代码,安装后fuzzywuzzy会自动使用
4.2 预处理器优化
from fuzzywuzzy import process
from fuzzywuzzy.utils import full_process
# 自定义预处理器
def custom_preprocessor(text):
    """Normalize *text* before fuzzy matching.

    Lowercases the input, strips punctuation/special characters, and
    collapses any run of whitespace into a single space (also trimming
    leading/trailing whitespace).
    """
    import re

    # Lowercase first, then drop everything except word chars and spaces.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # split()/join collapses tabs, newlines and repeated blanks alike.
    return ' '.join(cleaned.split())
# 使用自定义预处理器
choices = ["Apple Inc.", "Google LLC", "Microsoft Corp"]
query = "apple company"
result = process.extractOne(
query,
choices,
processor=custom_preprocessor
)
4.3 批量处理优化
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
# NOTE: `fuzz` must be imported too -- the original snippet imported only
# `process`, so the default argument `scorer=fuzz.ratio` raised NameError
# the moment the `def` statement was executed.
from fuzzywuzzy import fuzz, process

def batch_match(queries, choices, scorer=fuzz.ratio, workers=4):
    """Match every query in *queries* against *choices* concurrently.

    Each result is the (choice, score) tuple returned by
    process.extractOne for the corresponding query, in input order.
    *workers* bounds the thread pool size.
    """
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map preserves the order of `queries` in its results.
        results = list(executor.map(
            lambda q: process.extractOne(q, choices, scorer=scorer),
            queries
        ))
    return results
# 大数据集示例
large_choices = [f"Company {i}" for i in range(1000)]
queries = ["Company 123", "Company 456", "Company 789"]
results = batch_match(queries, large_choices, workers=4)
五、注意事项与限制
5.1 使用注意事项
大小写敏感性:默认情况下大小写敏感,建议先转换为统一大小写
性能问题:大量数据匹配时可能较慢,考虑使用索引或其他优化方法
阈值选择:根据应用场景选择合适的相似度阈值
特殊字符处理:特殊字符可能影响匹配结果
5.2 常见问题解决
# 1. 处理None或空字符串
def safe_fuzzy_match(str1, str2):
    """fuzz.ratio that tolerates None/empty inputs.

    Returns 0 when either argument is falsy (None, "", 0, ...);
    otherwise coerces both arguments to str and scores them.
    """
    if str1 and str2:
        return fuzz.ratio(str(str1), str(str2))
    return 0
# 2. 组合多种匹配策略
def combined_similarity(str1, str2):
    """Blend four fuzzywuzzy scores into one value.

    Each scorer contributes with an equal weight of 0.25; the result is
    the weighted sum (i.e. the average of the four scores).
    """
    weighted_scorers = (
        (fuzz.ratio, 0.25),
        (fuzz.partial_ratio, 0.25),
        (fuzz.token_sort_ratio, 0.25),
        (fuzz.token_set_ratio, 0.25),
    )
    total = 0
    # Accumulate in the same scorer order as the individual weights above.
    for scorer, weight in weighted_scorers:
        total += scorer(str1, str2) * weight
    return total
# 3. 处理超长字符串
def long_text_similarity(text1, text2, max_length=100):
    """Compare two texts, looking only at their first *max_length* chars.

    Truncation keeps fuzz.ratio fast on very long inputs; slicing a
    string shorter than *max_length* is a no-op, so short inputs are
    compared in full.
    """
    return fuzz.ratio(text1[:max_length], text2[:max_length])
六、替代方案
6.1 rapidfuzz(更快更高效的替代品)
# 安装: pip install rapidfuzz
from rapidfuzz import fuzz, process
# 使用方法与fuzzywuzzy类似,但速度更快
6.2 其他字符串匹配库
- difflib:Python标准库,功能相对简单
- textdistance:提供多种字符串距离算法
- jellyfish:实现多种语音编码(如Soundex、Metaphone)和字符串近似匹配算法
通过以上全过程的介绍,你应该能够掌握fuzzywuzzy在Python中进行字符串模糊匹配的完整使用方法。根据具体应用场景选择合适的匹配策略和参数,可以获得更好的匹配效果。