Python中fuzzywuzzy进行字符串模糊匹配的全过程
一、简介与安装
1.1 fuzzywuzzy简介
fuzzywuzzy是一个Python库,使用Levenshtein距离算法来计算两个序列的差异,专门用于字符串模糊匹配。
1.2 安装方法
# 基础安装
pip install fuzzywuzzy
# 为了提高速度,可以安装python-Levenshtein
pip install python-Levenshtein
# 也可以安装fuzzywuzzy的扩展库
pip install fuzzywuzzy[speedup]
二、核心方法与使用流程
2.1 导入库
from fuzzywuzzy import fuzz, process
2.2 基本相似度计算方法
(1) 简单比例匹配(Simple Ratio)
from fuzzywuzzy import fuzz
str1 = "Apple Inc."
str2 = "apple inc"
# 简单比例匹配
similarity = fuzz.ratio(str1, str2)
print(f"简单比例匹配相似度: {similarity}%")
# 输出: 74% (fuzz.ratio不做大小写归一化,大小写差异和末尾的"."都会降低得分)
(2) 部分字符串匹配(Partial Ratio)
str1 = "apple pie"
str2 = "I love apple pie with ice cream"
similarity = fuzz.partial_ratio(str1, str2)
print(f"部分字符串匹配相似度: {similarity}%")
# 输出: 100% (因为"apple pie"完全包含在长字符串中)
(3) 分词排序匹配(Token Sort Ratio)
str1 = "apple orange banana"
str2 = "banana apple orange"
similarity = fuzz.token_sort_ratio(str1, str2)
print(f"分词排序匹配相似度: {similarity}%")
# 输出: 100% (因为单词相同,只是顺序不同)
(4) 分词集合匹配(Token Set Ratio)
str1 = "apple banana"
str2 = "apple banana cherry"
similarity = fuzz.token_set_ratio(str1, str2)
print(f"分词集合匹配相似度: {similarity}%")
# 输出: 100% (因为str1的所有单词都包含在str2中)
2.3 高级匹配方法
(1) 提取最佳匹配(extractOne)
from fuzzywuzzy import process
choices = ["Apple Inc.", "Google LLC", "Microsoft Corporation", "Amazon.com"]
query = "apple company"
# 提取单个最佳匹配
best_match = process.extractOne(query, choices)
print(f"最佳匹配: {best_match}")
# 输出: ('Apple Inc.', 90)
# 指定使用的匹配算法
best_match = process.extractOne(query, choices, scorer=fuzz.token_sort_ratio)
print(f"使用token_sort_ratio的最佳匹配: {best_match}")
(2) 提取多个匹配(extract)
query = "apple"
choices = ["Apple Inc.", "Pineapple Co.", "Snapple", "Appliance Co."]
# 提取前3个匹配
matches = process.extract(query, choices, limit=3)
print("前3个匹配:")
for match in matches:
print(f" {match[0]}: {match[1]}%")
# 指定使用partial_ratio匹配算法
matches = process.extract(query, choices, scorer=fuzz.partial_ratio)
print("\n使用partial_ratio的匹配:")
for match in matches:
print(f" {match[0]}: {match[1]}%")
(3) 提取多个最佳匹配(extractBests)
query = "apple"
choices = ["Apple Inc.", "Pineapple Co.", "Snapple", "Appliance Co."]
# 提取所有超过特定阈值的匹配
matches = process.extractBests(query, choices, score_cutoff=70)
print(f"相似度超过70%的匹配: {matches}")
2.4 去除重复项(dedupe)
from fuzzywuzzy import process

duplicate_list = ["Apple Inc.", "apple inc", "Apple Inc", "Google", "google llc"]

# Remove near-duplicate entries. NOTE: the helper is process.dedupe --
# fuzzywuzzy has no process.deduplicate, so the original call raised
# AttributeError. Entries whose mutual similarity exceeds `threshold`
# collapse into a single representative string.
deduped = process.dedupe(duplicate_list, threshold=85)
print(f"去重后列表: {deduped}")
# Output: ['Apple Inc.', 'Google', 'google llc']
三、实际应用示例
3.1 公司名称匹配
import pandas as pd
# NOTE: `fuzz` must be imported as well -- the loop below passes
# scorer=fuzz.token_set_ratio, which raised NameError with the original
# `from fuzzywuzzy import process` import.
from fuzzywuzzy import fuzz, process

# Canonical company names to match against.
company_list = [
    "Microsoft Corporation",
    "Apple Inc.",
    "Amazon.com Inc.",
    "Alphabet Inc. (Google)",
    "Facebook, Inc.",
    "Tesla, Inc."
]

# Free-form names to resolve to the canonical list.
queries = [
    "microsoft corp",
    "apple company",
    "amazon",
    "google llc",
    "meta platforms",  # Facebook's new name
    "tesla motors"
]

print("公司名称匹配结果:")
print("-" * 50)
for query in queries:
    # token_set_ratio tolerates extra/missing words ("corp", "llc", ...).
    result = process.extractOne(query, company_list, scorer=fuzz.token_set_ratio)
    print(f"查询: '{query}' -> 匹配: '{result[0]}' (相似度: {result[1]}%)")
3.2 地址模糊匹配
def match_addresses(address_list, query_address, threshold=80):
    """Fuzzy-match *query_address* against a list of known addresses.

    Uses token-sort similarity so differing word order does not hurt the
    score, and keeps only candidates scoring at least *threshold*.
    Returns the (address, score) tuples produced by fuzzywuzzy.
    """
    return process.extractBests(
        query_address,
        address_list,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=threshold,
    )
# 地址数据库
address_database = [
"123 Main St, New York, NY 10001",
"456 Oak Ave, Los Angeles, CA 90001",
"789 Pine Rd, Chicago, IL 60601",
"321 Elm St, Boston, MA 02101"
]
# 查询地址
query = "123 Main Street, New York"
matches = match_addresses(address_database, query, threshold=75)
print("地址匹配结果:")
for match in matches:
print(f" 匹配地址: {match[0]}, 相似度: {match[1]}%")
3.3 产品名称规范化
def normalize_product_names(product_names, standard_names):
    """Map raw product names onto a canonical list of standard names.

    Each raw name is replaced by its best-matching standard name when
    the token-sort similarity reaches 85; below that cutoff the raw
    name is kept unchanged.
    """
    normalized = []
    for raw_name in product_names:
        candidate = process.extractOne(raw_name, standard_names, scorer=fuzz.token_sort_ratio)
        # Only adopt the standard spelling for confident matches.
        normalized.append(candidate[0] if candidate[1] >= 85 else raw_name)
    return normalized
# 标准产品名称
standard_products = [
"iPhone 13 Pro",
"Samsung Galaxy S21",
"MacBook Pro",
"iPad Air",
"Dell XPS 13"
]
# 需要规范化的产品名称
raw_products = [
"iphone 13 pro max",
"Samsung S21",
"macbook pro 2021",
"ipad",
"XPS 13 laptop"
]
normalized = normalize_product_names(raw_products, standard_products)
print("规范化结果:")
for original, normalized_name in zip(raw_products, normalized):
print(f" '{original}' -> '{normalized_name}'")
四、性能优化技巧
4.1 使用python-Levenshtein加速
# 安装python-Levenshtein可以显著提高速度
# pip install python-Levenshtein
# 不需要特殊代码,安装后fuzzywuzzy会自动使用
4.2 预处理器优化
from fuzzywuzzy import process
from fuzzywuzzy.utils import full_process
# 自定义预处理器
def custom_preprocessor(text):
    """Normalize *text* before fuzzy matching.

    Lowercases the input, strips punctuation/special characters, and
    collapses any run of whitespace into a single space (also trimming
    leading/trailing whitespace).
    """
    import re

    # Lowercase first, then drop everything except word chars and spaces.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # split()/join collapses tabs, newlines and repeated blanks alike.
    return ' '.join(cleaned.split())
# 使用自定义预处理器
choices = ["Apple Inc.", "Google LLC", "Microsoft Corp"]
query = "apple company"
result = process.extractOne(
query,
choices,
processor=custom_preprocessor
)
4.3 批量处理优化
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
# NOTE: `fuzz` must be imported too -- the original snippet imported only
# `process`, so the default argument `scorer=fuzz.ratio` raised NameError
# the moment the `def` statement was executed.
from fuzzywuzzy import fuzz, process

def batch_match(queries, choices, scorer=fuzz.ratio, workers=4):
    """Match every query in *queries* against *choices* concurrently.

    Each result is the (choice, score) tuple returned by
    process.extractOne for the corresponding query, in input order.
    *workers* bounds the thread pool size.
    """
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map preserves the order of `queries` in its results.
        results = list(executor.map(
            lambda q: process.extractOne(q, choices, scorer=scorer),
            queries
        ))
    return results
# 大数据集示例
large_choices = [f"Company {i}" for i in range(1000)]
queries = ["Company 123", "Company 456", "Company 789"]
results = batch_match(queries, large_choices, workers=4)
五、注意事项与限制
5.1 使用注意事项
大小写敏感性:默认情况下大小写敏感,建议先转换为统一大小写
性能问题:大量数据匹配时可能较慢,考虑使用索引或其他优化方法
阈值选择:根据应用场景选择合适的相似度阈值
特殊字符处理:特殊字符可能影响匹配结果
5.2 常见问题解决
# 1. 处理None或空字符串
def safe_fuzzy_match(str1, str2):
    """fuzz.ratio that tolerates None/empty inputs.

    Returns 0 when either argument is falsy (None, "", 0, ...);
    otherwise coerces both arguments to str and scores them.
    """
    if str1 and str2:
        return fuzz.ratio(str(str1), str(str2))
    return 0
# 2. 组合多种匹配策略
def combined_similarity(str1, str2):
    """Blend four fuzzywuzzy scores into one value.

    Each scorer contributes with an equal weight of 0.25; the result is
    the weighted sum (i.e. the average of the four scores).
    """
    weighted_scorers = (
        (fuzz.ratio, 0.25),
        (fuzz.partial_ratio, 0.25),
        (fuzz.token_sort_ratio, 0.25),
        (fuzz.token_set_ratio, 0.25),
    )
    total = 0
    # Accumulate in the same scorer order as the individual weights above.
    for scorer, weight in weighted_scorers:
        total += scorer(str1, str2) * weight
    return total
# 3. 处理超长字符串
def long_text_similarity(text1, text2, max_length=100):
    """Compare two texts, looking only at their first *max_length* chars.

    Truncation keeps fuzz.ratio fast on very long inputs; slicing a
    string shorter than *max_length* is a no-op, so short inputs are
    compared in full.
    """
    return fuzz.ratio(text1[:max_length], text2[:max_length])
六、替代方案
6.1 rapidfuzz(更快更高效的替代品)
# 安装: pip install rapidfuzz
from rapidfuzz import fuzz, process
# 使用方法与fuzzywuzzy类似,但速度更快
6.2 其他字符串匹配库
- difflib:Python标准库,功能相对简单
- textdistance:提供多种字符串距离算法
- jellyfish:实现多种语音编码(如Soundex、Metaphone)和字符串近似匹配算法
通过以上全过程的介绍,你应该能够掌握fuzzywuzzy在Python中进行字符串模糊匹配的完整使用方法。根据具体应用场景选择合适的匹配策略和参数,可以获得更好的匹配效果。