1. 安装依赖
pip install pypinyin
2. 核心实现代码
from pypinyin import lazy_pinyin, Style
import re
from typing import List, Dict, Tuple, Optional
import unicodedata
class ChineseNamePinyin:
    """Convert Chinese personal names to pinyin variants and search them.

    The class builds several romanized spellings of a name (joined,
    space-separated, initials, alternative surname readings, and case
    variants) and offers a simple index/search facility on top of them.
    """

    # Matches any character in the CJK range used for name validation.
    _HAN_CHAR = re.compile(r'[\u4e00-\u9fa5]')

    def __init__(self):
        # Readings for common polyphonic surnames, plus a few compound
        # surnames whose reading is given as space-separated syllables.
        self.surname_mapping = {
            '单': ['shan', 'dan'],
            '解': ['xie', 'jie'],
            '仇': ['qiu', 'chou'],
            '曾': ['zeng', 'ceng'],
            '乐': ['yue', 'le'],
            '查': ['zha', 'cha'],
            '朴': ['piao', 'po', 'pu'],
            '翟': ['zhai', 'di'],
            '沈': ['shen', 'chen'],
            '尉迟': ['yu chi'],
            '长孙': ['zhang sun'],
            '令狐': ['ling hu'],
        }
        # Pre-compiled validator: a name is 2 to 4 Chinese characters.
        self.name_pattern = re.compile(r'^[\u4e00-\u9fa5]{2,4}$')

    def normalize_name(self, name: str) -> str:
        """Return *name* NFKC-normalized with all whitespace removed.

        Normalization runs first because NFKC can itself produce spaces
        (for example from spacing diacritics); stripping afterwards
        guarantees the result contains no whitespace.
        """
        name = unicodedata.normalize('NFKC', name)
        return ''.join(name.split())

    def get_name_pinyin_variations(self, name: str) -> Dict[str, object]:
        """Build every pinyin spelling variant of a name.

        Args:
            name: The name in Chinese characters; whitespace is ignored.

        Returns:
            A dict with ``'full_name'`` (the normalized name) and
            ``'pinyin_variations'`` (a sorted list of spellings), e.g.
            ``{'full_name': '张三', 'pinyin_variations': [...]}``.

        Raises:
            ValueError: If the normalized name is not 2-4 Chinese characters.
        """
        name = self.normalize_name(name)
        if not self.name_pattern.match(name):
            raise ValueError("姓名应为2-4个汉字")

        # Convert once and derive the three basic forms from the result.
        syllables = lazy_pinyin(name)
        variations = {
            ''.join(syllables),                      # joined full pinyin
            ' '.join(syllables),                     # space-separated pinyin
            ''.join(s[0] for s in syllables if s),   # initials
        }

        # Alternative readings for a polyphonic or compound surname; the
        # given name keeps its default reading.
        for surname, pinyin_list in self.surname_mapping.items():
            if not name.startswith(surname):
                continue
            rest_syllables = lazy_pinyin(name[len(surname):])
            rest_joined = ''.join(rest_syllables)
            for surname_pinyin in pinyin_list:
                variations.add(surname_pinyin.replace(' ', '') + rest_joined)
                # Join per syllable so the spaced form matches the basic
                # spaced form and never ends with a dangling space.
                variations.add(' '.join([surname_pinyin, *rest_syllables]))

        # Lower, upper and title case of every spelling collected so far.
        case_variations = set()
        for var in variations:
            case_variations.update((var.lower(), var.upper(), var.title()))
        variations.update(case_variations)

        return {
            'full_name': name,
            'pinyin_variations': sorted(variations),
        }

    def create_pinyin_index(self, names: List[str]) -> Dict[str, List[str]]:
        """Build a lookup from pinyin key to the names that produce it.

        Keys are the variations lower-cased with spaces removed. Each name
        appears at most once per key. Invalid names are reported with
        ``print`` and skipped.
        """
        index: Dict[str, List[str]] = {}
        for name in names:
            try:
                pinyin_info = self.get_name_pinyin_variations(name)
            except ValueError as e:
                print(f"跳过无效姓名 '{name}': {e}")
                continue
            full_name = pinyin_info['full_name']
            for variation in pinyin_info['pinyin_variations']:
                key = variation.replace(' ', '').lower()
                bucket = index.setdefault(key, [])
                # Case/spacing variants collapse to the same key, so guard
                # against adding the same name repeatedly.
                if full_name not in bucket:
                    bucket.append(full_name)
        return index

    def search_names(self,
                     query: str,
                     name_index: Dict[str, List[str]],
                     exact_match: bool = False) -> List[str]:
        """Search an index built by ``create_pinyin_index``.

        Args:
            query: Pinyin, initials, or Chinese characters.
            name_index: Mapping of pinyin key to list of names.
            exact_match: If True the pinyin key must equal the query;
                otherwise a key matches when either string contains the
                other. Chinese-character queries are always matched as a
                substring of the stored names.

        Returns:
            A sorted list of matching names; empty for a blank query.
        """
        query = self.normalize_query(query)
        if not query:
            # A blank query would otherwise be a substring of every key.
            return []

        query_key = query.replace(' ', '').lower()
        results = set()

        # 1. Direct match on Chinese characters (inner spaces ignored).
        if self._HAN_CHAR.search(query):
            for names in name_index.values():
                results.update(name for name in names if query_key in name)

        # 2. Match on pinyin keys.
        if exact_match:
            results.update(name_index.get(query_key, ()))
        else:
            for key, names in name_index.items():
                if query_key in key or key in query_key:
                    results.update(names)
        return sorted(results)

    def normalize_query(self, query: str) -> str:
        """Trim, lower-case, and collapse whitespace runs to one space."""
        query = query.strip().lower()
        return re.sub(r'\s+', ' ', query)

    def get_pinyin_breakdown(self, name: str) -> List[Dict]:
        """Describe each character of a name.

        Returns:
            One dict per character with keys ``'character'``,
            ``'position'``, ``'is_surname'``, ``'pinyin_with_tone'``,
            ``'pinyin_plain'``, ``'initial'`` and ``'possible_variants'``.
        """
        name = self.normalize_name(name)
        breakdown = []
        for i, char in enumerate(name):
            # Reading with tone marks, and the same reading without them.
            toned = lazy_pinyin(char, style=Style.TONE)
            plain = lazy_pinyin(char)
            plain_pinyin = plain[0] if plain else ''
            breakdown.append({
                'character': char,
                'position': i,
                # Assumes the first character is the surname.
                'is_surname': i == 0,
                'pinyin_with_tone': toned[0] if toned else '',
                'pinyin_plain': plain_pinyin,
                'initial': plain_pinyin[0] if plain_pinyin else '',
                'possible_variants': self._get_char_pinyin_variants(char),
            })
        return breakdown

    def _get_char_pinyin_variants(self, char: str) -> List[str]:
        """Return the sorted, toneless readings of a single character."""
        # lazy_pinyin has no heteronym option; pinyin() does, and returns
        # one list of readings per input character.
        from pypinyin import pinyin

        readings = pinyin(char, style=Style.NORMAL, heteronym=True)
        if not readings:
            return []
        return sorted(set(readings[0]))
class NameSearchEngine:
    """Keep a list of names together with a pinyin index for searching."""

    def __init__(self):
        self.pinyin_converter = ChineseNamePinyin()
        # Pinyin key -> list of names producing that key.
        self.name_index = {}
        # Normalized names currently held by the engine.
        self.original_names = []

    def load_names(self, names: List[str]):
        """Replace the engine's contents with *names*.

        Each name is normalized before it is validated, so surrounding or
        embedded whitespace does not cause a valid name to be dropped.
        Invalid names and duplicates are skipped; first-seen order is kept.
        """
        converter = self.pinyin_converter
        seen = set()
        accepted = []
        for name in names:
            normalized = converter.normalize_name(name)
            if normalized in seen:
                continue
            if not converter.name_pattern.match(normalized):
                continue
            seen.add(normalized)
            accepted.append(normalized)
        self.original_names = accepted
        self.name_index = converter.create_pinyin_index(accepted)

    def add_name(self, name: str) -> bool:
        """Add one name to the index.

        Returns:
            True if the name is valid (whether or not it was already
            present), False if it fails validation.
        """
        normalized = self.pinyin_converter.normalize_name(name)
        if not self.pinyin_converter.name_pattern.match(normalized):
            return False
        try:
            pinyin_info = self.pinyin_converter.get_name_pinyin_variations(normalized)
        except ValueError:
            return False

        # Register the name under every pinyin key, avoiding duplicates.
        for variation in pinyin_info['pinyin_variations']:
            key = variation.replace(' ', '').lower()
            bucket = self.name_index.setdefault(key, [])
            if normalized not in bucket:
                bucket.append(normalized)

        if normalized not in self.original_names:
            self.original_names.append(normalized)
        return True

    def search(self, query: str, fuzzy: bool = True) -> List[str]:
        """Search the index; ``fuzzy=False`` requests exact key matching."""
        return self.pinyin_converter.search_names(
            query,
            self.name_index,
            exact_match=not fuzzy,
        )

    def get_pinyin_info(self, name: str) -> Optional[Dict]:
        """Return the pinyin variations of *name*, or None if it is invalid."""
        try:
            return self.pinyin_converter.get_name_pinyin_variations(name)
        except ValueError:
            return None

    def get_all_names(self) -> List[str]:
        """Return all stored names as a new sorted list."""
        return sorted(self.original_names)
# 3. 使用示例
def main():
    """Run a console walkthrough of the name search engine.

    Loads a sample name list, then prints pinyin variations, a
    per-character breakdown, several searches, the effect of adding
    names, and finally the full sorted name list.
    """
    engine = NameSearchEngine()

    # Sample names, including compound and polyphonic surnames.
    sample_names = [
        "张三", "李四", "王五", "赵六",
        "欧阳修", "诸葛亮", "司马光",
        "令狐冲", "单雄信", "解珍",
        "尉迟恭", "长孙无忌",
    ]
    engine.load_names(sample_names)

    print("=" * 50)
    print("中文姓名拼音转换与搜索系统")
    print("=" * 50)

    # Example 1: pinyin variations of a name.
    print("\n1. 姓名的拼音变体示例:")
    test_names = ["张三", "令狐冲", "单雄信"]
    for name in test_names:
        info = engine.get_pinyin_info(name)
        if info:
            print(f"\n姓名: {info['full_name']}")
            # Only the first five variations are shown.
            print(f"拼音变体: {info['pinyin_variations'][:5]}...")

    # Example 2: per-character breakdown.
    print("\n\n2. 姓名拼音分解示例:")
    breakdown = engine.pinyin_converter.get_pinyin_breakdown("欧阳修")
    for char_info in breakdown:
        print(f"  字符: {char_info['character']} "
              f"| 拼音: {char_info['pinyin_plain']} "
              f"| 声调: {char_info['pinyin_with_tone']} "
              f"| 首字母: {char_info['initial']}")

    # Example 3: searches. The third field selects fuzzy matching, so the
    # case labelled as an exact search really runs in exact mode.
    print("\n\n3. 搜索示例:")
    search_cases = [
        ("zhangsan", "精确搜索 'zhangsan'", False),
        ("zs", "首字母搜索 'zs'", True),
        ("欧阳", "中文搜索 '欧阳'", True),
        ("zhang", "模糊搜索 'zhang'", True),
        ("令狐", "复姓搜索 '令狐'", True),
        ("shan", "多音字搜索 'shan'(单雄信)", True),
    ]
    for query, description, fuzzy in search_cases:
        results = engine.search(query, fuzzy=fuzzy)
        print(f"\n{description}:")
        print(f"  结果: {results if results else '无匹配结果'}")

    # Example 4: add names, then search for them.
    print("\n\n4. 添加新姓名并搜索:")
    engine.add_name("孙悟空")
    engine.add_name("唐僧")
    results = engine.search("sunwukong")
    print(f"搜索 'sunwukong': {results}")
    results = engine.search("swk")
    print(f"搜索首字母 'swk': {results}")

    # Example 5: list every stored name.
    print("\n\n5. 当前所有姓名:")
    all_names = engine.get_all_names()
    for i, name in enumerate(all_names, 1):
        print(f"  {i:2d}. {name}")
# 4. 高级功能扩展
class AdvancedNameSearch(NameSearchEngine):
    """Name search engine with extra lookup and export helpers."""

    def search_by_initial(self, initials: str) -> List[str]:
        """Return stored names whose pinyin initials equal *initials*.

        The query is lower-cased and stripped of spaces. Initials come from
        converting the whole name at once, the same way the pinyin index
        derives them, rather than character by character.
        """
        wanted = initials.lower().replace(' ', '')
        results = []
        for name in self.original_names:
            name_initials = ''.join(s[0] for s in lazy_pinyin(name) if s)
            if name_initials == wanted:
                results.append(name)
        return results

    def search_by_partial(self, partial: str) -> List[str]:
        """Return sorted names containing *partial*.

        The lower-cased query is matched as a substring of the name itself,
        of its joined pinyin, or of its space-separated pinyin.
        """
        partial = partial.lower()
        results = set()
        for name in self.original_names:
            syllables = lazy_pinyin(name)
            name_pinyin = ''.join(syllables).lower()
            spaced_pinyin = ' '.join(syllables).lower()
            if (partial in name or
                    partial in name_pinyin or
                    partial in spaced_pinyin):
                results.add(name)
        return sorted(results)

    def export_to_csv(self, filename: str):
        """Write name, spaced pinyin, initials and variation count to a CSV.

        The file is written as UTF-8 with a byte-order mark; names for
        which no pinyin info is available are left out.
        """
        import csv

        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['姓名', '全拼', '首字母', '拼音变体数'])
            for name in self.original_names:
                info = self.get_pinyin_info(name)
                if not info:
                    continue
                syllables = lazy_pinyin(name)
                writer.writerow([
                    name,
                    ' '.join(syllables),
                    ''.join(s[0] for s in syllables if s),
                    len(info['pinyin_variations']),
                ])
        print(f"数据已导出到 {filename}")
if __name__ == "__main__":
    # Basic walkthrough first.
    main()

    # Then the extended engine, on its own smaller name set.
    separator = "=" * 50
    print("\n" + separator)
    print("高级功能演示")
    print(separator)

    advanced_engine = AdvancedNameSearch()
    advanced_engine.load_names(["刘备", "关羽", "张飞", "曹操", "孙权"])

    # Each entry: heading to print, search method to call, search term.
    demos = (
        ("\n按首字母搜索 'lb':", advanced_engine.search_by_initial, "lb"),
        ("\n部分匹配搜索 'fei':", advanced_engine.search_by_partial, "fei"),
    )
    for heading, run_search, term in demos:
        print(heading)
        print(run_search(term))

    # CSV export is left disabled in the demo:
    # advanced_engine.export_to_csv("names_pinyin.csv")
5. 主要功能说明
核心功能:
拼音转换:将中文姓名转换为多种拼音格式
多音字处理:自动处理常见姓氏的多音字
拼音变体生成:生成全拼、带空格、首字母等多种格式
智能搜索:支持拼音、首字母、汉字混合搜索
模糊匹配:支持模糊搜索和精确搜索
搜索方式:
- 中文直接搜索:
"张三"
- 全拼搜索:
"zhangsan" 或 "zhang san"
- 首字母搜索:
"zs"
- 混合搜索(拼音与汉字混写;注意:当前 `search_names` 未对混合查询做专门处理,不保证能命中):
"zhang三"
扩展功能:
- 姓名拼音分解
- 批量导入导出
- 添加姓名(`add_name`;当前代码未提供删除姓名的接口)
- 数据导出为CSV
6. 简单使用示例
# Quick start
from pypinyin import lazy_pinyin

# Minimal conversion of a single name
name = "张三"
pinyin_list = lazy_pinyin(name)  # ['zhang', 'san']
full_pinyin = ''.join(pinyin_list)  # 'zhangsan'
initials = ''.join(syllable[0] for syllable in pinyin_list)  # 'zs'
spaced_pinyin = ' '.join(pinyin_list)  # 'zhang san'

# Print the name followed by its three spellings.
print(f"姓名: {name}")
print(f"拼音: {spaced_pinyin}")
print(f"全拼: {full_pinyin}")
print(f"首字母: {initials}")
这个实现提供了完整的姓名拼音转换和搜索功能,支持模糊匹配、多音字处理等实际需求。你可以根据具体需求进行调整和扩展。