沭阳县护送服务网

Python+pypinyin实现中文姓名的拼音转换与搜索

2026-04-01 17:26:02 浏览次数:0
详细信息

1. 安装依赖

pip install pypinyin

2. 核心实现代码

from pypinyin import lazy_pinyin, Style
import re
from typing import List, Dict, Tuple, Optional
import unicodedata


class ChineseNamePinyin:
    """Convert Chinese personal names to pinyin and search names by pinyin.

    The class generates several spellings ("variations") for a name -- joined
    pinyin, space-separated pinyin, initials, alternative surname readings and
    upper/lower/title case forms -- and builds a dictionary index from those
    spellings back to the original names.
    """

    def __init__(self):
        # Surnames with more than one plausible reading. Every listed reading
        # is emitted as a search variation. Multi-character surnames store
        # their syllables separated by a space.
        self.surname_mapping = {
            '单': ['shan', 'dan'],
            '解': ['xie', 'jie'],
            '仇': ['qiu', 'chou'],
            '曾': ['zeng', 'ceng'],
            '乐': ['yue', 'le'],
            '查': ['zha', 'cha'],
            '朴': ['piao', 'po', 'pu'],
            '翟': ['zhai', 'di'],
            '沈': ['shen', 'chen'],
            '尉迟': ['yu chi'],
            '长孙': ['zhang sun'],
            '令狐': ['ling hu'],
        }

        # A valid name is 2 to 4 characters in the range U+4E00..U+9FA5.
        self.name_pattern = re.compile(r'^[\u4e00-\u9fa5]{2,4}$')

    def normalize_name(self, name: str) -> str:
        """Return *name* NFKC-normalized with all whitespace removed."""
        # Normalize first: NFKC can itself produce spaces (for example from
        # the ideographic space), and those must be stripped as well.
        name = unicodedata.normalize('NFKC', name)
        return ''.join(name.split())

    def get_name_pinyin_variations(self, name: str) -> Dict[str, List[str]]:
        """Build every pinyin spelling under which *name* should be findable.

        Args:
            name: A Chinese name; it is normalized before validation.

        Returns:
            A dict with two keys: ``'full_name'`` (the normalized name, a
            ``str``) and ``'pinyin_variations'`` (a sorted list of spellings),
            e.g. ``{'full_name': '张三', 'pinyin_variations': [...]}``.

        Raises:
            ValueError: If the normalized name is not 2-4 Chinese characters.
        """
        name = self.normalize_name(name)

        if not self.name_pattern.match(name):
            raise ValueError("姓名应为2-4个汉字")

        # Convert once and derive the joined, spaced and initials forms.
        syllables = lazy_pinyin(name)
        variations = {
            ''.join(syllables),
            ' '.join(syllables),
            ''.join(s[0] for s in syllables if s),
        }

        # Alternative surname readings: the surname takes each listed reading
        # while the given name keeps its default conversion.
        for surname, pinyin_list in self.surname_mapping.items():
            if not name.startswith(surname):
                continue
            rest_pinyin = ''.join(lazy_pinyin(name[len(surname):]))
            for surname_pinyin in pinyin_list:
                variations.add(surname_pinyin.replace(' ', '') + rest_pinyin)
                # Drop an empty given-name part so a bare surname does not
                # produce a spelling with a trailing space.
                variations.add(
                    ' '.join(p for p in (surname_pinyin, rest_pinyin) if p)
                )

        # Case variants are computed from a snapshot so the set is not
        # modified while it is being iterated.
        case_variations = set()
        for var in variations:
            case_variations.update((var.lower(), var.upper(), var.title()))
        variations.update(case_variations)

        return {
            'full_name': name,
            'pinyin_variations': sorted(variations)
        }

    def create_pinyin_index(self, names: List[str]) -> Dict[str, List[str]]:
        """Build a lookup from pinyin key to the names that produce it.

        Keys are variations with spaces removed and lowercased. Names that
        fail validation are reported with ``print`` and skipped.

        Args:
            names: Names to index.

        Returns:
            A dict mapping each key to a list of normalized names, with each
            name listed at most once per key.
        """
        index: Dict[str, List[str]] = {}
        for name in names:
            try:
                pinyin_info = self.get_name_pinyin_variations(name)
            except ValueError as e:
                print(f"跳过无效姓名 '{name}': {e}")
                continue

            full_name = pinyin_info['full_name']
            for variation in pinyin_info['pinyin_variations']:
                key = variation.replace(' ', '').lower()
                bucket = index.setdefault(key, [])
                # Many variations collapse to the same key (case and spacing
                # differences), so guard against repeated entries.
                if full_name not in bucket:
                    bucket.append(full_name)

        return index

    def search_names(self,
                    query: str,
                    name_index: Dict[str, List[str]],
                    exact_match: bool = False) -> List[str]:
        """Search an index built by ``create_pinyin_index``.

        Args:
            query: Pinyin, initials or Chinese characters.
            name_index: Mapping of pinyin key to list of names.
            exact_match: When true, a pinyin query must equal an index key and
                a Chinese query must equal a whole name. When false,
                substring matching is used in both directions for pinyin keys
                and a Chinese query may be any part of a name.

        Returns:
            The matching names, sorted. An empty or blank query matches
            nothing.
        """
        query = self.normalize_query(query)
        if not query:
            # An empty string is a substring of every key; without this guard
            # a blank fuzzy query would return the whole index.
            return []

        results = set()
        query_key = query.replace(' ', '').lower()

        # 1. Chinese characters are compared against the names themselves.
        if re.search(r'[\u4e00-\u9fa5]', query):
            for names in name_index.values():
                for name in names:
                    matched = (
                        (query_key == name) if exact_match
                        else (query_key in name)
                    )
                    if matched:
                        results.add(name)

        # 2. Pinyin lookup against the index keys.
        if exact_match:
            results.update(name_index.get(query_key, ()))
        else:
            for key, names in name_index.items():
                if query_key in key or key in query_key:
                    results.update(names)

        return sorted(results)

    def normalize_query(self, query: str) -> str:
        """Return *query* NFKC-normalized, trimmed, lowercased and with runs
        of whitespace collapsed to a single space."""
        # NFKC keeps queries comparable with names, which are normalized the
        # same way (e.g. full-width Latin letters become ASCII).
        query = unicodedata.normalize('NFKC', query).strip().lower()
        return re.sub(r'\s+', ' ', query)

    def get_pinyin_breakdown(self, name: str) -> List[Dict]:
        """Describe each character of *name* individually.

        Each character is converted on its own, without the context of the
        rest of the name.

        Returns:
            One dict per character with the keys ``character``, ``position``,
            ``is_surname``, ``pinyin_with_tone``, ``pinyin_plain``,
            ``initial`` and ``possible_variants``.
        """
        name = self.normalize_name(name)
        breakdown = []

        for i, char in enumerate(name):
            # Style.TONE yields the tone-marked reading; the default style of
            # lazy_pinyin yields the plain one.
            toned = lazy_pinyin(char, style=Style.TONE)
            plain = lazy_pinyin(char)
            plain_pinyin = plain[0] if plain else ''

            breakdown.append({
                'character': char,
                'position': i,
                'is_surname': i == 0,  # assumes the first character is the surname
                'pinyin_with_tone': toned[0] if toned else '',
                'pinyin_plain': plain_pinyin,
                'initial': plain_pinyin[0] if plain_pinyin else '',
                'possible_variants': self._get_char_pinyin_variants(char)
            })

        return breakdown

    def _get_char_pinyin_variants(self, char: str) -> List[str]:
        """Return the sorted, distinct toneless readings of a single character."""
        # lazy_pinyin() has no heteronym option; pinyin() does and returns one
        # list of readings per input character.
        from pypinyin import pinyin

        readings = pinyin(char, style=Style.NORMAL, heteronym=True)
        if not readings:
            return []
        return sorted(set(readings[0]))


class NameSearchEngine:
    """Keep a list of names together with a pinyin index for searching them."""

    def __init__(self):
        self.pinyin_converter = ChineseNamePinyin()
        self.name_index = {}      # pinyin key -> list of names
        self.original_names = []  # normalized names, in insertion order

    def load_names(self, names: List[str]) -> None:
        """Replace the current contents with *names*.

        Each name is normalized before it is validated, matching what
        ``add_name`` does, so input such as ``"张 三"`` is accepted as
        ``"张三"``. Invalid names and repeated names are dropped.
        """
        converter = self.pinyin_converter
        accepted = []
        seen = set()
        for name in names:
            normalized = converter.normalize_name(name)
            if normalized in seen:
                continue
            if not converter.name_pattern.match(normalized):
                continue
            seen.add(normalized)
            accepted.append(normalized)

        self.original_names = accepted
        self.name_index = converter.create_pinyin_index(accepted)

    def add_name(self, name: str) -> bool:
        """Add one name to the list and the index.

        Returns:
            True if the name is valid (whether or not it was already present),
            False if it fails validation.
        """
        try:
            normalized = self.pinyin_converter.normalize_name(name)
            if not self.pinyin_converter.name_pattern.match(normalized):
                return False

            pinyin_info = self.pinyin_converter.get_name_pinyin_variations(normalized)

            # Register the name under every key, once per key.
            for variation in pinyin_info['pinyin_variations']:
                key = variation.replace(' ', '').lower()
                bucket = self.name_index.setdefault(key, [])
                if normalized not in bucket:
                    bucket.append(normalized)

            if normalized not in self.original_names:
                self.original_names.append(normalized)

            return True
        except ValueError:
            return False

    def search(self, query: str, fuzzy: bool = True) -> List[str]:
        """Search the index; ``fuzzy=False`` requests exact matching."""
        return self.pinyin_converter.search_names(
            query,
            self.name_index,
            exact_match=not fuzzy
        )

    def get_pinyin_info(self, name: str) -> Optional[Dict]:
        """Return the pinyin variations for *name*, or None if it is invalid."""
        try:
            return self.pinyin_converter.get_name_pinyin_variations(name)
        except ValueError:
            return None

    def get_all_names(self) -> List[str]:
        """Return a sorted copy of all stored names."""
        return sorted(self.original_names)


# 3. 使用示例
def main():
    """Run a console walkthrough of NameSearchEngine.

    Prints pinyin variations, a per-character breakdown, several searches and
    the final list of names. Nothing is returned.
    """
    engine = NameSearchEngine()

    sample_names = [
        "张三", "李四", "王五", "赵六",
        "欧阳修", "诸葛亮", "司马光",
        "令狐冲", "单雄信", "解珍",
        "尉迟恭", "长孙无忌"
    ]

    engine.load_names(sample_names)

    print("=" * 50)
    print("中文姓名拼音转换与搜索系统")
    print("=" * 50)

    # Example 1: pinyin variations of a few names.
    print("\n1. 姓名的拼音变体示例:")
    test_names = ["张三", "令狐冲", "单雄信"]
    for name in test_names:
        info = engine.get_pinyin_info(name)
        if info:
            print(f"\n姓名: {info['full_name']}")
            print(f"拼音变体: {info['pinyin_variations'][:5]}...")  # first five only

    # Example 2: per-character breakdown.
    print("\n\n2. 姓名拼音分解示例:")
    breakdown = engine.pinyin_converter.get_pinyin_breakdown("欧阳修")
    for char_info in breakdown:
        print(f"  字符: {char_info['character']} "
              f"| 拼音: {char_info['pinyin_plain']} "
              f"| 声调: {char_info['pinyin_with_tone']} "
              f"| 首字母: {char_info['initial']}")

    # Example 3: searches. Each case carries its own mode so that the case
    # labelled as an exact search really runs with fuzzy matching disabled.
    print("\n\n3. 搜索示例:")

    search_cases = [
        ("zhangsan", "精确搜索 'zhangsan'", False),
        ("zs", "首字母搜索 'zs'", True),
        ("欧阳", "中文搜索 '欧阳'", True),
        ("zhang", "模糊搜索 'zhang'", True),
        ("令狐", "复姓搜索 '令狐'", True),
        ("shan", "多音字搜索 'shan'(单雄信)", True),
    ]

    for query, description, fuzzy in search_cases:
        results = engine.search(query, fuzzy=fuzzy)
        print(f"\n{description}:")
        print(f"  结果: {results if results else '无匹配结果'}")

    # Example 4: add names after loading, then search for them.
    print("\n\n4. 添加新姓名并搜索:")
    engine.add_name("孙悟空")
    engine.add_name("唐僧")

    results = engine.search("sunwukong")
    print(f"搜索 'sunwukong': {results}")

    results = engine.search("swk")
    print(f"搜索首字母 'swk': {results}")

    # Example 5: list everything currently stored.
    print("\n\n5. 当前所有姓名:")
    all_names = engine.get_all_names()
    for i, name in enumerate(all_names, 1):
        print(f"  {i:2d}. {name}")


# 4. 高级功能扩展
class AdvancedNameSearch(NameSearchEngine):
    """NameSearchEngine with initials search, partial search and CSV export."""

    def search_by_initial(self, initials: str) -> List[str]:
        """Return the stored names whose pinyin initials equal *initials*.

        Whitespace and letter case in *initials* are ignored. The initials of
        each name are taken from a conversion of the whole name, the same way
        ``export_to_csv`` computes them.
        """
        initials = ''.join(initials.split()).lower()
        if not initials:
            return []

        results = []
        for name in self.original_names:
            name_initials = ''.join(p[0] for p in lazy_pinyin(name) if p)
            if name_initials == initials:
                results.append(name)

        return results

    def search_by_partial(self, partial: str) -> List[str]:
        """Return the sorted names that contain *partial*.

        *partial* is matched against the Chinese name, its joined pinyin and
        its space-separated pinyin. A blank query matches nothing.
        """
        partial = partial.strip().lower()
        if not partial:
            # '' is a substring of everything; treat it as "no query".
            return []

        results = set()
        for name in self.original_names:
            # Convert once and derive both pinyin spellings from it.
            syllables = [s.lower() for s in lazy_pinyin(name)]
            if (partial in name or
                    partial in ''.join(syllables) or
                    partial in ' '.join(syllables)):
                results.add(name)

        return sorted(results)

    def export_to_csv(self, filename: str):
        """Write one row per stored name to *filename* as CSV.

        Columns: name, space-separated pinyin, initials, number of pinyin
        variations. The file is written as UTF-8 with a BOM ("utf-8-sig").
        """
        import csv

        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['姓名', '全拼', '首字母', '拼音变体数'])

            for name in self.original_names:
                info = self.get_pinyin_info(name)
                if not info:
                    continue
                syllables = lazy_pinyin(name)
                writer.writerow([
                    name,
                    ' '.join(syllables),
                    ''.join(p[0] for p in syllables if p),
                    len(info['pinyin_variations'])
                ])

        print(f"数据已导出到 {filename}")


if __name__ == "__main__":
    # Basic walkthrough first.
    main()

    # Then the extended search helpers.
    banner = "=" * 50
    print("\n" + banner)
    print("高级功能演示")
    print(banner)

    advanced_engine = AdvancedNameSearch()
    demo_names = ["刘备", "关羽", "张飞", "曹操", "孙权"]
    advanced_engine.load_names(demo_names)

    print("\n按首字母搜索 'lb':")
    initial_hits = advanced_engine.search_by_initial("lb")
    print(initial_hits)

    print("\n部分匹配搜索 'fei':")
    partial_hits = advanced_engine.search_by_partial("fei")
    print(partial_hits)

    # CSV export is left disabled so the demo writes no files.
    # advanced_engine.export_to_csv("names_pinyin.csv")

5. 主要功能说明

核心功能:

- 拼音转换:将中文姓名转换为多种拼音格式
- 多音字处理:自动处理常见姓氏的多音字
- 拼音变体生成:生成全拼、带空格、首字母等多种格式
- 智能搜索:支持拼音、首字母、汉字混合搜索
- 模糊匹配:支持模糊搜索和精确搜索

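搜索方式:

- 全拼搜索(如 zhangsan 或 zhang san)
- 首字母搜索(如 zs)
- 汉字搜索(如 欧阳)
- 模糊匹配(默认)与精确匹配(fuzzy=False)

扩展功能:

- search_by_initial:按首字母缩写搜索
- search_by_partial:按姓或名的部分拼音或汉字搜索
- export_to_csv:将姓名、全拼、首字母和拼音变体数导出为 CSV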

6. 简单使用示例

# Quick start
from pypinyin import lazy_pinyin

# Convert one name and derive the common spellings from the syllable list.
name = "张三"
pinyin_list = lazy_pinyin(name)  # ['zhang', 'san']
full_pinyin = ''.join(pinyin_list)  # 'zhangsan'
initials = ''.join(syllable[0] for syllable in pinyin_list)  # 'zs'

print(f"姓名: {name}")
print(f"拼音: {' '.join(pinyin_list)}")
print(f"全拼: {full_pinyin}")
print(f"首字母: {initials}")

这个实现提供了完整的姓名拼音转换和搜索功能,支持模糊匹配、多音字处理等实际需求。你可以根据具体需求进行调整和扩展。

相关推荐