Python集合与字典高级技巧一、字典的底层实现Python字典基于哈希表实现Python 3.7保证插入顺序。理解其工作原理有助于写出更高效的代码。# 哈希冲突演示class BadHash:所有实例哈希值相同导致性能退化def __init__(self, value):self.value valuedef __hash__(self):return 1 # 所有对象哈希相同def __eq__(self, other):return self.value other.value# 正确的哈希实现class Point:def __init__(self, x, y):self.x xself.y ydef __hash__(self):return hash((self.x, self.y))def __eq__(self, other):return isinstance(other, Point) and self.x other.x and self.y other.y二、字典推导与高级操作# 字典推导squares {x: x**2 for x in range(10)}filtered {k: v for k, v in squares.items() if v 20}# 字典合并Python 3.9defaults {host: localhost, port: 8080, debug: False}overrides {port: 9090, debug: True}config defaults | overrides # {host: localhost, port: 9090, debug: True}# 就地合并defaults | overrides# 字典解包合并Python 3.5merged {**defaults, **overrides}# 条件字典构建def build_query(**kwargs):return {k: v for k, v in kwargs.items() if v is not None}query build_query(nameAlice, ageNone, cityBeijing)# {name: Alice, city: Beijing}三、collections模块3.1 defaultdictfrom collections import defaultdict# 分组students [(math, Alice), (math, Bob),(physics, Charlie), (physics, Alice),(cs, Bob), (cs, Diana),]by_subject defaultdict(list)for subject, student in students:by_subject[subject].append(student)# 嵌套defaultdicttree lambda: defaultdict(tree)taxonomy tree()taxonomy[动物][哺乳类][猫科][猫] Truetaxonomy[动物][哺乳类][犬科][狗] True# 计数word_count defaultdict(int)for word in text.split():word_count[word] 13.2 Counterfrom collections import Counter# 基本计数words the quick brown fox jumps over the lazy dog the fox.split()counter Counter(words)print(counter.most_common(3)) # [(the, 3), (fox, 2), (quick, 1)]# 算术运算c1 Counter(a3, b1)c2 Counter(a1, b2)print(c1 c2) # Counter({a: 4, b: 3})print(c1 - c2) # Counter({a: 2}) 负数被忽略print(c1 c2) # Counter({a: 1, b: 1}) 取最小值print(c1 | c2) # Counter({a: 3, b: 2}) 取最大值# 实际应用文本相似度def text_similarity(text1, text2):c1 Counter(text1.lower().split())c2 Counter(text2.lower().split())common sum((c1 c2).values())total sum((c1 | c2).values())return common / total if total else 03.3 OrderedDictfrom collections import OrderedDictclass LRUCache:基于OrderedDict的LRU缓存def __init__(self, capacity):self.cache OrderedDict()self.capacity capacitydef get(self, key):if key in self.cache:self.cache.move_to_end(key)return self.cache[key]return Nonedef put(self, key, value):if key in self.cache:self.cache.move_to_end(key)self.cache[key] valueif len(self.cache) self.capacity:self.cache.popitem(lastFalse) # 移除最旧的cache LRUCache(3)cache.put(a, 1)cache.put(b, 2)cache.put(c, 3)cache.get(a) # 访问a使其变为最新cache.put(d, 4) # 容量满移除最旧的b3.4 ChainMapfrom collections import ChainMap# 多层配置查找defaults {color: red, size: medium, debug: False}env_config {debug: True}cli_args {color: blue}# 按优先级查找cli_args env_config defaultsconfig ChainMap(cli_args, env_config, defaults)print(config[color]) # blue来自cli_argsprint(config[debug]) # True来自env_configprint(config[size]) # medium来自defaults# 作用域模拟class Scope:def __init__(self):self.maps ChainMap()def push(self):self.maps self.maps.new_child()def pop(self):self.maps self.maps.parentsdef set(self, key, value):self.maps[key] valuedef get(self, key):return self.maps.get(key)四、集合操作# 集合运算a {1, 2, 3, 4, 5}b {4, 5, 6, 7, 8}print(a b) # 交集: {4, 5}print(a | b) # 并集: {1, 2, 3, 4, 5, 6, 7, 8}print(a - b) # 差集: {1, 2, 3}print(a ^ b) # 对称差集: {1, 2, 3, 6, 7, 8}# 子集和超集print({1, 2} {1, 2, 3}) # True子集print({1, 2, 3} {1, 2}) # True超集# frozenset不可变集合可作为字典键permissions frozenset([read, write])role_permissions {frozenset([read]): viewer,frozenset([read, write]): editor,frozenset([read, write, admin]): admin,}# 实际应用去重保持顺序def unique_ordered(items):seen set()result []for item in items:if item not in seen:seen.add(item)result.append(item)return result# Python 3.7 可以用dict.fromkeysdef unique_ordered_v2(items):return list(dict.fromkeys(items))五、字典视图对象d {a: 1, b: 2, c: 3}# 视图是动态的反映字典的当前状态keys d.keys()values d.values()items d.items()d[d] 4print(len(keys)) # 4视图自动更新# 视图支持集合运算d1 {a: 1, b: 2, c: 3}d2 {b: 2, c: 4, d: 5}# 共同的键common_keys d1.keys() d2.keys() # {b, c}# 只在d1中的键only_d1 d1.keys() - d2.keys() # {a}# 键值都相同的项same_items d1.items() d2.items() # {(b, 2)}六、字典的setdefault和get# setdefault键不存在时设置默认值并返回index {}words [apple, banana, avocado, blueberry, cherry]for word in words:first_letter word[0]index.setdefault(first_letter, []).append(word)# {a: [apple, avocado], b: [banana, blueberry], c: [cherry]}# 对比defaultdictfrom collections import defaultdictindex defaultdict(list)for word in words:index[word[0]].append(word)# get的妙用安全的嵌套访问data {user: {profile: {name: Alice}}}# 不安全# name data[user][profile][name] # 任何层缺失都会KeyError# 安全方式def deep_get(d, *keys, defaultNone):for key in keys:if isinstance(d, dict):d d.get(key, default)else:return defaultreturn dname deep_get(data, user, profile, name) # Alicemissing deep_get(data, user, settings, theme, defaultdark) # dark七、自定义字典类class CaseInsensitiveDict(dict):大小写不敏感的字典def __setitem__(self, key, value):super().__setitem__(key.lower(), value)def __getitem__(self, key):return super().__getitem__(key.lower())def __contains__(self, key):return super().__contains__(key.lower())def get(self, key, defaultNone):return super().get(key.lower(), default)headers CaseInsensitiveDict()headers[Content-Type] application/jsonprint(headers[content-type]) # application/jsonclass AttrDict(dict):支持属性访问的字典def __getattr__(self, key):try:return self[key]except KeyError:raise AttributeError(key)def __setattr__(self, key, value):self[key] valuedef __delattr__(self, key):try:del self[key]except KeyError:raise AttributeError(key)config AttrDict(hostlocalhost, port8080)print(config.host) # localhostconfig.debug True八、性能对比import timeit# 查找性能# list: O(n)# dict: O(1)# set: O(1)n 100000test_list list(range(n))test_set set(range(n))test_dict {i: True for i in range(n)}# 查找最后一个元素# list: ~3ms# set: ~0.05ms# dict: ~0.05ms# 内存占用对比存储100万个整数# list: ~8MB# set: ~32MB额外存储哈希值# dict: ~40MB存储键值对# 选择建议# 只需要存在性检查 - set# 需要键值映射 - dict# 需要有序且支持索引 - list# 需要频繁增删两端 - deque九、实际应用模式# 1. 字典作为switch/casedef handle_event(event_type, data):handlers {click: handle_click,submit: handle_submit,scroll: handle_scroll,}handler handlers.get(event_type, handle_unknown)return handler(data)# 2. 缓存/记忆化def memoize(func):cache {}def wrapper(*args):if args not in cache:cache[args] func(*args)return cache[args]return wrapper# 3. 倒排索引def build_inverted_index(documents):index defaultdict(set)for doc_id, text in enumerate(documents):for word in text.lower().split():index[word].add(doc_id)return indexdef search(index, query):words query.lower().split()if not words:return set()result index[words[0]]for word in words[1:]:result index[word] # 交集所有词都出现的文档return result# 4. 双向映射class BiDict:def __init__(self):self._forward {}self._backward {}def put(self, key, value):self._forward[key] valueself._backward[value] keydef get_by_key(self, key):return self._forward.get(key)def get_by_value(self, value):return self._backward.get(value)十、总结集合与字典使用要点1. 字典和集合的查找是O(1)大量查找时远优于列表2. 使用defaultdict和Counter简化计数和分组操作3. ChainMap适合多层配置的场景4. 集合运算交并差比手动循环更高效且更清晰5. 字典视图支持集合运算可以高效比较两个字典6. 自定义__hash__和__eq__使对象可以作为字典键或集合元素
Python集合与字典高级技巧
Python集合与字典高级技巧一、字典的底层实现Python字典基于哈希表实现Python 3.7保证插入顺序。理解其工作原理有助于写出更高效的代码。# 哈希冲突演示class BadHash:所有实例哈希值相同导致性能退化def __init__(self, value):self.value valuedef __hash__(self):return 1 # 所有对象哈希相同def __eq__(self, other):return self.value other.value# 正确的哈希实现class Point:def __init__(self, x, y):self.x xself.y ydef __hash__(self):return hash((self.x, self.y))def __eq__(self, other):return isinstance(other, Point) and self.x other.x and self.y other.y二、字典推导与高级操作# 字典推导squares {x: x**2 for x in range(10)}filtered {k: v for k, v in squares.items() if v 20}# 字典合并Python 3.9defaults {host: localhost, port: 8080, debug: False}overrides {port: 9090, debug: True}config defaults | overrides # {host: localhost, port: 9090, debug: True}# 就地合并defaults | overrides# 字典解包合并Python 3.5merged {**defaults, **overrides}# 条件字典构建def build_query(**kwargs):return {k: v for k, v in kwargs.items() if v is not None}query build_query(nameAlice, ageNone, cityBeijing)# {name: Alice, city: Beijing}三、collections模块3.1 defaultdictfrom collections import defaultdict# 分组students [(math, Alice), (math, Bob),(physics, Charlie), (physics, Alice),(cs, Bob), (cs, Diana),]by_subject defaultdict(list)for subject, student in students:by_subject[subject].append(student)# 嵌套defaultdicttree lambda: defaultdict(tree)taxonomy tree()taxonomy[动物][哺乳类][猫科][猫] Truetaxonomy[动物][哺乳类][犬科][狗] True# 计数word_count defaultdict(int)for word in text.split():word_count[word] 13.2 Counterfrom collections import Counter# 基本计数words the quick brown fox jumps over the lazy dog the fox.split()counter Counter(words)print(counter.most_common(3)) # [(the, 3), (fox, 2), (quick, 1)]# 算术运算c1 Counter(a3, b1)c2 Counter(a1, b2)print(c1 c2) # Counter({a: 4, b: 3})print(c1 - c2) # Counter({a: 2}) 负数被忽略print(c1 c2) # Counter({a: 1, b: 1}) 取最小值print(c1 | c2) # Counter({a: 3, b: 2}) 取最大值# 实际应用文本相似度def text_similarity(text1, text2):c1 Counter(text1.lower().split())c2 Counter(text2.lower().split())common sum((c1 c2).values())total sum((c1 | c2).values())return common / total if total else 03.3 OrderedDictfrom collections import OrderedDictclass LRUCache:基于OrderedDict的LRU缓存def __init__(self, capacity):self.cache OrderedDict()self.capacity capacitydef get(self, key):if key in self.cache:self.cache.move_to_end(key)return self.cache[key]return Nonedef put(self, key, value):if key in self.cache:self.cache.move_to_end(key)self.cache[key] valueif len(self.cache) self.capacity:self.cache.popitem(lastFalse) # 移除最旧的cache LRUCache(3)cache.put(a, 1)cache.put(b, 2)cache.put(c, 3)cache.get(a) # 访问a使其变为最新cache.put(d, 4) # 容量满移除最旧的b3.4 ChainMapfrom collections import ChainMap# 多层配置查找defaults {color: red, size: medium, debug: False}env_config {debug: True}cli_args {color: blue}# 按优先级查找cli_args env_config defaultsconfig ChainMap(cli_args, env_config, defaults)print(config[color]) # blue来自cli_argsprint(config[debug]) # True来自env_configprint(config[size]) # medium来自defaults# 作用域模拟class Scope:def __init__(self):self.maps ChainMap()def push(self):self.maps self.maps.new_child()def pop(self):self.maps self.maps.parentsdef set(self, key, value):self.maps[key] valuedef get(self, key):return self.maps.get(key)四、集合操作# 集合运算a {1, 2, 3, 4, 5}b {4, 5, 6, 7, 8}print(a b) # 交集: {4, 5}print(a | b) # 并集: {1, 2, 3, 4, 5, 6, 7, 8}print(a - b) # 差集: {1, 2, 3}print(a ^ b) # 对称差集: {1, 2, 3, 6, 7, 8}# 子集和超集print({1, 2} {1, 2, 3}) # True子集print({1, 2, 3} {1, 2}) # True超集# frozenset不可变集合可作为字典键permissions frozenset([read, write])role_permissions {frozenset([read]): viewer,frozenset([read, write]): editor,frozenset([read, write, admin]): admin,}# 实际应用去重保持顺序def unique_ordered(items):seen set()result []for item in items:if item not in seen:seen.add(item)result.append(item)return result# Python 3.7 可以用dict.fromkeysdef unique_ordered_v2(items):return list(dict.fromkeys(items))五、字典视图对象d {a: 1, b: 2, c: 3}# 视图是动态的反映字典的当前状态keys d.keys()values d.values()items d.items()d[d] 4print(len(keys)) # 4视图自动更新# 视图支持集合运算d1 {a: 1, b: 2, c: 3}d2 {b: 2, c: 4, d: 5}# 共同的键common_keys d1.keys() d2.keys() # {b, c}# 只在d1中的键only_d1 d1.keys() - d2.keys() # {a}# 键值都相同的项same_items d1.items() d2.items() # {(b, 2)}六、字典的setdefault和get# setdefault键不存在时设置默认值并返回index {}words [apple, banana, avocado, blueberry, cherry]for word in words:first_letter word[0]index.setdefault(first_letter, []).append(word)# {a: [apple, avocado], b: [banana, blueberry], c: [cherry]}# 对比defaultdictfrom collections import defaultdictindex defaultdict(list)for word in words:index[word[0]].append(word)# get的妙用安全的嵌套访问data {user: {profile: {name: Alice}}}# 不安全# name data[user][profile][name] # 任何层缺失都会KeyError# 安全方式def deep_get(d, *keys, defaultNone):for key in keys:if isinstance(d, dict):d d.get(key, default)else:return defaultreturn dname deep_get(data, user, profile, name) # Alicemissing deep_get(data, user, settings, theme, defaultdark) # dark七、自定义字典类class CaseInsensitiveDict(dict):大小写不敏感的字典def __setitem__(self, key, value):super().__setitem__(key.lower(), value)def __getitem__(self, key):return super().__getitem__(key.lower())def __contains__(self, key):return super().__contains__(key.lower())def get(self, key, defaultNone):return super().get(key.lower(), default)headers CaseInsensitiveDict()headers[Content-Type] application/jsonprint(headers[content-type]) # application/jsonclass AttrDict(dict):支持属性访问的字典def __getattr__(self, key):try:return self[key]except KeyError:raise AttributeError(key)def __setattr__(self, key, value):self[key] valuedef __delattr__(self, key):try:del self[key]except KeyError:raise AttributeError(key)config AttrDict(hostlocalhost, port8080)print(config.host) # localhostconfig.debug True八、性能对比import timeit# 查找性能# list: O(n)# dict: O(1)# set: O(1)n 100000test_list list(range(n))test_set set(range(n))test_dict {i: True for i in range(n)}# 查找最后一个元素# list: ~3ms# set: ~0.05ms# dict: ~0.05ms# 内存占用对比存储100万个整数# list: ~8MB# set: ~32MB额外存储哈希值# dict: ~40MB存储键值对# 选择建议# 只需要存在性检查 - set# 需要键值映射 - dict# 需要有序且支持索引 - list# 需要频繁增删两端 - deque九、实际应用模式# 1. 字典作为switch/casedef handle_event(event_type, data):handlers {click: handle_click,submit: handle_submit,scroll: handle_scroll,}handler handlers.get(event_type, handle_unknown)return handler(data)# 2. 缓存/记忆化def memoize(func):cache {}def wrapper(*args):if args not in cache:cache[args] func(*args)return cache[args]return wrapper# 3. 倒排索引def build_inverted_index(documents):index defaultdict(set)for doc_id, text in enumerate(documents):for word in text.lower().split():index[word].add(doc_id)return indexdef search(index, query):words query.lower().split()if not words:return set()result index[words[0]]for word in words[1:]:result index[word] # 交集所有词都出现的文档return result# 4. 双向映射class BiDict:def __init__(self):self._forward {}self._backward {}def put(self, key, value):self._forward[key] valueself._backward[value] keydef get_by_key(self, key):return self._forward.get(key)def get_by_value(self, value):return self._backward.get(value)十、总结集合与字典使用要点1. 字典和集合的查找是O(1)大量查找时远优于列表2. 使用defaultdict和Counter简化计数和分组操作3. ChainMap适合多层配置的场景4. 集合运算交并差比手动循环更高效且更清晰5. 字典视图支持集合运算可以高效比较两个字典6. 自定义__hash__和__eq__使对象可以作为字典键或集合元素