大模型长期记忆的工程化之痛我是怎么被上下文窗口搞疯的前言做智能客服系统用户聊到第 10 轮大模型就开始失忆了。翻聊天记录那是真痛苦。长期记忆模块的问题比想象中复杂得多。今天聊聊我们遇到的工程化挑战和应对方案。一、底层原理1.1 长期记忆管理的核心难点大模型的记忆管理有三个层次graph TD A[对话上下文] -- B[短期记忆] B -- C[工作记忆] C -- D[长期记忆] D -- E[向量数据库] D -- F[总结压缩] D -- G[分层检索] B -- H[Token 限制] H -- I[滑动窗口] I -- J[信息丢失]核心挑战Token 窗口有限不能无限塞记忆压缩时丢失细节检索时找不到相关信息记忆的时效性管理1.2 记忆方案对比方案容量检索速度信息损失全部历史很小快无滑动窗口中快大总结压缩大中中向量检索很大中小二、快速上手先看最简单的记忆方案滑动窗口from typing import List, Dict, Any from collections import deque class SlidingWindowMemory: def __init__(self, window_size10): self.window deque(maxlenwindow_size) def add(self, message: Dict[str, str]): self.window.append(message) def get_context(self) - List[Dict[str, str]]: return list(self.window) def clear(self): self.window.clear() memory SlidingWindowMemory(5) memory.add({role: user, content: 你好}) memory.add({role: assistant, content: 你好有什么可以帮助的}) print(memory.get_context())这够简单但窗口大小就是记忆瓶颈。再看更好的实现class SummaryMemory: def __init__(self, llm, max_tokens2000): self.llm llm self.max_tokens max_tokens self.history [] self.summary def add(self, message: Dict): self.history.append(message) if self._estimate_tokens() self.max_tokens: self._compress() def _estimate_tokens(self): total len(self.summary.split()) for msg in self.history: total len(msg.get(content, ).split()) return total def _compress(self): text \n.join(f{m[role]}: {m[content]} for m in self.history) prompt f压缩以下对话为摘要保留关键信息\n{text} self.summary self.llm(prompt) self.history [] def get_context(self): context [{role: system, content: f历史摘要{self.summary}}] context.extend(self.history) return context三、核心 API / 深水区3.1 记忆管理策略速查策略适用场景优点缺点滑动窗口短期对话实现简单丢失历史总结压缩长对话压缩效果好总结开销向量检索知识问答容量大检索可能不准分层记忆复杂场景兼顾长短实现复杂3.2 向量检索记忆import json from typing import List, Dict, Any class VectorStoreMemory: def __init__(self, embedding_func, store): self.embedding embedding_func self.store store self.session_memory [] def add(self, message: Dict): self.session_memory.append(message) def save_to_long_term(self): if len(self.session_memory) 5: return text \n.join(m[content] for m in self.session_memory) embedding self.embedding(text) self.store.add( idfmem_{len(self.store)}, vectorembedding, metadata{ text: text, timestamp: time.time() } ) self.session_memory [] def retrieve(self, query: str, k3) - List[str]: query_embedding self.embedding(query) results self.store.similarity_search(query_embedding, kk) return [r.metadata[text] for r in results]3.3 记忆优先级管理class PriorityMemory: def __init__(self, max_items100): self.max_items max_items self.items [] def add(self, item: Dict, priority: int 1): self.items.append({item: item, priority: priority}) self.items.sort(keylambda x: x[priority], reverseTrue) self.items self.items[:self.max_items] def get_top(self, n10): return [i[item] for i in self.items[:n]] def increase_priority(self, key: str, amount: int 1): for item in self.items: if item[item].get(key) key: item[priority] amount break self.items.sort(keylambda x: x[priority], reverseTrue)四、实战演练完整的多层记忆系统import time from typing import List, Dict, Any, Optional from dataclasses import dataclass, field dataclass class MemoryEntry: content: str timestamp: float type: str importance: int 1 class LayeredMemory: def __init__(self, llm, vector_store): self.llm llm self.working_memory [] self.short_term [] self.long_term vector_store self.max_working 20 self.max_short 100 def add(self, entry: MemoryEntry): # 1. 加到工作记忆 self.working_memory.append(entry) if len(self.working_memory) self.max_working: self._consolidate() def _consolidate(self): # 压缩到短期记忆 text \n.join(e.content for e in self.working_memory) if len(text) 500: prompt f压缩以下内容{text[:1000]} summary self.llm(prompt) self.short_term.append(MemoryEntry( contentsummary, timestamptime.time(), typesummary, importance5 )) self.working_memory [] if len(self.short_term) self.max_short: self._archive() def _archive(self): # 归档到长期记忆 for entry in self.short_term: if entry.importance 3: self.long_term.add(entry.content) self.short_term [] def retrieve(self, query: str, k5) - List[str]: results [] # 1. 工作记忆 for entry in self.working_memory[-5:]: results.append(entry.content) # 2. 短期记忆 for entry in self.short_term[-3:]: results.append(entry.content) # 3. 长期记忆 long_results self.long_term.similarity_search(query, k2) results.extend(long_results) return results memory LayeredMemory(llm, vector_store) memory.add(MemoryEntry(用户说想退款, time.time(), user, 5)) memory.add(MemoryEntry(系统查询到订单, time.time(), system, 3)) context memory.retrieve(退款政策) print(context)五、避坑指南与最佳实践 **技巧记忆要有重要性评分不是所有信息都重要给记忆打分优先保留高分的。⚠️ **警告压缩太多会丢关键信息压缩比不要超过 5:1不然细节全没了。✅ **推荐分层记忆架构工作记忆 短期 长期兼顾速度和容量。六、综合实战演示生产级长期记忆系统import json import time from typing import Dict, List, Any, Optional from collections import OrderedDict class LongTermMemory: def __init__(self, llm, persist_pathmemory.json): self.llm llm self.persist_path persist_path self.episodes [] self.index OrderedDict() def store_episode(self, episode: Dict): self.episodes.append({ content: episode, timestamp: time.time(), id: len(self.episodes) }) # 生成摘要和关键词 text json.dumps(episode, ensure_asciiFalse) keywords self._extract_keywords(text) for kw in keywords: if kw not in self.index: self.index[kw] [] self.index[kw].append(len(self.episodes) - 1) def _extract_keywords(self, text): prompt f提取关键词3-5个{text[:200]} result self.llm(prompt) return [kw.strip() for kw in result.split(,) if kw.strip()] def recall(self, query: str, k5) - List[Dict]: # 关键词匹配 keywords self._extract_keywords(query) episode_ids set() for kw in keywords: for eid in self.index.get(kw, []): episode_ids.add(eid) # 关联意图 matched [] for eid in list(episode_ids)[:k]: if eid len(self.episodes): matched.append(self.episodes[eid]) return matched def forget_old(self, max_age_hours48): now time.time() self.episodes [ e for e in self.episodes if (now - e[timestamp]) max_age_hours * 3600 ] def persist(self): data { episodes: self.episodes, index: dict(self.index) } with open(self.persist_path, w) as f: json.dump(data, f, ensure_asciiFalse) def load(self): try: with open(self.persist_path) as f: data json.load(f) self.episodes data.get(episodes, []) self.index OrderedDict(data.get(index, {})) except: pass memory LongTermMemory(llm) memory.store_episode({action: 查询订单, result: 已发货}) memory.store_episode({action: 退款申请, result: 已退款}) # 回忆 recalled memory.recall(我的订单怎么了) print(recalled) # 持久化 memory.persist()七、总结大模型长期记忆管理的工程化挑战窗口限制用滑动窗口解决信息丢失用总结压缩检索用向量关键词效率用分层架构没有银弹但组合起来能有效缓解失忆问题。
大模型长期记忆的工程化之痛:我是怎么被上下文窗口搞疯的
大模型长期记忆的工程化之痛我是怎么被上下文窗口搞疯的前言做智能客服系统用户聊到第 10 轮大模型就开始失忆了。翻聊天记录那是真痛苦。长期记忆模块的问题比想象中复杂得多。今天聊聊我们遇到的工程化挑战和应对方案。一、底层原理1.1 长期记忆管理的核心难点大模型的记忆管理有三个层次graph TD A[对话上下文] -- B[短期记忆] B -- C[工作记忆] C -- D[长期记忆] D -- E[向量数据库] D -- F[总结压缩] D -- G[分层检索] B -- H[Token 限制] H -- I[滑动窗口] I -- J[信息丢失]核心挑战Token 窗口有限不能无限塞记忆压缩时丢失细节检索时找不到相关信息记忆的时效性管理1.2 记忆方案对比方案容量检索速度信息损失全部历史很小快无滑动窗口中快大总结压缩大中中向量检索很大中小二、快速上手先看最简单的记忆方案滑动窗口from typing import List, Dict, Any from collections import deque class SlidingWindowMemory: def __init__(self, window_size10): self.window deque(maxlenwindow_size) def add(self, message: Dict[str, str]): self.window.append(message) def get_context(self) - List[Dict[str, str]]: return list(self.window) def clear(self): self.window.clear() memory SlidingWindowMemory(5) memory.add({role: user, content: 你好}) memory.add({role: assistant, content: 你好有什么可以帮助的}) print(memory.get_context())这够简单但窗口大小就是记忆瓶颈。再看更好的实现class SummaryMemory: def __init__(self, llm, max_tokens2000): self.llm llm self.max_tokens max_tokens self.history [] self.summary def add(self, message: Dict): self.history.append(message) if self._estimate_tokens() self.max_tokens: self._compress() def _estimate_tokens(self): total len(self.summary.split()) for msg in self.history: total len(msg.get(content, ).split()) return total def _compress(self): text \n.join(f{m[role]}: {m[content]} for m in self.history) prompt f压缩以下对话为摘要保留关键信息\n{text} self.summary self.llm(prompt) self.history [] def get_context(self): context [{role: system, content: f历史摘要{self.summary}}] context.extend(self.history) return context三、核心 API / 深水区3.1 记忆管理策略速查策略适用场景优点缺点滑动窗口短期对话实现简单丢失历史总结压缩长对话压缩效果好总结开销向量检索知识问答容量大检索可能不准分层记忆复杂场景兼顾长短实现复杂3.2 向量检索记忆import json from typing import List, Dict, Any class VectorStoreMemory: def __init__(self, embedding_func, store): self.embedding embedding_func self.store store self.session_memory [] def add(self, message: Dict): self.session_memory.append(message) def save_to_long_term(self): if len(self.session_memory) 5: return text \n.join(m[content] for m in self.session_memory) embedding self.embedding(text) self.store.add( idfmem_{len(self.store)}, vectorembedding, metadata{ text: text, timestamp: time.time() } ) self.session_memory [] def retrieve(self, query: str, k3) - List[str]: query_embedding self.embedding(query) results self.store.similarity_search(query_embedding, kk) return [r.metadata[text] for r in results]3.3 记忆优先级管理class PriorityMemory: def __init__(self, max_items100): self.max_items max_items self.items [] def add(self, item: Dict, priority: int 1): self.items.append({item: item, priority: priority}) self.items.sort(keylambda x: x[priority], reverseTrue) self.items self.items[:self.max_items] def get_top(self, n10): return [i[item] for i in self.items[:n]] def increase_priority(self, key: str, amount: int 1): for item in self.items: if item[item].get(key) key: item[priority] amount break self.items.sort(keylambda x: x[priority], reverseTrue)四、实战演练完整的多层记忆系统import time from typing import List, Dict, Any, Optional from dataclasses import dataclass, field dataclass class MemoryEntry: content: str timestamp: float type: str importance: int 1 class LayeredMemory: def __init__(self, llm, vector_store): self.llm llm self.working_memory [] self.short_term [] self.long_term vector_store self.max_working 20 self.max_short 100 def add(self, entry: MemoryEntry): # 1. 加到工作记忆 self.working_memory.append(entry) if len(self.working_memory) self.max_working: self._consolidate() def _consolidate(self): # 压缩到短期记忆 text \n.join(e.content for e in self.working_memory) if len(text) 500: prompt f压缩以下内容{text[:1000]} summary self.llm(prompt) self.short_term.append(MemoryEntry( contentsummary, timestamptime.time(), typesummary, importance5 )) self.working_memory [] if len(self.short_term) self.max_short: self._archive() def _archive(self): # 归档到长期记忆 for entry in self.short_term: if entry.importance 3: self.long_term.add(entry.content) self.short_term [] def retrieve(self, query: str, k5) - List[str]: results [] # 1. 工作记忆 for entry in self.working_memory[-5:]: results.append(entry.content) # 2. 短期记忆 for entry in self.short_term[-3:]: results.append(entry.content) # 3. 长期记忆 long_results self.long_term.similarity_search(query, k2) results.extend(long_results) return results memory LayeredMemory(llm, vector_store) memory.add(MemoryEntry(用户说想退款, time.time(), user, 5)) memory.add(MemoryEntry(系统查询到订单, time.time(), system, 3)) context memory.retrieve(退款政策) print(context)五、避坑指南与最佳实践 **技巧记忆要有重要性评分不是所有信息都重要给记忆打分优先保留高分的。⚠️ **警告压缩太多会丢关键信息压缩比不要超过 5:1不然细节全没了。✅ **推荐分层记忆架构工作记忆 短期 长期兼顾速度和容量。六、综合实战演示生产级长期记忆系统import json import time from typing import Dict, List, Any, Optional from collections import OrderedDict class LongTermMemory: def __init__(self, llm, persist_pathmemory.json): self.llm llm self.persist_path persist_path self.episodes [] self.index OrderedDict() def store_episode(self, episode: Dict): self.episodes.append({ content: episode, timestamp: time.time(), id: len(self.episodes) }) # 生成摘要和关键词 text json.dumps(episode, ensure_asciiFalse) keywords self._extract_keywords(text) for kw in keywords: if kw not in self.index: self.index[kw] [] self.index[kw].append(len(self.episodes) - 1) def _extract_keywords(self, text): prompt f提取关键词3-5个{text[:200]} result self.llm(prompt) return [kw.strip() for kw in result.split(,) if kw.strip()] def recall(self, query: str, k5) - List[Dict]: # 关键词匹配 keywords self._extract_keywords(query) episode_ids set() for kw in keywords: for eid in self.index.get(kw, []): episode_ids.add(eid) # 关联意图 matched [] for eid in list(episode_ids)[:k]: if eid len(self.episodes): matched.append(self.episodes[eid]) return matched def forget_old(self, max_age_hours48): now time.time() self.episodes [ e for e in self.episodes if (now - e[timestamp]) max_age_hours * 3600 ] def persist(self): data { episodes: self.episodes, index: dict(self.index) } with open(self.persist_path, w) as f: json.dump(data, f, ensure_asciiFalse) def load(self): try: with open(self.persist_path) as f: data json.load(f) self.episodes data.get(episodes, []) self.index OrderedDict(data.get(index, {})) except: pass memory LongTermMemory(llm) memory.store_episode({action: 查询订单, result: 已发货}) memory.store_episode({action: 退款申请, result: 已退款}) # 回忆 recalled memory.recall(我的订单怎么了) print(recalled) # 持久化 memory.persist()七、总结大模型长期记忆管理的工程化挑战窗口限制用滑动窗口解决信息丢失用总结压缩检索用向量关键词效率用分层架构没有银弹但组合起来能有效缓解失忆问题。