"""Embedding optimization techniques: improving retrieval efficiency and quality.

(Embedding 优化技术:提升检索效率与质量)

Foreword (前言)
---------------
Embeddings are a core component of modern AI systems; their quality and
computational efficiency directly affect the performance of the whole system.
By optimizing embedding techniques we can significantly improve retrieval
quality and reduce compute cost. I have applied embedding optimization in
several projects, and this file shares some practical experience.

Contents
--------
* Vector quantization (向量量化): ``VectorQuantizer``, ``ProductQuantizer``
* Embedding pruning and compression (Embedding 裁剪与压缩):
  ``DimensionalityReducer``, ``BinaryEmbedding``
* Mixed precision and caching (混合精度与缓存):
  ``MixedPrecisionEmbedding``, ``EmbeddingCache``
* Optimization in practice (实战优化): ``OptimizedEmbeddingSystem``

Summary (总结)
--------------
Embedding optimization techniques include:

* Vector quantization - reduces storage and speeds up retrieval.
* Dimensionality pruning - PCA / t-SNE dimensionality reduction.
* Binarization - fast comparison via Hamming distance.
* Mixed precision - uses FP16 to lower memory usage.
* Caching - avoids repeated computation.

Key takeaways (关键要点):

* Quantization and dimensionality reduction can greatly reduce storage.
* Caching significantly improves encoding efficiency for hot data.
* A balance has to be found between accuracy and efficiency.
* Choose the optimization strategy that fits the scenario.
"""

import hashlib
import os
import pickle
import warnings
from contextlib import nullcontext
from functools import lru_cache  # noqa: F401 - unused here; kept from the original listing

import numpy as np

# scikit-learn and torch are imported lazily inside the methods that need them
# (the original listing already did this for PCA/TSNE), so the NumPy-only
# helpers stay importable when those optional packages are not installed.

# Runtime message raised when a quantizer is used before fit().
_NOT_FITTED_MESSAGE = "请先调用 fit() 训练量化器"


# ---------------------------------------------------------------------------
# Vector quantization (向量量化)
# Quantization principle (量化原理)
# ---------------------------------------------------------------------------
class VectorQuantizer:
    """K-means vector quantizer: maps a vector to the index of its nearest centroid."""

    def __init__(self, n_clusters=256, dim=1024):
        """Create an untrained quantizer.

        Args:
            n_clusters: Number of k-means centroids (codebook size).
            dim: Expected vector dimensionality. It is stored on the instance
                but not otherwise used by this class.
        """
        from sklearn.cluster import KMeans

        self.n_clusters = n_clusters
        self.dim = dim
        self.kmeans = KMeans(n_clusters=n_clusters, n_init=10)
        self.centroids = None

    def fit(self, vectors):
        """Train the codebook by running k-means on ``vectors``."""
        self.kmeans.fit(vectors)
        self.centroids = self.kmeans.cluster_centers_

    def _require_fitted(self):
        """Raise ``ValueError`` if no codebook is available yet."""
        if self.centroids is None:
            raise ValueError(_NOT_FITTED_MESSAGE)

    def quantize(self, vector):
        """Return the index of the centroid closest (Euclidean) to ``vector``.

        Raises:
            ValueError: If the quantizer has not been fitted.
        """
        self._require_fitted()
        distances = np.linalg.norm(self.centroids - vector, axis=1)
        return np.argmin(distances)

    def reconstruct(self, code):
        """Return the centroid stored for ``code``.

        Raises:
            ValueError: If the quantizer has not been fitted. (The original
                failed with a ``TypeError`` from indexing ``None``.)
        """
        self._require_fitted()
        return self.centroids[code]


# ---------------------------------------------------------------------------
# Product quantization (PQ 量化)
# ---------------------------------------------------------------------------
class ProductQuantizer:
    """Product quantizer: one independent ``VectorQuantizer`` per sub-vector."""

    def __init__(self, n_subvectors=8, n_clusters=256):
        """Create an untrained product quantizer.

        Args:
            n_subvectors: Number of contiguous slices each vector is split into.
            n_clusters: Codebook size of every sub-quantizer.
        """
        self.n_subvectors = n_subvectors
        self.n_clusters = n_clusters
        self.sub_quantizers = []

    def fit(self, vectors):
        """Train one sub-quantizer per slice of ``vectors``.

        ``vectors`` is indexed as ``vectors[:, start:end]``, i.e. one row per
        sample. Each slice has ``dim // n_subvectors`` columns; any trailing
        ``dim % n_subvectors`` columns are ignored.

        Raises:
            ValueError: If the vectors have fewer dimensions than
                ``n_subvectors`` (every slice would be empty).
        """
        dim = vectors.shape[1]
        subvector_dim = dim // self.n_subvectors
        if subvector_dim == 0:
            raise ValueError(
                f"vector dimension {dim} is smaller than "
                f"n_subvectors={self.n_subvectors}"
            )

        # Build a fresh list so that calling fit() again replaces the old
        # sub-quantizers instead of appending a second set after them.
        trained = []
        for i in range(self.n_subvectors):
            start = i * subvector_dim
            end = (i + 1) * subvector_dim
            quantizer = VectorQuantizer(
                n_clusters=self.n_clusters, dim=subvector_dim
            )
            quantizer.fit(vectors[:, start:end])
            trained.append(quantizer)
        self.sub_quantizers = trained

    def quantize(self, vector):
        """Encode a single 1-D ``vector`` as one centroid index per sub-vector.

        Returns:
            A 1-D unsigned integer array of length ``len(self.sub_quantizers)``.
            The dtype is ``uint8`` for codebooks of up to 256 centroids (as in
            the original) and widens automatically for larger codebooks so
            that codes never overflow.

        Raises:
            ValueError: If the quantizer has not been fitted.
        """
        if not self.sub_quantizers:
            raise ValueError(_NOT_FITTED_MESSAGE)

        dim = vector.shape[0]
        subvector_dim = dim // self.n_subvectors
        codes = []
        for i, quantizer in enumerate(self.sub_quantizers):
            start = i * subvector_dim
            end = (i + 1) * subvector_dim
            codes.append(quantizer.quantize(vector[start:end]))

        # Smallest unsigned dtype able to hold the largest possible code.
        code_dtype = np.min_scalar_type(max(int(self.n_clusters) - 1, 0))
        return np.array(codes, dtype=code_dtype)


# ---------------------------------------------------------------------------
# Embedding pruning and compression (Embedding 裁剪与压缩)
# Dimensionality pruning (维度裁剪)
# ---------------------------------------------------------------------------
class DimensionalityReducer:
    """Reduce embedding dimensionality with PCA or t-SNE."""

    def __init__(self, method="pca", target_dim=256):
        """Create an unfitted reducer.

        Args:
            method: ``"pca"`` or ``"tsne"``.
            target_dim: Number of output dimensions, passed to the underlying
                estimator as ``n_components``.
        """
        self.method = method
        self.target_dim = target_dim
        self.transformer = None

    def fit(self, vectors):
        """Fit the underlying estimator on ``vectors``.

        Raises:
            ValueError: If ``method`` is neither ``"pca"`` nor ``"tsne"``.
                (The original silently left the reducer unfitted.)
        """
        if self.method == "pca":
            from sklearn.decomposition import PCA

            self.transformer = PCA(n_components=self.target_dim)
            self.transformer.fit(vectors)
        elif self.method == "tsne":
            from sklearn.manifold import TSNE

            # The original created the estimator here but never fitted it.
            self.transformer = TSNE(n_components=self.target_dim)
            self.transformer.fit(vectors)
        else:
            raise ValueError(
                f"unsupported reduction method {self.method!r}; "
                "expected 'pca' or 'tsne'"
            )

    def transform(self, vectors):
        """Project ``vectors`` with the fitted estimator.

        Raises:
            ValueError: If the reducer has not been fitted.
            NotImplementedError: If the fitted estimator has no ``transform``
                method. scikit-learn's ``TSNE`` is such an estimator; use
                ``fit_transform`` for it instead.
        """
        if self.transformer is None:
            raise ValueError(
                "DimensionalityReducer is not fitted; call fit() first"
            )
        if not hasattr(self.transformer, "transform"):
            raise NotImplementedError(
                f"the {self.method!r} reducer cannot project new vectors; "
                "use fit_transform() on the full data set instead"
            )
        return self.transformer.transform(vectors)

    def fit_transform(self, vectors):
        """Fit on ``vectors`` and return their reduced representation."""
        if self.method == "tsne":
            from sklearn.manifold import TSNE

            # TSNE only exposes fit_transform(); going through fit() and then
            # transform() cannot work for it.
            self.transformer = TSNE(n_components=self.target_dim)
            return self.transformer.fit_transform(vectors)
        self.fit(vectors)
        return self.transform(vectors)


# ---------------------------------------------------------------------------
# Binarized embeddings (二值化 Embedding)
# ---------------------------------------------------------------------------
class BinaryEmbedding:
    """Binarize embeddings and compare them by Hamming distance."""

    def __init__(self):
        # Components strictly greater than this value become 1, others 0.
        self.threshold = 0.0

    def binarize(self, vectors):
        """Return an ``int8`` array with 1 where ``vectors > threshold``, else 0."""
        return (vectors > self.threshold).astype(np.int8)

    def hamming_distance(self, a, b):
        """Return the number of positions at which ``a`` and ``b`` differ."""
        return np.count_nonzero(a != b)

    def similarity(self, a, b):
        """Return ``1 / (1 + hamming_distance(a, b))``; 1.0 means identical."""
        distance = self.hamming_distance(a, b)
        return 1.0 / (1.0 + distance)


# ---------------------------------------------------------------------------
# Mixed precision and caching (混合精度与缓存)
# Mixed-precision computation (混合精度计算)
# ---------------------------------------------------------------------------
class MixedPrecisionEmbedding:
    """Run a model's ``encode`` under CUDA autocast when CUDA is available."""

    def __init__(self, model):
        """Wrap ``model``, which must provide an ``encode(texts)`` method."""
        self.model = model

    def encode(self, texts):
        """Encode ``texts`` and return float32 embeddings.

        Autocast is only entered when CUDA is available; on other machines
        the model is called directly.

        Returns:
            ``embeddings.float()`` when the model returns a ``torch.Tensor``
            (as in the original); otherwise the result converted to a float32
            NumPy array. The original raised ``AttributeError`` for models
            that return NumPy arrays.
        """
        import torch

        if torch.cuda.is_available():
            context = torch.autocast(device_type="cuda")
        else:
            context = nullcontext()
        with context:
            embeddings = self.model.encode(texts)

        if isinstance(embeddings, torch.Tensor):
            return embeddings.float()
        return np.asarray(embeddings, dtype=np.float32)

    def batch_encode(self, texts, batch_size=32):
        """Encode ``texts`` in chunks of ``batch_size``; return one NumPy array.

        Tensor results are detached and moved to the CPU before being joined,
        so embeddings produced on a GPU can be converted to NumPy.
        """
        batches = []
        for start in range(0, len(texts), batch_size):
            embeddings = self.encode(texts[start:start + batch_size])
            if not isinstance(embeddings, np.ndarray):
                embeddings = embeddings.detach().cpu().numpy()
            batches.append(embeddings)

        if not batches:
            return np.array([])
        return np.concatenate(batches, axis=0)


# ---------------------------------------------------------------------------
# Embedding cache (Embedding 缓存)
# ---------------------------------------------------------------------------
class EmbeddingCache:
    """Pickle-backed cache mapping the MD5 of a text to its embedding."""

    def __init__(self, cache_file="embedding_cache.pkl"):
        """Open the cache stored at ``cache_file``, or start an empty one."""
        self.cache_file = cache_file
        self.cache = self._load_cache()

    def _load_cache(self):
        """Load the cache dictionary from disk.

        Returns an empty dict when the file is missing. An empty or truncated
        file also yields an empty cache (with a warning) instead of crashing
        the constructor.
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only point cache_file at files written by this class.
        try:
            with open(self.cache_file, "rb") as f:
                return pickle.load(f)
        except FileNotFoundError:
            return {}
        except (EOFError, pickle.UnpicklingError) as exc:
            warnings.warn(
                f"ignoring unreadable embedding cache {self.cache_file!r}: {exc}"
            )
            return {}

    def _save_cache(self):
        """Write the cache to disk.

        The data is written to a sibling ``.tmp`` file and then moved over the
        real file, so an interrupted write cannot truncate an existing cache.
        """
        tmp_path = f"{self.cache_file}.tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(self.cache, f)
        os.replace(tmp_path, self.cache_file)

    def save(self):
        """Persist the in-memory cache; pairs with ``set(..., save=False)``."""
        self._save_cache()

    def _get_key(self, text):
        """Return the cache key for ``text``: the hex MD5 of its UTF-8 bytes."""
        # MD5 is used as a fingerprint only, not for security. It is kept so
        # that keys in existing cache files stay valid.
        return hashlib.md5(text.encode()).hexdigest()

    def get(self, text):
        """Return the cached embedding for ``text``, or ``None`` if absent."""
        key = self._get_key(text)
        return self.cache.get(key)

    def set(self, text, embedding, save=True):
        """Store ``embedding`` for ``text``.

        Args:
            text: Text the embedding belongs to.
            embedding: Value to cache.
            save: Write the whole cache to disk immediately (the original
                behaviour). Pass ``False`` during bulk inserts and call
                ``save()`` once at the end.
        """
        key = self._get_key(text)
        self.cache[key] = embedding
        if save:
            self._save_cache()


# ---------------------------------------------------------------------------
# Optimization in practice (实战优化)
# ---------------------------------------------------------------------------
class OptimizedEmbeddingSystem:
    """Embedding pipeline that combines caching and dimensionality reduction."""

    def __init__(self, base_model, cache=True, quantize=True):
        """Build the pipeline around ``base_model``.

        Args:
            base_model: Object with an ``encode(text)`` method. Its result must
                support ``reshape`` once the reducer is fitted.
            cache: Create an ``EmbeddingCache`` using its default file.
            quantize: Create a ``ProductQuantizer``. NOTE: the quantizer is
                only constructed here; ``encode`` does not use it.
        """
        self.model = base_model
        self.cache = EmbeddingCache() if cache else None
        self.quantizer = ProductQuantizer() if quantize else None
        self.reducer = DimensionalityReducer()

    def encode(self, text):
        """Return the embedding of ``text``: cache lookup, encode, reduce, store.

        Dimensionality reduction is applied only after ``self.reducer`` has
        been fitted (for example ``system.reducer.fit(vectors)``). The
        original called the unfitted reducer unconditionally and therefore
        always failed. Cache keys depend on the text only, so entries stored
        before the reducer was fitted are returned unchanged.
        """
        return self._encode(text, use_cache=True, save=True)

    def _encode(self, text, use_cache, save):
        """Encode one text; ``save`` controls whether the cache is flushed."""
        cache = self.cache if use_cache else None

        # Check the cache.
        if cache is not None:
            cached = cache.get(text)
            if cached is not None:
                return cached

        # Raw encoding.
        embedding = self.model.encode(text)

        # Dimensionality reduction, once the reducer has been fitted.
        reducer = self.reducer
        if reducer is not None and reducer.transformer is not None:
            embedding = reducer.transform(embedding.reshape(1, -1))[0]

        # Store in the cache.
        if cache is not None:
            cache.set(text, embedding, save=save)
        return embedding

    def batch_process(self, texts, use_cache=True):
        """Encode every text in ``texts`` and return the stacked result.

        Args:
            texts: Iterable of texts.
            use_cache: When ``False`` the cache is neither read nor written.
                (The original accepted this flag but ignored it.)

        The cache file is written once at the end of the batch rather than
        once per text. It is also written if encoding fails part-way, so
        completed work is not lost.
        """
        caching = use_cache and self.cache is not None
        results = []
        try:
            for text in texts:
                results.append(
                    self._encode(text, use_cache=use_cache, save=False)
                )
        finally:
            if caching and results:
                self.cache.save()
        return np.array(results)