Go语言机器学习实战:聚类算法与无监督学习

Go语言机器学习实战:聚类算法与无监督学习 Go语言机器学习实战聚类算法与无监督学习无监督学习是机器学习的重要分支它从无标签数据中发现模式和结构。聚类算法是无监督学习的核心本文将深入探讨如何使用Go语言实现常见的聚类算法。一、聚类算法概述聚类是将数据点分组的过程使得同一组内的数据点相似度较高而不同组之间的数据点相似度较低。常见的聚类算法包括K-Means基于距离的划分方法简单高效层次聚类构建层次化的聚类树DBSCAN基于密度的聚类能发现任意形状的簇高斯混合模型概率模型考虑数据的分布二、K-Means聚类实现2.1 算法原理K-Means算法的核心思想是随机选择K个初始质心将每个数据点分配到最近的质心重新计算每个簇的质心重复步骤2-3直到收敛2.2 Go语言实现package main import ( fmt math math/rand time ) type Point struct { Features []float64 } type KMeans struct { K int Centroids []Point MaxIter int } func NewKMeans(k, maxIter int) *KMeans { return KMeans{ K: k, MaxIter: maxIter, } } func (km *KMeans) distance(p1, p2 Point) float64 { var sum float64 for i : 0; i len(p1.Features); i { sum math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) } func (km *KMeans) fit(points []Point) { rand.Seed(time.Now().UnixNano()) // 随机初始化质心 km.Centroids make([]Point, km.K) for i : 0; i km.K; i { idx : rand.Intn(len(points)) km.Centroids[i] points[idx] } for iter : 0; iter km.MaxIter; iter { // 分配数据点到簇 clusters : make([][]Point, km.K) for _, point : range points { minDist : math.MaxFloat64 clusterIdx : 0 for i, centroid : range km.Centroids { dist : km.distance(point, centroid) if dist minDist { minDist dist clusterIdx i } } clusters[clusterIdx] append(clusters[clusterIdx], point) } // 更新质心 prevCentroids : make([]Point, km.K) copy(prevCentroids, km.Centroids) for i, cluster : range clusters { if len(cluster) 0 { continue } newCentroid : Point{Features: make([]float64, len(cluster[0].Features))} for _, point : range cluster { for j : 0; j len(point.Features); j { newCentroid.Features[j] point.Features[j] } } for j : 0; j len(newCentroid.Features); j { newCentroid.Features[j] / float64(len(cluster)) } km.Centroids[i] newCentroid } // 检查收敛 converged : true for i : 0; i km.K; i { if km.distance(km.Centroids[i], prevCentroids[i]) 1e-6 { converged false break } } if converged { fmt.Printf(算法在第%d次迭代收敛\n, iter1) break } } } func (km *KMeans) predict(point Point) int { minDist : math.MaxFloat64 clusterIdx : 0 for i, centroid : range km.Centroids { dist : km.distance(point, centroid) if dist minDist { minDist dist clusterIdx i } } return clusterIdx }2.3 使用示例func main() { // 生成模拟数据 points : []Point{ {Features: []float64{1, 2}}, {Features: []float64{2, 1}}, {Features: []float64{2, 3}}, {Features: []float64{8, 7}}, {Features: []float64{9, 8}}, {Features: []float64{7, 9}}, {Features: []float64{15, 16}}, {Features: []float64{16, 15}}, {Features: []float64{17, 17}}, } kmeans : NewKMeans(3, 100) kmeans.fit(points) fmt.Println(质心坐标:) for i, centroid : range kmeans.Centroids { fmt.Printf(簇%d: %v\n, i, centroid.Features) } // 预测新数据点 testPoint : Point{Features: []float64{10, 10}} cluster : kmeans.predict(testPoint) fmt.Printf(数据点(10, 10)属于簇%d\n, cluster) }三、DBSCAN聚类实现3.1 算法原理DBSCANDensity-Based Spatial Clustering of Applications with Noise是一种基于密度的聚类算法核心点在指定半径ε内有至少MinPts个邻居边界点在ε内邻居数少于MinPts但属于某个核心点的邻域噪声点既不是核心点也不是边界点3.2 Go语言实现type DBSCAN struct { Epsilon float64 MinPts int labels []int } func NewDBSCAN(epsilon float64, minPts int) *DBSCAN { return DBSCAN{ Epsilon: epsilon, MinPts: minPts, } } func (d *DBSCAN) fit(points []Point) { n : len(points) d.labels make([]int, n) for i : range d.labels { d.labels[i] -1 // -1表示未访问 } clusterID : 0 for i : 0; i n; i { if d.labels[i] ! -1 { continue } neighbors : d.rangeQuery(points, i) if len(neighbors) d.MinPts { d.labels[i] 0 // 标记为噪声 continue } // 扩展簇 d.expandCluster(points, i, neighbors, clusterID) clusterID } } func (d *DBSCAN) rangeQuery(points []Point, idx int) []int { neighbors : []int{} for i : 0; i len(points); i { if i idx { continue } dist : d.distance(points[idx], points[i]) if dist d.Epsilon { neighbors append(neighbors, i) } } return neighbors } func (d *DBSCAN) expandCluster(points []Point, idx int, neighbors []int, clusterID int) { d.labels[idx] clusterID queue : neighbors for len(queue) 0 { current : queue[0] queue queue[1:] if d.labels[current] 0 { d.labels[current] clusterID } if d.labels[current] ! -1 { continue } d.labels[current] clusterID currentNeighbors : d.rangeQuery(points, current) if len(currentNeighbors) d.MinPts { queue append(queue, currentNeighbors...) } } } func (d *DBSCAN) distance(p1, p2 Point) float64 { var sum float64 for i : 0; i len(p1.Features); i { sum math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }四、层次聚类4.1 算法原理层次聚类有两种策略凝聚式从单个点开始逐步合并相似的簇分裂式从整个数据集开始逐步分裂4.2 Go语言实现type HierarchicalClustering struct { linkage string // single, complete, average } func NewHierarchicalClustering(linkage string) *HierarchicalClustering { return HierarchicalClustering{linkage: linkage} } func (hc *HierarchicalClustering) fit(points []Point) [][]Point { // 初始化每个点为一个簇 clusters : make([][]Point, len(points)) for i, point : range points { clusters[i] []Point{point} } for len(clusters) 1 { minDist : math.MaxFloat64 mergeI, mergeJ : 0, 1 // 找到距离最近的两个簇 for i : 0; i len(clusters); i { for j : i 1; j len(clusters); j { dist : hc.clusterDistance(clusters[i], clusters[j]) if dist minDist { minDist dist mergeI, mergeJ i, j } } } // 合并两个簇 merged : append(clusters[mergeI], clusters[mergeJ]...) clusters append(clusters[:mergeJ], clusters[mergeJ1:]...) clusters[mergeI] merged } return clusters } func (hc *HierarchicalClustering) clusterDistance(c1, c2 []Point) float64 { switch hc.linkage { case single: return hc.singleLinkage(c1, c2) case complete: return hc.completeLinkage(c1, c2) case average: return hc.averageLinkage(c1, c2) default: return hc.singleLinkage(c1, c2) } } func (hc *HierarchicalClustering) singleLinkage(c1, c2 []Point) float64 { minDist : math.MaxFloat64 for _, p1 : range c1 { for _, p2 : range c2 { dist : hc.pointDistance(p1, p2) if dist minDist { minDist dist } } } return minDist } func (hc *HierarchicalClustering) completeLinkage(c1, c2 []Point) float64 { maxDist : 0.0 for _, p1 : range c1 { for _, p2 : range c2 { dist : hc.pointDistance(p1, p2) if dist maxDist { maxDist dist } } } return maxDist } func (hc *HierarchicalClustering) averageLinkage(c1, c2 []Point) float64 { var sumDist float64 count : 0 for _, p1 : range c1 { for _, p2 : range c2 { sumDist hc.pointDistance(p1, p2) count } } return sumDist / float64(count) } func (hc *HierarchicalClustering) pointDistance(p1, p2 Point) float64 { var sum float64 for i : 0; i len(p1.Features); i { sum math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }五、高斯混合模型GMM5.1 算法原理GMM假设数据来自多个高斯分布的混合使用EM算法进行参数估计。5.2 Go语言实现type Gaussian struct { Mean float64 StdDev float64 } func (g *Gaussian) pdf(x float64) float64 { return math.Exp(-math.Pow(x-g.Mean, 2)/(2*math.Pow(g.StdDev, 2))) / (g.StdDev * math.Sqrt(2*math.Pi)) } type GMM struct { Gaussians []Gaussian Weights []float64 } func NewGMM(k int) *GMM { return GMM{ Gaussians: make([]Gaussian, k), Weights: make([]float64, k), } } func (gmm *GMM) fit(data []float64, maxIter int) { n : len(data) k : len(gmm.Gaussians) // 初始化 for i : 0; i k; i { gmm.Gaussians[i] Gaussian{ Mean: data[i*n/k], StdDev: 1.0, } gmm.Weights[i] 1.0 / float64(k) } for iter : 0; iter maxIter; iter { // E步计算后验概率 responsibilities : make([][]float64, k) for i : range responsibilities { responsibilities[i] make([]float64, n) } for j : 0; j n; j { var sum float64 for i : 0; i k; i { responsibilities[i][j] gmm.Weights[i] * gmm.Gaussians[i].pdf(data[j]) sum responsibilities[i][j] } for i : 0; i k; i { responsibilities[i][j] / sum } } // M步更新参数 for i : 0; i k; i { var weightSum, meanSum, varSum float64 for j : 0; j n; j { weightSum responsibilities[i][j] meanSum responsibilities[i][j] * data[j] } gmm.Weights[i] weightSum / float64(n) gmm.Gaussians[i].Mean meanSum / weightSum for j : 0; j n; j { varSum responsibilities[i][j] * math.Pow(data[j]-gmm.Gaussians[i].Mean, 2) } gmm.Gaussians[i].StdDev math.Sqrt(varSum / weightSum) } } }六、聚类评估指标func SilhouetteScore(points []Point, labels []int) float64 { n : len(points) silhouette : make([]float64, n) for i : 0; i n; i { // 计算a(i)同一簇内其他点的平均距离 var a float64 sameCluster : []Point{} for j : 0; j n; j { if i ! j labels[j] labels[i] { sameCluster append(sameCluster, points[j]) } } if len(sameCluster) 0 { for _, p : range sameCluster { a distance(points[i], p) } a / float64(len(sameCluster)) } // 计算b(i)最近簇的平均距离 b : math.MaxFloat64 clusters : make(map[int][]Point) for j : 0; j n; j { if j ! i { clusters[labels[j]] append(clusters[labels[j]], points[j]) } } for _, cluster : range clusters { if len(cluster) 0 { var distSum float64 for _, p : range cluster { distSum distance(points[i], p) } avgDist : distSum / float64(len(cluster)) if avgDist b { b avgDist } } } silhouette[i] (b - a) / math.Max(a, b) } var score float64 for _, s : range silhouette { score s } return score / float64(n) } func distance(p1, p2 Point) float64 { var sum float64 for i : 0; i len(p1.Features); i { sum math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }七、总结本文介绍了四种经典的聚类算法及其Go语言实现K-Means简单高效适合大规模数据DBSCAN基于密度能发现任意形状的簇层次聚类构建层次化结构无需指定K值高斯混合模型概率模型考虑数据分布每种算法都有其适用场景选择合适的聚类算法需要考虑数据特性和业务需求。Go语言的高性能特性使其成为处理大规模数据聚类的理想选择。