"""Naive Bayes classification in Python.

Naive Bayes combines Bayes' theorem with the assumption that features are
conditionally independent given the class.  Despite that "naive" assumption
it performs well in areas such as text classification.

This script trains the three scikit-learn variants on small demo datasets
and prints their accuracies:

* ``GaussianNB``    -- continuous features (iris).
* ``MultinomialNB`` -- count features (bag-of-words sentiment toy corpus).
* ``BernoulliNB``   -- binary features (binarized synthetic data).

NOTE(review): this file was reconstructed from a copy in which operators
and string quotes had been lost.  Printed strings reproduce only the text
that survived; confirm against the original if exact output matters.
"""

# 1. Imports
import numpy as np
from sklearn.datasets import load_iris, make_classification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

# 2. GaussianNB -- continuous features, assumed normally distributed per class
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
acc_gnb = accuracy_score(y_test, gnb.predict(X_test))

print(" GaussianNB ")
print(f"准确率: {acc_gnb:.4f}")
print(f"先验概率: {gnb.class_prior_}")
# theta_ holds the per-class feature means, var_ the per-class variances.
print(f"均值矩阵形状: {gnb.theta_.shape}, 方差矩阵形状: {gnb.var_.shape}")

# 3. MultinomialNB -- count features (word frequencies) for text classification
texts = [
    "good movie great film",
    "bad terrible awful",
    "wonderful amazing excellent",
    "boring dull waste",
    "fantastic love it",
    "hate it worst ever",
    "pretty good watch",
    "not good avoid",
]
labels = [1, 0, 1, 0, 1, 0, 1, 0]  # 1 = positive, 0 = negative

vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(texts)
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text, labels, test_size=0.25, random_state=42
)

mnb = MultinomialNB(alpha=1.0)  # alpha: Laplace (additive) smoothing
mnb.fit(X_train_text, y_train_text)
acc_mnb = accuracy_score(y_test_text, mnb.predict(X_test_text))

print("\n MultinomialNB文本")
print(f"词汇表: {list(vectorizer.vocabulary_.keys())}")
print(f"准确率: {acc_mnb:.4f}")

# Predict an unseen text; it must go through the already-fitted vectorizer.
new_texts = ["really great amazing wonderful"]
proba_new = mnb.predict_proba(vectorizer.transform(new_texts))
print(f"新文本概率 (负面, 正面): {proba_new[0]}")

# 4. BernoulliNB -- binary (present / absent) features
X_bin, y_bin = make_classification(
    n_samples=500, n_features=20, n_informative=10, random_state=42
)
# Binarize the features.
# NOTE(review): the comparison operator was missing in the recovered text;
# thresholding at "> 0" is assumed -- confirm.
X_bin = (X_bin > 0).astype(int)
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_bin, y_bin, test_size=0.3, random_state=42
)

# binarize=None: the input is already binary, so skip internal thresholding.
bnb = BernoulliNB(alpha=1.0, binarize=None)
bnb.fit(Xb_train, yb_train)
acc_bnb = bnb.score(Xb_test, yb_test)  # computed once, reused in section 5

print("\n BernoulliNB二元特征")
print(f"准确率: {acc_bnb:.4f}")

# 5. Side-by-side comparison of the three variants
print("\n 三种朴素贝叶斯对比 ")
print(f"GaussianNB: {acc_gnb:.4f}")
print(f"MultinomialNB: {acc_mnb:.4f}")
print(f"BernoulliNB: {acc_bnb:.4f}")

# 6. Probability check: predicted class probabilities for the first five
#    iris test samples.
probas = gnb.predict_proba(X_test[:5])
for i, prob in enumerate(probas):
    print(f"样本 {i}: 预测{np.argmax(prob)}, 概率{prob}")

# 7. Summary
# - GaussianNB: continuous features, assumes a normal distribution.
# - MultinomialNB: count features such as word frequencies; suits text
#   classification.
# - BernoulliNB: binary features such as whether a word occurs.
# All three support partial_fit for incremental learning.