Reference
《Python数据分析与挖掘实战(第2版)》
第9章,页码:P234,代码略有改进
images数据下载
分析过程
import numpy as np
import os, re
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Widen pandas' console output: up to 1000 columns and 300 rows, floats with
# five decimals. The fully-qualified 'display.*' keys are used on purpose:
# pandas resolves abbreviated keys by pattern matching, and a short key such
# as 'max_columns' can match more than one option (e.g. the
# 'styler.render.*' options), which makes set_option raise OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# Plot settings applied in one call:
#   - 'SimHei' as the sans-serif font so Chinese labels render correctly
#   - disable the Unicode minus so negative signs render correctly
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Image cropping and feature extraction
path = './images/'


# Helper that collects the image file names
def getImageNames(path=path):
    """Return the names of the sample images found directly inside ``path``.

    A file qualifies when its whole name is one digit, an underscore, one or
    more digits and the ``.jpg`` extension (for example ``1_12.jpg``).

    Args:
        path: Directory to scan. Defaults to the value the module-level
            ``path`` variable had when this function was defined.

    Returns:
        A list of matching file names (without the directory part), sorted
        so that the result does not depend on the arbitrary order in which
        ``os.listdir`` reports directory entries.
    """
    # Raw string: in a plain literal '\d' and '\.' are invalid escape
    # sequences and trigger a warning on current Python versions.
    pattern = re.compile(r'\d_\d+\.jpg')
    return sorted(name for name in os.listdir(path) if pattern.fullmatch(name))
# Helper that computes the third-order colour moment
def Var(data=None):
    """Return the signed cube root of the third central moment of ``data``.

    The mean of ``(data - data.mean()) ** 3`` is computed first; the result
    is the cube root of its absolute value, carrying the sign of that mean.

    Args:
        data: Pixel matrix. It must provide ``.mean()`` and support
            element-wise arithmetic (e.g. a NumPy array); the ``None``
            default cannot actually be used, since ``data.mean()`` is called
            unconditionally.

    Returns:
        The third-order moment as a NumPy floating-point scalar.
    """
    centered = data - data.mean()
    third_moment = np.mean(centered ** 3)
    # Take the root of the magnitude and re-apply the sign afterwards:
    # a negative float raised to 1/3 would not give a real number.
    magnitude = abs(third_moment) ** (1 / 3)
    return np.sign(third_moment) * magnitude
# Batch-process the images into a feature matrix and a label vector
imageNames = getImageNames(path=path)  # names of all sample images
n = len(imageNames)  # number of images
# One row per image; columns 0-2 hold the first-order moments (mean) of the
# R, G, B channels, columns 3-5 the second-order moments (standard
# deviation), columns 6-8 the third-order moments computed by Var().
data = np.zeros([n, 9])
labels = np.zeros([n])  # class label of each image

for i, name in enumerate(imageNames):
    # The context manager closes the image file once its pixels have been
    # read, instead of leaving one open handle per image.
    # os.path.join also works when `path` has no trailing separator.
    with Image.open(os.path.join(path, name)) as img:
        M, N = img.size  # image size in pixels
        # Keep only the 100x100 window around the image centre.
        patch = img.crop((M/2-50, N/2-50, M/2+50, N/2+50))
        # Force three bands so the per-channel loop below always sees
        # R, G, B, even for grayscale / CMYK / RGBA inputs.
        patch = patch.convert('RGB')
        # Scale each channel to [0, 1].
        channels = [np.asarray(band) / 255 for band in patch.split()]

    for c, channel in enumerate(channels):
        data[i, c] = channel.mean()      # first-order colour moment
        data[i, 3 + c] = channel.std()   # second-order colour moment
        data[i, 6 + c] = Var(channel)    # third-order colour moment

    # The first character of the file name is the class label.
    labels[i] = int(name[0])

print(data)
print(labels)
# Model building
# Build the water-quality evaluation model with a decision tree.

# Split the samples: 60% for training, 40% for testing (fixed seed).
data_tr, data_te, label_tr, label_te = train_test_split(
    data,
    labels,
    test_size=0.4,
    random_state=10,
)

# Train the model.
model = DecisionTreeClassifier(random_state=5)
model.fit(data_tr, label_tr)

# Evaluate water quality on the held-out samples.
pre_te = model.predict(data_te)

# Confusion matrix of true vs. predicted labels.
cm_te = confusion_matrix(y_true=label_te, y_pred=pre_te)
print(cm_te)

# Accuracy on the test set.
print(accuracy_score(y_true=label_te, y_pred=pre_te))
网友评论