Data Analysis / ML / Multinomial Classification

< Multinomial Classification 예제 >

▷ BMI 지수에 대한 데이터로 학습하고 예측해보자

※ BMI 지수 : 키와 몸무게를 가지고 저체중, 정상, 과체중, 비만을 판단하는 지수 (단, 아래 예제 코드는 label을 3개의 class로 다루며, 그래서 one-hot encoding의 depth가 3이다.)

▷ Multinomial Classification의 경우 python 구현은 너무 복잡하므로 생략하겠다.

1. sklearn

	import numpy as np
	import pandas as pd
	from scipy import stats
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import cross_val_score
	from sklearn.metrics import precision_score, accuracy_score

	# ---------------- common preparation: start ----------------
	# 1. Load the raw data (the first three rows of the CSV are skipped).
	df = pd.read_csv('./data/bmi.csv', skiprows=3)

	# 2. Preprocessing
	# 2-1. Missing values: isnull() flags missing cells, sum() counts them per column.
	df.isnull().sum()
	# Recorded output (no missing values):
	# label 0
	# height 0
	# weight 0
	# dtype: int64

	# 2-2. Outliers, z-score method: stats.zscore() converts every value of a
	# column to its z-score; rows whose absolute z-score reaches the threshold
	# are displayed. The recorded run showed no such rows for any column.
	zscore = 1.8
	for column in ('height', 'weight', 'label'):
	    column_z = np.abs(stats.zscore(df[column]))
	    display(df.loc[column_z >= zscore, :])

	# 3. Data split: train (70%) / test (30%).
	# random_state plays the role of a seed, so the split is repeatable.
	features = df[['height', 'weight']]
	labels = df['label']
	x_data_train, x_data_test, t_data_train, t_data_test = train_test_split(
	    features, labels, test_size=0.3, random_state=0)
	display(x_data_train)

	# 4. Normalization: the scaler is fitted on the training features only and
	# the same scaling is then applied to the test features.
	scaler = MinMaxScaler()
	x_data_train_norm = scaler.fit_transform(x_data_train)
	x_data_test_norm = scaler.transform(x_data_test)

	# Drop the unscaled frames so they cannot be used by mistake later on.
	del x_data_train, x_data_test
	# ---------------- common preparation: end ----------------

	# 5. Build the model and fit it on the normalized training data.
	model = LogisticRegression()
	model.fit(x_data_train_norm, t_data_train)

	# 6. Cross validation on the training data; cv is the number of folds.
	kfold = 10
	kfold_score = cross_val_score(model, x_data_train_norm, t_data_train, cv=kfold)
	print(' ### cross validation ###')
	print(' ### cross validation score ###')
	print('score : {}'.format(kfold_score))
	print('평균 : {}'.format(kfold_score.mean()))
	# Recorded output:
	# ### cross validation ###
	# ### cross validation score ###
	# score : [0.98 0.98642857 0.985 0.97642857 0.98642857 0.98428571
	# 0.98714286 0.97714286 0.97714286 0.98642857]
	# 평균 : 0.9826428571428572

	# 7. Final evaluation on the held-out test data.
	predict_val = model.predict(x_data_test_norm)
	# accuracy_score is documented as (y_true, y_pred); accuracy itself is
	# symmetric, so the value is unchanged, but the call now matches the API.
	acc = accuracy_score(t_data_test, predict_val)
	print('우리 Model의 최종 Accuracy : {}'.format(acc)) # recorded: 0.9845

	# 8. Predict the class of one new sample.
	height = 188
	weight = 78
	my_state = [[height, weight]]
	# The scaler was fitted on a DataFrame with 'height'/'weight' columns, so the
	# sample is wrapped in a frame with the same columns before scaling; passing
	# the bare list makes recent scikit-learn versions warn about feature names.
	my_state_df = pd.DataFrame(my_state, columns=['height', 'weight'])
	my_state_val = model.predict(scaler.transform(my_state_df))

	print(my_state_val) # recorded: [1] (normal)

view raw multinomial_classification_sklearn.py hosted with ❤ by GitHub

2. tensorflow

	import numpy as np
	import pandas as pd
	import tensorflow as tf
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.model_selection import train_test_split
	from sklearn.model_selection import KFold

	# ---------------- common preparation: start ----------------
	# 1. Load the raw data (the first three rows of the CSV are skipped).
	df = pd.read_csv('./data/bmi.csv', skiprows=3)

	# 2. Preprocessing is skipped: no missing values or outliers to handle.

	# 3. Data split: 70% train / 30% test with a fixed random_state.
	x_data_train, x_data_test, t_data_train, t_data_test = train_test_split(
	    df[['height', 'weight']], df['label'], test_size=0.3, random_state=0)

	# 4. Min-max scaling, fitted on the training features only.
	scaler = MinMaxScaler()
	x_data_train_norm = scaler.fit_transform(x_data_train)
	x_data_test_norm = scaler.transform(x_data_test)

	# Remove the unscaled frames to avoid using them by accident.
	del x_data_train, x_data_test
	# ---------------- common preparation: end ----------------

	# 5. The target is multinomial, so the labels are one-hot encoded with
	# tf.one_hot(); depth is the number of distinct label values.
	sess = tf.Session()
	t_data_train_onehot, t_data_test_onehot = sess.run(
	    [tf.one_hot(t_data_train, depth=3), tf.one_hot(t_data_test, depth=3)])

	# Remove the raw label series to avoid using them by accident.
	del t_data_train, t_data_test

	# The training data set is now ready.

	# 6. Placeholders: there is more than one input column, so the shapes are
	# given explicitly (None leaves the number of rows open).
	X = tf.placeholder(shape=[None,2], dtype=tf.float32)
	T = tf.placeholder(shape=[None,3], dtype=tf.float32)

	# 7. Weight & bias
	# Two inputs feeding three outputs -> a 2x3 weight matrix (6 weights).
	W = tf.Variable(tf.random.normal([2,3]), name='weight')
	# One bias per output -> 3 biases.
	b = tf.Variable(tf.random.normal([3]), name='bias')

	# 8. Hypothesis (model): linear logits followed by softmax.
	logit = tf.matmul(X,W) + b
	H = tf.nn.softmax(logit)

	# 9. Loss function
	# Softmax cross entropy between the logits and the one-hot labels,
	# averaged with reduce_mean.
	loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit,
	labels=T))

	# 10. Train op
	train = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss)
	# Running this op once performs one training step.

	# 11. Training loop
	num_of_epoch = 1000
	batch_size = 100 # number of rows of x/t data fed per training step

	# Feeding all rows at once can run into memory problems, so the rows are
	# fed in mini-batches of batch_size.
	def run_train(sess, train_x, train_t):
	    """Train the model from scratch on (train_x, train_t) in mini-batches.

	    All global variables are re-initialized first, so every call restarts
	    training. Progress is printed as the loss of the last batch of every
	    100th epoch.

	    Args:
	        sess: session used to run the initializer and the train/loss ops.
	        train_x: feature rows; sliced by row and fed to placeholder X.
	        train_t: target rows; sliced with the same bounds and fed to T.

	    Raises:
	        ValueError: if train_x has no rows.
	    """
	    n_rows = train_x.shape[0]
	    if n_rows == 0:
	        # Without any batch loss_val would never be assigned below.
	        raise ValueError('train_x is empty; nothing to train on')

	    print('### 학습 시작 ###')
	    # Must run right before training so the variables start fresh.
	    sess.run(tf.global_variables_initializer())
	    # Ceiling division: a final, shorter batch is trained on too instead of
	    # being dropped. Identical to the floor when n_rows is a multiple of
	    # batch_size.
	    total_batch = (n_rows + batch_size - 1) // batch_size

	    for step in range(num_of_epoch):
	        for i in range(total_batch):
	            start = i * batch_size
	            stop = (i + 1) * batch_size
	            batch_x = train_x[start:stop]
	            batch_t = train_t[start:stop]

	            _, loss_val = sess.run([train, loss], feed_dict={X: batch_x, T: batch_t})

	        if step % 100 == 0:
	            print('Loss : {}'.format(loss_val))

	    print('### 학습 종료 ###')

	# 12. Accuracy
	# The second argument of argmax is the axis along which the maximum is searched.
	predict = tf.argmax(H,1) # e.g. [[0.5 0.4 0.1]] => predicted class index for the fed input
	# A mismatch between the two indices means the prediction was wrong.
	correct = tf.equal(predict, tf.argmax(T,1))
	# Accuracy is the mean of the correct flags after casting them to float.
	accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

	# 13. Cross validation (K-fold)
	cv = 5 # number of (training, validation) splits
	results = [] # validation accuracy of each fold
	kf = KFold(n_splits=cv, shuffle=True)

	for training_idx, validation_idx in kf.split(x_data_train_norm):
	    # kf.split yields index arrays; the rows are picked by fancy indexing.
	    train_x = x_data_train_norm[training_idx]
	    train_t = t_data_train_onehot[training_idx]

	    valid_x = x_data_train_norm[validation_idx]
	    valid_t = t_data_train_onehot[validation_idx]

	    # Train on this fold's training part, then score its validation part.
	    run_train(sess, train_x, train_t)
	    results.append(sess.run(accuracy, feed_dict={X: valid_x, T: valid_t}))

	# Recorded output (one fold only):
	# ### 학습이 시작됨 ###
	# Loss : 0.9591677188873291
	# Loss : 0.21171066164970398
	# Loss : 0.16438382863998413
	# Loss : 0.14280198514461517
	# Loss : 0.12987026572227478
	# Loss : 0.12104474753141403
	# Loss : 0.11453663557767868
	# Loss : 0.10948289930820465
	# Loss : 0.10541136562824249
	# Loss : 0.10203935950994492
	# ### 학습 종료 ###

	print('Cross Validation 결과 : {}'.format(results))
	# Cross Validation 결과 : [0.98321426, 0.98285717, 0.9825, 0.9817857, 0.9810714]
	print('Cross Validation 최종 결과 : {}'.format(np.mean(results)))
	# Cross Validation 최종 결과 : 0.9822856783866882

	# 14. Final training on the whole training set.
	run_train(sess, x_data_train_norm, t_data_train_onehot)
	# Recorded output:
	# ### 학습이 시작됨 ###
	# Loss : 0.8822436332702637
	# Loss : 0.20563861727714539
	# Loss : 0.16245439648628235
	# Loss : 0.14280696213245392
	# Loss : 0.13102111220359802
	# Loss : 0.12295647710561752
	# Loss : 0.11698989570140839
	# Loss : 0.11234113574028015
	# Loss : 0.10858290642499924
	# Loss : 0.1054590493440628
	# ### 학습 종료 ###

	# 15. Accuracy on the held-out test set.
	result = sess.run(accuracy, feed_dict={X: x_data_test_norm, T: t_data_test_onehot})

	print('최종 정확도 : {}'.format(result))
	# 최종 정확도 : 0.9829999804496765

	# Prediction for a single new sample.
	height = 187
	weight = 78

	my_state = [[height, weight]]
	# The scaler was fitted on a DataFrame with 'height'/'weight' columns, so the
	# sample is wrapped in a frame with the same columns before scaling; passing
	# the bare list makes recent scikit-learn versions warn about feature names.
	my_state_scaled = scaler.transform(
	    pd.DataFrame(my_state, columns=['height', 'weight']))
	print(my_state_scaled) # [[0.8375 0.95555556]]

	# H holds one probability per class; argmax gives the predicted class index.
	result = sess.run(H, feed_dict={X: my_state_scaled})
	print(np.argmax(result)) # 1

view raw multinomial_classification_tensorflow.py hosted with ❤ by GitHub

< MNIST (Digit Recognizer) 예제 >

	import numpy as np
	import pandas as pd
	import tensorflow as tf
	import matplotlib.pyplot as plt
	# confusion matrix를 시각적으로 출력하는 library
	import seaborn as sns # heatmap을 통해서 confusion matrix 출력
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.model_selection import train_test_split, KFold
	from sklearn.metrics import classification_report, confusion_matrix

	# 1. Load the raw data.
	df = pd.read_csv('./data/train.csv')

	# 2. Missing values / outliers: skipped, this data set is treated as clean.

	# 3. The rows are images, so look at a few of them before going on.
	# Everything except the 'label' column is pixel data; .values gives it as
	# a 2-D ndarray with one row per image.
	img_data = df.drop('label', axis=1, inplace=False).values

	fig = plt.figure() # the whole drawing area
	fig_arr = list() # the subplots added to fig

	# Lay out a 2 x 5 grid and draw the first 10 images, one per cell.
	for n in range(10):
	    # add_subplot(rows, columns, 1-based position in the grid)
	    fig_arr.append(fig.add_subplot(2, 5, n + 1))
	    # A single index on the 2-D array selects one whole row, which is
	    # reshaped to 28 x 28 for display.
	    fig_arr[n].imshow(img_data[n].reshape(28, 28), cmap='Greys',
	                      interpolation='nearest')

	plt.tight_layout()
	plt.show()

	# 4. Data split: 70% train / 30% test with a fixed random_state.
	pixels = df.drop('label', axis=1)
	digits = df['label']
	x_data_train, x_data_test, t_data_train, t_data_test = train_test_split(
	    pixels, digits, test_size=0.3, random_state=0)

	# 5. Normalization of the independent variables: the scaler is fitted on
	# the training pixels only and then applied to both parts.
	scaler = MinMaxScaler()
	x_data_train_norm = scaler.fit_transform(x_data_train)
	x_data_test_norm = scaler.transform(x_data_test)

	# Remove the unscaled frames to avoid using them by accident.
	del x_data_train, x_data_test

	# One-hot encode the labels; depth is the number of label classes.
	sess = tf.Session()
	t_data_train_onehot, t_data_test_onehot = sess.run(
	    [tf.one_hot(t_data_train, depth=10), tf.one_hot(t_data_test, depth=10)])

	##########################################################
	# The training and test data are ready at this point.
	##########################################################

	# TensorFlow implementation
	# 1. Placeholders
	# X receives the independent variables, T the one-hot targets.
	X = tf.placeholder(shape=[None, 784], dtype=tf.float32)
	T = tf.placeholder(shape=[None, 10], dtype=tf.float32)

	# 2. Weight & bias
	W = tf.Variable(tf.random.normal([784, 10]), name='weight')
	b = tf.Variable(tf.random.normal([10]), name='bias')

	# 3. Model (hypothesis) => multinomial
	logit = tf.matmul(X,W) + b # linear part
	H = tf.nn.softmax(logit) # multinomial hypothesis

	# 4. Loss function: mean softmax cross entropy between logits and labels.
	loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit, labels=T))

	# 5. Train op built from an optimizer (the algorithm that reduces the loss).
	train = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss)

	# 6. Training parameters (these two are the minimum to set).
	num_of_epoch = 100
	batch_size = 100

	# 7. Training
	def run_train(sess, train_x, train_t):
	    """Train the model from scratch on (train_x, train_t) in mini-batches.

	    All global variables (W, b) are re-initialized first, so every call
	    restarts training. Progress is printed as the loss of the last batch
	    of every 10th epoch.

	    Args:
	        sess: session used to run the initializer and the train/loss ops.
	        train_x: feature rows; sliced by row and fed to placeholder X.
	        train_t: target rows; sliced with the same bounds and fed to T.

	    Raises:
	        ValueError: if train_x has no rows.
	    """
	    n_rows = train_x.shape[0]
	    if n_rows == 0:
	        # Without any batch loss_val would never be assigned below.
	        raise ValueError('train_x is empty; nothing to train on')

	    print('### 학습 시작 ###')
	    sess.run(tf.global_variables_initializer()) # initialize the tf.Variables (W, b)
	    # Ceiling division: a final, shorter batch is trained on too instead of
	    # being dropped. Identical to the floor when n_rows is a multiple of
	    # batch_size.
	    total_batch = (n_rows + batch_size - 1) // batch_size
	    for step in range(num_of_epoch):
	        for i in range(total_batch):
	            start = i * batch_size
	            stop = (i + 1) * batch_size
	            batch_x = train_x[start:stop]
	            batch_t = train_t[start:stop]

	            _, loss_val = sess.run([train, loss], feed_dict={X: batch_x, T: batch_t})

	        if step % 10 == 0:
	            print('Loss : {}'.format(loss_val))
	    print('### 학습 끝 ###')

	# Recorded output of a training run:
	# ### 학습 시작 ###
	# Loss : 1.5263224840164185
	# Loss : 0.3862583041191101
	# Loss : 0.3168090283870697
	# Loss : 0.2877289652824402
	# Loss : 0.2699568271636963
	# Loss : 0.25711315870285034
	# Loss : 0.24663415551185608
	# Loss : 0.237602099776268
	# Loss : 0.22966302931308746
	# Loss : 0.22262884676456451
	# ### 학습 끝 ###

	# Accuracy
	# Index of the largest softmax output per row, e.g. [[0.1 0.3 0.2 0.2 ... 0.1]]
	predict = tf.argmax(H,1)

	# Print sklearn's classification_report with readable class names
	# instead of the bare labels 0, 1, 2, ...
	target_name = ['num 0', 'num 1', 'num 2', 'num 3', 'num 4', 'num 5',
	               'num 6', 'num 7', 'num 8', 'num 9']

	# Evaluating on the training data says little about generalization, but it
	# is done here as a demonstration.
	run_train(sess, x_data_train_norm, t_data_train_onehot)

	# Run the forward pass over the training set once and reuse the predictions
	# for both the report and the confusion matrix.
	train_pred = sess.run(predict, feed_dict={X: x_data_train_norm})
	print(classification_report(t_data_train, train_pred, target_names=target_name))

	# Recorded output:
	# precision recall f1-score support

	# num 0 0.96 0.97 0.96 2890
	# num 1 0.96 0.97 0.96 3255
	# num 2 0.92 0.90 0.91 2901
	# num 3 0.90 0.89 0.90 3053
	# num 4 0.93 0.93 0.93 2836
	# num 5 0.87 0.89 0.88 2676 # digit 5 is recognized relatively poorly
	# num 6 0.96 0.95 0.95 2894
	# num 7 0.94 0.92 0.93 3067
	# num 8 0.89 0.89 0.89 2859
	# num 9 0.90 0.90 0.90 2969

	# accuracy 0.92 29400
	# macro avg 0.92 0.92 0.92 29400
	# weighted avg 0.92 0.92 0.92 29400

	# Draw the confusion matrix as a seaborn heatmap.
	fig, ax = plt.subplots(figsize=(10,10)) # figure size in inches (10 x 10)

	sns.heatmap(
	    confusion_matrix(t_data_train, train_pred),
	    annot = True, # write the count into each cell
	    cbar = True, # show the color bar
	    fmt = '3d', # integer format
	    cmap = 'Blues', # color map
	    ax = ax # subplot to draw on
	)
	ax.set_xlabel('Predict')
	ax.set_ylabel('Actual')

	plt.show()

view raw MINST.py hosted with ❤ by GitHub

'Python > Data Analysis' 카테고리의 다른 글

Data Analysis / ML / Tensorflow 2.x(2) (0)	2020.10.15
Data Analysis / ML / Tensorflow 2.x(1) (0)	2020.10.15
Data Analysis / ML / Basic Concept(3) (0)	2020.10.09
Data Analysis / ML / Logistic Regression(2) (0)	2020.10.08
Data Analysis / ML / Logistic Regression(1) (0)	2020.10.07

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

My Life

Data Analysis / ML / Multinomial Classification

< Multinomial Classification 예제 >

1. sklearn

2. tensorflow

< MNIST (Digit Recognizer) 예제 >

'Python > Data Analysis' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

Data Analysis / ML / Multinomial Classification

< Multinomial Classification 예제 >

1. sklearn

2. tensorflow

< MNIST (Digit Recognizer) 예제 >

'Python > Data Analysis' 카테고리의 다른 글

'Python/Data Analysis' Related Articles

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역