Data Analysis / ML / Logistic Regression(2)

< Single Variable Logistic Regression >

1. python

	import numpy as np

	# 다변수 수치미분코드
	def numerical_derivative(f, x):
		"""Estimate the gradient of ``f`` at ``x`` with central differences.

		Args:
			f: Callable that takes the whole parameter array and returns a scalar.
			x: Array holding every variable of ``f``; any shape is accepted.

		Returns:
			Array with the shape of ``x`` whose entries are the partial
			derivatives ``(f(x + h) - f(x - h)) / (2 * h)`` with ``h = 1e-4``.
		"""
		delta_x = 1e-4

		# An integer array would truncate ``tmp + delta_x`` back to an integer and
		# round every stored derivative, so non-float input is promoted to a
		# float copy first. Float arrays are still perturbed in place and restored.
		x = np.asarray(x)
		if not np.issubdtype(x.dtype, np.inexact):
			x = x.astype(np.float64)

		derivative_x = np.zeros_like(x)  # one partial derivative per entry of x

		# Visit every index of x and differentiate with respect to that entry only.
		for idx in np.ndindex(x.shape):
			tmp = x[idx]
			try:
				x[idx] = tmp + delta_x
				fx_plus_delta = f(x)  # f(x + delta_x)

				x[idx] = tmp - delta_x
				fx_minus_delta = f(x)  # f(x - delta_x)
			finally:
				# Put the original value back even when f raises.
				x[idx] = tmp

			derivative_x[idx] = (fx_plus_delta - fx_minus_delta) / (2 * delta_x)

		return derivative_x

	# Raw Data Loading + Data Preprocessing
	# 이번 예제는 이 과정이 필요없음

	# Training Data Set
	# x_data: the even numbers 2..20 as a (10, 1) column; t_data: ten 0/1 labels.
	x_data = np.arange(2,21,2).reshape(-1,1)
	t_data = np.array([0,0,0,0,0,0,1,1,1,1])

	# Weight & bias
	W = np.random.rand(1,1)
	b = np.random.rand(1)

	# loss function
	# The numerical differentiation routine hands over a single array, so W and b
	# travel together in one flat ndarray.
	def loss_func(input_obj):
		"""Return the summed binary cross entropy of the model on the training set.

		Args:
			input_obj: Flat ndarray laid out as ``[W1 W2 ... b]``.

		Returns:
			Scalar cross-entropy loss over ``x_data`` / ``t_data``.
		"""
		# b.shape is (1,), so the number of bias terms is read from b.shape[0].
		num_of_bias = b.shape[0]

		# Split the flat vector back into the weight column(s) and the bias.
		input_W = input_obj[:-num_of_bias].reshape(-1, num_of_bias)
		input_b = input_obj[-num_of_bias:]

		# Model prediction: sigmoid(x . W + b), one row per training sample.
		z = np.dot(x_data, input_W) + input_b
		y = 1 / (1 + np.exp(-1 * z))

		# The labels must have the same column shape as y. Left as a flat (N,)
		# vector they broadcast against the (N, 1) predictions into an N x N
		# grid, which pairs every prediction with every label.
		t = t_data.reshape(y.shape)

		# Small offset that keeps log() away from log(0) = -inf.
		delta = 1e-7

		# cross entropy
		return -np.sum(t * np.log(y + delta) + (1 - t) * np.log(1 - y + delta))

	# learning rate
	learning_rate = 1e-4

	# Training: 30000 plain gradient-descent steps on the numerically
	# differentiated loss.
	num_of_bias = b.shape[0]  # the shape of b never changes, so read it once
	for step in range(30000):
		# Pack the current parameters into one flat vector: [W b]
		input_param = np.concatenate((W.ravel(), b.ravel()), axis=0)
		derivative_result = learning_rate * numerical_derivative(loss_func, input_param)

		# Unpack the scaled gradient and step W and b against it.
		W = W - derivative_result[:-num_of_bias].reshape(-1, num_of_bias)  # [[W1] [W2] [W3]]
		b = b - derivative_result[-num_of_bias:]

	# predict => with W and b learned, the logistic regression model is complete
	def logistic_predict(x):
		"""Classify one sample with the module-level ``W`` and ``b``.

		Args:
			x: Study hours for a single sample; the caller in this script passes
				a 1x1 array such as ``np.array([[13]])``.

		Returns:
			Tuple ``(result, y)``: ``y`` is the sigmoid output and ``result`` is
			0 when ``y < 0.5``, otherwise 1.
		"""
		z = np.dot(x, W) + b
		y = 1 / (1 + np.exp(-1 * z))

		# The comparison needs a single-element y, i.e. one sample per call.
		result = 0 if y < 0.5 else 1

		return result, y

	# The model multiplies its input with np.dot (a matrix product, not an
	# elementwise operation), so the query is supplied as a 1x1 matrix.
	study_hour = np.array([13]).reshape(1, 1)
	result = logistic_predict(study_hour)
	report = '공부시간 : {}, 결과 : {}'.format(study_hour, result)
	print('####### python 결과값 #######')
	print(report)
	# Output recorded by the original author:
	# ####### python 결과값 #######
	# 공부시간 : [[13]], 결과 : (0, array([[0.39999997]]))

view raw logistic_regression_python.py hosted with ❤ by GitHub

2. sklearn

	### sklearn으로 구현
	from sklearn import linear_model
	import numpy as np

	# Training Data Set: the even numbers 2..20 as a column, with ten 0/1 labels.
	x_data = np.arange(2, 21, 2).reshape(-1, 1)
	t_data = np.array([0] * 6 + [1] * 4)

	# Create the logistic regression model and train it on the training set.
	# fit() returns the estimator itself, so creation and training are chained.
	model = linear_model.LogisticRegression().fit(x_data, t_data.ravel())

	# Query one sample: the predicted class and the per-class probabilities.
	study_hour = np.array([[13]])
	predict_val = model.predict(study_hour)
	predict_proba = model.predict_proba(study_hour)
	print('####### sklearn 결과값 #######')
	print('공부시간 : {}, 결과 : {}, {}'.format(study_hour, predict_val, predict_proba))
	# Output recorded by the original author:
	# ####### sklearn 결과값 #######
	# 공부시간 : [[13]], 결과 : [0], [[0.50009391 0.49990609]]

view raw logistic_regression_sklearn.py hosted with ❤ by GitHub

3. tensorflow

	# tensorflow를 이용한 구현
	import tensorflow as tf
	import numpy as np

	# Training Data Set
	x_data = np.arange(2,21,2).reshape(-1,1)
	# The labels are reshaped to a (10, 1) column so they have the same shape as
	# the logits. As a flat (10,) vector they would broadcast against the (10, 1)
	# logits into a 10x10 grid inside the loss, pairing every sample with every label.
	t_data = np.array([0,0,0,0,0,0,1,1,1,1]).reshape(-1,1)

	# placeholder
	# Declared as single-column matrices with an unknown number of rows, so a
	# feed whose shape does not fit is rejected instead of silently broadcast.
	X = tf.placeholder(shape=[None,1], dtype=tf.float32)
	T = tf.placeholder(shape=[None,1], dtype=tf.float32)

	# Weight & bias
	W = tf.Variable(tf.random.normal([1,1]), name='weight')
	b = tf.Variable(tf.random.normal([1]), name='bias')

	# Hypothesis
	logit = W*X + b
	H = tf.sigmoid(logit)

	# loss function
	loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

	# train
	train = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(loss)

	# Create the session and initialize the variables
	sess = tf.Session()
	sess.run(tf.global_variables_initializer())

	# Training
	for step in range(30000):
		sess.run(train, feed_dict={X:x_data, T:t_data})

	study_hour = np.array([13])
	# The query is fed as a column to match the placeholder; the printed value
	# keeps its original flat form.
	result = sess.run(H, feed_dict={X:study_hour.reshape(-1,1)})
	print('####### tensorflow 결과값 #######')
	print('공부시간 : {}, 결과 : {}'.format(study_hour, result))
	# Output recorded by the original run, in which the flat labels were
	# broadcast against the logits; a rerun with column labels is expected to
	# print a different probability.
	# ####### tensorflow 결과값 #######
	# 공부시간 : [13], 결과 : [[0.39890203]]

view raw logistic_regression_tensorflow.py hosted with ❤ by GitHub

< Multi Variable Logistic Regression >

▷ 학습하는 데이터는 GRE(Graduate Record Examination)와 GPA(Grade Point Average) 성적 그리고 Rank(University Rating)에 대한 대학원 합격/불합격 정보

▷ 다운로드 링크 : https://drive.google.com/drive/folders/1Gjt-HIUEL4ymIbkKexaYivcuKlp51r0l?usp=sharing

0. 데이터의 boxplot 확인

1. sklearn

	import numpy as np
	import pandas as pd
	from sklearn import linear_model
	from scipy import stats
	from sklearn.preprocessing import MinMaxScaler
	import matplotlib.pyplot as plt

	########## Common code ##########
	# Raw Data Loading
	df = pd.read_csv('./data/admission.csv')

	# Missing values: none, according to the original author's check
	# (no check is performed in this script).

	# Inspect each column for outliers with a box plot; they are removed below.
	fig = plt.figure()
	fig_admit = fig.add_subplot(1,4,1)
	fig_gre = fig.add_subplot(1,4,2)
	fig_gpa = fig.add_subplot(1,4,3)
	fig_rank = fig.add_subplot(1,4,4)

	fig_admit.boxplot(df['admit'])
	fig_gre.boxplot(df['gre'])
	fig_gpa.boxplot(df['gpa'])
	fig_rank.boxplot(df['rank'])

	fig.tight_layout()
	plt.show()

	# The plots show outliers, so this example simply drops them.
	# Normally they would be replaced by another value instead, because
	# dropping rows can affect the outlier calculation of the other columns.
	zscore_threshold = 2.0
	for col in df.columns:
		# The absolute value must be taken of the z-score itself, before the
		# comparison; np.abs(z > threshold) only ever flags the positive side.
		outlier = df[col][np.abs(stats.zscore(df[col])) > zscore_threshold]
		# ~ inverts the boolean mask (True -> False, False -> True)
		df = df.loc[~df[col].isin(outlier)]

	# Training Data Set
	# axis=1 because a column (not a row) is dropped
	x_data = df.drop('admit', axis=1, inplace=False).values
	t_data = df['admit'].values.reshape(-1,1)
	########## End of common code ##########

	# Normalization
	# No scaling is applied in this script: the raw feature values go straight
	# into the estimator.
	# NOTE(review): the original note said sklearn normalizes by itself;
	# LogisticRegression is not documented to scale its inputs - confirm whether
	# explicit scaling is wanted here.

	# Implementation with sklearn
	# fit() takes a 2-D x_data and a 1-D t_data, hence the ravel(); it returns
	# the estimator, so creation and training are chained.
	model = linear_model.LogisticRegression().fit(x_data, t_data.ravel())

	print('####### sklearn으로 구현한 결과 #######')
	# The query is passed unscaled, matching the unscaled training data.
	my_score = np.array([[600, 3.8, 1]])
	predict_val = model.predict(my_score)  # 0 or 1
	predict_proba = model.predict_proba(my_score)  # P(rejected), P(admitted)
	print(my_score, predict_val, predict_proba)
	# Output recorded by the original author:
	# #### sklearn으로 구현한 결과 ####
	# [[600. 3.8 1. ]] [1] [[0.4308339 0.5691661]]

view raw multi_variable_logistic_sklearn.py hosted with ❤ by GitHub

2. tensorflow

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import tensorflow as tf
	from scipy import stats
	from sklearn import linear_model
	from sklearn.preprocessing import MinMaxScaler

	########## Common code ##########
	# Raw Data Loading
	df = pd.read_csv('./data/admission.csv')

	# Missing values: none, according to the original author's check
	# (no check is performed in this script).

	# Inspect each column for outliers with a box plot; they are removed below.
	fig = plt.figure()
	fig_admit = fig.add_subplot(1,4,1)
	fig_gre = fig.add_subplot(1,4,2)
	fig_gpa = fig.add_subplot(1,4,3)
	fig_rank = fig.add_subplot(1,4,4)

	fig_admit.boxplot(df['admit'])
	fig_gre.boxplot(df['gre'])
	fig_gpa.boxplot(df['gpa'])
	fig_rank.boxplot(df['rank'])

	fig.tight_layout()
	plt.show()

	# The plots show outliers, so this example simply drops them.
	# Normally they would be replaced by another value instead, because
	# dropping rows can affect the outlier calculation of the other columns.
	zscore_threshold = 2.0
	for col in df.columns:
		# The absolute value must be taken of the z-score itself, before the
		# comparison; np.abs(z > threshold) only ever flags the positive side.
		outlier = df[col][np.abs(stats.zscore(df[col])) > zscore_threshold]
		# ~ inverts the boolean mask (True -> False, False -> True)
		df = df.loc[~df[col].isin(outlier)]

	# Training Data Set
	# axis=1 because a column (not a row) is dropped
	x_data = df.drop('admit', axis=1, inplace=False).values
	t_data = df['admit'].values.reshape(-1,1)
	########## End of common code ##########
	# Normalization
	# Min-max scale the features; the fitted scaler is reused for the query below.
	scaler_x = MinMaxScaler()
	scaler_x.fit(x_data)
	norm_x_data = scaler_x.transform(x_data) # for python, tensorflow

	# placeholder
	# The number of rows is not known in advance, so it is declared as None.
	X = tf.placeholder(shape=[None,3], dtype=tf.float32)
	T = tf.placeholder(shape=[None,1], dtype=tf.float32)

	# Weight & bias
	W = tf.Variable(tf.random.normal([3,1]), name='weight')
	b = tf.Variable(tf.random.normal([1]), name='bias')

	# hypothesis
	logit = tf.matmul(X,W) + b
	H = tf.sigmoid(logit)

	# loss function
	loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logit, labels=T))

	# train
	train = tf.train.GradientDescentOptimizer(learning_rate=1e-4).minimize(loss)

	# Create the session and initialize the variables
	sess = tf.Session()
	sess.run(tf.global_variables_initializer())

	# Training - the normalized data is what must be fed
	for step in range(300000):
		_, W_val, b_val, loss_val = sess.run([train, W, b, loss],
											 feed_dict={X:norm_x_data, T:t_data})

		# Report progress ten times over the whole run.
		if step % 30000 == 0:
			print('W : {}, b : {}, loss : {}'.format(W_val, b_val, loss_val))

	### Only the last printed result is shown (recorded by the original author) ###
	# W : [[-1.0499572]
	# [ 1.4240427]
	# [-0.8138177]], b : [-0.63072276], loss : 0.604356050491333

	# predict
	my_score = np.array([[600, 3.8, 1]])
	# The query must be scaled with the same scaler as the training data.
	scaled_my_score = scaler_x.transform(my_score)
	result = sess.run(H, feed_dict = {X:scaled_my_score})
	print('####### tensorflow 결과값 #######')
	print('내 지원정보 : {}, 결과 : {}'.format(my_score, result))
	# Output recorded by the original author:
	# ####### tensorflow 결과값 #######
	# 내 지원정보 : [[600. 3.8 1. ]], 결과 : [[0.49538255]]

view raw multi_variable_logistic_tensorflow.py hosted with ❤ by GitHub

'Python > Data Analysis' 카테고리의 다른 글

Data Analysis / ML / Multinomial Classification (0)	2020.10.09
Data Analysis / ML / Basic Concept(3) (0)	2020.10.09
Data Analysis / ML / Logistic Regression(1) (0)	2020.10.07
Data Analysis / ML / TensorFlow - Linear Regression (0)	2020.10.05
Data Analysis / ML / Normalization (0)	2020.10.04

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

My Life

Data Analysis / ML / Logistic Regression(2)

< Single Variable Logistic Regression >

1. python

2. sklearn

3. tensorflow

< Multi Variable Logistic Regression >

0. 데이터의 boxplot 확인

1. sklearn

2. tensorflow

'Python > Data Analysis' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

Data Analysis / ML / Logistic Regression(2)

< Single Variable Logistic Regression >

1. python

2. sklearn

3. tensorflow

< Multi Variable Logistic Regression >

0. 데이터의 boxplot 확인

1. sklearn

2. tensorflow

'Python > Data Analysis' 카테고리의 다른 글

'Python/Data Analysis' Related Articles

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역