# Fish lengths, index-aligned with fish_weight below.
# The first 35 entries are the samples labelled 1 (bream) further down,
# the last 14 are the samples labelled 0 (smelt).
fish_length = [
    # samples 0-34
    25.4, 26.3, 26.5, 29.0, 29.0, 29.7, 29.7,
    30.0, 30.0, 30.7, 31.0, 31.0, 31.5, 32.0,
    32.0, 32.0, 33.0, 33.0, 33.5, 33.5, 34.0,
    34.0, 34.5, 35.0, 35.0, 35.0, 35.0, 36.0,
    36.0, 37.0, 38.5, 38.5, 39.5, 41.0, 41.0,
    # samples 35-48
    9.8, 10.5, 10.6, 11.0, 11.2, 11.3, 11.8,
    11.8, 12.0, 12.2, 12.4, 13.0, 14.3, 15.0,
]

# Fish weights, in the same sample order as fish_length.
fish_weight = [
    # samples 0-34
    242.0, 290.0, 340.0, 363.0, 430.0, 450.0, 500.0,
    390.0, 450.0, 500.0, 475.0, 500.0, 500.0, 340.0,
    600.0, 600.0, 700.0, 700.0, 610.0, 650.0, 575.0,
    685.0, 620.0, 680.0, 700.0, 725.0, 720.0, 714.0,
    850.0, 1000.0, 920.0, 955.0, 925.0, 975.0, 950.0,
    # samples 35-48
    6.7, 7.5, 7.0, 9.7, 9.8, 8.7, 10.0,
    9.9, 9.8, 12.2, 13.4, 12.2, 19.7, 19.9,
]


import numpy as np

# np.column_stack() turns each sequence it receives into a column and joins
# the columns side by side; here two 3-element lists give a 3x2 array.
np.column_stack(([1,2,3], [4,5,6]))

# The sequences to join are passed together as ONE Python tuple argument.
# Hence the nesting: the call's parentheses ( ), then the tuple's parentheses ( ),
# then each list's square brackets [ ].

array([[1, 4],
       [2, 5],
       [3, 6]])


# Combine fish lengths and weights into one 2-D array:
# one row per fish, column 0 = length, column 1 = weight.
fish_data= np.column_stack((fish_length, fish_weight))


# Show the first five rows to check the result.
fish_data[:5]

array([[ 25.4, 242. ],
       [ 26.3, 290. ],
       [ 26.5, 340. ],
       [ 29. , 363. ],
       [ 29. , 430. ]])


# np.ones() and np.zeros() create arrays filled with the requested number of 1s and 0s, respectively.

np.ones(5)

array([1., 1., 1., 1., 1.])


# Labels: 1.0 repeated 35 times followed by 0.0 repeated 14 times
# (1 = bream, 0 = smelt), matching the sample order of the data lists.
fish_target = np.repeat([1.0, 0.0], [35, 14])


fish_target

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


# Split the data into a training set and a test set with scikit-learn.
from sklearn.model_selection import train_test_split


# train_test_split() takes a random_state parameter that pins its random
# shuffling, so the same split is produced on every run.

split = train_test_split(fish_data, fish_target, random_state=42)
train_input, test_input, train_target, test_target = split


# Print the shapes: inputs on the first line, targets on the second
# (training part first, test part second on each line).
for train_part, test_part in ((train_input, test_input), (train_target, test_target)):
    print(train_part.shape, test_part.shape)

(36, 2) (13, 2)
(36,) (13,)


import pandas as pd

# Wrap the training inputs in a DataFrame; the bare name on the next line
# makes the notebook render it as a table.
df=pd.DataFrame(train_input)
df


# Same for the test targets.
df1=pd.DataFrame(test_target)
df1


# Show the raw test labels.
test_target

array([1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.])


# Smelt (label 0) is under-represented in the test set above -- sampling bias!

# Fix for the sampling bias: hand the target data to train_test_split()
# through the `stratify` parameter so the data is divided according to the
# class proportions.

train_input, test_input, train_target, test_target = train_test_split(
    fish_data,
    fish_target,
    random_state=42,
    stratify=fish_target,
)


test_target

array([0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1.])


# "Training" a k-nearest-neighbours model amounts to storing the training data.
from sklearn.neighbors import KNeighborsClassifier

# fit() hands back the estimator itself, so building and fitting are chained.
kn = KNeighborsClassifier().fit(train_input, train_target)
kn.score(test_input, test_target)

C:\Users\82104\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

1.0


# A perfect score: bream and smelt in the test set are all classified correctly.

# Classify a new sample with length 25 and weight 150.
print(kn.predict([[25,150]]))

[0.]

C:\Users\82104\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# A smelt this large? Hard to believe -- draw a scatter plot to check.
import matplotlib.pyplot as plt

# Transposing gives one row per feature, so unpacking passes
# (lengths, weights) as the x and y arguments.
plt.scatter(*train_input.T)
plt.scatter(25, 150, marker='^')  # `marker` selects the glyph; '^' is a triangle
plt.xlabel('length')
plt.ylabel('weight')
plt.show()


# Why was the sample grouped with the smelt instead of the bream?
# Ask the model which training samples are its nearest neighbours.

distances, indexes = kn.kneighbors([[25, 150]])


# Use the `indexes` array to pick the neighbour samples out of the training
# data and draw them separately.
neighbors = train_input[indexes]
plt.scatter(*train_input.T)
plt.scatter(25, 150, marker='^')
plt.scatter(neighbors[..., 0], neighbors[..., 1], marker='D')  # 'D' draws diamonds
plt.xlabel('length')
plt.ylabel('weight')
plt.show()


# Judging by the diamonds, four of the neighbour samples are smelt and one is a bream.

# Look up the neighbours' feature values.
train_input[indexes]

array([[[ 25.4, 242. ],
        [ 15. ,  19.9],
        [ 14.3,  19.7],
        [ 13. ,  12.2],
        [ 12.2,  12.2]]])


# Confirm with the target data: four of the nearest fish are smelt (label 0).
train_target[indexes]

array([[1., 0., 0., 0., 0.]])


print(distances) # holds the distance from the sample to each neighbour

[[ 92.00086956 130.48375378 130.73859415 138.32150953 138.39320793]]


# The cause is the mismatched axis ranges, so put the x-axis on the same
# 0-1000 range and redraw.

nearest = train_input[indexes[0]]  # the neighbour rows as a plain 2-D array
plt.scatter(*train_input.T)
plt.scatter(25, 150, marker='^')
plt.scatter(nearest[:, 0], nearest[:, 1], marker='D')
plt.xlim(0, 1000)
plt.xlabel('length')
plt.ylabel('weight')
plt.show()


# With equal axis ranges the plot is even harder to read ==> the data needs preprocessing.
# A commonly used preprocessing method is the standard score: how many
# standard deviations a feature value lies from the mean.

# axis=0 aggregates down the rows and yields one value per column (feature);
# axis=1 would aggregate across the columns and yield one value per row.
mean = train_input.mean(axis=0)  # per-feature mean
std = train_input.std(axis=0)    # per-feature standard deviation

print(mean, std)

[ 27.29722222 454.09722222] [  9.98244253 323.29893931]


# standard score = (raw data - mean) / standard deviation

centered = train_input - mean
train_scaled = centered / std


plt.scatter(*train_scaled.T)
# NOTE: the sample is still given in raw units here, unlike the scaled training data.
plt.scatter(25, 150, marker='^')
plt.xlabel('length')
plt.ylabel('weight')
plt.show()


# The plot looks wrong because the value ranges changed:
# the sample [25, 150] was not converted with the same scaling.

new = (np.array([25, 150]) - mean) / std
scaled_length, scaled_weight = new
plt.scatter(*train_scaled.T)
plt.scatter(scaled_length, scaled_weight, marker='^')
plt.xlabel('length')
plt.ylabel('weight')
plt.show()


# The difference from the first scatter plot: the axes now run from about -1.5 to 1.5.

# Train the k-nearest-neighbours model on the standardized data.
kn.fit(train_scaled, train_target)

KNeighborsClassifier()


# Evaluate the model. The test inputs are scaled with `mean` and `std`,
# which were computed from the training set.
test_scaled=(test_input - mean)/std


kn.score(test_scaled, test_target)

C:\Users\82104\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

1.0


# Predict with the sample that was converted using the training set's mean and standard deviation.
kn.predict([new])

C:\Users\82104\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array([1.])


# The model predicts bream (1.0) -- a 25 cm, 150 g fish should indeed be a bream.


# Find the scaled sample's nearest neighbours and highlight them.
distances, indexes = kn.kneighbors([new])
closest = train_scaled[indexes]
plt.scatter(*train_scaled.T)
plt.scatter(*new, marker='^')
plt.scatter(closest[..., 0], closest[..., 1], marker='D')
plt.xlabel('length')
plt.ylabel('weight')
plt.show()


# The samples closest to the triangle (the new sample) are all bream --> predicted as bream.

Chapter 2-2: Data preprocessing - standard score

Data preprocessing

Summary

	0	1
0	30.0	450.0
1	29.0	363.0
2	29.7	500.0
3	11.3	8.7
4	11.8	10.0
5	13.0	12.2
6	32.0	600.0
7	30.7	500.0
8	33.0	700.0
9	35.0	700.0
10	41.0	975.0
11	38.5	920.0
12	25.4	242.0
13	12.0	9.8
14	39.5	925.0
15	29.7	450.0
16	37.0	1000.0
17	31.0	500.0
18	10.5	7.5
19	26.3	290.0
20	34.0	685.0
21	26.5	340.0
22	10.6	7.0
23	9.8	6.7
24	35.0	680.0
25	11.2	9.8
26	31.0	475.0
27	34.5	620.0
28	33.5	610.0
29	15.0	19.9
30	34.0	575.0
31	30.0	390.0
32	11.8	9.9
33	32.0	600.0
34	36.0	850.0
35	11.0	9.7