인공지능 공부/Pandas
2021-04-20 판다스 SVM 실습
앨런튜링_
2021. 4. 21. 09:11
SVM
import pandas as pd
import seaborn as sns
# Data preparation: load the Titanic dataset through seaborn and let pandas
# display up to 15 columns.
df = sns.load_dataset('titanic')
pd.set_option('display.max_columns', 15)

# Preprocessing: remove the 'deck' and 'embark_town' columns.
rdf = df.drop(['deck', 'embark_town'], axis=1)

# Remove every row whose 'age' value is missing.
rdf = rdf.dropna(subset=['age'], how='any', axis=0)

# Fill missing 'embarked' values with the most frequent value.
# The fill is done once at DataFrame level and assigned back: calling
# fillna(inplace=True) on a selected column is chained assignment, which
# does not update the parent frame when pandas Copy-on-Write is active.
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()
rdf = rdf.fillna({'embarked': most_freq})

# Select the columns used in the analysis.
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]

# One-hot encode 'sex' and 'embarked', append the indicator columns, then
# drop the original categorical columns.
# (The indicator frame must be bound to the same name that concat reads;
# a misspelled binding here raises NameError on the next line.)
onehot_sex = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf, onehot_sex], axis=1)
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix='town')
ndf = pd.concat([ndf, onehot_embarked], axis=1)
ndf.drop(['sex', 'embarked'], axis=1, inplace=True)

# Feature matrix X and target vector y.
X = ndf[['pclass', 'age', 'sibsp', 'parch', 'female', 'male',
         'town_C', 'town_Q', 'town_S']]
y = ndf['survived']

# Standardize the features.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split below - confirm this is intended.
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)

# Split into training data (70%) and test data (30%).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10)
print('train data 개수:', X_train.shape)
print('test data 개수', X_test.shape)
train data 개수: (499, 9)
test data 개수 (215, 9)
# Train a support vector classifier with an RBF kernel on the training
# split, then predict the labels of the test split.
from sklearn import svm
svm_model = svm.SVC(kernel='rbf').fit(X_train, y_train)  # fit() returns the estimator
y_hat = svm_model.predict(X_test)

# Show the first ten predictions next to the first ten true labels.
print(y_hat[:10])
print(y_test.values[:10])
[0 0 1 0 0 0 1 0 0 0]
[0 0 1 0 0 1 1 1 0 0]
# Evaluate the SVM predictions: build the confusion matrix and the
# classification report first, then print them with a blank gap between.
from sklearn import metrics
svm_matrix = metrics.confusion_matrix(y_test, y_hat)
svm_report = metrics.classification_report(y_test, y_hat)

print(svm_matrix)
print('\n')
print(svm_report)
[[120 5]
[ 35 55]]
precision recall f1-score support
0 0.77 0.96 0.86 125
1 0.92 0.61 0.73 90
accuracy 0.81 215
macro avg 0.85 0.79 0.80 215
weighted avg 0.83 0.81 0.81 215
import pandas as pd
import numpy as np
# Download the breast-cancer-wisconsin data file from the UCI repository.
# The file is read with header=None, so column names are assigned below.
uci_path = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
df = pd.read_csv(uci_path, header=None)

# Column labels; the spellings are kept as-is because later code selects
# columns by these exact names.
column_names = [
    'id', 'clump', 'cell_size', 'cell_shape', 'adhesion', 'epithlial',
    'bare_nuclei', 'chromatin', 'nomal_nucleoli', 'mitoses', 'class',
]
df.columns = column_names

pd.set_option('display.max_columns', 15)

# Take a first look at the data.
print(df.head())
print('\n')
id clump cell_size cell_shape adhesion epithlial bare_nuclei \
0 1000025 5 1 1 1 2 1
1 1002945 5 4 4 5 7 10
2 1015425 3 1 1 1 2 2
3 1016277 6 8 8 1 3 4
4 1017023 4 1 1 3 2 1
chromatin nomal_nucleoli mitoses class
0 3 1 1 2
1 3 2 1 2
2 3 1 1 2
3 3 7 1 2
4 3 1 1 2
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
df.info()
print('\n')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 699 non-null int64
1 clump 699 non-null int64
2 cell_size 699 non-null int64
3 cell_shape 699 non-null int64
4 adhesion 699 non-null int64
5 epithlial 699 non-null int64
6 bare_nuclei 699 non-null object
7 chromatin 699 non-null int64
8 nomal_nucleoli 699 non-null int64
9 mitoses 699 non-null int64
10 class 699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.2+ KB
None
# Print summary statistics. describe must be *called*; printing the bare
# attribute only shows the bound-method repr (a dump of the whole frame).
print(df.describe())
<bound method NDFrame.describe of id clump cell_size cell_shape adhesion epithlial bare_nuclei \
0 1000025 5 1 1 1 2 1
1 1002945 5 4 4 5 7 10
2 1015425 3 1 1 1 2 2
3 1016277 6 8 8 1 3 4
4 1017023 4 1 1 3 2 1
.. ... ... ... ... ... ... ...
694 776715 3 1 1 1 3 2
695 841769 2 1 1 1 2 1
696 888820 5 10 10 3 7 3
697 897471 4 8 6 4 3 4
698 897471 4 8 8 5 4 5
chromatin nomal_nucleoli mitoses class
0 3 1 1 2
1 3 2 1 2
2 3 1 1 2
3 3 7 1 2
4 3 1 1 2
.. ... ... ... ...
694 1 1 1 2
695 1 1 1 2
696 8 10 2 4
697 10 6 1 4
698 10 4 1 4
[699 rows x 11 columns]>
# Convert the 'bare_nuclei' column from strings to numbers.
# First step: list the distinct raw values present in the column.
bare_nuclei_values = df['bare_nuclei'].unique()
print(bare_nuclei_values)
['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']
# Turn the '?' placeholder into NaN. The result is assigned back to the
# column: replace(inplace=True) on a selected column is chained assignment
# and leaves the parent frame unchanged under pandas Copy-on-Write, in which
# case the '?' strings would survive and the int cast below would fail.
df['bare_nuclei'] = df['bare_nuclei'].replace('?', np.nan)
# Drop the rows that have no 'bare_nuclei' value.
df.dropna(subset=['bare_nuclei'], axis=0, inplace=True)
# The remaining values are digit strings; cast them to integers.
df['bare_nuclei'] = df['bare_nuclei'].astype('int')
print(df.describe())
id clump cell_size cell_shape adhesion \
count 6.830000e+02 683.000000 683.000000 683.000000 683.000000
mean 1.076720e+06 4.442167 3.150805 3.215227 2.830161
std 6.206440e+05 2.820761 3.065145 2.988581 2.864562
min 6.337500e+04 1.000000 1.000000 1.000000 1.000000
25% 8.776170e+05 2.000000 1.000000 1.000000 1.000000
50% 1.171795e+06 4.000000 1.000000 1.000000 1.000000
75% 1.238705e+06 6.000000 5.000000 5.000000 4.000000
max 1.345435e+07 10.000000 10.000000 10.000000 10.000000
epithlial bare_nuclei chromatin nomal_nucleoli mitoses \
count 683.000000 683.000000 683.000000 683.000000 683.000000
mean 3.234261 3.544656 3.445095 2.869693 1.603221
std 2.223085 3.643857 2.449697 3.052666 1.732674
min 1.000000 1.000000 1.000000 1.000000 1.000000
25% 2.000000 1.000000 2.000000 1.000000 1.000000
50% 2.000000 1.000000 3.000000 1.000000 1.000000
75% 4.000000 6.000000 5.000000 4.000000 1.000000
max 10.000000 10.000000 10.000000 10.000000 10.000000
class
count 683.000000
mean 2.699854
std 0.954592
min 2.000000
25% 2.000000
50% 2.000000
75% 4.000000
max 4.000000
# Feature matrix X (every column except 'class') and target vector y.
# NOTE(review): 'id' is selected as a feature here; it looks like a sample
# identifier rather than a measurement - confirm this is intended.
feature_columns = [
    'id', 'clump', 'cell_size', 'cell_shape', 'adhesion', 'epithlial',
    'bare_nuclei', 'chromatin', 'nomal_nucleoli', 'mitoses',
]
X = df[feature_columns]
y = df['class']

# Standardize the features.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
X = scaler.fit(X).transform(X)

# Hold out 30% of the rows as test data, with a fixed random seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10,
)
print('train data개수 : ', X_train.shape)
print('test data개수 : ', X_test.shape)
train data개수 : (478, 10)
test data개수 : (205, 10)
# Decision tree classification model: entropy split criterion, depth
# capped at 5. Fit on the training split, predict the test split.
from sklearn import tree
tree_params = {'criterion': 'entropy', 'max_depth': 5}
tree_model = tree.DecisionTreeClassifier(**tree_params)
tree_model.fit(X_train, y_train)
y_hat = tree_model.predict(X_test)

# Show the first ten predictions next to the first ten true labels.
print(y_hat[:10])
print(y_test.values[:10])
[4 4 4 4 4 4 2 2 4 4]
[4 4 4 4 4 4 2 2 4 4]
# Evaluate the decision tree predictions: build the confusion matrix and
# the classification report first, then print them with a blank gap between.
from sklearn import metrics
tree_matrix = metrics.confusion_matrix(y_test, y_hat)
tree_report = metrics.classification_report(y_test, y_hat)

print(tree_matrix)
print('\n')
print(tree_report)
[[127 4]
[ 2 72]]
precision recall f1-score support
2 0.98 0.97 0.98 131
4 0.95 0.97 0.96 74
accuracy 0.97 205
macro avg 0.97 0.97 0.97 205
weighted avg 0.97 0.97 0.97 205