2021-04-20 판다스 SVM 실습

인공지능 공부/Fandas
2021-04-20 판다스 SVM 실습

앨런튜링_ 2021. 4. 21. 09:11
SVM
import pandas as pd
import seaborn as sns

# Load the Titanic dataset through seaborn and let pandas print up to
# 15 columns so wide frames are not truncated on display.
df = sns.load_dataset('titanic')
pd.set_option('display.max_columns', 15)

# Preprocessing: drop the 'deck' and 'embark_town' columns.
rdf = df.drop(['deck', 'embark_town'], axis=1)

# Drop every row whose 'age' value is missing.
rdf = rdf.dropna(subset=['age'], how='any', axis=0)

# Fill missing 'embarked' values with the most frequent value of that column.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: the in-place form acts on an intermediate object
# (chained assignment), which pandas warns about and which leaves the parent
# frame unchanged under Copy-on-Write. A single call is sufficient; the
# original issued the same fill twice.
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()
rdf['embarked'] = rdf['embarked'].fillna(most_freq)

# Select the columns used in the analysis.
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]

# One-hot encode 'sex' and append the indicator columns to the frame.
# The variable is spelled `onehot_sex` on both lines; the original assigned
# to `onhot_sex` and then read `onehot_sex`, which raised NameError.
onehot_sex = pd.get_dummies(ndf['sex'])
ndf = pd.concat([ndf, onehot_sex], axis=1)

# One-hot encode 'embarked'; the indicator columns get the 'town' prefix.
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix='town')
ndf = pd.concat([ndf, onehot_embarked], axis=1)

# The original categorical columns are replaced by their indicator columns.
ndf.drop(['sex', 'embarked'], axis=1, inplace=True)

# Separate the data into features (X) and target (y), then into
# training and test sets.
feature_columns = ['pclass', 'age', 'sibsp', 'parch', 'female', 'male',
                   'town_C', 'town_Q', 'town_S']
X = ndf[feature_columns]
y = ndf['survived']

# Scale the features with StandardScaler.
# NOTE(review): the scaler is fit on all rows before the split, so the test
# rows also contribute to the scaling statistics -- confirm this is intended.
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

# Hold out 30% of the rows as test data; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10)

print('train data 개수:',X_train.shape)
print('test data 개수', X_test.shape)
train data 개수: (499, 9)
test data 개수 (215, 9)
from sklearn import svm

# Create an SVC with the RBF kernel and train it on the training split.
# fit() returns the estimator itself, so creation and training are chained.
svm_model = svm.SVC(kernel='rbf').fit(X_train, y_train)

# Predict labels for the test split.
y_hat = svm_model.predict(X_test)

# Print the first ten predictions, then the first ten true labels.
print(y_hat[:10])
print(y_test.values[:10])
[0 0 1 0 0 0 1 0 0 0]
[0 0 1 0 0 1 1 1 0 0]
from sklearn import metrics

# Confusion matrix of the SVM predictions against the true test labels.
svm_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)
print(svm_matrix)
print('\n')

# Text report with precision, recall, f1-score and support per class.
svm_report = metrics.classification_report(y_true=y_test, y_pred=y_hat)
print(svm_report)
[[120   5]
 [ 35  55]]


              precision    recall  f1-score   support

           0       0.77      0.96      0.86       125
           1       0.92      0.61      0.73        90

    accuracy                           0.81       215
   macro avg       0.85      0.79      0.80       215
weighted avg       0.83      0.81      0.81       215

import pandas as pd
import numpy as np

# URL of the UCI breast-cancer-wisconsin data file (adjacent string literals
# are joined into a single string).
uci_path = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)

# The file is read with header=None, so the column names are supplied here.
df = pd.read_csv(
    uci_path,
    header=None,
    names=['id', 'clump', 'cell_size', 'cell_shape', 'adhesion', 'epithlial',
           'bare_nuclei', 'chromatin', 'nomal_nucleoli', 'mitoses', 'class'],
)

# Allow up to 15 columns to be printed when displaying frames.
pd.set_option('display.max_columns', 15)

# Take a first look at the data.
print(df.head())
print('\n')
        id  clump  cell_size  cell_shape  adhesion  epithlial bare_nuclei  \
0  1000025      5          1           1         1          2           1   
1  1002945      5          4           4         5          7          10   
2  1015425      3          1           1         1          2           2   
3  1016277      6          8           8         1          3           4   
4  1017023      4          1           1         3          2           1   

   chromatin  nomal_nucleoli  mitoses  class  
0          3               1        1      2  
1          3               2        1      2  
2          3               1        1      2  
3          3               7        1      2  
4          3               1        1      2  


# Show the frame summary (index range, columns, non-null counts, dtypes).
# df.info() prints the summary itself and returns None, so the surrounding
# print() additionally emits a "None" line after it.
print(df.info())
print('\n')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              699 non-null    int64 
 1   clump           699 non-null    int64 
 2   cell_size       699 non-null    int64 
 3   cell_shape      699 non-null    int64 
 4   adhesion        699 non-null    int64 
 5   epithlial       699 non-null    int64 
 6   bare_nuclei     699 non-null    object
 7   chromatin       699 non-null    int64 
 8   nomal_nucleoli  699 non-null    int64 
 9   mitoses         699 non-null    int64 
 10  class           699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB
None


# NOTE(review): `describe` is referenced without call parentheses, so this
# prints the repr of the bound method (which embeds the frame contents)
# instead of summary statistics; `df.describe()` was presumably intended --
# confirm before changing, since the recorded output below matches this form.
print(df.describe)
<bound method NDFrame.describe of           id  clump  cell_size  cell_shape  adhesion  epithlial bare_nuclei  \
0    1000025      5          1           1         1          2           1   
1    1002945      5          4           4         5          7          10   
2    1015425      3          1           1         1          2           2   
3    1016277      6          8           8         1          3           4   
4    1017023      4          1           1         3          2           1   
..       ...    ...        ...         ...       ...        ...         ...   
694   776715      3          1           1         1          3           2   
695   841769      2          1           1         1          2           1   
696   888820      5         10          10         3          7           3   
697   897471      4          8           6         4          3           4   
698   897471      4          8           8         5          4           5   

     chromatin  nomal_nucleoli  mitoses  class  
0            3               1        1      2  
1            3               2        1      2  
2            3               1        1      2  
3            3               7        1      2  
4            3               1        1      2  
..         ...             ...      ...    ...  
694          1               1        1      2  
695          1               1        1      2  
696          8              10        2      4  
697         10               6        1      4  
698         10               4        1      4  

[699 rows x 11 columns]>
# Change the dtype of the 'bare_nuclei' column (from string to number).
# Start by listing the distinct values the column currently holds.
print(df['bare_nuclei'].unique())
['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']
# Turn the '?' placeholder into NaN so it can be handled as a missing value.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the selected column: the in-place form is
# chained assignment, which pandas warns about and which does not modify the
# parent frame under Copy-on-Write.
df['bare_nuclei'] = df['bare_nuclei'].replace('?', np.nan)

# Drop the rows where 'bare_nuclei' is missing, then convert the remaining
# string values to integers.
df.dropna(subset=['bare_nuclei'], axis=0, inplace=True)
df['bare_nuclei'] = df['bare_nuclei'].astype('int')

# Summary statistics of the cleaned frame.
print(df.describe())
                 id       clump   cell_size  cell_shape    adhesion  \
count  6.830000e+02  683.000000  683.000000  683.000000  683.000000   
mean   1.076720e+06    4.442167    3.150805    3.215227    2.830161   
std    6.206440e+05    2.820761    3.065145    2.988581    2.864562   
min    6.337500e+04    1.000000    1.000000    1.000000    1.000000   
25%    8.776170e+05    2.000000    1.000000    1.000000    1.000000   
50%    1.171795e+06    4.000000    1.000000    1.000000    1.000000   
75%    1.238705e+06    6.000000    5.000000    5.000000    4.000000   
max    1.345435e+07   10.000000   10.000000   10.000000   10.000000   

        epithlial  bare_nuclei   chromatin  nomal_nucleoli     mitoses  \
count  683.000000   683.000000  683.000000      683.000000  683.000000   
mean     3.234261     3.544656    3.445095        2.869693    1.603221   
std      2.223085     3.643857    2.449697        3.052666    1.732674   
min      1.000000     1.000000    1.000000        1.000000    1.000000   
25%      2.000000     1.000000    2.000000        1.000000    1.000000   
50%      2.000000     1.000000    3.000000        1.000000    1.000000   
75%      4.000000     6.000000    5.000000        4.000000    1.000000   
max     10.000000    10.000000   10.000000       10.000000   10.000000   

            class  
count  683.000000  
mean     2.699854  
std      0.954592  
min      2.000000  
25%      2.000000  
50%      2.000000  
75%      4.000000  
max      4.000000  
# Feature matrix (X) and target vector (y).
# NOTE(review): 'id' is included among the features; if it is only a sample
# identifier it probably should not be a model input -- confirm.
feature_columns = ['id', 'clump', 'cell_size', 'cell_shape', 'adhesion',
                   'epithlial', 'bare_nuclei', 'chromatin', 'nomal_nucleoli',
                   'mitoses']
X = df[feature_columns]
y = df['class']

# Scale the features with StandardScaler (fit and transform in one call).
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit_transform(X)

# Hold out 30% of the rows as test data; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10)

print('train data개수 : ', X_train.shape)
print('test data개수 : ', X_test.shape)
train data개수 :  (478, 10)
test data개수 :  (205, 10)
# Decision Tree classification model
from sklearn import tree

# Entropy is the split criterion and the tree depth is capped at 5.
# fit() returns the estimator itself, so creation and training are chained.
tree_model = tree.DecisionTreeClassifier(
    criterion='entropy', max_depth=5).fit(X_train, y_train)

# Predict labels for the test split.
y_hat = tree_model.predict(X_test)

# Print the first ten predictions, then the first ten true labels.
print(y_hat[:10])
print(y_test.values[:10])
[4 4 4 4 4 4 2 2 4 4]
[4 4 4 4 4 4 2 2 4 4]
from sklearn import metrics

# Confusion matrix of the tree predictions against the true test labels.
tree_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)
print(tree_matrix)
print('\n')

# Text report with precision, recall, f1-score and support per class.
tree_report = metrics.classification_report(y_true=y_test, y_pred=y_hat)
print(tree_report)
[[127   4]
 [  2  72]]


              precision    recall  f1-score   support

           2       0.98      0.97      0.98       131
           4       0.95      0.97      0.96        74

    accuracy                           0.97       205
   macro avg       0.97      0.97      0.97       205
weighted avg       0.97      0.97      0.97       205