인공지능 공부/Pandas

2021-04-20 판다스 다중회귀 실습

다중회귀분석
# Feature matrix (three predictor columns) and the target column.
X = ndf[['cylinders', 'horsepower', 'weight']]
y = ndf['mpg']

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

# Report the shape of each split.
print('훈련데이터', X_train.shape)
print('검증데이터', X_test.shape)
훈련데이터 (274, 3)
검증데이터 (118, 3)
# Fit a linear regression on the training split (fit() returns the estimator
# itself, so the fitted model can be bound in one statement).
lr = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out split and print the result,
# followed by blank lines as a visual separator.
r_square = lr.score(X_test, y_test)
print(r_square)
print('\n')
0.6939048496695597


# Print the fitted coefficients (one per column of X). The label previously
# read "개수" (count); the value shown is lr.coef_, so it is labelled
# "계수" (coefficient).
print('X 변수의 계수 a :',lr.coef_)
print('\n')

# Print the fitted intercept term.
print('상수항 b : ', lr.intercept_)
X 변수의 개수 a : [-0.60691288 -0.03714088 -0.00522268]


상수항 b :  46.41435126963405
# Predict the target for the held-out features.
y_hat = lr.predict(X_test)

# Overlay the density of the actual held-out targets and of the predictions.
# - kdeplot is the axes-level replacement for the deprecated
#   distplot(hist=False).
# - The predictions are for X_test, so they are compared with y_test (the
#   matching rows) rather than with the full-dataset y.
plt.figure(figsize=(10, 5))
ax1 = sns.kdeplot(x=y_test, label="y")
ax2 = sns.kdeplot(x=y_hat, label="y_hat", ax=ax1)
plt.legend()  # the curve labels are only visible once a legend is drawn
plt.show()
plt.close()
C:\Users\SM2130\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)
C:\Users\SM2130\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
  warnings.warn(msg, FutureWarning)

import pandas as pd 
import seaborn as sns

# Load the 'titanic' dataset as a DataFrame with seaborn's load_dataset.
df = sns.load_dataset('titanic')

# Show the first rows. head must be *called*: print(df.head) without
# parentheses prints the bound method's repr instead of the first rows.
print(df.head())
print('\n')

# Raise the column display limit to 15, then show the first rows again.
pd.set_option('display.max_columns', 15)
print(df.head())
<bound method NDFrame.head of      survived  pclass     sex   age  sibsp  ...  adult_male  deck  \
0           0       3    male  22.0      1  ...        True   NaN   
1           1       1  female  38.0      1  ...       False     C   
2           1       3  female  26.0      0  ...       False   NaN   
3           1       1  female  35.0      1  ...       False     C   
4           0       3    male  35.0      0  ...        True   NaN   
..        ...     ...     ...   ...    ...  ...         ...   ...   
886         0       2    male  27.0      0  ...        True   NaN   
887         1       1  female  19.0      0  ...       False     B   
888         0       3  female   NaN      1  ...       False   NaN   
889         1       1    male  26.0      0  ...        True     C   
890         0       3    male  32.0      0  ...        True   NaN   

     embark_town alive  alone  
0    Southampton    no  False  
1      Cherbourg   yes  False  
2    Southampton   yes   True  
3    Southampton   yes  False  
4    Southampton    no   True  
..           ...   ...    ...  
886  Southampton    no   True  
887  Southampton   yes   True  
888  Southampton    no  False  
889    Cherbourg   yes   True  
890   Queenstown    no   True  

[891 rows x 15 columns]>


   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
# Remove the 'deck' and 'embark_town' columns and list what is left.
rdf = df.drop(columns=['deck', 'embark_town'])
print(rdf.columns.values)
['survived' 'pclass' 'sex' 'age' 'sibsp' 'parch' 'fare' 'embarked' 'class'
 'who' 'adult_male' 'alive' 'alone']
# Drop every row whose 'age' is missing (row-wise, "any" are the defaults),
# then report how many rows remain.
rdf = rdf.dropna(subset=['age'])
print(len(rdf))
714
# Most frequent non-missing value of 'embarked' (value_counts skips NaN by
# default); the bare name on the last line displays it in the notebook.
most_freq = rdf['embarked'].value_counts().idxmax()
most_freq
'S'
# Summary statistics for every column, non-numeric ones included.
print(rdf.describe(include="all"))
          survived      pclass   sex         age       sibsp       parch  \
count   714.000000  714.000000   714  714.000000  714.000000  714.000000   
unique         NaN         NaN     2         NaN         NaN         NaN   
top            NaN         NaN  male         NaN         NaN         NaN   
freq           NaN         NaN   453         NaN         NaN         NaN   
mean      0.406162    2.236695   NaN   29.699118    0.512605    0.431373   
std       0.491460    0.838250   NaN   14.526497    0.929783    0.853289   
min       0.000000    1.000000   NaN    0.420000    0.000000    0.000000   
25%       0.000000    1.000000   NaN   20.125000    0.000000    0.000000   
50%       0.000000    2.000000   NaN   28.000000    0.000000    0.000000   
75%       1.000000    3.000000   NaN   38.000000    1.000000    1.000000   
max       1.000000    3.000000   NaN   80.000000    5.000000    6.000000   

              fare embarked  class  who adult_male alive alone  
count   714.000000      712    714  714        714   714   714  
unique         NaN        3      3    3          2     2     2  
top            NaN        S  Third  man       True    no  True  
freq           NaN      554    355  413        413   424   404  
mean     34.694514      NaN    NaN  NaN        NaN   NaN   NaN  
std      52.918930      NaN    NaN  NaN        NaN   NaN   NaN  
min       0.000000      NaN    NaN  NaN        NaN   NaN   NaN  
25%       8.050000      NaN    NaN  NaN        NaN   NaN   NaN  
50%      15.741700      NaN    NaN  NaN        NaN   NaN   NaN  
75%      33.375000      NaN    NaN  NaN        NaN   NaN   NaN  
max     512.329200      NaN    NaN  NaN        NaN   NaN   NaN  
# Replace missing 'embarked' values with the most frequent value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on rdf['embarked']: that chained in-place call acts on
# an intermediate object, raises a FutureWarning on recent pandas, and does
# not update rdf at all under Copy-on-Write.
rdf['embarked'] = rdf['embarked'].fillna(most_freq)

# Keep only the columns used for the analysis and preview the first rows.
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]
print(ndf.head())
   survived  pclass     sex   age  sibsp  parch embarked
0         0       3    male  22.0      1      0        S
1         1       1  female  38.0      1      0        C
2         1       3  female  26.0      0      0        S
3         1       1  female  35.0      1      0        S
4         0       3    male  35.0      0      0        S