다중회귀분석
from sklearn.model_selection import train_test_split

# Predictor columns (X) and the target column (y) are both taken from ndf.
feature_columns = ['cylinders', 'horsepower', 'weight']
X = ndf[feature_columns]
y = ndf['mpg']

# Reserve 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

# Show the (rows, columns) shape of the training and test predictors.
print('훈련데이터', X_train.shape)
print('검증데이터', X_test.shape)
훈련데이터 (274, 3)
검증데이터 (118, 3)
# Build the linear regression model and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
lr = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out split; score() gives the coefficient of
# determination (R^2) for a regressor.
r_square = lr.score(X_test, y_test)
print(r_square)
print('\n')
0.6939048496695597
# Print the fitted regression parameters: one slope per column of X, then
# the intercept.
# The first label previously read '개수' (count); the value printed is
# lr.coef_, i.e. the coefficients ('계수'), so the label is corrected.
print('X 변수의 계수 a :', lr.coef_)
print('\n')
print('상수항 b : ', lr.intercept_)
X 변수의 개수 a : [-0.60691288 -0.03714088 -0.00522268]
상수항 b : 46.41435126963405
# Predict the target for the held-out predictors.
y_hat = lr.predict(X_test)

# Overlay the kernel-density curves of the observed and predicted values.
# sns.distplot(..., hist=False) is deprecated (it emits a FutureWarning);
# sns.kdeplot is the axes-level replacement that draws the same KDE curve.
# NOTE(review): `y` is the full target column while `y_hat` only covers the
# test split -- confirm whether `y_test` was intended here.
plt.figure(figsize=(10, 5))
ax1 = sns.kdeplot(y, label="y")
ax2 = sns.kdeplot(y_hat, label="y_hat", ax=ax1)
# Without a legend the label= arguments above are never shown.
ax1.legend()
plt.show()
plt.close()
C:\Users\SM2130\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)
C:\Users\SM2130\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).
warnings.warn(msg, FutureWarning)
import pandas as pd
import seaborn as sns
# Load the 'titanic' example dataset as a DataFrame via seaborn's load_dataset.
df = sns.load_dataset('titanic')

# Preview the first rows with the current display settings.
# (The original passed the bound method `df.head` to print() without calling
# it, which printed the method repr together with the whole frame.)
print(df.head())
print('\n')

# Allow up to 15 columns to be displayed, then preview again so that every
# column of the frame is visible.
pd.set_option('display.max_columns', 15)
print(df.head())
<bound method NDFrame.head of survived pclass sex age sibsp ... adult_male deck \
0 0 3 male 22.0 1 ... True NaN
1 1 1 female 38.0 1 ... False C
2 1 3 female 26.0 0 ... False NaN
3 1 1 female 35.0 1 ... False C
4 0 3 male 35.0 0 ... True NaN
.. ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 ... True NaN
887 1 1 female 19.0 0 ... False B
888 0 3 female NaN 1 ... False NaN
889 1 1 male 26.0 0 ... True C
890 0 3 male 32.0 0 ... True NaN
embark_town alive alone
0 Southampton no False
1 Cherbourg yes False
2 Southampton yes True
3 Southampton yes False
4 Southampton no True
.. ... ... ...
886 Southampton no True
887 Southampton yes True
888 Southampton no False
889 Cherbourg yes True
890 Queenstown no True
[891 rows x 15 columns]>
survived pclass sex age sibsp parch fare embarked class \
0 0 3 male 22.0 1 0 7.2500 S Third
1 1 1 female 38.0 1 0 71.2833 C First
2 1 3 female 26.0 0 0 7.9250 S Third
3 1 1 female 35.0 1 0 53.1000 S First
4 0 3 male 35.0 0 0 8.0500 S Third
who adult_male deck embark_town alive alone
0 man True NaN Southampton no False
1 woman False C Cherbourg yes False
2 woman False NaN Southampton yes True
3 woman False C Southampton yes False
4 man True NaN Southampton no True
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
# Remove the 'deck' and 'embark_town' columns, then list the remaining
# column names.
unused_columns = ['deck', 'embark_town']
rdf = df.drop(columns=unused_columns)
print(rdf.columns.values)
['survived' 'pclass' 'sex' 'age' 'sibsp' 'parch' 'fare' 'embarked' 'class'
'who' 'adult_male' 'alive' 'alone']
# Discard every row whose 'age' is missing (row-wise, "any" are the dropna
# defaults) and report how many rows remain.
rdf = rdf.dropna(subset=['age'])
print(len(rdf))
714
# Count each non-missing 'embarked' value and keep the most frequent one.
embarked_counts = rdf['embarked'].value_counts(dropna=True)
most_freq = embarked_counts.idxmax()
# Bare expression: displays the value when run as a notebook cell.
most_freq
'S'
# Summary statistics for every column, numeric and non-numeric alike.
rdf_summary = rdf.describe(include='all')
print(rdf_summary)
survived pclass sex age sibsp parch \
count 714.000000 714.000000 714 714.000000 714.000000 714.000000
unique NaN NaN 2 NaN NaN NaN
top NaN NaN male NaN NaN NaN
freq NaN NaN 453 NaN NaN NaN
mean 0.406162 2.236695 NaN 29.699118 0.512605 0.431373
std 0.491460 0.838250 NaN 14.526497 0.929783 0.853289
min 0.000000 1.000000 NaN 0.420000 0.000000 0.000000
25% 0.000000 1.000000 NaN 20.125000 0.000000 0.000000
50% 0.000000 2.000000 NaN 28.000000 0.000000 0.000000
75% 1.000000 3.000000 NaN 38.000000 1.000000 1.000000
max 1.000000 3.000000 NaN 80.000000 5.000000 6.000000
fare embarked class who adult_male alive alone
count 714.000000 712 714 714 714 714 714
unique NaN 3 3 3 2 2 2
top NaN S Third man True no True
freq NaN 554 355 413 413 424 404
mean 34.694514 NaN NaN NaN NaN NaN NaN
std 52.918930 NaN NaN NaN NaN NaN NaN
min 0.000000 NaN NaN NaN NaN NaN NaN
25% 8.050000 NaN NaN NaN NaN NaN NaN
50% 15.741700 NaN NaN NaN NaN NaN NaN
75% 33.375000 NaN NaN NaN NaN NaN NaN
max 512.329200 NaN NaN NaN NaN NaN NaN
# Replace missing 'embarked' values with the most frequent value.
# The original called fillna(..., inplace=True) on the result of
# rdf['embarked']; that chained in-place update emits a FutureWarning in
# pandas 2.x and leaves rdf unchanged once Copy-on-Write is enabled.
# Assigning the filled column back updates rdf itself.
rdf['embarked'] = rdf['embarked'].fillna(most_freq)

# Keep only the columns selected for the analysis and preview the result.
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]
print(ndf.head())
survived pclass sex age sibsp parch embarked
0 0 3 male 22.0 1 0 S
1 1 1 female 38.0 1 0 C
2 1 3 female 26.0 0 0 S
3 1 1 female 35.0 1 0 S
4 0 3 male 35.0 0 0 S