(코로나 바이러스 예측) Analysis of COVID-19 data using Python

import urllib
import datetime as dt
from matplotlib import pyplot as plt
import matplotlib
import pandas as pd
import seaborn as sns

# Download the full COVID-19 CSV (ECDC data path on ourworldindata.org)
# straight into a DataFrame and preview the first rows.
url = "https://covid.ourworldindata.org/data/ecdc/full_data.csv"
CVD = pd.read_csv(url)
print(CVD.head(5))

        date     location  new_cases  new_deaths  total_cases  total_deaths  \
0 2019-12-31  Afghanistan        0.0         0.0          NaN           NaN   
1 2020-01-01  Afghanistan        0.0         0.0          NaN           NaN   
2 2020-01-02  Afghanistan        0.0         0.0          NaN           NaN   
3 2020-01-03  Afghanistan        0.0         0.0          NaN           NaN   
4 2020-01-04  Afghanistan        0.0         0.0          NaN           NaN   

   weekly_cases  weekly_deaths  biweekly_cases  biweekly_deaths  
0           NaN            NaN             NaN              NaN  
1           NaN            NaN             NaN              NaN  
2           NaN            NaN             NaN              NaN  
3           NaN            NaN             NaN              NaN  
4           NaN            NaN             NaN              NaN  
# Inspect the column dtypes; at this point `date` is still an object (string) column.
print(CVD.dtypes)
date                object
location            object
new_cases          float64
new_deaths         float64
total_cases        float64
total_deaths       float64
weekly_cases       float64
weekly_deaths      float64
biweekly_cases     float64
biweekly_deaths    float64
dtype: object
# Convert the date column from strings to datetime values.
# pd.to_datetime parses the whole column in one vectorized call instead of
# running strptime on every row in a Python-level loop.
CVD['date'] = pd.to_datetime(CVD['date'], format='%Y-%m-%d')
print(CVD.dtypes)
date               datetime64[ns]
location                   object
new_cases                 float64
new_deaths                float64
total_cases               float64
total_deaths              float64
weekly_cases              float64
weekly_deaths             float64
biweekly_cases            float64
biweekly_deaths           float64
dtype: object
# Countries compared in the per-country plots.
countries = ['United States', 'Spain', 'Italy', 'South Korea']
# Boolean mask: True for rows whose location is one of the selected countries.
is_selected = CVD['location'].isin(countries)
CVD_country = CVD[is_selected]
CVD_country
date	location	new_cases	new_deaths	total_cases	total_deaths	weekly_cases	weekly_deaths	biweekly_cases	biweekly_deaths
27241	2019-12-31	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
27242	2020-01-01	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
27243	2020-01-02	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
27244	2020-01-03	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
27245	2020-01-04	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...
56361	2020-11-25	United States	170293.0	2224.0	12591165.0	259925.0	1231363.0	11238.0	2333339.0	20242.0
56362	2020-11-26	United States	186589.0	2341.0	12777754.0	262266.0	1247947.0	11729.0	2376622.0	20466.0
56363	2020-11-27	United States	106091.0	1189.0	12883845.0	263455.0	1166018.0	10900.0	2329044.0	21025.0
56364	2020-11-28	United States	207913.0	1404.0	13091758.0	264859.0	1177814.0	10446.0	2352144.0	20514.0
56365	2020-11-29	United States	154893.0	1204.0	13246651.0	266063.0	1157213.0	10164.0	2341760.0	20463.0
1339 rows × 10 columns

# Move `date` from a regular column into the index, modifying CVD_country in place.
# The Series.plot calls further down draw against the index, i.e. the dates.
CVD_country.set_index('date', inplace= True)
CVD_country
location	new_cases	new_deaths	total_cases	total_deaths	weekly_cases	weekly_deaths	biweekly_cases	biweekly_deaths
date									
2019-12-31	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-02	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-03	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-04	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...
2020-11-25	United States	170293.0	2224.0	12591165.0	259925.0	1231363.0	11238.0	2333339.0	20242.0
2020-11-26	United States	186589.0	2341.0	12777754.0	262266.0	1247947.0	11729.0	2376622.0	20466.0
2020-11-27	United States	106091.0	1189.0	12883845.0	263455.0	1166018.0	10900.0	2329044.0	21025.0
2020-11-28	United States	207913.0	1404.0	13091758.0	264859.0	1177814.0	10446.0	2352144.0	20514.0
2020-11-29	United States	154893.0	1204.0	13246651.0	266063.0	1157213.0	10164.0	2341760.0	20463.0
1339 rows × 9 columns

# Cumulative deaths divided by cumulative cases (a fraction, not a percentage).
# assign() returns a new DataFrame, so the column is not written onto the
# filtered slice of CVD and pandas does not emit SettingWithCopyWarning.
CVD_country = CVD_country.assign(
    mortality_rate=CVD_country['total_deaths'] / CVD_country['total_cases']
)
CVD_country
location	new_cases	new_deaths	total_cases	total_deaths	weekly_cases	weekly_deaths	biweekly_cases	biweekly_deaths	mortality_rate
date										
2019-12-31	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-01	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-02	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-03	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2020-01-04	Italy	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...
2020-11-25	United States	170293.0	2224.0	12591165.0	259925.0	1231363.0	11238.0	2333339.0	20242.0	0.020643
2020-11-26	United States	186589.0	2341.0	12777754.0	262266.0	1247947.0	11729.0	2376622.0	20466.0	0.020525
2020-11-27	United States	106091.0	1189.0	12883845.0	263455.0	1166018.0	10900.0	2329044.0	21025.0	0.020448
2020-11-28	United States	207913.0	1404.0	13091758.0	264859.0	1177814.0	10446.0	2352144.0	20514.0	0.020231
2020-11-29	United States	154893.0	1204.0	13246651.0	266063.0	1157213.0	10164.0	2341760.0	20463.0	0.020085
1339 rows × 10 columns

# 2x2 grid: daily figures on the top row, cumulative figures on the bottom row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 20))

# Group once by country; each panel then gets one line per country.
by_location = CVD_country.groupby('location')
by_location['new_cases'].plot(ax=axes[0, 0], legend=True)
by_location['new_deaths'].plot(ax=axes[0, 1], legend=True)
by_location['total_cases'].plot(ax=axes[1, 0], legend=True)
by_location['total_deaths'].plot(ax=axes[1, 1], legend=True)
location
Italy            AxesSubplot(0.547727,0.125;0.352273x0.343182)
South Korea      AxesSubplot(0.547727,0.125;0.352273x0.343182)
Spain            AxesSubplot(0.547727,0.125;0.352273x0.343182)
United States    AxesSubplot(0.547727,0.125;0.352273x0.343182)
Name: total_deaths, dtype: object
# Title each panel with the metric plotted on it.
axes[0,0].set_title("New Cases")
axes[0,1].set_title("New Deaths")
axes[1,0].set_title("Total Cases")
axes[1,1].set_title("Total Deaths")
Text(0.5, 1.0, 'Total Deaths')

# Count the missing values in each column.
print(CVD.isnull().sum())
date                   0
location               0
new_cases            333
new_deaths           333
total_cases         3303
total_deaths       12940
weekly_cases        1132
weekly_deaths       1132
biweekly_cases      2637
biweekly_deaths     2637
dtype: int64
# Give the columns used by the later steps display-friendly names.
# Renaming by name (rather than assigning a positional list of all ten labels)
# keeps the mapping correct even if the CSV's column order or column set changes;
# columns not listed here keep their original names.
CVD = CVD.rename(columns={
    'location': 'Country',
    'new_cases': 'New Cases',
    'new_deaths': 'New deaths',
    'total_cases': 'Total Cases',
    'total_deaths': 'Total Deaths',
})
CVD
date	Country	New Cases	New deaths	Total Cases	Total Deaths	weekly_cases	weekly_deaths	biweekly_cases	biweekly_deaths
0	2019-12-31	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
1	2020-01-01	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
2	2020-01-02	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
3	2020-01-03	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
4	2020-01-04	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...
59349	2020-11-25	Zimbabwe	90.0	1.0	9398.0	274.0	453.0	14.0	788.0	19.0
59350	2020-11-26	Zimbabwe	110.0	0.0	9508.0	274.0	527.0	13.0	841.0	19.0
59351	2020-11-27	Zimbabwe	115.0	0.0	9623.0	274.0	577.0	9.0	927.0	19.0
59352	2020-11-28	Zimbabwe	91.0	1.0	9714.0	275.0	594.0	10.0	949.0	18.0
59353	2020-11-29	Zimbabwe	108.0	0.0	9822.0	275.0	650.0	10.0	1036.0	18.0
59354 rows × 10 columns

# `~` negates the boolean mask, keeping every row whose Country is NOT listed.
# Besides "China", the "World" rows are dropped too
# (presumably a global aggregate row; verify against the dataset).
excluded = ["China", "World"]
CVD_no_china = CVD[~CVD['Country'].isin(excluded)]
CVD_no_china
date	Country	New Cases	New deaths	Total Cases	Total Deaths	weekly_cases	weekly_deaths	biweekly_cases	biweekly_deaths
0	2019-12-31	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
1	2020-01-01	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
2	2020-01-02	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
3	2020-01-03	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
4	2020-01-04	Afghanistan	0.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...
59349	2020-11-25	Zimbabwe	90.0	1.0	9398.0	274.0	453.0	14.0	788.0	19.0
59350	2020-11-26	Zimbabwe	110.0	0.0	9508.0	274.0	527.0	13.0	841.0	19.0
59351	2020-11-27	Zimbabwe	115.0	0.0	9623.0	274.0	577.0	9.0	927.0	19.0
59352	2020-11-28	Zimbabwe	91.0	1.0	9714.0	275.0	594.0	10.0	949.0	18.0
59353	2020-11-29	Zimbabwe	108.0	0.0	9822.0	275.0	650.0	10.0	1036.0	18.0
58684 rows × 10 columns

# Collapse to one row per (Country, date), keeping only the cumulative totals.
# The two columns are selected with a list: selecting them with a bare tuple of
# keys raised a FutureWarning and is rejected outright by pandas >= 2.0.
# Note that sum() turns missing totals (NaN) into 0.0.
CVD_no_china = (
    CVD_no_china.groupby(['Country', 'date'])[['Total Cases', 'Total Deaths']]
    .sum()
    .reset_index()
)
CVD_no_china
Country	date	Total Cases	Total Deaths
0	Afghanistan	2019-12-31	0.0	0.0
1	Afghanistan	2020-01-01	0.0	0.0
2	Afghanistan	2020-01-02	0.0	0.0
3	Afghanistan	2020-01-03	0.0	0.0
4	Afghanistan	2020-01-04	0.0	0.0
...	...	...	...	...
58679	Zimbabwe	2020-11-25	9398.0	274.0
58680	Zimbabwe	2020-11-26	9508.0	274.0
58681	Zimbabwe	2020-11-27	9623.0	274.0
58682	Zimbabwe	2020-11-28	9714.0	275.0
58683	Zimbabwe	2020-11-29	9822.0	275.0
58684 rows × 4 columns

# Order rows by Country, then date, both descending, so that the newest row of
# each country comes first within its group.
CVD_no_china = CVD_no_china.sort_values(['Country', 'date'], ascending=[False, False])
CVD_no_china
Country	date	Total Cases	Total Deaths
58683	Zimbabwe	2020-11-29	9822.0	275.0
58682	Zimbabwe	2020-11-28	9714.0	275.0
58681	Zimbabwe	2020-11-27	9623.0	274.0
58680	Zimbabwe	2020-11-26	9508.0	274.0
58679	Zimbabwe	2020-11-25	9398.0	274.0
...	...	...	...	...
4	Afghanistan	2020-01-04	0.0	0.0
3	Afghanistan	2020-01-03	0.0	0.0
2	Afghanistan	2020-01-02	0.0	0.0
1	Afghanistan	2020-01-01	0.0	0.0
0	Afghanistan	2019-12-31	0.0	0.0
58684 rows × 4 columns

# Plotting helper functions

def plot_bar(feature, value, title, df, size):
    """Draw a bar chart of the ten rows of ``df`` with the largest ``value``.

    Args:
        feature: Name of the column shown on the x-axis (one bar per row).
        value: Name of the column used both for ranking and for bar heights.
        title: Text inserted into the chart title.
        df: DataFrame containing both columns; it is not modified.
        size: Width multiplier; the figure is ``4 * size`` by 4 inches.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    # Rank by `value` and keep only the ten largest rows.
    top = df.sort_values(value, ascending=False).head(10)
    # x/y are passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally. Drawing on `ax` explicitly ties the bars to this figure.
    sns.barplot(x=feature, y=value, data=top, palette='Set2', ax=ax)
    ax.set_title("Number of {} - highest 10 values".format(title))
    plt.show()
# Keep the first row of every country. CVD_no_china is sorted newest-first,
# so the row that survives is each country's most recent one.
filtered_CVD_no_china = CVD_no_china.drop_duplicates(subset='Country', keep='first')

# One top-10 chart for cases and one for deaths.
for column, chart_title in (
    ('Total Cases', 'Total cases in the World except China'),
    ('Total Deaths', 'Total deaths in the World except China'),
):
    plot_bar('Country', column, chart_title, filtered_CVD_no_china, size=4)

filtered_CVD_no_china
Country	date	Total Cases	Total Deaths
58683	Zimbabwe	2020-11-29	9822.0	275.0
58429	Zambia	2020-11-29	17589.0	357.0
58173	Yemen	2020-11-29	2160.0	615.0
57939	Western Sahara	2020-11-29	766.0	1.0
57721	Wallis and Futuna	2020-11-29	3.0	0.0
...	...	...	...	...
1455	Angola	2020-11-29	15087.0	345.0
1202	Andorra	2020-11-29	6670.0	76.0
935	Algeria	2020-11-29	81212.0	2393.0
600	Albania	2020-11-29	36790.0	787.0
334	Afghanistan	2020-11-29	45844.0	1763.0
213 rows × 4 columns

def plot_world_aggregate(df, title='Aggregate plot', size=1):
    """Plot the 'Total Cases' and 'Total Deaths' columns of ``df`` over 'date'.

    Args:
        df: DataFrame with 'date', 'Total Cases' and 'Total Deaths' columns.
        title: Text inserted into the y-axis label and the plot title.
        size: Scale factor; the figure is ``4 * size`` by ``2 * size`` inches.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4 * size, 2 * size))
    # Each series is labelled with its own column name in the legend.
    for column, color in (('Total Cases', 'blue'), ('Total Deaths', 'red')):
        sns.lineplot(x="date", y=column, data=df, color=color, label=column)
    caption = f'Total {title} cases'
    plt.xlabel('date')
    plt.ylabel(caption)
    plt.xticks(rotation=90)
    plt.title(caption)
    ax.grid(color='black', linestyle='dotted', linewidth=0.75)
    plt.show()
# Add up the per-country totals for each date to get one series for all
# remaining countries combined.
# The numeric columns are selected explicitly: on pandas >= 2.0 a bare
# groupby(...).sum() would also aggregate the string Country column.
CVD_no_china_aggregate = (
    CVD_no_china.groupby('date')[['Total Cases', 'Total Deaths']].sum().reset_index()
)
CVD_no_china_aggregate
date	Total Cases	Total Deaths
0	2019-12-31	0.0	0.0
1	2020-01-01	0.0	0.0
2	2020-01-02	0.0	0.0
3	2020-01-03	0.0	0.0
4	2020-01-04	0.0	0.0
...	...	...	...
330	2020-11-25	59810576.0	1406042.0
331	2020-11-26	60460221.0	1418625.0
332	2020-11-27	61010116.0	1429263.0
333	2020-11-28	60612242.0	1412281.0
334	2020-11-29	59732850.0	1380975.0
335 rows × 3 columns

# Plot the summed case and death totals over time on a 16x8 inch figure.
plot_world_aggregate(CVD_no_china_aggregate, 'Rest of the World except China', size=4)

def plot_aggregate_countries(df, countries, case_type='Total Cases', size=3, is_log=False,
                             start_date='2020-02-15'):
    """Plot one line per country for ``case_type`` over time on a shared axes.

    Args:
        df: DataFrame with 'Country', 'date' and ``case_type`` columns.
        countries: Iterable of country names to draw; names with no rows after
            ``start_date`` are skipped.
        case_type: Name of the column plotted on the y-axis.
        size: Scale factor; the figure is ``4 * size`` by ``3 * size`` inches.
        is_log: When true, use a logarithmic y-axis.
        start_date: Only rows with a date strictly later than this are drawn.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4 * size, 3 * size))
    for country in countries:
        rows = df[(df['Country'] == country) & (df['date'] > start_date)]
        if rows.empty:
            # Nothing to draw or label; taking a maximum of no rows is meaningless.
            continue
        sns.lineplot(x="date", y=case_type, data=rows, label=country, ax=ax)
        # Put the country name at (latest date, highest value).
        # Series.max() skips NaN, unlike the builtin max().
        ax.text(rows['date'].max(), rows[case_type].max(), str(country))
    plt.xlabel('date')
    plt.ylabel(f' {case_type} ')
    plt.title(f' {case_type} ')
    plt.xticks(rotation=90)
    if is_log:
        ax.set(yscale="log")
    ax.grid(color='black', linestyle='dotted', linewidth=0.75)
    plt.show()

# Per-country, per-date totals; groupby also returns the rows in ascending
# Country/date order.
CVD_country_aggregate = CVD_no_china.groupby(['Country', 'date']).sum().reset_index()

countries = [
    "United States", "Italy", "Spain", "South Korea",
    "France", "Germany", "Switzerland", "India",
]

# Cumulative cases, cumulative deaths, then cumulative cases again on a log scale.
for case_type, use_log in (
    ('Total Cases', False),
    ('Total Deaths', False),
    ('Total Cases', True),
):
    plot_aggregate_countries(
        CVD_country_aggregate, countries, case_type=case_type, size=4, is_log=use_log
    )

def plot_mortality(df, title='Mainland China', size=1):
    """Plot the 'Mortality (Deaths/Cases)' column of ``df`` against 'date'.

    Args:
        df: DataFrame with 'date' and 'Mortality (Deaths/Cases)' columns.
        title: Text appended to the y-axis label and the plot title.
        size: Scale factor; the figure is ``4 * size`` by ``2 * size`` inches.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4 * size, 2 * size))
    sns.lineplot(
        x="date",
        y='Mortality (Deaths/Cases)',
        data=df,
        color='blue',
        label='Mortality (Deaths / Total Cases)',
    )
    ax.set_xlabel('date')
    ax.set_ylabel(f'Mortality {title} [%]')
    plt.xticks(rotation=90)
    ax.set_title(f'Mortality percent {title}\nCalculated as Deaths/Confirmed cases')
    ax.grid(color='black', linestyle='dashed', linewidth=1)
    plt.show()

# Deaths as a percentage of cases for the aggregated totals.
deaths = CVD_no_china_aggregate['Total Deaths']
cases = CVD_no_china_aggregate['Total Cases']
CVD_no_china_aggregate['Mortality (Deaths/Cases)'] = deaths / cases * 100
plot_mortality(CVD_no_china_aggregate, title=' - Rest of the World except China', size=3)

저작자표시 (새창열림)

'인공지능 공부 > 딥러닝 논문읽기' 카테고리의 다른 글

( 자연어처리) 기초 키워드 분석 (0)	2021.11.02
(딥러닝) Deep Learning age and gender detection 나이를 추측해보자 (0)	2021.10.29
파이썬 딥러닝 텐서플로 복잡한모델 생성 (0)	2021.09.06
파이썬 딥러닝 텐서플로 기초부터 (0)	2021.09.06
(딥러닝) 수치미분 코드 구현하기 (0)	2021.07.27

'인공지능 공부 > 딥러닝 논문읽기' 카테고리의 다른 글

티스토리툴바