In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
In [63]:
df=pd.read_csv('covid-19.csv')
In [64]:
df.head()
Out[64]:
| dateRep | day | month | year | cases | deaths | countriesAndTerritories | geoId | countryterritoryCode | popData2019 | continentExp | Cumulative_number_for_14_days_of_COVID-19_cases_per_100000 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14/12/2020 | 14 | 12 | 2020 | 746 | 6 | Afghanistan | AF | AFG | 38041757.0 | Asia | 9.013779 |
| 1 | 13/12/2020 | 13 | 12 | 2020 | 298 | 9 | Afghanistan | AF | AFG | 38041757.0 | Asia | 7.052776 |
| 2 | 12/12/2020 | 12 | 12 | 2020 | 113 | 11 | Afghanistan | AF | AFG | 38041757.0 | Asia | 6.868768 |
| 3 | 11/12/2020 | 11 | 12 | 2020 | 63 | 10 | Afghanistan | AF | AFG | 38041757.0 | Asia | 7.134266 |
| 4 | 10/12/2020 | 10 | 12 | 2020 | 202 | 16 | Afghanistan | AF | AFG | 38041757.0 | Asia | 6.968658 |
In [65]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61900 entries, 0 to 61899 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dateRep 61900 non-null object 1 day 61900 non-null int64 2 month 61900 non-null int64 3 year 61900 non-null int64 4 cases 61900 non-null int64 5 deaths 61900 non-null int64 6 countriesAndTerritories 61900 non-null object 7 geoId 61625 non-null object 8 countryterritoryCode 61777 non-null object 9 popData2019 61777 non-null float64 10 continentExp 61900 non-null object 11 Cumulative_number_for_14_days_of_COVID-19_cases_per_100000 59021 non-null float64 dtypes: float64(2), int64(5), object(5) memory usage: 5.7+ MB
In [66]:
df.describe()
Out[66]:
| day | month | year | cases | deaths | popData2019 | Cumulative_number_for_14_days_of_COVID-19_cases_per_100000 | |
|---|---|---|---|---|---|---|---|
| count | 61900.000000 | 61900.000000 | 61900.000000 | 61900.000000 | 61900.000000 | 6.177700e+04 | 59021.000000 |
| mean | 15.628934 | 7.067157 | 2019.998918 | 1155.147237 | 26.055460 | 4.098770e+07 | 66.320586 |
| std | 8.841582 | 2.954776 | 0.032882 | 6779.224479 | 131.227055 | 1.531294e+08 | 162.329240 |
| min | 1.000000 | 1.000000 | 2019.000000 | -8261.000000 | -1918.000000 | 8.150000e+02 | -147.419587 |
| 25% | 8.000000 | 5.000000 | 2020.000000 | 0.000000 | 0.000000 | 1.293120e+06 | 0.757526 |
| 50% | 15.000000 | 7.000000 | 2020.000000 | 15.000000 | 0.000000 | 7.169456e+06 | 6.724045 |
| 75% | 23.000000 | 10.000000 | 2020.000000 | 273.000000 | 4.000000 | 2.851583e+07 | 52.572719 |
| max | 31.000000 | 12.000000 | 2020.000000 | 234633.000000 | 4928.000000 | 1.433784e+09 | 1900.836210 |
In [67]:
#changing the column names
df.columns=['date','day','month','year','cases','deaths','country','old_country_code','country_code','population','continent','Cum._num_for_14_days_per_100000']
In [68]:
#Dropping the redundant column name
df.drop(['old_country_code'],axis=1,inplace=True)
In [69]:
df1=df[df.month!=12]
In [70]:
df1.isna().sum().sum()/len(df1)
Out[70]:
0.05121321280500238
In [71]:
df1.dropna(inplace=True)
C:\Users\Nikolas\AppData\Local\Temp\ipykernel_12224\3614008390.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df1.dropna(inplace=True)
In [72]:
df_by_country=df1.groupby('country')['cases','deaths'].sum()
df_by_country
C:\Users\Nikolas\AppData\Local\Temp\ipykernel_12224\3922391302.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
df_by_country=df1.groupby('country')['cases','deaths'].sum()
Out[72]:
| cases | deaths | |
|---|---|---|
| country | ||
| Afghanistan | 45844 | 1763 |
| Albania | 37555 | 796 |
| Algeria | 82221 | 2410 |
| Andorra | 6524 | 76 |
| Angola | 15095 | 344 |
| … | … | … |
| Vietnam | 1343 | 35 |
| Western_Sahara | 760 | 1 |
| Yemen | 2076 | 605 |
| Zambia | 17573 | 357 |
| Zimbabwe | 9942 | 275 |
212 rows × 2 columns
In [73]:
def rate1(x,y):
return (x/y)*100
In [74]:
#Adding a new column for the mortality rate which is the ratio of the number of deaths to cases
#df_by_country['mortality_rate']=df_by_country['deaths']/df_by_ country['cases']
df_by_country['mortality_rate']=df_by_country.apply(lambda x: rate1(x['deaths'], x['cases']),axis=1)
C:\Users\Nikolas\AppData\Local\Temp\ipykernel_12224\2635256975.py:2: RuntimeWarning: invalid value encountered in longlong_scalars return (x/y)*100
In [75]:
df_by_country
Out[75]:
| cases | deaths | mortality_rate | |
|---|---|---|---|
| country | |||
| Afghanistan | 45844 | 1763 | 3.845650 |
| Albania | 37555 | 796 | 2.119558 |
| Algeria | 82221 | 2410 | 2.931125 |
| Andorra | 6524 | 76 | 1.164929 |
| Angola | 15095 | 344 | 2.278900 |
| … | … | … | … |
| Vietnam | 1343 | 35 | 2.606106 |
| Western_Sahara | 760 | 1 | 0.131579 |
| Yemen | 2076 | 605 | 29.142582 |
| Zambia | 17573 | 357 | 2.031526 |
| Zimbabwe | 9942 | 275 | 2.766043 |
212 rows × 3 columns
In [83]:
#Sorting the values for the mortality rate in the descending order
plt.figure(figsize=(15,10))
ax=df_by_country['mortality_rate'].sort_values(ascending=False).head(20).plot(kind='bar')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
for p in ax.patches:
ax.annotate(p.get_height().round(2),(p.get_x()+p.get_width()/2,p.get_height()),ha='center',va='bottom')
ax.set_xlabel("Country")
ax.set_ylabel("Mortality rate")
ax.set_title("Countries with highest mortality rates")
Out[83]:
Text(0.5, 1.0, 'Countries with highest mortality rates')
In [84]:
#Pie chart showing the countries with the highest number of COVID cases
df_cases=df_by_country['cases'].sort_values(ascending=False)
ax=df_cases.head(10).plot(kind='pie',autopct='%.2f%%',labels=df_cases.index,figsize=(12,8))
ax.set_title("Top ten countries by case load")
Out[84]:
Text(0.5, 1.0, 'Top ten countries by case load')
In [87]:
#sorting the number of deaths in the descending order
plt.figure(figsize=(10,6))
ax=df_by_country['deaths'].sort_values(ascending=False).head(5).plot(kind='bar')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
for p in
