In [62]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [63]:

df=pd.read_csv('covid-19.csv') 

In [64]:

df.head() 

Out[64]:

	dateRep	day	month	year	cases	deaths	countriesAndTerritories	geoId	countryterritoryCode	popData2019	continentExp	Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0	14/12/2020	14	12	2020	746	6	Afghanistan	AF	AFG	38041757.0	Asia	9.013779
1	13/12/2020	13	12	2020	298	9	Afghanistan	AF	AFG	38041757.0	Asia	7.052776
2	12/12/2020	12	12	2020	113	11	Afghanistan	AF	AFG	38041757.0	Asia	6.868768
3	11/12/2020	11	12	2020	63	10	Afghanistan	AF	AFG	38041757.0	Asia	7.134266
4	10/12/2020	10	12	2020	202	16	Afghanistan	AF	AFG	38041757.0	Asia	6.968658

In [65]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61900 entries, 0 to 61899
Data columns (total 12 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   dateRep                                                     61900 non-null  object 
 1   day                                                         61900 non-null  int64  
 2   month                                                       61900 non-null  int64  
 3   year                                                        61900 non-null  int64  
 4   cases                                                       61900 non-null  int64  
 5   deaths                                                      61900 non-null  int64  
 6   countriesAndTerritories                                     61900 non-null  object 
 7   geoId                                                       61625 non-null  object 
 8   countryterritoryCode                                        61777 non-null  object 
 9   popData2019                                                 61777 non-null  float64
 10  continentExp                                                61900 non-null  object 
 11  Cumulative_number_for_14_days_of_COVID-19_cases_per_100000  59021 non-null  float64
dtypes: float64(2), int64(5), object(5)
memory usage: 5.7+ MB

In [66]:

df.describe() 

Out[66]:

	day	month	year	cases	deaths	popData2019	Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
count	61900.000000	61900.000000	61900.000000	61900.000000	61900.000000	6.177700e+04	59021.000000
mean	15.628934	7.067157	2019.998918	1155.147237	26.055460	4.098770e+07	66.320586
std	8.841582	2.954776	0.032882	6779.224479	131.227055	1.531294e+08	162.329240
min	1.000000	1.000000	2019.000000	-8261.000000	-1918.000000	8.150000e+02	-147.419587
25%	8.000000	5.000000	2020.000000	0.000000	0.000000	1.293120e+06	0.757526
50%	15.000000	7.000000	2020.000000	15.000000	0.000000	7.169456e+06	6.724045
75%	23.000000	10.000000	2020.000000	273.000000	4.000000	2.851583e+07	52.572719
max	31.000000	12.000000	2020.000000	234633.000000	4928.000000	1.433784e+09	1900.836210

In [67]:

#changing the column names
df.columns=['date','day','month','year','cases','deaths','country','old_country_code','country_code','population','continent','Cum._num_for_14_days_per_100000']

In [68]:

#Dropping the redundant column name 
df.drop(['old_country_code'],axis=1,inplace=True)

In [69]:

df1=df[df.month!=12]

In [70]:

df1.isna().sum().sum()/len(df1)

Out[70]:

0.05121321280500238

In [71]:

df1.dropna(inplace=True)

C:\Users\Nikolas\AppData\Local\Temp\ipykernel_12224\3614008390.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.dropna(inplace=True)

In [72]:

df_by_country=df1.groupby('country')['cases','deaths'].sum() 
df_by_country

C:\Users\Nikolas\AppData\Local\Temp\ipykernel_12224\3922391302.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  df_by_country=df1.groupby('country')['cases','deaths'].sum()

Out[72]:

	cases	deaths
country
Afghanistan	45844	1763
Albania	37555	796
Algeria	82221	2410
Andorra	6524	76
Angola	15095	344
…	…	…
Vietnam	1343	35
Western_Sahara	760	1
Yemen	2076	605
Zambia	17573	357
Zimbabwe	9942	275

212 rows × 2 columns

In [73]:

def rate1(x,y):
    return (x/y)*100

In [74]:

#Adding a new column for the mortality rate which is the ratio of the number of deaths to cases 
#df_by_country['mortality_rate']=df_by_country['deaths']/df_by_ country['cases']
df_by_country['mortality_rate']=df_by_country.apply(lambda x: rate1(x['deaths'], x['cases']),axis=1)

C:\Users\Nikolas\AppData\Local\Temp\ipykernel_12224\2635256975.py:2: RuntimeWarning: invalid value encountered in longlong_scalars
  return (x/y)*100

In [75]:

df_by_country

Out[75]:

	cases	deaths	mortality_rate
country
Afghanistan	45844	1763	3.845650
Albania	37555	796	2.119558
Algeria	82221	2410	2.931125
Andorra	6524	76	1.164929
Angola	15095	344	2.278900
…	…	…	…
Vietnam	1343	35	2.606106
Western_Sahara	760	1	0.131579
Yemen	2076	605	29.142582
Zambia	17573	357	2.031526
Zimbabwe	9942	275	2.766043

212 rows × 3 columns

In [83]:

#Sorting the values for the mortality rate in the descending order 
plt.figure(figsize=(15,10)) 
ax=df_by_country['mortality_rate'].sort_values(ascending=False).head(20).plot(kind='bar') 
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") 
for p in ax.patches:
    ax.annotate(p.get_height().round(2),(p.get_x()+p.get_width()/2,p.get_height()),ha='center',va='bottom') 
ax.set_xlabel("Country") 
ax.set_ylabel("Mortality rate") 
ax.set_title("Countries with highest mortality rates")

Out[83]:

Text(0.5, 1.0, 'Countries with highest mortality rates')

In [84]:

#Pie chart showing the countries with the highest number of COVID cases 
df_cases=df_by_country['cases'].sort_values(ascending=False) 
ax=df_cases.head(10).plot(kind='pie',autopct='%.2f%%',labels=df_cases.index,figsize=(12,8)) 
ax.set_title("Top ten countries by case load") 

Out[84]:

Text(0.5, 1.0, 'Top ten countries by case load')

In [87]:

#sorting the number of deaths in the descending order

plt.figure(figsize=(10,6))

ax=df_by_country['deaths'].sort_values(ascending=False).head(5).plot(kind='bar')

ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

for p in