# Import all the required packages
import pandas as pd
from scipy.signal import find_peaks
from scipy.signal import argrelextrema
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
1. Creating a new dataframe
# Get raw covid data for NL from CoronaWatchNL (https://github.com/J535D165/CoronaWatchNL)
covid_nl_url = 'https://raw.githubusercontent.com/J535D165/CoronaWatchNL/master/data-geo/data-provincial/RIVM_NL_provincial.csv'
# Provincial level chosen: the municipal-level data has a lot of missing values, giving inaccurate results.
df_c = pd.read_csv(covid_nl_url)
# Get 2020 and 2021 Google mobility reports for the Netherlands (local CSV exports).
df_m2020 = pd.read_csv('data/2020_NL_Region_Mobility_Report.csv')
df_m2021 = pd.read_csv('data/2021_NL_Region_Mobility_Report.csv')
# Get population information for the Netherlands (CBS key figures, 2021).
df_pop = pd.read_csv('data/CBS_KernCijfers_2021.csv')
# Get the RIVM estimate of the reproduction number R.
df_r = pd.read_json('data/reproductiegetal_RIVM.json')
# ---- Clean and combine the raw datasets into one frame `df` ----
# Parse the COVID date column to pandas datetimes.
df_c['Datum'] = pd.to_datetime(df_c['Datum'])
# Keep only rows with an associated province: COVID data is minimal below the
# provincial level and the municipal level has too many gaps.
# .copy() gives an owned frame so the fillna writes below cannot hit a view
# (avoids SettingWithCopyWarning / lost writes under copy-on-write).
df_c = df_c[df_c['Provincienaam'].notna()].copy()
# Missing daily counts mean "no cases reported", so 0 is the right fill.
df_c['Aantal'] = df_c['Aantal'].fillna(0)
df_c['AantalCumulatief'] = df_c['AantalCumulatief'].fillna(0)
# Mobility: keep province-level rows only (sub_region_1 set, sub_region_2 empty).
df_m2020 = df_m2020[df_m2020['sub_region_2'].isna() & df_m2020['sub_region_1'].notna()]
df_m2021 = df_m2021[df_m2021['sub_region_2'].isna() & df_m2021['sub_region_1'].notna()]
# Stack the 2020 and 2021 Google mobility rows into one frame.
df_m = pd.concat([df_m2020, df_m2021])
# Parse the mobility date column.
df_m['date'] = pd.to_datetime(df_m['date'])
# Keep only the columns needed from the Google mobility dataset.
df_m = df_m[['sub_region_1', 'date',
             'retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
             'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
             'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline']]
# Short names for the mobility columns.
df_m.columns = ['province', 'date', 'retail', 'grocery', 'parks', 'transit', 'workplaces', 'residential']
# Translate the English province names to Dutch so they join with the COVID data.
# Plain assignment instead of `replace(..., inplace=True)`: inplace replace on a
# frame derived from slicing is deprecated chained assignment in newer pandas
# and may silently not stick.
province_map = {'North Brabant': 'Noord-Brabant', 'North Holland': 'Noord-Holland', 'South Holland': 'Zuid-Holland'}
df_m['province'] = df_m['province'].replace(province_map)
# Rename columns of the NL COVID dataset to English short names.
df_c = df_c.rename(columns={'Datum': 'date', 'Provincienaam': 'province', 'Provinciecode': 'c_province_code',
                            'Type': 'c_type', 'Aantal': 'c_number', 'AantalCumulatief': 'c_total_number'})
# Drop columns that are not used further on.
df_c = df_c.drop(columns=['c_province_code', 'c_total_number'])
# Pivot the long 'Type' rows (deaths / infections / hospital) into columns.
df_c = df_c.pivot_table('c_number', ['date', 'province'], 'c_type').reset_index()
# Name the pivoted columns (pivot orders the c_type values alphabetically).
df_c.columns = ['date', 'province', 'died', 'total_infections', 'hospital']
# Clean the RIVM reproduction-number data: keep date + average estimate.
df_r = df_r[['Date', 'Rt_avg']]
df_r.columns = ['date', 'r']
# Merge everything on date/province into one final dataframe.
df = pd.merge(df_c, df_m, how='left', on=['date', 'province'])
df = pd.merge(df, df_pop, how='left', on=['province'])
df = pd.merge(df, df_r, how='left', on=['date'])
# Describe the dataset (the first entry was mislabeled 'data' and the
# 'hospital' column created during the pivot was missing).
"""
---- Description of Dataset ----
date -> Describes the joined date shared by all datasets
province -> Describes the joined province shared by all datasets
died -> Describes the number of people who died that day due to COVID
total_infections -> Describes the number of total infections on that day due to covid
hospital -> Describes the number of hospital admissions on that day due to COVID
retail -> Describes the percent change from baseline in retail and recreation trips
grocery -> Describes the percent change from baseline in grocery and pharmacy trips
parks -> Describes the percent change from baseline in park trips
transit -> Describes the percent change from baseline in transit stations trips
workplaces -> Describes the percent change from baseline in workplaces trips
residential -> Describes the percent change from baseline in residential trips
population -> Describes the total population in a province
r -> Describe the estimated R (infection number) value. Estimated by the RIVM.
---------------------------------
"""
# Print head of data
df.head()
date | province | died | total_infections | hospital | retail | grocery | parks | transit | workplaces | residential | population | r | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-02-27 | Drenthe | 0.0 | 0.0 | 0.0 | 3.0 | 7.0 | 24.0 | -2.0 | 4.0 | 1.0 | 494771 | 2.06 |
1 | 2020-02-27 | Flevoland | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | -4.0 | 3.0 | 1.0 | 1.0 | 428226 | 2.06 |
2 | 2020-02-27 | Friesland | 0.0 | 0.0 | 0.0 | 2.0 | 3.0 | 10.0 | -1.0 | 4.0 | 1.0 | 651435 | 2.06 |
3 | 2020-02-27 | Gelderland | 0.0 | 0.0 | 0.0 | 2.0 | -2.0 | -3.0 | -18.0 | -21.0 | 3.0 | 2096603 | 2.06 |
4 | 2020-02-27 | Groningen | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 5.0 | -1.0 | 4.0 | 1.0 | 586937 | 2.06 |
1. Peak Finding
1.1 Peak Finding Function
# Define the peak-finding function for one province and one signal.
# Peaks are found per (province, signal) pair because the data is categorized
# that way; overall peaks across provinces would not be meaningful.
def scipy_peaks(data, find, province, **kwargs):
    """Locate strict local maxima and minima of one column for one province.

    Args:
        data (DataFrame): must contain 'date', 'province' and the `find` column.
        find (str): name of the numeric column to scan (mobility or COVID signal).
        province (str): province whose rows are scanned.
        **kwargs: unused; kept for interface compatibility.

    Returns:
        tuple[list, list]: (max_ind, min_ind) — index labels of `data` at the
        strict local maxima resp. minima. `argrelextrema` with np.greater /
        np.less is strict, so flat plateaus of equal values are not reported.

    Raises:
        KeyError: when 'date' or the `find` column is missing.
    """
    # Explicit membership test instead of try/bare-except: the original except
    # block referenced an undefined name (`activity`) and would have raised a
    # NameError instead of the intended message.
    if find not in data.columns or 'date' not in data.columns:
        raise KeyError(f"The 'date' column is missing or the column {find} does not exist within the dataset")
    # One row per date within a single province.
    subset = data[data['province'] == province]
    values = subset[find].values
    # Map positions returned by argrelextrema back to index labels of `data`.
    max_ind = [subset.index[i] for i in argrelextrema(values, np.greater)[0]]
    min_ind = [subset.index[i] for i in argrelextrema(values, np.less)[0]]
    return max_ind, min_ind
# Signals to scan: the six mobility activities plus the three COVID counts.
peak_find_list = ['retail', 'grocery', 'parks', 'transit', 'workplaces', 'residential', 'died', 'total_infections',
                  'hospital']
province_list = df.province.unique().tolist()
# Detect extrema for every (province, signal) combination up front.
peaks = {}
for province in province_list:
    peaks[province] = {}
    for find in peak_find_list:
        max_ind, min_ind = scipy_peaks(df, find, province)
        # Store strict maxima and strict minima side by side per signal.
        peaks[province][find] = {'max_ind': max_ind, 'min_ind': min_ind}
1.2 Peak Finding Example Plot
# Visualise the detected peaks/valleys for one province and one signal.
min_date = '2021-01-01'
max_date = '2021-02-01'
activity = 'total_infections'
province = 'Limburg'
# Rows at the detected extrema. df was rebuilt with a fresh RangeIndex, so the
# stored index labels double as positions for .iloc.
df_max_ind = df.iloc[peaks[province][activity]['max_ind']]
df_min_ind = df.iloc[peaks[province][activity]['min_ind']]
# Restrict everything to the plotting window for a readable plot.
df_f = df[(df['province'] == province)
          & (df['date'] >= pd.to_datetime(min_date))
          & (df['date'] <= pd.to_datetime(max_date))]
df_prov_max = df_max_ind[(df_max_ind['date'] >= pd.to_datetime(min_date))
                         & (df_max_ind['date'] <= pd.to_datetime(max_date))]
df_prov_min = df_min_ind[(df_min_ind['date'] >= pd.to_datetime(min_date))
                         & (df_min_ind['date'] <= pd.to_datetime(max_date))]
# Daily line with peak/valley markers drawn on top.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(f'{activity} {province}')
sns.lineplot(x='date', y=activity, data=df_f, label='daily', ax=ax, alpha=0.2)
sns.scatterplot(x='date', y=activity, data=df_prov_max, label='peaks', ax=ax, marker='^', color='r', s=100)
sns.scatterplot(x='date', y=activity, data=df_prov_min, label='valleys', ax=ax, marker='o', color='g', s=100)
<AxesSubplot:title={'center':'total_infections Limburg'}, xlabel='date', ylabel='total_infections'>
2. Algorithm Explained (Pseudo)
Below, the main algorithm for finding overlapping peaks can be found. It is chosen to find common peaks between one activity of two provinces, and one COVID metric of two provinces, as it is interesting to see whether peaks overlap across areas.
The input data is the activity/covid data column that will be researched. Also the peaks found are used as input and the provinces of interest.
The algorithm starts by taking the first peak within province 1 and applying an offset to the date of this peak. Then the peaks of province 2 are compared with the offset date window. If the condition results in True, we can append the index to the common indexes list for this activity/COVID metric. Once this has run for all province 2 data, we have compared the first province 1 peak (with date offset) with all province 2 peaks. We can then continue with the second peak of province 1, and so on.
Notice that this dictionary is called peak data but this also implies valley data. As the peak dictionary will be used, which both consist of max indexes and min indexes (so valleys and peaks).
3. Common Peak of 1 Activity and 2 Provinces
# Define function to find common peaks and valleys between two provinces.
def find_common_peaks(df, province_1, province_2, activity_or_covid, date_offset):
    """Find extrema of one signal that two provinces share in time.

    A peak (valley) of `province_1` matches one of `province_2` when the
    second date falls within +/- `date_offset` days of the first.

    Note: reads the module-level `peaks` dict built earlier in this script.
    Bug fix: the original looked up `peaks[province][activity]` using the
    *global* `activity` variable instead of the `activity_or_covid` parameter;
    it only worked because the notebook happened to set the global first.

    Args:
        df (DataFrame): merged dataset; column 0 must be 'date'.
        province_1, province_2 (str): province names present in `peaks`.
        activity_or_covid (str): column/signal whose extrema are compared.
        date_offset (int): allowed distance in days between matched extrema.

    Returns:
        dict: {'peaks': {date_str: {'Province1': i, 'Province2': j}},
               'valleys': {...}} with row indexes into `df`.
    """
    common_peaks_dict = {'peaks': {}, 'valleys': {}}
    # Select the pre-computed extrema for both provinces.
    peaks_province_1 = peaks[province_1][activity_or_covid]
    peaks_province_2 = peaks[province_2][activity_or_covid]
    # Positional column index for .iloc lookups.
    column_ind = df.columns.get_loc(activity_or_covid)
    # The max ('peaks') and min ('valleys') passes are identical except for the
    # index list they scan, so drive both from one loop.
    for ind_key, out_key in (('max_ind', 'peaks'), ('min_ind', 'valleys')):
        for prov1_peak in peaks_province_1[ind_key]:
            current_peak = df.iloc[prov1_peak, [0, column_ind]]
            # Tolerance window around the province-1 extremum.
            min_date = current_peak['date'] - pd.DateOffset(date_offset)
            max_date = current_peak['date'] + pd.DateOffset(date_offset)
            for prov2_peak in peaks_province_2[ind_key]:
                prov2_current_peak = df.iloc[prov2_peak, [0, column_ind]]
                if min_date <= prov2_current_peak['date'] <= max_date:
                    # NOTE: keyed by date, so several province-2 matches for the
                    # same province-1 date keep only the last one (as original).
                    common_peaks_dict[out_key][current_peak['date'].strftime('%Y-%m-%d')] = {
                        'Province1': prov1_peak, 'Province2': prov2_peak}
    return common_peaks_dict
# Find common peaks for the workplaces activity between two provinces.
# Workplaces is chosen because the advice to work from home applied to the
# whole Netherlands at the same time, so overlapping peaks are expected.
province_1, province_2 = 'Limburg', 'Groningen'
activity = 'workplaces'
date_offset = 2
common_peak_dict_activity = find_common_peaks(df, province_1, province_2, activity, date_offset)
Above the function is created that finds common peaks with a certain date offset in mind. Then this function is called to find the overlapping indexes of workplace peaks for Limburg and Groningen.
The example of Limburg, Groningen and the workplaces activity is also plotted (with a small date range) to visualize the results. The code is omitted but the result is shown below to clarify the outcome. Both the Limburg workplace line and the Groningen workplace line are plotted. The indexes of the peaks (not valleys) that overlap with a date offset of two days are shown. You can see that the data moves up almost identically and overlapping peaks are found.
4. Common Peak 1 Covid Data Between 2 Provinces
# Find common total-infection peaks between the same two provinces.
province_1, province_2 = 'Limburg', 'Groningen'
activity = 'total_infections'
date_offset = 2
# Result structure: {'peaks': {'date': {'Province1': i1, 'Province2': i2}}, 'valleys': {...}}
common_peak_dict_covid = find_common_peaks(df, province_1, province_2, activity, date_offset)
5. Common Peak Covid Data and Movement Data within Zuid Holland
5.1 Common Peak Finding
def common_one_province(df, province, activity, covid, date_offset):
    """Find extrema of a mobility signal and a COVID signal that coincide
    within one province.

    An activity peak (valley) matches a covid peak (valley) when the covid
    date falls within +/- `date_offset` days of the activity date.

    Note: reads the module-level `peaks` dict built earlier in this script.

    Args:
        df (DataFrame): merged dataset; column 0 must be 'date'.
        province (str): province name present in `peaks`.
        activity (str): mobility column (e.g. 'workplaces').
        covid (str): COVID column (e.g. 'hospital').
        date_offset (int): allowed distance in days between matched extrema.

    Returns:
        dict: {'peaks': {date_str: {'activity': i, 'covid': j}},
               'valleys': {...}} with row indexes into `df`.
    """
    common_peaks_dict = {'peaks': {}, 'valleys': {}}
    # Pre-computed extrema for both signals within this province.
    peaks_activity = peaks[province][activity]
    peaks_covid = peaks[province][covid]
    # Positional column indexes for .iloc lookups.
    activity_ind = df.columns.get_loc(activity)
    covid_ind = df.columns.get_loc(covid)
    # The max ('peaks') and min ('valleys') passes were duplicated in the
    # original; they differ only in which index list is scanned.
    for ind_key, out_key in (('max_ind', 'peaks'), ('min_ind', 'valleys')):
        for ac_peak in peaks_activity[ind_key]:
            ac_current_peak = df.iloc[ac_peak, [0, activity_ind]]
            # Tolerance window around the activity extremum.
            min_date = ac_current_peak['date'] - pd.DateOffset(date_offset)
            max_date = ac_current_peak['date'] + pd.DateOffset(date_offset)
            for cov_peak in peaks_covid[ind_key]:
                cov_current_peak = df.iloc[cov_peak, [0, covid_ind]]
                if min_date <= cov_current_peak['date'] <= max_date:
                    # Keyed by date: a later covid match for the same activity
                    # date overwrites an earlier one (same as the original).
                    common_peaks_dict[out_key][ac_current_peak['date'].strftime('%Y-%m-%d')] = {
                        'activity': ac_peak, 'covid': cov_peak}
    return common_peaks_dict
# Workplaces and hospital admissions are compared because it is interesting to
# see whether hospital peaks overlap with work movements: when admissions rise,
# people may want to work from home more often.
# Zuid-Holland is chosen out of personal interest (the author lives there).
# A date offset of 2 gives enough flexibility while keeping the range valid.
province = 'Zuid-Holland'
activity = 'workplaces'
covid = 'hospital'
date_offset = 2
# Result stored as a dictionary of matched index pairs.
ac_cov_common = common_one_province(df, province, activity, covid, date_offset)
5.2 Common Peak Plotting
# Plot the matched activity/covid extrema for one province and date window.
min_date = '2021-01-01'
max_date = '2021-02-01'
province = 'Zuid-Holland'
activity = 'workplaces'
covid = 'hospital'
date_offset = 2
ac_cov_common = common_one_province(df, province, activity, covid, date_offset)
# Flatten the matched index pairs into two parallel lists.
ac_ind = []
covid_ind = []
for matches in ac_cov_common.values():      # 'peaks' and 'valleys'
    for pair in matches.values():           # keyed by date string
        ac_ind.append(pair['activity'])
        covid_ind.append(pair['covid'])
# Rows at the matched extrema (RangeIndex, so labels double as positions).
df_peak_ac = df.iloc[ac_ind]
df_peak_cov = df.iloc[covid_ind]
# Restrict everything to the plotting window for a readable plot.
df_f = df[df['province'] == province]
df_f = df_f[(df_f['date'] >= pd.to_datetime(min_date)) & (df_f['date'] <= pd.to_datetime(max_date))]
df_ac = df_peak_ac[(df_peak_ac['date'] >= pd.to_datetime(min_date)) & (df_peak_ac['date'] <= pd.to_datetime(max_date))]
df_cov = df_peak_cov[
    (df_peak_cov['date'] >= pd.to_datetime(min_date)) & (df_peak_cov['date'] <= pd.to_datetime(max_date))]
# Daily lines for both signals with markers at the matched extrema.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(f'{activity} {province}')
sns.lineplot(x='date', y=activity, data=df_f, label='daily', ax=ax, alpha=0.2)
sns.lineplot(x='date', y=covid, data=df_f, label='daily', ax=ax, alpha=0.2)
sns.scatterplot(x='date', y=activity, data=df_ac, label=f'peaks/valleys {activity}', ax=ax, marker='^', color='r',
                s=100)
sns.scatterplot(x='date', y=covid, data=df_cov, label=f'peaks/valleys {covid}', ax=ax, marker='^', color='g', s=100)
<AxesSubplot:title={'center':'workplaces Zuid-Holland'}, xlabel='date', ylabel='workplaces'>
Zuid-Holland will be investigated as I'm interested in this area. It is interesting to see whether there is a real relation between the activities of people and the development of COVID. As within the Netherlands many decisions are made based on the current COVID developments in terms of hospitalization rates, this data likely leads to new measures or lockdowns, and the data possibly correlates. This relation will therefore be explored. This can best be done with an initial exploration using a correlation matrix, quickly giving an overview of all the numerical correlations.
# Build a weekly Zuid-Holland dataset and plot its correlation matrix.
# .copy() so the new 'week' column is written to an owned frame, not a slice.
df = df[df['province'] == 'Zuid-Holland'].copy()
# Week label 'WW-YYYY' used as the grouping key.
df['week'] = df['date'].dt.strftime('%W-%Y')
# First calendar date of each week, used as that week's x-axis position.
df_first_date = df.groupby('week')[['date']].min().reset_index()
df_first_date.columns = ['week', 'first_date']
df = pd.merge(df, df_first_date, how='left', on=['week'])
# Weekly totals for the COVID counts. Selecting columns *before* aggregating
# avoids summing non-numeric columns (an error in pandas >= 2.0).
covid_sum = df.groupby(['week', 'first_date'])[['hospital', 'died', 'total_infections']].sum().sort_values(
    'first_date').reset_index()
# Weekly means for the mobility baseline changes, same pattern.
activity_mean = df.groupby(['week', 'first_date'])[
    ['retail', 'grocery', 'parks', 'transit', 'workplaces', 'residential']].mean().sort_values(
    'first_date').reset_index()
df_joined = pd.merge(covid_sum, activity_mean, how='left', on=['week', 'first_date'])
# Correlation over numeric columns only (the string 'week' column would make
# corr() raise in pandas >= 2.0); compute it once for mask and heatmap.
corr = df_joined.corr(numeric_only=True)
# Upper-triangle mask so each correlation is shown exactly once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True);
The resulting plot above is interesting: it looks like there is a negative correlation between retail/transit movements and hospitalization rates, as it shows values of around -0.66 and -0.61 respectively within the correlation matrix. Therefore the relation will be plotted separately below; a regression plot is appropriate. We can observe the data points and a fitted line, giving a lot of information about the correlations. You can observe that retail and transit movements indeed show a negative correlation with hospitalization rates. This is likely influenced by the lockdowns imposed during these periods, which will be investigated more thoroughly below.
# Regression plots for the two strongest (negative) correlations with hospital.
fig, ax = plt.subplots(1, 2, sharex=True, figsize=(20, 5))
for axis, (col, title) in zip(ax, [('retail', 'Retail and Hospital'), ('transit', 'Transit and Hospital')]):
    axis.set_title(title)
    sns.regplot(data=df_joined, x='hospital', y=col, ax=axis)
<AxesSubplot:title={'center':'Transit and Hospital'}, xlabel='hospital', ylabel='transit'>
It is interesting to compare the strongest correlation — retail and hospitalization — in separate line graphs and see whether we can discover the story behind the numbers. By plotting separate line graphs we can see the movements of the data over time and investigate the possible relations of these movements with COVID regulations/rules within the Netherlands.
# Side-by-side weekly time series for the strongest correlated pair.
fig, ax = plt.subplots(1, 2, sharex=True, figsize=(20, 5))
ax[0].set_title('Retail Change')
# NOTE(review): 'Hospitality' here presumably means hospitalization — confirm.
ax[1].set_title('Hospitality Change')
sns.lineplot(data=df_joined, x='first_date', y='retail', ax=ax[0])
sns.lineplot(data=df_joined, x='first_date', y='hospital', ax=ax[1])
<AxesSubplot:title={'center':'Hospitality Change'}, xlabel='first_date', ylabel='hospital'>
From these graphs the story becomes clear: a steep rise in the hospitalization rate is observed in mid-March 2020, which is in line with news articles about hospitalizations during this period. This was also the period in which the 'intelligent lockdown' was introduced in the Netherlands: sport clubs closed, people were advised to stay home as much as possible, and work was to be done remotely. The intelligent lockdown ended around May, which can be observed from the decrease in hospitalization rates (2020-05) and the increase in retail movements. The lockdown in mid-October is also clear within the retail-change graph, as fewer movements were observed during this period. The large drop in December 2020 and January 2021 can be attributed to the hard lockdown of 12-2020, when almost all stores (except essential stores like supermarkets) needed to close down. These key data points are again annotated in the interactive plot below together with the weekly retail-movement data. This plot is appropriate as it allows zooming in on specific data points and clearly annotates important dates.
Sources:
# Interactive weekly retail-change line; the legend on the right allows
# toggling traces, and annotations mark the main lockdown moments.
plot = px.line(df_joined, 'first_date', 'retail',
               labels={
                   # Human-readable axis labels.
                   "first_date": "Date",
                   "retail": "Weekly Retail Change",
               }, )
# Add a range selector/slider to zoom in on date ranges along the x-axis.
plot.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                # Quick-zoom button: step back one day from the end of range.
                dict(count=1,
                     step="day",
                     stepmode="backward"),
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
    )
)
# Annotate the key policy moments; (x, y) is the data point, (ax, ay) the
# label position in data coordinates (axref/ayref='x'/'y').
plot.add_annotation(x='2020-03-12', y=-20, ax='2020-04-12', ay=-15, text='Intelligent Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
plot.add_annotation(x='2020-10-13', y=-24, ax='2020-10-29', ay=-15, text='Partial Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
plot.add_annotation(x='2020-11-03', y=-35, ax='2020-10-29', ay=-45, text='Increase Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
plot.add_annotation(x='2020-12-14', y=-30, ax='2021-01-01', ay=-20, text='Full Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
# Render the interactive figure.
plot.show()
Another interesting observation is that the data about people who passed away from COVID follows the same curves as the hospitalization rates. This indicates that hospital admissions are indeed a good indicator for new government guidelines, as hospitalization in many cases can lead to more deaths within the population. As multiple lines are plotted and we want to zoom in on the graph, the plot below is appropriate for this data.
# Reshape to long format so 'died' and 'hospital' share one value column and
# can be drawn as two colored traces in a single interactive line plot.
df_joined_melt = df_joined.melt(['week', 'first_date'], ['died', 'hospital'])
# Interactive plot; the legend on the right allows selecting traces.
plot = px.line(df_joined_melt, 'first_date', 'value', color='variable',
               labels={
                   # Human-readable axis/legend labels.
                   "first_date": "Date",
                   "value": "Weekly Sum",
                   "variable": "Indicator"
               }, )
# Add a range selector/slider to zoom in on date ranges along the x-axis.
plot.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                # Quick-zoom button: step back one day from the end of range.
                dict(count=1,
                     step="day",
                     stepmode="backward"),
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
    )
)
plot.show()