# Import all the required packages
import pandas as pd
from scipy.signal import find_peaks
from scipy.signal import argrelextrema
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
1. Creating a new dataframe
# Get raw covid data for NL from CoronaWatchNL (https://github.com/J535D165/CoronaWatchNL)
covid_nl_url = 'https://raw.githubusercontent.com/J535D165/CoronaWatchNL/master/data-geo/data-provincial/RIVM_NL_provincial.csv'
# Provincial level chosen: the municipal-level data has a lot of missing values, giving inaccurate results.
df_c = pd.read_csv(covid_nl_url)
# Get 2020 and 2021 Google mobility reports for the Netherlands (local CSV exports).
df_m2020 = pd.read_csv('data/2020_NL_Region_Mobility_Report.csv')
df_m2021 = pd.read_csv('data/2021_NL_Region_Mobility_Report.csv')
# Get population information for the Netherlands (CBS key figures, 2021).
df_pop = pd.read_csv('data/CBS_KernCijfers_2021.csv')
# Get the RIVM estimate of the reproduction number R.
df_r = pd.read_json('data/reproductiegetal_RIVM.json')
# ---- Clean and combine the raw datasets into one frame `df` ----
# Parse the COVID date column to pandas datetimes.
df_c['Datum'] = pd.to_datetime(df_c['Datum'])
# Keep only rows with an associated province: COVID data is minimal below the
# provincial level and the municipal level has too many gaps.
# .copy() gives an owned frame so the fillna writes below cannot hit a view
# (avoids SettingWithCopyWarning / lost writes under copy-on-write).
df_c = df_c[df_c['Provincienaam'].notna()].copy()
# Missing daily counts mean "no cases reported", so 0 is the right fill.
df_c['Aantal'] = df_c['Aantal'].fillna(0)
df_c['AantalCumulatief'] = df_c['AantalCumulatief'].fillna(0)
# Mobility: keep province-level rows only (sub_region_1 set, sub_region_2 empty).
df_m2020 = df_m2020[df_m2020['sub_region_2'].isna() & df_m2020['sub_region_1'].notna()]
df_m2021 = df_m2021[df_m2021['sub_region_2'].isna() & df_m2021['sub_region_1'].notna()]
# Stack the 2020 and 2021 Google mobility rows into one frame.
df_m = pd.concat([df_m2020, df_m2021])
# Parse the mobility date column.
df_m['date'] = pd.to_datetime(df_m['date'])
# Keep only the columns needed from the Google mobility dataset.
df_m = df_m[['sub_region_1', 'date',
             'retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline',
             'parks_percent_change_from_baseline', 'transit_stations_percent_change_from_baseline',
             'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline']]
# Short names for the mobility columns.
df_m.columns = ['province', 'date', 'retail', 'grocery', 'parks', 'transit', 'workplaces', 'residential']
# Translate the English province names to Dutch so they join with the COVID data.
# Plain assignment instead of `replace(..., inplace=True)`: inplace replace on a
# frame derived from slicing is deprecated chained assignment in newer pandas
# and may silently not stick.
province_map = {'North Brabant': 'Noord-Brabant', 'North Holland': 'Noord-Holland', 'South Holland': 'Zuid-Holland'}
df_m['province'] = df_m['province'].replace(province_map)
# Rename columns of the NL COVID dataset to English short names.
df_c = df_c.rename(columns={'Datum': 'date', 'Provincienaam': 'province', 'Provinciecode': 'c_province_code',
                            'Type': 'c_type', 'Aantal': 'c_number', 'AantalCumulatief': 'c_total_number'})
# Drop columns that are not used further on.
df_c = df_c.drop(columns=['c_province_code', 'c_total_number'])
# Pivot the long 'Type' rows (deaths / infections / hospital) into columns.
df_c = df_c.pivot_table('c_number', ['date', 'province'], 'c_type').reset_index()
# Name the pivoted columns (pivot orders the c_type values alphabetically).
df_c.columns = ['date', 'province', 'died', 'total_infections', 'hospital']
# Clean the RIVM reproduction-number data: keep date + average estimate.
df_r = df_r[['Date', 'Rt_avg']]
df_r.columns = ['date', 'r']
# Merge everything on date/province into one final dataframe.
df = pd.merge(df_c, df_m, how='left', on=['date', 'province'])
df = pd.merge(df, df_pop, how='left', on=['province'])
df = pd.merge(df, df_r, how='left', on=['date'])
# Describe the dataset (the first entry was mislabeled 'data' and the
# 'hospital' column created during the pivot was missing).
"""
---- Description of Dataset ----
date -> Describes the joined date shared by all datasets
province -> Describes the joined province shared by all datasets
died -> Describes the number of people who died that day due to COVID
total_infections -> Describes the number of total infections on that day due to covid
hospital -> Describes the number of hospital admissions on that day due to COVID
retail -> Describes the percent change from baseline in retail and recreation trips
grocery -> Describes the percent change from baseline in grocery and pharmacy trips
parks -> Describes the percent change from baseline in park trips
transit -> Describes the percent change from baseline in transit stations trips
workplaces -> Describes the percent change from baseline in workplaces trips
residential -> Describes the percent change from baseline in residential trips
population -> Describes the total population in a province
r -> Describe the estimated R (infection number) value. Estimated by the RIVM.
---------------------------------
"""
# Print head of data
df.head()
date | province | died | total_infections | hospital | retail | grocery | parks | transit | workplaces | residential | population | r | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-02-27 | Drenthe | 0.0 | 0.0 | 0.0 | 3.0 | 7.0 | 24.0 | -2.0 | 4.0 | 1.0 | 494771 | 2.06 |
1 | 2020-02-27 | Flevoland | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | -4.0 | 3.0 | 1.0 | 1.0 | 428226 | 2.06 |
2 | 2020-02-27 | Friesland | 0.0 | 0.0 | 0.0 | 2.0 | 3.0 | 10.0 | -1.0 | 4.0 | 1.0 | 651435 | 2.06 |
3 | 2020-02-27 | Gelderland | 0.0 | 0.0 | 0.0 | 2.0 | -2.0 | -3.0 | -18.0 | -21.0 | 3.0 | 2096603 | 2.06 |
4 | 2020-02-27 | Groningen | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 5.0 | -1.0 | 4.0 | 1.0 | 586937 | 2.06 |
1. Peak Finding
1.1 Peak Finding Function
# Define the peak-finding function for one province and one signal.
# Peaks are found per (province, signal) pair because the data is categorized
# that way; overall peaks across provinces would not be meaningful.
def scipy_peaks(data, find, province, **kwargs):
    """Locate strict local maxima and minima of one column for one province.

    Args:
        data (DataFrame): must contain 'date', 'province' and the `find` column.
        find (str): name of the numeric column to scan (mobility or COVID signal).
        province (str): province whose rows are scanned.
        **kwargs: unused; kept for interface compatibility.

    Returns:
        tuple[list, list]: (max_ind, min_ind) — index labels of `data` at the
        strict local maxima resp. minima. `argrelextrema` with np.greater /
        np.less is strict, so flat plateaus of equal values are not reported.

    Raises:
        KeyError: when 'date' or the `find` column is missing.
    """
    # Explicit membership test instead of try/bare-except: the original except
    # block referenced an undefined name (`activity`) and would have raised a
    # NameError instead of the intended message.
    if find not in data.columns or 'date' not in data.columns:
        raise KeyError(f"The 'date' column is missing or the column {find} does not exist within the dataset")
    # One row per date within a single province.
    subset = data[data['province'] == province]
    values = subset[find].values
    # Map positions returned by argrelextrema back to index labels of `data`.
    max_ind = [subset.index[i] for i in argrelextrema(values, np.greater)[0]]
    min_ind = [subset.index[i] for i in argrelextrema(values, np.less)[0]]
    return max_ind, min_ind
# Signals to scan: the six mobility activities plus the three COVID counts.
peak_find_list = ['retail', 'grocery', 'parks', 'transit', 'workplaces', 'residential', 'died', 'total_infections',
                  'hospital']
province_list = df.province.unique().tolist()
# Detect extrema for every (province, signal) combination up front.
peaks = {}
for province in province_list:
    peaks[province] = {}
    for find in peak_find_list:
        max_ind, min_ind = scipy_peaks(df, find, province)
        # Store strict maxima and strict minima side by side per signal.
        peaks[province][find] = {'max_ind': max_ind, 'min_ind': min_ind}
1.2 Peak Finding Example Plot
# Visualise the detected peaks/valleys for one province and one signal.
min_date = '2021-01-01'
max_date = '2021-02-01'
activity = 'total_infections'
province = 'Limburg'
# Rows at the detected extrema. df was rebuilt with a fresh RangeIndex, so the
# stored index labels double as positions for .iloc.
df_max_ind = df.iloc[peaks[province][activity]['max_ind']]
df_min_ind = df.iloc[peaks[province][activity]['min_ind']]
# Restrict everything to the plotting window for a readable plot.
df_f = df[(df['province'] == province)
          & (df['date'] >= pd.to_datetime(min_date))
          & (df['date'] <= pd.to_datetime(max_date))]
df_prov_max = df_max_ind[(df_max_ind['date'] >= pd.to_datetime(min_date))
                         & (df_max_ind['date'] <= pd.to_datetime(max_date))]
df_prov_min = df_min_ind[(df_min_ind['date'] >= pd.to_datetime(min_date))
                         & (df_min_ind['date'] <= pd.to_datetime(max_date))]
# Daily line with peak/valley markers drawn on top.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(f'{activity} {province}')
sns.lineplot(x='date', y=activity, data=df_f, label='daily', ax=ax, alpha=0.2)
sns.scatterplot(x='date', y=activity, data=df_prov_max, label='peaks', ax=ax, marker='^', color='r', s=100)
sns.scatterplot(x='date', y=activity, data=df_prov_min, label='valleys', ax=ax, marker='o', color='g', s=100)
<AxesSubplot:title={'center':'total_infections Limburg'}, xlabel='date', ylabel='total_infections'>
2. Algorithm Explained (Pseudo)
Below, the main algorithm for finding overlapping peaks can be found. It is chosen to find common peaks between one activity of two provinces, and one COVID metric of two provinces, as it is interesting to see whether peaks overlap across areas.
The input data is the activity/covid data column that will be researched. Also the peaks found are used as input and the provinces of interest.
The algorithm starts by taking the first peak within province 1 and applying an offset to the date of this peak. Then the peaks of province 2 are compared with the offset date window. If the condition results in True, we can append the index to the common indexes list for this activity/COVID metric. Once this has run for all province 2 data, we have compared the first province 1 peak (with date offset) with all province 2 peaks. We can then continue with the second peak of province 1, and so on.
Notice that this dictionary is called peak data but this also implies valley data. As the peak dictionary will be used, which both consist of max indexes and min indexes (so valleys and peaks).
3. Common Peak of 1 Activity and 2 Provinces
# Define function to find common peaks and valleys between two provinces.
def find_common_peaks(df, province_1, province_2, activity_or_covid, date_offset):
    """Find extrema of one signal that two provinces share in time.

    A peak (valley) of `province_1` matches one of `province_2` when the
    second date falls within +/- `date_offset` days of the first.

    Note: reads the module-level `peaks` dict built earlier in this script.
    Bug fix: the original looked up `peaks[province][activity]` using the
    *global* `activity` variable instead of the `activity_or_covid` parameter;
    it only worked because the notebook happened to set the global first.

    Args:
        df (DataFrame): merged dataset; column 0 must be 'date'.
        province_1, province_2 (str): province names present in `peaks`.
        activity_or_covid (str): column/signal whose extrema are compared.
        date_offset (int): allowed distance in days between matched extrema.

    Returns:
        dict: {'peaks': {date_str: {'Province1': i, 'Province2': j}},
               'valleys': {...}} with row indexes into `df`.
    """
    common_peaks_dict = {'peaks': {}, 'valleys': {}}
    # Select the pre-computed extrema for both provinces.
    peaks_province_1 = peaks[province_1][activity_or_covid]
    peaks_province_2 = peaks[province_2][activity_or_covid]
    # Positional column index for .iloc lookups.
    column_ind = df.columns.get_loc(activity_or_covid)
    # The max ('peaks') and min ('valleys') passes are identical except for the
    # index list they scan, so drive both from one loop.
    for ind_key, out_key in (('max_ind', 'peaks'), ('min_ind', 'valleys')):
        for prov1_peak in peaks_province_1[ind_key]:
            current_peak = df.iloc[prov1_peak, [0, column_ind]]
            # Tolerance window around the province-1 extremum.
            min_date = current_peak['date'] - pd.DateOffset(date_offset)
            max_date = current_peak['date'] + pd.DateOffset(date_offset)
            for prov2_peak in peaks_province_2[ind_key]:
                prov2_current_peak = df.iloc[prov2_peak, [0, column_ind]]
                if min_date <= prov2_current_peak['date'] <= max_date:
                    # NOTE: keyed by date, so several province-2 matches for the
                    # same province-1 date keep only the last one (as original).
                    common_peaks_dict[out_key][current_peak['date'].strftime('%Y-%m-%d')] = {
                        'Province1': prov1_peak, 'Province2': prov2_peak}
    return common_peaks_dict
# Find common peaks for the workplaces activity between two provinces.
# Workplaces is chosen because the advice to work from home applied to the
# whole Netherlands at the same time, so overlapping peaks are expected.
province_1, province_2 = 'Limburg', 'Groningen'
activity = 'workplaces'
date_offset = 2
common_peak_dict_activity = find_common_peaks(df, province_1, province_2, activity, date_offset)
Above the function is created that finds common peaks with a certain date offset in mind. Then this function is called to find the overlapping indexes of workplace peaks for Limburg and Groningen.
The example of Limburg, Groningen and the workplaces activity is also plotted (with a small date range) to visualize the results. The code is omitted but the result is shown below to clarify the outcome. Both the Limburg workplace line and the Groningen workplace line are plotted. The indexes of the peaks (not valleys) that overlap with a date offset of two days are shown. You can see that the data moves up almost identically and overlapping peaks are found.
4. Common Peak 1 Covid Data Between 2 Provinces
# Find common total-infection peaks between the same two provinces.
province_1, province_2 = 'Limburg', 'Groningen'
activity = 'total_infections'
date_offset = 2
# Result structure: {'peaks': {'date': {'Province1': i1, 'Province2': i2}}, 'valleys': {...}}
common_peak_dict_covid = find_common_peaks(df, province_1, province_2, activity, date_offset)
5. Common Peak Covid Data and Movement Data within Zuid Holland
5.1 Common Peak Finding
def common_one_province(df, province, activity, covid, date_offset):
    """Find extrema of a mobility signal and a COVID signal that coincide
    within one province.

    An activity peak (valley) matches a covid peak (valley) when the covid
    date falls within +/- `date_offset` days of the activity date.

    Note: reads the module-level `peaks` dict built earlier in this script.

    Args:
        df (DataFrame): merged dataset; column 0 must be 'date'.
        province (str): province name present in `peaks`.
        activity (str): mobility column (e.g. 'workplaces').
        covid (str): COVID column (e.g. 'hospital').
        date_offset (int): allowed distance in days between matched extrema.

    Returns:
        dict: {'peaks': {date_str: {'activity': i, 'covid': j}},
               'valleys': {...}} with row indexes into `df`.
    """
    common_peaks_dict = {'peaks': {}, 'valleys': {}}
    # Pre-computed extrema for both signals within this province.
    peaks_activity = peaks[province][activity]
    peaks_covid = peaks[province][covid]
    # Positional column indexes for .iloc lookups.
    activity_ind = df.columns.get_loc(activity)
    covid_ind = df.columns.get_loc(covid)
    # The max ('peaks') and min ('valleys') passes were duplicated in the
    # original; they differ only in which index list is scanned.
    for ind_key, out_key in (('max_ind', 'peaks'), ('min_ind', 'valleys')):
        for ac_peak in peaks_activity[ind_key]:
            ac_current_peak = df.iloc[ac_peak, [0, activity_ind]]
            # Tolerance window around the activity extremum.
            min_date = ac_current_peak['date'] - pd.DateOffset(date_offset)
            max_date = ac_current_peak['date'] + pd.DateOffset(date_offset)
            for cov_peak in peaks_covid[ind_key]:
                cov_current_peak = df.iloc[cov_peak, [0, covid_ind]]
                if min_date <= cov_current_peak['date'] <= max_date:
                    # Keyed by date: a later covid match for the same activity
                    # date overwrites an earlier one (same as the original).
                    common_peaks_dict[out_key][ac_current_peak['date'].strftime('%Y-%m-%d')] = {
                        'activity': ac_peak, 'covid': cov_peak}
    return common_peaks_dict
# Workplaces and hospital admissions are compared because it is interesting to
# see whether hospital peaks overlap with work movements: when admissions rise,
# people may want to work from home more often.
# Zuid-Holland is chosen out of personal interest (the author lives there).
# A date offset of 2 gives enough flexibility while keeping the range valid.
province = 'Zuid-Holland'
activity = 'workplaces'
covid = 'hospital'
date_offset = 2
# Result stored as a dictionary of matched index pairs.
ac_cov_common = common_one_province(df, province, activity, covid, date_offset)
5.2 Common Peak Plotting
# Plot the matched activity/covid extrema for one province and date window.
min_date = '2021-01-01'
max_date = '2021-02-01'
province = 'Zuid-Holland'
activity = 'workplaces'
covid = 'hospital'
date_offset = 2
ac_cov_common = common_one_province(df, province, activity, covid, date_offset)
# Flatten the matched index pairs into two parallel lists.
ac_ind = []
covid_ind = []
for matches in ac_cov_common.values():      # 'peaks' and 'valleys'
    for pair in matches.values():           # keyed by date string
        ac_ind.append(pair['activity'])
        covid_ind.append(pair['covid'])
# Rows at the matched extrema (RangeIndex, so labels double as positions).
df_peak_ac = df.iloc[ac_ind]
df_peak_cov = df.iloc[covid_ind]
# Restrict everything to the plotting window for a readable plot.
df_f = df[df['province'] == province]
df_f = df_f[(df_f['date'] >= pd.to_datetime(min_date)) & (df_f['date'] <= pd.to_datetime(max_date))]
df_ac = df_peak_ac[(df_peak_ac['date'] >= pd.to_datetime(min_date)) & (df_peak_ac['date'] <= pd.to_datetime(max_date))]
df_cov = df_peak_cov[
    (df_peak_cov['date'] >= pd.to_datetime(min_date)) & (df_peak_cov['date'] <= pd.to_datetime(max_date))]
# Daily lines for both signals with markers at the matched extrema.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title(f'{activity} {province}')
sns.lineplot(x='date', y=activity, data=df_f, label='daily', ax=ax, alpha=0.2)
sns.lineplot(x='date', y=covid, data=df_f, label='daily', ax=ax, alpha=0.2)
sns.scatterplot(x='date', y=activity, data=df_ac, label=f'peaks/valleys {activity}', ax=ax, marker='^', color='r',
                s=100)
sns.scatterplot(x='date', y=covid, data=df_cov, label=f'peaks/valleys {covid}', ax=ax, marker='^', color='g', s=100)
<AxesSubplot:title={'center':'workplaces Zuid-Holland'}, xlabel='date', ylabel='workplaces'>
Zuid-Holland will be investigated as I'm interested in this area. It is interesting to see whether there is a real relation between the activities of people and the development of COVID. As within the Netherlands many decisions are made based on the current COVID developments in terms of hospitalization rates, this data likely leads to new measures or lockdowns, and the data possibly correlates. This relation will therefore be explored. This can best be done with an initial exploration using a correlation matrix, quickly giving an overview of all the numerical correlations.
# Build a weekly Zuid-Holland dataset and plot its correlation matrix.
# .copy() so the new 'week' column is written to an owned frame, not a slice.
df = df[df['province'] == 'Zuid-Holland'].copy()
# Week label 'WW-YYYY' used as the grouping key.
df['week'] = df['date'].dt.strftime('%W-%Y')
# First calendar date of each week, used as that week's x-axis position.
df_first_date = df.groupby('week')[['date']].min().reset_index()
df_first_date.columns = ['week', 'first_date']
df = pd.merge(df, df_first_date, how='left', on=['week'])
# Weekly totals for the COVID counts. Selecting columns *before* aggregating
# avoids summing non-numeric columns (an error in pandas >= 2.0).
covid_sum = df.groupby(['week', 'first_date'])[['hospital', 'died', 'total_infections']].sum().sort_values(
    'first_date').reset_index()
# Weekly means for the mobility baseline changes, same pattern.
activity_mean = df.groupby(['week', 'first_date'])[
    ['retail', 'grocery', 'parks', 'transit', 'workplaces', 'residential']].mean().sort_values(
    'first_date').reset_index()
df_joined = pd.merge(covid_sum, activity_mean, how='left', on=['week', 'first_date'])
# Correlation over numeric columns only (the string 'week' column would make
# corr() raise in pandas >= 2.0); compute it once for mask and heatmap.
corr = df_joined.corr(numeric_only=True)
# Upper-triangle mask so each correlation is shown exactly once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True);
The resulting plot above is interesting: it looks like there is a negative correlation between retail/transit movements and hospitalization rates, as it shows values of around -0.66 and -0.61 respectively within the correlation matrix. Therefore the relation will be plotted separately below; a regression plot is appropriate. We can observe the data points and a fitted line, giving a lot of information about the correlations. You can observe that retail and transit movements indeed show a negative correlation with hospitalization rates. This is likely influenced by the lockdowns imposed during these periods, which will be investigated more thoroughly below.
# Regression plots for the two strongest (negative) correlations with hospital.
fig, ax = plt.subplots(1, 2, sharex=True, figsize=(20, 5))
for axis, (col, title) in zip(ax, [('retail', 'Retail and Hospital'), ('transit', 'Transit and Hospital')]):
    axis.set_title(title)
    sns.regplot(data=df_joined, x='hospital', y=col, ax=axis)
<AxesSubplot:title={'center':'Transit and Hospital'}, xlabel='hospital', ylabel='transit'>
It is interesting to compare the strongest correlation — retail and hospitalization — in separate line graphs and see whether we can discover the story behind the numbers. By plotting separate line graphs we can see the movements of the data over time and investigate the possible relations of these movements with COVID regulations/rules within the Netherlands.
# Side-by-side weekly time series for the strongest correlated pair.
fig, ax = plt.subplots(1, 2, sharex=True, figsize=(20, 5))
ax[0].set_title('Retail Change')
# NOTE(review): 'Hospitality' here presumably means hospitalization — confirm.
ax[1].set_title('Hospitality Change')
sns.lineplot(data=df_joined, x='first_date', y='retail', ax=ax[0])
sns.lineplot(data=df_joined, x='first_date', y='hospital', ax=ax[1])
<AxesSubplot:title={'center':'Hospitality Change'}, xlabel='first_date', ylabel='hospital'>
From these graphs the story becomes clear: a steep rise in the hospitalization rate is observed in mid-March 2020, which is in line with news articles about hospitalizations during this period. This was also the period in which the 'intelligent lockdown' was introduced in the Netherlands: sport clubs closed, people were advised to stay home as much as possible, and work was to be done remotely. The intelligent lockdown ended around May, which can be observed from the decrease in hospitalization rates (2020-05) and the increase in retail movements. The lockdown in mid-October is also clear within the retail-change graph, as fewer movements were observed during this period. The large drop in December 2020 and January 2021 can be attributed to the hard lockdown of 12-2020, when almost all stores (except essential stores like supermarkets) needed to close down. These key data points are again annotated in the interactive plot below together with the weekly retail-movement data. This plot is appropriate as it allows zooming in on specific data points and clearly annotates important dates.
Sources:
# Interactive weekly retail-change line; the legend on the right allows
# toggling traces, and annotations mark the main lockdown moments.
plot = px.line(df_joined, 'first_date', 'retail',
               labels={
                   # Human-readable axis labels.
                   "first_date": "Date",
                   "retail": "Weekly Retail Change",
               }, )
# Add a range selector/slider to zoom in on date ranges along the x-axis.
plot.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                # Quick-zoom button: step back one day from the end of range.
                dict(count=1,
                     step="day",
                     stepmode="backward"),
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
    )
)
# Annotate the key policy moments; (x, y) is the data point, (ax, ay) the
# label position in data coordinates (axref/ayref='x'/'y').
plot.add_annotation(x='2020-03-12', y=-20, ax='2020-04-12', ay=-15, text='Intelligent Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
plot.add_annotation(x='2020-10-13', y=-24, ax='2020-10-29', ay=-15, text='Partial Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
plot.add_annotation(x='2020-11-03', y=-35, ax='2020-10-29', ay=-45, text='Increase Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
plot.add_annotation(x='2020-12-14', y=-30, ax='2021-01-01', ay=-20, text='Full Lockdown',
                    xref='x', yref='y', axref='x', ayref='y')
# Render the interactive figure.
plot.show()
Another interesting observation is that the data about people who passed away from COVID follows the same curves as the hospitalization rates. This indicates that hospital admissions are indeed a good indicator for new government guidelines, as hospitalization in many cases can lead to more deaths within the population. As multiple lines are plotted and we want to zoom in on the graph, the plot below is appropriate for this data.
# Reshape to long format so 'died' and 'hospital' share one value column and
# can be drawn as two colored traces in a single interactive line plot.
df_joined_melt = df_joined.melt(['week', 'first_date'], ['died', 'hospital'])
# Interactive plot; the legend on the right allows selecting traces.
plot = px.line(df_joined_melt, 'first_date', 'value', color='variable',
               labels={
                   # Human-readable axis/legend labels.
                   "first_date": "Date",
                   "value": "Weekly Sum",
                   "variable": "Indicator"
               }, )
# Add a range selector/slider to zoom in on date ranges along the x-axis.
plot.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                # Quick-zoom button: step back one day from the end of range.
                dict(count=1,
                     step="day",
                     stepmode="backward"),
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
    )
)
plot.show()